Samagra-Development · Azazel0203 · Apr 18, 2024 · Apr 18, 2024 · Apr 22, 2024 · Apr 22, 2024
diff --git a/repository_data.json b/repository_data.json
@@ -121,6 +121,11 @@
                 }
             }
         },
+        "pdfparsing": {
+            "detectron2": {
+                "local": {
+
+                }
             }
         }
     }

diff --git a/src/pdfparsing/__init__.py b/src/pdfparsing/__init__.py
diff --git a/src/pdfparsing/detectron2/__init__.py b/src/pdfparsing/detectron2/__init__.py
diff --git a/src/pdfparsing/detectron2/app.py b/src/pdfparsing/detectron2/app.py
@@ -0,0 +1,69 @@
+
+import os
+import base64
+import json
+from flask import Flask, render_template, request, redirect, url_for
+import tempfile
+from pdf2image import convert_from_path
+from PIL import Image
+from utils.model import ml_part
+app = Flask(__name__)
+
+ALLOWED_EXTENSIONS = {'pdf'}
+
+app.config['UPLOAD_FOLDER'] = tempfile.mkdtemp()
+app.config['OUTPUT_FOLDER'] = tempfile.mkdtemp()
+
+def allowed_file(filename):
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+@app.route('/')
+def index():
+    return render_template('index.html')
+
+@app.route('/upload', methods=['POST'])
+def upload_file():
+    if 'file' not in request.files:
+        return redirect(request.url)
+    file = request.files['file']
+    if file.filename == '':
+        return redirect(request.url)
+    if file and allowed_file(file.filename):
+        filename = file.filename
+        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        file.save(file_path)
+        return redirect(url_for('display_images', filename=filename))
+    return redirect(request.url)
+
+@app.route('/display/<filename>')
+def display_images(filename):
+    output_dir = app.config['OUTPUT_FOLDER']
+    result = ml_part(filename, app.config['UPLOAD_FOLDER'])
+    sorted_results = sorted(result, key=lambda x: x['page'])
+    texts = {}
+    for item in sorted_results:
+        pagenumber = item['page']
+        labeled_img = item['labeled_image']
+        file_path = os.path.join(output_dir, f"{pagenumber}")
+        labeled_img.save(file_path)
+        texts[file_path] = item['text']
+    images = []
+    for image_file in sorted(os.listdir(output_dir)):
+        if image_file.endswith('.png'):
+            try:
+                page_number = int(image_file.split('_')[-1].split('.')[0])
+            except ValueError:
+                continue
+            with open(os.path.join(output_dir, image_file), 'rb') as img_file:
+                img_data = base64.b64encode(img_file.read()).decode('utf-8')
+                images.append({
+                    'data': f"data:image/png;base64,{img_data}",
+                    'page_number': page_number,
+                    'text': texts[f"{app.config['OUTPUT_FOLDER']}\\{image_file}"],
+                })
+    return render_template('display.html', images=sorted(images, key=lambda x: x['page_number']))
+
+
+
+if __name__ == '__main__':
+    app.run(debug=True, host="0.0.0.0", port=8000)
diff --git a/src/pdfparsing/detectron2/example.pdf b/src/pdfparsing/detectron2/example.pdf
diff --git a/src/pdfparsing/detectron2/experiment_notebook/pdf_ff.ipynb b/src/pdfparsing/detectron2/experiment_notebook/pdf_ff.ipynb
diff --git a/src/pdfparsing/detectron2/requirements.txt b/src/pdfparsing/detectron2/requirements.txt
@@ -0,0 +1,11 @@
+torch>=1.10
+torchvision>=0.10
+pdf2image
+flask
+pillow
+opencv-python
+numpy
+layoutparser
+detectron2 @ git+https://github.com/facebookresearch/detectron2.git#egg=detectron2  # NOTE(review): the original "@<tag>" pin was lost to e-mail obfuscation; re-pin once known
+layoutparser[ocr]
+pytesseract
diff --git a/src/pdfparsing/detectron2/templates/display.html b/src/pdfparsing/detectron2/templates/display.html
@@ -0,0 +1,45 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Images and Text</title>
+    <style>
+        /* Styling for the layout */
+        /* .container stacks the per-page rows vertically. */
+        .container {
+            display: flex;
+            flex-direction: column;
+            padding: 20px;
+        }
+        /* .item is one row: the page image on the left, its text on the right. */
+        .item {
+            display: flex;
+            margin-bottom: 20px; /* Add some space between each pair */
+        }
+        .image {
+            flex-basis: 50%; /* Each image takes up half of the container */
+        }
+        /* Scale the image down to its column width while keeping its aspect ratio. */
+        .image img {
+            max-width: 100%;
+            height: auto;
+        }
+        .text {
+            flex-basis: 50%; /* Each text takes up half of the container */
+            margin-left: 20px; /* Add some space between image and text */
+        }
+    </style>
+</head>
+<body>
+    <!--
+        Results page. Template context: "images", a sequence whose entries
+        expose "data" (used as the img src), "page_number" (used in the alt
+        text) and "text" (shown beside the image).
+    -->
+    <div class="container">
+        <!-- One image/text row per entry, in the order the view supplies them. -->
+        {% for image in images %}
+        <div class="item">
+            <div class="image">
+                <img src="{{ image.data }}" alt="Page {{ image.page_number }}">
+            </div>
+            <div class="text">
+                <p>{{ image.text }}</p>
+            </div>
+        </div>
+        {% endfor %}
+    </div>
+</body>
+</html>
diff --git a/src/pdfparsing/detectron2/templates/index.html b/src/pdfparsing/detectron2/templates/index.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Upload PDF</title>
+</head>
+<body>
+    <!-- Landing page: a single form for choosing and submitting a file. -->
+    <h1>Upload a PDF file</h1>
+    <!--
+        Posts the chosen file to /upload as multipart/form-data under the
+        field name "file". The accept and required attributes are browser-side
+        checks only.
+    -->
+    <form method="POST" action="/upload" enctype="multipart/form-data">
+        <input type="file" name="file" accept=".pdf" required>
+        <button type="submit">Upload</button>
+    </form>
+</body>
+</html>