Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Featurepdf viewer #314

Open
wants to merge 5 commits into
base: restructure
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions repository_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@
}
}
},
"pdfparsing": {
"detectron2": {
"local": {

}
}
}
}
Expand Down
Empty file added src/pdfparsing/__init__.py
Empty file.
Empty file.
69 changes: 69 additions & 0 deletions src/pdfparsing/detectron2/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@

import os
import base64
import json
from flask import Flask, render_template, request, redirect, url_for
import tempfile
from pdf2image import convert_from_path
from PIL import Image
from utils.model import ml_part
# Flask application object that the route handlers in this module attach to.
app = Flask(__name__)

# Lower-case extensions an uploaded file name is allowed to end with.
ALLOWED_EXTENSIONS = {"pdf"}

# Two fresh temporary directories are created when the module is loaded:
# UPLOAD_FOLDER receives the uploaded files, OUTPUT_FOLDER the page images
# written while rendering a result page.
app.config.update(
    UPLOAD_FOLDER=tempfile.mkdtemp(),
    OUTPUT_FOLDER=tempfile.mkdtemp(),
)

def allowed_file(filename):
    """Tell whether *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot; it is lower-cased before
    the lookup. A name that contains no dot at all is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        # rpartition found no '.', so there is no extension to check.
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

@app.route('/upload', methods=['POST'])
def upload_file():
    """Store an uploaded file and redirect to its result page.

    Reads the multipart field ``file`` from the request. When the field is
    missing, has an empty file name, or the name does not pass
    ``allowed_file``, the client is redirected back to the upload form.
    Otherwise the file is saved into ``app.config['UPLOAD_FOLDER']`` and the
    client is redirected to ``display_images`` for that file name.
    """
    # This route only accepts POST, so redirecting to request.url would make
    # the browser issue a GET to /upload and receive a 405. Rejected uploads
    # go back to the form served by ``index`` instead.
    upload_form = url_for('index')

    upload = request.files.get('file')
    if upload is None or not upload.filename:
        return redirect(upload_form)

    # The file name is chosen by the client. Keep only its last path
    # component (treating both '/' and '\\' as separators) so that a name
    # such as "../../x.pdf" cannot be written outside the upload folder.
    filename = os.path.basename(upload.filename.replace('\\', '/'))
    if not allowed_file(filename):
        return redirect(upload_form)

    upload.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    return redirect(url_for('display_images', filename=filename))

@app.route('/display/<filename>')
def display_images(filename):
    """Process an uploaded file with ``ml_part`` and render its pages.

    ``ml_part`` is called with *filename* and the upload folder. Each item of
    its result is read through the keys ``'page'``, ``'labeled_image'`` and
    ``'text'``: the image is saved into ``app.config['OUTPUT_FOLDER']`` and
    later embedded in the page as a base64 ``data:`` URL next to its text.
    The ``display.html`` template receives the entries ordered by page
    number.
    """
    output_dir = app.config['OUTPUT_FOLDER']
    result = ml_part(filename, app.config['UPLOAD_FOLDER'])

    # Save every labelled image and remember its text under the exact path
    # that was written, so the lookup below uses the same key on every
    # platform (the previous hard-coded "\\" separator only matched the
    # os.path.join result on Windows and raised KeyError elsewhere).
    texts = {}
    for item in sorted(result, key=lambda x: x['page']):
        # NOTE(review): the '.png' filter and page-number parsing below imply
        # item['page'] is a file name like "<prefix>_<n>.png" -- confirm
        # against utils.model.ml_part.
        file_path = os.path.join(output_dir, f"{item['page']}")
        item['labeled_image'].save(file_path)
        texts[file_path] = item['text']

    # Walk only the files written for this request. The output folder is
    # shared by all requests, so listing it would also pick up images left
    # behind by earlier uploads, which have no entry in ``texts``.
    images = []
    for file_path in sorted(texts):
        image_file = os.path.basename(file_path)
        if not image_file.endswith('.png'):
            continue
        try:
            # Page number is the text between the last '_' and the next '.'.
            page_number = int(image_file.split('_')[-1].split('.')[0])
        except ValueError:
            continue
        with open(file_path, 'rb') as img_file:
            img_data = base64.b64encode(img_file.read()).decode('utf-8')
        images.append({
            'data': f"data:image/png;base64,{img_data}",
            'page_number': page_number,
            'text': texts[file_path],
        })
    return render_template('display.html', images=sorted(images, key=lambda x: x['page_number']))



if __name__ == '__main__':
    # The server listens on every interface. With debug mode on, Werkzeug's
    # interactive debugger would be reachable by any client and allows
    # arbitrary code execution, so debug mode is opt-in through the
    # FLASK_DEBUG environment variable instead of being always enabled.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in {'1', 'true', 'yes', 'on'}
    app.run(debug=debug_enabled, host="0.0.0.0", port=8000)
Binary file added src/pdfparsing/detectron2/example.pdf
Binary file not shown.
512 changes: 512 additions & 0 deletions src/pdfparsing/detectron2/experiment_notebook/pdf_ff.ipynb

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions src/pdfparsing/detectron2/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
torch>=1.10
torchvision>=0.10
pdf2image
flask
pillow
opencv-python
numpy
layoutparser
detectron2 @ git+https://github.com/facebookresearch/[email protected]#egg=detectron2
layoutparser[ocr]
pytesseract
45 changes: 45 additions & 0 deletions src/pdfparsing/detectron2/templates/display.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Images and Text</title>
<style>
/* Styling for the layout */
.container {
display: flex;
flex-direction: column;
padding: 20px;
}
.item {
display: flex;
margin-bottom: 20px; /* Add some space between each pair */
}
.image {
flex-basis: 50%; /* Each image takes up half of the container */
}
.image img {
max-width: 100%;
height: auto;
}
.text {
flex-basis: 50%; /* Each text takes up half of the container */
margin-left: 20px; /* Add some space between image and text */
}
</style>
</head>
<body>
<!-- Result page: one .item row per entry of the "images" template variable. -->
<!-- Each entry is read through .data (used as the img src), .page_number (alt text) and .text. -->
<div class="container">
{% for image in images %}
<div class="item">
<div class="image">
<img src="{{ image.data }}" alt="Page {{ image.page_number }}">
</div>
<div class="text">
<p>{{ image.text }}</p>
</div>
</div>
{% endfor %}
</div>
</body>
</html>
15 changes: 15 additions & 0 deletions src/pdfparsing/detectron2/templates/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Upload PDF</title>
</head>
<body>
<h1>Upload a PDF file</h1>
<!-- Upload form: posts the chosen file as the multipart/form-data field "file" to /upload. -->
<form method="POST" action="/upload" enctype="multipart/form-data">
<input type="file" name="file" accept=".pdf" required>
<button type="submit">Upload</button>
</form>
</body>
</html>
Loading