-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_utilities.py
106 lines (80 loc) · 2.98 KB
/
pdf_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
def load_pdf(pdf_name):
    """
    Render the pages of a PDF file as images.

    Parameters
    ----------
    pdf_name: str
        Name or path of the PDF file; handed unchanged to
        ``pdf2image.convert_from_path``.

    Returns
    -------
    slides: list
        The list of page images produced by ``convert_from_path``.
    """
    import pdf2image

    return pdf2image.convert_from_path(pdf_name)
def save_images(filepath, slides, new_width=None):
    """
    Save a list of images to a directory as PNG files, optionally resized.

    Parameters
    ----------
    filepath : str
        Directory the images are written to. It is created if missing
        (also when ``slides`` is empty).
    slides : list
        Images to save. Each item must provide ``width``, ``height``,
        ``resize((width, height))`` and ``save(path, format)``, as PIL
        images do.
    new_width : int, optional
        Target width in pixels. The height is scaled so that each image
        keeps its own aspect ratio. If not provided, every image keeps its
        own original width.

    Returns
    -------
    None
        Each image is written to ``filepath`` as slide1.png, slide2.png, etc.
    """
    import os

    # Create the output directory once rather than on every iteration.
    os.makedirs(filepath, exist_ok=True)

    for i, img in enumerate(slides, start=1):
        # Resolve the width per image. Overwriting ``new_width`` here would
        # force every later slide to the width of the first one.
        target_width = img.width if new_width is None else new_width

        # Integer arithmetic keeps the aspect ratio without the float
        # rounding that can drop a pixel (e.g. 1 / 49 * 49 < 1).
        target_height = int(target_width * img.height // img.width)

        resized = img.resize((target_width, target_height))
        resized.save(os.path.join(filepath, f"slide{i}.png"), "PNG")
def text_extraction(pdf_name, images):
    """
    Extract the text of each PDF page and map it to a slide image name in a JSON file.

    Parameters
    ----------
    pdf_name : str
        The name or path of the PDF file from which to extract text.
    images : list
        The slide images of the PDF. Only their count is used: page ``i`` is
        paired with the name ``slide{i}.png``, and pairing stops at the
        shorter of ``images`` and the PDF's pages.

    Returns
    -------
    None
        Writes a dictionary mapping slide image file names (e.g., slide1.png)
        to the text returned by ``page.extract_text()`` into a JSON file in
        the current working directory. The file is named
        ``dict_<pdf file name>.json`` (e.g., 'dict_slides.pdf.json' for
        'slides.pdf').
    """
    import json
    import os

    import pdfplumber

    # Extract the original text from each slide
    with pdfplumber.open(pdf_name) as pdf:
        slide_texts = [page.extract_text() for page in pdf.pages]

    # The image objects themselves are not needed, only their position.
    slide_dict = {
        f"slide{i}.png": text
        for i, (_, text) in enumerate(zip(images, slide_texts), start=1)
    }

    # Use only the file name: embedding a full path would make the output
    # name point into a directory such as "dict_some/" that does not exist.
    json_name = f"dict_{os.path.basename(pdf_name)}.json"
    with open(json_name, "w", encoding="utf-8") as json_file:
        json.dump(slide_dict, json_file)