-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
185 lines (164 loc) · 6.06 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
""" This module contains the utility functions used in the pipeline. """
import os
import time
from functools import wraps
import cv2
import boto3
import pymongo
from PyPDF2 import PdfWriter, PdfReader
from uuid import uuid4
from dotenv import load_dotenv
# sonali: dependency for utils currently commented out
import base64
import numpy as np
# Load settings from a .env file (python-dotenv) before reading os.environ.
load_dotenv()

# Every setting below is mandatory: a missing variable raises KeyError at
# import time.
aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID']
aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY']
aws_region = os.environ['AWS_REGION']
bucket_name = os.environ['AWS_BUCKET_NAME']
folder_name = os.environ['BOOK_FOLDER_NAME']
mongo_connection_string = os.environ['DATABASE_URL']
mongo_db_name = os.environ['MONGO_DB']
pdf_batch_size = int(os.environ['PDF_BATCH_SIZE'])

# Module-wide S3 client built from the credentials read above.
s3 = boto3.client(
    's3',
    region_name=aws_region,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
def timeit(func):
    """
    Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged.
    """
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        started_at = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started_at
        print(f'Function {func.__name__} Took {elapsed:.4f} seconds')
        return outcome

    return timeit_wrapper
def crop_image(block, imagepath, id):
    """
    Crop the bounding box described by ``block`` out of an image and save it.

    The box is expanded by 5 pixels on every side and then clamped to the
    image boundaries before cropping.

    Args:
        block: Mapping holding the box corners under the keys
            'x_1', 'y_1', 'x_2' and 'y_2'.
        imagepath: Path of the image to read.
        id: Identifier embedded in the output file name. (The name shadows
            the builtin but is kept so keyword callers keep working.)

    Returns:
        Absolute path of the written crop, ``cropeed<id>.png``, resolved
        against the current working directory.

    Raises:
        ValueError: If the image at ``imagepath`` cannot be read.
    """
    x1, y1, x2, y2 = block['x_1'], block['y_1'], block['x_2'], block['y_2']

    img = cv2.imread(imagepath)
    # cv2.imread reports failure by returning None rather than raising;
    # fail here with a clear message instead of on img.shape below.
    if img is None:
        raise ValueError(f"Could not read image: {imagepath}")

    # Expand the bounding box by 5 pixels on every side
    padding = 5
    x1 -= padding
    y1 -= padding
    x2 += padding
    y2 += padding

    # Ensure the coordinates are within the image boundaries
    height, width = img.shape[0], img.shape[1]
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(width, x2)
    y2 = min(height, y2)

    # Crop the expanded bounding box (rows are y, columns are x)
    bbox = img[int(y1):int(y2), int(x1):int(x2)]
    cropped_image_path = os.path.abspath(f"cropeed{id}.png")
    cv2.imwrite(cropped_image_path, bbox)
    return cropped_image_path
def generate_unique_id():
    """ Return a fresh random UUID4 as a 32-character hex string. """
    fresh_uuid = uuid4()
    return fresh_uuid.hex
@timeit
def download_book_from_aws(book_id, book_name):
    """
    Download a book from AWS S3 into ``<folder_name>/<book_id>/<book_name>``.

    The object is fetched from the configured bucket under the key
    ``<folder_name>/<book_name>``. Errors are printed, not raised.

    Args:
        book_id: Name of the local sub-folder the book is stored in.
        book_name: File name of the book, used for both the S3 key and the
            local file.

    Returns:
        Local path of the downloaded file, or None if the download failed.
    """
    local_path = None
    try:
        print('AWS book download >> ', book_name)
        book_folder = os.path.join(folder_name, book_id)
        os.makedirs(book_folder, exist_ok=True)
        target_path = os.path.join(book_folder, book_name)
        file_key = f'{folder_name}/{book_name}'
        response = s3.get_object(Bucket=bucket_name, Key=file_key)
        pdf_data = response['Body'].read()
        with open(target_path, 'wb') as f:
            f.write(pdf_data)
        # Publish the path only after the file is written, so a failed
        # download returns None instead of a path to a missing file.
        local_path = target_path
    except Exception as e:
        # Best-effort by design: report the problem and signal it with None.
        print("An error occurred:", e)
    return local_path
def _write_pdf_pages(pages, file_path):
    """ Write the given pages into a new pdf file at ``file_path``. """
    output = PdfWriter()
    for page in pages:
        output.add_page(page)
    with open(file_path, "wb") as output_stream:
        output.write(output_stream)


@timeit
def split_pdf(local_path):
    """
    Split a pdf into batches of at most ``pdf_batch_size`` pages.

    The parts are written to ``<folder_name>/<book_id>/splits``, where
    ``book_id`` is the name of the directory containing ``local_path``
    (e.g. ``book-set-2/123/abc.pdf`` -> ``123``). A pdf that fits in one batch
    is copied into a single ``<prefix>_0.pdf`` part.

    Args:
        local_path: Path of the pdf to split.

    Returns:
        List of the written part paths, in page order.

    Raises:
        ValueError: If ``local_path`` is empty or None.
    """
    if not local_path:
        raise ValueError("split_pdf requires the local path of a pdf file")

    # book-set-2/123/abc.pdf
    print('Splitting pdf >> ', local_path)
    # Use the parent directory name rather than a fixed '/'-split index, so
    # absolute paths and other path separators resolve to the same book id.
    book_id = os.path.basename(os.path.dirname(local_path))
    book_split_folder = os.path.join(folder_name, book_id, 'splits')
    os.makedirs(book_split_folder, exist_ok=True)
    # book-set-2/123/splits
    print("split folder >>> ", book_split_folder)

    output_file_paths = []
    # Keep the source open while writing: the writer reads page content
    # from the reader's stream.
    with open(local_path, 'rb') as f:
        inputpdf = PdfReader(f)
        file_prefix = generate_unique_id()
        total_num_pages = len(inputpdf.pages)
        print("Total number of pages in the pdf: ", total_num_pages)
        if total_num_pages > pdf_batch_size:
            print("Splitting pdf into batches")
            for i in range(0, total_num_pages, pdf_batch_size):
                batch_index = i // pdf_batch_size
                file_path = f"{book_split_folder}/{file_prefix}_{batch_index}.pdf"
                _write_pdf_pages(inputpdf.pages[i:i + pdf_batch_size], file_path)
                output_file_paths.append(file_path)
        else:
            file_path = f"{book_split_folder}/{file_prefix}_0.pdf"
            _write_pdf_pages(inputpdf.pages, file_path)
            output_file_paths.append(file_path)
    return output_file_paths
def get_mongo_client():
    """
    Create a new MongoClient from the configured connection string.
    """
    return pymongo.MongoClient(mongo_connection_string)
def get_mongo_collection(collection_name):
    """
    Return the named collection from the configured mongo database.

    A new client is obtained through get_mongo_client() on every call.
    """
    database = get_mongo_client()[mongo_db_name]
    return database[collection_name]
# sonali
def read_image_from_str(image_str):
    """
    Decode a base64-encoded image into an array.

    The decoded bytes are viewed as a uint8 buffer and passed to
    cv2.imdecode with the IMREAD_COLOR flag; its result is returned as is.
    """
    raw_bytes = base64.b64decode(image_str)
    byte_buffer = np.frombuffer(raw_bytes, np.uint8)
    return cv2.imdecode(byte_buffer, cv2.IMREAD_COLOR)
def generate_image_str(image_path):
    """
    Read the file at ``image_path`` and return its bytes as base64 text.
    """
    with open(image_path, 'rb') as image_file:
        encoded_bytes = base64.b64encode(image_file.read())
    return encoded_bytes.decode('utf-8')
def create_image_from_str(image_str):
    """
    Decode base64 image data and write it to a new ``<unique id>.jpg`` file.

    The file name is relative, so the file is created in the current
    working directory. Returns that relative path.
    """
    decoded_bytes = base64.b64decode(image_str)
    unique_name = generate_unique_id()
    image_path = f"{unique_name}.jpg"
    with open(image_path, 'wb') as image_file:
        image_file.write(decoded_bytes)
    return image_path
if __name__ == '__main__':
    # Manual smoke test: download one book from S3 and split it locally.
    BOOK_ID = '456'
    BOOK = 'output_2.pdf'
    # Sample book titles; not used by the code below.
    BOOKS = [
        'Evidence-Based Critical Care - Robert C Hyzy.pdf',
        'Evidence-Based Interventions for Children with Challenging Behavior - Kathleen Hague Armstrong- Julia A Ogg- Ashley N Sundman-Wheat- Audra St John Walsh.pdf',
        'Evidence-Based Practice in Clinical Social Work - James W Drisko- Melissa D Grady.pdf',
        'Evolutionary Thinking in Medicine - Alexandra Alvergne- Crispin Jenkinson- Charlotte Faurie.pdf',
        'Exam Survival Guide: Physical Chemistry - Jochen Vogt.pdf'
    ]
    file_local_path = download_book_from_aws(BOOK_ID, BOOK)
    # download_book_from_aws prints errors instead of raising them, so check
    # that the file is really on disk before trying to split it.
    if not file_local_path or not os.path.isfile(file_local_path):
        raise SystemExit(f"Download failed for {BOOK}; nothing to split")
    split_local_paths = split_pdf(file_local_path)
    print(split_local_paths)