fix(text): fix memory problem by clearing page caches after text-line extraction

instill-ai · Sep 12, 2024 · 3433cc2 · 3433cc2
1 parent 620df7c
commit 3433cc2
Showing 1 changed file with 13 additions and 1 deletion.
diff --git a/operator/document/v0/python/transformPDFToMarkdown.py b/operator/document/v0/python/transformPDFToMarkdown.py
@@ -25,10 +25,19 @@ def preprocess(self):
 			self.process_image(self.image_index)
 
 		for page in self.pages:
+			kwargs = {
+				"return_chars": False,
+				"extra_attrs": ["x0", "x1", "top", "bottom", "text"],
+			}
+
 			page_lines = page.extract_text_lines(
 				layout=True,
 				x_tolerance_ratio=0.1,
+				**kwargs
 			)
+			page.flush_cache()
+			page.get_textmap.cache_clear()
+
 			self.process_line(page_lines, page.page_number)
 			self.process_table(page)
 
@@ -63,7 +72,10 @@ def set_heights(self):
 		heights = []
 		largest_text_height, second_largest_text_height = 0, 0
 		for page in self.pages:
-			for line in page.extract_text_lines():
+			lines = page.extract_text_lines()
+			page.flush_cache()
+			page.get_textmap.cache_clear()
+			for line in lines:
 				height = int(line["bottom"] - line["top"])
 				heights.append(height)
 				if height > largest_text_height: