Skip to content
This repository has been archived by the owner on Oct 29, 2024. It is now read-only.

Commit

Permalink
fix(text): fix memory problem
Browse files — browse the repository at this point in the history
  • Loading branch information
chuang8511 committed Sep 12, 2024
1 parent 620df7c commit 3433cc2
Showing 1 changed file with 13 additions and 1 deletion.
14 changes: 13 additions & 1 deletion operator/document/v0/python/transformPDFToMarkdown.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -25,10 +25,19 @@ def preprocess(self):
self.process_image(self.image_index)

for page in self.pages:
kwargs = {
"return_chars": False,
"extra_attrs": ["x0", "x1", "top", "bottom", "text"],
}

page_lines = page.extract_text_lines(
layout=True,
x_tolerance_ratio=0.1,
**kwargs
)
page.flush_cache()
page.get_textmap.cache_clear()

self.process_line(page_lines, page.page_number)
self.process_table(page)

Expand Down Expand Up @@ -63,7 +72,10 @@ def set_heights(self):
heights = []
largest_text_height, second_largest_text_height = 0, 0
for page in self.pages:
for line in page.extract_text_lines():
lines = page.extract_text_lines()
page.flush_cache()
page.get_textmap.cache_clear()
for line in lines:
height = int(line["bottom"] - line["top"])
heights.append(height)
if height > largest_text_height:
Expand Down

0 comments on commit 3433cc2

Please sign in to comment.