diff --git a/Makefile b/Makefile index 97ff6bb4f..7e92334d0 100644 --- a/Makefile +++ b/Makefile @@ -96,6 +96,11 @@ coverage: .PHONY: test test: +# Ideally, it should be ok to run without installing tparse locally. +# However, there may be some issues that arise from running the tests +# in the container. If you encounter any issues, please install tparse +# locally via `go install github.com/mfridman/tparse/cmd/tparse@latest` +# and run the tests locally. @if [ "${OCR}" = "true" ]; then \ docker run --rm \ -v $(PWD):/${SERVICE_NAME} \ diff --git a/pkg/component/operator/document/v0/README.mdx b/pkg/component/operator/document/v0/README.mdx index 41671ce7a..28cee3963 100644 --- a/pkg/component/operator/document/v0/README.mdx +++ b/pkg/component/operator/document/v0/README.mdx @@ -59,6 +59,7 @@ Convert document to text in Markdown format. | Images (optional) | `images` | array[string] | Images extracted from the document | | Error (optional) | `error` | string | Error message if any during the conversion process | | All Page Images (optional) | `all-page-images` | array[string] | The image contains all the pages in the document if we detect there could be images in the page. It will only support DOCX/DOC/PPTX/PPT/PDF. | +| Markdowns (optional) | `markdowns` | array[string] | Markdown text converted from the PDF document, separated by page. | ### Convert to Text diff --git a/pkg/component/operator/document/v0/config/tasks.json b/pkg/component/operator/document/v0/config/tasks.json index fcf549ccf..1c4fc96e7 100644 --- a/pkg/component/operator/document/v0/config/tasks.json +++ b/pkg/component/operator/document/v0/config/tasks.json @@ -108,6 +108,16 @@ }, "title": "All Page Images", "type": "array" + }, + "markdowns": { + "description": "Markdown text converted from the PDF document, separated by page.", + "instillFormat": "array:string", + "instillUIOrder": 5, + "items": { + "type": "string" + }, + "title": "Markdowns", + "type": "array" } }, "required": [ diff --git a/pkg/component/operator/document/v0/convert_document_to_markdown.go b/pkg/component/operator/document/v0/convert_document_to_markdown.go index 49e1f92c7..a82e7c44c 100644 --- a/pkg/component/operator/document/v0/convert_document_to_markdown.go +++ b/pkg/component/operator/document/v0/convert_document_to_markdown.go @@ -53,6 +53,7 @@ func (e *execution) convertDocumentToMarkdown(ctx context.Context, job *base.Job } return images }(), + Markdowns: transformerOutputStruct.Markdowns, } err = job.Output.WriteData(ctx, outputStruct) diff --git a/pkg/component/operator/document/v0/convert_document_to_markdown_test.go b/pkg/component/operator/document/v0/convert_document_to_markdown_test.go index a13be4dc7..9618b8da9 100644 --- a/pkg/component/operator/document/v0/convert_document_to_markdown_test.go +++ b/pkg/component/operator/document/v0/convert_document_to_markdown_test.go @@ -30,6 +30,7 @@ func TestConvertDocumentToMarkdown(t *testing.T) { Body: "# This is test file for markdown\n", Images: []format.Image{}, AllPageImages: []format.Image{}, + Markdowns: []string{"# This is test file for markdown\n"}, }, }, { @@ -39,6 +40,7 @@ func TestConvertDocumentToMarkdown(t *testing.T) { Body: "# This is test file for markdown\n", Images: []format.Image{}, AllPageImages: []format.Image{}, + Markdowns: []string{"# This is test file for markdown\n"}, }, }, { @@ -57,6 +59,7 @@ func TestConvertDocumentToMarkdown(t *testing.T) { Body: "# This is test file for markdown\n", Images: []format.Image{}, AllPageImages: []format.Image{}, + Markdowns: []string{"# This is test file for markdown\n"}, }, }, { diff --git a/pkg/component/operator/document/v0/execution/pdf_checker.py b/pkg/component/operator/document/v0/execution/pdf_checker.py deleted file mode 100644 index de596d35e..000000000 --- a/pkg/component/operator/document/v0/execution/pdf_checker.py +++ /dev/null @@ -1,23 +0,0 @@ -from io import BytesIO -import json -import base64 -import sys - -# TODO chuang8511: -# Deal with the import error when running the code in the docker container. -# Now, we combine all python code into one file to avoid the import error. -# from pdf_to_markdown import PDFTransformer - -if __name__ == "__main__": - json_str = sys.stdin.buffer.read().decode('utf-8') - params = json.loads(json_str) - pdf_string = params["PDF"] - - decoded_bytes = base64.b64decode(pdf_string) - pdf_file_obj = BytesIO(decoded_bytes) - pdf = PDFTransformer(x=pdf_file_obj) - pages = pdf.raw_pages - output = { - "required": len(pages) == 0, - } - print(json.dumps(output)) diff --git a/pkg/component/operator/document/v0/execution/task_convert_to_images.py b/pkg/component/operator/document/v0/execution/task_convert_to_images.py deleted file mode 100644 index 6ab77416a..000000000 --- a/pkg/component/operator/document/v0/execution/task_convert_to_images.py +++ /dev/null @@ -1,38 +0,0 @@ -from io import BytesIO -import json -import base64 -import sys - -# TODO chuang8511: -# Deal with the import error when running the code in the docker container. -# Now, we combine all python code into one file to avoid the import error. -# from pdf_to_markdown import PDFTransformer -# from pdf_to_markdown import PageImageProcessor - -if __name__ == "__main__": - json_str = sys.stdin.buffer.read().decode('utf-8') - params = json.loads(json_str) - filename = params["filename"] - pdf_string = params["PDF"] - - decoded_bytes = base64.b64decode(pdf_string) - pdf_file_obj = BytesIO(decoded_bytes) - pdf = PDFTransformer(x=pdf_file_obj) - pages = pdf.raw_pages - exclude_file_extension = filename.split(".")[0] - filenames = [] - images = [] - - for i, page in enumerate(pages): - page_image = page.to_image(resolution=500) - encoded_image = PageImageProcessor.encode_image(page_image) - images.append(encoded_image) - filenames.append(f"{exclude_file_extension}_{i}.png") - - - output = { - "images": images, - "filename": filenames, - } - - print(json.dumps(output)) diff --git a/pkg/component/operator/document/v0/execution/task_convert_to_markdown.py b/pkg/component/operator/document/v0/execution/task_convert_to_markdown.py deleted file mode 100644 index a7e6b70e6..000000000 --- a/pkg/component/operator/document/v0/execution/task_convert_to_markdown.py +++ /dev/null @@ -1,64 +0,0 @@ -from io import BytesIO -import json -import base64 -import sys - -# TODO chuang8511: -# Deal with the import error when running the code in the docker container. -# Now, we combine all python code into one file to avoid the import error. -# from pdf_to_markdown import PDFTransformer - - -if __name__ == "__main__": - json_str = sys.stdin.buffer.read().decode('utf-8') - params = json.loads(json_str) - display_image_tag = params["display-image-tag"] - display_all_page_image = params["display-all-page-image"] - pdf_string = params["PDF"] - decoded_bytes = base64.b64decode(pdf_string) - pdf_file_obj = BytesIO(decoded_bytes) - pdf = PDFTransformer(pdf_file_obj, display_image_tag) - - result = "" - images = [] - separator_number = 30 - image_idx = 0 - errors = [] - all_page_images = [] - - try: - times = len(pdf.raw_pages) // separator_number + 1 - for i in range(times): - pdf = PDFTransformer(pdf_file_obj, display_image_tag, image_idx) - if i == times - 1: - pdf.pages = pdf.raw_pages[i*separator_number:] - else: - pdf.pages = pdf.raw_pages[i*separator_number:(i+1)*separator_number] - - pdf.preprocess() - image_idx = pdf.image_index - result += pdf.execute() - for image in pdf.base64_images: - images.append(image) - - if display_all_page_image: - raw_pages = pdf.raw_pages - - for page_number in pdf.page_numbers_with_images: - page = raw_pages[page_number - 1] - page_image = page.to_image(resolution=500) - encoded_image = PageImageProcessor.encode_image(page_image) - all_page_images.append(encoded_image) - - errors += pdf.errors - - output = { - "body": result, - "images": images, - "parsing_error": errors, - "all_page_images": all_page_images, - "display_all_page_image": display_all_page_image, - } - print(json.dumps(output)) - except Exception as e: - print(json.dumps({"system_error": str(e)})) diff --git a/pkg/component/operator/document/v0/io.go b/pkg/component/operator/document/v0/io.go index b8516eed3..829f651b9 100644 --- a/pkg/component/operator/document/v0/io.go +++ b/pkg/component/operator/document/v0/io.go @@ -15,6 +15,7 @@ type ConvertDocumentToMarkdownOutput struct { Images []format.Image `instill:"images"` Error string `instill:"error"` AllPageImages []format.Image `instill:"all-page-images"` + Markdowns []string `instill:"markdowns"` } type ConvertDocumentToImagesInput struct { diff --git a/pkg/component/operator/document/v0/pdf_to_markdown/__init__.py b/pkg/component/operator/document/v0/pdf_to_markdown/__init__.py deleted file mode 100644 index 2fa4bd0d3..000000000 --- a/pkg/component/operator/document/v0/pdf_to_markdown/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from page_image_processor import PageImageProcessor -from pdf_transformer import PDFTransformer diff --git a/pkg/component/operator/document/v0/pdf_to_markdown/page_image_processor.py b/pkg/component/operator/document/v0/pdf_to_markdown/page_image_processor.py deleted file mode 100644 index 35a676ebb..000000000 --- a/pkg/component/operator/document/v0/pdf_to_markdown/page_image_processor.py +++ /dev/null @@ -1,307 +0,0 @@ -import base64 -from io import BytesIO -from PIL import Image - -from pdfplumber.page import Page -from pdfplumber.display import PageImage - -class PageImageProcessor: - page: Page - errors: list[str] - images: list[dict] - - def __init__(self, page: Page, image_index: int): - self.page = page - self.lines = page.extract_text_lines(layout=True, strip=True, return_chars=False) - self.images = [] - self.errors = [] - page.flush_cache() - page.get_textmap.cache_clear() - self.image_index = image_index - - def produce_images_by_blocks(self) -> None: - saved_blocks = [] - page = self.page - images = page.images - - # Process images detect by pdfplumber - for i, image in enumerate(images): - bbox = (image["x0"], image["top"], image["x1"], image["bottom"]) - # There is a bug in pdfplumber that it can't target the image position correctly. - try: - img_page = page.crop(bbox=bbox) - except Exception as e: - self.errors.append(f"image {i} got error: {str(e)}, so it convert all pages into image.") - bbox = (0, 0, page.width, page.height) - img_page = page - - img_obj = img_page.to_image(resolution=500) - img_base64 = self.__class__.encode_image(image=img_obj) - - image["page_number"] = page.page_number - image["img_number"] = self.image_index - self.image_index += 1 - image["img_base64"] = img_base64 - saved_blocks.append(bbox) - self.images.append(image) - - # (x0, top, x1, bottom) - blocks = self.calculate_blank_blocks(page=page) - - for i, block in enumerate(blocks): - block_dict = self.get_block_dict(block) - block_image = { - "page_number": int, - "img_number": int, - "img_base64": str, - "top": block_dict["top"], - "bottom": block_dict["bottom"], - } - overlap = False - for saved_block in saved_blocks: - if self.is_overlap(block1=saved_block, block2=block): - overlap = True - break - if overlap: - continue - - if self.image_too_small(block=block): - continue - - if self.low_possibility_to_be_image(block=block): - continue - - try: - cropped_page = page.crop(block) - except Exception as e: - self.errors.append(f"image {i} got error: {str(e)}, do not convert the images.") - continue - - im = cropped_page.to_image(resolution=200) - - if self.is_blank_pil_image(im=im): - continue - - img_base64 = self.__class__.encode_image(image=im) - block_image["page_number"] = page.page_number - block_image["img_number"] = self.image_index - self.image_index += 1 - block_image["img_base64"] = img_base64 - self.images.append(block_image) - - def encode_image(image: PageImage) -> str: - buffer = BytesIO() - image.save(buffer, format="PNG") - buffer.seek(0) - img_data = buffer.getvalue() - return "data:image/png;base64," + base64.b64encode(img_data).decode("utf-8") - - def get_block_dict(self, block: tuple) -> dict: - return { - "x0": block[0], - "top": block[1], - "x1": block[2], - "bottom": block[3], - } - - def calculate_blank_blocks(self, page: Page) -> list[tuple]: - page_width = page.width - page_height = page.height - lines = self.lines - - page.flush_cache() - page.get_textmap.cache_clear() - - blank_blocks = [] - - # Track the bottom of the last line processed - last_bottom = 0 # Start from the top of the page - - # Check for empty spaces before the first line - if lines: - first_line = lines[0] - if first_line["top"] > 0: - blank_blocks.append((0, 0, page_width, first_line["top"])) - - # Process each line to find blank areas between them - for i, line in enumerate(lines): - # Calculate the blank space above the current line - if i > 0: - previous_line = lines[i - 1] - if line["top"] > previous_line["bottom"]: - # (x0, top, x1, bottom) - blank_blocks.append((0, previous_line["bottom"], page_width, line["top"])) - - # Update last_bottom to the current line's bottom - last_bottom = line["bottom"] - - # Check for empty spaces after the last line - if last_bottom < page_height: - blank_blocks.append((0, last_bottom, page_width, page_height)) - - - return blank_blocks + self.calculate_horizontal_blocks(lines=lines, page_width=page_width, tolerance=30) - - def calculate_horizontal_blocks(self, lines: list[dict[str, any]], page_width: float, tolerance: float) -> list[tuple]: - """ - Calculates horizontal blocks (blank spaces) on the left or right side of text lines. - - Parameters: - - lines: A list of dictionaries, each representing a line of text with 'x0', 'x1', 'top', and 'bottom' attributes. - - page_width: The width of the page being processed. - - tolerance: to tolerate the block judgement - - Returns: - - A list of tuples representing the horizontal blocks (x0, top, x1, bottom) where no text exists. - """ - if not lines: - return [] - - left_blocks = [] - right_blocks = [] - - # Sort the lines by their vertical position (top) - sorted_lines = sorted(lines, key=lambda l: l["top"]) - - found_block = False - block_start_line = None - - for i in range(1, len(sorted_lines)): - - # Check if the block starts with first line - # 4 is number to be tuned - if not block_start_line and page_width / 4 < sorted_lines[i]["x0"]: - block_start_line = sorted_lines[i] - line_count = 1 - - current_line = sorted_lines[i] - previous_line = sorted_lines[i - 1] - if not found_block and block_start_line and abs(current_line["x0"] - block_start_line["x0"]) < tolerance: - line_count += 1 - if line_count > 5: - found_block = True - block_start_top = block_start_line["top"] - - elif not block_start_line and abs(current_line["x0"] - previous_line["x0"]) > tolerance: - block_start_line = current_line - line_count = 1 - elif not found_block and block_start_line: - block_start_line = None - line_count = 0 - - if found_block and abs(current_line["x0"] - block_start_line["x0"]) > tolerance: - # Finalize the left block up to the previous line - left_blocks.append((0, block_start_top, previous_line["x0"], previous_line["bottom"])) - found_block= False - block_start_line = None - line_count = 0 - - - found_block = False - block_start_line = None - - for i in range(1, len(sorted_lines)): - if not block_start_line and page_width / 4 < page_width - sorted_lines[i]["x1"]: - block_start_line = sorted_lines[i] - line_count = 1 - - current_line = sorted_lines[i] - previous_line = sorted_lines[i - 1] - - if not found_block and block_start_line and abs(current_line["x1"] - block_start_line["x1"]) < tolerance: - line_count += 1 - if line_count > 5: - found_block = True - block_start_top = block_start_line["top"] - - elif not block_start_line and (current_line["x1"] - previous_line["x1"]) > tolerance: - block_start_line = current_line - line_count = 1 - - elif not found_block and block_start_line: - block_start_line = None - line_count = 0 - - if found_block and abs(current_line["x1"] - block_start_line["x1"]) > tolerance: - # Finalize the right block up to the previous line - right_blocks.append((previous_line["x1"], block_start_top, page_width, previous_line["bottom"])) - found_block= False - block_start_line = None - line_count = 0 - - return left_blocks + right_blocks - - # (x0, top, x1, bottom) - def image_too_small(self, block: tuple) -> bool: - image_width = block[2] - block[0] - image_height = block[3] - block[1] - size = image_width * image_height - # This is a number to be tuned - return size < 15000 - - def low_possibility_to_be_image(self, block: tuple, min_size: int = 20, max_aspect_ratio: float = 10.0) -> bool: - """ - Determine if a block has a low likelihood of being an image based on its dimensions. - - Parameters: - - block: A 4-tuple (x0, top, x1, bottom) representing the coordinates of the block. - - min_size: The minimum width/height required for a block to be considered a potential image. - - max_aspect_ratio: The maximum allowed width-to-height (or height-to-width) ratio for a block to be considered an image. - - Returns: - - True if the block is unlikely to be an image, False otherwise. - """ - # Calculate the width and height of the block - image_width = block[2] - block[0] # x1 - x0 - image_height = block[3] - block[1] # bottom - top - - # Check if the block is too small to be an image - if image_width < min_size or image_height < min_size: - return True - - # Calculate the aspect ratio - aspect_ratio = image_width / image_height if image_height != 0 else float('inf') - - # Check if the aspect ratio is too extreme to be an image - if aspect_ratio > max_aspect_ratio or aspect_ratio < 1 / max_aspect_ratio: - return True - - # Otherwise, it's not considered "low possibility" of being an image - return False - - def is_blank_pil_image(self, im: Image.Image) -> bool: - """ - Check if an in-memory image (from pdfplumber) is blank using Pillow, without NumPy. - """ - pil_image = im.original # im.original is a PIL Image object - - # Get extrema (min, max) for each channel (for grayscale, there will be one tuple) - extrema = pil_image.getextrema() - - # If the extrema (min, max) values are the same, the image is uniform (blank) - if isinstance(extrema, tuple) and all(min_val == max_val for min_val, max_val in extrema): - return True - return False - - def is_overlap(self, block1: tuple, block2: tuple) -> bool: - """ - Determines if two blocks (x0, top, x1, bottom) overlap. - - Parameters: - - block1: A tuple representing the first block (x0, top, x1, bottom). - - block2: A tuple representing the second block (x0, top, x1, bottom). - - Returns: - - True if the blocks overlap, False otherwise. - """ - x0_1, top_1, x1_1, bottom_1 = block1 - x0_2, top_2, x1_2, bottom_2 = block2 - - # Check for horizontal overlap - horizontal_overlap = (x0_1 < x1_2 and x1_1 > x0_2) - - # Check for vertical overlap - vertical_overlap = (top_1 < bottom_2 and bottom_1 > top_2) - - # If both horizontal and vertical overlaps are true, the blocks overlap - return horizontal_overlap and vertical_overlap diff --git a/pkg/component/operator/document/v0/pdf_to_markdown/pdf_transformer.py b/pkg/component/operator/document/v0/pdf_to_markdown/pdf_transformer.py deleted file mode 100644 index cf5cead25..000000000 --- a/pkg/component/operator/document/v0/pdf_to_markdown/pdf_transformer.py +++ /dev/null @@ -1,439 +0,0 @@ - -from io import BytesIO -from collections import Counter - -import pdfplumber -from pdfplumber.page import Page - -# TODO chuang8511: -# Deal with the import error when running the code in the docker container. -# Now, we combine all python code into one file to avoid the import error. -# from page_image_processor import PageImageProcessor - - -class PDFTransformer: - pdf: pdfplumber.PDF - raw_pages: list[Page] - metadata: dict - display_image_tag: bool - image_index: int - errors: list[str] - pages: list[Page] - lines: list[dict] - images: list[dict] - tables: list[dict] - base64_images: list[dict] - page_numbers_with_images: list[int] - - def __init__(self, x: BytesIO, display_image_tag: bool = False, image_index: int = 0): - self.pdf = pdfplumber.open(x) - self.raw_pages = self.pdf.pages - self.metadata = self.pdf.metadata - self.display_image_tag = display_image_tag - self.image_index = image_index - self.errors = [] - self.page_numbers_with_images = [] - - def preprocess(self): - self.set_heights() - self.lines = [] - self.tables = [] - self.images = [] - self.base64_images = [] - if self.display_image_tag: - self.process_image(self.image_index) - - for page in self.pages: - page_lines = page.extract_text_lines(layout=True, x_tolerance_ratio=0.1, return_chars= False) - page.flush_cache() - page.get_textmap.cache_clear() - - self.process_line(page_lines, page.page_number) - self.process_table(page) - - self.set_paragraph_information(self.lines) - - self.result = "" - - def process_image(self, i: int): - image_index = i - for page in self.pages: - image_processor = PageImageProcessor(page=page, image_index=image_index) - image_processor.produce_images_by_blocks() - processed_images = image_processor.images - self.images += processed_images - image_index = image_processor.image_index - - if page.page_number not in self.page_numbers_with_images: - self.page_numbers_with_images.append(page.page_number) - - self.image_index = image_processor.image_index - - def set_heights(self): - tolerance = 0.95 - heights = [] - largest_text_height, second_largest_text_height = 0, 0 - for page in self.pages: - lines = page.extract_text_lines(layout=True, x_tolerance_ratio=0.1, return_chars= False) - page.flush_cache() - page.get_textmap.cache_clear() - for line in lines: - height = int(line["bottom"] - line["top"]) - heights.append(height) - if height > largest_text_height: - second_largest_text_height = largest_text_height - largest_text_height = height - elif height > second_largest_text_height and height < largest_text_height: - second_largest_text_height = height - - counter = Counter(heights) - - # if there are too many subtitles, we don't use the title height. - # 50 is a temp number. It should be tuned. - if counter[largest_text_height] > 50: - self.title_height = float("inf") - else: - self.title_height = round(largest_text_height * tolerance) - - if counter[second_largest_text_height] > 50 or self.title_height == float("inf"): - self.subtitle_height = float("inf") - else: - self.subtitle_height = round(second_largest_text_height * tolerance) - - def set_paragraph_information(self, lines: list[dict]): - def round_to_nearest_upper_bound(value, step=3): # for the golden sample case - """ - Round the value to the nearest upper bound based on the given step. - For example, with step=3: 0~3 -> 3, 3~6 -> 6, etc. - """ - return ((value // step) + 1) * step - - distances = [] - paragraph_width = 0 - distances_to_left = [] - - for _, line in enumerate(lines): - if line["distance_to_next_line"] and line["distance_to_next_line"] > 0: - # Round the distance to the nearest integer and add to the list - rounded_distance = round_to_nearest_upper_bound(line["distance_to_next_line"]) - distances.append(rounded_distance) - - if line["line_width"] > paragraph_width: - paragraph_width = line["line_width"] - - if line["x0"]: - distances_to_left.append(line["x0"]) - - # Find the most common distance - if distances: - common_distance = Counter(distances).most_common(1)[0][0] - else: - common_distance = 10 ## default value - - if distances_to_left: - zero_indent_distance = min(distances_to_left) - else: - zero_indent_distance = 0 - paragraph_distance = common_distance * 1.5 - self.paragraph_distance = paragraph_distance - self.paragraph_width = paragraph_width - self.zero_indent_distance = zero_indent_distance - - def execute(self): - self.set_line_type(self.title_height, self.subtitle_height, "indent") - self.result = self.transform_line_to_markdown(self.lines) - return self.result - - # It can add more calculation for the future development when we want to extend more use cases. - def process_line(self, lines: list[dict], page_number: int): - for idx, line in enumerate(lines): - line["line_height"] = line["bottom"] - line["top"] - line["line_width"] = line["x1"] - line["x0"] - line["middle"] = (line["x1"] + line["x0"]) / 2 - line["distance_to_next_line"] = lines[idx+1]["top"] - line["bottom"] if idx < len(lines) - 1 else None - line["page_number"] = page_number - self.lines.append(line) - - def process_table(self, page: Page): - tables = page.find_tables( - table_settings={ - "vertical_strategy": "lines", - "horizontal_strategy": "lines", - } - ) - if tables: - for table in tables: - table_info = {} - table_info["bbox"] = table.bbox - text = table.extract() - table_info["text"] = text - table_info["page_number"] = page.page_number - self.tables.append(table_info) - - # TODO: Implement paragraph strategy - def paragraph_strategy(self, lines: list[dict], subtitle_height: int = 14): - # TODO: Implement paragraph strategy - # judge the non-title line in a page. - # If there is a line with indent, return "indent" - # If there is a line with no indent, return "no-indent" - return "indent" - paragraph_lines_start_positions = [] - for line in lines: - if line["line_height"] < subtitle_height: - paragraph_lines_start_positions.append(line["x0"]) - - def set_line_type(self, title_height: int = 16, subtitle_height: int = 14, paragraph_strategy: str = "indent"): - lines = self.lines - current_paragraph = [] - paragraph_start_position = 0 - paragraph_idx = 1 - - for i, line in enumerate(lines): - if line['line_height'] >= title_height: - line["type"] = 'title' - if current_paragraph: - for line_in_paragraph in current_paragraph: - line_in_paragraph["type"] = f'paragraph {paragraph_idx}' - paragraph_idx += 1 - current_paragraph = [] - - elif line['line_height'] >= subtitle_height: - line["type"] = 'subtitle' - if current_paragraph: - for line_in_paragraph in current_paragraph: - line_in_paragraph["type"] = f'paragraph {paragraph_idx}' - paragraph_idx += 1 - current_paragraph = [] - else: - line["type"] = 'paragraph' - if current_paragraph: - current_paragraph.append(line) - - if ((paragraph_strategy == "indent" and i < len(lines) - 1 and - ( # if the next line starts a new paragraph - abs(lines[i+1]['x0'] - paragraph_start_position) < 10 - # if the next line is not in the same layer - # or abs(line["middle"] - lines[i+1]["middle"]) > 5 - ) - ) or - (paragraph_strategy == "no-indent" - and line["distance_to_next_line"] - and line["distance_to_next_line"] > 10) or - (i == len(lines) - 1) # final line - ): - - for line_in_paragraph in current_paragraph: - line_in_paragraph["type"] = f'paragraph {paragraph_idx}' - - paragraph_idx += 1 - current_paragraph = [] - else: - current_paragraph = [line] - paragraph_start_position = line["x0"] - self.lines = lines - - def transform_line_to_markdown(self, lines: list[dict]): - result = "" - to_be_processed_table = [] - for i, line in enumerate(lines): - table = self.meet_table(line, line["page_number"]) - if table and table not in to_be_processed_table: - to_be_processed_table.append(table) - elif table and table in to_be_processed_table: - continue - elif to_be_processed_table: - for table in to_be_processed_table: - result += "\n\n" - result += self.transform_table_markdown(table) - result += "\n\n" - self.tables.remove(table) - to_be_processed_table = [] - - if (i > 0 and - ("title" == lines[i-1]["type"] and "title" == lines[i]["type"] or - "subtitle" == lines[i-1]["type"] and "subtitle" == lines[i]["type"]) - ): - while len(result) > 0 and result[-1] == "\n": - result = result[:-1] - - line_text = self.line_process(line, i, lines, result) - ## If line_text prefix or suffix is \n, remove them - while line_text.startswith("\n") or line_text.endswith("\n"): - line_text = line_text.strip("\n") - else: - line_text = self.line_process(line, i, lines, result) - while ( - (line_text.startswith("\n") or line_text.endswith("\n"))): - line_text = line_text.strip("\n") - - result += line_text - result += "\n" - ## TODO: Do not change another line if it is bullet point or numbered list. - if ( - (line["distance_to_next_line"] and line["distance_to_next_line"] >= self.paragraph_distance) or - ( - line["page_number"] != lines[i+1]["page_number"] if i < len(lines) - 1 else False - and line["line_width"] < self.paragraph_width * 0.8 - ) - ): - result += "\n" - - else: - if (i > 0 and - ("title" == lines[i-1]["type"] and "title" == lines[i]["type"] or - "subtitle" == lines[i-1]["type"] and "subtitle" == lines[i]["type"]) - ): - while len(result) > 0 and result[-1] == "\n": - result = result[:-1] - - line_text = self.line_process(line, i, lines, result) - ## If line_text prefix or suffix is \n, remove them - while line_text.startswith("\n") or line_text.endswith("\n"): - line_text = line_text.strip("\n") - else: - line_text = self.line_process(line, i, lines, result) - while ( - (line_text.startswith("\n") or line_text.endswith("\n"))): - line_text = line_text.strip("\n") - - result += line_text - - ## TODO: Do not change another line if it is bullet point or numbered list. - if ( - (line["distance_to_next_line"] and line["distance_to_next_line"] >= self.paragraph_distance) or - ( - line["page_number"] != lines[i+1]["page_number"] if i < len(lines) - 1 else False - and line["line_width"] < self.paragraph_width * 0.8 - ) - ): - result += "\n" - result += "\n" - - - if i < len(lines) - 1: - result += self.insert_image(line, lines[i+1]) - else: - result += self.insert_image(line, None) - if self.tables: - processed_table = [] - for table in self.tables: - result += "\n\n" - result += self.transform_table_markdown(table) - result += "\n\n" - processed_table.append(table) - for table in processed_table: - self.tables.remove(table) - - return result - - def line_process(self, line: dict, i: int, lines: list[dict], current_result: str): - result = "" - if "type" not in line: - return line["text"] - if line["type"] == "title": - if current_result != "": - result += "\n\n" - if i > 0 and lines[i-1]["type"] == "title": - result += f" {line['text']}\n" - else: - result += f"# {line['text']}\n" - elif line["type"] == "subtitle": - if current_result != "": - result += "\n\n" - if i > 0 and lines[i-1]["type"] == "subtitle": - result += f" {line['text']}\n" - else: - result += f"## {line['text']}\n" - elif "paragraph" in line["type"]: - # Deal with indentation - if self.zero_indent_distance != 0: - indent = round((line["x0"] - self.zero_indent_distance) // 10) # to be tuned - if indent > 0: - result += " " * indent - - result += line["text"] - if ( - (i < len(lines) - 1) and - "type" in lines[i+1] and - len(lines[i+1]["type"].split(" ")) == 2 and - (int(line["type"].split(" ")[1]) < int(lines[i+1]["type"].split(" ")[1])) - ): - result += "\n" - result += "\n" - return result - - def meet_table(self, line: dict, page_number: int): - tables = self.tables - for table in tables: - if table["page_number"] == page_number: - bbox = table["bbox"] - top, bottom = bbox[1], bbox[3] - if line["top"] > top and line["bottom"] < bottom: - return table - else: - None - - def transform_table_markdown(self, table: dict): - result = "" - texts = table["text"] - for i, row in enumerate(texts): - for j, col in enumerate(row): - if col: - if "\n" in col: - col = col.replace("\n", "
") - result += col - - if j < len(row) - 1: - result += " | " - else: - if j == 0: - result += "||" - else: - result += "|" - if i == 0: - result += "\n" - ## TODO: Judge table that cross the page, - result += "|" - result += " --- |" * len(row) - result += "\n" - elif i < len(texts) - 1: - result += "\n" - - return result - - def insert_image(self, line: dict, next_line: dict): - result = "" - images = self.images - to_be_removed_images = [] - - if images: - if next_line: - # If there is image between line and next_line, we insert image. - if next_line["page_number"] == line["page_number"]: - for image in images: - if image["page_number"] == line["page_number"] and image["top"] > line["bottom"] and image["bottom"] < next_line["top"]: - result += "\n\n" - result += f"![image {image['img_number']}]({image['img_number']})" - self.base64_images.append(image["img_base64"]) - result += "\n\n" - to_be_removed_images.append(image) - elif next_line["page_number"] > line["page_number"]: - for image in images: - if image["page_number"] >= line["page_number"] and image["page_number"] < next_line["page_number"]: - result += "\n\n" - result += f"![image {image['img_number']}]({image['img_number']})" - self.base64_images.append(image["img_base64"]) - result += "\n\n" - to_be_removed_images.append(image) - - else: # if images exists and there is no next_line, we insert image. - for image in images: - result += "\n\n" - result += f"![image {image['img_number']}]({image['img_number']})" - self.base64_images.append(image["img_base64"]) - result += "\n\n" - to_be_removed_images.append(image) - for image in to_be_removed_images: - self.images.remove(image) - - return result diff --git a/pkg/component/operator/document/v0/transformer/execution/task_convert_to_markdown.py b/pkg/component/operator/document/v0/transformer/execution/task_convert_to_markdown.py index a7e6b70e6..79f0a3182 100644 --- a/pkg/component/operator/document/v0/transformer/execution/task_convert_to_markdown.py +++ b/pkg/component/operator/document/v0/transformer/execution/task_convert_to_markdown.py @@ -25,6 +25,7 @@ image_idx = 0 errors = [] all_page_images = [] + markdowns = [] try: times = len(pdf.raw_pages) // separator_number + 1 @@ -52,12 +53,15 @@ errors += pdf.errors + markdowns += pdf.markdowns + output = { "body": result, "images": images, "parsing_error": errors, "all_page_images": all_page_images, "display_all_page_image": display_all_page_image, + "markdowns": markdowns, } print(json.dumps(output)) except Exception as e: diff --git a/pkg/component/operator/document/v0/transformer/markdown.go b/pkg/component/operator/document/v0/transformer/markdown.go index eee450644..eff7b52a8 100644 --- a/pkg/component/operator/document/v0/transformer/markdown.go +++ b/pkg/component/operator/document/v0/transformer/markdown.go @@ -20,6 +20,7 @@ type ConvertDocumentToMarkdownTransformerOutput struct { Images []string `json:"images,omitempty"` Error string `json:"error,omitempty"` AllPageImages []string `json:"all-page-images,omitempty"` + Markdowns []string `json:"markdowns"` } func ConvertDocumentToMarkdown(inputStruct *ConvertDocumentToMarkdownTransformerInput, transformerGetter MarkdownTransformerGetterFunc) (*ConvertDocumentToMarkdownTransformerOutput, error) { @@ -50,6 +51,7 @@ func ConvertDocumentToMarkdown(inputStruct *ConvertDocumentToMarkdownTransformer Images: converterOutput.Images, Error: strings.Join(converterOutput.ParsingError, "\n"), AllPageImages: converterOutput.AllPageImages, + Markdowns: converterOutput.Markdowns, } if inputStruct.Filename != "" { diff --git a/pkg/component/operator/document/v0/transformer/pdf_to_markdown/pdf_transformer.py b/pkg/component/operator/document/v0/transformer/pdf_to_markdown/pdf_transformer.py index cf5cead25..9c8df5b3d 100644 --- a/pkg/component/operator/document/v0/transformer/pdf_to_markdown/pdf_transformer.py +++ b/pkg/component/operator/document/v0/transformer/pdf_to_markdown/pdf_transformer.py @@ -24,6 +24,8 @@ class PDFTransformer: tables: list[dict] base64_images: list[dict] page_numbers_with_images: list[int] + # This is the result of the markdown transformation divided by pages. + markdowns: list[str] def __init__(self, x: BytesIO, display_image_tag: bool = False, image_index: int = 0): self.pdf = pdfplumber.open(x) @@ -54,6 +56,7 @@ def preprocess(self): self.set_paragraph_information(self.lines) self.result = "" + self.markdowns = len(self.pdf.pages) * [""] def process_image(self, i: int): image_index = i @@ -157,8 +160,8 @@ def process_line(self, lines: list[dict], page_number: int): def process_table(self, page: Page): tables = page.find_tables( table_settings={ - "vertical_strategy": "lines", - "horizontal_strategy": "lines", + "vertical_strategy": "lines_strict", + "horizontal_strategy": "lines_strict", } ) if tables: @@ -235,11 +238,20 @@ def set_line_type(self, title_height: int = 16, subtitle_height: int = 14, parag def transform_line_to_markdown(self, lines: list[dict]): result = "" to_be_processed_table = [] + need_append_to_markdowns = False + page_number = 0 + for i, line in enumerate(lines): table = self.meet_table(line, line["page_number"]) if table and table not in to_be_processed_table: to_be_processed_table.append(table) elif table and table in to_be_processed_table: + + # Deal with markdowns. If the table is the last element in the page, we need to add the table to the previous markdowns. + if i < len(lines) - 1 and line["page_number"] != lines[i+1]["page_number"]: + need_append_to_markdowns = True + page_number = line["page_number"] + continue elif to_be_processed_table: for table in to_be_processed_table: @@ -310,10 +322,27 @@ def transform_line_to_markdown(self, lines: list[dict]): result += "\n" + # Insert image sections if i < len(lines) - 1: result += self.insert_image(line, lines[i+1]) else: result += self.insert_image(line, None) + + + # Deal with markdowns. + # If the table is the last element in the page, we need to add the table to the previous markdowns. + if need_append_to_markdowns: + self.markdowns[page_number] = result + result = "" + need_append_to_markdowns = False + page_number = 0 + + # If the next line is in the next page, we need to add the result to the markdowns. + elif i < len(lines) - 1 and line["page_number"] != lines[i+1]["page_number"]: + self.markdowns[line["page_number"] - 1] = result + result = "" + + if self.tables: processed_table = [] for table in self.tables: @@ -324,7 +353,16 @@ def transform_line_to_markdown(self, lines: list[dict]): for table in processed_table: self.tables.remove(table) - return result + # Deal with the last page for markdowns + if result: + self.markdowns[lines[-1]["page_number"] - 1] = result + + combined_markdown = "" + + for markdown in self.markdowns: + combined_markdown += markdown + + return combined_markdown def line_process(self, line: dict, i: int, lines: list[dict], current_result: str): result = "" diff --git a/pkg/component/operator/document/v0/transformer/pdftomarkdown.go b/pkg/component/operator/document/v0/transformer/pdftomarkdown.go index 52267d1fc..3d0a3ff3f 100644 --- a/pkg/component/operator/document/v0/transformer/pdftomarkdown.go +++ b/pkg/component/operator/document/v0/transformer/pdftomarkdown.go @@ -15,6 +15,7 @@ type converterOutput struct { SystemError string `json:"system_error"` AllPageImages []string `json:"all_page_images"` AllPage bool `json:"display_all_page_image"` + Markdowns []string `json:"markdowns"` } func convertPDFToMarkdownWithPDFPlumber(base64Text string, displayImageTag bool, displayAllPage bool) (converterOutput, error) {