diff --git a/test_unstructured/partition/common/test_common.py b/test_unstructured/partition/common/test_common.py index 441f0c51f9..277d7ace52 100644 --- a/test_unstructured/partition/common/test_common.py +++ b/test_unstructured/partition/common/test_common.py @@ -1,14 +1,11 @@ import pathlib -from dataclasses import dataclass from multiprocessing import Pool -from unittest import mock import numpy as np import pytest from PIL import Image from unstructured_inference.inference import layout from unstructured_inference.inference.elements import TextRegion -from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layoutelement import LayoutElement from test_unstructured.unit_utils import example_doc_path @@ -29,7 +26,6 @@ Image as ImageElement, ) from unstructured.partition.common import common -from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT class MockPageLayout(layout.PageLayout): @@ -399,84 +395,12 @@ def test_contains_emoji(text, expected): assert common.contains_emoji(text) is expected -def test_document_to_element_list_omits_coord_system_when_coord_points_absent(): - layout_elem_absent_coordinates = MockDocumentLayout() - for page in layout_elem_absent_coordinates.pages: - for el in page.elements: - el.bbox = None - elements = common.document_to_element_list(layout_elem_absent_coordinates) - assert elements[0].metadata.coordinates is None - - def test_get_page_image_metadata_and_coordinate_system(): doc = MockDocumentLayout() - metadata = common._get_page_image_metadata(doc.pages[0]) + metadata = common.get_page_image_metadata(doc.pages[0]) assert isinstance(metadata, dict) -@dataclass -class MockImage: - width = 640 - height = 480 - format = "JPG" - - -def test_document_to_element_list_handles_parent(): - block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") - block2 = LayoutElement.from_coords( - 1, - 2, - 3, - 4, - text="block 2", - parent=block1, - type="NarrativeText", - ) - page = PageLayout( - number=1, - image=MockImage(), - ) - page.elements = [block1, block2] - doc = DocumentLayout.from_pages([page]) - el1, el2 = common.document_to_element_list(doc) - assert el2.metadata.parent_id == el1.id - - -@pytest.mark.parametrize( - ("sort_mode", "call_count"), - [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)], -) -def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count): - block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") - block2 = LayoutElement.from_coords( - 1, - 2, - 3, - 4, - text="block 2", - parent=block1, - type="NarrativeText", - ) - page = PageLayout( - number=1, - image=MockImage(), - ) - page.elements = [block1, block2] - doc = DocumentLayout.from_pages([page]) - with mock.patch.object(common, "sort_page_elements") as mock_sort_page_elements: - common.document_to_element_list(doc, sortable=True, sort_mode=sort_mode) - assert mock_sort_page_elements.call_count == call_count - - -def test_document_to_element_list_sets_category_depth_titles(): - layout_with_hierarchies = MockDocumentLayout() - elements = common.document_to_element_list(layout_with_hierarchies) - assert elements[0].metadata.category_depth == 1 - assert elements[1].metadata.category_depth == 2 - assert elements[2].metadata.category_depth is None - assert elements[3].metadata.category_depth == 0 - - def test_ocr_data_to_elements( filename=example_doc_path("img/layout-parser-paper-fast.jpg"), ): diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index dc574e8d8d..cea6b44129 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -5,6 +5,7 @@ import math import os import tempfile +from dataclasses import dataclass from pathlib import Path from tempfile import SpooledTemporaryFile from unittest import mock @@ -14,6 +15,8 @@ from PIL import Image from pytest_mock import MockFixture from unstructured_inference.inference import layout +from unstructured_inference.inference.layout import DocumentLayout, PageLayout +from unstructured_inference.inference.layoutelement import LayoutElement from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from unstructured.chunking.title import chunk_by_title @@ -35,6 +38,9 @@ from unstructured.partition.pdf_image import ocr, pdfminer_processing from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots from unstructured.partition.utils.constants import ( + SORT_MODE_BASIC, + SORT_MODE_DONT, + SORT_MODE_XY_CUT, UNSTRUCTURED_INCLUDE_DEBUG_METADATA, PartitionStrategy, ) @@ -95,6 +101,37 @@ def __init__(self, number: int, image: Image): ] +class MockSinglePageLayout(layout.PageLayout): + def __init__(self, number: int, image: Image.Image): + self.number = number + self.image = image + + @property + def elements(self): + return [ + LayoutElement( + type="Headline", + text="Charlie Brown and the Great Pumpkin", + bbox=None, + ), + LayoutElement( + type="Subheadline", + text="The Beginning", + bbox=None, + ), + LayoutElement( + type="Text", + text="This time Charlie Brown had it really tricky...", + bbox=None, + ), + LayoutElement( + type="Title", + text="Another book title in the same page", + bbox=None, + ), + ] + + class MockDocumentLayout(layout.DocumentLayout): @property def pages(self): @@ -104,6 +141,14 @@ def pages(self): ] +class MockSinglePageDocumentLayout(layout.DocumentLayout): + @property + def pages(self): + return [ + MockSinglePageLayout(number=1, image=Image.new("1", (1, 1))), + ] + + @pytest.mark.parametrize( ("filename", "file"), [ @@ -1398,3 +1443,75 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_ pdf_hi_res_max_pages=pdf_hi_res_max_pages, is_image=is_image, ) + + +def test_document_to_element_list_omits_coord_system_when_coord_points_absent(): + layout_elem_absent_coordinates = MockSinglePageDocumentLayout() + for page in layout_elem_absent_coordinates.pages: + for el in page.elements: + el.bbox = None + elements = pdf.document_to_element_list(layout_elem_absent_coordinates) + assert elements[0].metadata.coordinates is None + + +@dataclass +class MockImage: + width = 640 + height = 480 + format = "JPG" + + +def test_document_to_element_list_handles_parent(): + block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") + block2 = LayoutElement.from_coords( + 1, + 2, + 3, + 4, + text="block 2", + parent=block1, + type="NarrativeText", + ) + page = PageLayout( + number=1, + image=MockImage(), + ) + page.elements = [block1, block2] + doc = DocumentLayout.from_pages([page]) + el1, el2 = pdf.document_to_element_list(doc) + assert el2.metadata.parent_id == el1.id + + +@pytest.mark.parametrize( + ("sort_mode", "call_count"), + [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)], +) +def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count): + block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") + block2 = LayoutElement.from_coords( + 1, + 2, + 3, + 4, + text="block 2", + parent=block1, + type="NarrativeText", + ) + page = PageLayout( + number=1, + image=MockImage(), + ) + page.elements = [block1, block2] + doc = DocumentLayout.from_pages([page]) + with mock.patch.object(pdf, "sort_page_elements") as mock_sort_page_elements: + pdf.document_to_element_list(doc, sortable=True, sort_mode=sort_mode) + assert mock_sort_page_elements.call_count == call_count + + +def test_document_to_element_list_sets_category_depth_titles(): + layout_with_hierarchies = MockSinglePageDocumentLayout() + elements = pdf.document_to_element_list(layout_with_hierarchies) + assert elements[0].metadata.category_depth == 1 + assert elements[1].metadata.category_depth == 2 + assert elements[2].metadata.category_depth is None + assert elements[3].metadata.category_depth == 0 diff --git a/unstructured/partition/common/common.py b/unstructured/partition/common/common.py index 001ea47699..267630a87b 100644 --- a/unstructured/partition/common/common.py +++ b/unstructured/partition/common/common.py @@ -9,7 +9,6 @@ import emoji import psutil -from unstructured_inference.inference.elements import Rectangle from unstructured.documents.coordinates import CoordinateSystem, PixelSpace from unstructured.documents.elements import ( @@ -22,18 +21,12 @@ ListItem, PageBreak, Text, - Title, ) from unstructured.logger import logger from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE -from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT -from unstructured.utils import dependency_exists, first - -if dependency_exists("numpy") and dependency_exists("cv2"): - from unstructured.partition.utils.sorting import sort_page_elements if TYPE_CHECKING: - from unstructured_inference.inference.layout import DocumentLayout, PageLayout + from unstructured_inference.inference.layout import PageLayout from unstructured_inference.inference.layoutelement import LayoutElement @@ -407,7 +400,7 @@ def contains_emoji(s: str) -> bool: return bool(emoji.emoji_count(s)) -def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]: +def get_page_image_metadata(page: PageLayout) -> dict[str, Any]: """Retrieve image metadata and coordinate system from a page.""" image = getattr(page, "image", None) @@ -433,146 +426,6 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]: } -# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in -# unstructured.documents.html, which imports this module so we can't import the class for type -# hints. Moreover, those two types of documents have different lists of attributes -# UPDATE(scanny): HTMLDocument no longer exists, so this function can be optimized for use by -# DocumentLayout only. -def document_to_element_list( - document: DocumentLayout, - sortable: bool = False, - include_page_breaks: bool = False, - last_modification_date: Optional[str] = None, - infer_list_items: bool = True, - source_format: Optional[str] = None, - detection_origin: Optional[str] = None, - sort_mode: str = SORT_MODE_XY_CUT, - languages: Optional[list[str]] = None, - starting_page_number: int = 1, - layouts_links: Optional[list[list]] = None, - **kwargs: Any, -) -> list[Element]: - """Converts a DocumentLayout object to a list of unstructured elements.""" - elements: list[Element] = [] - - num_pages = len(document.pages) - for page_number, page in enumerate(document.pages, start=starting_page_number): - page_elements: list[Element] = [] - - page_image_metadata = _get_page_image_metadata(page) - image_format = page_image_metadata.get("format") - image_width = page_image_metadata.get("width") - image_height = page_image_metadata.get("height") - - translation_mapping: list[tuple["LayoutElement", Element]] = [] - - links = ( - layouts_links[page_number - starting_page_number] - if layouts_links and layouts_links[0] - else None - ) - - for layout_element in page.elements: - if image_width and image_height and hasattr(layout_element.bbox, "coordinates"): - coordinate_system = PixelSpace(width=image_width, height=image_height) - else: - coordinate_system = None - - element = normalize_layout_element( - layout_element, - coordinate_system=coordinate_system, - infer_list_items=infer_list_items, - source_format=source_format if source_format else "html", - ) - if isinstance(element, list): - for el in element: - if last_modification_date: - el.metadata.last_modified = last_modification_date - el.metadata.page_number = page_number - page_elements.extend(element) - translation_mapping.extend([(layout_element, el) for el in element]) - continue - else: - - element.metadata.links = ( - _get_links_in_element(links, layout_element.bbox) if links else [] - ) - - if last_modification_date: - element.metadata.last_modified = last_modification_date - element.metadata.text_as_html = getattr(layout_element, "text_as_html", None) - element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None) - # FIXME: here the elements in a page can be either: - # 1. LayoutElement if the document is LayoutDocument (if the partition is on a - # pdf/image); - # 2. Element if the document is HTMLDocument (if the partition is on an html file) - # this discrepency is due to Element class defined in unstructured and LayoutElement - # class defined in unstructured_inference do not have the same list of attributes - if (isinstance(element, Title) and element.metadata.category_depth is None) and any( - getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements - ): - element.metadata.category_depth = 0 - - page_elements.append(element) - translation_mapping.append((layout_element, element)) - coordinates = ( - element.metadata.coordinates.points if element.metadata.coordinates else None - ) - - el_image_path = ( - layout_element.image_path if hasattr(layout_element, "image_path") else None - ) - - add_element_metadata( - element, - page_number=page_number, - filetype=image_format, - coordinates=coordinates, - coordinate_system=coordinate_system, - category_depth=element.metadata.category_depth, - image_path=el_image_path, - detection_origin=detection_origin, - languages=languages, - **kwargs, - ) - - for layout_element, element in translation_mapping: - if hasattr(layout_element, "parent") and layout_element.parent is not None: - element_parent = first( - (el for l_el, el in translation_mapping if l_el is layout_element.parent), - ) - element.metadata.parent_id = element_parent.id - sorted_page_elements = page_elements - if sortable and sort_mode != SORT_MODE_DONT: - sorted_page_elements = sort_page_elements(page_elements, sort_mode) - - if include_page_breaks and page_number < num_pages + starting_page_number: - sorted_page_elements.append(PageBreak(text="")) - elements.extend(sorted_page_elements) - - return elements - - -def _get_links_in_element(page_links: list, region: Rectangle) -> list: - from unstructured.partition.pdf_image.pdfminer_processing import ( - bboxes1_is_almost_subregion_of_bboxes2, - ) - - links_bboxes = [Rectangle(*link.get("bbox")) for link in page_links] - results = bboxes1_is_almost_subregion_of_bboxes2(links_bboxes, [region]) - links = [ - { - "text": page_links[idx].get("text"), - "url": page_links[idx].get("url"), - "start_index": page_links[idx].get("start_index"), - } - for idx, result in enumerate(results) - if any(result) - ] - - return links - - def ocr_data_to_elements( ocr_data: list["LayoutElement"], image_size: tuple[int | float, int | float], diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f25ed73afe..6a5738eb1c 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -17,6 +17,8 @@ from pi_heif import register_heif_opener from PIL import Image as PILImage from pypdf import PdfReader +from unstructured_inference.inference.layout import DocumentLayout +from unstructured_inference.inference.layoutelement import LayoutElement from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import ( @@ -34,6 +36,7 @@ ListItem, PageBreak, Text, + Title, process_metadata, ) from unstructured.errors import PageCountExceededError @@ -42,8 +45,10 @@ from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.partition.common.common import ( - document_to_element_list, + add_element_metadata, exactly_one, + get_page_image_metadata, + normalize_layout_element, ocr_data_to_elements, spooled_to_bytes_io_if_needed, ) @@ -69,6 +74,7 @@ from unstructured.partition.pdf_image.pdfminer_processing import ( check_annotations_within_element, clean_pdfminer_inner_elements, + get_links_in_element, get_uris, get_word_bounding_box_from_element, map_bbox_and_index, @@ -91,7 +97,7 @@ ) from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements from unstructured.patches.pdfminer import parse_keyword -from unstructured.utils import requires_dependencies +from unstructured.utils import first, requires_dependencies if TYPE_CHECKING: pass @@ -1080,3 +1086,113 @@ def check_coords_within_boundary( ) and (coordinates.points[0][1] > boundary_y_min - (vertical_threshold * line_height)) return x_within_boundary and y_within_boundary + + +def document_to_element_list( + document: DocumentLayout, + sortable: bool = False, + include_page_breaks: bool = False, + last_modification_date: Optional[str] = None, + infer_list_items: bool = True, + source_format: Optional[str] = None, + detection_origin: Optional[str] = None, + sort_mode: str = SORT_MODE_XY_CUT, + languages: Optional[list[str]] = None, + starting_page_number: int = 1, + layouts_links: Optional[list[list]] = None, + **kwargs: Any, +) -> list[Element]: + """Converts a DocumentLayout object to a list of unstructured elements.""" + elements: list[Element] = [] + + num_pages = len(document.pages) + for page_number, page in enumerate(document.pages, start=starting_page_number): + page_elements: list[Element] = [] + + page_image_metadata = get_page_image_metadata(page) + image_format = page_image_metadata.get("format") + image_width = page_image_metadata.get("width") + image_height = page_image_metadata.get("height") + + translation_mapping: list[tuple["LayoutElement", Element]] = [] + + links = ( + layouts_links[page_number - starting_page_number] + if layouts_links and layouts_links[0] + else None + ) + + for layout_element in page.elements: + if image_width and image_height and hasattr(layout_element.bbox, "coordinates"): + coordinate_system = PixelSpace(width=image_width, height=image_height) + else: + coordinate_system = None + + element = normalize_layout_element( + layout_element, + coordinate_system=coordinate_system, + infer_list_items=infer_list_items, + source_format=source_format if source_format else "html", + ) + if isinstance(element, list): + for el in element: + if last_modification_date: + el.metadata.last_modified = last_modification_date + el.metadata.page_number = page_number + page_elements.extend(element) + translation_mapping.extend([(layout_element, el) for el in element]) + continue + else: + + element.metadata.links = ( + get_links_in_element(links, layout_element.bbox) if links else [] + ) + + if last_modification_date: + element.metadata.last_modified = last_modification_date + element.metadata.text_as_html = getattr(layout_element, "text_as_html", None) + element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None) + + if (isinstance(element, Title) and element.metadata.category_depth is None) and any( + getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements + ): + element.metadata.category_depth = 0 + + page_elements.append(element) + translation_mapping.append((layout_element, element)) + coordinates = ( + element.metadata.coordinates.points if element.metadata.coordinates else None + ) + + el_image_path = ( + layout_element.image_path if hasattr(layout_element, "image_path") else None + ) + + add_element_metadata( + element, + page_number=page_number, + filetype=image_format, + coordinates=coordinates, + coordinate_system=coordinate_system, + category_depth=element.metadata.category_depth, + image_path=el_image_path, + detection_origin=detection_origin, + languages=languages, + **kwargs, + ) + + for layout_element, element in translation_mapping: + if hasattr(layout_element, "parent") and layout_element.parent is not None: + element_parent = first( + (el for l_el, el in translation_mapping if l_el is layout_element.parent), + ) + element.metadata.parent_id = element_parent.id + sorted_page_elements = page_elements + if sortable and sort_mode != SORT_MODE_DONT: + sorted_page_elements = sort_page_elements(page_elements, sort_mode) + + if include_page_breaks and page_number < num_pages + starting_page_number: + sorted_page_elements.append(PageBreak(text="")) + elements.extend(sorted_page_elements) + + return elements diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 59333abde5..1fa079e93e 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -6,6 +6,7 @@ from pdfminer.layout import LTChar, LTTextBox from pdfminer.pdftypes import PDFObjRef from pdfminer.utils import open_filename +from unstructured_inference.inference.elements import Rectangle from unstructured.documents.coordinates import PixelSpace, PointSpace from unstructured.documents.elements import CoordinatesMetadata @@ -349,6 +350,23 @@ def aggregate_embedded_text_by_block( return text +def get_links_in_element(page_links: list, region: Rectangle) -> list: + + links_bboxes = [Rectangle(*link.get("bbox")) for link in page_links] + results = bboxes1_is_almost_subregion_of_bboxes2(links_bboxes, [region]) + links = [ + { + "text": page_links[idx].get("text"), + "url": page_links[idx].get("url"), + "start_index": page_links[idx].get("start_index"), + } + for idx, result in enumerate(results) + if any(result) + ] + + return links + + def get_uris( annots: PDFObjRef | list[PDFObjRef], height: float,