Skip to content

Commit

Permalink
Fix: embedded text not getting merged with inferred elements (#331)
Browse files Browse the repository at this point in the history
This PR is the first part of fixing "embedded text not getting merged
with inferred elements" and works together with the unstructured PR -
Unstructured-IO/unstructured#2679.

### Summary
- replace `Rectangle.is_in()` with `Rectangle.is_almost_subregion_of()`
when filling in an inferred element with embedded text
- add env_config `EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD`

### Note
The ingest test won't pass until we merge the unstructured PR -
Unstructured-IO/unstructured#2679.
  • Loading branch information
christinestraub authored Mar 22, 2024
1 parent 7dd9449 commit 4a2fd95
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 3 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## 0.7.25-dev2
## 0.7.25

* fix: replace `Rectangle.is_in()` with `Rectangle.is_almost_subregion_of()` when filling in an inferred element with embedded text
* bug: check for None in Chipper bounding box reduction
* chore: removes `install-detectron2` from the `Makefile`
* fix: convert label_map keys read from os.environment `UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH` to int type
Expand Down
13 changes: 13 additions & 0 deletions test_unstructured_inference/test_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,3 +272,16 @@ def test_merge_inferred_layout_with_extracted_layout():
assert merged_layout[0].text == "Example Section Header"
assert merged_layout[1].type == ElementType.TEXT
assert merged_layout[1].text == "Example Title"


def test_aggregate_by_block():
    """Check that aggregate_by_block joins only the text of regions lying inside the target."""
    inside_first = TextRegion.from_coords(0, 0, 20, 20, "Inside region1")
    inside_second = TextRegion.from_coords(50, 50, 150, 150, "Inside region2")
    # Spans 250..350 on both axes, so it only partially overlaps the 0..300 target below;
    # its text is expected to be left out of the aggregated result.
    straddling = TextRegion.from_coords(250, 250, 350, 350, "Outside region")
    block = TextRegion.from_coords(0, 0, 300, 300)

    aggregated = elements.aggregate_by_block(
        block,
        [inside_first, inside_second, straddling],
    )

    assert aggregated == "Inside region1 Inside region2"
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.25-dev2" # pragma: no cover
__version__ = "0.7.25" # pragma: no cover
10 changes: 10 additions & 0 deletions unstructured_inference/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,16 @@ def LAYOUT_SUBREGION_THRESHOLD(self) -> float:
"""
return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75)

@property
def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
    """Threshold for deciding whether an embedded region is a sub-region of a given block
    when aggregating the text of the embedded elements that lie within that block.

    When the area of the intersection divided by the embedded region's own area is larger
    than this threshold, the embedded region is considered a subregion of the block.
    """
    # Looked up under the setting's own name; 0.99 is the fallback passed to the lookup.
    threshold = self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99)
    return threshold

@property
def ELEMENTS_H_PADDING_COEF(self) -> float:
"""When extending the boundaries of a PDF object for the purpose of determining which other
Expand Down
7 changes: 6 additions & 1 deletion unstructured_inference/inference/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import numpy as np

from unstructured_inference.config import inference_config
from unstructured_inference.constants import Source
from unstructured_inference.math import safe_division

Expand Down Expand Up @@ -246,8 +247,12 @@ def aggregate_by_block(
) -> str:
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""

subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
filtered_blocks = [
obj for obj in pdf_objects if obj.bbox.is_in(text_region.bbox, error_margin=5)
obj
for obj in pdf_objects
if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold)
]
text = " ".join([x.text for x in filtered_blocks if x.text])
return text
Expand Down

0 comments on commit 4a2fd95

Please sign in to comment.