From a8dd7b8f62b92def549c2d48037e5b410a664fa3 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Fri, 3 Jan 2025 13:45:40 +0100 Subject: [PATCH 01/17] add pobs --- unstructured/partition/utils/config.py | 5 +++++ unstructured/partition/utils/ocr_models/tesseract_ocr.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 7023ff9d33..cbab6f4ec3 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: """optimum text height for tesseract OCR""" return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20) + @property + def TESSERACT_CONFIDENCE_THRESHOLD(self) -> int: + """Tesseract predictions with confidence below this threshold are ignored""" + return self._get_float("TESSERACT_CONFIDENCE_THRESHOLD", 0.0) + @property def GOOGLEVISION_API_ENDPOINT(self) -> str: """API endpoint to use for Google Vision""" diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 46eb8a0cbd..504024aa9d 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -82,7 +82,8 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: output_type=Output.DATAFRAME, ) ocr_df = ocr_df.dropna() - + probabilities = ocr_df["conf"].div(100) + ocr_df = ocr_df[probabilities.ge(env_config.TESSERACT_CONFIDENCE_THRESHOLD)] ocr_regions = self.parse_data(ocr_df, zoom=zoom) return ocr_regions From 9e31ebc1e631bd28ffaf93068272203251836fa6 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Fri, 3 Jan 2025 14:43:12 +0100 Subject: [PATCH 02/17] upadte --- unstructured/partition/utils/ocr_models/tesseract_ocr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 504024aa9d..10a914b90d 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -51,6 +51,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: np.array(image), lang=self.language, output_type=Output.DATAFRAME, + # config='--oem 3 --psm 6' ) ocr_df = ocr_df.dropna() @@ -80,10 +81,12 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: np.array(zoom_image(image, zoom)), lang=self.language, output_type=Output.DATAFRAME, + # config='--oem 3 --psm 6' ) ocr_df = ocr_df.dropna() probabilities = ocr_df["conf"].div(100) ocr_df = ocr_df[probabilities.ge(env_config.TESSERACT_CONFIDENCE_THRESHOLD)] + print("OCR FILTERING") ocr_regions = self.parse_data(ocr_df, zoom=zoom) return ocr_regions From c0f2768cc4dedfc7e473a0afdf05beef640ec6f9 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 6 Jan 2025 16:32:46 +0100 Subject: [PATCH 03/17] feat: Add character level confidence thresholds --- .../partition/pdf_image/test_ocr.py | 42 +++++++++ unstructured/partition/utils/config.py | 2 +- .../utils/ocr_models/tesseract_ocr.py | 88 ++++++++++++++++--- 3 files changed, 121 insertions(+), 11 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index b0be34fbfb..7b1454a189 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -6,6 +6,7 @@ import pandas as pd import pytest import unstructured_pytesseract +from bs4 import BeautifulSoup, Tag from pdf2image.exceptions import PDFPageCountError from PIL import Image, UnidentifiedImageError from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion @@ -484,3 +485,44 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions): # Check if the final layout contains both original elements and OCR-derived elements assert all(element in final_layout for element in mock_out_layout) assert any(element in final_layout for element in ocr_elements) + + +def test_extract_word_from_hocr(): + def _create_hocr_word_span(characters: list[tuple[str, str, list[int]]]) -> Tag: + word_span = BeautifulSoup("", "html.parser").span + for char, x_conf, bbox in characters: + char_span = BeautifulSoup( + f""" + {char} + """, # noqa : E501 + "html.parser", + ).span + word_span.append(char_span) + return word_span + + characters = [ + ("w", "99.0", [10, 10, 20, 20]), + ("o", "98.5", [21, 9, 29, 20]), + ("r", "97.5", [31, 10, 40, 21]), + ("d", "96.0", [41, 11, 50, 22]), + ("!", "50.0", [51, 10, 60, 20]), + ("@", "45.0", [61, 10, 70, 20]), + ] + + word_span = _create_hocr_word_span(characters) + + text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0) + assert text == "word!@" + assert bbox == [10, 9, 70, 22] + + text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960) + assert text == "word" + assert bbox == [10, 9, 50, 22] + + text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990) + assert text == "w" + assert bbox == [10, 10, 20, 20] + + text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999) + assert text == "" + assert bbox is None diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index cbab6f4ec3..ccab59b43f 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -97,7 +97,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20) @property - def TESSERACT_CONFIDENCE_THRESHOLD(self) -> int: + def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int: """Tesseract predictions with confidence below this threshold are ignored""" return self._get_float("TESSERACT_CONFIDENCE_THRESHOLD", 0.0) diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 10a914b90d..444a0a6623 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -1,14 +1,15 @@ from __future__ import annotations import os +import re from typing import TYPE_CHECKING, List import cv2 import numpy as np import pandas as pd import unstructured_pytesseract +from bs4 import BeautifulSoup, Tag from PIL import Image as PILImage -from unstructured_pytesseract import Output from unstructured.logger import trace_logger from unstructured.partition.utils.config import env_config @@ -47,11 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: trace_logger.detail("Processing entire page OCR with tesseract...") zoom = 1 - ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data( - np.array(image), + ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter( + np.array(zoom_image(image, zoom)), lang=self.language, - output_type=Output.DATAFRAME, - # config='--oem 3 --psm 6' + character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD, ) ocr_df = ocr_df.dropna() @@ -77,20 +77,88 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1), max_zoom, ) - ocr_df = unstructured_pytesseract.image_to_data( + ocr_df = self.image_to_data_with_character_confidence_filter( np.array(zoom_image(image, zoom)), lang=self.language, - output_type=Output.DATAFRAME, - # config='--oem 3 --psm 6' + character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD, ) ocr_df = ocr_df.dropna() - probabilities = ocr_df["conf"].div(100) - ocr_df = ocr_df[probabilities.ge(env_config.TESSERACT_CONFIDENCE_THRESHOLD)] print("OCR FILTERING") ocr_regions = self.parse_data(ocr_df, zoom=zoom) return ocr_regions + def image_to_data_with_character_confidence_filter( + self, + image: np.ndarray, + lang: str = "eng", + config: str = "", + character_confidence_threshold: float = 0.5, + ) -> pd.DataFrame: + hocr: pd.DataFrame = unstructured_pytesseract.image_to_pdf_or_hocr( + image, + lang=lang, + config="-c hocr_char_boxes=1 " + config, + extension="hocr", + ) + soup = BeautifulSoup(hocr, "html.parser") + words = soup.find_all("span", class_="ocrx_word") + + df_entries = [] + for word in words: + text, bbox = self.extract_word_from_hocr( + word=word, character_confidence_threshold=character_confidence_threshold + ) + if text and bbox: + left, top, right, bottom = bbox + df_entries.append( + { + "left": left, + "top": top, + "width": right - left, + "height": bottom - top, + "text": text, + } + ) + ocr_df = pd.DataFrame(df_entries) + + return ocr_df + + @staticmethod + def extract_word_from_hocr( + word: Tag, character_confidence_threshold: float = 0.0 + ) -> tuple[str, list[int] | None]: + """Extracts a word from an hOCR word tag, filtering out characters with low confidence.""" + word_text = "" + word_bbox = None + + character_spans = word.find_all("span", class_="ocrx_cinfo") + for character_span in character_spans: + char = character_span.text + + char_title = character_span.get("title", "") + conf_match = re.search(r"x_conf (\d+\.\d+)", char_title) + bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title) + + if not (char and conf_match and bbox_match): + continue + + character_probability = float(conf_match.group(1)) / 100 + character_bbox = list(map(int, bbox_match.groups())) + + if character_probability >= character_confidence_threshold: + word_text += char + if word_bbox is None: + word_bbox = character_bbox + else: + word_bbox = [ + min(word_bbox[0], character_bbox[0]), # x1 - starts from 0 + min(word_bbox[1], character_bbox[1]), # y1 - starts from 0 + max(word_bbox[2], character_bbox[2]), + max(word_bbox[3], character_bbox[3]), + ] + return word_text, word_bbox + @requires_dependencies("unstructured_inference") def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]: from unstructured.partition.pdf_image.inference_utils import ( From 052ae5019ce0ee04dab61821477f1537c3787fe3 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 6 Jan 2025 20:19:38 +0100 Subject: [PATCH 04/17] add psm --- unstructured/partition/utils/ocr_models/tesseract_ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 444a0a6623..e72826ffce 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -98,7 +98,7 @@ def image_to_data_with_character_confidence_filter( hocr: pd.DataFrame = unstructured_pytesseract.image_to_pdf_or_hocr( image, lang=lang, - config="-c hocr_char_boxes=1 " + config, + config="-c hocr_char_boxes=1 psm=12" + config, extension="hocr", ) soup = BeautifulSoup(hocr, "html.parser") From 4b54d8a1420dd8e4d474ab1cf162fdbec7873656 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 6 Jan 2025 20:19:53 +0100 Subject: [PATCH 05/17] Fix config name --- unstructured/partition/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index ccab59b43f..291ae1b6a3 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: @property def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int: """Tesseract predictions with confidence below this threshold are ignored""" - return self._get_float("TESSERACT_CONFIDENCE_THRESHOLD", 0.0) + return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0) @property def GOOGLEVISION_API_ENDPOINT(self) -> str: From 6fcd3f47479ee73e119dd5950f913d655600baf1 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 7 Jan 2025 14:17:43 +0100 Subject: [PATCH 06/17] Update --- .../utils/ocr_models/tesseract_ocr.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index e72826ffce..fd8b888605 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -83,7 +83,6 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD, ) ocr_df = ocr_df.dropna() - print("OCR FILTERING") ocr_regions = self.parse_data(ocr_df, zoom=zoom) return ocr_regions @@ -95,12 +94,18 @@ def image_to_data_with_character_confidence_filter( config: str = "", character_confidence_threshold: float = 0.5, ) -> pd.DataFrame: - hocr: pd.DataFrame = unstructured_pytesseract.image_to_pdf_or_hocr( + hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr( image, lang=lang, - config="-c hocr_char_boxes=1 psm=12" + config, + config="-c hocr_char_boxes=1 " + config, extension="hocr", ) + ocr_df = self.hocr_to_dataframe(hocr, character_confidence_threshold) + return ocr_df + + def hocr_to_dataframe( + self, hocr: str, character_confidence_threshold: float = 0.0 + ) -> pd.DataFrame: soup = BeautifulSoup(hocr, "html.parser") words = soup.find_all("span", class_="ocrx_word") @@ -121,7 +126,6 @@ def image_to_data_with_character_confidence_filter( } ) ocr_df = pd.DataFrame(df_entries) - return ocr_df @staticmethod @@ -129,10 +133,14 @@ def extract_word_from_hocr( word: Tag, character_confidence_threshold: float = 0.0 ) -> tuple[str, list[int] | None]: """Extracts a word from an hOCR word tag, filtering out characters with low confidence.""" + + character_spans = word.find_all("span", class_="ocrx_cinfo") + if len(character_spans) == 0: + return "", None + word_text = "" word_bbox = None - character_spans = word.find_all("span", class_="ocrx_cinfo") for character_span in character_spans: char = character_span.text @@ -157,6 +165,7 @@ def extract_word_from_hocr( max(word_bbox[2], character_bbox[2]), max(word_bbox[3], character_bbox[3]), ] + return word_text, word_bbox @requires_dependencies("unstructured_inference") From 137678fe86c5c03018545186418b6aafd20d1810 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 7 Jan 2025 14:20:48 +0100 Subject: [PATCH 07/17] Update config --- CHANGELOG.md | 10 ++++++++++ unstructured/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 165cd0e077..cf6aa11fec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.16.13-dev0 + +### Enhancements + +- **Add character-level filtering for tesseract output**. It is controllable via `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable. + +### Features + +### Fixes + ## 0.16.12 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index dcd9ca00b7..a88e673551 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.12" # pragma: no cover +__version__ = "0.16.13-dev0" # pragma: no cover From c25039fc63f7705d8a834f791b1802f9c83195db Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 7 Jan 2025 16:52:40 +0100 Subject: [PATCH 08/17] Remove unused zoom --- unstructured/partition/utils/ocr_models/tesseract_ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index fd8b888605..6825c2867d 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -49,7 +49,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: trace_logger.detail("Processing entire page OCR with tesseract...") zoom = 1 ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter( - np.array(zoom_image(image, zoom)), + np.array(image, zoom), lang=self.language, character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD, ) From 3bff8aebbae3c65fc6cf3ba749bf4c975803b6e7 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Tue, 7 Jan 2025 16:57:49 +0100 Subject: [PATCH 09/17] Remove unused zoom --- unstructured/partition/utils/ocr_models/tesseract_ocr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 6825c2867d..36e38787fa 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -49,7 +49,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: trace_logger.detail("Processing entire page OCR with tesseract...") zoom = 1 ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter( - np.array(image, zoom), + np.array(image), lang=self.language, character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD, ) From c1e9b8efa09ed2738af19d6738714251777df1da Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 8 Jan 2025 11:35:38 +0100 Subject: [PATCH 10/17] Use word bboxes instead of character bboxees --- .../partition/pdf_image/test_ocr.py | 81 ++++++++++++------- .../utils/ocr_models/tesseract_ocr.py | 42 +++++----- 2 files changed, 71 insertions(+), 52 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index 7b1454a189..fe04d82c55 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -487,42 +487,67 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions): assert any(element in final_layout for element in ocr_elements) -def test_extract_word_from_hocr(): - def _create_hocr_word_span(characters: list[tuple[str, str, list[int]]]) -> Tag: - word_span = BeautifulSoup("", "html.parser").span - for char, x_conf, bbox in characters: - char_span = BeautifulSoup( - f""" - {char} - """, # noqa : E501 - "html.parser", - ).span - word_span.append(char_span) - return word_span +def _create_hocr_word_span( + characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int] +) -> Tag: + word_span = BeautifulSoup( + f"", + "html.parser", + ).span + for char, x_conf in characters: + char_span = BeautifulSoup( + f""" + {char} + """, # noqa : E501 + "html.parser", + ).span + word_span.append(char_span) + return word_span + +def test_extract_word_from_hocr(): characters = [ - ("w", "99.0", [10, 10, 20, 20]), - ("o", "98.5", [21, 9, 29, 20]), - ("r", "97.5", [31, 10, 40, 21]), - ("d", "96.0", [41, 11, 50, 22]), - ("!", "50.0", [51, 10, 60, 20]), - ("@", "45.0", [61, 10, 70, 20]), + ("w", "99.0"), + ("o", "98.5"), + ("r", "97.5"), + ("d", "96.0"), + ("!", "50.0"), + ("@", "45.0"), ] + word_bbox = (10, 9, 70, 22) + word_span = _create_hocr_word_span(characters, word_bbox) - word_span = _create_hocr_word_span(characters) - - text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0) + text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0) assert text == "word!@" - assert bbox == [10, 9, 70, 22] - text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960) + text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960) assert text == "word" - assert bbox == [10, 9, 50, 22] - text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990) + text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990) assert text == "w" - assert bbox == [10, 10, 20, 20] - text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999) + text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999) assert text == "" - assert bbox is None + + +def test_hocr_to_dataframe(): + characters = [ + ("w", "99.0"), + ("o", "98.5"), + ("r", "97.5"), + ("d", "96.0"), + ("!", "50.0"), + ("@", "45.0"), + ] + word_bbox = (10, 9, 70, 22) + hocr = str(_create_hocr_word_span(characters, word_bbox)) + df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960) + + assert df.shape == (1, 5) + assert df["left"].iloc[0] == 10 + assert df["top"].iloc[0] == 9 + assert df["width"].iloc[0] == 60 + assert df["height"].iloc[0] == 13 + assert df["text"].iloc[0] == "word" diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 36e38787fa..64ba58e073 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -107,15 +107,22 @@ def hocr_to_dataframe( self, hocr: str, character_confidence_threshold: float = 0.0 ) -> pd.DataFrame: soup = BeautifulSoup(hocr, "html.parser") - words = soup.find_all("span", class_="ocrx_word") + word_spans = soup.find_all("span", class_="ocrx_word") df_entries = [] - for word in words: - text, bbox = self.extract_word_from_hocr( - word=word, character_confidence_threshold=character_confidence_threshold + for word_span in word_spans: + word_title = word_span.get("title", "") + bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title) + + # Note: word bbox is used instead of combining characters together due to tesseract + # bug that causes the character bboxes to be outside the word bbox, and they have 0 + # height or width when text is horizontal + text = self.extract_word_from_hocr( + word=word_span, character_confidence_threshold=character_confidence_threshold ) - if text and bbox: - left, top, right, bottom = bbox + if text and bbox_match: + word_bbox = list(map(int, bbox_match.groups())) + left, top, right, bottom = word_bbox df_entries.append( { "left": left, @@ -131,42 +138,29 @@ def hocr_to_dataframe( @staticmethod def extract_word_from_hocr( word: Tag, character_confidence_threshold: float = 0.0 - ) -> tuple[str, list[int] | None]: + ) -> str | None: """Extracts a word from an hOCR word tag, filtering out characters with low confidence.""" character_spans = word.find_all("span", class_="ocrx_cinfo") if len(character_spans) == 0: - return "", None + return None word_text = "" - word_bbox = None - for character_span in character_spans: char = character_span.text char_title = character_span.get("title", "") conf_match = re.search(r"x_conf (\d+\.\d+)", char_title) - bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title) - if not (char and conf_match and bbox_match): + if not (char and conf_match): continue character_probability = float(conf_match.group(1)) / 100 - character_bbox = list(map(int, bbox_match.groups())) if character_probability >= character_confidence_threshold: word_text += char - if word_bbox is None: - word_bbox = character_bbox - else: - word_bbox = [ - min(word_bbox[0], character_bbox[0]), # x1 - starts from 0 - min(word_bbox[1], character_bbox[1]), # y1 - starts from 0 - max(word_bbox[2], character_bbox[2]), - max(word_bbox[3], character_bbox[3]), - ] - - return word_text, word_bbox + + return word_text @requires_dependencies("unstructured_inference") def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]: From 0e4492619feac1354d84c18c5ff6038d61133d08 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 8 Jan 2025 11:38:42 +0100 Subject: [PATCH 11/17] Do not return None --- unstructured/partition/utils/ocr_models/tesseract_ocr.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 64ba58e073..17589df06d 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -136,14 +136,12 @@ def hocr_to_dataframe( return ocr_df @staticmethod - def extract_word_from_hocr( - word: Tag, character_confidence_threshold: float = 0.0 - ) -> str | None: + def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str: """Extracts a word from an hOCR word tag, filtering out characters with low confidence.""" character_spans = word.find_all("span", class_="ocrx_cinfo") if len(character_spans) == 0: - return None + return "" word_text = "" for character_span in character_spans: From 2d9054d2f181a213ed03dc377fc76f7c44082726 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 8 Jan 2025 11:56:33 +0100 Subject: [PATCH 12/17] Fix empty df scenario --- test_unstructured/partition/pdf_image/test_ocr.py | 11 +++++++++++ .../partition/utils/ocr_models/tesseract_ocr.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index fe04d82c55..85fc5f6d3e 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -551,3 +551,14 @@ def test_hocr_to_dataframe(): assert df["width"].iloc[0] == 60 assert df["height"].iloc[0] == 13 assert df["text"].iloc[0] == "word" + + +def test_hocr_to_dataframe_when_no_prediction_empty_df(): + df = OCRAgentTesseract().hocr_to_dataframe(hocr="") + + assert df.shape == (0, 5) + assert "left" in df.columns + assert "top" in df.columns + assert "width" in df.columns + assert "text" in df.columns + assert "text" in df.columns diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 17589df06d..8668dec1fc 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -132,7 +132,7 @@ def hocr_to_dataframe( "text": text, } ) - ocr_df = pd.DataFrame(df_entries) + ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"]) return ocr_df @staticmethod From a61aa8583c082aeb617460cb0609726b19fc2142 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 8 Jan 2025 13:22:28 +0100 Subject: [PATCH 13/17] fix unit test --- test_unstructured/partition/pdf_image/test_pdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 9b1b8de6e1..200edf3e2a 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None(): [ ( PartitionStrategy.HI_RES, - "unstructured_pytesseract.image_to_data", + "unstructured_pytesseract.image_to_pdf_or_hocr", ), ( PartitionStrategy.OCR_ONLY, - "unstructured_pytesseract.image_to_data", + "unstructured_pytesseract.image_to_pdf_or_hocr", ), ( PartitionStrategy.OCR_ONLY, From 1611a61ec8e539d273c6c21d12da676644691ad5 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 8 Jan 2025 14:23:14 +0100 Subject: [PATCH 14/17] Fix unittests --- test_unstructured/partition/pdf_image/test_ocr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index 85fc5f6d3e..e9982810a0 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -72,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch): def test_get_ocr_layout_from_image_tesseract(monkeypatch): monkeypatch.setattr( - unstructured_pytesseract, - "image_to_data", + OCRAgentTesseract, + "image_to_data_with_character_confidence_filter", lambda *args, **kwargs: pd.DataFrame( { "left": [10, 20, 30, 0], @@ -446,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch): monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000") monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000") monkeypatch.setattr( - unstructured_pytesseract, - "image_to_data", + OCRAgentTesseract, + "image_to_data_with_character_confidence_filter", lambda *args, **kwargs: pd.DataFrame( { "left": [10, 20, 30, 0], From cee5440b43943a96a580faf226da73e27b586546 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 8 Jan 2025 16:46:31 +0100 Subject: [PATCH 15/17] Set default threshold --- unstructured/partition/utils/config.py | 2 +- unstructured/partition/utils/ocr_models/tesseract_ocr.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 291ae1b6a3..43489df74f 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: @property def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int: """Tesseract predictions with confidence below this threshold are ignored""" - return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0) + return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.85) @property def GOOGLEVISION_API_ENDPOINT(self) -> str: diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 8668dec1fc..58572d1a72 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -92,7 +92,7 @@ def image_to_data_with_character_confidence_filter( image: np.ndarray, lang: str = "eng", config: str = "", - character_confidence_threshold: float = 0.5, + character_confidence_threshold: float = 0.85, ) -> pd.DataFrame: hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr( image, @@ -104,7 +104,7 @@ def image_to_data_with_character_confidence_filter( return ocr_df def hocr_to_dataframe( - self, hocr: str, character_confidence_threshold: float = 0.0 + self, hocr: str, character_confidence_threshold: float = 0.85 ) -> pd.DataFrame: soup = BeautifulSoup(hocr, "html.parser") word_spans = soup.find_all("span", class_="ocrx_word") From c5b657051cde60050f024db01aa214a86da561ce Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Thu, 9 Jan 2025 13:06:55 +0100 Subject: [PATCH 16/17] Set default threshold to 0 --- unstructured/partition/utils/config.py | 2 +- unstructured/partition/utils/ocr_models/tesseract_ocr.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index 43489df74f..291ae1b6a3 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: @property def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int: """Tesseract predictions with confidence below this threshold are ignored""" - return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.85) + return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0) @property def GOOGLEVISION_API_ENDPOINT(self) -> str: diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 58572d1a72..49a572d5fb 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -92,7 +92,7 @@ def image_to_data_with_character_confidence_filter( image: np.ndarray, lang: str = "eng", config: str = "", - character_confidence_threshold: float = 0.85, + character_confidence_threshold: float = 0.0, ) -> pd.DataFrame: hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr( image, @@ -104,7 +104,7 @@ def image_to_data_with_character_confidence_filter( return ocr_df def hocr_to_dataframe( - self, hocr: str, character_confidence_threshold: float = 0.85 + self, hocr: str, character_confidence_threshold: float = 0.0 ) -> pd.DataFrame: soup = BeautifulSoup(hocr, "html.parser") word_spans = soup.find_all("span", class_="ocrx_word") From 013a3516cbdc2405185a5aa22a56f7832df7fa0a Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 13 Jan 2025 12:54:09 +0100 Subject: [PATCH 17/17] Refactor --- .../partition/utils/ocr_models/tesseract_ocr.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 49a572d5fb..6e2c96da00 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -127,12 +127,17 @@ def hocr_to_dataframe( { "left": left, "top": top, - "width": right - left, - "height": bottom - top, + "right": right, + "bottom": bottom, "text": text, } ) - ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"]) + ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "right", "bottom", "text"]) + + ocr_df["width"] = ocr_df["right"] - ocr_df["left"] + ocr_df["height"] = ocr_df["bottom"] - ocr_df["top"] + + ocr_df = ocr_df.drop(columns=["right", "bottom"]) return ocr_df @staticmethod