From a8dd7b8f62b92def549c2d48037e5b410a664fa3 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Fri, 3 Jan 2025 13:45:40 +0100
Subject: [PATCH 01/17] Filter Tesseract OCR output by confidence threshold

---
 unstructured/partition/utils/config.py                   | 5 +++++
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index 7023ff9d33..cbab6f4ec3 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
         """optimum text height for tesseract OCR"""
         return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
 
+    @property
+    def TESSERACT_CONFIDENCE_THRESHOLD(self) -> int:
+        """Tesseract predictions with confidence below this threshold are ignored"""
+        return self._get_float("TESSERACT_CONFIDENCE_THRESHOLD", 0.0)
+
     @property
     def GOOGLEVISION_API_ENDPOINT(self) -> str:
         """API endpoint to use for Google Vision"""
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 46eb8a0cbd..504024aa9d 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -82,7 +82,8 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
                 output_type=Output.DATAFRAME,
             )
             ocr_df = ocr_df.dropna()
-
+        probabilities = ocr_df["conf"].div(100)
+        ocr_df = ocr_df[probabilities.ge(env_config.TESSERACT_CONFIDENCE_THRESHOLD)]
         ocr_regions = self.parse_data(ocr_df, zoom=zoom)
 
         return ocr_regions

From 9e31ebc1e631bd28ffaf93068272203251836fa6 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Fri, 3 Jan 2025 14:43:12 +0100
Subject: [PATCH 02/17] update

---
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 504024aa9d..10a914b90d 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -51,6 +51,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
             np.array(image),
             lang=self.language,
             output_type=Output.DATAFRAME,
+            # config='--oem 3 --psm 6'
         )
         ocr_df = ocr_df.dropna()
 
@@ -80,10 +81,12 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
                 np.array(zoom_image(image, zoom)),
                 lang=self.language,
                 output_type=Output.DATAFRAME,
+                # config='--oem 3 --psm 6'
             )
             ocr_df = ocr_df.dropna()
         probabilities = ocr_df["conf"].div(100)
         ocr_df = ocr_df[probabilities.ge(env_config.TESSERACT_CONFIDENCE_THRESHOLD)]
+        print("OCR FILTERING")
         ocr_regions = self.parse_data(ocr_df, zoom=zoom)
 
         return ocr_regions

From c0f2768cc4dedfc7e473a0afdf05beef640ec6f9 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Mon, 6 Jan 2025 16:32:46 +0100
Subject: [PATCH 03/17] feat: Add character level confidence thresholds

---
 .../partition/pdf_image/test_ocr.py           | 42 +++++++++
 unstructured/partition/utils/config.py        |  2 +-
 .../utils/ocr_models/tesseract_ocr.py         | 88 ++++++++++++++++---
 3 files changed, 121 insertions(+), 11 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index b0be34fbfb..7b1454a189 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import pytest
 import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
 from pdf2image.exceptions import PDFPageCountError
 from PIL import Image, UnidentifiedImageError
 from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -484,3 +485,44 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
     # Check if the final layout contains both original elements and OCR-derived elements
     assert all(element in final_layout for element in mock_out_layout)
     assert any(element in final_layout for element in ocr_elements)
+
+
+def test_extract_word_from_hocr():
+    def _create_hocr_word_span(characters: list[tuple[str, str, list[int]]]) -> Tag:
+        word_span = BeautifulSoup("<span class='ocrx_word'></span>", "html.parser").span
+        for char, x_conf, bbox in characters:
+            char_span = BeautifulSoup(
+                f"""
+                <span class='ocrx_cinfo' title='x_bboxes {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}; x_conf {x_conf}'>{char}</span>
+                """,  # noqa : E501
+                "html.parser",
+            ).span
+            word_span.append(char_span)
+        return word_span
+
+    characters = [
+        ("w", "99.0", [10, 10, 20, 20]),
+        ("o", "98.5", [21, 9, 29, 20]),
+        ("r", "97.5", [31, 10, 40, 21]),
+        ("d", "96.0", [41, 11, 50, 22]),
+        ("!", "50.0", [51, 10, 60, 20]),
+        ("@", "45.0", [61, 10, 70, 20]),
+    ]
+
+    word_span = _create_hocr_word_span(characters)
+
+    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+    assert text == "word!@"
+    assert bbox == [10, 9, 70, 22]
+
+    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+    assert text == "word"
+    assert bbox == [10, 9, 50, 22]
+
+    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+    assert text == "w"
+    assert bbox == [10, 10, 20, 20]
+
+    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+    assert text == ""
+    assert bbox is None
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index cbab6f4ec3..ccab59b43f 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -97,7 +97,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
         return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
 
     @property
-    def TESSERACT_CONFIDENCE_THRESHOLD(self) -> int:
+    def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int:
         """Tesseract predictions with confidence below this threshold are ignored"""
         return self._get_float("TESSERACT_CONFIDENCE_THRESHOLD", 0.0)
 
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 10a914b90d..444a0a6623 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -1,14 +1,15 @@
 from __future__ import annotations
 
 import os
+import re
 from typing import TYPE_CHECKING, List
 
 import cv2
 import numpy as np
 import pandas as pd
 import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
 from PIL import Image as PILImage
-from unstructured_pytesseract import Output
 
 from unstructured.logger import trace_logger
 from unstructured.partition.utils.config import env_config
@@ -47,11 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
 
         trace_logger.detail("Processing entire page OCR with tesseract...")
         zoom = 1
-        ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
-            np.array(image),
+        ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
+            np.array(zoom_image(image, zoom)),
             lang=self.language,
-            output_type=Output.DATAFRAME,
-            # config='--oem 3 --psm 6'
+            character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
         )
         ocr_df = ocr_df.dropna()
 
@@ -77,20 +77,88 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
                 np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
                 max_zoom,
             )
-            ocr_df = unstructured_pytesseract.image_to_data(
+            ocr_df = self.image_to_data_with_character_confidence_filter(
                 np.array(zoom_image(image, zoom)),
                 lang=self.language,
-                output_type=Output.DATAFRAME,
-                # config='--oem 3 --psm 6'
+                character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
             )
             ocr_df = ocr_df.dropna()
-        probabilities = ocr_df["conf"].div(100)
-        ocr_df = ocr_df[probabilities.ge(env_config.TESSERACT_CONFIDENCE_THRESHOLD)]
         print("OCR FILTERING")
         ocr_regions = self.parse_data(ocr_df, zoom=zoom)
 
         return ocr_regions
 
+    def image_to_data_with_character_confidence_filter(
+        self,
+        image: np.ndarray,
+        lang: str = "eng",
+        config: str = "",
+        character_confidence_threshold: float = 0.5,
+    ) -> pd.DataFrame:
+        hocr: pd.DataFrame = unstructured_pytesseract.image_to_pdf_or_hocr(
+            image,
+            lang=lang,
+            config="-c hocr_char_boxes=1 " + config,
+            extension="hocr",
+        )
+        soup = BeautifulSoup(hocr, "html.parser")
+        words = soup.find_all("span", class_="ocrx_word")
+
+        df_entries = []
+        for word in words:
+            text, bbox = self.extract_word_from_hocr(
+                word=word, character_confidence_threshold=character_confidence_threshold
+            )
+            if text and bbox:
+                left, top, right, bottom = bbox
+                df_entries.append(
+                    {
+                        "left": left,
+                        "top": top,
+                        "width": right - left,
+                        "height": bottom - top,
+                        "text": text,
+                    }
+                )
+        ocr_df = pd.DataFrame(df_entries)
+
+        return ocr_df
+
+    @staticmethod
+    def extract_word_from_hocr(
+        word: Tag, character_confidence_threshold: float = 0.0
+    ) -> tuple[str, list[int] | None]:
+        """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
+        word_text = ""
+        word_bbox = None
+
+        character_spans = word.find_all("span", class_="ocrx_cinfo")
+        for character_span in character_spans:
+            char = character_span.text
+
+            char_title = character_span.get("title", "")
+            conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
+            bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title)
+
+            if not (char and conf_match and bbox_match):
+                continue
+
+            character_probability = float(conf_match.group(1)) / 100
+            character_bbox = list(map(int, bbox_match.groups()))
+
+            if character_probability >= character_confidence_threshold:
+                word_text += char
+                if word_bbox is None:
+                    word_bbox = character_bbox
+                else:
+                    word_bbox = [
+                        min(word_bbox[0], character_bbox[0]),  # x1 - starts from 0
+                        min(word_bbox[1], character_bbox[1]),  # y1 - starts from 0
+                        max(word_bbox[2], character_bbox[2]),
+                        max(word_bbox[3], character_bbox[3]),
+                    ]
+        return word_text, word_bbox
+
     @requires_dependencies("unstructured_inference")
     def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
         from unstructured.partition.pdf_image.inference_utils import (

From 052ae5019ce0ee04dab61821477f1537c3787fe3 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Mon, 6 Jan 2025 20:19:38 +0100
Subject: [PATCH 04/17] add psm

---
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 444a0a6623..e72826ffce 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -98,7 +98,7 @@ def image_to_data_with_character_confidence_filter(
         hocr: pd.DataFrame = unstructured_pytesseract.image_to_pdf_or_hocr(
             image,
             lang=lang,
-            config="-c hocr_char_boxes=1 " + config,
+            config="-c hocr_char_boxes=1 psm=12" + config,
             extension="hocr",
         )
         soup = BeautifulSoup(hocr, "html.parser")

From 4b54d8a1420dd8e4d474ab1cf162fdbec7873656 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Mon, 6 Jan 2025 20:19:53 +0100
Subject: [PATCH 05/17] Fix config name

---
 unstructured/partition/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index ccab59b43f..291ae1b6a3 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
     @property
     def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int:
         """Tesseract predictions with confidence below this threshold are ignored"""
-        return self._get_float("TESSERACT_CONFIDENCE_THRESHOLD", 0.0)
+        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
 
     @property
     def GOOGLEVISION_API_ENDPOINT(self) -> str:

From 6fcd3f47479ee73e119dd5950f913d655600baf1 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Tue, 7 Jan 2025 14:17:43 +0100
Subject: [PATCH 06/17] Update

---
 .../utils/ocr_models/tesseract_ocr.py         | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index e72826ffce..fd8b888605 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -83,7 +83,6 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
                 character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
             )
             ocr_df = ocr_df.dropna()
-        print("OCR FILTERING")
         ocr_regions = self.parse_data(ocr_df, zoom=zoom)
 
         return ocr_regions
@@ -95,12 +94,18 @@ def image_to_data_with_character_confidence_filter(
         config: str = "",
         character_confidence_threshold: float = 0.5,
     ) -> pd.DataFrame:
-        hocr: pd.DataFrame = unstructured_pytesseract.image_to_pdf_or_hocr(
+        hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
             image,
             lang=lang,
-            config="-c hocr_char_boxes=1 psm=12" + config,
+            config="-c hocr_char_boxes=1 " + config,
             extension="hocr",
         )
+        ocr_df = self.hocr_to_dataframe(hocr, character_confidence_threshold)
+        return ocr_df
+
+    def hocr_to_dataframe(
+        self, hocr: str, character_confidence_threshold: float = 0.0
+    ) -> pd.DataFrame:
         soup = BeautifulSoup(hocr, "html.parser")
         words = soup.find_all("span", class_="ocrx_word")
 
@@ -121,7 +126,6 @@ def image_to_data_with_character_confidence_filter(
                     }
                 )
         ocr_df = pd.DataFrame(df_entries)
-
         return ocr_df
 
     @staticmethod
@@ -129,10 +133,14 @@ def extract_word_from_hocr(
         word: Tag, character_confidence_threshold: float = 0.0
     ) -> tuple[str, list[int] | None]:
         """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
+
+        character_spans = word.find_all("span", class_="ocrx_cinfo")
+        if len(character_spans) == 0:
+            return "", None
+
         word_text = ""
         word_bbox = None
 
-        character_spans = word.find_all("span", class_="ocrx_cinfo")
         for character_span in character_spans:
             char = character_span.text
 
@@ -157,6 +165,7 @@ def extract_word_from_hocr(
                         max(word_bbox[2], character_bbox[2]),
                         max(word_bbox[3], character_bbox[3]),
                     ]
+
         return word_text, word_bbox
 
     @requires_dependencies("unstructured_inference")

From 137678fe86c5c03018545186418b6aafd20d1810 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Tue, 7 Jan 2025 14:20:48 +0100
Subject: [PATCH 07/17] Update config

---
 CHANGELOG.md                | 10 ++++++++++
 unstructured/__version__.py |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 165cd0e077..cf6aa11fec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.16.13-dev0
+
+### Enhancements
+
+- **Add character-level filtering for tesseract output**. It is controllable via `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.
+
+### Features
+
+### Fixes
+
 ## 0.16.12
 
 ### Enhancements
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index dcd9ca00b7..a88e673551 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.12"  # pragma: no cover
+__version__ = "0.16.13-dev0"  # pragma: no cover

From c25039fc63f7705d8a834f791b1802f9c83195db Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Tue, 7 Jan 2025 16:52:40 +0100
Subject: [PATCH 08/17] Remove unused zoom

---
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index fd8b888605..6825c2867d 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -49,7 +49,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
         trace_logger.detail("Processing entire page OCR with tesseract...")
         zoom = 1
         ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
-            np.array(zoom_image(image, zoom)),
+            np.array(image, zoom),
             lang=self.language,
             character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
         )

From 3bff8aebbae3c65fc6cf3ba749bf4c975803b6e7 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Tue, 7 Jan 2025 16:57:49 +0100
Subject: [PATCH 09/17] Remove unused zoom

---
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 6825c2867d..36e38787fa 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -49,7 +49,7 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
         trace_logger.detail("Processing entire page OCR with tesseract...")
         zoom = 1
         ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
-            np.array(image, zoom),
+            np.array(image),
             lang=self.language,
             character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
         )

From c1e9b8efa09ed2738af19d6738714251777df1da Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Wed, 8 Jan 2025 11:35:38 +0100
Subject: [PATCH 10/17] Use word bboxes instead of character bboxes

---
 .../partition/pdf_image/test_ocr.py           | 81 ++++++++++++-------
 .../utils/ocr_models/tesseract_ocr.py         | 42 +++++-----
 2 files changed, 71 insertions(+), 52 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index 7b1454a189..fe04d82c55 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -487,42 +487,67 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
     assert any(element in final_layout for element in ocr_elements)
 
 
-def test_extract_word_from_hocr():
-    def _create_hocr_word_span(characters: list[tuple[str, str, list[int]]]) -> Tag:
-        word_span = BeautifulSoup("<span class='ocrx_word'></span>", "html.parser").span
-        for char, x_conf, bbox in characters:
-            char_span = BeautifulSoup(
-                f"""
-                <span class='ocrx_cinfo' title='x_bboxes {bbox[0]} {bbox[1]} {bbox[2]} {bbox[3]}; x_conf {x_conf}'>{char}</span>
-                """,  # noqa : E501
-                "html.parser",
-            ).span
-            word_span.append(char_span)
-        return word_span
+def _create_hocr_word_span(
+    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
+) -> Tag:
+    word_span = BeautifulSoup(
+        f"<span class='ocrx_word' title='"
+        f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
+        f"; x_wconf 64'></span>",
+        "html.parser",
+    ).span
+    for char, x_conf in characters:
+        char_span = BeautifulSoup(
+            f"""
+            <span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
+            """,  # noqa : E501
+            "html.parser",
+        ).span
+        word_span.append(char_span)
+    return word_span
+
 
+def test_extract_word_from_hocr():
     characters = [
-        ("w", "99.0", [10, 10, 20, 20]),
-        ("o", "98.5", [21, 9, 29, 20]),
-        ("r", "97.5", [31, 10, 40, 21]),
-        ("d", "96.0", [41, 11, 50, 22]),
-        ("!", "50.0", [51, 10, 60, 20]),
-        ("@", "45.0", [61, 10, 70, 20]),
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
     ]
+    word_bbox = (10, 9, 70, 22)
+    word_span = _create_hocr_word_span(characters, word_bbox)
 
-    word_span = _create_hocr_word_span(characters)
-
-    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
     assert text == "word!@"
-    assert bbox == [10, 9, 70, 22]
 
-    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
     assert text == "word"
-    assert bbox == [10, 9, 50, 22]
 
-    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
     assert text == "w"
-    assert bbox == [10, 10, 20, 20]
 
-    text, bbox = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
     assert text == ""
-    assert bbox is None
+
+
+def test_hocr_to_dataframe():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    hocr = str(_create_hocr_word_span(characters, word_bbox))
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
+
+    assert df.shape == (1, 5)
+    assert df["left"].iloc[0] == 10
+    assert df["top"].iloc[0] == 9
+    assert df["width"].iloc[0] == 60
+    assert df["height"].iloc[0] == 13
+    assert df["text"].iloc[0] == "word"
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 36e38787fa..64ba58e073 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -107,15 +107,22 @@ def hocr_to_dataframe(
         self, hocr: str, character_confidence_threshold: float = 0.0
     ) -> pd.DataFrame:
         soup = BeautifulSoup(hocr, "html.parser")
-        words = soup.find_all("span", class_="ocrx_word")
+        word_spans = soup.find_all("span", class_="ocrx_word")
 
         df_entries = []
-        for word in words:
-            text, bbox = self.extract_word_from_hocr(
-                word=word, character_confidence_threshold=character_confidence_threshold
+        for word_span in word_spans:
+            word_title = word_span.get("title", "")
+            bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)
+
+            # Note: word bbox is used instead of combining characters together due to tesseract
+            # bug that causes the character bboxes to be outside the word bbox, and they have 0
+            # height or width when text is horizontal
+            text = self.extract_word_from_hocr(
+                word=word_span, character_confidence_threshold=character_confidence_threshold
             )
-            if text and bbox:
-                left, top, right, bottom = bbox
+            if text and bbox_match:
+                word_bbox = list(map(int, bbox_match.groups()))
+                left, top, right, bottom = word_bbox
                 df_entries.append(
                     {
                         "left": left,
@@ -131,42 +138,29 @@ def hocr_to_dataframe(
     @staticmethod
     def extract_word_from_hocr(
         word: Tag, character_confidence_threshold: float = 0.0
-    ) -> tuple[str, list[int] | None]:
+    ) -> str | None:
         """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
 
         character_spans = word.find_all("span", class_="ocrx_cinfo")
         if len(character_spans) == 0:
-            return "", None
+            return None
 
         word_text = ""
-        word_bbox = None
-
         for character_span in character_spans:
             char = character_span.text
 
             char_title = character_span.get("title", "")
             conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
-            bbox_match = re.search(r"x_bboxes (\d+) (\d+) (\d+) (\d+)", char_title)
 
-            if not (char and conf_match and bbox_match):
+            if not (char and conf_match):
                 continue
 
             character_probability = float(conf_match.group(1)) / 100
-            character_bbox = list(map(int, bbox_match.groups()))
 
             if character_probability >= character_confidence_threshold:
                 word_text += char
-                if word_bbox is None:
-                    word_bbox = character_bbox
-                else:
-                    word_bbox = [
-                        min(word_bbox[0], character_bbox[0]),  # x1 - starts from 0
-                        min(word_bbox[1], character_bbox[1]),  # y1 - starts from 0
-                        max(word_bbox[2], character_bbox[2]),
-                        max(word_bbox[3], character_bbox[3]),
-                    ]
-
-        return word_text, word_bbox
+
+        return word_text
 
     @requires_dependencies("unstructured_inference")
     def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:

From 0e4492619feac1354d84c18c5ff6038d61133d08 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Wed, 8 Jan 2025 11:38:42 +0100
Subject: [PATCH 11/17] Do not return None

---
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 64ba58e073..17589df06d 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -136,14 +136,12 @@ def hocr_to_dataframe(
         return ocr_df
 
     @staticmethod
-    def extract_word_from_hocr(
-        word: Tag, character_confidence_threshold: float = 0.0
-    ) -> str | None:
+    def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
         """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
 
         character_spans = word.find_all("span", class_="ocrx_cinfo")
         if len(character_spans) == 0:
-            return None
+            return ""
 
         word_text = ""
         for character_span in character_spans:

From 2d9054d2f181a213ed03dc377fc76f7c44082726 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Wed, 8 Jan 2025 11:56:33 +0100
Subject: [PATCH 12/17] Fix empty df scenario

---
 test_unstructured/partition/pdf_image/test_ocr.py     | 11 +++++++++++
 .../partition/utils/ocr_models/tesseract_ocr.py       |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index fe04d82c55..85fc5f6d3e 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -551,3 +551,14 @@ def test_hocr_to_dataframe():
     assert df["width"].iloc[0] == 60
     assert df["height"].iloc[0] == 13
     assert df["text"].iloc[0] == "word"
+
+
+def test_hocr_to_dataframe_when_no_prediction_empty_df():
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")
+
+    assert df.shape == (0, 5)
+    assert "left" in df.columns
+    assert "top" in df.columns
+    assert "width" in df.columns
+    assert "text" in df.columns
+    assert "text" in df.columns
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 17589df06d..8668dec1fc 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -132,7 +132,7 @@ def hocr_to_dataframe(
                         "text": text,
                     }
                 )
-        ocr_df = pd.DataFrame(df_entries)
+        ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"])
         return ocr_df
 
     @staticmethod

From a61aa8583c082aeb617460cb0609726b19fc2142 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Wed, 8 Jan 2025 13:22:28 +0100
Subject: [PATCH 13/17] fix unit test

---
 test_unstructured/partition/pdf_image/test_pdf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 9b1b8de6e1..200edf3e2a 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
     [
         (
             PartitionStrategy.HI_RES,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,

From 1611a61ec8e539d273c6c21d12da676644691ad5 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Wed, 8 Jan 2025 14:23:14 +0100
Subject: [PATCH 14/17] Fix unittests

---
 test_unstructured/partition/pdf_image/test_ocr.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index 85fc5f6d3e..e9982810a0 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -72,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
 
 def test_get_ocr_layout_from_image_tesseract(monkeypatch):
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -446,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
     monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
     monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],

From cee5440b43943a96a580faf226da73e27b586546 Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Wed, 8 Jan 2025 16:46:31 +0100
Subject: [PATCH 15/17] Set default threshold

---
 unstructured/partition/utils/config.py                   | 2 +-
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index 291ae1b6a3..43489df74f 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
     @property
     def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int:
         """Tesseract predictions with confidence below this threshold are ignored"""
-        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
+        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.85)
 
     @property
     def GOOGLEVISION_API_ENDPOINT(self) -> str:
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 8668dec1fc..58572d1a72 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -92,7 +92,7 @@ def image_to_data_with_character_confidence_filter(
         image: np.ndarray,
         lang: str = "eng",
         config: str = "",
-        character_confidence_threshold: float = 0.5,
+        character_confidence_threshold: float = 0.85,
     ) -> pd.DataFrame:
         hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
             image,
@@ -104,7 +104,7 @@ def image_to_data_with_character_confidence_filter(
         return ocr_df
 
     def hocr_to_dataframe(
-        self, hocr: str, character_confidence_threshold: float = 0.0
+        self, hocr: str, character_confidence_threshold: float = 0.85
     ) -> pd.DataFrame:
         soup = BeautifulSoup(hocr, "html.parser")
         word_spans = soup.find_all("span", class_="ocrx_word")

From c5b657051cde60050f024db01aa214a86da561ce Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Thu, 9 Jan 2025 13:06:55 +0100
Subject: [PATCH 16/17] Set default threshold to 0

---
 unstructured/partition/utils/config.py                   | 2 +-
 unstructured/partition/utils/ocr_models/tesseract_ocr.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index 43489df74f..291ae1b6a3 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -99,7 +99,7 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
     @property
     def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> int:
         """Tesseract predictions with confidence below this threshold are ignored"""
-        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.85)
+        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
 
     @property
     def GOOGLEVISION_API_ENDPOINT(self) -> str:
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 58572d1a72..49a572d5fb 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -92,7 +92,7 @@ def image_to_data_with_character_confidence_filter(
         image: np.ndarray,
         lang: str = "eng",
         config: str = "",
-        character_confidence_threshold: float = 0.85,
+        character_confidence_threshold: float = 0.0,
     ) -> pd.DataFrame:
         hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
             image,
@@ -104,7 +104,7 @@ def image_to_data_with_character_confidence_filter(
         return ocr_df
 
     def hocr_to_dataframe(
-        self, hocr: str, character_confidence_threshold: float = 0.85
+        self, hocr: str, character_confidence_threshold: float = 0.0
     ) -> pd.DataFrame:
         soup = BeautifulSoup(hocr, "html.parser")
         word_spans = soup.find_all("span", class_="ocrx_word")

From 013a3516cbdc2405185a5aa22a56f7832df7fa0a Mon Sep 17 00:00:00 2001
From: Kamil Plucinski <kamil.plucinski@deepsense.ai>
Date: Mon, 13 Jan 2025 12:54:09 +0100
Subject: [PATCH 17/17] Refactor

---
 .../partition/utils/ocr_models/tesseract_ocr.py       | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 49a572d5fb..6e2c96da00 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -127,12 +127,17 @@ def hocr_to_dataframe(
                     {
                         "left": left,
                         "top": top,
-                        "width": right - left,
-                        "height": bottom - top,
+                        "right": right,
+                        "bottom": bottom,
                         "text": text,
                     }
                 )
-        ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "width", "height", "text"])
+        ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "right", "bottom", "text"])
+
+        ocr_df["width"] = ocr_df["right"] - ocr_df["left"]
+        ocr_df["height"] = ocr_df["bottom"] - ocr_df["top"]
+
+        ocr_df = ocr_df.drop(columns=["right", "bottom"])
         return ocr_df
 
     @staticmethod