Skip to content

Commit

Permalink
Character confidence threshold (#3860)
Browse files Browse the repository at this point in the history
This change adds the ability to filter out characters predicted by
Tesseract with low confidence scores.

Some notes:
- I intentionally left it disabled by default, although I think a low threshold
(e.g. 0.9-0.95 for Tesseract) could be a safe choice
- I wanted to use the character bboxes and combine them into a word bbox
later. However, a bug in Tesseract causes it to return incorrect character
bboxes in some specific scenarios (unit tests caught it 🥳 ). See the comment
in the code for more details
  • Loading branch information
plutasnyy authored Jan 10, 2025
1 parent 8378c26 commit a56dc4e
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 15 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
## 0.16.13-dev0
## 0.16.13-dev1

### Enhancements
- **Add character-level filtering for tesseract output**. It is controllable via the `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.

### Features

### Fixes

- **Fix NLTK Download** to use nltk assets in docker image
- removed the ability to automatically download nltk package if missing

## 0.16.12

### Enhancements
Expand Down
86 changes: 82 additions & 4 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import pytest
import unstructured_pytesseract
from bs4 import BeautifulSoup, Tag
from pdf2image.exceptions import PDFPageCountError
from PIL import Image, UnidentifiedImageError
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
Expand Down Expand Up @@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):

def test_get_ocr_layout_from_image_tesseract(monkeypatch):
monkeypatch.setattr(
unstructured_pytesseract,
"image_to_data",
OCRAgentTesseract,
"image_to_data_with_character_confidence_filter",
lambda *args, **kwargs: pd.DataFrame(
{
"left": [10, 20, 30, 0],
Expand Down Expand Up @@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
monkeypatch.setattr(
unstructured_pytesseract,
"image_to_data",
OCRAgentTesseract,
"image_to_data_with_character_confidence_filter",
lambda *args, **kwargs: pd.DataFrame(
{
"left": [10, 20, 30, 0],
Expand Down Expand Up @@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
# Check if the final layout contains both original elements and OCR-derived elements
assert all(element in final_layout for element in mock_out_layout)
assert any(element in final_layout for element in ocr_elements)


def _create_hocr_word_span(
    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
) -> Tag:
    """Build a synthetic hOCR ``ocrx_word`` span with one ``ocrx_cinfo`` child per character.

    Each item of `characters` is a ``(character, x_conf)`` pair written verbatim into the child
    span; `word_bbox` supplies the four numbers stored in the word's ``bbox`` title field.
    """
    word_markup = (
        "<span class='ocrx_word' title='"
        f"bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}"
        "; x_wconf 64'></span>"
    )
    word_span = BeautifulSoup(word_markup, "html.parser").span

    for char, x_conf in characters:
        char_markup = (
            "\n"
            f"<span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>"
            "\n"
        )
        # Only the parsed <span> element is attached, not the surrounding whitespace.
        word_span.append(BeautifulSoup(char_markup, "html.parser").span)

    return word_span


def test_extract_word_from_hocr():
    """Characters survive only when their confidence reaches the requested threshold."""
    word_span = _create_hocr_word_span(
        [
            ("w", "99.0"),
            ("o", "98.5"),
            ("r", "97.5"),
            ("d", "96.0"),
            ("!", "50.0"),
            ("@", "45.0"),
        ],
        (10, 9, 70, 22),
    )

    # (threshold on a 0-1 scale, text expected to remain)
    expectations = [
        (0.0, "word!@"),
        (0.960, "word"),
        (0.990, "w"),
        (0.999, ""),
    ]
    for threshold, expected_text in expectations:
        text = OCRAgentTesseract.extract_word_from_hocr(word_span, threshold)
        assert text == expected_text


def test_hocr_to_dataframe():
    """A single hOCR word becomes one row carrying the word bbox and the filtered text."""
    hocr = str(
        _create_hocr_word_span(
            [
                ("w", "99.0"),
                ("o", "98.5"),
                ("r", "97.5"),
                ("d", "96.0"),
                ("!", "50.0"),
                ("@", "45.0"),
            ],
            (10, 9, 70, 22),
        )
    )

    df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)

    assert df.shape == (1, 5)
    # width/height are derived from the bbox corners: 70 - 10 and 22 - 9.
    expected_first_row = {"left": 10, "top": 9, "width": 60, "height": 13, "text": "word"}
    for column, expected_value in expected_first_row.items():
        assert df[column].iloc[0] == expected_value


def test_hocr_to_dataframe_when_no_prediction_empty_df():
    """An hOCR input without any word yields an empty frame that still has all five columns."""
    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")

    assert df.shape == (0, 5)
    # Check every column by name, including "height", so a missing or renamed column is
    # reported explicitly rather than only through the shape assertion.
    for column in ("left", "top", "width", "height", "text"):
        assert column in df.columns
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
[
(
PartitionStrategy.HI_RES,
"unstructured_pytesseract.image_to_data",
"unstructured_pytesseract.image_to_pdf_or_hocr",
),
(
PartitionStrategy.OCR_ONLY,
"unstructured_pytesseract.image_to_data",
"unstructured_pytesseract.image_to_pdf_or_hocr",
),
(
PartitionStrategy.OCR_ONLY,
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.13-dev0" # pragma: no cover
__version__ = "0.16.13-dev1" # pragma: no cover
5 changes: 5 additions & 0 deletions unstructured/partition/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
"""optimum text height for tesseract OCR"""
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)

@property
def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
    """Tesseract predictions with confidence below this threshold are ignored.

    The value is read with `_get_float` and defaults to 0.0.
    """
    return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)

@property
def GOOGLEVISION_API_ENDPOINT(self) -> str:
"""API endpoint to use for Google Vision"""
Expand Down
85 changes: 79 additions & 6 deletions unstructured/partition/utils/ocr_models/tesseract_ocr.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from __future__ import annotations

import os
import re
from typing import TYPE_CHECKING, List

import cv2
import numpy as np
import pandas as pd
import unstructured_pytesseract
from bs4 import BeautifulSoup, Tag
from PIL import Image as PILImage
from unstructured_pytesseract import Output

from unstructured.logger import trace_logger
from unstructured.partition.utils.config import env_config
Expand Down Expand Up @@ -47,10 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:

trace_logger.detail("Processing entire page OCR with tesseract...")
zoom = 1
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
np.array(image),
lang=self.language,
output_type=Output.DATAFRAME,
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()

Expand All @@ -76,17 +77,89 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
max_zoom,
)
ocr_df = unstructured_pytesseract.image_to_data(
ocr_df = self.image_to_data_with_character_confidence_filter(
np.array(zoom_image(image, zoom)),
lang=self.language,
output_type=Output.DATAFRAME,
character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()

ocr_regions = self.parse_data(ocr_df, zoom=zoom)

return ocr_regions

def image_to_data_with_character_confidence_filter(
    self,
    image: np.ndarray,
    lang: str = "eng",
    config: str = "",
    character_confidence_threshold: float = 0.0,
) -> pd.DataFrame:
    """Run tesseract in hOCR mode and convert the output into a word-level DataFrame.

    ``-c hocr_char_boxes=1`` is always prepended to `config`; the returned hOCR is then handed
    to `hocr_to_dataframe` together with `character_confidence_threshold`.
    """
    hocr_config = "-c hocr_char_boxes=1 " + config
    hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
        image, lang=lang, config=hocr_config, extension="hocr"
    )
    return self.hocr_to_dataframe(hocr, character_confidence_threshold)

def hocr_to_dataframe(
    self, hocr: str, character_confidence_threshold: float = 0.0
) -> pd.DataFrame:
    """Convert an hOCR document into a DataFrame with one row per recognised word.

    Every ``ocrx_word`` span contributes a row with ``left``, ``top``, ``width``, ``height``
    and ``text``. A word is skipped when its filtered text is empty or when its ``title``
    carries no ``bbox`` entry. With no usable word the frame is empty but keeps its columns.
    """
    columns = ["left", "top", "width", "height", "text"]
    rows = []

    for word_span in BeautifulSoup(hocr, "html.parser").find_all("span", class_="ocrx_word"):
        bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_span.get("title", ""))

        # Note: the word bbox is used rather than merging the character bboxes because of a
        # tesseract bug: character bboxes can fall outside the word bbox, and they have 0
        # height or width when text is horizontal.
        text = self.extract_word_from_hocr(
            word=word_span, character_confidence_threshold=character_confidence_threshold
        )
        if not (text and bbox_match):
            continue

        left, top, right, bottom = (int(value) for value in bbox_match.groups())
        rows.append(
            {
                "left": left,
                "top": top,
                "width": right - left,
                "height": bottom - top,
                "text": text,
            }
        )

    return pd.DataFrame(rows, columns=columns)

@staticmethod
def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
    """Extracts a word from an hOCR word tag, filtering out characters with low confidence.

    Args:
        word: hOCR word tag whose ``ocrx_cinfo`` child spans each hold one character, with the
            confidence stored as ``x_conf <number>`` (0-100 scale) in the span's ``title``.
        character_confidence_threshold: minimum confidence on a 0-1 scale; a character is kept
            when ``x_conf / 100`` is greater than or equal to this value.

    Returns:
        The kept characters concatenated in document order, or ``""`` when the word has no
        character spans or none of them qualifies.
    """
    kept_characters = []
    for character_span in word.find_all("span", class_="ocrx_cinfo"):
        char = character_span.text

        # Accept any plain decimal number: a confidence written without a fractional part
        # (e.g. ``x_conf 100``) or in exponent form must not cause the character to be dropped.
        conf_match = re.search(
            r"x_conf (\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)", character_span.get("title", "")
        )

        # Spans without text or without a parsable confidence are skipped.
        if not (char and conf_match):
            continue

        character_probability = float(conf_match.group(1)) / 100
        if character_probability >= character_confidence_threshold:
            kept_characters.append(char)

    return "".join(kept_characters)

@requires_dependencies("unstructured_inference")
def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
from unstructured.partition.pdf_image.inference_utils import (
Expand Down

0 comments on commit a56dc4e

Please sign in to comment.