Skip to content

Commit

Permalink
[RELEASE] iText 7 pdfOcr - 1.0.1
Browse files Browse the repository at this point in the history
https://git.itextsupport.com/

* release/1.0.1:
  [RELEASE] 1.0.1-SNAPSHOT -> 1.0.1
  Synchronized collections to avoid test failures
  Remove unused fields
  Improvements in word bbox calculation
  Depend on iText snapshot. Remove redundant methods from ReflectionUtils
  [RELEASE] Update dependency versions
  Refactor the path to tessdata so that it doesn't end with a slash
  Remove irrelevant comment
  Minor Javadoc fix
  [RELEASE] Update dependency versions
  • Loading branch information
iText-CI committed Jul 13, 2020
2 parents f271a87 + 1d10246 commit e413823
Show file tree
Hide file tree
Showing 22 changed files with 330 additions and 154 deletions.
2 changes: 1 addition & 1 deletion pdfocr-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<groupId>com.itextpdf</groupId>
<artifactId>pdfocr-root</artifactId>
<version>1.0.0</version>
<version>1.0.1</version>
</parent>

<artifactId>pdfocr-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ public final OcrPdfCreatorProperties setTitle(

/**
* Returns FontProvider that was set previously or if it is
* <code>null<code/> a new instance of {@link PdfOcrFontProvider} is
* <code>null</code> a new instance of {@link PdfOcrFontProvider} is
* returned.
* @return {@link com.itextpdf.layout.font.FontProvider} object
*/
Expand Down
2 changes: 1 addition & 1 deletion pdfocr-tesseract4/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<parent>
<groupId>com.itextpdf</groupId>
<artifactId>pdfocr-root</artifactId>
<version>1.0.0</version>
<version>1.0.1</version>
</parent>

<artifactId>pdfocr-tesseract4</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,58 +22,25 @@ This file is part of the iText (R) project.
*/
package com.itextpdf.pdfocr.tesseract4;

import com.itextpdf.io.util.MessageFormatUtil;
import com.itextpdf.kernel.Version;
import com.itextpdf.kernel.counter.ContextManager;

import java.lang.reflect.AccessibleObject;
import java.lang.reflect.Array;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

final class ReflectionUtils {

// Logger used by this class to report reflection failures.
private static final Logger logger = LoggerFactory.getLogger(ReflectionUtils.class);

// Package prefixes; KERNEL_PACKAGE is concatenated with CONTEXT_MANAGER to form a fully qualified class name.
// NOTE(review): LICENSEKEY_PACKAGE is presumably combined with the LICENSEKEY* names the same way -
// its usage is outside the visible part of the file.
private static final String KERNEL_PACKAGE = "com.itextpdf.kernel.";
private static final String LICENSEKEY_PACKAGE = "com.itextpdf.licensekey.";

// Class names relative to the package prefixes above.
private static final String CONTEXT_MANAGER = "counter.ContextManager";
private static final String LICENSEKEY = "LicenseKey";
private static final String LICENSEKEY_PRODUCT = "LicenseKeyProduct";
private static final String LICENSEKEY_FEATURE = "LicenseKeyProductFeature";

// Names of methods that are looked up reflectively.
// NOTE(review): SCHEDULED_CHECK is presumably used by scheduledCheck(); confirm against the full file.
private static final String REGISTER_GENERIC_CONTEXT = "registerGenericContext";
private static final String SCHEDULED_CHECK = "scheduledCheck";

// Message text for the "no license loaded" case of the pdfOcr-Tesseract4 product.
private static final String NO_PDFOCR_TESSERACT4 = "No license loaded for product pdfOcr-Tesseract4. Please use LicenseKey.loadLicense(...) to load one.";

// Caches filled by findClass and findMethod so each class/method is resolved reflectively only once.
private static Map<String, Class<?>> cachedClasses = new HashMap<>();
private static Map<MethodSignature, AccessibleObject> cachedMethods = new HashMap<>();

// Runs once when the class is loaded: reflectively invokes
// ContextManager#registerGenericContext(Collection, Collection) twice, first for the
// "com.itextpdf.pdfocr" namespace and then for "com.itextpdf.pdfocr.tesseract4", both
// mapped to "com.itextpdf.pdfocr.tesseract4".
static {
    try {
        ContextManager contextManager = ContextManager.getInstance();
        callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager,
                new Class[] {Collection.class, Collection.class},
                Collections.singletonList("com.itextpdf.pdfocr"),
                Collections.singletonList("com.itextpdf.pdfocr.tesseract4"));
        callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager,
                new Class[] {Collection.class, Collection.class},
                Collections.singletonList("com.itextpdf.pdfocr.tesseract4"),
                Collections.singletonList("com.itextpdf.pdfocr.tesseract4"));
    } catch (Exception e) {
        // A failure here must not prevent the class from loading, so it is only logged.
        // The throwable is passed along so the exception type and stack trace are not lost.
        logger.error(e.getMessage(), e);
    }
}

/**
 * Private no-op constructor: this final class is not meant to be instantiated from outside.
 */
private ReflectionUtils() {
}

Expand Down Expand Up @@ -116,52 +83,6 @@ public static void scheduledCheck() {
}
}

/**
 * Looks up a method by class name, method name and parameter types and invokes it reflectively.
 *
 * <p>A missing class, a missing method or illegal arguments are logged as warnings and result
 * in {@code null}. Any other exception is rethrown wrapped in a {@link RuntimeException}.
 *
 * @param className fully qualified name of the class declaring the method
 * @param methodName name of the method to invoke
 * @param target object the method is invoked on
 * @param parameterTypes parameter types identifying the method overload
 * @param args arguments passed to the method
 * @return the value returned by the invoked method, or {@code null} if the call could not be made
 */
private static Object callMethod(String className, String methodName, Object target, Class[] parameterTypes,
        Object... args) {
    Object result = null;
    try {
        result = findMethod(className, methodName, parameterTypes).invoke(target, args);
    } catch (NoSuchMethodException e) {
        logger.warn(MessageFormatUtil.format("Cannot find method {0} for class {1}", methodName, className));
    } catch (ClassNotFoundException e) {
        logger.warn(MessageFormatUtil.format("Cannot find class {0}", className));
    } catch (IllegalArgumentException e) {
        logger.warn(MessageFormatUtil
                .format("Illegal arguments passed to {0}#{1} method call: {2}", className, methodName,
                        e.getMessage()));
    } catch (Exception e) {
        // Checked exceptions are converted to an unchecked RuntimeException (Java-specific).
        //
        // An exception thrown by the kernel utilities at this point is treated as unrecoverable
        // for the callers (pdfOcr methods). Wrapping at a higher level might be more suitable,
        // but it is done here for convenience.
        throw new RuntimeException(e.toString(), e);
    }
    return result;
}

/**
 * Returns the declared method matching the given signature, resolving it reflectively on the
 * first request and serving it from {@code cachedMethods} afterwards.
 *
 * @param className fully qualified name of the declaring class
 * @param methodName name of the method
 * @param parameterTypes parameter types of the method
 * @return the method, already made accessible
 * @throws NoSuchMethodException if the class declares no such method
 * @throws ClassNotFoundException if the class cannot be found
 */
private static Method findMethod(String className, String methodName, Class[] parameterTypes)
        throws NoSuchMethodException, ClassNotFoundException {
    final MethodSignature signature = new MethodSignature(className, parameterTypes, methodName);
    final AccessibleObject cached = cachedMethods.get(signature);
    if (cached != null) {
        return (Method) cached;
    }
    final Method resolved = findClass(className).getDeclaredMethod(methodName, parameterTypes);
    resolved.setAccessible(true);
    cachedMethods.put(signature, resolved);
    return resolved;
}

/**
 * Returns the class with the given name, loading it on the first request and serving it
 * from {@code cachedClasses} afterwards.
 *
 * @param className fully qualified class name
 * @return the resolved class
 * @throws ClassNotFoundException if the class cannot be found
 */
private static Class<?> findClass(String className) throws ClassNotFoundException {
    final Class<?> cached = cachedClasses.get(className);
    if (cached != null) {
        return cached;
    }
    final Class<?> loaded = getClass(className);
    cachedClasses.put(className, loaded);
    return loaded;
}

/**
 * Loads a class by its fully qualified name via {@link Class#forName(String)}.
 *
 * @param className fully qualified class name
 * @return the loaded class
 * @throws ClassNotFoundException if the class cannot be found
 */
private static Class<?> getClass(String className) throws ClassNotFoundException {
    final Class<?> resolved = Class.forName(className);
    return resolved;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,10 @@ public class Tesseract4LogMessageConstant {
+ "temporary directory: {0}";
public static final String CANNOT_CONVERT_IMAGE_TO_PIX =
"Cannot convert image to pix: {0}";
public static final String CANNOT_PARSE_NODE_BBOX =
"Cannot parse node BBox, defaults to 0, 0, 0, 0. Node: {0}";


/**
 * Private no-op constructor: this class is not meant to be instantiated from outside.
 */
private Tesseract4LogMessageConstant() {
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ This file is part of the iText (R) project.
import com.itextpdf.styledxmlparser.jsoup.Jsoup;
import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
import com.itextpdf.styledxmlparser.jsoup.select.Elements;

import java.io.File;
Expand Down Expand Up @@ -60,6 +61,27 @@ public class TesseractHelper {
private static final Logger LOGGER = LoggerFactory
.getLogger(TesseractHelper.class);

/**
 * Patterns for matching hOCR element bboxes.
 *
 * BBOX_PATTERN checks that a title contains the word "bbox" followed by four
 * whitespace-separated integers; BBOX_COORDINATE_PATTERN captures four
 * whitespace-separated integers as groups 1 to 4.
 *
 * NOTE(review): the leading ".*" of BBOX_COORDINATE_PATTERN is greedy, so if a title
 * contained more than one run of four whitespace-separated integers the last run would
 * be captured - confirm that titles only ever contain one such run.
 */
private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
private static final Pattern BBOX_COORDINATE_PATTERN = Pattern
.compile(
".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");

/**
 * Indices in array representing bbox, in the order in which the four numbers
 * are captured from the title.
 *
 * NOTE(review): hOCR writes a bbox as "x0 y0 x1 y1"; confirm that the BOTTOM/TOP
 * naming matches the coordinate system expected by consumers of the list.
 */
private static final int LEFT_IDX = 0;
private static final int BOTTOM_IDX = 1;
private static final int RIGHT_IDX = 2;
private static final int TOP_IDX = 3;

/**
 * Size of the array containing bbox (number of coordinates read per bbox).
 */
private static final int BBOX_ARRAY_SIZE = 4;

/**
* Creates a new {@link TesseractHelper} instance.
*/
Expand All @@ -86,23 +108,20 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
throws IOException {
Map<Integer, List<TextInfo>> imageData =
new LinkedHashMap<Integer, List<TextInfo>>();
Map<String, Node> unparsedBBoxes = new LinkedHashMap<>();

for (File inputFile : inputFiles) {
if (inputFile != null
&& Files.exists(
java.nio.file.Paths
.get(inputFile.getAbsolutePath()))) {
java.nio.file.Paths
.get(inputFile.getAbsolutePath()))) {
FileInputStream fileInputStream =
new FileInputStream(inputFile.getAbsolutePath());
Document doc = Jsoup.parse(fileInputStream,
java.nio.charset.StandardCharsets.UTF_8.name(),
inputFile.getAbsolutePath());
Elements pages = doc.getElementsByClass("ocr_page");

Pattern bboxPattern = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
Pattern bboxCoordinatePattern = Pattern
.compile(
".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
List<String> searchedClasses = TextPositioning.BY_LINES
.equals(textPositioning)
? Arrays.<String>asList("ocr_line", "ocr_caption")
Expand All @@ -124,26 +143,11 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
}
}
for (Element obj : objects) {
String value = obj.attr("title");
Matcher bboxMatcher = bboxPattern.matcher(value);
if (bboxMatcher.matches()) {
Matcher bboxCoordinateMatcher =
bboxCoordinatePattern
.matcher(bboxMatcher.group());
if (bboxCoordinateMatcher.matches()) {
List<Float> coordinates =
new ArrayList<Float>();
for (int i = 0; i < 4; i++) {
String coord = bboxCoordinateMatcher
.group(i + 1);
coordinates
.add(Float.parseFloat(coord));
}

textData.add(new TextInfo(obj.text(),
coordinates));
}
}
List<Float> coordinates = getAlignedBBox(obj,
textPositioning,
unparsedBBoxes);
textData.add(new TextInfo(obj.text(),
coordinates));
}
}
if (textData.size() > 0) {
Expand All @@ -157,9 +161,97 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
fileInputStream.close();
}
}
for (Node node : unparsedBBoxes.values()) {
LOGGER.warn(MessageFormatUtil.format(
Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX,
node.toString()
));
}
return imageData;
}

/**
 * Reads the bbox of an element and, for the word-based positioning modes, aligns it
 * against the bbox of the element's parent.
 *
 * <p>For {@code BY_WORDS_AND_LINES} the bottom and top coordinates are taken over from the
 * parent bbox. For both {@code BY_WORDS} and {@code BY_WORDS_AND_LINES} the left and right
 * coordinates are then checked by {@code detectAndFixBrokenBBoxes}. For any other mode
 * the parsed bbox is returned unchanged.
 *
 * @param object element whose bbox is requested
 * @param textPositioning positioning mode that decides whether alignment is applied
 * @param unparsedBBoxes map collecting nodes whose bbox could not be parsed, keyed by their id attribute
 * @return list of four bbox coordinates
 */
static List<Float> getAlignedBBox(Element object,
        TextPositioning textPositioning,
        Map<String, Node> unparsedBBoxes) {
    final List<Float> elementBBox = parseBBox(object, unparsedBBoxes);
    final boolean inheritVerticalFromLine =
            TextPositioning.BY_WORDS_AND_LINES == textPositioning;
    if (!inheritVerticalFromLine && TextPositioning.BY_WORDS != textPositioning) {
        // No alignment is applied for the remaining modes.
        return elementBBox;
    }

    final Node parentLine = object.parent();
    final List<Float> lineBBox = parseBBox(parentLine, unparsedBBoxes);
    if (inheritVerticalFromLine) {
        elementBBox.set(BOTTOM_IDX, lineBBox.get(BOTTOM_IDX));
        elementBBox.set(TOP_IDX, lineBBox.get(TOP_IDX));
    }
    detectAndFixBrokenBBoxes(object, elementBBox, lineBBox, unparsedBBoxes);
    return elementBBox;
}

/**
 * Parses the bbox stored in the "title" attribute of a node.
 *
 * <p>If the title does not contain a parsable bbox, a bbox of {@code 0, 0, 0, 0} is returned
 * and the node is recorded in {@code unparsedBBoxes} under its "id" attribute, unless an
 * entry for that id is already present.
 *
 * @param node element containing bbox
 * @param unparsedBBoxes map of element ids to the nodes whose bboxes could not be parsed
 * @return parsed bbox as a list of four coordinates
 */
static List<Float> parseBBox(Node node, Map<String, Node> unparsedBBoxes) {
    final Matcher titleMatcher = BBOX_PATTERN.matcher(node.attr("title"));
    if (titleMatcher.matches()) {
        final Matcher coordinateMatcher =
                BBOX_COORDINATE_PATTERN.matcher(titleMatcher.group());
        if (coordinateMatcher.matches()) {
            final List<Float> parsed = new ArrayList<>();
            // Capturing groups are 1-based, one group per coordinate.
            for (int group = 1; group <= BBOX_ARRAY_SIZE; group++) {
                parsed.add(Float.parseFloat(coordinateMatcher.group(group)));
            }
            return parsed;
        }
    }

    // Nothing could be parsed: remember the node once per id and fall back to an all-zero bbox.
    final String id = node.attr("id");
    if (id != null && !unparsedBBoxes.containsKey(id)) {
        unparsedBBoxes.put(id, node);
    }
    return Arrays.asList(0f, 0f, 0f, 0f);
}

/**
 * Sometimes an hOCR file contains broken bboxes whose horizontal coordinates lie outside
 * of the bbox of the enclosing line. This method attempts to detect and fix them in place.
 *
 * <p>A left coordinate outside of the line's horizontal range is replaced by the right
 * coordinate of the previous sibling element, or by the line's left coordinate if there
 * is no previous sibling. A right coordinate outside of that range is replaced by the
 * left coordinate of the next sibling element, or by the line's right coordinate if there
 * is no next sibling.
 *
 * @param object element the bbox belongs to
 * @param coordinates bbox of the element; modified in place
 * @param lineCoordinates bbox of the enclosing line
 * @param unparsedBBoxes map collecting nodes whose bbox could not be parsed
 */
static void detectAndFixBrokenBBoxes(Element object, List<Float> coordinates,
        List<Float> lineCoordinates,
        Map<String, Node> unparsedBBoxes) {
    final boolean leftOutsideLine =
            coordinates.get(LEFT_IDX) < lineCoordinates.get(LEFT_IDX)
                    || coordinates.get(LEFT_IDX) > lineCoordinates.get(RIGHT_IDX);
    if (leftOutsideLine) {
        final Element previous = object.previousElementSibling();
        final Float fixedLeft = previous == null
                ? lineCoordinates.get(LEFT_IDX)
                : parseBBox(previous, unparsedBBoxes).get(RIGHT_IDX);
        coordinates.set(LEFT_IDX, fixedLeft);
    }

    final boolean rightOutsideLine =
            coordinates.get(RIGHT_IDX) > lineCoordinates.get(RIGHT_IDX)
                    || coordinates.get(RIGHT_IDX) < lineCoordinates.get(LEFT_IDX);
    if (rightOutsideLine) {
        final Element next = object.nextElementSibling();
        final Float fixedRight = next == null
                ? lineCoordinates.get(RIGHT_IDX)
                : parseBBox(next, unparsedBBoxes).get(LEFT_IDX);
        coordinates.set(RIGHT_IDX, fixedRight);
    }
}

/**
* Deletes file using provided path.
*
Expand Down Expand Up @@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) {
* @param data text data in required format as {@link java.lang.String}
*/
static void writeToTextFile(final String path,
final String data) {
final String data) {
try (Writer writer = new OutputStreamWriter(new FileOutputStream(path),
StandardCharsets.UTF_8)) {
writer.write(data);
Expand All @@ -228,7 +320,7 @@ static void writeToTextFile(final String path,
* @throws Tesseract4OcrException if provided command failed
*/
static void runCommand(final String execPath,
final List<String> paramsList) throws Tesseract4OcrException {
final List<String> paramsList) throws Tesseract4OcrException {
try {
String params = String.join(" ", paramsList);
boolean cmdSucceeded = SystemUtil
Expand All @@ -251,4 +343,4 @@ static void runCommand(final String execPath,
.TESSERACT_FAILED);
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,9 @@ public enum TextPositioning {
/**
* Text will be located by words retrieved from hocr file.
*/
BY_WORDS
}
BY_WORDS,
/**
* Similar to BY_WORDS mode, but top and bottom of word BBox are inherited from line.
*/
BY_WORDS_AND_LINES,
}
Loading

0 comments on commit e413823

Please sign in to comment.