[RELEASE] iText 7 pdfOcr - 1.0.1

https://git.itextsupport.com/
* release/1.0.1:
  [RELEASE] 1.0.1-SNAPSHOT -> 1.0.1
  Synchronized collections to avoid test failures
  Remove unused fields
  Improvements in word bbox calculation
  Depend on iText snapshot.
  Remove redundant methods from ReflectionUtils
  [RELEASE] Update dependency versions
  Refactor the path to tessdata so that it doesn't end with a slash
  Remove irrelevant comment
  Minor Javadoc fix
  [RELEASE] Update dependency versions
itext · Jul 13, 2020 · e413823 · e413823
2 parents f271a87 + 1d10246
commit e413823
Show file tree

Hide file tree

Showing 22 changed files with 330 additions and 154 deletions.
diff --git a/pdfocr-api/pom.xml b/pdfocr-api/pom.xml
@@ -5,7 +5,7 @@
   <parent>
     <groupId>com.itextpdf</groupId>
     <artifactId>pdfocr-root</artifactId>
-    <version>1.0.0</version>
+    <version>1.0.1</version>
   </parent>
 
   <artifactId>pdfocr-api</artifactId>

diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java
@@ -282,7 +282,7 @@ public final OcrPdfCreatorProperties setTitle(
 
     /**
      * Returns FontProvider that was set previously or if it is
-     * <code>null<code/> a new instance of {@link PdfOcrFontProvider} is
+     * <code>null</code> a new instance of {@link PdfOcrFontProvider} is
      * returned.
      * @return {@link com.itextpdf.layout.font.FontProvider} object
      */

diff --git a/pdfocr-tesseract4/pom.xml b/pdfocr-tesseract4/pom.xml
@@ -5,7 +5,7 @@
   <parent>
     <groupId>com.itextpdf</groupId>
     <artifactId>pdfocr-root</artifactId>
-    <version>1.0.0</version>
+    <version>1.0.1</version>
   </parent>
 
   <artifactId>pdfocr-tesseract4</artifactId>

diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java
@@ -22,58 +22,25 @@ This file is part of the iText (R) project.
  */
 package com.itextpdf.pdfocr.tesseract4;
 
-import com.itextpdf.io.util.MessageFormatUtil;
 import com.itextpdf.kernel.Version;
-import com.itextpdf.kernel.counter.ContextManager;
 
-import java.lang.reflect.AccessibleObject;
 import java.lang.reflect.Array;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.Method;
 import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 final class ReflectionUtils {
 
-    private static final Logger logger = LoggerFactory.getLogger(ReflectionUtils.class);
-
-    private static final String KERNEL_PACKAGE = "com.itextpdf.kernel.";
     private static final String LICENSEKEY_PACKAGE = "com.itextpdf.licensekey.";
 
-    private static final String CONTEXT_MANAGER = "counter.ContextManager";
     private static final String LICENSEKEY = "LicenseKey";
     private static final String LICENSEKEY_PRODUCT = "LicenseKeyProduct";
     private static final String LICENSEKEY_FEATURE = "LicenseKeyProductFeature";
 
-    private static final String REGISTER_GENERIC_CONTEXT = "registerGenericContext";
     private static final String SCHEDULED_CHECK = "scheduledCheck";
 
     private static final String NO_PDFOCR_TESSERACT4 = "No license loaded for product pdfOcr-Tesseract4. Please use LicenseKey.loadLicense(...) to load one.";
 
-    private static Map<String, Class<?>> cachedClasses = new HashMap<>();
-    private static Map<MethodSignature, AccessibleObject> cachedMethods = new HashMap<>();
-
-    static {
-        try {
-            ContextManager contextManager = ContextManager.getInstance();
-            callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager,
-                    new Class[] {Collection.class, Collection.class},
-                    Collections.singletonList("com.itextpdf.pdfocr"),
-                    Collections.singletonList("com.itextpdf.pdfocr.tesseract4"));
-            callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager,
-                    new Class[] {Collection.class, Collection.class},
-                    Collections.singletonList("com.itextpdf.pdfocr.tesseract4"),
-                    Collections.singletonList("com.itextpdf.pdfocr.tesseract4"));
-        } catch (Exception e) {
-            logger.error(e.getMessage());
-        }
-    }
-
     private ReflectionUtils() {
     }
 
@@ -116,52 +83,6 @@ public static void scheduledCheck() {
         }
     }
 
-    private static Object callMethod(String className, String methodName, Object target, Class[] parameterTypes,
-            Object... args) {
-        try {
-            Method method = findMethod(className, methodName, parameterTypes);
-            return method.invoke(target, args);
-        } catch (NoSuchMethodException e) {
-            logger.warn(MessageFormatUtil.format("Cannot find method {0} for class {1}", methodName, className));
-        } catch (ClassNotFoundException e) {
-            logger.warn(MessageFormatUtil.format("Cannot find class {0}", className));
-        } catch (IllegalArgumentException e) {
-            logger.warn(MessageFormatUtil
-                    .format("Illegal arguments passed to {0}#{1} method call: {2}", className, methodName,
-                            e.getMessage()));
-        } catch (Exception e) {
-            // Converting checked exceptions to unchecked RuntimeException (java-specific comment).
-            //
-            // If kernel utils throws an exception at this point, we consider it as unrecoverable situation for
-            // its callers (pdfOcr methods).
-            // It's might be more suitable to wrap checked exceptions at a bit higher level, but we do it here for
-            // the sake of convenience.
-            throw new RuntimeException(e.toString(), e);
-        }
-        return null;
-    }
-
-    private static Method findMethod(String className, String methodName, Class[] parameterTypes)
-            throws NoSuchMethodException, ClassNotFoundException {
-        MethodSignature tm = new MethodSignature(className, parameterTypes, methodName);
-        Method m = (Method) cachedMethods.get(tm);
-        if (m == null) {
-            m = findClass(className).getDeclaredMethod(methodName, parameterTypes);
-            m.setAccessible(true);
-            cachedMethods.put(tm, m);
-        }
-        return m;
-    }
-
-    private static Class<?> findClass(String className) throws ClassNotFoundException {
-        Class<?> c = cachedClasses.get(className);
-        if (c == null) {
-            c = getClass(className);
-            cachedClasses.put(className, c);
-        }
-        return c;
-    }
-
     private static Class<?> getClass(String className) throws ClassNotFoundException {
         return Class.forName(className);
     }

diff --git a/...tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java b/...tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java
@@ -59,7 +59,10 @@ public class Tesseract4LogMessageConstant {
             + "temporary directory: {0}";
     public static final String CANNOT_CONVERT_IMAGE_TO_PIX =
             "Cannot convert image to pix: {0}";
+    public static final String CANNOT_PARSE_NODE_BBOX =
+            "Cannot parse node BBox, defaults to 0, 0, 0, 0. Node: {0}";
+
 
     private Tesseract4LogMessageConstant() {
     }
-}
+}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java
@@ -28,6 +28,7 @@ This file is part of the iText (R) project.
 import com.itextpdf.styledxmlparser.jsoup.Jsoup;
 import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
 import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
+import com.itextpdf.styledxmlparser.jsoup.nodes.Node;
 import com.itextpdf.styledxmlparser.jsoup.select.Elements;
 
 import java.io.File;
@@ -60,6 +61,27 @@ public class TesseractHelper {
     private static final Logger LOGGER = LoggerFactory
             .getLogger(TesseractHelper.class);
 
+    /**
+     * Patterns for matching hOCR element bboxes.
+     */
+    private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
+    private static final Pattern BBOX_COORDINATE_PATTERN = Pattern
+            .compile(
+                    ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
+
+    /**
+     * Indices in array representing bbox.
+     */
+    private static final int LEFT_IDX = 0;
+    private static final int BOTTOM_IDX = 1;
+    private static final int RIGHT_IDX = 2;
+    private static final int TOP_IDX = 3;
+
+    /**
+     * Size of the array containing bbox.
+     */
+    private static final int BBOX_ARRAY_SIZE = 4;
+
     /**
      * Creates a new {@link TesseractHelper} instance.
      */
@@ -86,23 +108,20 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
             throws IOException {
         Map<Integer, List<TextInfo>> imageData =
                 new LinkedHashMap<Integer, List<TextInfo>>();
+        Map<String, Node> unparsedBBoxes = new LinkedHashMap<>();
 
         for (File inputFile : inputFiles) {
             if (inputFile != null
                     && Files.exists(
-                            java.nio.file.Paths
-                                    .get(inputFile.getAbsolutePath()))) {
+                    java.nio.file.Paths
+                            .get(inputFile.getAbsolutePath()))) {
                 FileInputStream fileInputStream =
                         new FileInputStream(inputFile.getAbsolutePath());
                 Document doc = Jsoup.parse(fileInputStream,
                         java.nio.charset.StandardCharsets.UTF_8.name(),
                         inputFile.getAbsolutePath());
                 Elements pages = doc.getElementsByClass("ocr_page");
 
-                Pattern bboxPattern = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
-                Pattern bboxCoordinatePattern = Pattern
-                        .compile(
-                                ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
                 List<String> searchedClasses = TextPositioning.BY_LINES
                         .equals(textPositioning)
                         ? Arrays.<String>asList("ocr_line", "ocr_caption")
@@ -124,26 +143,11 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
                             }
                         }
                         for (Element obj : objects) {
-                            String value = obj.attr("title");
-                            Matcher bboxMatcher = bboxPattern.matcher(value);
-                            if (bboxMatcher.matches()) {
-                                Matcher bboxCoordinateMatcher =
-                                        bboxCoordinatePattern
-                                                .matcher(bboxMatcher.group());
-                                if (bboxCoordinateMatcher.matches()) {
-                                    List<Float> coordinates =
-                                            new ArrayList<Float>();
-                                    for (int i = 0; i < 4; i++) {
-                                        String coord = bboxCoordinateMatcher
-                                                .group(i + 1);
-                                        coordinates
-                                                .add(Float.parseFloat(coord));
-                                    }
-
-                                    textData.add(new TextInfo(obj.text(),
-                                            coordinates));
-                                }
-                            }
+                            List<Float> coordinates = getAlignedBBox(obj,
+                                    textPositioning,
+                                    unparsedBBoxes);
+                            textData.add(new TextInfo(obj.text(),
+                                    coordinates));
                         }
                     }
                     if (textData.size() > 0) {
@@ -157,9 +161,97 @@ public static Map<Integer, List<TextInfo>> parseHocrFile(
                 fileInputStream.close();
             }
         }
+        for (Node node : unparsedBBoxes.values()) {
+            LOGGER.warn(MessageFormatUtil.format(
+                    Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX,
+                    node.toString()
+            ));
+        }
         return imageData;
     }
 
+    /**
+     * Gets the bbox of the element and aligns it (if needed) using the bbox
+     * of the element's parent node, which is treated as the enclosing line.
+     *
+     * <p>
+     * For {@link TextPositioning#BY_WORDS_AND_LINES} the bottom and top
+     * coordinates are replaced with those of the parent bbox. For both
+     * {@link TextPositioning#BY_WORDS_AND_LINES} and
+     * {@link TextPositioning#BY_WORDS} the left and right coordinates are
+     * additionally checked against the parent bbox and corrected if they
+     * fall outside of its horizontal range. For any other mode the parsed
+     * bbox is returned unchanged.
+     *
+     * @param object element whose bbox is requested
+     * @param textPositioning positioning mode that determines the alignment
+     * @param unparsedBBoxes map collecting, by element id, the nodes whose
+     *                       bbox could not be parsed
+     * @return list of four coordinates indexed by LEFT_IDX, BOTTOM_IDX,
+     *         RIGHT_IDX and TOP_IDX
+     */
+    static List<Float> getAlignedBBox(Element object,
+                                      TextPositioning textPositioning,
+                                      Map<String, Node> unparsedBBoxes) {
+        final List<Float> coordinates = parseBBox(object, unparsedBBoxes);
+        if (TextPositioning.BY_WORDS_AND_LINES == textPositioning
+                || TextPositioning.BY_WORDS == textPositioning) {
+            Node line = object.parent();
+            final List<Float> lineCoordinates = parseBBox(line, unparsedBBoxes);
+            if (TextPositioning.BY_WORDS_AND_LINES == textPositioning) {
+                coordinates.set(BOTTOM_IDX, lineCoordinates.get(BOTTOM_IDX));
+                coordinates.set(TOP_IDX, lineCoordinates.get(TOP_IDX));
+            }
+            detectAndFixBrokenBBoxes(object, coordinates,
+                    lineCoordinates, unparsedBBoxes);
+        }
+        return coordinates;
+    }
+
+    /**
+     * Parses element bbox.
+     *
+     * <p>
+     * The bbox is read from the "title" attribute of the node. If it cannot
+     * be parsed, a bbox of four zeros is returned and the node is recorded in
+     * {@code unparsedBBoxes} under its "id" attribute (only the first node
+     * seen for a given id is kept).
+     *
+     * @param node element containing bbox
+     * @param unparsedBBoxes map of element ids to nodes whose bboxes could not be parsed
+     * @return parsed bbox, or four zeros if the bbox could not be parsed
+     */
+    static List<Float> parseBBox(Node node, Map<String, Node> unparsedBBoxes) {
+        List<Float> bbox = new ArrayList<>();
+        Matcher bboxMatcher = BBOX_PATTERN.matcher(node.attr("title"));
+        if (bboxMatcher.matches()) {
+            Matcher bboxCoordinateMatcher =
+                    BBOX_COORDINATE_PATTERN
+                            .matcher(bboxMatcher.group());
+            if (bboxCoordinateMatcher.matches()) {
+                for (int i = 0; i < BBOX_ARRAY_SIZE; i++) {
+                    String coord = bboxCoordinateMatcher
+                            .group(i + 1);
+                    bbox.add(Float.parseFloat(coord));
+                }
+            }
+        }
+        if (bbox.size() == 0) {
+            bbox = Arrays.asList(0f, 0f, 0f, 0f);
+            String id = node.attr("id");
+            if (id != null && !unparsedBBoxes.containsKey(id)) {
+                unparsedBBoxes.put(id, node);
+            }
+        }
+        return bbox;
+    }
+
+    /**
+     * Sometimes hOCR file contains broken character bboxes which are equal to page bbox.
+     * This method attempts to detect and fix them.
+     *
+     * <p>
+     * If the left coordinate lies outside the horizontal range of the line, it is
+     * replaced with the right coordinate of the previous sibling element or,
+     * when there is no previous sibling, with the left coordinate of the line.
+     * The right coordinate is handled symmetrically using the next sibling
+     * element and the right coordinate of the line.
+     *
+     * @param object element whose bbox is being checked
+     * @param coordinates bbox of the element, updated in place
+     * @param lineCoordinates bbox of the enclosing line used as the reference range
+     * @param unparsedBBoxes map of element ids to nodes whose bboxes could not be parsed
+     */
+    static void detectAndFixBrokenBBoxes(Element object, List<Float> coordinates,
+                                         List<Float> lineCoordinates,
+                                         Map<String, Node> unparsedBBoxes) {
+        if (coordinates.get(LEFT_IDX) < lineCoordinates.get(LEFT_IDX)
+                || coordinates.get(LEFT_IDX) > lineCoordinates.get(RIGHT_IDX)) {
+            if (object.previousElementSibling() == null) {
+                coordinates.set(LEFT_IDX, lineCoordinates.get(LEFT_IDX));
+            } else {
+                Element sibling = object.previousElementSibling();
+                List<Float> siblingBBox = parseBBox(sibling, unparsedBBoxes);
+                coordinates.set(LEFT_IDX, siblingBBox.get(RIGHT_IDX));
+            }
+        }
+        if (coordinates.get(RIGHT_IDX) > lineCoordinates.get(RIGHT_IDX)
+                || coordinates.get(RIGHT_IDX) < lineCoordinates.get(LEFT_IDX)) {
+            if (object.nextElementSibling() == null) {
+                coordinates.set(RIGHT_IDX, lineCoordinates.get(RIGHT_IDX));
+            } else {
+                Element sibling = object.nextElementSibling();
+                List<Float> siblingBBox = parseBBox(sibling, unparsedBBoxes);
+                coordinates.set(RIGHT_IDX, siblingBBox.get(LEFT_IDX));
+            }
+        }
+    }
+
     /**
      * Deletes file using provided path.
      *
@@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) {
      * @param data text data in required format as {@link java.lang.String}
      */
     static void writeToTextFile(final String path,
-            final String data) {
+                                final String data) {
         try (Writer writer = new OutputStreamWriter(new FileOutputStream(path),
                 StandardCharsets.UTF_8)) {
             writer.write(data);
@@ -228,7 +320,7 @@ static void writeToTextFile(final String path,
      * @throws Tesseract4OcrException if provided command failed
      */
     static void runCommand(final String execPath,
-            final List<String> paramsList) throws Tesseract4OcrException {
+                           final List<String> paramsList) throws Tesseract4OcrException {
         try {
             String params = String.join(" ", paramsList);
             boolean cmdSucceeded = SystemUtil
@@ -251,4 +343,4 @@ static void runCommand(final String execPath,
                             .TESSERACT_FAILED);
         }
     }
-}
+}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java
@@ -39,5 +39,9 @@ public enum TextPositioning {
     /**
      * Text will be located by words retrieved from hocr file.
      */
-    BY_WORDS
-}
+    BY_WORDS,
+    /**
+     * Similar to BY_WORDS mode, but top and bottom of word BBox are inherited from line.
+     */
+    BY_WORDS_AND_LINES,
+}