diff --git a/pdfocr-api/pom.xml b/pdfocr-api/pom.xml index 87f6927..a0b6bfc 100644 --- a/pdfocr-api/pom.xml +++ b/pdfocr-api/pom.xml @@ -5,7 +5,7 @@ com.itextpdf pdfocr-root - 1.0.0 + 1.0.1 pdfocr-api diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java index 187d768..f789433 100644 --- a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java @@ -282,7 +282,7 @@ public final OcrPdfCreatorProperties setTitle( /** * Returns FontProvider that was set previously or if it is - * null a new instance of {@link PdfOcrFontProvider} is + * null a new instance of {@link PdfOcrFontProvider} is * returned. * @return {@link com.itextpdf.layout.font.FontProvider} object */ diff --git a/pdfocr-tesseract4/pom.xml b/pdfocr-tesseract4/pom.xml index c09cdfe..4afa904 100644 --- a/pdfocr-tesseract4/pom.xml +++ b/pdfocr-tesseract4/pom.xml @@ -5,7 +5,7 @@ com.itextpdf pdfocr-root - 1.0.0 + 1.0.1 pdfocr-tesseract4 diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java index 07c1c53..133436e 100644 --- a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java @@ -22,58 +22,25 @@ This file is part of the iText (R) project. 
*/ package com.itextpdf.pdfocr.tesseract4; -import com.itextpdf.io.util.MessageFormatUtil; import com.itextpdf.kernel.Version; -import com.itextpdf.kernel.counter.ContextManager; -import java.lang.reflect.AccessibleObject; import java.lang.reflect.Array; import java.lang.reflect.Constructor; import java.lang.reflect.Method; import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; final class ReflectionUtils { - private static final Logger logger = LoggerFactory.getLogger(ReflectionUtils.class); - - private static final String KERNEL_PACKAGE = "com.itextpdf.kernel."; private static final String LICENSEKEY_PACKAGE = "com.itextpdf.licensekey."; - private static final String CONTEXT_MANAGER = "counter.ContextManager"; private static final String LICENSEKEY = "LicenseKey"; private static final String LICENSEKEY_PRODUCT = "LicenseKeyProduct"; private static final String LICENSEKEY_FEATURE = "LicenseKeyProductFeature"; - private static final String REGISTER_GENERIC_CONTEXT = "registerGenericContext"; private static final String SCHEDULED_CHECK = "scheduledCheck"; private static final String NO_PDFOCR_TESSERACT4 = "No license loaded for product pdfOcr-Tesseract4. Please use LicenseKey.loadLicense(...) 
to load one."; - private static Map> cachedClasses = new HashMap<>(); - private static Map cachedMethods = new HashMap<>(); - - static { - try { - ContextManager contextManager = ContextManager.getInstance(); - callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager, - new Class[] {Collection.class, Collection.class}, - Collections.singletonList("com.itextpdf.pdfocr"), - Collections.singletonList("com.itextpdf.pdfocr.tesseract4")); - callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager, - new Class[] {Collection.class, Collection.class}, - Collections.singletonList("com.itextpdf.pdfocr.tesseract4"), - Collections.singletonList("com.itextpdf.pdfocr.tesseract4")); - } catch (Exception e) { - logger.error(e.getMessage()); - } - } - private ReflectionUtils() { } @@ -116,52 +83,6 @@ public static void scheduledCheck() { } } - private static Object callMethod(String className, String methodName, Object target, Class[] parameterTypes, - Object... args) { - try { - Method method = findMethod(className, methodName, parameterTypes); - return method.invoke(target, args); - } catch (NoSuchMethodException e) { - logger.warn(MessageFormatUtil.format("Cannot find method {0} for class {1}", methodName, className)); - } catch (ClassNotFoundException e) { - logger.warn(MessageFormatUtil.format("Cannot find class {0}", className)); - } catch (IllegalArgumentException e) { - logger.warn(MessageFormatUtil - .format("Illegal arguments passed to {0}#{1} method call: {2}", className, methodName, - e.getMessage())); - } catch (Exception e) { - // Converting checked exceptions to unchecked RuntimeException (java-specific comment). - // - // If kernel utils throws an exception at this point, we consider it as unrecoverable situation for - // its callers (pdfOcr methods). - // It's might be more suitable to wrap checked exceptions at a bit higher level, but we do it here for - // the sake of convenience. 
- throw new RuntimeException(e.toString(), e); - } - return null; - } - - private static Method findMethod(String className, String methodName, Class[] parameterTypes) - throws NoSuchMethodException, ClassNotFoundException { - MethodSignature tm = new MethodSignature(className, parameterTypes, methodName); - Method m = (Method) cachedMethods.get(tm); - if (m == null) { - m = findClass(className).getDeclaredMethod(methodName, parameterTypes); - m.setAccessible(true); - cachedMethods.put(tm, m); - } - return m; - } - - private static Class findClass(String className) throws ClassNotFoundException { - Class c = cachedClasses.get(className); - if (c == null) { - c = getClass(className); - cachedClasses.put(className, c); - } - return c; - } - private static Class getClass(String className) throws ClassNotFoundException { return Class.forName(className); } diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java index 90bf76b..7f028c5 100644 --- a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java @@ -59,7 +59,10 @@ public class Tesseract4LogMessageConstant { + "temporary directory: {0}"; public static final String CANNOT_CONVERT_IMAGE_TO_PIX = "Cannot convert image to pix: {0}"; + public static final String CANNOT_PARSE_NODE_BBOX = + "Cannot parse node BBox, defaults to 0, 0, 0, 0. 
Node: {0}"; + private Tesseract4LogMessageConstant() { } -} +} \ No newline at end of file diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java index c700f25..d397625 100644 --- a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java @@ -28,6 +28,7 @@ This file is part of the iText (R) project. import com.itextpdf.styledxmlparser.jsoup.Jsoup; import com.itextpdf.styledxmlparser.jsoup.nodes.Document; import com.itextpdf.styledxmlparser.jsoup.nodes.Element; +import com.itextpdf.styledxmlparser.jsoup.nodes.Node; import com.itextpdf.styledxmlparser.jsoup.select.Elements; import java.io.File; @@ -60,6 +61,27 @@ public class TesseractHelper { private static final Logger LOGGER = LoggerFactory .getLogger(TesseractHelper.class); + /** + * Patterns for matching hOCR element bboxes. + */ + private static final Pattern BBOX_PATTERN = Pattern.compile(".*bbox(\\s+\\d+){4}.*"); + private static final Pattern BBOX_COORDINATE_PATTERN = Pattern + .compile( + ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*"); + + /** + * Indices in array representing bbox. + */ + private static final int LEFT_IDX = 0; + private static final int BOTTOM_IDX = 1; + private static final int RIGHT_IDX = 2; + private static final int TOP_IDX = 3; + + /** + * Size of the array containing bbox. + */ + private static final int BBOX_ARRAY_SIZE = 4; + /** * Creates a new {@link TesseractHelper} instance. 
*/ @@ -86,12 +108,13 @@ public static Map> parseHocrFile( throws IOException { Map> imageData = new LinkedHashMap>(); + Map unparsedBBoxes = new LinkedHashMap<>(); for (File inputFile : inputFiles) { if (inputFile != null && Files.exists( - java.nio.file.Paths - .get(inputFile.getAbsolutePath()))) { + java.nio.file.Paths + .get(inputFile.getAbsolutePath()))) { FileInputStream fileInputStream = new FileInputStream(inputFile.getAbsolutePath()); Document doc = Jsoup.parse(fileInputStream, @@ -99,10 +122,6 @@ public static Map> parseHocrFile( inputFile.getAbsolutePath()); Elements pages = doc.getElementsByClass("ocr_page"); - Pattern bboxPattern = Pattern.compile(".*bbox(\\s+\\d+){4}.*"); - Pattern bboxCoordinatePattern = Pattern - .compile( - ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*"); List searchedClasses = TextPositioning.BY_LINES .equals(textPositioning) ? Arrays.asList("ocr_line", "ocr_caption") @@ -124,26 +143,11 @@ public static Map> parseHocrFile( } } for (Element obj : objects) { - String value = obj.attr("title"); - Matcher bboxMatcher = bboxPattern.matcher(value); - if (bboxMatcher.matches()) { - Matcher bboxCoordinateMatcher = - bboxCoordinatePattern - .matcher(bboxMatcher.group()); - if (bboxCoordinateMatcher.matches()) { - List coordinates = - new ArrayList(); - for (int i = 0; i < 4; i++) { - String coord = bboxCoordinateMatcher - .group(i + 1); - coordinates - .add(Float.parseFloat(coord)); - } - - textData.add(new TextInfo(obj.text(), - coordinates)); - } - } + List coordinates = getAlignedBBox(obj, + textPositioning, + unparsedBBoxes); + textData.add(new TextInfo(obj.text(), + coordinates)); } } if (textData.size() > 0) { @@ -157,9 +161,97 @@ public static Map> parseHocrFile( fileInputStream.close(); } } + for (Node node : unparsedBBoxes.values()) { + LOGGER.warn(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX, + node.toString() + )); + } return imageData; } + /** + * Get and align (if needed) bbox of the element. 
+ */ + static List getAlignedBBox(Element object, + TextPositioning textPositioning, + Map unparsedBBoxes) { + final List coordinates = parseBBox(object, unparsedBBoxes); + if (TextPositioning.BY_WORDS_AND_LINES == textPositioning + || TextPositioning.BY_WORDS == textPositioning) { + Node line = object.parent(); + final List lineCoordinates = parseBBox(line, unparsedBBoxes); + if (TextPositioning.BY_WORDS_AND_LINES == textPositioning) { + coordinates.set(BOTTOM_IDX, lineCoordinates.get(BOTTOM_IDX)); + coordinates.set(TOP_IDX, lineCoordinates.get(TOP_IDX)); + } + detectAndFixBrokenBBoxes(object, coordinates, + lineCoordinates, unparsedBBoxes); + } + return coordinates; + } + + /** + * Parses element bbox. + * + * @param node element containing bbox + * @param unparsedBBoxes map from element id to the node whose bbox could not be parsed + * @return parsed bbox + */ + static List parseBBox(Node node, Map unparsedBBoxes) { + List bbox = new ArrayList<>(); + Matcher bboxMatcher = BBOX_PATTERN.matcher(node.attr("title")); + if (bboxMatcher.matches()) { + Matcher bboxCoordinateMatcher = + BBOX_COORDINATE_PATTERN + .matcher(bboxMatcher.group()); + if (bboxCoordinateMatcher.matches()) { + for (int i = 0; i < BBOX_ARRAY_SIZE; i++) { + String coord = bboxCoordinateMatcher + .group(i + 1); + bbox.add(Float.parseFloat(coord)); + } + } + } + if (bbox.size() == 0) { + bbox = Arrays.asList(0f, 0f, 0f, 0f); + String id = node.attr("id"); + if (id != null && !unparsedBBoxes.containsKey(id)) { + unparsedBBoxes.put(id, node); + } + } + return bbox; + } + + /** + * Sometimes an hOCR file contains broken character bboxes which are equal to the page bbox. + * This method attempts to detect and fix them. 
+ */ + static void detectAndFixBrokenBBoxes(Element object, List coordinates, + List lineCoordinates, + Map unparsedBBoxes) { + if (coordinates.get(LEFT_IDX) < lineCoordinates.get(LEFT_IDX) + || coordinates.get(LEFT_IDX) > lineCoordinates.get(RIGHT_IDX)) { + if (object.previousElementSibling() == null) { + coordinates.set(LEFT_IDX, lineCoordinates.get(LEFT_IDX)); + } else { + Element sibling = object.previousElementSibling(); + List siblingBBox = parseBBox(sibling, unparsedBBoxes); + coordinates.set(LEFT_IDX, siblingBBox.get(RIGHT_IDX)); + } + } + if (coordinates.get(RIGHT_IDX) > lineCoordinates.get(RIGHT_IDX) + || coordinates.get(RIGHT_IDX) < lineCoordinates.get(LEFT_IDX)) { + if (object.nextElementSibling() == null) { + coordinates.set(RIGHT_IDX, lineCoordinates.get(RIGHT_IDX)); + } else { + Element sibling = object.nextElementSibling(); + List siblingBBox = parseBBox(sibling, unparsedBBoxes); + coordinates.set(RIGHT_IDX, siblingBBox.get(LEFT_IDX)); + } + } + } + /** * Deletes file using provided path. 
* @@ -208,7 +300,7 @@ static String readTxtFile(final File txtFile) { * @param data text data in required format as {@link java.lang.String} */ static void writeToTextFile(final String path, - final String data) { + final String data) { try (Writer writer = new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8)) { writer.write(data); @@ -228,7 +320,7 @@ static void writeToTextFile(final String path, * @throws Tesseract4OcrException if provided command failed */ static void runCommand(final String execPath, - final List paramsList) throws Tesseract4OcrException { + final List paramsList) throws Tesseract4OcrException { try { String params = String.join(" ", paramsList); boolean cmdSucceeded = SystemUtil @@ -251,4 +343,4 @@ static void runCommand(final String execPath, .TESSERACT_FAILED); } } -} +} \ No newline at end of file diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java index c8edb07..f660f7e 100644 --- a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java @@ -39,5 +39,9 @@ public enum TextPositioning { /** * Text will be located by words retrieved from hocr file. */ - BY_WORDS -} + BY_WORDS, + /** + * Similar to BY_WORDS mode, but top and bottom of word BBox are inherited from line. 
+ */ + BY_WORDS_AND_LINES, +} \ No newline at end of file diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java index 394e3d6..0261e09 100644 --- a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java @@ -87,6 +87,8 @@ public class IntegrationTestHelper extends ExtendedITextTest { // path to font for hindi protected static final String NOTO_SANS_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSans-Regular.ttf"; + // path to font for thai + protected static final String NOTO_SANS_THAI_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSansThai-Regular.ttf"; // path to font for japanese protected static final String KOSUGI_FONT_PATH = TEST_FONTS_DIRECTORY + "Kosugi-Regular.ttf"; // path to font for chinese @@ -101,13 +103,14 @@ public class IntegrationTestHelper extends ExtendedITextTest { static { Map fontPathToNameMap = new HashMap<>(); fontPathToNameMap.put(NOTO_SANS_FONT_PATH, "NotoSans"); + fontPathToNameMap.put(NOTO_SANS_THAI_FONT_PATH, "NotoSansThai"); fontPathToNameMap.put(KOSUGI_FONT_PATH, "Kosugi"); fontPathToNameMap.put(NOTO_SANS_SC_FONT_PATH, "NotoSansSC"); fontPathToNameMap.put(CAIRO_FONT_PATH, "Cairo"); fontPathToNameMap.put(FREE_SANS_FONT_PATH, "FreeSans"); FONT_PATH_TO_FONT_NAME_MAP = Collections.unmodifiableMap(fontPathToNameMap); } - + public enum ReaderType { LIB, EXECUTABLE @@ -164,7 +167,7 @@ protected static File getTessDataDirectory() { * Retrieve text from specified page from given PDF document. 
*/ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, - File file, int page, List languages, List fonts) { + File file, int page, List languages, List fonts) { String result = null; String pdfPath = null; try { @@ -183,7 +186,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, * Retrieve text from specified page from given PDF document. */ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, - File file, int page, List languages, String fontPath) { + File file, int page, List languages, String fontPath) { return getTextFromPdf(tesseractReader, file, page, languages, Collections.singletonList(fontPath)); } @@ -192,7 +195,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, * Retrieve text from the first page of given PDF document setting font. */ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, - List languages, String fontPath) { + List languages, String fontPath) { return getTextFromPdf(tesseractReader, file, 1, languages, fontPath); } @@ -200,7 +203,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil * Retrieve text from the first page of given PDF document. */ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, - List languages) { + List languages) { return getTextFromPdf(tesseractReader, file, 1, languages, new ArrayList()); } @@ -209,7 +212,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil * Retrieve text from the required page of given PDF document. 
*/ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, int page, - List languages) { + List languages) { return getTextFromPdf(tesseractReader, file, page, languages, new ArrayList()); } @@ -224,7 +227,7 @@ protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, Fil * Get text from layer specified by name from page. */ protected String getTextFromPdfLayer(String pdfPath, String layerName, - int page, boolean useActualText) throws IOException { + int page, boolean useActualText) throws IOException { PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath), new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo())); @@ -243,7 +246,7 @@ protected String getTextFromPdfLayer(String pdfPath, String layerName, * Get text from layer specified by name from page. */ protected String getTextFromPdfLayer(String pdfPath, String layerName, - int page) throws IOException { + int page) throws IOException { return getTextFromPdfLayer(pdfPath, layerName, page, false); } @@ -253,7 +256,7 @@ protected String getTextFromPdfLayer(String pdfPath, String layerName, * {@link LocationTextExtractionStrategy#getResultantText()}. 
*/ protected String getTextFromPdfLayerUsingActualText(String pdfPath, - String layerName, int page) throws IOException { + String layerName, int page) throws IOException { return getTextFromPdfLayer(pdfPath, layerName, page, true) .replace(" ", ""); } @@ -378,7 +381,7 @@ protected void doOcrAndSavePdfToPath( * (Text will be invisible) */ protected void doOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath, - String pdfPath, List languages, List fonts) { + String pdfPath, List languages, List fonts) { doOcrAndSavePdfToPath(tesseractReader, imgPath, pdfPath, languages, fonts, null); } @@ -469,7 +472,7 @@ public void setImageBBoxRectangle(com.itextpdf.kernel.geom.Rectangle imageBBoxRe @Override protected boolean isChunkAtWordBoundary(TextChunk chunk, - TextChunk previousChunk) { + TextChunk previousChunk) { ITextChunkLocation curLoc = chunk.getLocation(); ITextChunkLocation prevLoc = previousChunk.getLocation(); @@ -522,4 +525,4 @@ else if (type.equals(EventType.RENDER_IMAGE)) { : tagHierarchy.get(0).getProperties().get(PdfName.Name).toString(); } } -} +} \ No newline at end of file diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java index 3a36dee..dc62ce7 100644 --- a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java @@ -69,7 +69,7 @@ public static void beforeClass() { public void initTesseractProperties() { Tesseract4OcrEngineProperties ocrEngineProperties = new Tesseract4OcrEngineProperties(); - ocrEngineProperties.setPathToTessData(new File(sourceFolder + "../../tessdata/")); + ocrEngineProperties.setPathToTessData(new File(sourceFolder + "../../tessdata")); tesseractReader.setTesseract4OcrEngineProperties(ocrEngineProperties); } 
@@ -94,9 +94,6 @@ public void testEventCountingPdfEvent() throws InterruptedException { } for (int i = 0; i < n; i++) { threads[i].start(); - - // The test will pass in sequential mode, i.e. if the following line is uncommented - //threads[i].join(); } for (int i = 0; i < n; i++) { threads[i].join(); @@ -127,8 +124,8 @@ private static Thread getThread(DoImageOcrRunnable runnable) { } public static class TestEventCounter extends EventCounter { - private List events = new ArrayList<>(); - private List metaInfos = new ArrayList<>(); + private List events = new ArrayList(); + private List metaInfos = new ArrayList(); public List getEvents() { return events; @@ -139,10 +136,9 @@ public List getMetaInfos() { } @Override - protected void onEvent(IEvent event, IMetaInfo metaInfo) { + synchronized protected void onEvent(IEvent event, IMetaInfo metaInfo) { this.events.add(event); this.metaInfos.add(metaInfo); } } - } diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java index 689a058..297036d 100644 --- a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java @@ -22,10 +22,16 @@ This file is part of the iText (R) project. 
*/ package com.itextpdf.pdfocr.tessdata; +import com.itextpdf.kernel.colors.DeviceRgb; +import com.itextpdf.kernel.utils.CompareTool; +import com.itextpdf.pdfocr.PdfOcrLogMessageConstant; import com.itextpdf.pdfocr.TextInfo; import com.itextpdf.pdfocr.tesseract4.OutputFormat; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; import com.itextpdf.pdfocr.tesseract4.TesseractHelper; import com.itextpdf.pdfocr.tesseract4.TextPositioning; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; import com.itextpdf.test.annotations.type.IntegrationTest; import org.junit.Assert; @@ -33,6 +39,7 @@ This file is part of the iText (R) project. import org.junit.experimental.categories.Category; import java.io.File; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -43,7 +50,7 @@ public TessDataIntegrationLibTest() { super(ReaderType.LIB); } - @Test(timeout = 50000) + @Test(timeout = 60000) public void textOutputFromHalftoneFile() { String imgPath = TEST_IMAGES_DIRECTORY + "halftone.jpg"; String expected01 = "Silliness Enablers"; @@ -59,7 +66,7 @@ public void textOutputFromHalftoneFile() { Assert.assertTrue(result.contains(expected03)); } - @Test(timeout = 50000) + @Test(timeout = 60000) public void hocrOutputFromHalftoneFile() throws java.io.IOException { String path = TEST_IMAGES_DIRECTORY + "halftone.jpg"; String expected01 = "Silliness"; @@ -97,6 +104,69 @@ public void hocrOutputFromHalftoneFile() throws java.io.IOException { Assert.assertTrue(findTextInPageData(pageData, 1, expected09)); } + @Test + public void compareInvoiceFrontThaiImage() throws InterruptedException, java.io.IOException { + String testName = "compareInvoiceFrontThaiImage"; + String filename = "invoice_front_thai"; + + //Tesseract for Java and Tesseract for .NET give different output + //So we cannot use one reference pdf file for them + String expectedPdfPathJava = 
TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_java.pdf"; + String expectedPdfPathDotNet = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_dotnet.pdf"; + + String resultPdfPath = getTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf"; + + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setTextPositioning(TextPositioning.BY_WORDS_AND_LINES); + properties.setPathToTessData(getTessDataDirectory()); + properties.setLanguages(Arrays.asList("tha", "eng")); + tesseractReader.setTesseract4OcrEngineProperties(properties); + + doOcrAndSavePdfToPath(tesseractReader, + TEST_IMAGES_DIRECTORY + filename + ".jpg", resultPdfPath, + Arrays.asList("tha", "eng"), Arrays.asList(NOTO_SANS_THAI_FONT_PATH, NOTO_SANS_FONT_PATH), DeviceRgb.RED); + boolean javaTest = new CompareTool().compareByContent(resultPdfPath, expectedPdfPathJava, + TEST_DOCUMENTS_DIRECTORY, "diff_") == null; + boolean dotNetTest = new CompareTool().compareByContent(resultPdfPath, expectedPdfPathDotNet, + TEST_DOCUMENTS_DIRECTORY, "diff_") == null; + + Assert.assertTrue(javaTest || dotNetTest); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, count = 2) + }) + @Test + public void compareThaiTextImage() throws InterruptedException, java.io.IOException { + String testName = "compareThaiTextImage"; + String filename = "thai_01"; + + //Tesseract for Java and Tesseract for .NET give different output + //So we cannot use one reference pdf file for them + String expectedPdfPathJava = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_java.pdf"; + String expectedPdfPathDotNet = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + "_dotnet.pdf"; + + String resultPdfPath = getTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf"; + + 
Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setTextPositioning(TextPositioning.BY_WORDS_AND_LINES); + properties.setPathToTessData(getTessDataDirectory()); + properties.setLanguages(Arrays.asList("tha")); + tesseractReader.setTesseract4OcrEngineProperties(properties); + + doOcrAndSavePdfToPath(tesseractReader, + TEST_IMAGES_DIRECTORY + filename + ".jpg", resultPdfPath, + Arrays.asList("tha"), Arrays.asList(NOTO_SANS_THAI_FONT_PATH), DeviceRgb.RED); + boolean javaTest = new CompareTool().compareByContent(resultPdfPath, expectedPdfPathJava, + TEST_DOCUMENTS_DIRECTORY, "diff_") == null; + boolean dotNetTest = new CompareTool().compareByContent(resultPdfPath, expectedPdfPathDotNet, + TEST_DOCUMENTS_DIRECTORY, "diff_") == null; + + Assert.assertTrue(javaTest || dotNetTest); + } + /** * Searches for certain text in page data. */ @@ -109,4 +179,4 @@ private boolean findTextInPageData(Map> pageData, int pa return false; } -} +} \ No newline at end of file diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java index 1f43e3e..f5eece1 100644 --- a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java @@ -24,6 +24,7 @@ This file is part of the iText (R) project. import com.itextpdf.io.util.MessageFormatUtil; import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.pdfocr.TextInfo; import com.itextpdf.test.annotations.LogMessage; import com.itextpdf.test.annotations.LogMessages; @@ -32,6 +33,10 @@ This file is part of the iText (R) project. 
import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.Collections; +import java.util.List; +import java.util.Map; + import net.sourceforge.lept4j.Pix; import net.sourceforge.tess4j.TesseractException; import org.junit.Assert; @@ -45,7 +50,7 @@ public class ApiTest extends IntegrationTestHelper { public ExpectedException junitExpectedException = ExpectedException.none(); @LogMessages(messages = { - @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET) + @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET) }) @Test public void testDefaultTessDataPathValidationForLib() { @@ -60,7 +65,7 @@ public void testDefaultTessDataPathValidationForLib() { } @LogMessages(messages = { - @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET) + @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET) }) @Test public void testDefaultTessDataPathValidationForExecutable() { @@ -76,7 +81,7 @@ public void testDefaultTessDataPathValidationForExecutable() { } @LogMessages(messages = { - @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE) + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE) }) @Test public void testDoTesseractOcrForIncorrectImageForExecutable() { @@ -96,8 +101,8 @@ public void testDoTesseractOcrForIncorrectImageForExecutable() { } @LogMessages(messages = { - @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), - @LogMessage(messageTemplate = Tesseract4LogMessageConstant.TESSERACT_FAILED) + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.TESSERACT_FAILED) }) @Test public void testOcrResultForSinglePageForNullImage() { @@ -131,10 +136,10 @@ public void testDoTesseractOcrForNonAsciiPathForExecutable() { } @LogMessages(messages = { - 
@LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE), - @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), - @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND), - @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED) + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND), + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED) }, ignore = true) @Test public void testDoTesseractOcrForExecutableForWin() { @@ -143,10 +148,10 @@ public void testDoTesseractOcrForExecutableForWin() { } @LogMessages(messages = { - @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE), - @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), - @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND), - @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED) + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND), + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED) }, ignore = true) @Test public void testDoTesseractOcrForExecutableForLinux() { @@ -154,6 +159,22 @@ public void testDoTesseractOcrForExecutableForLinux() { testSettingOsName("linux"); } + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX, count = 4) + }) + @Test + public void testDetectAndFixBrokenBBoxes() throws IOException { + File hocrFile = new File(TEST_DOCUMENTS_DIRECTORY + "broken_bboxes.hocr"); + Map> parsedHocr = TesseractHelper.parseHocrFile(Collections.singletonList(hocrFile), + 
TextPositioning.BY_WORDS_AND_LINES); + TextInfo textInfo = parsedHocr.get(1).get(1); + + Assert.assertEquals(383.0f, (float)textInfo.getBbox().get(0), 0.1); + Assert.assertEquals(101.0f, (float)textInfo.getBbox().get(1), 0.1); + Assert.assertEquals(514.0f, (float)textInfo.getBbox().get(2), 0.1); + Assert.assertEquals(136.0f, (float)textInfo.getBbox().get(3), 0.1); + } + private void testSettingOsName(String osName) { String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; File imgFile = new File(path); @@ -175,4 +196,4 @@ private void testSettingOsName(String osName) { System.setProperty(osPropertyName, os); } } -} +} \ No newline at end of file diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/broken_bboxes.hocr b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/broken_bboxes.hocr new file mode 100644 index 0000000..4b432e0 --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/broken_bboxes.hocr @@ -0,0 +1,66 @@ + + + + + + + + +
+
+

+ + + 1 + ซ่ + 1 + + + + + + + เท + + + + + + + ช็ + + + + + + + เป + + + + + + + + + + + + + + + + + ณา + + + + + ไท + + +

+
+
+ + \ No newline at end of file diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf new file mode 100644 index 0000000..011d403 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_dotnet.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_java.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_java.pdf new file mode 100644 index 0000000..616b588 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/invoice_front_thai_lib_java.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_dotnet.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_dotnet.pdf new file mode 100644 index 0000000..68f5d7e Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_dotnet.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_java.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_java.pdf new file mode 100644 index 0000000..b65d60c Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/thai_01_lib_java.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansThai-Regular.ttf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansThai-Regular.ttf new file mode 100644 index 0000000..da12e41 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansThai-Regular.ttf differ diff --git 
a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/invoice_front_thai.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/invoice_front_thai.jpg new file mode 100644 index 0000000..e390fb7 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/invoice_front_thai.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/thai_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/thai_01.jpg new file mode 100644 index 0000000..9aef2a5 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/thai_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Thai.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Thai.traineddata new file mode 100644 index 0000000..62acc3d Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Thai.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/tha.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/tha.traineddata new file mode 100644 index 0000000..fa80ee4 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/tha.traineddata differ diff --git a/pom.xml b/pom.xml index e8ab2c5..30ffeef 100644 --- a/pom.xml +++ b/pom.xml @@ -5,12 +5,12 @@ com.itextpdf root - 7.1.11 + 7.1.12 pdfocr-root - 1.0.0 + 1.0.1 pom pdfOCR @@ -22,7 +22,7 @@ - 7.1.11 + 7.1.12 1.8 ${java.version} ${java.version}