diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java index 2a1b307..ab17ffa 100644 --- a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java @@ -24,10 +24,13 @@ This file is part of the iText (R) project. import com.itextpdf.io.image.ImageData; import com.itextpdf.io.image.ImageDataFactory; +import com.itextpdf.io.image.ImageType; +import com.itextpdf.io.image.ImageTypeDetector; import com.itextpdf.io.image.TiffImageData; import com.itextpdf.io.source.RandomAccessFileOrArray; import com.itextpdf.io.source.RandomAccessSourceFactory; import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.io.util.UrlUtil; import com.itextpdf.kernel.geom.Rectangle; import com.itextpdf.layout.Document; import com.itextpdf.layout.element.Paragraph; @@ -172,15 +175,9 @@ static List getImageData(final File inputImage, IImageRotationHandler throws OcrException, IOException { List images = new ArrayList(); - String ext = ""; - int index = inputImage.getAbsolutePath().lastIndexOf('.'); - if (index > 0) { - ext = new String(inputImage.getAbsolutePath().toCharArray(), - index + 1, - inputImage.getAbsolutePath().length() - index - 1); - - if ("tiff".equals(ext.toLowerCase()) - || "tif".equals(ext.toLowerCase())) { + try { + ImageType imageType = ImageTypeDetector.detectImageType(UrlUtil.toURL(inputImage.getAbsolutePath())); + if (ImageType.TIFF == imageType) { int tiffPages = getNumberOfPageTiff(inputImage); for (int page = 0; page < tiffPages; page++) { @@ -194,21 +191,19 @@ static List getImageData(final File inputImage, IImageRotationHandler images.add(imageData); } } else { - try { - ImageData imageData = ImageDataFactory - .create(inputImage.getAbsolutePath()); - if (imageRotationHandler != null) { - imageData = imageRotationHandler.applyRotation(imageData); - } - images.add(imageData); - } catch 
(com.itextpdf.io.IOException e) { - LOGGER.error(MessageFormatUtil.format( - PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, - e.getMessage())); - throw new OcrException( - OcrException.CANNOT_READ_INPUT_IMAGE, e); + ImageData imageData = ImageDataFactory + .create(inputImage.getAbsolutePath()); + if (imageRotationHandler != null) { + imageData = imageRotationHandler.applyRotation(imageData); } + images.add(imageData); } + } catch (com.itextpdf.io.IOException e) { + LOGGER.error(MessageFormatUtil.format( + PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, + e.getMessage())); + throw new OcrException( + OcrException.CANNOT_READ_INPUT_IMAGE, e); } return images; } diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfCreatorUtilTest.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfCreatorUtilTest.java new file mode 100644 index 0000000..8e7b581 --- /dev/null +++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfCreatorUtilTest.java @@ -0,0 +1,112 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2021 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ +package com.itextpdf.pdfocr; + +import com.itextpdf.io.image.ImageData; +import com.itextpdf.io.image.ImageType; +import com.itextpdf.io.image.JpegImageData; +import com.itextpdf.io.image.TiffImageData; +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.pdfocr.helpers.PdfHelper; +import com.itextpdf.test.ExtendedITextTest; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; +import com.itextpdf.test.annotations.type.UnitTest; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExpectedException; + +@Category(UnitTest.class) +public class PdfCreatorUtilTest extends ExtendedITextTest { + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + @Test + public void getImageDataFromValidSinglePagedTiffTest() throws IOException { + File image = new File(PdfHelper.getImagesTestDirectory() + "single7x5cm.tif"); + List images = PdfCreatorUtil.getImageData(image, null); + + Assert.assertEquals(1, images.size()); + + ImageData imageDate = images.get(0); + Assert.assertNotNull(imageDate); + Assert.assertTrue(imageDate instanceof TiffImageData); + Assert.assertEquals(ImageType.TIFF, imageDate.getOriginalType()); + } + + @Test + public void getImageDataFromValidMultiPagedTiffTest() throws IOException { + File image = new File(PdfHelper.getImagesTestDirectory() + "multipage.tiff"); + List images = PdfCreatorUtil.getImageData(image, null); + + Assert.assertEquals(9, images.size()); + for (ImageData imageDate : images) { + Assert.assertNotNull(imageDate); + Assert.assertTrue(imageDate instanceof TiffImageData); + Assert.assertEquals(ImageType.TIFF, imageDate.getOriginalType()); + } + } + + @Test + public void getImageDataFromValidNotTiffTest() throws IOException { + File image = new 
File(PdfHelper.getImagesTestDirectory() + "numbers_01.jpg"); + List images = PdfCreatorUtil.getImageData(image, null); + + Assert.assertEquals(1, images.size()); + + ImageData imageDate = images.get(0); + Assert.assertNotNull(imageDate); + Assert.assertTrue(imageDate instanceof JpegImageData); + Assert.assertEquals(ImageType.JPEG, imageDate.getOriginalType()); + } + + @Test + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE) + }) + public void getImageDataFromNotExistingImageTest() throws IOException { + junitExpectedException.expect(OcrException.class); + + PdfCreatorUtil.getImageData(new File("no such path"), null); + } + + @Test + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE) + }) + public void getImageDataFromInvalidImageTest() throws IOException { + junitExpectedException.expect(OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil.format( + OcrException.CANNOT_READ_INPUT_IMAGE)); + + PdfCreatorUtil.getImageData(new File(PdfHelper.getImagesTestDirectory() + "corrupted.jpg"), + null); + } +} diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfInputImageTest.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfInputImageTest.java index af43022..fbf1a70 100644 --- a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfInputImageTest.java +++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfInputImageTest.java @@ -41,8 +41,7 @@ public class PdfInputImageTest extends ExtendedITextTest { public ExpectedException junitExpectedException = ExpectedException.none(); @LogMessages(messages = { - @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, - count = 1) + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE) }) @Test public void testCorruptedImage() { @@ -55,7 +54,7 @@ public void testCorruptedImage() { } @LogMessages(messages = { - @LogMessage(messageTemplate = 
PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, count = 1) + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE) }) @Test public void testCorruptedImageWithoutExtension() { @@ -67,4 +66,43 @@ public void testCorruptedImageWithoutExtension() { Assert.assertNotNull(realOutput); Assert.assertEquals("", realOutput); } + + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE) + }) + @Test + public void testInvalidImagePathWithoutDot() { + junitExpectedException.expect(OcrException.class); + + File file = new File("testName"); + String realOutput = PdfHelper.getTextFromPdf(file, "testInvalidImagePathWithoutDot"); + Assert.assertNotNull(realOutput); + Assert.assertEquals("", realOutput); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE) + }) + @Test + public void testInvalidImagePathWithDot() { + junitExpectedException.expect(OcrException.class); + + File file = new File("test.Name"); + String realOutput = PdfHelper.getTextFromPdf(file, "testInvalidImagePathWithDot"); + Assert.assertNotNull(realOutput); + Assert.assertEquals("", realOutput); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE) + }) + @Test + public void testValidImageWithoutExtension() { + junitExpectedException.expect(OcrException.class); + + File file = new File(PdfHelper.getImagesTestDirectory() + "numbers_01"); + String realOutput = PdfHelper.getTextFromPdf(file, "testValidImageWithoutExtension"); + Assert.assertNotNull(realOutput); + Assert.assertEquals("", realOutput); + } } diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/single7x5cm.tif b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/single7x5cm.tif new file mode 100644 index 0000000..804bc76 Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/single7x5cm.tif differ