Skip to content

Commit

Permalink
Consider image's type from its data rather than its extension
Browse files Browse the repository at this point in the history
DEVSIX-5172
  • Loading branch information
ars18wrw committed Apr 6, 2021
1 parent d6d759e commit 7b61601
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 25 deletions.
39 changes: 17 additions & 22 deletions pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@ This file is part of the iText (R) project.

import com.itextpdf.io.image.ImageData;
import com.itextpdf.io.image.ImageDataFactory;
import com.itextpdf.io.image.ImageType;
import com.itextpdf.io.image.ImageTypeDetector;
import com.itextpdf.io.image.TiffImageData;
import com.itextpdf.io.source.RandomAccessFileOrArray;
import com.itextpdf.io.source.RandomAccessSourceFactory;
import com.itextpdf.io.util.MessageFormatUtil;
import com.itextpdf.io.util.UrlUtil;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.element.Paragraph;
Expand Down Expand Up @@ -172,15 +175,9 @@ static List<ImageData> getImageData(final File inputImage, IImageRotationHandler
throws OcrException, IOException {
List<ImageData> images = new ArrayList<ImageData>();

String ext = "";
int index = inputImage.getAbsolutePath().lastIndexOf('.');
if (index > 0) {
ext = new String(inputImage.getAbsolutePath().toCharArray(),
index + 1,
inputImage.getAbsolutePath().length() - index - 1);

if ("tiff".equals(ext.toLowerCase())
|| "tif".equals(ext.toLowerCase())) {
try {
ImageType imageType = ImageTypeDetector.detectImageType(UrlUtil.toURL(inputImage.getAbsolutePath()));
if (ImageType.TIFF == imageType) {
int tiffPages = getNumberOfPageTiff(inputImage);

for (int page = 0; page < tiffPages; page++) {
Expand All @@ -194,21 +191,19 @@ static List<ImageData> getImageData(final File inputImage, IImageRotationHandler
images.add(imageData);
}
} else {
try {
ImageData imageData = ImageDataFactory
.create(inputImage.getAbsolutePath());
if (imageRotationHandler != null) {
imageData = imageRotationHandler.applyRotation(imageData);
}
images.add(imageData);
} catch (com.itextpdf.io.IOException e) {
LOGGER.error(MessageFormatUtil.format(
PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE,
e.getMessage()));
throw new OcrException(
OcrException.CANNOT_READ_INPUT_IMAGE, e);
ImageData imageData = ImageDataFactory
.create(inputImage.getAbsolutePath());
if (imageRotationHandler != null) {
imageData = imageRotationHandler.applyRotation(imageData);
}
images.add(imageData);
}
} catch (com.itextpdf.io.IOException e) {
LOGGER.error(MessageFormatUtil.format(
PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE,
e.getMessage()));
throw new OcrException(
OcrException.CANNOT_READ_INPUT_IMAGE, e);
}
return images;
}
Expand Down
112 changes: 112 additions & 0 deletions pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfCreatorUtilTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2021 iText Group NV
Authors: iText Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.pdfocr;

import com.itextpdf.io.image.ImageData;
import com.itextpdf.io.image.ImageType;
import com.itextpdf.io.image.JpegImageData;
import com.itextpdf.io.image.TiffImageData;
import com.itextpdf.io.util.MessageFormatUtil;
import com.itextpdf.pdfocr.helpers.PdfHelper;
import com.itextpdf.test.ExtendedITextTest;
import com.itextpdf.test.annotations.LogMessage;
import com.itextpdf.test.annotations.LogMessages;
import com.itextpdf.test.annotations.type.UnitTest;

import java.io.File;
import java.io.IOException;
import java.util.List;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.ExpectedException;

/**
 * Unit tests for {@code PdfCreatorUtil.getImageData}: TIFF inputs (single- and multi-paged),
 * a JPEG input, a path that does not point to an image, and a corrupted image file.
 */
@Category(UnitTest.class)
public class PdfCreatorUtilTest extends ExtendedITextTest {

    @Rule
    public ExpectedException junitExpectedException = ExpectedException.none();

    /**
     * A single-paged TIFF is expected to yield exactly one {@link TiffImageData} entry.
     */
    @Test
    public void getImageDataFromValidSinglePagedTiffTest() throws IOException {
        File tiffFile = new File(PdfHelper.getImagesTestDirectory() + "single7x5cm.tif");
        List<ImageData> result = PdfCreatorUtil.getImageData(tiffFile, null);

        Assert.assertEquals(1, result.size());
        assertImage(result.get(0), TiffImageData.class, ImageType.TIFF);
    }

    /**
     * The multi-paged TIFF fixture is expected to yield nine {@link TiffImageData} entries.
     */
    @Test
    public void getImageDataFromValidMultiPagedTiffTest() throws IOException {
        File tiffFile = new File(PdfHelper.getImagesTestDirectory() + "multipage.tiff");
        List<ImageData> result = PdfCreatorUtil.getImageData(tiffFile, null);

        Assert.assertEquals(9, result.size());
        for (int i = 0; i < result.size(); i++) {
            assertImage(result.get(i), TiffImageData.class, ImageType.TIFF);
        }
    }

    /**
     * A JPEG input is expected to yield exactly one {@link JpegImageData} entry.
     */
    @Test
    public void getImageDataFromValidNotTiffTest() throws IOException {
        File jpegFile = new File(PdfHelper.getImagesTestDirectory() + "numbers_01.jpg");
        List<ImageData> result = PdfCreatorUtil.getImageData(jpegFile, null);

        Assert.assertEquals(1, result.size());
        assertImage(result.get(0), JpegImageData.class, ImageType.JPEG);
    }

    /**
     * Passing the path {@code "no such path"} is expected to raise an {@link OcrException}
     * and to log {@code CANNOT_READ_INPUT_IMAGE}.
     */
    @Test
    @LogMessages(messages = {
            @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
    })
    public void getImageDataFromNotExistingImageTest() throws IOException {
        junitExpectedException.expect(OcrException.class);

        File missing = new File("no such path");
        PdfCreatorUtil.getImageData(missing, null);
    }

    /**
     * The {@code corrupted.jpg} fixture is expected to raise an {@link OcrException} carrying
     * the {@code CANNOT_READ_INPUT_IMAGE} message and to log {@code CANNOT_READ_INPUT_IMAGE}.
     */
    @Test
    @LogMessages(messages = {
            @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
    })
    public void getImageDataFromInvalidImageTest() throws IOException {
        String expectedMessage = MessageFormatUtil.format(OcrException.CANNOT_READ_INPUT_IMAGE);
        junitExpectedException.expect(OcrException.class);
        junitExpectedException.expectMessage(expectedMessage);

        File corrupted = new File(PdfHelper.getImagesTestDirectory() + "corrupted.jpg");
        PdfCreatorUtil.getImageData(corrupted, null);
    }

    /**
     * Asserts that an image is non-null, is an instance of the expected class and reports
     * the expected original type.
     */
    private static void assertImage(ImageData actual, Class<? extends ImageData> expectedClass,
            ImageType expectedType) {
        Assert.assertNotNull(actual);
        Assert.assertTrue(expectedClass.isInstance(actual));
        Assert.assertEquals(expectedType, actual.getOriginalType());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ public class PdfInputImageTest extends ExtendedITextTest {
public ExpectedException junitExpectedException = ExpectedException.none();

@LogMessages(messages = {
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE,
count = 1)
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
})
@Test
public void testCorruptedImage() {
Expand All @@ -55,7 +54,7 @@ public void testCorruptedImage() {
}

@LogMessages(messages = {
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, count = 1)
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
})
@Test
public void testCorruptedImageWithoutExtension() {
Expand All @@ -67,4 +66,43 @@ public void testCorruptedImageWithoutExtension() {
Assert.assertNotNull(realOutput);
Assert.assertEquals("", realOutput);
}

/**
 * Runs text extraction on the relative path {@code "testName"} (no dot in the name) and
 * expects an {@link OcrException} together with a {@code CANNOT_READ_INPUT_IMAGE} log entry.
 */
@LogMessages(messages = {
        @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
})
@Test
public void testInvalidImagePathWithoutDot() {
    junitExpectedException.expect(OcrException.class);

    File file = new File("testName");
    // The call below must throw for the test to pass, so nothing after it would ever run;
    // assertions on its return value were dead code and are intentionally omitted.
    PdfHelper.getTextFromPdf(file, "testInvalidImagePathWithoutDot");
}

/**
 * Runs text extraction on the relative path {@code "test.Name"} (a dot in the name) and
 * expects an {@link OcrException} together with a {@code CANNOT_READ_INPUT_IMAGE} log entry.
 */
@LogMessages(messages = {
        @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
})
@Test
public void testInvalidImagePathWithDot() {
    junitExpectedException.expect(OcrException.class);

    File file = new File("test.Name");
    // The call below must throw for the test to pass, so nothing after it would ever run;
    // assertions on its return value were dead code and are intentionally omitted.
    PdfHelper.getTextFromPdf(file, "testInvalidImagePathWithDot");
}

/**
 * Runs text extraction on the extension-less file {@code numbers_01} from the images test
 * directory and expects an {@link OcrException} together with a
 * {@code CANNOT_READ_INPUT_IMAGE} log entry.
 * NOTE(review): why this input is rejected is not visible from this test alone - confirm
 * against the helper / OCR engine used by {@code PdfHelper.getTextFromPdf}.
 */
@LogMessages(messages = {
        @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
})
@Test
public void testValidImageWithoutExtension() {
    junitExpectedException.expect(OcrException.class);

    File file = new File(PdfHelper.getImagesTestDirectory() + "numbers_01");
    // The call below must throw for the test to pass, so nothing after it would ever run;
    // assertions on its return value were dead code and are intentionally omitted.
    PdfHelper.getTextFromPdf(file, "testValidImageWithoutExtension");
}
}
Binary file not shown.

0 comments on commit 7b61601

Please sign in to comment.