From 23306c851cd0ec4ddda85c7fb4cc5dae4d858499 Mon Sep 17 00:00:00 2001
From: Artyom Skrobov
Date: Sat, 1 May 2021 02:02:02 -0400
Subject: [PATCH] Fix leaks of Image objects in TesseractOcrUtil

Those leaks made it impossible to delete the input files until after GC.
---
Review notes (this section is not part of the commit message):

 * The three new tests shared one body; it now lives in a single helper,
   AssertInputIsDeletableAfterOcr. The helper leaves the no-GC region and
   closes the PdfDocument in a finally block. Before, an exception from
   CreatePdf() or File.Delete() skipped GC.EndNoGCRegion() and doc.Close().
 * The helper also asserts that GC was still suspended when the input file
   was deleted; otherwise a collection could have released the file and the
   test would pass for the wrong reason.
 * NOTE(review): the Dispose() calls added to InitializeImagesListFromTiff
   and GetImagePage sit at the end of their try blocks, so they appear to
   run only on the success path. Confirm against the full methods and
   prefer using/finally there as well.
 * net45 -> net46 is presumably required because the tests call
   GC.TryStartNoGCRegion - TODO confirm.
 * The .csproj hunks below show element values only (their XML markup is
   missing from this copy), and the blob ids on the index lines were not
   regenerated. Recreate the patch from the working tree before applying.

 .../itext.pdfocr.tesseract4.tests.csproj      |  4 +-
 .../general/BasicTesseractIntegrationTest.cs  | 53 ++++++++++++++++++
 .../pdfocr/tesseract4/TesseractOcrUtil.cs     |  6 +-
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj b/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj
index f6d1306..96f0171 100644
--- a/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj
+++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext.pdfocr.tesseract4.tests.csproj
@@ -9,7 +9,7 @@
 library
- net45
+ net46
 true
@@ -42,4 +42,4 @@
 ..\..\itext\itext.pdfocr.tesseract4\lib\Tesseract.dll
-
\ No newline at end of file
+
diff --git a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs
index 7534ebb..b2a7997 100644
--- a/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs
+++ b/itext.tests/itext.pdfocr.tesseract4.tests/itext/pdfocr/general/BasicTesseractIntegrationTest.cs
@@ -284,6 +284,59 @@ public virtual void TestHocrStringOutput() {
             }
         }
 
+        [NUnit.Framework.Test]
+        public virtual void TestTiffIsDisposed() {
+            AssertInputIsDeletableAfterOcr("numbers_01.tif", "testTiffIsDisposed");
+        }
+
+        [NUnit.Framework.Test]
+        public virtual void TestTiffIsDisposedWithoutPreprocessing() {
+            tesseractReader.SetTesseract4OcrEngineProperties(tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages
+                (false));
+            AssertInputIsDeletableAfterOcr("numbers_01.tif", "testTiffIsDisposedWithoutPreprocessing");
+        }
+
+        [NUnit.Framework.Test]
+        public virtual void TestJpegIsDisposed() {
+            AssertInputIsDeletableAfterOcr("numbers_01.jpg", "testJpegIsDisposed");
+        }
+
+        /// <summary>
+        /// Runs OCR on a temporary copy of a test image and deletes that copy while garbage
+        /// collection is suspended, so the delete only succeeds if the OCR code released the file itself.
+        /// </summary>
+        /// <param name="imageName">file name of the image, relative to TEST_IMAGES_DIRECTORY</param>
+        /// <param name="testName">base name of the PDF written to the target directory</param>
+        private void AssertInputIsDeletableAfterOcr(String imageName, String testName) {
+            String tmpPath = System.IO.Path.GetTempFileName();
+            String pdfPath = GetTargetDirectory() + testName + ".pdf";
+            FileInfo file = new FileInfo(tmpPath);
+            File.Copy(TEST_IMAGES_DIRECTORY + imageName, tmpPath, true);
+            OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader);
+            PdfDocument doc = null;
+            bool gcStayedSuspended = false;
+            NUnit.Framework.Assert.IsTrue(System.GC.TryStartNoGCRegion(8 * 1024 * 1024));
+            try {
+                doc = ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList(file), GetPdfWriter(
+                    pdfPath));
+                // Deleting the input here is the actual check: no collection has run yet.
+                File.Delete(tmpPath);
+                gcStayedSuspended = System.Runtime.GCSettings.LatencyMode == System.Runtime.GCLatencyMode.NoGCRegion;
+            }
+            finally {
+                // EndNoGCRegion throws when the region has already ended, which would hide the
+                // original failure, so only call it while the region is still active.
+                if (System.Runtime.GCSettings.LatencyMode == System.Runtime.GCLatencyMode.NoGCRegion) {
+                    System.GC.EndNoGCRegion();
+                }
+                if (doc != null) {
+                    doc.Close();
+                }
+            }
+            NUnit.Framework.Assert.IsTrue(gcStayedSuspended, "GC resumed during OCR, so the deletion check is inconclusive");
+            NUnit.Framework.Assert.IsNotNull(doc);
+        }
+
         /// Parse text from image and compare with expected.
         private void TestImageOcrText(AbstractTesseract4OcrEngine tesseractReader, String path, String expectedOutput
             ) {
diff --git a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs
index 5a3341d..316a6aa 100644
--- a/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs
+++ b/itext/itext.pdfocr.tesseract4/itext/pdfocr/tesseract4/TesseractOcrUtil.cs
@@ -472,6 +472,7 @@ internal void InitializeImagesListFromTiff(FileInfo inputFile)
                     temp.SetResolution(2 * xResolution, 2 * yResolution);
                     bitmapList.Add(temp);
                 }
+                originalImage.Dispose();
                 SetListOfPages(bitmapList);
             }
             catch (Exception e) {
@@ -514,6 +515,7 @@ internal static Bitmap GetImagePage(FileInfo input, int page)
                 }
                 image.SelectActiveFrame(FrameDimension.Page, page);
                 img = new Bitmap(image);
+                image.Dispose();
             }
             catch (Exception e) {
                 LogManager.GetLogger(typeof(TesseractOcrUtil))
@@ -889,8 +891,8 @@ internal static int DetectRotation(FileInfo inputFile)
         {
             try
             {
-                System.Drawing.Image image = System.Drawing.Image.FromFile(inputFile.FullName);
-                return ReadRotationFromMetadata(image);
+                using (System.Drawing.Image image = System.Drawing.Image.FromFile(inputFile.FullName))
+                    return ReadRotationFromMetadata(image);
             }
             catch (Exception e)
             {