diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..079741f --- /dev/null +++ b/.gitattributes @@ -0,0 +1,62 @@ +# Set the default behavior, in case people don't have core.autocrlf set. +* text=auto + +# Explicitly declare text files you want to always be normalized and converted +# to LF line endings on checkout. +*.afm text eol=lf +*.cmap text eol=lf +*.cs text eol=lf ident +*.css text eol=lf +*.htm text eol=lf +*.html text eol=lf +*.java text eol=lf ident +*.lng text eol=lf +*.md text eol=lf +*.pom text eol=lf +*.properties text eol=lf +*.svg text eol=lf +*.txt text eol=lf +*.xfdf text eol=lf +*.xht text eol=lf +*.xhtml text eol=lf +*.xml text eol=lf +port-hash text eol=lf + +# Declare files that will always have CRLF line endings on checkout. +*.bat text eol=crlf +*.csproj text eol=crlf +*.sln text eol=crlf + +# Denote all files that are truly binary and should not be modified. +*.aif binary +*.aiff binary +*.bmp binary +*.cer binary +*.cmp binary +*.crt binary +*.dib binary +*.gif binary +*.icc binary +*.j2k binary +*.jb2 binary +*.jp2 binary +*.jpc binary +*.jpg binary +*.key binary +*.otf binary +*.p12 binary +*.pdf binary +*.pfb binary +*.pfm binary +*.png binary +*.snd binary +*.tif binary +*.tiff binary +*.ttc binary +*.ttf binary +*.u3d binary +*.wav binary +*.wmf binary +*.woff binary +*.woff2 binary +*.dat binary \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3352d5a --- /dev/null +++ b/.gitignore @@ -0,0 +1,157 @@ +# Created by https://www.gitignore.io + +### Java ### +*.class + +# Mobile Tools for Java (J2ME) +.mtj.tmp/ + +# Package Files # +*.jar +*.war +*.ear + +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* + + +### Eclipse ### +*.pydevproject +.metadata +.gradle +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath + +# Eclipse Core +.project + +# External tool builders 
+.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# JDT-specific (Eclipse Java Development Tools) +.classpath + +# PDT-specific +.buildpath + +# sbteclipse plugin +.target + +# TeXlipse plugin +.texlipse + + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm + +*.iml + +## Directory-based project format: +.idea/ +# if you remove the above rule, at least ignore the following: + +# User-specific stuff: +# .idea/workspace.xml +# .idea/tasks.xml +# .idea/dictionaries + +# Sensitive or high-churn files: +# .idea/dataSources.ids +# .idea/dataSources.xml +# .idea/sqlDataSources.xml +# .idea/dynamic.xml +# .idea/uiDesigner.xml + +# Gradle: +# .idea/gradle.xml +# .idea/libraries + +# Mongo Explorer plugin: +# .idea/mongoSettings.xml + +## File-based project format: +*.ipr +*.iws + +## Plugin-specific files: + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties + + +### NetBeans ### +nbproject/private/ +build/ +nbbuild/ +dist/ +nbdist/ +nbactions.xml +nb-configuration.xml +.nb-gradle/ + + +### Linux ### +*~ + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + + +### Windows ### +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + +target/ +nbactions*.xml +.checkstyle +.pmd +.pmdruleset.xml + +# Ignore generated files +*.log + +.vagrant/ +.vscode/ diff --git a/.mailmap b/.mailmap new file mode 100644 index 0000000..df7e385 --- /dev/null +++ b/.mailmap @@ -0,0 +1,79 @@ +Alan Goo +Alexander Chingarev +Alexander Chingarev 
+Alexander Chingarev +Alexey Subach +Alexey Subach +Amedee Van Gasse +Amedee Van Gasse +Andrew Panfilov +Bart De Meyer +Benoît Lagae +Benoît Lagae +Benoît Lagae +Bruno Lowagie +Bruno Lowagie +Bruno Lowagie +Bryan +Dimitry Alexandrov +Dmitry Trusevich +Dmitry Trusevich +Dominik Helm +gothinkfree +Ilya Idamkin +iText Software +iText Software +iText Software +iText Software +iText Software +iText Software +iText Software +iText Software +Jeff Monson +Joris Schellekens +Kevin Day +Kevin Day +Kevin Willems +LaughingMan +Markus Wernig +Marvin Wichmann +Marvin Wichmann +Marvin Wichmann +Michaël Demey michael.demey <> +Michaël Demey +Michaël Demey +Michaël Demey +Michael Glazunoff +Michael Klink +Michael Klink +Nadia Ivaniukovich +Nadia Ivaniukovich +Nadja Sych +Natalia Zgirovskaya +Natalia Zgirovskaya +Olivier Blaise +Orabi Nakhla +Orabi Nakhla +Paulo Soares +Paulo Soares +Pavel Alay pavel.alay <> +Pavel Alay +Pavel Alay +Pavel Morozov +Pavel Morozov +Peter Goodman +Peter Goodman +Peter Goodman +Peter Kjuak +Richard Schwark +Roman Leonov +Roman Nadvodny +Sasha Kalykhan +Sasha Kalykhan +Semen Yakushev +Valera <7691262@mail.ru> <7691262@mail.ru> +Veronika Lisovskaya +Vit Nemecky +Yanina Cheremisina +Yulian Gaponenko +Yulian Gaponenko diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..d821d4d --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,298 @@ +#!/usr/bin/env groovy +@Library('pipeline-library')_ + +def schedule, sonarBranchName, sonarBranchTarget +switch (env.BRANCH_NAME) { + case ~/.*master.*/: + schedule = '@monthly' + sonarBranchName = '-Dsonar.branch.name=master' + sonarBranchTarget = '' + break + case ~/.*develop.*/: + schedule = '@midnight' + sonarBranchName = '-Dsonar.branch.name=develop' + sonarBranchTarget = '-Dsonar.branch.target=master' + break + default: + schedule = '' + sonarBranchName = '-Dsonar.branch.name=' + env.BRANCH_NAME + sonarBranchTarget = '-Dsonar.branch.target=develop' + break +} + +pipeline { + + agent { label '!master' } + + 
environment { + JDK_VERSION = 'jdk-8-oracle' + tesseractDir = tool name: 'Tesseract', type: 'com.cloudbees.jenkins.plugins.customtools.CustomTool' + } + + options { + ansiColor('xterm') + buildDiscarder logRotator(artifactNumToKeepStr: '1') + parallelsAlwaysFailFast() + skipStagesAfterUnstable() + timeout time: 1, unit: 'HOURS' + timestamps() + } + + triggers { + cron(schedule) + } + + tools { + maven 'M3' + jdk "${JDK_VERSION}" + } + + stages { + stage('Abort possible previous builds') { + steps { + script { + abortPreviousBuilds() + } + } + } + stage('Wait for blocking jobs') { + steps { + script { + properties[[ + $class : 'BuildBlockerProperty', + blockLevel : 'GLOBAL', + blockingJobs : "^iText_7_Java/itextcore/$env.JOB_BASE_NAME\$", + scanQueueFor : 'ALL', + useBuildBlocker: true + ]] /* NOTE(review): this parses as a subscript on `properties`, not a call to the properties step - confirm whether properties([[...]]) was intended before changing it */ + } + } + } + stage('Build') { + options { + retry(2) + } + stages { + stage('Clean workspace') { + options { + timeout time: 5, unit: 'MINUTES' + } + steps { + withMaven(jdk: "${JDK_VERSION}", maven: 'M3') { + sh 'mvn --threads 2C --no-transfer-progress clean dependency:purge-local-repository ' + + '-Dinclude=com.itextpdf -DresolutionFuzziness=groupId -DreResolve=false ' + + "-Dmaven.repo.local=${env.WORKSPACE.replace('\\','/')}/.repository" + } + script { + try {sh "rm -rf ${env.WORKSPACE.replace('\\','/')}/downloads"} catch (Exception ignored) {} + } + } + } + stage('Install branch dependencies') { + options { + timeout time: 5, unit: 'MINUTES' + } + when { + not { + anyOf { + branch "master" + branch "develop" + } + } + } + steps { + script { + getAndConfigureJFrogCLI() + sh "./jfrog rt dl branch-artifacts/${env.JOB_BASE_NAME}/**/java/ downloads/" + if (fileExists("downloads")) { + dir ("downloads") { + def mainPomFiles = findFiles glob: '**/main.pom' + mainPomFiles.each { pomFile -> + /* def keeps the normalized path local to this closure instead of creating a script-binding global */ def pomPath = pomFile.path.replace "\\", "/" + sh "mvn org.apache.maven.plugins:maven-install-plugin:3.0.0-M1:install-file --quiet " + + 
"-Dmaven.repo.local=${env.WORKSPACE.replace('\\','/')}/.repository " + + "-Dpackaging=pom -Dfile=${pomPath} -DpomFile=${pomPath}" + } + def pomFiles = findFiles glob: '**/*.pom' + pomFiles.each { pomFile -> + if (pomFile.name != "main.pom") { + def pomPath = pomFile.path.replace "\\", "/" + sh "mvn org.apache.maven.plugins:maven-install-plugin:3.0.0-M1:install-file --quiet " + + "-Dmaven.repo.local=${env.WORKSPACE.replace('\\', '/')}/.repository " + + "-Dpackaging=pom -Dfile=${pomPath} -DpomFile=${pomPath}" + } + } + def jarFiles = findFiles glob: '**/*.jar' + jarFiles.each { jarFile -> + def jarPath = jarFile.path.replace "\\", "/" + sh "mvn org.apache.maven.plugins:maven-install-plugin:3.0.0-M1:install-file --quiet " + + "-Dmaven.repo.local=${env.WORKSPACE.replace('\\', '/')}/.repository " + + "-Dfile=${jarPath}" + } + } + } + } + } + } + stage('Compile') { + options { + timeout time: 10, unit: 'MINUTES' + } + steps { + withMaven(jdk: "${JDK_VERSION}", maven: 'M3') { + sh 'mvn --threads 2C --no-transfer-progress package -Dmaven.test.skip=true ' + + "-Dmaven.repo.local=${env.WORKSPACE.replace('\\','/')}/.repository" + } + } + } + } + post { + failure { + sleep time: 2, unit: 'MINUTES' + } + success { + script { currentBuild.result = 'SUCCESS' } + } + } + } + stage('Static Code Analysis') { + options { + timeout time: 1, unit: 'HOURS' + } + steps { + withMaven(jdk: "${JDK_VERSION}", maven: 'M3', mavenLocalRepo: '.repository') { + sh 'mvn --no-transfer-progress verify --activate-profiles qa ' + + '-Dpmd.analysisCache=true ' + + "-Dmaven.repo.local=${env.WORKSPACE.replace('\\','/')}/.repository" + } + recordIssues(tools: [ + checkStyle(), + pmdParser(), + spotBugs(useRankAsPriority: true) + ]) + dependencyCheckPublisher pattern: 'target/dependency-check-report.xml' + } + } + stage('Run Tests') { + options { + timeout time: 30, unit: 'MINUTES' + } + steps { + withMaven(jdk: "${JDK_VERSION}", maven: 'M3') { + withSonarQubeEnv('Sonar') { + sh 'mvn --no-transfer-progress 
--activate-profiles test ' + + '-DgsExec="${gsExec}" -DcompareExec="${compareExec}" ' + + '-DtesseractDir="${tesseractDir}" ' + + '-Dmaven.main.skip=true -Dmaven.test.failure.ignore=false ' + + 'org.jacoco:jacoco-maven-plugin:prepare-agent verify org.jacoco:jacoco-maven-plugin:report ' + + '-Dsonar.java.spotbugs.reportPaths="target/spotbugs.xml" ' + + "-Dmaven.repo.local=${env.WORKSPACE.replace('\\','/')}/.repository " + + 'sonar:sonar ' + sonarBranchName + ' ' + sonarBranchTarget + } + } + } + } + stage("Quality Gate") { + options { + timeout time: 1, unit: 'HOURS' + } + steps { + waitForQualityGate abortPipeline: true + } + } + stage('Artifactory Deploy') { + options { + timeout time: 5, unit: 'MINUTES' + } + when { + anyOf { + branch "master" + branch "develop" + } + } + steps { + withMaven(jdk: "${JDK_VERSION}", maven: 'M3') { + script { + def server = Artifactory.server 'itext-artifactory' + def rtMaven = Artifactory.newMavenBuild() + rtMaven.deployer server: server, releaseRepo: 'releases', snapshotRepo: 'snapshot' + rtMaven.tool = 'M3' + def buildInfo = rtMaven.run pom: 'pom.xml', goals: '--threads 2C --no-transfer-progress install --activate-profiles artifactory ' + + "-Dmaven.repo.local=${env.WORKSPACE.replace('\\','/')}/.repository".toString() + server.publishBuildInfo buildInfo + } + } + } + } + stage('Branch Artifactory Deploy') { + options { + timeout time: 5, unit: 'MINUTES' + } + when { + not { + anyOf { + branch "master" + branch "develop" + } + } + } + steps { + script { + if (env.GIT_URL) { + /* repository name = second capture group of GIT_URL; declared with def so it stays local to this script block */ def repoName = ("${env.GIT_URL}" =~ /(.*\/)(.*)(\.git)/)[ 0 ][ 2 ] + findFiles(glob: '*/target/*.jar').each { item -> + if (!(item ==~ /.*\/[fs]b-contrib-.*?.jar/) && !(item ==~ /.*\/findsecbugs-plugin-.*?.jar/) && !(item ==~ /.*-sources.jar/) && !(item ==~ /.*-javadoc.jar/)) { + sh "./jfrog rt u \"${item.path}\" branch-artifacts/${env.BRANCH_NAME}/${repoName}/java/ --recursive=false --build-name ${env.BRANCH_NAME} --build-number ${env.BUILD_NUMBER} --props 
\"vcs.revision=${env.GIT_COMMIT};repo.name=${repoName}\"" + } + } + findFiles(glob: '**/pom.xml').each { item -> + def pomPath = item.path.replace('\\', '/') + if (!(pomPath ==~ /.*target.*/)) { + def resPomName = "main.pom" + def subDirName = (pomPath ==~ /^.*(?<=\/|^)(.*)\/pom\.xml/) ? (pomPath =~ /^.*(?<=\/|^)(.*)\/pom\.xml/)[0][1] : null /* keep only the captured String: a live java.util.regex.Matcher local is not Serializable when the sh step below runs */ + if (subDirName != null) { + resPomName = "${subDirName}.pom" + } + sh "./jfrog rt u \"${item.path}\" branch-artifacts/${env.BRANCH_NAME}/${repoName}/java/${resPomName} --recursive=false --build-name ${env.BRANCH_NAME} --build-number ${env.BUILD_NUMBER} --props \"vcs.revision=${env.GIT_COMMIT};repo.name=${repoName}\"" + } + } + } + } + } + } + } + + post { + always { + echo 'One way or another, I have finished \uD83E\uDD16' + } + success { + echo 'I succeeded! \u263A' + cleanWs deleteDirs: true + } + unstable { + echo 'I am unstable \uD83D\uDE2E' + } + failure { + echo 'I failed \uD83D\uDCA9' + } + changed { + echo 'Things were different before... \uD83E\uDD14' + } + fixed { + script { + if (env.BRANCH_NAME.contains('master') || env.BRANCH_NAME.contains('develop')) { + slackNotifier "#ci", currentBuild.currentResult, "${env.BRANCH_NAME} - Back to normal" + } + } + } + regression { + script { + if (env.BRANCH_NAME.contains('master') || env.BRANCH_NAME.contains('develop')) { + slackNotifier "#ci", currentBuild.currentResult, "${env.BRANCH_NAME} - First failure" + } + } + } + } + +} diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..39d74c2 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,16 @@ + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. diff --git a/gnu-agpl-v3.0.md b/gnu-agpl-v3.0.md new file mode 100644 index 0000000..4ef32f0 --- /dev/null +++ b/gnu-agpl-v3.0.md @@ -0,0 +1,651 @@ +GNU Affero General Public License +================================= + +_Version 3, 19 November 2007_ +_Copyright © 2007 Free Software Foundation, Inc. <https://fsf.org/>_ + +Everyone is permitted to copy and distribute verbatim copies +of this license document, but changing it is not allowed. + +## Preamble + +The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + +The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + +When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + +Developers that use our General Public Licenses protect your rights +with two steps: **(1)** assert copyright on the software, and **(2)** offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. 
+ +A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + +The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + +An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + +The precise terms and conditions for copying, distribution and +modification follow. + +## TERMS AND CONDITIONS + +### 0. Definitions + +“This License” refers to version 3 of the GNU Affero General Public License. + +“Copyright” also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + +“The Program” refers to any copyrightable work licensed under this +License. Each licensee is addressed as “you”. “Licensees” and +“recipients” may be individuals or organizations. + +To “modify” a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. 
The resulting work is called a “modified version” of the +earlier work or a work “based on” the earlier work. + +A “covered work” means either the unmodified Program or a work based +on the Program. + +To “propagate” a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + +To “convey” a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + +An interactive user interface displays “Appropriate Legal Notices” +to the extent that it includes a convenient and prominently visible +feature that **(1)** displays an appropriate copyright notice, and **(2)** +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + +### 1. Source Code + +The “source code” for a work means the preferred form of the work +for making modifications to it. “Object code” means any non-source +form of a work. + +A “Standard Interface” means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ +The “System Libraries” of an executable work include anything, other +than the work as a whole, that **(a)** is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and **(b)** serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +“Major Component”, in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + +The “Corresponding Source” for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + +The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + +The Corresponding Source for a work in source code form is that +same work. + +### 2. Basic Permissions + +All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + +You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + +Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + +### 3. Protecting Users' Legal Rights From Anti-Circumvention Law + +No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + +When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + +### 4. 
Conveying Verbatim Copies + +You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + +You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + +### 5. Conveying Modified Source Versions + +You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + +* **a)** The work must carry prominent notices stating that you modified +it, and giving a relevant date. +* **b)** The work must carry prominent notices stating that it is +released under this License and any conditions added under section 7. +This requirement modifies the requirement in section 4 to +“keep intact all notices”. +* **c)** You must license the entire work, as a whole, under this +License to anyone who comes into possession of a copy. This +License will therefore apply, along with any applicable section 7 +additional terms, to the whole of the work, and all its parts, +regardless of how they are packaged. This License gives no +permission to license the work in any other way, but it does not +invalidate such permission if you have separately received it. +* **d)** If the work has interactive user interfaces, each must display +Appropriate Legal Notices; however, if the Program has interactive +interfaces that do not display Appropriate Legal Notices, your +work need not make them do so. 
+ +A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +“aggregate” if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + +### 6. Conveying Non-Source Forms + +You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + +* **a)** Convey the object code in, or embodied in, a physical product +(including a physical distribution medium), accompanied by the +Corresponding Source fixed on a durable physical medium +customarily used for software interchange. +* **b)** Convey the object code in, or embodied in, a physical product +(including a physical distribution medium), accompanied by a +written offer, valid for at least three years and valid for as +long as you offer spare parts or customer support for that product +model, to give anyone who possesses the object code either **(1)** a +copy of the Corresponding Source for all the software in the +product that is covered by this License, on a durable physical +medium customarily used for software interchange, for a price no +more than your reasonable cost of physically performing this +conveying of source, or **(2)** access to copy the +Corresponding Source from a network server at no charge. +* **c)** Convey individual copies of the object code with a copy of the +written offer to provide the Corresponding Source. 
This +alternative is allowed only occasionally and noncommercially, and +only if you received the object code with such an offer, in accord +with subsection 6b. +* **d)** Convey the object code by offering access from a designated +place (gratis or for a charge), and offer equivalent access to the +Corresponding Source in the same way through the same place at no +further charge. You need not require recipients to copy the +Corresponding Source along with the object code. If the place to +copy the object code is a network server, the Corresponding Source +may be on a different server (operated by you or a third party) +that supports equivalent copying facilities, provided you maintain +clear directions next to the object code saying where to find the +Corresponding Source. Regardless of what server hosts the +Corresponding Source, you remain obligated to ensure that it is +available for as long as needed to satisfy these requirements. +* **e)** Convey the object code using peer-to-peer transmission, provided +you inform other peers where the object code and Corresponding +Source of the work are being offered to the general public at no +charge under subsection 6d. + +A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + +A “User Product” is either **(1)** a “consumer product”, which means any +tangible personal property which is normally used for personal, family, +or household purposes, or **(2)** anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, “normally used” refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + +“Installation Information” for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + +If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + +The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + +Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + +### 7. Additional Terms + +“Additional permissions” are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + +When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ +Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + +* **a)** Disclaiming warranty or limiting liability differently from the +terms of sections 15 and 16 of this License; or +* **b)** Requiring preservation of specified reasonable legal notices or +author attributions in that material or in the Appropriate Legal +Notices displayed by works containing it; or +* **c)** Prohibiting misrepresentation of the origin of that material, or +requiring that modified versions of such material be marked in +reasonable ways as different from the original version; or +* **d)** Limiting the use for publicity purposes of names of licensors or +authors of the material; or +* **e)** Declining to grant rights under trademark law for use of some +trade names, trademarks, or service marks; or +* **f)** Requiring indemnification of licensors and authors of that +material by anyone who conveys the material (or modified versions of +it) with contractual assumptions of liability to the recipient, for +any liability that these contractual assumptions directly impose on +those licensors and authors. + +All other non-permissive additional terms are considered “further +restrictions” within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ +If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + +Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + +### 8. Termination + +You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + +However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated **(a)** +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and **(b)** permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + +Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + +Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + +### 9. Acceptance Not Required for Having Copies + +You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + +### 10. Automatic Licensing of Downstream Recipients + +Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + +An “entity transaction” is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + +You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + +### 11. 
Patents + +A “contributor” is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's “contributor version”. + +A contributor's “essential patent claims” are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, “control” includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + +Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + +In the following three paragraphs, a “patent license” is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To “grant” such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ +If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either **(1)** cause the Corresponding Source to be so +available, or **(2)** arrange to deprive yourself of the benefit of the +patent license for this particular work, or **(3)** arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. “Knowingly relying” means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + +A patent license is “discriminatory” if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license **(a)** in connection with copies of the covered work +conveyed by you (or copies made from those copies), or **(b)** primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + +### 12. No Surrender of Others' Freedom + +If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + +### 13. 
Remote Network Interaction; Use with the GNU General Public License + +Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + +Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + +### 14. Revised Versions of this License + +The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License “or any later version” applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. 
If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + +If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + +Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + +### 15. Disclaimer of Warranty + +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +### 16. Limitation of Liability + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + +### 17. 
Interpretation of Sections 15 and 16 + +If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + +_END OF TERMS AND CONDITIONS_ + +## How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + +To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the “copyright” line and a pointer to where the full notice is found. + + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + +If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source.
For example, if your program is a web application, its +interface could display a “Source” link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + +You should also get your employer (if you work as a programmer) or school, +if any, to sign a “copyright disclaimer” for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>. diff --git a/pdfocr-api/pom.xml b/pdfocr-api/pom.xml new file mode 100644 index 0000000..87f6927 --- /dev/null +++ b/pdfocr-api/pom.xml @@ -0,0 +1,50 @@ + + + 4.0.0 + + + com.itextpdf + pdfocr-root + 1.0.0 + + + pdfocr-api + + pdfOCR API + pdfOCR is an iText 7 add-on for Java to recognize and extract text in scanned documents and images. It can also convert them into fully ISO-compliant PDF or PDF/A-3u files that are accessible, searchable, and suitable for archiving + + + + com.itextpdf + layout + ${itext.version} + + + com.itextpdf + pdfa + ${itext.version} + + + org.apache.commons + commons-imaging + 1.0-alpha1 + + + com.itextpdf + pdftest + ${itext.version} + test + + + + + + + src/main/resources + + **/*.ttf + + + + + diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/IMetaInfoWrapper.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/IMetaInfoWrapper.java new file mode 100644 index 0000000..e702cb9 --- /dev/null +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/IMetaInfoWrapper.java @@ -0,0 +1,37 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ +package com.itextpdf.pdfocr; + +import com.itextpdf.kernel.counter.event.IMetaInfo; + +/** + * Wrapper that holds an {@link IMetaInfo} instance and exposes it. + */ +public interface IMetaInfoWrapper { + + /** + * Returns the meta info held by this wrapper. + * + * @return the wrapped {@link IMetaInfo} + */ + IMetaInfo getWrappedMetaInfo(); +} diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/IOcrEngine.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/IOcrEngine.java new file mode 100644 index 0000000..878fc57 --- /dev/null +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/IOcrEngine.java @@ -0,0 +1,61 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr; + +import java.io.File; +import java.util.List; +import java.util.Map; + +/** + * {@link IOcrEngine} interface is used for instantiating new OcrReader + * objects. + * {@link IOcrEngine} interface provides possibility to perform OCR, + * to read data from input files and to return the contained text in the + * required format. + */ +public interface IOcrEngine { + + /** + * Reads data from the provided input image file and returns retrieved data + * in the format described below. + * + * @param input input image {@link java.io.File} + * @return {@link java.util.Map} where key is {@link java.lang.Integer} + * representing the number of the page and value is + * {@link java.util.List} of {@link TextInfo} elements where each + * {@link TextInfo} element contains a word or a line and its 4 + * coordinates(bbox) + */ + Map> doImageOcr(File input); + + /** + * Performs OCR using provided {@link IOcrEngine} for the given list of + * input images and saves output to a text file using provided path. + * Note that a human reading order is not guaranteed + * due to possible specifics of input images (multi column layout, tables etc) + * + * @param inputImages {@link java.util.List} of images to be OCRed + * @param txtFile file to be created + */ + void createTxtFile(List inputImages, File txtFile); +} diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrEngineProperties.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrEngineProperties.java new file mode 100644 index 0000000..fd41dea --- /dev/null +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrEngineProperties.java @@ -0,0 +1,76 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. 
+ + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ +package com.itextpdf.pdfocr; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Holds the settings for OCR; currently this is the list of languages + * required for OCR of the provided images. + */ +public class OcrEngineProperties { + + /** + * List of languages required for ocr for provided images. + * Always an unmodifiable list that is never shared with callers. + */ + private List<String> languages = Collections.emptyList(); + + /** + * Creates a new {@link OcrEngineProperties} instance. + */ + public OcrEngineProperties() { + } + + /** + * Creates a new {@link OcrEngineProperties} instance + * based on another {@link OcrEngineProperties} instance (copy + * constructor). + * + * @param other the other {@link OcrEngineProperties} instance + */ + public OcrEngineProperties(OcrEngineProperties other) { + // Sharing the reference is safe: the stored list is unmodifiable + // and is a private copy (see setLanguages). + this.languages = other.languages; + } + + /** + * Gets list of languages required for provided images. + * + * @return a new, modifiable {@link List} holding the languages + */ + public final List<String> getLanguages() { + return new ArrayList<String>(languages); + } + + /** + * Sets list of languages to be recognized in provided images. + * Consult with documentation of specific engine implementations + * to check on which format to give the language in.
+ * The given list is copied, so later changes to it do not affect + * this instance. + * + * @param requiredLanguages {@link List} of languages in string + * format + * @return the {@link OcrEngineProperties} instance + */ + public final OcrEngineProperties setLanguages( + final List<String> requiredLanguages) { + // Copy first: unmodifiableList alone is only a view of the caller's list. + languages = Collections.unmodifiableList(new ArrayList<String>(requiredLanguages)); + return this; + } +} diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrException.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrException.java new file mode 100644 index 0000000..be8486b --- /dev/null +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrException.java @@ -0,0 +1,98 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ +package com.itextpdf.pdfocr; + +import com.itextpdf.io.util.MessageFormatUtil; + +import java.util.Arrays; +import java.util.List; + +/** + * Exception class for custom exceptions. + */ +public class OcrException extends RuntimeException { + + public static final String CANNOT_READ_INPUT_IMAGE = + "Cannot read input image"; + public static final String CANNOT_RESOLVE_PROVIDED_FONTS = "Cannot resolve " + + "any of provided fonts.
Please check provided FontProvider."; + public static final String CANNOT_CREATE_PDF_DOCUMENT = "Cannot create " + + "PDF document: {0}"; + private List<String> messageParams; + + /** + * Creates a new OcrException. + * + * @param msg the detail message. + * @param e the cause + * (which is saved for later retrieval + * by {@link #getCause()} method). + */ + public OcrException(String msg, Throwable e) { + super(msg, e); + } + + /** + * Creates a new OcrException. + * + * @param msg the detail message. + */ + public OcrException(String msg) { + super(msg); + } + + /** + * {@inheritDoc} + */ + @Override + public String getMessage() { + return this.messageParams != null && this.messageParams.size() != 0 + ? MessageFormatUtil + .format(super.getMessage(), this.getMessageParams()) + : super.getMessage(); + } + + /** + * Gets additional params for Exception message. + * + * @return a new array holding the message params in order, or an empty + * array if no params have been set + */ + protected Object[] getMessageParams() { + // Params are optional: guard against the unset (null) list so that + // subclasses calling this directly do not get a NullPointerException. + if (this.messageParams == null) { + return new Object[0]; + } + + Object[] parameters = new Object[this.messageParams.size()]; + + for(int i = 0; i < this.messageParams.size(); ++i) { + parameters[i] = this.messageParams.get(i); + } + + return parameters; + } + + /** + * Sets additional params for Exception message. + * + * @param messageParams additional params. + * @return object itself. + */ + public OcrException setMessageParams(String... messageParams) { + this.messageParams = Arrays.asList(messageParams); + return this; + } +} diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreator.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreator.java new file mode 100644 index 0000000..1196cec --- /dev/null +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreator.java @@ -0,0 +1,621 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ +package com.itextpdf.pdfocr; + +import com.itextpdf.io.font.otf.ActualTextIterator; +import com.itextpdf.io.font.otf.Glyph; +import com.itextpdf.io.font.otf.GlyphLine; +import com.itextpdf.io.image.ImageData; +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.counter.event.IMetaInfo; +import com.itextpdf.kernel.font.PdfFont; +import com.itextpdf.kernel.font.PdfTrueTypeFont; +import com.itextpdf.kernel.font.PdfType0Font; +import com.itextpdf.kernel.font.PdfType1Font; +import com.itextpdf.kernel.font.PdfType3Font; +import com.itextpdf.kernel.geom.PageSize; +import com.itextpdf.kernel.pdf.DocumentProperties; +import com.itextpdf.kernel.pdf.PdfAConformanceLevel; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfDocumentInfo; +import com.itextpdf.kernel.pdf.PdfOutputIntent; +import com.itextpdf.kernel.pdf.PdfPage; +import com.itextpdf.kernel.pdf.PdfString; +import com.itextpdf.kernel.pdf.PdfViewerPreferences; +import com.itextpdf.kernel.pdf.PdfWriter; +import com.itextpdf.kernel.pdf.canvas.PdfCanvas; +import com.itextpdf.kernel.pdf.canvas.PdfCanvasConstants.TextRenderingMode; +import com.itextpdf.kernel.pdf.layer.PdfLayer; +import com.itextpdf.layout.Canvas; +import com.itextpdf.layout.Document; +import com.itextpdf.layout.element.Paragraph; +import
com.itextpdf.layout.element.Text; +import com.itextpdf.layout.font.FontProvider; +import com.itextpdf.layout.property.TextAlignment; +import com.itextpdf.pdfa.PdfADocument; +import com.itextpdf.pdfocr.OcrPdfCreatorMetaInfo.PdfDocumentType; +import com.itextpdf.pdfocr.events.IThreadLocalMetaInfoAware; + +import java.io.File; +import java.io.IOException; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link OcrPdfCreator} creates PDF documents that contain the input + * images together with the text recognized in them by the provided + * {@link IOcrEngine}. + * + * {@link OcrPdfCreator} provides possibilities to set list of input images to + * be used for OCR, to set scaling mode for images, to set color of text in + * output PDF document, to set fixed size of the PDF document's page and to + * perform OCR using given images and to return + * {@link com.itextpdf.kernel.pdf.PdfDocument} as result. + * OCR is performed by the provided {@link IOcrEngine} + * (e.g. tesseract reader). The engine is obligatory and should be + * provided either in the constructor + * or using {@link #setOcrEngine(IOcrEngine)}. + */ +public class OcrPdfCreator { + + /** + * The logger. + */ + private static final Logger LOGGER = LoggerFactory + .getLogger(OcrPdfCreator.class); + + /** + * The {@link IOcrEngine} used to perform OCR. + */ + private IOcrEngine ocrEngine; + + /** + * Set of properties for this {@link OcrPdfCreator}. + */ + private OcrPdfCreatorProperties ocrPdfCreatorProperties; + + /** + * Creates a new {@link OcrPdfCreator} instance with a default-constructed + * {@link OcrPdfCreatorProperties}. + * + * @param ocrEngine {@link IOcrEngine} selected OCR Reader + */ + public OcrPdfCreator(final IOcrEngine ocrEngine) { + this(ocrEngine, new OcrPdfCreatorProperties()); + } + + /** + * Creates a new {@link OcrPdfCreator} instance.
+ * + * @param ocrEngine selected OCR Reader {@link IOcrEngine} + * @param ocrPdfCreatorProperties set of properties for {@link OcrPdfCreator} + */ + public OcrPdfCreator(final IOcrEngine ocrEngine, + final OcrPdfCreatorProperties ocrPdfCreatorProperties) { + setOcrEngine(ocrEngine); + setOcrPdfCreatorProperties(ocrPdfCreatorProperties); + } + + /** + * Gets properties for {@link OcrPdfCreator}. + * + * @return the {@link OcrPdfCreatorProperties} currently set; this is the + * stored instance itself, not a copy + */ + public final OcrPdfCreatorProperties getOcrPdfCreatorProperties() { + return ocrPdfCreatorProperties; + } + + /** + * Sets properties for {@link OcrPdfCreator}. + * The given instance is stored by reference. + * + * @param ocrPdfCreatorProperties set of properties + * {@link OcrPdfCreatorProperties} for {@link OcrPdfCreator} + */ + public final void setOcrPdfCreatorProperties( + final OcrPdfCreatorProperties ocrPdfCreatorProperties) { + this.ocrPdfCreatorProperties = ocrPdfCreatorProperties; + } + + /** + * Performs OCR with set parameters using provided {@link IOcrEngine} and + * creates PDF using provided {@link com.itextpdf.kernel.pdf.PdfWriter} and + * {@link com.itextpdf.kernel.pdf.PdfOutputIntent}. + * PDF/A-3u document will be created if + * provided {@link com.itextpdf.kernel.pdf.PdfOutputIntent} is not null.
+ * + * @param inputImages {@link java.util.List} of images to be OCRed + * @param pdfWriter the {@link com.itextpdf.kernel.pdf.PdfWriter} object + * to write final PDF document to + * @param pdfOutputIntent {@link com.itextpdf.kernel.pdf.PdfOutputIntent} + * for PDF/A-3u document + * @return result PDF/A-3u {@link com.itextpdf.kernel.pdf.PdfDocument} + * object + * @throws OcrException if it was not possible to read provided or + * default font + */ + public final PdfDocument createPdfA(final List inputImages, + final PdfWriter pdfWriter, + final PdfOutputIntent pdfOutputIntent) + throws OcrException { + LOGGER.info(MessageFormatUtil.format( + PdfOcrLogMessageConstant.START_OCR_FOR_IMAGES, + inputImages.size())); + + IMetaInfo storedMetaInfo = null; + if (ocrEngine instanceof IThreadLocalMetaInfoAware) { + storedMetaInfo = ((IThreadLocalMetaInfoAware)ocrEngine).getThreadLocalMetaInfo(); + ((IThreadLocalMetaInfoAware)ocrEngine).setThreadLocalMetaInfo( + new OcrPdfCreatorMetaInfo(((IThreadLocalMetaInfoAware)ocrEngine).getThreadLocalMetaInfo(), + UUID.randomUUID(), + null != pdfOutputIntent ? PdfDocumentType.PDFA : PdfDocumentType.PDF)); + } + + // map contains: + // keys: image files + // values: + // map pageNumber -> retrieved text data(text and its coordinates) + Map>> imagesTextData = + new LinkedHashMap>>(); + try { + for (File inputImage : inputImages) { + imagesTextData.put(inputImage, + ocrEngine.doImageOcr(inputImage)); + } + } finally { + if (ocrEngine instanceof IThreadLocalMetaInfoAware) { + ((IThreadLocalMetaInfoAware)ocrEngine).setThreadLocalMetaInfo(storedMetaInfo); + } + } + + + // create PdfDocument + return createPdfDocument(pdfWriter, pdfOutputIntent, imagesTextData); + } + + /** + * Performs OCR with set parameters using provided {@link IOcrEngine} and + * creates PDF using provided {@link com.itextpdf.kernel.pdf.PdfWriter}. 
+ * + * @param inputImages {@link java.util.List} of images to be OCRed + * @param pdfWriter the {@link com.itextpdf.kernel.pdf.PdfWriter} object + * to write final PDF document to + * @return result {@link com.itextpdf.kernel.pdf.PdfDocument} object + * @throws OcrException if provided font is incorrect + */ + public final PdfDocument createPdf(final List inputImages, + final PdfWriter pdfWriter) + throws OcrException { + return createPdfA(inputImages, pdfWriter, null); + } + + /** + * Gets used {@link IOcrEngine}. + * + * Returns {@link IOcrEngine} reader object to perform OCR. + * @return selected {@link IOcrEngine} instance + */ + public final IOcrEngine getOcrEngine() { + return ocrEngine; + } + + /** + * Sets {@link IOcrEngine} reader object to perform OCR. + * @param reader selected {@link IOcrEngine} instance + */ + public final void setOcrEngine(final IOcrEngine reader) { + ocrEngine = reader; + } + + /** + * Adds image (or its one page) and text that was found there to canvas. + * + * @param pdfDocument result {@link com.itextpdf.kernel.pdf.PdfDocument} + * @param imageSize size of the image according to the selected + * {@link ScaleMode} + * @param pageText text that was found on this image (or on this page) + * @param imageData input image if it is a single page or its one page if + * this is a multi-page image + * @param createPdfA3u true if PDF/A3u document is being created + * @throws OcrException if PDF/A3u document is being created and provided + * font contains notdef glyphs + */ + private void addToCanvas(final PdfDocument pdfDocument, + final com.itextpdf.kernel.geom.Rectangle imageSize, + final List pageText, final ImageData imageData, + final boolean createPdfA3u) throws OcrException { + com.itextpdf.kernel.geom.Rectangle rectangleSize = + ocrPdfCreatorProperties.getPageSize() == null + ? 
imageSize : ocrPdfCreatorProperties.getPageSize(); + PageSize size = new PageSize(rectangleSize); + PdfPage pdfPage = pdfDocument.addNewPage(size); + PdfCanvas canvas = new NotDefCheckingPdfCanvas(pdfPage, createPdfA3u); + + PdfLayer[] layers = createPdfLayers(ocrPdfCreatorProperties.getImageLayerName(), + ocrPdfCreatorProperties.getTextLayerName(), + pdfDocument); + + if (layers[0] != null) { + canvas.beginLayer(layers[0]); + } + addImageToCanvas(imageData, imageSize, canvas); + if (layers[0] != null && layers[0] != layers[1]) { + canvas.endLayer(); + } + + // how much the original image size changed + float multiplier = imageData == null + ? 1 : imageSize.getWidth() + / PdfCreatorUtil.getPoints(imageData.getWidth()); + if (layers[1] != null && layers[0] != layers[1]) { + canvas.beginLayer(layers[1]); + } + + try { + addTextToCanvas(imageSize, pageText, canvas, multiplier, + pdfPage.getMediaBox()); + } catch (OcrException e) { + LOGGER.error(MessageFormatUtil.format( + OcrException.CANNOT_CREATE_PDF_DOCUMENT, + e.getMessage())); + throw new OcrException(OcrException.CANNOT_CREATE_PDF_DOCUMENT) + .setMessageParams(e.getMessage()); + } + if (layers[1] != null) { + canvas.endLayer(); + } + } + + /** + * Creates a new PDF document using provided properties, adds images with + * recognized text. 
+ * + * @param pdfWriter the {@link com.itextpdf.kernel.pdf.PdfWriter} object + * to write final PDF document to + * @param pdfOutputIntent {@link com.itextpdf.kernel.pdf.PdfOutputIntent} + * for PDF/A-3u document + * @param imagesTextData map that contains input image files as keys, + * and as value: map pageNumber -> text for the page + * @return result {@link com.itextpdf.kernel.pdf.PdfDocument} object + */ + private PdfDocument createPdfDocument(final PdfWriter pdfWriter, + final PdfOutputIntent pdfOutputIntent, + final Map>> imagesTextData) { + PdfDocument pdfDocument; + boolean createPdfA3u = pdfOutputIntent != null; + if (createPdfA3u) { + pdfDocument = new PdfADocument(pdfWriter, + PdfAConformanceLevel.PDF_A_3U, pdfOutputIntent, + new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo())); + } else { + pdfDocument = new PdfDocument(pdfWriter, + new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo())); + } + + // pdfLang should be set in PDF/A mode + boolean hasPdfLangProperty = ocrPdfCreatorProperties.getPdfLang() != null + && !ocrPdfCreatorProperties.getPdfLang().equals(""); + if (createPdfA3u && !hasPdfLangProperty) { + LOGGER.error(MessageFormatUtil.format( + OcrException.CANNOT_CREATE_PDF_DOCUMENT, + PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET)); + throw new OcrException(OcrException.CANNOT_CREATE_PDF_DOCUMENT) + .setMessageParams(PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET); + } + + // add metadata + if (hasPdfLangProperty) { + pdfDocument.getCatalog() + .setLang(new PdfString(ocrPdfCreatorProperties.getPdfLang())); + } + + // set title if it is not empty + if (ocrPdfCreatorProperties.getTitle() != null) { + pdfDocument.getCatalog().setViewerPreferences( + new PdfViewerPreferences().setDisplayDocTitle(true)); + PdfDocumentInfo info = pdfDocument.getDocumentInfo(); + info.setTitle(ocrPdfCreatorProperties.getTitle()); + } + + // reset passed font provider + 
ocrPdfCreatorProperties.getFontProvider().reset(); + + addDataToPdfDocument(imagesTextData, pdfDocument, createPdfA3u); + + return pdfDocument; + } + + /** + * Places provided images and recognized text to the result PDF document. + * + * @param imagesTextData map that contains input image + * files as keys, and as value: + * map pageNumber -> text for the page + * @param pdfDocument result {@link com.itextpdf.kernel.pdf.PdfDocument} + * @param createPdfA3u true if PDF/A3u document is being created + * @throws OcrException if input image cannot be read or provided font + * contains NOTDEF glyphs + */ + private void addDataToPdfDocument( + final Map>> imagesTextData, + final PdfDocument pdfDocument, + final boolean createPdfA3u) throws OcrException { + for (Map.Entry>> entry + : imagesTextData.entrySet()) { + try { + File inputImage = entry.getKey(); + List imageDataList = + PdfCreatorUtil.getImageData(inputImage); + LOGGER.info(MessageFormatUtil.format( + PdfOcrLogMessageConstant.NUMBER_OF_PAGES_IN_IMAGE, + inputImage.toString(), imageDataList.size())); + + Map> imageTextData = entry.getValue(); + if (imageTextData.keySet().size() > 0) { + for (int page = 0; page < imageDataList.size(); ++page) { + ImageData imageData = imageDataList.get(page); + com.itextpdf.kernel.geom.Rectangle imageSize = + PdfCreatorUtil.calculateImageSize( + imageData, + ocrPdfCreatorProperties.getScaleMode(), + ocrPdfCreatorProperties.getPageSize()); + + if (imageTextData.containsKey(page + 1)) { + addToCanvas(pdfDocument, imageSize, + imageTextData.get(page + 1), + imageData, createPdfA3u); + } + } + } + } catch (IOException e) { + LOGGER.error(MessageFormatUtil.format( + PdfOcrLogMessageConstant.CANNOT_ADD_DATA_TO_PDF_DOCUMENT, + e.getMessage())); + } + } + } + + /** + * Places given image to canvas to background to a separate layer. 
+ * + * @param imageData input image as {@link java.io.File} + * @param imageSize size of the image according to the selected + * {@link ScaleMode} + * @param pdfCanvas canvas to place the image + */ + private void addImageToCanvas(final ImageData imageData, + final com.itextpdf.kernel.geom.Rectangle imageSize, + final PdfCanvas pdfCanvas) { + if (imageData != null) { + if (ocrPdfCreatorProperties.getPageSize() == null) { + pdfCanvas.addImage(imageData, imageSize, false); + } else { + com.itextpdf.kernel.geom.Point coordinates = + PdfCreatorUtil.calculateImageCoordinates( + ocrPdfCreatorProperties.getPageSize(), imageSize); + com.itextpdf.kernel.geom.Rectangle rect = + new com.itextpdf.kernel.geom.Rectangle( + (float)coordinates.x, (float)coordinates.y, + imageSize.getWidth(), imageSize.getHeight()); + pdfCanvas.addImage(imageData, rect, false); + } + } + } + + /** + * Places retrieved text to canvas to a separate layer. + * + * @param imageSize size of the image according to the selected + * {@link ScaleMode} + * @param pageText text that was found on this image (or on this page) + * @param pdfCanvas canvas to place the text + * @param multiplier coefficient to adjust text placing on canvas + * @param pageMediaBox page parameters + * @throws OcrException if PDF/A3u document is being created and provided + * font contains notdef glyphs + */ + private void addTextToCanvas( + final com.itextpdf.kernel.geom.Rectangle imageSize, + final List pageText, + final PdfCanvas pdfCanvas, + final float multiplier, + final com.itextpdf.kernel.geom.Rectangle pageMediaBox) + throws OcrException { + if (pageText != null && pageText.size() > 0) { + com.itextpdf.kernel.geom.Point imageCoordinates = + PdfCreatorUtil.calculateImageCoordinates( + ocrPdfCreatorProperties.getPageSize(), imageSize); + for (TextInfo item : pageText) { + String line = item.getText(); + List coordinates = item.getBbox(); + final float left = coordinates.get(0) * multiplier; + final float right = 
(coordinates.get(2) + 1) * multiplier - 1; + final float top = coordinates.get(1) * multiplier; + final float bottom = (coordinates.get(3) + 1) * multiplier - 1; + + float bboxWidthPt = PdfCreatorUtil + .getPoints(right - left); + float bboxHeightPt = PdfCreatorUtil + .getPoints(bottom - top); + FontProvider fontProvider = getOcrPdfCreatorProperties() + .getFontProvider(); + String fontFamily = getOcrPdfCreatorProperties() + .getDefaultFontFamily(); + if (!line.isEmpty() && bboxHeightPt > 0 && bboxWidthPt > 0) { + Document document = new Document(pdfCanvas.getDocument()); + document.setFontProvider(fontProvider); + + // Scale the text width to fit the OCR bbox + float fontSize = PdfCreatorUtil.calculateFontSize( + document, line, fontFamily, + bboxHeightPt, bboxWidthPt); + + float lineWidth = PdfCreatorUtil.getRealLineWidth(document, + line, fontFamily, fontSize); + + float deltaX = PdfCreatorUtil.getPoints(left); + float deltaY = imageSize.getHeight() + - PdfCreatorUtil.getPoints(bottom); + + Canvas canvas = new Canvas(pdfCanvas, pageMediaBox); + canvas.setFontProvider(fontProvider); + + Text text = new Text(line) + .setHorizontalScaling(bboxWidthPt / lineWidth); + + Paragraph paragraph = new Paragraph(text) + .setMargin(0) + .setMultipliedLeading(1.2f); + paragraph.setFontFamily(fontFamily) + .setFontSize(fontSize); + paragraph.setWidth(bboxWidthPt * 1.5f); + + if (ocrPdfCreatorProperties.getTextColor() != null) { + paragraph.setFontColor( + ocrPdfCreatorProperties.getTextColor()); + } else { + paragraph.setTextRenderingMode( + TextRenderingMode.INVISIBLE); + } + + canvas.showTextAligned(paragraph, + deltaX + (float)imageCoordinates.x, + deltaY + (float)imageCoordinates.y, + TextAlignment.LEFT); + canvas.close(); + } + } + } + } + + /** + * Creates layers for image and text according rules set in {@link OcrPdfCreatorProperties}. 
+ * + * @param imageLayerName name of the image layer + * @param textLayerName name of the text layer + * @param pdfDocument document to add layers to + * + * @return array of two layers: first layer is for image, second layer is for text. + * Elements may be null meaning that layer creation is not requested + */ + private static PdfLayer[] createPdfLayers( + String imageLayerName, + String textLayerName, + PdfDocument pdfDocument) { + if (imageLayerName == null && textLayerName == null) { + return new PdfLayer[] {null, null}; + } else if (imageLayerName == null) { + return new PdfLayer[]{null, new PdfLayer(textLayerName, pdfDocument)}; + } else if (textLayerName == null) { + return new PdfLayer[]{new PdfLayer(imageLayerName, pdfDocument), null}; + } else if (imageLayerName.equals(textLayerName)) { + PdfLayer pdfLayer = new PdfLayer(imageLayerName, pdfDocument); + return new PdfLayer[] {pdfLayer, pdfLayer}; + } else { + return new PdfLayer[] {new PdfLayer(imageLayerName, pdfDocument), new PdfLayer(textLayerName, pdfDocument)}; + } + } + + /** + * A handler for PDF canvas that validates existing glyphs. 
+ */ + private static class NotDefCheckingPdfCanvas extends PdfCanvas { + private static final long serialVersionUID = 708713860707664107L; + private final boolean createPdfA3u; + public NotDefCheckingPdfCanvas(PdfPage page, boolean createPdfA3u) { + super(page); + this.createPdfA3u = createPdfA3u; + } + + @Override + public PdfCanvas showText(GlyphLine text) { + ActualTextCheckingGlyphLine glyphLine = + new ActualTextCheckingGlyphLine(text); + PdfFont currentFont = getGraphicsState().getFont(); + boolean notDefGlyphsExists = false; + // default value for error message, it'll be updated with the + // unicode of the not found glyph + String message = PdfOcrLogMessageConstant + .COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER; + for (int i = glyphLine.start; i < glyphLine.end; i++) { + if (isNotDefGlyph(currentFont, glyphLine.get(i))) { + notDefGlyphsExists = true; + message = MessageFormatUtil.format(PdfOcrLogMessageConstant + .COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, + glyphLine.get(i).getUnicode()); + if (this.createPdfA3u) { + // exception is thrown only if PDF/A document is + // being created + throw new OcrException(message); + } + // setting actual text to NotDef glyph + glyphLine.setActualTextToGlyph(i, + glyphLine.toUnicodeString(i, i + 1)); + // setting a fake unicode deliberately to pass further + // checks for actual text necessity during iterating over + // glyphline chunks with ActualTextIterator + Glyph glyph = new Glyph(glyphLine.get(i)); + glyph.setUnicode(-1); + glyphLine.set(i, glyph); + } + } + // Warning is logged if not PDF/A document is being created + if (notDefGlyphsExists) { + LOGGER.warn(message); + } + return this.showText(glyphLine, new ActualTextIterator(glyphLine)); + } + + private static boolean isNotDefGlyph(PdfFont font, Glyph glyph) { + if (font instanceof PdfType0Font + || font instanceof PdfTrueTypeFont) { + return glyph.getCode() == 0; + } else if (font instanceof PdfType1Font + || font instanceof 
PdfType3Font) { + return glyph.getCode() == -1; + } + return false; + } + } + + /** + * A handler for GlyphLine that checks existing actual text not to + * overwrite it. + */ + private static class ActualTextCheckingGlyphLine extends GlyphLine { + private static final long serialVersionUID = -946356392098459518L; + + public ActualTextCheckingGlyphLine(GlyphLine other) { + super(other); + } + + public void setActualTextToGlyph(int i, String text) { + // set actual text if it doesn't exist for i-th glyph + if ((this.actualText == null || this.actualText.size() <= i + || this.actualText.get(i) == null)) { + super.setActualText(i, i + 1, text); + } + } + } +} diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorMetaInfo.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorMetaInfo.java new file mode 100644 index 0000000..47081ea --- /dev/null +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorMetaInfo.java @@ -0,0 +1,83 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */
+package com.itextpdf.pdfocr;
+
+import com.itextpdf.kernel.counter.event.IMetaInfo;
+
+import java.util.UUID;
+
+/**
+ * The meta info that is used internally by pdfOcr to pass a wrapped custom meta data
+ */
+public class OcrPdfCreatorMetaInfo implements IMetaInfo, IMetaInfoWrapper {
+    private static final long serialVersionUID = 7047674343175216537L;
+
+    private final IMetaInfo wrappedMetaInfo;
+    private final UUID uuid;
+    private final PdfDocumentType pdfDocumentType;
+
+    /**
+     * Creates an inner meta info wrapper
+     *
+     * @param wrappedMetaInfo the meta info to be wrapped
+     * @param uuid a unique {@link UUID} which corresponds to the ocr event for which this meta info is passed
+     * @param pdfDocumentType a type of the document which is created during the corresponding ocr event
+     */
+    public OcrPdfCreatorMetaInfo(IMetaInfo wrappedMetaInfo, UUID uuid, PdfDocumentType pdfDocumentType) {
+        this.wrappedMetaInfo = wrappedMetaInfo;
+        this.uuid = uuid;
+        this.pdfDocumentType = pdfDocumentType;
+    }
+
+    /**
+     * Gets the unique {@link UUID} which corresponds to the ocr event for which this meta info is passed
+     * @return the unique {@link UUID} which corresponds to the ocr event for which this meta info is passed
+     */
+    public UUID getDocumentId() {
+        return uuid;
+    }
+
+    /**
+     * Gets the type of the document which is created during the corresponding ocr event
+     * @return the type of the document which is created during the corresponding ocr event
+     */
+    public PdfDocumentType getPdfDocumentType() {
+        return pdfDocumentType;
+    }
+
+    /**
+     * Gets the wrapped meta info
+     * @return the wrapped meta info
+     */
+    @Override
+    public IMetaInfo getWrappedMetaInfo() {
+        return wrappedMetaInfo;
+    }
+
+    /**
+     * The enum which represents types of documents, for which pdfOcr sends different events
+     */
+    public enum PdfDocumentType {
+        PDF, PDFA;
+    }
+}
\ No newline at end of file
diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java
b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java
new file mode 100644
index 0000000..187d768
--- /dev/null
+++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/OcrPdfCreatorProperties.java
@@ -0,0 +1,340 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see .
+ */
+package com.itextpdf.pdfocr;
+
+import com.itextpdf.layout.font.FontProvider;
+
+/**
+ * Configuration holder consumed by {@link OcrPdfCreator}.
+ */
+public class OcrPdfCreatorProperties {
+
+    /**
+     * Font provider; stays unset until one is passed to a setter.
+     * While unset, {@link #getFontProvider()} creates and stores a
+     * {@link PdfOcrFontProvider} on first access.
+     */
+    private FontProvider fontProvider;
+
+    /**
+     * Preferred font family; unset by default.
+     * See {@link #getDefaultFontFamily()} for the fallback that is applied.
+     */
+    private String defaultFontFamily;
+
+    /**
+     * Color used for the recognized text in the output PDF document.
+     * Unset by default, in which case the text is rendered invisible.
+     */
+    private com.itextpdf.kernel.colors.Color textColor;
+
+    /**
+     * Scale mode for input images.
+     * Defaults to {@link ScaleMode#SCALE_TO_FIT}. The value is only relevant
+     * when {@link #pageSize} is not null.
+     */
+    private ScaleMode scaleMode = ScaleMode.SCALE_TO_FIT;
+
+    /**
+     * Requested size of the PDF document pages.
+     * Unset (null) by default.
+     * When it is null the page takes the size of the input image;
+     * when it is not null the input image is scaled
+     * according to the selected {@link ScaleMode}.
+     */
+    private com.itextpdf.kernel.geom.Rectangle pageSize;
+
+    /**
+     * Name of the layer that receives the image.
+     * Unset (null) by default.
+     * With a null name the image is drawn directly on the canvas and no layer is created.
+     * When imageLayerName equals textLayerName the image and the text share a single layer.
+     */
+    private String imageLayerName;
+
+    /**
+     * Name of the layer that receives the text.
+     * Unset (null) by default.
+     * With a null name the text is drawn directly on the canvas and no layer is created.
+     * When textLayerName equals imageLayerName the text and the image share a single layer.
+     */
+    private String textLayerName;
+
+    /**
+     * PDF language; an empty string by default.
+     */
+    private String pdfLang = "";
+
+    /**
+     * Title of the created document.
+     * Unset (null) by default.
+     */
+    private String title;
+
+    /**
+     * Creates a new {@link OcrPdfCreatorProperties} instance holding the defaults.
+     */
+    public OcrPdfCreatorProperties() {
+    }
+
+    /**
+     * Copy constructor: creates a new {@link OcrPdfCreatorProperties}
+     * instance whose fields are assigned from another
+     * {@link OcrPdfCreatorProperties} instance (references are shared).
+     *
+     * @param other the {@link OcrPdfCreatorProperties} instance to copy from
+     */
+    public OcrPdfCreatorProperties(OcrPdfCreatorProperties other) {
+        this.fontProvider = other.fontProvider;
+        this.defaultFontFamily = other.defaultFontFamily;
+        this.textColor = other.textColor;
+        this.scaleMode = other.scaleMode;
+        this.pageSize = other.pageSize;
+        this.imageLayerName = other.imageLayerName;
+        this.textLayerName = other.textLayerName;
+        this.pdfLang = other.pdfLang;
+        this.title = other.title;
+    }
+
+    /**
+     * Gets the color of the text in the output PDF document.
+     *
+     * @return configured text {@link com.itextpdf.kernel.colors.Color}
+     */
+    public final com.itextpdf.kernel.colors.Color getTextColor() {
+        return this.textColor;
+    }
+
+    /**
+     * Sets the color of the text in the output PDF document.
+     * No color is configured by default.
+     *
+     * @param textColor required text {@link com.itextpdf.kernel.colors.Color}
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public final OcrPdfCreatorProperties setTextColor(
+            final com.itextpdf.kernel.colors.Color textColor) {
+        this.textColor = textColor;
+        return this;
+    }
+
+    /**
+     * Gets the scale mode for input images, one of the options of the
+     * {@link ScaleMode} enumeration.
+     *
+     * @return selected {@link ScaleMode}
+     */
+    public final ScaleMode getScaleMode() {
+        return this.scaleMode;
+    }
+
+    /**
+     * Sets the scale mode for input images, one of the options
+     * of the {@link ScaleMode} enumeration.
+     *
+     * @param scaleMode selected {@link ScaleMode}
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public final OcrPdfCreatorProperties setScaleMode(
+            final ScaleMode scaleMode) {
+        this.scaleMode = scaleMode;
+        return this;
+    }
+
+    /**
+     * Gets the requested size for output PDF document pages. Real size of the page will
+     * be calculated according to the selected {@link ScaleMode}
+     *
+     * @return requested page size as {@link com.itextpdf.kernel.geom.Rectangle}
+     */
+    public final com.itextpdf.kernel.geom.Rectangle getPageSize() {
+        return this.pageSize;
+    }
+
+    /**
+     * Sets the requested size for output PDF document pages.
+     *
+     * @param pageSize requested page
+     *                 size as {@link com.itextpdf.kernel.geom.Rectangle}
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public final OcrPdfCreatorProperties setPageSize(
+            final com.itextpdf.kernel.geom.Rectangle pageSize) {
+        this.pageSize = pageSize;
+        return this;
+    }
+
+    /**
+     * Gets the name of the image layer.
+     *
+     * @return image layer's name as {@link java.lang.String} if it was
+     * set, otherwise null
+     */
+    public final String getImageLayerName() {
+        return this.imageLayerName;
+    }
+
+    /**
+     * Sets the name of the image layer.
+     * Unset (null) by default.
+     * With a null name the image is drawn directly on the canvas and no layer is created.
+     * When the image layer name equals the text layer name both share a single layer.
+     *
+     * @param layerName name of the image layer
+     *                  as {@link java.lang.String}
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public final OcrPdfCreatorProperties setImageLayerName(
+            final String layerName) {
+        this.imageLayerName = layerName;
+        return this;
+    }
+
+    /**
+     * Gets the name of the text layer.
+     *
+     * @return text layer's name as {@link java.lang.String} if it was
+     * set, otherwise null
+     */
+    public final String getTextLayerName() {
+        return this.textLayerName;
+    }
+
+    /**
+     * Sets the name of the text layer.
+     * Unset (null) by default.
+     * With a null name the text is drawn directly on the canvas and no layer is created.
+     * When the text layer name equals the image layer name both share a single layer.
+     *
+     * @param layerName name of the text layer as {@link java.lang.String}
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public final OcrPdfCreatorProperties setTextLayerName(
+            final String layerName) {
+        this.textLayerName = layerName;
+        return this;
+    }
+
+    /**
+     * Gets the PDF language.
+     *
+     * @return PDF document language as {@link java.lang.String}
+     */
+    public final String getPdfLang() {
+        return this.pdfLang;
+    }
+
+    /**
+     * Specifies the PDF natural language, and optionally locale.
+     * Language identifier shall either be the empty text string, to indicate that the language is unknown,
+     * or a Language-Tag as defined in BCP 47 (2009), Tags for the Identification of Languages.
+     *
+     * @param language PDF document language as {@link java.lang.String},
+     *                 e.g. "en-US", etc.
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public final OcrPdfCreatorProperties setPdfLang(
+            final String language) {
+        this.pdfLang = language;
+        return this;
+    }
+
+    /**
+     * Gets the PDF document title.
+     *
+     * @return PDF title as {@link java.lang.String}
+     */
+    public final String getTitle() {
+        return this.title;
+    }
+
+    /**
+     * Sets the PDF document title.
+     *
+     * @param title PDF title as {@link java.lang.String}
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public final OcrPdfCreatorProperties setTitle(
+            final String title) {
+        this.title = title;
+        return this;
+    }
+
+    /**
+     * Returns the configured FontProvider; when none is configured a
+     * new {@link PdfOcrFontProvider} is created, stored in this instance
+     * and returned.
+     * @return {@link com.itextpdf.layout.font.FontProvider} object
+     */
+    public FontProvider getFontProvider() {
+        if (this.fontProvider == null) {
+            this.fontProvider = new PdfOcrFontProvider();
+        }
+        return this.fontProvider;
+    }
+
+    /**
+     * Sets font provider.
+     * Please note that passed FontProvider is not to be used in multithreaded
+     * environments or for any parallel processing.
+     * The stored default font family is not modified by this overload;
+     * see {@link #getDefaultFontFamily()} for the value that is then used.
+     * @param fontProvider selected
+     * {@link com.itextpdf.layout.font.FontProvider} instance
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public OcrPdfCreatorProperties setFontProvider(FontProvider fontProvider) {
+        this.fontProvider = fontProvider;
+        return this;
+    }
+
+    /**
+     * Sets font provider together with the default font family.
+     * Please note that passed FontProvider is not to be used in multithreaded
+     * environments or for any parallel processing.
+     * @param fontProvider selected
+     * {@link com.itextpdf.layout.font.FontProvider} instance
+     * @param defaultFontFamily preferred font family to be used when selecting
+     *                          font from
+     *                          {@link com.itextpdf.layout.font.FontProvider}.
+     * @return this {@link OcrPdfCreatorProperties} instance
+     */
+    public OcrPdfCreatorProperties setFontProvider(FontProvider fontProvider,
+            String defaultFontFamily) {
+        this.fontProvider = fontProvider;
+        this.defaultFontFamily = defaultFontFamily;
+        return this;
+    }
+
+    /**
+     * Gets preferred font family to be used when selecting font from
+     * {@link com.itextpdf.layout.font.FontProvider}.
+     *
+     * @return the stored default font family; when it is null or empty, the
+     * default font family reported by {@link #getFontProvider()} instead
+     */
+    public String getDefaultFontFamily() {
+        return defaultFontFamily != null && defaultFontFamily.length() != 0
+                ? defaultFontFamily : getFontProvider().getDefaultFontFamily();
+    }
+}
diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java
new file mode 100644
index 0000000..5398b2e
--- /dev/null
+++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java
@@ -0,0 +1,291 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+ + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr; + +import com.itextpdf.io.image.ImageData; +import com.itextpdf.io.image.ImageDataFactory; +import com.itextpdf.io.image.TiffImageData; +import com.itextpdf.io.source.RandomAccessFileOrArray; +import com.itextpdf.io.source.RandomAccessSourceFactory; +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.geom.Rectangle; +import com.itextpdf.layout.Document; +import com.itextpdf.layout.element.Paragraph; +import com.itextpdf.layout.layout.LayoutArea; +import com.itextpdf.layout.layout.LayoutContext; +import com.itextpdf.layout.layout.LayoutResult; +import com.itextpdf.layout.renderer.IRenderer; +import com.itextpdf.layout.renderer.ParagraphRenderer; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.List; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class PdfCreatorUtil { + + /** + * The Constant to convert pixels to points. + */ + static final float PX_TO_PT = 3f / 4f; + + /** + * The Constant for points per inch. + */ + private static final float POINTS_PER_INCH = 72.0f; + + /** + * The logger. 
+ */ + private static final Logger LOGGER = LoggerFactory + .getLogger(PdfCreatorUtil.class); + + /** + * Calculates font size according to given bbox height, width and selected + * font. + * + * @param document PDF document as a {@link com.itextpdf.layout.Document} + * object + * @param line text line + * @param fontFamily default font family + * @param bboxHeightPt height of bbox calculated by OCR Reader + * @param bboxWidthPt width of bbox calculated by OCR Reader + * @return font size + * @throws OcrException if set font provider is invalid and/or fonts that + * it contains are invalid + */ + static float calculateFontSize(final Document document, final String line, + final String fontFamily, final float bboxHeightPt, + final float bboxWidthPt) throws OcrException { + Rectangle bbox = new Rectangle(bboxWidthPt * 1.5f, + bboxHeightPt * 1.5f); + // setting minimum and maximum (approx.) values for font size + float fontSize = 1; + float maxFontSize = bbox.getHeight(); + + try { + Paragraph paragraph = new Paragraph(line); + paragraph.setWidth(bbox.getWidth()); + paragraph.setFontFamily(fontFamily); + + while (Math.abs(fontSize - maxFontSize) > 1e-1) { + float curFontSize = (fontSize + maxFontSize) / 2; + paragraph.setFontSize(curFontSize); + ParagraphRenderer renderer = (ParagraphRenderer) paragraph.createRendererSubTree() + .setParent(document.getRenderer()); + LayoutContext context = new LayoutContext( + new LayoutArea(1, bbox)); + if (renderer.layout(context).getStatus() == LayoutResult.FULL && renderer.getLines().size() == 1) { + fontSize = curFontSize; + } else { + maxFontSize = curFontSize; + } + } + } catch (IllegalStateException e) { + LOGGER.error(PdfOcrLogMessageConstant + .PROVIDED_FONT_PROVIDER_IS_INVALID); + throw new OcrException( + OcrException.CANNOT_RESOLVE_PROVIDED_FONTS, e); + } + return fontSize; + } + + /** + * Calculated real width of a paragraph with given text line, font provider + * and font size. 
+ * + * @param document PDF document as a {@link com.itextpdf.layout.Document} + * object + * @param line text line + * @param fontFamily default font family + * @param fontSize calculated font size + * @return real width of text line in paragraph + */ + static float getRealLineWidth(Document document, final String line, + final String fontFamily, float fontSize) { + Paragraph paragraph = new Paragraph(line); + paragraph.setFontFamily(fontFamily); + paragraph.setFontSize(fontSize); + IRenderer renderer = paragraph.createRendererSubTree() + .setParent(document.getRenderer()); + return ((ParagraphRenderer) renderer).getMinMaxWidth().getMaxWidth(); + } + + /** + * Calculates image coordinates on the page. + * + * @param size size of the page + * @param imageSize size of the image + * @return list of two elements (coordinates): first - x, second - y. + */ + static com.itextpdf.kernel.geom.Point calculateImageCoordinates( + final com.itextpdf.kernel.geom.Rectangle size, + final com.itextpdf.kernel.geom.Rectangle imageSize) { + float x = 0; + float y = 0; + if (size != null) { + if (imageSize.getHeight() < size.getHeight()) { + y = (size.getHeight() - imageSize.getHeight()) / 2; + } + if (imageSize.getWidth() < size.getWidth()) { + x = (size.getWidth() - imageSize.getWidth()) / 2; + } + } + return new com.itextpdf.kernel.geom.Point(x, y); + } + + /** + * Retrieves {@link com.itextpdf.io.image.ImageData} from the + * input {@link java.io.File}. 
+ * + * @param inputImage input image as {@link java.io.File} + * @return list of {@link com.itextpdf.io.image.ImageData} objects + * (more than one element in the list if it is a multipage tiff) + * @throws OcrException if error occurred during reading a file + * @throws IOException if error occurred during reading a file + */ + static List getImageData(final File inputImage) + throws OcrException, IOException { + List images = new ArrayList(); + + String ext = ""; + int index = inputImage.getAbsolutePath().lastIndexOf('.'); + if (index > 0) { + ext = new String(inputImage.getAbsolutePath().toCharArray(), + index + 1, + inputImage.getAbsolutePath().length() - index - 1); + + if ("tiff".equals(ext.toLowerCase()) + || "tif".equals(ext.toLowerCase())) { + int tiffPages = getNumberOfPageTiff(inputImage); + + for (int page = 0; page < tiffPages; page++) { + byte[] bytes = Files.readAllBytes(inputImage.toPath()); + ImageData imageData = ImageDataFactory + .createTiff(bytes, true, + page + 1, true); + images.add(imageData); + } + } else { + try { + ImageData imageData = ImageDataFactory + .create(inputImage.getAbsolutePath()); + images.add(imageData); + } catch (com.itextpdf.io.IOException e) { + LOGGER.error(MessageFormatUtil.format( + PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, + e.getMessage())); + throw new OcrException( + OcrException.CANNOT_READ_INPUT_IMAGE, e); + } + } + } + return images; + } + + /** + * Calculates the size of the PDF document page according to the provided + * {@link ScaleMode}. 
+ * + * @param imageData input image or its one page as + * {@link com.itextpdf.io.image.ImageData} + * @param scaleMode required {@link ScaleMode} that could be + * set using {@link OcrPdfCreatorProperties#setScaleMode} + * method + * @param requiredSize size of the page that could be using + * {@link OcrPdfCreatorProperties#setPageSize} method + * @return {@link com.itextpdf.kernel.geom.Rectangle} + */ + static com.itextpdf.kernel.geom.Rectangle calculateImageSize( + final ImageData imageData, + final ScaleMode scaleMode, + final com.itextpdf.kernel.geom.Rectangle requiredSize) { + if (imageData != null) { + float imgWidthPt = getPoints(imageData.getWidth()); + float imgHeightPt = getPoints(imageData.getHeight()); + // page size will be equal to the image size if page size or + // scale mode are not set + if (requiredSize == null || scaleMode == null) { + return new com.itextpdf.kernel.geom.Rectangle(imgWidthPt, + imgHeightPt); + } else { + com.itextpdf.kernel.geom.Rectangle size = + new com.itextpdf.kernel.geom.Rectangle( + requiredSize.getWidth(), + requiredSize.getHeight()); + // scale image according to the page size and scale mode + if (scaleMode == ScaleMode.SCALE_HEIGHT) { + float newHeight = imgHeightPt + * requiredSize.getWidth() / imgWidthPt; + size.setHeight(newHeight); + } else if (scaleMode == ScaleMode.SCALE_WIDTH) { + float newWidth = imgWidthPt + * requiredSize.getHeight() / imgHeightPt; + size.setWidth(newWidth); + } else if (scaleMode == ScaleMode.SCALE_TO_FIT) { + float ratio = Math.min( + requiredSize.getWidth() / imgWidthPt, + requiredSize.getHeight() / imgHeightPt); + size.setWidth(imgWidthPt * ratio); + size.setHeight(imgHeightPt * ratio); + } + return size; + } + } else { + return requiredSize; + } + } + + /** + * Converts value from pixels to points. 
+ * + * @param pixels input value in pixels + * @return result value in points + */ + static float getPoints(final float pixels) { + return pixels * PX_TO_PT; + } + + /** + * Counts number of pages in the provided tiff image. + * + * @param inputImage input image {@link java.io.File} + * @return number of pages in the provided TIFF image + * @throws IOException if error occurred during creating a + * {@link com.itextpdf.io.source.IRandomAccessSource} based on a filename + * string + */ + private static int getNumberOfPageTiff(final File inputImage) + throws IOException { + RandomAccessFileOrArray raf = new RandomAccessFileOrArray( + new RandomAccessSourceFactory() + .createBestSource( + inputImage.getAbsolutePath())); + int numOfPages = TiffImageData.getNumberOfPages(raf); + raf.close(); + return numOfPages; + } +} diff --git a/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfOcrFontProvider.java b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfOcrFontProvider.java new file mode 100644 index 0000000..9742120 --- /dev/null +++ b/pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfOcrFontProvider.java @@ -0,0 +1,92 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr; + +import com.itextpdf.io.font.PdfEncodings; +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.io.util.ResourceUtil; +import com.itextpdf.io.util.StreamUtil; +import com.itextpdf.layout.font.FontProvider; +import com.itextpdf.layout.font.FontSet; + +import java.io.IOException; +import java.io.InputStream; +import org.slf4j.LoggerFactory; + +public class PdfOcrFontProvider extends FontProvider { + + /** + * Path to the default font. + */ + private static final String DEFAULT_FONT_PATH = "com/itextpdf/pdfocr/fonts/LiberationSans-Regular.ttf"; + + /** + * Default font family. + */ + private static final String DEFAULT_FONT_FAMILY = "LiberationSans"; + + /** + * Creates a new {@link PdfOcrFontProvider} instance with the default font + * and the default font family. + */ + public PdfOcrFontProvider() { + super(DEFAULT_FONT_FAMILY); + this.addFont(getDefaultFont(), PdfEncodings.IDENTITY_H); + } + + /** + * Creates a new {@link PdfOcrFontProvider} instance. + */ + public PdfOcrFontProvider(FontSet fontSet, + String defaultFontFamily) { + super(fontSet, defaultFontFamily); + } + + /** + * Gets default font family. + * + * @return default font family as a string + */ + @Override + public String getDefaultFontFamily() { + return DEFAULT_FONT_FAMILY; + } + + /** + * Gets default font as a byte array. 
/**
 * Log message templates used by the pdfOCR module.
 *
 * <p>
 * Placeholders of the form <code>{n}</code> are zero-based positional
 * arguments to be filled in by the message formatter.
 */
public class PdfOcrLogMessageConstant {
    public static final String CANNOT_READ_INPUT_IMAGE =
            "Cannot read input image {0}";
    public static final String PROVIDED_FONT_PROVIDER_IS_INVALID =
            "Provided FontProvider is invalid. Please check that it contains "
                    + "valid fonts and default font family name.";
    // word order fixed (was "Cannot default read font")
    public static final String CANNOT_READ_DEFAULT_FONT =
            "Cannot read default font: {0}";
    // the only placeholder must be {0}; with {1} the first (and only)
    // format argument was never substituted into the message
    public static final String CANNOT_ADD_DATA_TO_PDF_DOCUMENT =
            "Cannot add data to PDF document: {0}";
    public static final String START_OCR_FOR_IMAGES =
            "Starting ocr for {0} image(s)";
    public static final String NUMBER_OF_PAGES_IN_IMAGE =
            "Image {0} contains {1} page(s)";
    public static final String COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER =
            "Could not find a glyph corresponding to Unicode character {0} "
                    + "in any of the fonts";
    public static final String PDF_LANGUAGE_PROPERTY_IS_NOT_SET =
            "PDF language property is not set";

    /**
     * Not instantiable: the class only holds constants.
     */
    private PdfOcrLogMessageConstant() {
    }
}
/**
 * Enumeration of the possible scale modes for input images.
 *
 * <p>
 * A scale mode describes how an image is resized relative to the page size
 * set using {@link OcrPdfCreatorProperties#setPageSize(Rectangle)}. Every
 * mode keeps the original aspect ratio of the image.
 */
public enum ScaleMode {
    /**
     * The height is fixed and the width is derived from it.
     * Height will be equal to the page height that was set using
     * {@link OcrPdfCreatorProperties#setPageSize(Rectangle)} method and
     * width will be proportionally scaled to keep the original aspect ratio.
     */
    SCALE_WIDTH,
    /**
     * The width is fixed and the height is derived from it.
     * Width will be equal to the page width that was set using
     * {@link OcrPdfCreatorProperties#setPageSize(Rectangle)} method and
     * height will be proportionally scaled to keep the original aspect ratio.
     */
    SCALE_HEIGHT,
    /**
     * The image will be scaled to fit within the page width and height
     * dimensions that are set using
     * {@link OcrPdfCreatorProperties#setPageSize(Rectangle)} method.
     * Original aspect ratio of the image stays unchanged.
     */
    SCALE_TO_FIT
}
/**
 * This class describes how recognized text is positioned on the image
 * providing bbox for each text item (could be a line or a word).
 *
 * <p>
 * The bbox is stored as a private unmodifiable copy: lists passed in are
 * copied, and {@link #getBbox()} hands out a fresh copy, so neither side can
 * change the other's list afterwards.
 */
public class TextInfo {

    /**
     * Contains any text.
     */
    private String text;

    /**
     * Contains 4 float coordinates: bbox parameters.
     */
    private List<Float> bbox;

    /**
     * Creates a new {@link TextInfo} instance with {@code null} text and an
     * empty bbox.
     */
    public TextInfo() {
        text = null;
        bbox = Collections.emptyList();
    }

    /**
     * Creates a new {@link TextInfo} instance.
     *
     * @param text any text
     * @param bbox {@link java.util.List} of bbox parameters; the list is
     *             copied, {@code null} is treated as an empty list
     */
    public TextInfo(final String text, final List<Float> bbox) {
        this.text = text;
        this.bbox = copyOf(bbox);
    }

    /**
     * Gets text element.
     *
     * @return String
     */
    public String getText() {
        return text;
    }

    /**
     * Sets text element.
     *
     * @param newText retrieved text
     */
    public void setText(final String newText) {
        text = newText;
    }

    /**
     * Gets bbox coordinates.
     *
     * @return a new, modifiable {@link java.util.List} holding a copy of the
     * bbox parameters
     */
    public List<Float> getBbox() {
        return new ArrayList<Float>(bbox);
    }

    /**
     * Sets bbox coordinates.
     *
     * @param bbox {@link java.util.List} of bbox parameters; the list is
     *             copied, {@code null} is treated as an empty list
     */
    public void setBbox(final List<Float> bbox) {
        this.bbox = copyOf(bbox);
    }

    /**
     * Makes an unmodifiable snapshot of the given list so that later changes
     * to the caller's list are not visible through this object.
     *
     * @param source list to copy, may be {@code null}
     * @return unmodifiable copy of {@code source}, empty for {@code null}
     */
    private static List<Float> copyOf(final List<Float> source) {
        if (source == null) {
            return Collections.emptyList();
        }
        return Collections.unmodifiableList(new ArrayList<Float>(source));
    }
}
/**
 * The interface which holds a thread local meta info,
 * meaning different threads operate with independent and different meta infos.
 *
 * <p>
 * The setter returns the implementing object itself, so calls can be chained.
 */
public interface IThreadLocalMetaInfoAware {

    /**
     * Gets the meta info which is held by the interface.
     *
     * @return the held thread local meta info
     */
    IMetaInfo getThreadLocalMetaInfo();

    /**
     * Sets a thread local meta info.
     *
     * @param metaInfo a thread local meta info to be held
     * @return this {@link IThreadLocalMetaInfoAware}
     */
    IThreadLocalMetaInfoAware setThreadLocalMetaInfo(IMetaInfo metaInfo);
}
+ +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. 
These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. 
/**
 * An {@link IMetaInfo} implementation that is used for test purposes only.
 * Please be aware that it is placed in the com.itextpdf.metainfo package
 * deliberately, so that it belongs neither to the com.itextpdf.pdfocr nor to
 * the com.itextpdf.pdfocr.tesseract4 package.
 */
public class TestMetaInfo implements IMetaInfo {
    private static final long serialVersionUID = 5521060335175170386L;
}
+ */ +package com.itextpdf.pdfocr; + +import com.itextpdf.kernel.colors.DeviceRgb; +import com.itextpdf.kernel.font.PdfFont; +import com.itextpdf.pdfocr.helpers.CustomOcrEngine; +import com.itextpdf.pdfocr.helpers.ExtractionStrategy; +import com.itextpdf.pdfocr.helpers.PdfHelper; +import com.itextpdf.test.ExtendedITextTest; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class ApiTest extends ExtendedITextTest { + + @Test + public void testTextInfo() { + String path = PdfHelper.getDefaultImagePath(); + Map> result = new CustomOcrEngine().doImageOcr(new File(path)); + Assert.assertEquals(1, result.size()); + + TextInfo textInfo = new TextInfo(); + textInfo.setText("text"); + textInfo.setBbox(Arrays.asList(204.0f, 158.0f, 742.0f, 294.0f)); + int page = 2; + result.put(page, Collections.singletonList(textInfo)); + + Assert.assertEquals(2, result.size()); + Assert.assertEquals(textInfo.getText(), result.get(page).get(0).getText()); + Assert.assertEquals(textInfo.getBbox().size(), result.get(page).get(0).getBbox().size()); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, count = 7) + }) + @Test + public void testThaiImageWithNotDefGlyphs() throws IOException { + String testName = "testThaiImageWithNotdefGlyphs"; + String path = PdfHelper.getThaiImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + + PdfHelper.createPdf(pdfPath, new File(path), + new OcrPdfCreatorProperties().setTextColor(DeviceRgb.BLACK)); + + ExtractionStrategy 
strategy = PdfHelper.getExtractionStrategy(pdfPath); + + PdfFont font = strategy.getPdfFont(); + String fontName = font.getFontProgram().getFontNames().getFontName(); + Assert.assertTrue(fontName.contains("LiberationSans")); + } +} diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfA3uTest.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfA3uTest.java new file mode 100644 index 0000000..c52abdc --- /dev/null +++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfA3uTest.java @@ -0,0 +1,226 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr; + +import com.itextpdf.io.font.PdfEncodings; +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.colors.DeviceCmyk; +import com.itextpdf.kernel.colors.DeviceRgb; +import com.itextpdf.kernel.font.PdfFont; +import com.itextpdf.kernel.font.PdfFontFactory; +import com.itextpdf.kernel.pdf.PdfAConformanceLevel; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfReader; +import com.itextpdf.layout.font.FontProvider; +import com.itextpdf.layout.font.FontSelector; +import com.itextpdf.pdfa.PdfAConformanceException; +import com.itextpdf.pdfocr.helpers.ExtractionStrategy; +import com.itextpdf.pdfocr.helpers.PdfHelper; +import com.itextpdf.test.ExtendedITextTest; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import java.io.IOException; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExpectedException; +import org.verapdf.gf.model.factory.fonts.FontFactory; + +@Category(IntegrationTest.class) +public class PdfA3uTest extends ExtendedITextTest { + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + @Test + public void testPdfA3uWithNullIntent() throws IOException { + String testName = "testPdfA3uWithNullIntent"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setTextColor(DeviceCmyk.BLACK); + properties.setScaleMode(ScaleMode.SCALE_TO_FIT); + + PdfHelper.createPdfA(pdfPath, new File(path), properties, null); + String result = PdfHelper.getTextFromPdfLayer(pdfPath, null); + Assert.assertEquals(PdfHelper.DEFAULT_TEXT, result); + 
Assert.assertEquals(ScaleMode.SCALE_TO_FIT, properties.getScaleMode()); + } + + @Test + public void testIncompatibleOutputIntentAndFontColorSpaceException() + throws IOException { + junitExpectedException.expect(com.itextpdf.kernel.PdfException.class); + junitExpectedException.expectMessage(PdfAConformanceException.DEVICECMYK_MAY_BE_USED_ONLY_IF_THE_FILE_HAS_A_CMYK_PDFA_OUTPUT_INTENT_OR_DEFAULTCMYK_IN_USAGE_CONTEXT); + + String testName = "testIncompatibleOutputIntentAndFontColorSpaceException"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setTextColor(DeviceCmyk.BLACK); + + PdfHelper.createPdfA(pdfPath, new File(path), + ocrPdfCreatorProperties, + PdfHelper.getRGBPdfOutputIntent()); + } + + @Test + public void testPdfA3DefaultMetadata() throws IOException { + String testName = "testPdfDefaultMetadata"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setTextColor(DeviceRgb.BLACK); + + PdfHelper.createPdfA(pdfPath, file, + ocrPdfCreatorProperties, + PdfHelper.getRGBPdfOutputIntent()); + + PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath)); + + Assert.assertEquals("en-US", + pdfDocument.getCatalog().getLang().toString()); + Assert.assertEquals(null, + pdfDocument.getDocumentInfo().getTitle()); + Assert.assertEquals(PdfAConformanceLevel.PDF_A_3U, + pdfDocument.getReader().getPdfAConformanceLevel()); + + pdfDocument.close(); + } + + @Test + public void testPdfCustomMetadata() throws IOException { + String testName = "testPdfCustomMetadata"; + String path = 
PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + String locale = "nl-BE"; + properties.setPdfLang(locale); + String title = "Title"; + properties.setTitle(title); + + PdfHelper.createPdfA(pdfPath, file, + new OcrPdfCreatorProperties(properties), + PdfHelper.getCMYKPdfOutputIntent()); + + PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath)); + Assert.assertEquals(locale, + pdfDocument.getCatalog().getLang().toString()); + Assert.assertEquals(title, + pdfDocument.getDocumentInfo().getTitle()); + Assert.assertEquals(PdfAConformanceLevel.PDF_A_3U, + pdfDocument.getReader().getPdfAConformanceLevel()); + + pdfDocument.close(); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = OcrException.CANNOT_CREATE_PDF_DOCUMENT, count = 1) + }) + @Test + public void testNonCompliantThaiPdfA() throws IOException { + junitExpectedException.expect(OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil.format( + OcrException.CANNOT_CREATE_PDF_DOCUMENT, + MessageFormatUtil.format(PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, 3611))); + + String testName = "testNonCompliantThaiPdfA"; + String path = PdfHelper.getThaiImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setTextColor(DeviceRgb.BLACK); + + PdfHelper.createPdfA(pdfPath, new File(path), + ocrPdfCreatorProperties, + PdfHelper.getRGBPdfOutputIntent()); + } + + @Test + public void testCompliantThaiPdfA() throws IOException { + String testName = "testCompliantThaiPdfA"; + String path = PdfHelper.getThaiImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + + 
OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setTextColor(DeviceRgb.BLACK); + + FontProvider fontProvider = new FontProvider("Kanit"); + fontProvider.addFont(PdfHelper.getKanitFontPath()); + PdfOcrFontProvider pdfOcrFontProvider = new PdfOcrFontProvider( + fontProvider.getFontSet(), "Kanit"); + ocrPdfCreatorProperties.setFontProvider(pdfOcrFontProvider); + + PdfHelper.createPdfA(pdfPath, new File(path), ocrPdfCreatorProperties, + PdfHelper.getRGBPdfOutputIntent()); + + String resultWithActualText = PdfHelper + .getTextFromPdfLayerUseActualText(pdfPath, null); + Assert.assertEquals(PdfHelper.THAI_TEXT, resultWithActualText); + + String resultWithoutUseActualText = PdfHelper.getTextFromPdfLayer(pdfPath, + null); + Assert.assertEquals(PdfHelper.THAI_TEXT, resultWithoutUseActualText); + Assert.assertEquals(resultWithoutUseActualText, resultWithActualText); + + ExtractionStrategy strategy = PdfHelper.getExtractionStrategy(pdfPath); + PdfFont font = strategy.getPdfFont(); + String fontName = font.getFontProgram().getFontNames().getFontName(); + Assert.assertTrue(fontName.contains("Kanit")); + Assert.assertTrue(font.isEmbedded()); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = OcrException.CANNOT_CREATE_PDF_DOCUMENT, count = 1) + }) + @Test + public void testPdfACreateWithoutPdfLangProperty() + throws IOException { + junitExpectedException.expect(OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil.format( + OcrException.CANNOT_CREATE_PDF_DOCUMENT, + PdfOcrLogMessageConstant.PDF_LANGUAGE_PROPERTY_IS_NOT_SET)); + + String testName = "testPdfACreateWithoutPdfLangProperty"; + String path = PdfHelper.getThaiImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + + PdfHelper.createPdfA(pdfPath, new File(path), + new OcrPdfCreatorProperties(), + PdfHelper.getRGBPdfOutputIntent()); + } +} diff --git 
a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfFontTest.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfFontTest.java new file mode 100644 index 0000000..8b1abde --- /dev/null +++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfFontTest.java @@ -0,0 +1,230 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ +package com.itextpdf.pdfocr; + +import com.itextpdf.io.font.PdfEncodings; +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.colors.DeviceCmyk; +import com.itextpdf.kernel.colors.DeviceRgb; +import com.itextpdf.kernel.font.PdfFont; +import com.itextpdf.layout.font.FontProvider; +import com.itextpdf.pdfocr.helpers.ExtractionStrategy; +import com.itextpdf.pdfocr.helpers.PdfHelper; +import com.itextpdf.test.ExtendedITextTest; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import java.io.IOException; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExpectedException; + +@Category(IntegrationTest.class) +public class PdfFontTest extends ExtendedITextTest { + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + @Test + public void testFontColor() throws IOException { + String testName = "testFontColor"; + String path = PdfHelper.getImagesTestDirectory() + "multipage.tiff"; + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setScaleMode(ScaleMode.SCALE_TO_FIT); + properties.setTextLayerName("Text1"); + com.itextpdf.kernel.colors.Color color = DeviceCmyk.CYAN; + properties.setTextColor(color); + + PdfHelper.createPdf(pdfPath, file, properties); + + ExtractionStrategy strategy = PdfHelper.getExtractionStrategy(pdfPath, "Text1"); + com.itextpdf.kernel.colors.Color fillColor = strategy.getFillColor(); + Assert.assertEquals(color, fillColor); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.PROVIDED_FONT_PROVIDER_IS_INVALID, count = 1), + @LogMessage(messageTemplate = 
OcrException.CANNOT_CREATE_PDF_DOCUMENT, count = 1) + }) + @Test + public void testInvalidFontWithInvalidDefaultFontFamily() + throws IOException { + junitExpectedException.expect(OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil.format( + OcrException.CANNOT_CREATE_PDF_DOCUMENT, + OcrException.CANNOT_RESOLVE_PROVIDED_FONTS)); + + String testName = "testInvalidFontWithInvalidDefaultFontFamily"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + FontProvider pdfOcrFontProvider = new FontProvider("Font"); + pdfOcrFontProvider.getFontSet().addFont("font.ttf", PdfEncodings.IDENTITY_H, "Font"); + + properties.setFontProvider(pdfOcrFontProvider, "Font"); + properties.setScaleMode(ScaleMode.SCALE_TO_FIT); + + PdfHelper.createPdf(pdfPath, file, properties); + String result = PdfHelper.getTextFromPdfLayer(pdfPath, null); + Assert.assertEquals(PdfHelper.DEFAULT_TEXT, result); + Assert.assertEquals(ScaleMode.SCALE_TO_FIT, properties.getScaleMode()); + } + + @Test + public void testDefaultFontInPdfARgb() throws IOException { + String testName = "testDefaultFontInPdf"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setTextColor(DeviceRgb.BLACK); + + PdfHelper.createPdfA(pdfPath, file, + ocrPdfCreatorProperties, + PdfHelper.getRGBPdfOutputIntent()); + ExtractionStrategy strategy = PdfHelper.getExtractionStrategy(pdfPath); + + PdfFont font = strategy.getPdfFont(); + String fontName = font.getFontProgram().getFontNames().getFontName(); + Assert.assertTrue(fontName.contains("LiberationSans")); + 
Assert.assertTrue(font.isEmbedded()); + } + + @Test + public void testInvalidCustomFontInPdfACMYK() throws IOException { + String testName = "testInvalidCustomFontInPdf"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setFontProvider(new PdfOcrFontProvider()); + + PdfHelper.createPdfA(pdfPath, file, + ocrPdfCreatorProperties, + PdfHelper.getCMYKPdfOutputIntent()); + + ExtractionStrategy strategy = PdfHelper.getExtractionStrategy(pdfPath); + PdfFont font = strategy.getPdfFont(); + String fontName = font.getFontProgram().getFontNames().getFontName(); + Assert.assertTrue(fontName.contains("LiberationSans")); + Assert.assertTrue(font.isEmbedded()); + } + + @Test + public void testCustomFontInPdf() throws IOException { + String testName = "testDefaultFontInPdf"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + FontProvider fontProvider = new FontProvider("FreeSans"); + fontProvider.getFontSet().addFont(PdfHelper.getFreeSansFontPath(), PdfEncodings.IDENTITY_H, "FreeSans"); + + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setFontProvider(fontProvider, "FreeSans"); + + PdfHelper.createPdfA(pdfPath, file, + ocrPdfCreatorProperties, + PdfHelper.getCMYKPdfOutputIntent()); + + ExtractionStrategy strategy = PdfHelper.getExtractionStrategy(pdfPath); + PdfFont font = strategy.getPdfFont(); + String fontName = font.getFontProgram().getFontNames().getFontName(); + Assert.assertTrue(fontName.contains("FreeSans")); + Assert.assertTrue(font.isEmbedded()); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate 
= PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, count = 7) + }) + @Test + public void testThaiImageWithNotDefGlyphs() throws IOException { + String testName = "testThaiImageWithNotDefGlyphs"; + String path = PdfHelper.getThaiImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + + PdfHelper.createPdf(pdfPath, new File(path), + new OcrPdfCreatorProperties().setTextColor(DeviceRgb.BLACK)); + + String resultWithActualText = PdfHelper + .getTextFromPdfLayerUseActualText(pdfPath, null); + Assert.assertEquals(PdfHelper.THAI_TEXT.replace(" ", ""), + resultWithActualText.replace(" ", "")); + + String resultWithoutUseActualText = PdfHelper.getTextFromPdfLayer(pdfPath, + null); + Assert.assertNotEquals(PdfHelper.THAI_TEXT, resultWithoutUseActualText); + Assert.assertNotEquals(resultWithoutUseActualText, resultWithActualText); + } + + @Test + public void testReusingFontProvider() throws IOException { + String testName = "testReusingFontProvider"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPathA3u = PdfHelper.getTargetDirectory() + testName + "_a3u.pdf"; + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + FontProvider fontProvider = new FontProvider("FreeSans"); + fontProvider.addFont(PdfHelper.getFreeSansFontPath()); + PdfOcrFontProvider pdfOcrFontProvider = new PdfOcrFontProvider( + fontProvider.getFontSet(), "FreeSans"); + + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setFontProvider(pdfOcrFontProvider); + + PdfHelper.createPdfA(pdfPathA3u, file, ocrPdfCreatorProperties, + PdfHelper.getCMYKPdfOutputIntent()); + + PdfHelper.createPdf(pdfPath, file, ocrPdfCreatorProperties); + + ExtractionStrategy strategy = PdfHelper.getExtractionStrategy(pdfPathA3u); + PdfFont font = strategy.getPdfFont(); + String fontName = 
font.getFontProgram().getFontNames().getFontName(); + Assert.assertTrue(fontName.contains("FreeSans")); + Assert.assertTrue(font.isEmbedded()); + Assert.assertEquals(PdfHelper.DEFAULT_TEXT, strategy.getResultantText()); + + strategy = PdfHelper.getExtractionStrategy(pdfPath); + font = strategy.getPdfFont(); + fontName = font.getFontProgram().getFontNames().getFontName(); + Assert.assertTrue(fontName.contains("FreeSans")); + Assert.assertTrue(font.isEmbedded()); + Assert.assertEquals(PdfHelper.DEFAULT_TEXT, strategy.getResultantText()); + } +} diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfInputImageTest.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfInputImageTest.java new file mode 100644 index 0000000..2bfb84a --- /dev/null +++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfInputImageTest.java @@ -0,0 +1,70 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr; + +import com.itextpdf.pdfocr.helpers.PdfHelper; +import com.itextpdf.test.ExtendedITextTest; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExpectedException; + +@Category(IntegrationTest.class) +public class PdfInputImageTest extends ExtendedITextTest { + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, + count = 1) + }) + @Test + public void testCorruptedImage() { + junitExpectedException.expect(OcrException.class); + File file = new File(PdfHelper.getImagesTestDirectory() + + "corrupted.jpg"); + String realOutput = PdfHelper.getTextFromPdf(file, "testCorruptedImage"); + Assert.assertNotNull(realOutput); + Assert.assertEquals("", realOutput); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, count = 1) + }) + @Test + public void testCorruptedImageWithoutExtension() { + junitExpectedException.expect(OcrException.class); + + File file = new File(PdfHelper.getImagesTestDirectory() + + "corrupted"); + String realOutput = PdfHelper.getTextFromPdf(file, "testCorruptedImageWithoutExtension"); + Assert.assertNotNull(realOutput); + Assert.assertEquals("", realOutput); + } +} diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfLayersTest.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfLayersTest.java new file mode 100644 index 0000000..7c20503 --- /dev/null +++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfLayersTest.java @@ -0,0 +1,211 @@ +/* + This file is part of the iText (R) project. 
+ Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr; + +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfName; +import com.itextpdf.kernel.pdf.layer.PdfLayer; +import com.itextpdf.pdfocr.helpers.CustomOcrEngine; +import com.itextpdf.pdfocr.helpers.PdfHelper; +import com.itextpdf.test.ExtendedITextTest; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class PdfLayersTest extends ExtendedITextTest { + + @Test + public void testPdfLayersWithDefaultNames() { + String path = PdfHelper.getDefaultImagePath(); + File file = new File(path); + + OcrEngineProperties ocrEngineProperties = new OcrEngineProperties(); + ocrEngineProperties.setLanguages( + Collections.singletonList("eng")); + CustomOcrEngine engine = new CustomOcrEngine(ocrEngineProperties); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(engine); + PdfDocument doc = + 
ocrPdfCreator.createPdf(Collections.singletonList(file), + PdfHelper.getPdfWriter()); + + Assert.assertNotNull(doc); + List layers = doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(0, layers.size()); + doc.close(); + + Assert.assertEquals(engine, ocrPdfCreator.getOcrEngine()); + Assert.assertEquals(1, engine.getOcrEngineProperties().getLanguages().size()); + Assert.assertEquals("eng", engine.getOcrEngineProperties().getLanguages().get(0)); + } + + @Test + public void testPdfLayersWithCustomNames() { + String path = PdfHelper.getDefaultImagePath(); + File file = new File(path); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setImageLayerName("name image 1"); + properties.setTextLayerName("name text 1"); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(), properties); + PdfDocument doc = + ocrPdfCreator.createPdf(Collections.singletonList(file), + PdfHelper.getPdfWriter()); + + Assert.assertNotNull(doc); + List layers = doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(2, layers.size()); + Assert.assertEquals("name image 1", + layers.get(0).getPdfObject().get(PdfName.Name).toString()); + Assert.assertTrue(layers.get(0).isOn()); + Assert.assertEquals("name text 1", + layers.get(1).getPdfObject().get(PdfName.Name).toString()); + Assert.assertTrue(layers.get(1).isOn()); + + doc.close(); + } + + @Test + public void testTextFromPdfLayers() throws IOException { + String testName = "testTextFromPdfLayers"; + String path = PdfHelper.getDefaultImagePath(); + String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setImageLayerName("Image Layer"); + properties.setTextLayerName("Text Layer"); + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(), properties); + PdfDocument doc = + 
ocrPdfCreator.createPdf(Collections.singletonList(file), PdfHelper.getPdfWriter(pdfPath)); + + Assert.assertNotNull(doc); + List layers = doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(2, layers.size()); + Assert.assertEquals("Image Layer", + layers.get(0).getPdfObject().get(PdfName.Name).toString()); + Assert.assertTrue(layers.get(0).isOn()); + Assert.assertEquals("Text Layer", + layers.get(1).getPdfObject().get(PdfName.Name).toString()); + Assert.assertTrue(layers.get(1).isOn()); + + doc.close(); + + Assert.assertEquals(PdfHelper.DEFAULT_TEXT, + PdfHelper.getTextFromPdfLayer(pdfPath, "Text Layer")); + Assert.assertEquals("", + PdfHelper.getTextFromPdfLayer(pdfPath, "Image Layer")); + } + + @Test + public void testPdfLayersWithImageLayerOnly() { + String path = PdfHelper.getDefaultImagePath(); + File file = new File(path); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setImageLayerName("Image Layer"); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(), properties); + PdfDocument doc = + ocrPdfCreator.createPdf(Collections.singletonList(file), + PdfHelper.getPdfWriter()); + + Assert.assertNotNull(doc); + List layers = doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(1, layers.size()); + Assert.assertEquals("Image Layer", + layers.get(0).getPdfObject().get(PdfName.Name).toString()); + Assert.assertTrue(layers.get(0).isOn()); + + doc.close(); + } + + @Test + public void testPdfLayersWithTextLayerOnly() { + String path = PdfHelper.getDefaultImagePath(); + File file = new File(path); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setTextLayerName("Text Layer"); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(), properties); + PdfDocument doc = + ocrPdfCreator.createPdf(Collections.singletonList(file), + PdfHelper.getPdfWriter()); + + Assert.assertNotNull(doc); + List layers = 
doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(1, layers.size()); + Assert.assertEquals("Text Layer", + layers.get(0).getPdfObject().get(PdfName.Name).toString()); + Assert.assertTrue(layers.get(0).isOn()); + + doc.close(); + } + + @Test + public void testPdfLayersWithTextAndImageLayerWithTheSameName() { + String path = PdfHelper.getDefaultImagePath(); + File file = new File(path); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setTextLayerName("Mixed Layer"); + properties.setImageLayerName("Mixed Layer"); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(), properties); + PdfDocument doc = + ocrPdfCreator.createPdf(Collections.singletonList(file), + PdfHelper.getPdfWriter()); + + Assert.assertNotNull(doc); + List layers = doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(1, layers.size()); + Assert.assertEquals("Mixed Layer", + layers.get(0).getPdfObject().get(PdfName.Name).toString()); + Assert.assertTrue(layers.get(0).isOn()); + + doc.close(); + } + + +} diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/ScaleModeTest.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/ScaleModeTest.java new file mode 100644 index 0000000..d1d5396 --- /dev/null +++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/ScaleModeTest.java @@ -0,0 +1,150 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr;
+
+import com.itextpdf.io.image.ImageData;
+import com.itextpdf.io.image.ImageDataFactory;
+import com.itextpdf.kernel.geom.Rectangle;
+import com.itextpdf.kernel.pdf.PdfDocument;
+import com.itextpdf.pdfocr.helpers.CustomOcrEngine;
+import com.itextpdf.pdfocr.helpers.ExtractionStrategy;
+import com.itextpdf.pdfocr.helpers.PdfHelper;
+import com.itextpdf.test.ExtendedITextTest;
+import com.itextpdf.test.annotations.type.IntegrationTest;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+// Checks the image bounding box / page size produced for the different scale modes.
+@Category(IntegrationTest.class)
+public class ScaleModeTest extends ExtendedITextTest {
+
+    private static final float DELTA = 1e-4f; // tolerance used by every float assertion below
+
+    @Test
+    public void testScaleWidthMode() throws IOException {
+        String testName = "testScaleWidthMode";
+        String path = PdfHelper.getDefaultImagePath();
+        String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf";
+        File file = new File(path);
+
+        float pageWidthPt = 400f;
+        float pageHeightPt = 400f;
+
+        com.itextpdf.kernel.geom.Rectangle pageSize =
+                new com.itextpdf.kernel.geom.Rectangle(pageWidthPt, pageHeightPt);
+
+        OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();
+        properties.setScaleMode(ScaleMode.SCALE_WIDTH);
+        properties.setPageSize(pageSize);
+
+        PdfHelper.createPdf(pdfPath, file, properties);
+
+        com.itextpdf.kernel.geom.Rectangle rect = getImageBBoxRectangleFromPdf(pdfPath);
+        ImageData originalImageData = ImageDataFactory.create(file.getAbsolutePath());
+
+        // SCALE_WIDTH: the result image height must be equal to the value
+        // that was set as page height, while the result image width must be
+        // scaled proportionally, i.e. the width/height ratio of the result
+        // image must match the width/height ratio of the original image
+        // (the two assertions below check exactly these two properties)
+        Assert.assertEquals(pageHeightPt, rect.getHeight(), DELTA);
+        Assert.assertEquals(originalImageData.getWidth() / originalImageData.getHeight(),
+                rect.getWidth() / rect.getHeight(), DELTA);
+    }
+
+    @Test
+    public void testScaleHeightMode() throws IOException {
+        String testName = "testScaleHeightMode";
+        String path = PdfHelper.getDefaultImagePath();
+        String pdfPath = PdfHelper.getTargetDirectory() + testName + ".pdf";
+        File file = new File(path);
+
+        float pageWidthPt = 400f;
+        float pageHeightPt = 400f;
+
+        com.itextpdf.kernel.geom.Rectangle pageSize =
+                new com.itextpdf.kernel.geom.Rectangle(pageWidthPt, pageHeightPt);
+
+        OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties();
+        properties.setScaleMode(ScaleMode.SCALE_HEIGHT);
+        properties.setPageSize(pageSize);
+
+        PdfHelper.createPdf(pdfPath, file, properties);
+
+        com.itextpdf.kernel.geom.Rectangle rect = getImageBBoxRectangleFromPdf(pdfPath);
+        ImageData originalImageData = ImageDataFactory.create(file.getAbsolutePath());
+        // SCALE_HEIGHT: image width must equal the page width, aspect ratio must be preserved
+        Assert.assertEquals(pageWidthPt, rect.getWidth(), DELTA);
+        Assert.assertEquals(originalImageData.getWidth() / originalImageData.getHeight(),
+                rect.getWidth() / rect.getHeight(), DELTA);
+    }
+
+    @Test
+    public void testOriginalSizeScaleMode() throws IOException {
+        String path = PdfHelper.getDefaultImagePath();
+        File file = new File(path);
+
+        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine());
+        PdfDocument doc =
+                ocrPdfCreator.createPdf(Collections.singletonList(file),
+                        PdfHelper.getPdfWriter());
+
+        Assert.assertNotNull(doc);
+        // no properties were passed: the page must have the image size converted to points
+        ImageData imageData = ImageDataFactory.create(file.getAbsolutePath());
+
+        float imageWidth = getPoints(imageData.getWidth());
+        float imageHeight = getPoints(imageData.getHeight());
+        float realWidth = doc.getFirstPage().getPageSize().getWidth();
+        float realHeight = doc.getFirstPage().getPageSize().getHeight();
+
+        Assert.assertEquals(imageWidth, realWidth, DELTA);
+        Assert.assertEquals(imageHeight, realHeight, DELTA);
+
+        doc.close();
+    }
+
+    /**
+     * Converts value from pixels to points.
+     *
+     * @param pixels input value in pixels
+     * @return result value in points, computed as {@code pixels * 3 / 4}
+     */
+    protected float getPoints(final float pixels) {
+        return pixels * 3f / 4f;
+    }
+
+    /**
+     * Retrieve image BBox rectangle from the first page from given PDF document.
+     */
+    public static Rectangle getImageBBoxRectangleFromPdf(String path)
+            throws IOException {
+        ExtractionStrategy extractionStrategy =
+                PdfHelper.getExtractionStrategy(path);
+        return extractionStrategy.getImageBBoxRectangle();
+    }
+}
diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/events/EventCountingTest.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/events/EventCountingTest.java
new file mode 100644
index 0000000..a1aa89a
--- /dev/null
+++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/events/EventCountingTest.java
@@ -0,0 +1,117 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.events;
+
+import com.itextpdf.kernel.pdf.PdfOutputIntent;
+import com.itextpdf.kernel.pdf.PdfWriter;
+import com.itextpdf.metainfo.TestMetaInfo;
+import com.itextpdf.pdfocr.IOcrEngine;
+import com.itextpdf.pdfocr.OcrPdfCreator;
+import com.itextpdf.pdfocr.OcrPdfCreatorProperties;
+import com.itextpdf.pdfocr.helpers.CustomOcrEngine;
+import com.itextpdf.test.ExtendedITextTest;
+import com.itextpdf.test.annotations.type.IntegrationTest;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.InputStream;
+import java.util.Arrays;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.ExpectedException;
+// Checks that the meta info set on the OCR engine is still in place after OCR / PDF creation.
+@Category(IntegrationTest.class)
+public class EventCountingTest extends ExtendedITextTest {
+
+    protected static final String PROFILE_FOLDER = "./src/test/resources/com/itextpdf/pdfocr/profiles/";
+    protected static final String SOURCE_FOLDER = "./src/test/resources/com/itextpdf/pdfocr/events/";
+
+    @Rule
+    public ExpectedException junitExpectedException = ExpectedException.none();
+
+    private IOcrEngine tesseractReader; // always a CustomOcrEngine stub, see the constructor
+
+    public EventCountingTest() {
+        tesseractReader = new CustomOcrEngine();
+    }
+
+    @Test
+    public void testEventCountingPdfEvent() {
+        ((CustomOcrEngine) tesseractReader).setThreadLocalMetaInfo(new TestMetaInfo());
+
+        doImageToPdfOcr(tesseractReader, getTestImageFile());
+        // creating a PDF must leave the meta info set above untouched
+        Assert.assertTrue(((CustomOcrEngine) tesseractReader).getThreadLocalMetaInfo() instanceof TestMetaInfo);
+    }
+
+    @Test
+    public void testEventCountingPdfAEvent() {
+        ((CustomOcrEngine) tesseractReader).setThreadLocalMetaInfo(new TestMetaInfo());
+
+        doImageToPdfAOcr(tesseractReader, getTestImageFile());
+        // creating a PDF/A must leave the meta info set above untouched
+        Assert.assertTrue(((CustomOcrEngine) tesseractReader).getThreadLocalMetaInfo() instanceof TestMetaInfo);
+    }
+
+    @Test
+    public void testEventCountingImageEvent() {
+        ((CustomOcrEngine) tesseractReader).setThreadLocalMetaInfo(new TestMetaInfo());
+
+        doImageOcr(tesseractReader, getTestImageFile());
+        // plain image OCR must leave the meta info set above untouched
+        Assert.assertTrue(((CustomOcrEngine) tesseractReader).getThreadLocalMetaInfo() instanceof TestMetaInfo);
+    }
+
+    private static void doImageOcr(IOcrEngine tesseractReader, File imageFile) {
+        tesseractReader.doImageOcr(imageFile); // result is ignored, only the call itself matters here
+    }
+
+    private static void doImageToPdfOcr(IOcrEngine tesseractReader, File imageFile) {
+        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader);
+        ocrPdfCreator.createPdf(Arrays.asList(imageFile), new PdfWriter(new ByteArrayOutputStream()));
+    }
+
+    private static void doImageToPdfAOcr(IOcrEngine tesseractReader, File imageFile) {
+        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader,
+                new OcrPdfCreatorProperties().setPdfLang("en-US"));
+        // try-with-resources closes the ICC profile stream once the PDF/A has been created
+        try (InputStream is = new FileInputStream(PROFILE_FOLDER + "sRGB_CS_profile.icm")) {
+            PdfOutputIntent outputIntent = new PdfOutputIntent("Custom", "", "http://www.color.org",
+                    "sRGB IEC61966-2.1", is);
+            ocrPdfCreator.createPdfA(Arrays.asList(imageFile), new PdfWriter(new ByteArrayOutputStream()),
+                    outputIntent);
+        } catch (java.io.IOException e) {
+            // a missing or unreadable profile fails the test here instead of passing a null stream on
+            throw new IllegalStateException("Cannot read ICC profile from " + PROFILE_FOLDER, e);
+        }
+    }
+
+    private static File getTestImageFile() {
+        String imgPath = SOURCE_FOLDER + "numbers_01.jpg";
+        return new File(imgPath);
+    }
+}
diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/CustomOcrEngine.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/CustomOcrEngine.java
new file mode 100644
index 0000000..c165a1a
--- /dev/null
+++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/CustomOcrEngine.java
@@ -0,0 +1,82 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.helpers;
+
+import com.itextpdf.kernel.counter.event.IMetaInfo;
+import com.itextpdf.pdfocr.IOcrEngine;
+import com.itextpdf.pdfocr.OcrEngineProperties;
+import com.itextpdf.pdfocr.TextInfo;
+import com.itextpdf.pdfocr.events.IThreadLocalMetaInfoAware;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+// Stub OCR engine for tests: performs no recognition and always returns one fixed text block.
+public class CustomOcrEngine implements IOcrEngine, IThreadLocalMetaInfoAware {
+
+    private OcrEngineProperties ocrEngineProperties; // stays null when the no-arg constructor is used
+    private IMetaInfo threadLocalMetaInfo;
+
+    public CustomOcrEngine() {
+    }
+
+    public CustomOcrEngine(OcrEngineProperties ocrEngineProperties) {
+        this.ocrEngineProperties = new OcrEngineProperties(ocrEngineProperties); // new instance built from the argument
+    }
+
+    @Override
+    public Map<Integer, List<TextInfo>> doImageOcr(File input) {
+        Map<Integer, List<TextInfo>> result =
+                new HashMap<Integer, List<TextInfo>>();
+        String text = PdfHelper.DEFAULT_TEXT;
+        if (input.getAbsolutePath().contains(PdfHelper.THAI_IMAGE_NAME)) {
+            text = PdfHelper.THAI_TEXT; // only a path containing the Thai image name yields the Thai text
+        }
+        TextInfo textInfo = new TextInfo(text,
+                Arrays.asList(204.0f, 158.0f, 742.0f, 294.0f)); // same hard-coded box for every input
+        result.put(1, Collections.singletonList(textInfo)); // stored under key 1 (presumably the page number)
+        return result;
+    }
+
+    @Override
+    public void createTxtFile(List<File> inputImages, File txtFile) { // no-op in this stub
+    }
+
+    @Override
+    public IMetaInfo getThreadLocalMetaInfo() {
+        return threadLocalMetaInfo;
+    }
+
+    @Override
+    public IThreadLocalMetaInfoAware setThreadLocalMetaInfo(IMetaInfo metaInfo) {
+        this.threadLocalMetaInfo = metaInfo;
+        return this;
+    }
+
+    public OcrEngineProperties getOcrEngineProperties() {
+        return ocrEngineProperties;
+    }
+}
diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/ExtractionStrategy.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/ExtractionStrategy.java
new file mode 100644
index 0000000..0313dfd
--- /dev/null
+++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/ExtractionStrategy.java
@@ -0,0 +1,127 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.helpers;
+
+import com.itextpdf.kernel.font.PdfFont;
+import com.itextpdf.kernel.pdf.PdfDictionary;
+import com.itextpdf.kernel.pdf.PdfName;
+import com.itextpdf.kernel.pdf.canvas.CanvasTag;
+import com.itextpdf.kernel.pdf.canvas.parser.EventType;
+import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
+import com.itextpdf.kernel.pdf.canvas.parser.data.ImageRenderInfo;
+import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
+import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextChunkLocation;
+import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
+import com.itextpdf.kernel.pdf.canvas.parser.listener.TextChunk;
+// Extraction strategy for tests: only handles text/image events tagged with one layer name.
+public class ExtractionStrategy extends LocationTextExtractionStrategy {
+    private com.itextpdf.kernel.geom.Rectangle imageBBoxRectangle; // set by the last accepted image event
+    private com.itextpdf.kernel.colors.Color fillColor; // set by the last accepted text event
+    private String layerName; // null means: accept only content without a tag name
+    private PdfFont pdfFont; // set by the last accepted text event
+
+    public ExtractionStrategy(String name) {
+        super();
+        layerName = name;
+    }
+
+    public com.itextpdf.kernel.colors.Color getFillColor() {
+        return fillColor;
+    }
+
+    public void setFillColor(com.itextpdf.kernel.colors.Color color) {
+        fillColor = color;
+    }
+
+    public PdfFont getPdfFont() {
+        return pdfFont;
+    }
+
+    public void setPdfFont(PdfFont font) {
+        pdfFont = font;
+    }
+
+    public com.itextpdf.kernel.geom.Rectangle getImageBBoxRectangle() {
+        return this.imageBBoxRectangle;
+    }
+
+    public void setImageBBoxRectangle(com.itextpdf.kernel.geom.Rectangle imageBBoxRectangle) {
+        this.imageBBoxRectangle = imageBBoxRectangle;
+    }
+
+    @Override
+    public void eventOccurred(IEventData data, EventType type) {
+        if (type.equals(EventType.RENDER_TEXT) || type.equals(EventType.RENDER_IMAGE)) {
+            String tagName = getTagName(data, type);
+            if ((tagName == null && layerName == null) || (layerName != null && layerName.equals(tagName))) {
+                if (type.equals(EventType.RENDER_TEXT)) {
+                    TextRenderInfo renderInfo = (TextRenderInfo) data;
+                    setFillColor(renderInfo.getGraphicsState()
+                            .getFillColor());
+                    setPdfFont(renderInfo.getGraphicsState().getFont());
+                    super.eventOccurred(data, type); // only accepted text reaches the parent strategy
+                }
+                else if (type.equals(EventType.RENDER_IMAGE)) {
+                    ImageRenderInfo renderInfo = (ImageRenderInfo) data;
+                    com.itextpdf.kernel.geom.Matrix ctm = renderInfo.getImageCtm();
+                    setImageBBoxRectangle(new com.itextpdf.kernel.geom.Rectangle(ctm.get(6), ctm.get(7),
+                            ctm.get(0), ctm.get(4))); // ctm entries 6/7 used as x/y, 0/4 as width/height
+                }
+            }
+        }
+    }
+
+    @Override
+    protected boolean isChunkAtWordBoundary(TextChunk chunk,
+            TextChunk previousChunk) {
+        ITextChunkLocation curLoc = chunk.getLocation();
+        ITextChunkLocation prevLoc = previousChunk.getLocation();
+
+        if (curLoc.getStartLocation().equals(curLoc.getEndLocation()) ||
+                prevLoc.getEndLocation()
+                        .equals(prevLoc.getStartLocation())) {
+            return false; // a chunk whose start equals its end never counts as a word boundary
+        }
+        // NOTE(review): compares (current end - previous start), not the gap between the chunks - confirm
+        return curLoc.distParallelEnd() - prevLoc.distParallelStart() >
+                (curLoc.getCharSpaceWidth() + prevLoc.getCharSpaceWidth())
+                        / 2.0f;
+    }
+
+    private String getTagName(IEventData data, EventType type) {
+        java.util.List<CanvasTag> tagHierarchy = null;
+        if (type.equals(EventType.RENDER_TEXT)) {
+            TextRenderInfo textRenderInfo = (TextRenderInfo) data;
+            tagHierarchy = textRenderInfo.getCanvasTagHierarchy();
+        }
+        else if (type.equals(EventType.RENDER_IMAGE)) {
+            ImageRenderInfo imageRenderInfo = (ImageRenderInfo) data;
+            tagHierarchy = imageRenderInfo.getCanvasTagHierarchy();
+        }
+        return (tagHierarchy == null || tagHierarchy.size() == 0
+                || tagHierarchy.get(0).getProperties().get(PdfName.Name) == null)
+                ? null
+                : tagHierarchy.get(0).getProperties().get(PdfName.Name).toString(); // name of the first tag only
+    }
+
+}
diff --git a/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/PdfHelper.java b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/PdfHelper.java
new file mode 100644
index 0000000..3658901
--- /dev/null
+++ b/pdfocr-api/src/test/java/com/itextpdf/pdfocr/helpers/PdfHelper.java
@@ -0,0 +1,252 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.helpers;
+
+import com.itextpdf.kernel.pdf.PdfDocument;
+import com.itextpdf.kernel.pdf.PdfOutputIntent;
+import com.itextpdf.kernel.pdf.PdfReader;
+import com.itextpdf.kernel.pdf.PdfWriter;
+import com.itextpdf.kernel.pdf.WriterProperties;
+import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
+import com.itextpdf.pdfocr.OcrPdfCreator;
+import com.itextpdf.pdfocr.OcrPdfCreatorProperties;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.Collections;
+
+import com.itextpdf.test.ExtendedITextTest;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+// Static helpers shared by the pdfocr-api tests: test resource paths, writers, PDF creation, text extraction.
+public class PdfHelper {
+
+    public static final String DEFAULT_IMAGE_NAME = "numbers_01.jpg";
+    public static final String DEFAULT_TEXT = "619121";
+    public static final String THAI_IMAGE_NAME = "thai.PNG";
+    public static final String THAI_TEXT = "ป ระ เท ศ ไ";
+    // directory with test files
+    public static final String TEST_DIRECTORY = "./src/test/resources/com/itextpdf/pdfocr/";
+    public static final String TARGET_DIRECTORY = "./target/test/resources/com/itextpdf/pdfocr/";
+
+    private static final Logger LOGGER = LoggerFactory
+            .getLogger(PdfHelper.class);
+
+    /**
+     * Returns images test directory.
+     */
+    public static String getImagesTestDirectory() {
+        return TEST_DIRECTORY + "images/";
+    }
+
+    /**
+     * Returns path to default test image.
+     */
+    public static String getDefaultImagePath() {
+        return getImagesTestDirectory() + DEFAULT_IMAGE_NAME;
+    }
+
+    /**
+     * Returns path to thai test image.
+     */
+    public static String getThaiImagePath() {
+        return getImagesTestDirectory() + THAI_IMAGE_NAME;
+    }
+
+    /**
+     * Returns path to the FreeSans test font.
+     */
+    public static String getFreeSansFontPath() {
+        return TEST_DIRECTORY + "fonts/FreeSans.ttf";
+    }
+
+    /**
+     * Returns path to the Kanit-Regular test font.
+     */
+    public static String getKanitFontPath() {
+        return TEST_DIRECTORY + "fonts/Kanit-Regular.ttf";
+    }
+
+    /**
+     * Returns target directory, creating it first when it does not exist yet.
+     */
+    public static String getTargetDirectory() {
+        if (!Files.exists(java.nio.file.Paths.get(TARGET_DIRECTORY))) {
+            ExtendedITextTest.createDestinationFolder(TARGET_DIRECTORY);
+        }
+        return TARGET_DIRECTORY;
+    }
+
+    /**
+     * Create pdfWriter using provided path to destination file;
+     * the writer properties are configured with {@code addUAXmpMetadata()}.
+     */
+    public static PdfWriter getPdfWriter(String pdfPath) throws FileNotFoundException {
+        return new PdfWriter(pdfPath,
+                new WriterProperties().addUAXmpMetadata());
+    }
+
+    /**
+     * Create pdfWriter that writes to an in-memory stream.
+     */
+    public static PdfWriter getPdfWriter() {
+        return new PdfWriter(new ByteArrayOutputStream(), new WriterProperties().addUAXmpMetadata());
+    }
+
+    /**
+     * Creates PDF rgb output intent for tests.
+     */
+    public static PdfOutputIntent getRGBPdfOutputIntent() throws FileNotFoundException {
+        String defaultRGBColorProfilePath = TEST_DIRECTORY + "profiles"
+                + "/sRGB_CS_profile.icm";
+        InputStream is = new FileInputStream(defaultRGBColorProfilePath); // NOTE(review): never closed here
+        return new PdfOutputIntent("", "",
+                "", "sRGB IEC61966-2.1", is);
+    }
+
+    /**
+     * Creates PDF cmyk output intent for tests.
+     */
+    public static PdfOutputIntent getCMYKPdfOutputIntent() throws FileNotFoundException {
+        String defaultCMYKColorProfilePath = TEST_DIRECTORY
+                + "profiles/CoatedFOGRA27.icc";
+        InputStream is = new FileInputStream(defaultCMYKColorProfilePath); // NOTE(review): never closed here
+        return new PdfOutputIntent("Custom",
+                "","http://www.color.org",
+                "Coated FOGRA27 (ISO 12647 - 2:2004)", is);
+    }
+
+    /**
+     * Get text from layer specified by name from the first page.
+     */
+    public static String getTextFromPdfLayer(String pdfPath,
+            String layerName)
+            throws IOException {
+        ExtractionStrategy textExtractionStrategy = getExtractionStrategy(pdfPath, layerName, false);
+        return textExtractionStrategy.getResultantText();
+    }
+
+    /**
+     * Same as getTextFromPdfLayer, but with "use actual text" enabled on the strategy.
+     */
+    public static String getTextFromPdfLayerUseActualText(String pdfPath,
+            String layerName)
+            throws IOException {
+        ExtractionStrategy textExtractionStrategy = getExtractionStrategy(pdfPath, layerName, true);
+        return textExtractionStrategy.getResultantText();
+    }
+
+    /**
+     * Perform OCR with custom ocr engine using provided input image and set of
+     * properties and save to the given path; an IOException is logged, not rethrown.
+     */
+    public static void createPdf(String pdfPath, File inputFile,
+            OcrPdfCreatorProperties properties) {
+        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(),
+                properties);
+        try (PdfWriter pdfWriter = getPdfWriter(pdfPath)) {
+            ocrPdfCreator.createPdf(Collections.singletonList(inputFile),
+                    pdfWriter).close();
+        } catch (IOException e) {
+            LOGGER.error(e.getMessage());
+        }
+    }
+
+    /**
+     * Perform OCR with custom ocr engine and save the result as PDF/A with the given
+     * output intent to the given path; an IOException is logged, not rethrown.
+     */
+    public static void createPdfA(String pdfPath, File inputFile,
+            OcrPdfCreatorProperties properties, PdfOutputIntent outputIntent) {
+        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(new CustomOcrEngine(),
+                properties);
+        try (PdfWriter pdfWriter = getPdfWriter(pdfPath)) {
+            ocrPdfCreator.createPdfA(Collections.singletonList(inputFile),
+                    pdfWriter, outputIntent).close();
+        } catch (IOException e) {
+            LOGGER.error(e.getMessage());
+        }
+    }
+
+    /**
+     * Creates "testName.pdf" from the image with default properties; returns its "Text Layer" text (null on IOException).
+     */
+    public static String getTextFromPdf(File file, String testName) {
+        String result = null;
+        String pdfPath = null;
+        try {
+            pdfPath = getTargetDirectory() + testName + ".pdf";
+            createPdf(pdfPath, file, new OcrPdfCreatorProperties());
+            result = getTextFromPdfLayer(pdfPath, "Text Layer");
+        } catch (IOException e) {
+            LOGGER.error(e.getMessage());
+        }
+
+        return result;
+    }
+
+    /**
+     * Get extraction strategy for given document (null layer name, actual text not used).
+     */
+    public static ExtractionStrategy getExtractionStrategy(String pdfPath)
+            throws IOException {
+        return getExtractionStrategy(pdfPath, null);
+    }
+
+    /**
+     * Get extraction strategy for the "Text Layer" layer of given document.
+     */
+    public static ExtractionStrategy getExtractionStrategy(String pdfPath,
+            boolean useActualText)
+            throws IOException {
+        return getExtractionStrategy(pdfPath, "Text Layer", useActualText);
+    }
+
+    /**
+     * Get extraction strategy for the given layer of given document (actual text not used).
+     */
+    public static ExtractionStrategy getExtractionStrategy(String pdfPath,
+            String layerName) throws IOException {
+        return getExtractionStrategy(pdfPath, layerName, false);
+    }
+
+    /**
+     * Process the first page of given document with a new strategy for the given layer and return that strategy.
+     */
+    public static ExtractionStrategy getExtractionStrategy(String pdfPath,
+            String layerName, boolean useActualText)
+            throws IOException {
+        ExtractionStrategy strategy = new ExtractionStrategy(layerName);
+        strategy.setUseActualText(useActualText);
+        // try-with-resources: the document is closed even when content parsing throws
+        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath))) {
+            new PdfCanvasProcessor(strategy).processPageContent(pdfDocument.getFirstPage());
+        }
+        return strategy;
+    }
+}
diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/events/numbers_01.jpg b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/events/numbers_01.jpg
new file mode 100644
index 0000000..f384caa
Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/events/numbers_01.jpg differ
diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/FreeSans.ttf b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/FreeSans.ttf
new file mode 100644
index 0000000..2072cda
Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/FreeSans.ttf differ
diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/Kanit-Regular.ttf b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/Kanit-Regular.ttf
new file mode 100644
index 0000000..8ca24fa
Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/Kanit-Regular.ttf differ
diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_GNU.txt b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_GNU.txt
new file mode 100644
index 0000000..f288702
--- /dev/null
+++ b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_GNU.txt
@@ -0,0 +1,674 @@
+ GNU GENERAL
PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. 
+ + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. 
+ + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. 
Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. 
diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_OFL.txt b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_OFL.txt new file mode 100644 index 0000000..77b1731 --- /dev/null +++ b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_OFL.txt @@ -0,0 +1,91 @@ +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +http://scripts.sil.org/OFL + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). 
+ +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. 
The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/NOTICE.txt b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/NOTICE.txt new file mode 100644 index 0000000..496bc47 --- /dev/null +++ b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/fonts/NOTICE.txt @@ -0,0 +1,4 @@ +Please note that the following fonts are used under the licenses listed below. 
+ +* FreeSans - GPL license you can find following the link: https://www.gnu.org/licenses +* Kanit-Regular - SIL Open Font License, Version 1.1 diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/corrupted.jpg b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/corrupted.jpg new file mode 100644 index 0000000..2c0d56a Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/corrupted.jpg differ diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/multipage.tiff b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/multipage.tiff new file mode 100644 index 0000000..e8cc630 Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/multipage.tiff differ diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpg b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpg new file mode 100644 index 0000000..f384caa Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpg differ diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/thai.PNG b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/thai.PNG new file mode 100644 index 0000000..7823203 Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/images/thai.PNG differ diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/profiles/CoatedFOGRA27.icc b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/profiles/CoatedFOGRA27.icc new file mode 100644 index 0000000..086ac9d Binary files /dev/null and b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/profiles/CoatedFOGRA27.icc differ diff --git a/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/profiles/sRGB_CS_profile.icm b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/profiles/sRGB_CS_profile.icm new file mode 100644 index 0000000..7f9d18d Binary files /dev/null and 
b/pdfocr-api/src/test/resources/com/itextpdf/pdfocr/profiles/sRGB_CS_profile.icm differ diff --git a/pdfocr-tesseract4/pom.xml b/pdfocr-tesseract4/pom.xml new file mode 100644 index 0000000..c09cdfe --- /dev/null +++ b/pdfocr-tesseract4/pom.xml @@ -0,0 +1,57 @@ + + + 4.0.0 + + + com.itextpdf + pdfocr-root + 1.0.0 + + + pdfocr-tesseract4 + + pdfOCR-Tesseract4 + pdfOCR-Tesseract4 is an iText 7 add-on for Java to recognize and extract text in scanned documents and images. It can also convert them into fully ISO-compliant PDF or PDF/A-3u files that are accessible, searchable, and suitable for archiving + + + + com.itextpdf + pdfocr-api + ${project.version} + + + com.itextpdf + styled-xml-parser + ${itext.version} + + + net.sourceforge.tess4j + tess4j + 4.5.1 + + + log4j + log4j + + + ghost4j + org.ghost4j + + + slf4j-api + org.slf4j + + + log4j-over-slf4j + org.slf4j + + + + + com.itextpdf + pdftest + ${itext.version} + test + + + diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/AbstractTesseract4OcrEngine.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/AbstractTesseract4OcrEngine.java new file mode 100644 index 0000000..feb1e9b --- /dev/null +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/AbstractTesseract4OcrEngine.java @@ -0,0 +1,485 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.counter.EventCounterHandler; +import com.itextpdf.kernel.counter.event.IMetaInfo; +import com.itextpdf.pdfocr.IOcrEngine; +import com.itextpdf.pdfocr.OcrPdfCreatorMetaInfo; +import com.itextpdf.pdfocr.OcrPdfCreatorMetaInfo.PdfDocumentType; +import com.itextpdf.pdfocr.TextInfo; +import com.itextpdf.pdfocr.events.IThreadLocalMetaInfoAware; +import com.itextpdf.pdfocr.tesseract4.events.PdfOcrTesseract4Event; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import org.slf4j.LoggerFactory; + +/** + * The implementation of {@link IOcrEngine}. + * + * This class provides possibilities to perform OCR, to read data from input + * files and to return contained text in the required format. + * Also there are possibilities to use features of "tesseract" + * (optical character recognition engine for various operating systems). + */ +public abstract class AbstractTesseract4OcrEngine implements IOcrEngine, IThreadLocalMetaInfoAware { + + /** + * Supported image formats. + */ + private static final Set SUPPORTED_IMAGE_FORMATS = + Collections.unmodifiableSet(new HashSet<>( + Arrays.asList("bmp", "png", "tiff", "tif", "jpeg", + "jpg", "jpe", "jfif"))); + + Set processedUUID = new HashSet<>(); + + /** + * Set of properties. 
+ */ + private Tesseract4OcrEngineProperties tesseract4OcrEngineProperties; + + private ThreadLocal threadLocalMetaInfo = new ThreadLocal<>(); + + public AbstractTesseract4OcrEngine( + Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) { + this.tesseract4OcrEngineProperties = tesseract4OcrEngineProperties; + } + + /** + * Performs tesseract OCR for the first (or for the only) image page. + * + * @param inputImage input image {@link java.io.File} + * @param outputFile output file for the result for the first page + * @param outputFormat selected {@link OutputFormat} for tesseract + */ + public void doTesseractOcr(File inputImage, File outputFile, + OutputFormat outputFormat) { + doTesseractOcr(inputImage, Collections.singletonList(outputFile), + outputFormat, 1); + } + + /** + * Performs OCR using provided {@link IOcrEngine} for the given list of + * input images and saves output to a text file using provided path. + * + * @param inputImages {@link java.util.List} of images to be OCRed + * @param txtFile file to be created + */ + public void createTxtFile(final List inputImages, final File txtFile) { + LoggerFactory.getLogger(getClass()) + .info(MessageFormatUtil.format( + Tesseract4LogMessageConstant.START_OCR_FOR_IMAGES, + inputImages.size())); + + StringBuilder content = new StringBuilder(); + for (File inputImage : inputImages) { + content.append(doImageOcr(inputImage, OutputFormat.TXT)); + } + + // write to file + TesseractHelper.writeToTextFile(txtFile.getAbsolutePath(), + content.toString()); + } + + /** + * Gets properties for {@link AbstractTesseract4OcrEngine}. + * + * @return set properties {@link Tesseract4OcrEngineProperties} + */ + public final Tesseract4OcrEngineProperties getTesseract4OcrEngineProperties() { + return tesseract4OcrEngineProperties; + } + + /** + * Sets properties for {@link AbstractTesseract4OcrEngine}. 
+ * + * @param tesseract4OcrEngineProperties set of properties + * {@link Tesseract4OcrEngineProperties} for {@link AbstractTesseract4OcrEngine} + */ + public final void setTesseract4OcrEngineProperties( + final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) { + this.tesseract4OcrEngineProperties = tesseract4OcrEngineProperties; + } + + /** + * Gets list of languages concatenated with "+" symbol to a string + * in format required by tesseract. + * @return {@link java.lang.String} of concatenated languages + */ + public final String getLanguagesAsString() { + if (getTesseract4OcrEngineProperties().getLanguages().size() > 0) { + return String.join("+", + getTesseract4OcrEngineProperties().getLanguages()); + } else { + return getTesseract4OcrEngineProperties().getDefaultLanguage(); + } + } + + /** + * Reads data from the provided input image file and returns retrieved + * data in the format described below. + * + * @param input input image {@link java.io.File} + * @return {@link java.util.Map} where key is {@link java.lang.Integer} + * representing the number of the page and value is + * {@link java.util.List} of {@link TextInfo} elements where each + * {@link TextInfo} element contains a word or a line and its 4 + * coordinates(bbox) + */ + public final Map> doImageOcr( + final File input) { + verifyImageFormatValidity(input); + return ((TextInfoTesseractOcrResult)processInputFiles(input, OutputFormat.HOCR)).getTextInfos(); + } + + /** + * Reads data from the provided input image file and returns retrieved + * data as string. 
+ * + * @param input input image {@link java.io.File} + * + * @param outputFormat return {@link OutputFormat} result + * @return OCR result as a {@link java.lang.String} that is + * returned after processing the given image + */ + public final String doImageOcr(final File input, + final OutputFormat outputFormat) { + String result = ""; + verifyImageFormatValidity(input); + ITesseractOcrResult processedData = processInputFiles(input, outputFormat); + if (processedData != null) { + if (outputFormat.equals(OutputFormat.TXT)) { + result = ((StringTesseractOcrResult)processedData).getData(); + } else { + StringBuilder outputText = new StringBuilder(); + Map> outputMap = + ((TextInfoTesseractOcrResult)processedData).getTextInfos(); + for (int page : outputMap.keySet()) { + StringBuilder pageText = new StringBuilder(); + for (TextInfo textInfo : outputMap.get(page)) { + pageText.append(textInfo.getText()); + pageText.append(System.lineSeparator()); + } + outputText.append(pageText); + outputText.append(System.lineSeparator()); + } + result = outputText.toString(); + } + } + return result; + } + + /** + * Checks current os type. + * + * @return boolean true is current os is windows, otherwise - false + */ + public boolean isWindows() { + return identifyOsType().toLowerCase().contains("win"); + } + + /** + * Identifies type of current OS and return it (win, linux). + * + * @return type of current os as {@link java.lang.String} + */ + public String identifyOsType() { + String os = System.getProperty("os.name") == null + ? System.getProperty("OS") : System.getProperty("os.name"); + return os.toLowerCase(); + } + + /** + * Validates list of provided languages and + * checks if they all exist in given tess data directory. 
+ * + * @param languagesList {@link java.util.List} of provided languages + * @throws Tesseract4OcrException if tess data wasn't found for one of the + * languages from the provided list + */ + public void validateLanguages(final List languagesList) + throws Tesseract4OcrException { + String suffix = ".traineddata"; + if (languagesList.size() == 0) { + if (!new File(getTessData() + + java.io.File.separatorChar + + getTesseract4OcrEngineProperties().getDefaultLanguage() + + suffix) + .exists()) { + throw new Tesseract4OcrException( + Tesseract4OcrException.INCORRECT_LANGUAGE) + .setMessageParams( + getTesseract4OcrEngineProperties() + .getDefaultLanguage() + + suffix, + getTessData()); + } + } else { + for (String lang : languagesList) { + if (!new File(getTessData() + + java.io.File.separatorChar + lang + suffix) + .exists()) { + throw new Tesseract4OcrException( + Tesseract4OcrException.INCORRECT_LANGUAGE) + .setMessageParams(lang + suffix, getTessData()); + } + } + } + } + + /** + * {@inheritDoc} + */ + @Override + public IMetaInfo getThreadLocalMetaInfo() { + return threadLocalMetaInfo.get(); + } + + /** + * {@inheritDoc} + */ + @Override + public IThreadLocalMetaInfoAware setThreadLocalMetaInfo(IMetaInfo metaInfo) { + this.threadLocalMetaInfo.set(metaInfo); + return this; + } + + /** + * Performs tesseract OCR using command line tool + * or a wrapper for Tesseract OCR API. + * + * Please note that list of output files is accepted instead of a single file because + * page number parameter is not respected in case of TIFF images not requiring preprocessing. 
+ * In other words, if the passed image is the TIFF image and according to the {@link Tesseract4OcrEngineProperties} + * no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list + * is expected to be same as number of pages in the image, otherwise, only one file is expected + * + * @param inputImage input image {@link java.io.File} + * @param outputFiles {@link java.util.List} of output files + * (one per each page) + * @param outputFormat selected {@link OutputFormat} for tesseract + * @param pageNumber number of page to be processed + */ + abstract void doTesseractOcr(File inputImage, + List outputFiles, OutputFormat outputFormat, + int pageNumber); + + /** + * Gets path to provided tess data directory. + * + * @return path to provided tess data directory as + * {@link java.lang.String} + */ + String getTessData() { + if (getTesseract4OcrEngineProperties().getPathToTessData() == null) { + throw new Tesseract4OcrException(Tesseract4OcrException + .PATH_TO_TESS_DATA_IS_NOT_SET); + } else { + return getTesseract4OcrEngineProperties().getPathToTessData() + .getAbsolutePath(); + } + } + + void scheduledCheck() { + ReflectionUtils.scheduledCheck(); + } + + void onEvent() { + IMetaInfo metaInfo = this.getThreadLocalMetaInfo(); + if (!(metaInfo instanceof OcrPdfCreatorMetaInfo)) { + EventCounterHandler.getInstance() + .onEvent(PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR, this.getThreadLocalMetaInfo(), getClass()); + } else { + UUID uuid = ((OcrPdfCreatorMetaInfo) metaInfo).getDocumentId(); + if (!processedUUID.contains(uuid)) { + processedUUID.add(uuid); + EventCounterHandler.getInstance() + .onEvent(PdfDocumentType.PDFA.equals(((OcrPdfCreatorMetaInfo) metaInfo).getPdfDocumentType()) + ? 
PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDFA + : PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, + ((OcrPdfCreatorMetaInfo) metaInfo).getWrappedMetaInfo(), getClass()); + + } + } + } + + /** + * Reads data from the provided input image file. + * + * @param input input image {@link java.io.File} + * @param outputFormat {@link OutputFormat} for the result returned + * by {@link IOcrEngine} + * @return {@link ITesseractOcrResult} instance, either {@link StringTesseractOcrResult} + * if output format is TXT, or {@link TextInfoTesseractOcrResult} if the output format is HOCR + */ + private ITesseractOcrResult processInputFiles( + final File input, final OutputFormat outputFormat) { + Map> imageData = + new LinkedHashMap>(); + StringBuilder data = new StringBuilder(); + List tempFiles = new ArrayList(); + ITesseractOcrResult result = null; + try { + // image needs to be paginated only if it's tiff + // or preprocessing isn't required + int realNumOfPages = !ImagePreprocessingUtil.isTiffImage(input) + ? 1 : ImagePreprocessingUtil.getNumberOfPageTiff(input); + int numOfPages = + getTesseract4OcrEngineProperties().isPreprocessingImages() + ? realNumOfPages : 1; + int numOfFiles = + getTesseract4OcrEngineProperties().isPreprocessingImages() + ? 1 : realNumOfPages; + + for (int page = 1; page <= numOfPages; page++) { + String extension = outputFormat.equals(OutputFormat.HOCR) + ? 
".hocr" : ".txt"; + for (int i = 0; i < numOfFiles; i++) { + tempFiles.add(createTempFile(extension)); + } + + doTesseractOcr(input, tempFiles, outputFormat, page); + if (outputFormat.equals(OutputFormat.HOCR)) { + Map> pageData = TesseractHelper + .parseHocrFile(tempFiles, + getTesseract4OcrEngineProperties() + .getTextPositioning()); + + if (getTesseract4OcrEngineProperties() + .isPreprocessingImages()) { + imageData.put(page, pageData.get(1)); + } else { + imageData = pageData; + } + result = new TextInfoTesseractOcrResult(imageData); + } else { + for (File tmpFile : tempFiles) { + if (Files.exists( + java.nio.file.Paths + .get(tmpFile.getAbsolutePath()))) { + data.append(TesseractHelper.readTxtFile(tmpFile)); + } + } + result = new StringTesseractOcrResult(data.toString()); + } + } + } catch (IOException e) { + LoggerFactory.getLogger(getClass()) + .error(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_OCR_INPUT_FILE, + e.getMessage())); + } finally { + for (File file : tempFiles) { + TesseractHelper.deleteFile(file.getAbsolutePath()); + } + } + return result; + } + + /** + * Creates a temporary file with given extension. + * + * @param extension file extension for a new file {@link java.lang.String} + * @return a new created {@link java.io.File} instance + */ + private File createTempFile(final String extension) { + String tmpFileName = TesseractOcrUtil.getTempFilePath( + UUID.randomUUID().toString(), extension); + return new File(tmpFileName); + } + + /** + * Validates input image format. 
+ * Allowed image formats are listed + * in {@link AbstractTesseract4OcrEngine#SUPPORTED_IMAGE_FORMATS} + * + * @param image input image {@link java.io.File} + * @throws Tesseract4OcrException if image format is invalid + */ + private void verifyImageFormatValidity(final File image) + throws Tesseract4OcrException { + boolean isValid = false; + String extension = "incorrect extension"; + int index = image.getAbsolutePath().lastIndexOf('.'); + if (index > 0) { + extension = new String(image.getAbsolutePath().toCharArray(), + index + 1, + image.getAbsolutePath().length() - index - 1); + for (String format : SUPPORTED_IMAGE_FORMATS) { + if (format.equals(extension.toLowerCase())) { + isValid = true; + break; + } + } + } + if (!isValid) { + LoggerFactory.getLogger(getClass()).error(MessageFormatUtil + .format(Tesseract4LogMessageConstant + .CANNOT_READ_INPUT_IMAGE, + image.getAbsolutePath())); + throw new Tesseract4OcrException( + Tesseract4OcrException.INCORRECT_INPUT_IMAGE_FORMAT) + .setMessageParams(extension); + } + } + + interface ITesseractOcrResult { + } + + static class StringTesseractOcrResult implements ITesseractOcrResult { + private String data; + + StringTesseractOcrResult(String data) { + this.data = data; + } + + String getData() { + return data; + } + } + + static class TextInfoTesseractOcrResult implements ITesseractOcrResult { + private Map> textInfos; + + TextInfoTesseractOcrResult(Map> textInfos) { + this.textInfos = textInfos; + } + + Map> getTextInfos() { + return this.textInfos; + } + } +} diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtil.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtil.java new file mode 100644 index 0000000..9abc13d --- /dev/null +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtil.java @@ -0,0 +1,226 @@ +/* + This file is part of the iText (R) project. 
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tesseract4;
+
+import com.itextpdf.io.image.TiffImageData;
+import com.itextpdf.io.source.RandomAccessFileOrArray;
+import com.itextpdf.io.source.RandomAccessSourceFactory;
+import com.itextpdf.io.util.MessageFormatUtil;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import javax.imageio.ImageIO;
+import net.sourceforge.lept4j.Leptonica;
+import net.sourceforge.lept4j.Pix;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utilities class to work with images.
+ * Class provides tools for basic image preprocessing.
+ */
+class ImagePreprocessingUtil {
+
+    /**
+     * Private constructor: this utility class is not meant to be
+     * instantiated.
+     */
+    private ImagePreprocessingUtil() {
+    }
+
+    /**
+     * Counts number of pages in the provided tiff image.
+ * + * @param inputImage input image {@link java.io.File} + * @return number of pages in the provided TIFF image + * @throws IOException if error occurred during creating a + * {@link com.itextpdf.io.source.IRandomAccessSource} based on a filename + * string + */ + static int getNumberOfPageTiff(final File inputImage) + throws IOException { + RandomAccessFileOrArray raf = new RandomAccessFileOrArray( + new RandomAccessSourceFactory() + .createBestSource( + inputImage.getAbsolutePath())); + int numOfPages = TiffImageData.getNumberOfPages(raf); + raf.close(); + return numOfPages; + } + + /** + * Checks whether image format is TIFF. + * + * @param inputImage input image {@link java.io.File} + * @return true if provided image has 'tiff' or 'tif' extension + */ + static boolean isTiffImage(final File inputImage) { + int index = inputImage.getAbsolutePath().lastIndexOf('.'); + if (index > 0) { + String extension = new String( + inputImage.getAbsolutePath().toCharArray(), index + 1, + inputImage.getAbsolutePath().length() - index - 1); + return extension.toLowerCase().contains("tif"); + } + return false; + } + + /** + * Reads provided image file using stream. + * + * @param inputFile input image {@link java.io.File} + * @return returns a {@link java.awt.image.BufferedImage} as the result + * @throws IllegalArgumentException if error occurred during reading a file + * @throws IOException if error occurred during reading a file + */ + static BufferedImage readImageFromFile(final File inputFile) + throws IllegalArgumentException, IOException { + FileInputStream is = new FileInputStream(inputFile.getAbsolutePath()); + BufferedImage bi = ImageIO.read(is); + is.close(); + return bi; + } + + /** + * Reads input file as Leptonica {@link net.sourceforge.lept4j.Pix} and + * converts it to {@link java.awt.image.BufferedImage}. 
+ * + * @param inputImage input image {@link java.io.File} + * @return returns a {@link java.awt.image.BufferedImage} as the result + * @throws IOException is error occurred during conversion from + * {@link net.sourceforge.lept4j.Pix} to + * {@link java.awt.image.BufferedImage} + */ + static BufferedImage readAsPixAndConvertToBufferedImage( + final File inputImage) + throws IOException { + Pix pix = Leptonica.INSTANCE + .pixRead(inputImage.getAbsolutePath()); + return TesseractOcrUtil.convertPixToImage(pix); + } + + /** + * Performs basic image preprocessing using buffered image (if provided). + * Preprocessed image will be saved in temporary directory. + * + * @param inputFile input image {@link java.io.File} + * @param pageNumber number of page to be preprocessed + * @return created preprocessed image as {@link net.sourceforge.lept4j.Pix} + * @throws Tesseract4OcrException if it was not possible to read or convert + * input file + */ + static Pix preprocessImage(final File inputFile, + final int pageNumber) throws Tesseract4OcrException { + Pix pix = null; + // read image + if (isTiffImage(inputFile)) { + pix = TesseractOcrUtil.readPixPageFromTiff(inputFile, + pageNumber - 1); + } else { + pix = readPix(inputFile); + } + if (pix == null) { + throw new Tesseract4OcrException( + Tesseract4OcrException.CANNOT_READ_PROVIDED_IMAGE) + .setMessageParams(inputFile.getAbsolutePath()); + } + return TesseractOcrUtil.preprocessPix(pix); + } + + /** + * Reads {@link net.sourceforge.lept4j.Pix} from input file or, if + * this is not possible, reads input file as + * {@link java.awt.image.BufferedImage} and then converts to + * {@link net.sourceforge.lept4j.Pix}. 
+ * + * @param inputFile input image {@link java.io.File} + * @return Pix result {@link net.sourceforge.lept4j.Pix} object from + * input file + */ + static Pix readPix(final File inputFile) { + Pix pix = null; + try { + BufferedImage bufferedImage = ImagePreprocessingUtil + .readImageFromFile(inputFile); + if (bufferedImage != null) { + pix = TesseractOcrUtil.convertImageToPix(bufferedImage); + } + } catch (Exception e) { // NOSONAR + LoggerFactory.getLogger(ImagePreprocessingUtil.class) + .info(MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_CONVERT_IMAGE_TO_PIX, + inputFile.getAbsolutePath(), + e.getMessage())); + } + if (pix == null) { + try { + pix = Leptonica.INSTANCE.pixRead(inputFile.getAbsolutePath()); + } catch (Exception e) { // NOSONAR + LoggerFactory.getLogger(ImagePreprocessingUtil.class) + .info(MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_CONVERT_IMAGE_TO_PIX, + inputFile.getAbsolutePath(), + e.getMessage())); + } + } + return pix; + } + + /** + * Reads input image as a {@link java.awt.image.BufferedImage}. + * If it is not possible to read {@link java.awt.image.BufferedImage} from + * input file, image will be read as a {@link net.sourceforge.lept4j.Pix} + * and then converted to {@link java.awt.image.BufferedImage}. 
+ * @param inputImage original input image + * @return input image as a {@link java.awt.image.BufferedImage} + */ + static BufferedImage readImage(File inputImage) { + BufferedImage bufferedImage = null; + try { + bufferedImage = ImagePreprocessingUtil + .readImageFromFile(inputImage); + } catch (IllegalArgumentException | IOException ex) { + LoggerFactory.getLogger(ImagePreprocessingUtil.class).info( + MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_CREATE_BUFFERED_IMAGE, + ex.getMessage())); + } + if (bufferedImage == null) { + try { + bufferedImage = ImagePreprocessingUtil + .readAsPixAndConvertToBufferedImage( + inputImage); + } catch (IOException ex) { + LoggerFactory.getLogger(ImagePreprocessingUtil.class) + .info(MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_READ_INPUT_IMAGE, + ex.getMessage())); + } + } + return bufferedImage; + } +} diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/OutputFormat.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/OutputFormat.java new file mode 100644 index 0000000..30f47db --- /dev/null +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/OutputFormat.java @@ -0,0 +1,44 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tesseract4;
+
+import com.itextpdf.pdfocr.TextInfo;
+
+/**
+ * Enumeration of the available output formats.
+ * It is used when there is possibility in selected Reader to process input
+ * file and to return result in the required output format.
+ */
+public enum OutputFormat {
+    /**
+     * Reader will produce XHTML output compliant
+     * with the hOCR specification.
+     * Output will be parsed and represented as {@link java.util.List} of
+     * {@link TextInfo} objects
+     */
+    HOCR,
+    /**
+     * Reader will produce plain txt file.
+     */
+    TXT
+}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/PdfOcrTesseract4ProductInfo.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/PdfOcrTesseract4ProductInfo.java
new file mode 100644
index 0000000..7ae48d2
--- /dev/null
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/PdfOcrTesseract4ProductInfo.java
@@ -0,0 +1,38 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tesseract4;
+
+/**
+ * Product info about this iText add-on.
+ */
+public class PdfOcrTesseract4ProductInfo {
+
+    /** The product name. */
+    public static final String PRODUCT_NAME = "pdfOcr-Tesseract4";
+
+    /** The major version number. */
+    public static final int MAJOR_VERSION = 1;
+
+    /** The minor version number. */
+    public static final int MINOR_VERSION = 0;
+}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java
new file mode 100644
index 0000000..07c1c53
--- /dev/null
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ReflectionUtils.java
@@ -0,0 +1,209 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.Version; +import com.itextpdf.kernel.counter.ContextManager; + +import java.lang.reflect.AccessibleObject; +import java.lang.reflect.Array; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +final class ReflectionUtils { + + private static final Logger logger = LoggerFactory.getLogger(ReflectionUtils.class); + + private static final String KERNEL_PACKAGE = "com.itextpdf.kernel."; + private static final String LICENSEKEY_PACKAGE = "com.itextpdf.licensekey."; + + private static final String CONTEXT_MANAGER = "counter.ContextManager"; + private static final String LICENSEKEY = "LicenseKey"; + private static final String LICENSEKEY_PRODUCT = "LicenseKeyProduct"; + private static final String LICENSEKEY_FEATURE = "LicenseKeyProductFeature"; + + private static final String REGISTER_GENERIC_CONTEXT = "registerGenericContext"; + private static final String SCHEDULED_CHECK = "scheduledCheck"; + + private static final String NO_PDFOCR_TESSERACT4 = "No license loaded for product pdfOcr-Tesseract4. Please use LicenseKey.loadLicense(...) 
to load one."; + + private static Map> cachedClasses = new HashMap<>(); + private static Map cachedMethods = new HashMap<>(); + + static { + try { + ContextManager contextManager = ContextManager.getInstance(); + callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager, + new Class[] {Collection.class, Collection.class}, + Collections.singletonList("com.itextpdf.pdfocr"), + Collections.singletonList("com.itextpdf.pdfocr.tesseract4")); + callMethod(KERNEL_PACKAGE + CONTEXT_MANAGER, REGISTER_GENERIC_CONTEXT, contextManager, + new Class[] {Collection.class, Collection.class}, + Collections.singletonList("com.itextpdf.pdfocr.tesseract4"), + Collections.singletonList("com.itextpdf.pdfocr.tesseract4")); + } catch (Exception e) { + logger.error(e.getMessage()); + } + } + + private ReflectionUtils() { + } + + public static void scheduledCheck() { + try { + Class licenseKeyClass = getClass(LICENSEKEY_PACKAGE + LICENSEKEY); + Class licenseKeyProductClass = getClass(LICENSEKEY_PACKAGE + LICENSEKEY_PRODUCT); + Class licenseKeyProductFeatureClass = getClass(LICENSEKEY_PACKAGE + LICENSEKEY_FEATURE); + + Object licenseKeyProductFeatureArray = Array.newInstance(licenseKeyProductFeatureClass, 0); + + Class[] params = new Class[] { + String.class, + Integer.TYPE, + Integer.TYPE, + licenseKeyProductFeatureArray.getClass() + }; + + Constructor licenseKeyProductConstructor = licenseKeyProductClass.getConstructor(params); + + Object licenseKeyProductObject = licenseKeyProductConstructor.newInstance( + PdfOcrTesseract4ProductInfo.PRODUCT_NAME, + PdfOcrTesseract4ProductInfo.MAJOR_VERSION, + PdfOcrTesseract4ProductInfo.MINOR_VERSION, + licenseKeyProductFeatureArray + ); + + Method method = licenseKeyClass.getMethod(SCHEDULED_CHECK, licenseKeyProductClass); + method.invoke(null, licenseKeyProductObject); + } catch (Exception e) { + if (null != e && null != e.getCause()) { + String message = e.getCause().getMessage(); + if (NO_PDFOCR_TESSERACT4.equals(message)) { 
+ throw new RuntimeException(message, e.getCause()); + } + } + if (!Version.isAGPLVersion()) { + throw new RuntimeException(e.getCause()); + } + } + } + + private static Object callMethod(String className, String methodName, Object target, Class[] parameterTypes, + Object... args) { + try { + Method method = findMethod(className, methodName, parameterTypes); + return method.invoke(target, args); + } catch (NoSuchMethodException e) { + logger.warn(MessageFormatUtil.format("Cannot find method {0} for class {1}", methodName, className)); + } catch (ClassNotFoundException e) { + logger.warn(MessageFormatUtil.format("Cannot find class {0}", className)); + } catch (IllegalArgumentException e) { + logger.warn(MessageFormatUtil + .format("Illegal arguments passed to {0}#{1} method call: {2}", className, methodName, + e.getMessage())); + } catch (Exception e) { + // Converting checked exceptions to unchecked RuntimeException (java-specific comment). + // + // If kernel utils throws an exception at this point, we consider it as unrecoverable situation for + // its callers (pdfOcr methods). + // It's might be more suitable to wrap checked exceptions at a bit higher level, but we do it here for + // the sake of convenience. 
+ throw new RuntimeException(e.toString(), e); + } + return null; + } + + private static Method findMethod(String className, String methodName, Class[] parameterTypes) + throws NoSuchMethodException, ClassNotFoundException { + MethodSignature tm = new MethodSignature(className, parameterTypes, methodName); + Method m = (Method) cachedMethods.get(tm); + if (m == null) { + m = findClass(className).getDeclaredMethod(methodName, parameterTypes); + m.setAccessible(true); + cachedMethods.put(tm, m); + } + return m; + } + + private static Class findClass(String className) throws ClassNotFoundException { + Class c = cachedClasses.get(className); + if (c == null) { + c = getClass(className); + cachedClasses.put(className, c); + } + return c; + } + + private static Class getClass(String className) throws ClassNotFoundException { + return Class.forName(className); + } + + private static class MethodSignature { + protected final String className; + private final String methodName; + protected Class[] parameterTypes; + + MethodSignature(String className, Class[] parameterTypes, String methodName) { + this.methodName = methodName; + this.className = className; + this.parameterTypes = parameterTypes; + } + + @Override + public int hashCode() { + int result = className.hashCode(); + result = 31 * result + Arrays.hashCode(parameterTypes); + result = 31 * result + (methodName != null ? methodName.hashCode() : 0); + return result; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + MethodSignature that = (MethodSignature) o; + + if (!className.equals(that.className)) { + return false; + } + if (!Arrays.equals(parameterTypes, that.parameterTypes)) { + return false; + } + return methodName != null ? 
methodName.equals(that.methodName) : that.methodName == null; + + } + } +} diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java new file mode 100644 index 0000000..705d6a6 --- /dev/null +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java @@ -0,0 +1,494 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.io.util.MessageFormatUtil; + +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.UUID; +import net.sourceforge.lept4j.Pix; +import org.slf4j.LoggerFactory; + +/** + * The implementation of {@link AbstractTesseract4OcrEngine} for tesseract OCR. 
+ * + * This class provides possibilities to use features of "tesseract" CL tool + * (optical character recognition engine for various operating systems). + * Please note that it's assumed that "tesseract" has already been + * installed locally. + */ +public class Tesseract4ExecutableOcrEngine extends AbstractTesseract4OcrEngine { + + /** + * Path to the tesseract executable. + * By default it's assumed that "tesseract" already exists in the "PATH". + */ + private String pathToExecutable; + + /** + * Creates a new {@link Tesseract4ExecutableOcrEngine} instance. + * + * @param tesseract4OcrEngineProperties set of properties + */ + public Tesseract4ExecutableOcrEngine( + final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) { + super(tesseract4OcrEngineProperties); + setPathToExecutable("tesseract"); + } + + /** + * Creates a new {@link Tesseract4ExecutableOcrEngine} instance. + * + * @param executablePath path to tesseract executable + * @param tesseract4OcrEngineProperties set of properties + */ + public Tesseract4ExecutableOcrEngine(final String executablePath, + final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) { + super(tesseract4OcrEngineProperties); + setPathToExecutable(executablePath); + } + + /** + * Gets path to tesseract executable. + * + * @return path to tesseract executable + */ + public final String getPathToExecutable() { + return pathToExecutable; + } + + /** + * Sets path to tesseract executable. + * By default it's assumed that "tesseract" already exists in the "PATH". + * + * @param path path to tesseract executable + */ + public final void setPathToExecutable(final String path) { + pathToExecutable = path; + } + + /** + * Performs tesseract OCR using command line tool for the selected page + * of input image (by default 1st). + * + * Please note that list of output files is accepted instead of a single file because + * page number parameter is not respected in case of TIFF images not requiring preprocessing. 
+ * In other words, if the passed image is the TIFF image and according to the {@link Tesseract4OcrEngineProperties} + * no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list + * is expected to be same as number of pages in the image, otherwise, only one file is expected + * + * @param inputImage input image {@link java.io.File} + * @param outputFiles {@link java.util.List} of output files + * (one per each page) + * @param outputFormat selected {@link OutputFormat} for tesseract + * @param pageNumber number of page to be processed + */ + void doTesseractOcr(final File inputImage, + final List outputFiles, final OutputFormat outputFormat, + final int pageNumber) { + scheduledCheck(); + List params = new ArrayList(); + String execPath = null; + String imagePath = null; + try { + imagePath = inputImage.getAbsolutePath(); + // path to tesseract executable + if (getPathToExecutable() == null + || getPathToExecutable().isEmpty()) { + throw new Tesseract4OcrException( + Tesseract4OcrException + .CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE); + } else { + if (isWindows()) { + execPath = addQuotes(getPathToExecutable()); + } else { + execPath = getPathToExecutable(); + } + params.add(execPath); + } + checkTesseractInstalled(execPath); + // path to tess data + addTessData(params); + + // validate languages before preprocessing started + validateLanguages(getTesseract4OcrEngineProperties() + .getLanguages()); + + // preprocess input file if needed and add it + imagePath = preprocessImage(inputImage, pageNumber); + addInputFile(params, imagePath); + // move to image directory as tesseract cannot parse non ascii + // characters in input path + List moveToDirectoryParams = moveToImageDirectory( + imagePath); + // output file + addOutputFile(params, outputFiles.get(0), outputFormat, + imagePath); + // page segmentation mode + addPageSegMode(params); + // add user words if needed + addUserWords(params, imagePath); + // required 
languages + addLanguages(params); + if (outputFormat.equals(OutputFormat.HOCR)) { + // path to hocr script + setHocrOutput(params); + } + // set default user defined dpi + addDefaultDpi(params); + onEvent(); + TesseractHelper.runCommand(isWindows() ? "cmd" : "bash", + createCommandList(moveToDirectoryParams, params)); + } catch (Tesseract4OcrException e) { + LoggerFactory.getLogger(getClass()) + .error(e.getMessage()); + throw new Tesseract4OcrException(e.getMessage(), e); + } finally { + try { + if (imagePath != null + && !inputImage.getAbsolutePath().equals(imagePath)) { + TesseractHelper.deleteFile(imagePath); + } + } catch (SecurityException e) { + LoggerFactory.getLogger(getClass()) + .error(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, + imagePath, e.getMessage())); + } + try { + if (getTesseract4OcrEngineProperties() + .getPathToUserWordsFile() != null + && getTesseract4OcrEngineProperties().isUserWordsFileTemporary()) { + TesseractHelper.deleteFile( + getTesseract4OcrEngineProperties() + .getPathToUserWordsFile()); + } + } catch (SecurityException e) { + LoggerFactory.getLogger(getClass()) + .error(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_DELETE_FILE, + getTesseract4OcrEngineProperties() + .getPathToUserWordsFile(), + e.getMessage())); + } + } + } + + /** + * Creates joint command list of two commands passed as parameters. + * @param moveToDirectoryParams first command is responsible for moving + * to the directory + * @param tesseractParams second command is responsible for tesseract + * parameters + * @return joint command list + */ + private List createCommandList( + final List moveToDirectoryParams, + final List tesseractParams) { + // create list of several lists with commands + List params = new ArrayList(); + params.add(isWindows() ? "/c": "-c"); + params.add(isWindows() ? 
"\"" : "'"); + for (String p : moveToDirectoryParams) { + params.add(p); + } + params.add("&&"); + for (String p : tesseractParams) { + params.add(p); + } + params.add(isWindows() ? "\"" : "'"); + return params; + } + + /** + * Create list of parameters for command moving to the image parent + * directory. + * @param imagePath path to input image + * @return command list + */ + private List moveToImageDirectory(final String imagePath) { + // go the image parent directory + List params = new ArrayList(); + String parent = TesseractOcrUtil.getParentDirectory(imagePath); + String replacement = isWindows() ? "" : "/"; + parent = parent.replace("file:///", replacement) + .replace("file:/", replacement); + + // Use "/d" parameter to handle cases when the current directory on Windows + // is located on a different drive compared to the directory we move to + if (isWindows()) { + params.add("cd /d"); + } else { + params.add("cd"); + } + params.add(addQuotes(parent)); + return params; + } + + /** + * Sets hocr output format. + * + * @param command result command as list of strings + */ + private void setHocrOutput(final List command) { + command.add("-c"); + command.add("tessedit_create_hocr=1"); + } + + /** + * Add path to user-words file for tesseract executable. 
+ * + * @param command result command as list of strings + */ + private void addUserWords(final List command, + final String imgPath) { + if (getTesseract4OcrEngineProperties().getPathToUserWordsFile() != null + && !getTesseract4OcrEngineProperties() + .getPathToUserWordsFile().isEmpty()) { + File userWordsFile = new File(getTesseract4OcrEngineProperties() + .getPathToUserWordsFile()); + // Workaround for a non-ASCII characters in path + // Currently works only if the user words (or output files) reside in the same directory as the input image + // Leaves only a filename in this case, otherwise - absolute path to output file + String filePath = areEqualParentDirectories(imgPath, + userWordsFile.getAbsolutePath()) + ? userWordsFile.getName() + : userWordsFile.getAbsolutePath(); + + command.add("--user-words"); + command.add(addQuotes(filePath)); + command.add("--oem"); + command.add("0"); + } + } + + /** + * Set default DPI for image. + * + * @param command result command as list of strings + */ + private void addDefaultDpi(final List command) { + command.add("-c"); + command.add("user_defined_dpi=300"); + } + + /** + * Adds path to tess data to the command list. + * + * @param command result command as list of strings + */ + private void addTessData(final List command) { + command.add("--tessdata-dir"); + command.add(addQuotes(getTessData())); + } + + /** + * Adds selected Page Segmentation Mode as parameter. + * + * @param command result command as list of strings + */ + private void addPageSegMode(final List command) { + if (getTesseract4OcrEngineProperties().getPageSegMode() != null) { + command.add("-c"); + command.add("tessedit_pageseg_mode=" + getTesseract4OcrEngineProperties().getPageSegMode()); + } + } + + /** + * Add list of selected languages concatenated to a string as parameter. 
+ * + * @param command result command as list of strings + */ + private void addLanguages(final List command) { + if (getTesseract4OcrEngineProperties().getLanguages().size() > 0) { + command.add("-l"); + command.add(getLanguagesAsString()); + } + } + + /** + * Adds path to the input image file. + * + * @param command result command as list of strings + * @param imagePath path to the input image file as string + */ + private void addInputFile(final List command, + final String imagePath) { + command.add(addQuotes(new File(imagePath).getName())); + } + + /** + * Adds path to temporary output file with result. + * + * @param command result command as list of strings + * @param outputFile output file with result + * @param outputFormat selected {@link OutputFormat} for tesseract + */ + private void addOutputFile(final List command, + final File outputFile, final OutputFormat outputFormat, + final String inputImagePath) { + String extension = outputFormat.equals(OutputFormat.HOCR) + ? ".hocr" : ".txt"; + try { + // Workaround for a non-ASCII characters in path + // Currently works only if the user words (or output files) reside in the same directory as the input image + // Leaves only a filename in this case, otherwise - absolute path to output file + String filePath = areEqualParentDirectories(inputImagePath, + outputFile.getAbsolutePath()) + ? outputFile.getName() + : outputFile.getAbsolutePath(); + String fileName = new String( + filePath.toCharArray(), 0, + filePath.indexOf(extension)); + LoggerFactory.getLogger(getClass()).info( + MessageFormatUtil.format( + Tesseract4LogMessageConstant.CREATED_TEMPORARY_FILE, + outputFile.getAbsolutePath())); + command.add(addQuotes(fileName)); + } catch (Exception e) { // NOSONAR + throw new Tesseract4OcrException(Tesseract4OcrException + .TESSERACT_FAILED); + } + } + + /** + * Surrounds given string with quotes. 
+ * + * @param value string to be wrapped into quotes + * @return wrapped string + */ + private String addQuotes(final String value) { + // choosing correct quotes for system + if (isWindows()) { + return "\"" + value + "\""; + } else { + return "'" + value + "'"; + } + } + + /** + * Preprocess given image if it is needed. + * + * @param inputImage original input image {@link java.io.File} + * @param pageNumber number of page to be OCRed + * @return path to output image as {@link java.lang.String} + * @throws Tesseract4OcrException if preprocessing cannot be done or file + * is invalid + */ + private String preprocessImage(final File inputImage, + final int pageNumber) throws Tesseract4OcrException { + String tmpFileName = TesseractOcrUtil + .getTempFilePath(UUID.randomUUID().toString(), + getExtension(inputImage)); + String path = inputImage.getAbsolutePath(); + try { + if (getTesseract4OcrEngineProperties().isPreprocessingImages()) { + Pix pix = ImagePreprocessingUtil + .preprocessImage(inputImage, pageNumber); + TesseractOcrUtil.savePixToTempPngFile(tmpFileName, pix); + if (!Files.exists(Paths.get(tmpFileName))) { + BufferedImage img = TesseractOcrUtil.convertPixToImage(pix); + if (img != null) { + TesseractOcrUtil.saveImageToTempPngFile(tmpFileName, + img); + } + } + } + if (!getTesseract4OcrEngineProperties().isPreprocessingImages() + || !Files.exists(Paths.get(tmpFileName))) { + TesseractOcrUtil.createTempFileCopy(path, tmpFileName); + } + if (Files.exists(Paths.get(tmpFileName))) { + path = tmpFileName; + } + } catch (IOException e) { + LoggerFactory.getLogger(getClass()) + .error(MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_READ_INPUT_IMAGE, + e.getMessage())); + } + return path; + } + + /** + * Check whether tesseract executable is installed on the machine and + * provided path to tesseract executable is correct. 
+ * @param execPath path to tesseract executable + * @throws Tesseract4OcrException if tesseract is not installed or + * provided path to tesseract executable is incorrect, + * i.e. running "{@link #getPathToExecutable()} --version" command failed. + */ + private void checkTesseractInstalled(String execPath) + throws Tesseract4OcrException { + try { + TesseractHelper.runCommand(execPath, + Collections.singletonList("--version")); + } catch (Tesseract4OcrException e) { + throw new Tesseract4OcrException( + Tesseract4OcrException.TESSERACT_NOT_FOUND, e); + } + } + + /** + * Gets input image file extension. + * + * @param inputImage input file + * @return file extension as a {@link java.lang.String} + */ + private String getExtension(File inputImage) { + if (inputImage != null) { + int index = inputImage.getAbsolutePath().lastIndexOf('.'); + if (index > 0) { + String extension = new String( + inputImage.getAbsolutePath().toCharArray(), index, + inputImage.getAbsolutePath().length() - index); + return extension.toLowerCase(); + } + } + return ".png"; + } + + /** + * Checks whether parent directories are equal for the passed file paths. 
+ * + * @param firstPath path to the first file + * @param secondPath path to the second file + * @return true if parent directories are equal, otherwise - false + */ + private boolean areEqualParentDirectories(final String firstPath, + final String secondPath) { + String firstParentDir = TesseractOcrUtil.getParentDirectory(firstPath); + String secondParentDir = TesseractOcrUtil + .getParentDirectory(secondPath); + return firstParentDir != null + && firstParentDir.equals(secondParentDir); + } +} diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java new file mode 100644 index 0000000..1c90c9c --- /dev/null +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java @@ -0,0 +1,299 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.io.util.MessageFormatUtil; + +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import net.sourceforge.tess4j.ITesseract; +import net.sourceforge.tess4j.TesseractException; +import org.slf4j.LoggerFactory; + +/** + * The implementation of {@link AbstractTesseract4OcrEngine} for tesseract OCR. + * + * This class provides possibilities to use features of "tesseract" + * using tess4j. + * + * Please note that this class is not thread-safe, in other words this Tesseract engine cannot + * be used for multithreaded processing. You should create one instance per thread + */ +public class Tesseract4LibOcrEngine extends AbstractTesseract4OcrEngine { + + /** + * {@link net.sourceforge.tess4j.ITesseract} Instance. + * (depends on OS type) + */ + private ITesseract tesseractInstance = null; + + /** + * Creates a new {@link Tesseract4LibOcrEngine} instance. + * + * @param tesseract4OcrEngineProperties set of properteis + */ + public Tesseract4LibOcrEngine( + final Tesseract4OcrEngineProperties tesseract4OcrEngineProperties) { + super(tesseract4OcrEngineProperties); + tesseractInstance = TesseractOcrUtil + .initializeTesseractInstance(isWindows(), null, + null, null); + } + + /** + * Gets tesseract instance. + * + * @return initialized {@link net.sourceforge.tess4j.ITesseract} instance + */ + public ITesseract getTesseractInstance() { + return tesseractInstance; + } + + /** + * Initializes instance of tesseract if it haven't been already + * initialized or it have been disposed and sets all the required + * properties. 
+ * + * @param outputFormat selected {@link OutputFormat} for tesseract + */ + public void initializeTesseract(final OutputFormat outputFormat) { + if (getTesseractInstance() == null + || TesseractOcrUtil + .isTesseractInstanceDisposed(getTesseractInstance())) { + tesseractInstance = TesseractOcrUtil + .initializeTesseractInstance(isWindows(), getTessData(), + getLanguagesAsString(), + getTesseract4OcrEngineProperties() + .getPathToUserWordsFile()); + } + getTesseractInstance() + .setTessVariable("tessedit_create_hocr", + outputFormat.equals(OutputFormat.HOCR) ? "1" : "0"); + getTesseractInstance().setTessVariable("user_defined_dpi", "300"); + if (getTesseract4OcrEngineProperties() + .getPathToUserWordsFile() != null) { + getTesseractInstance() + .setTessVariable("load_system_dawg", "0"); + getTesseractInstance() + .setTessVariable("load_freq_dawg", "0"); + getTesseractInstance() + .setTessVariable("user_words_suffix", + getTesseract4OcrEngineProperties() + .getDefaultUserWordsSuffix()); + getTesseractInstance() + .setTessVariable("user_words_file", + getTesseract4OcrEngineProperties() + .getPathToUserWordsFile()); + } + + TesseractOcrUtil.setTesseractProperties(getTesseractInstance(), + getTessData(), getLanguagesAsString(), + getTesseract4OcrEngineProperties().getPageSegMode(), + getTesseract4OcrEngineProperties().getPathToUserWordsFile()); + } + + /** + * Performs tesseract OCR using wrapper for Tesseract OCR API for the selected page + * of input image (by default 1st). + * + * Please note that list of output files is accepted instead of a single file because + * page number parameter is not respected in case of TIFF images not requiring preprocessing. 
+ * In other words, if the passed image is the TIFF image and according to the {@link Tesseract4OcrEngineProperties} + * no preprocessing is needed, each page of the TIFF image is OCRed and the number of output files in the list + * is expected to be same as number of pages in the image, otherwise, only one file is expected + * + * @param inputImage input image {@link java.io.File} + * @param outputFiles {@link java.util.List} of output files + * (one per each page) + * @param outputFormat selected {@link OutputFormat} for tesseract + * @param pageNumber number of page to be processed + */ + void doTesseractOcr(final File inputImage, + final List outputFiles, final OutputFormat outputFormat, + final int pageNumber) { + scheduledCheck(); + try { + validateLanguages(getTesseract4OcrEngineProperties() + .getLanguages()); + initializeTesseract(outputFormat); + onEvent(); + // if preprocessing is not needed and provided image is tiff, + // the image will be paginated and separate pages will be OCRed + List resultList = new ArrayList(); + if (!getTesseract4OcrEngineProperties().isPreprocessingImages() + && ImagePreprocessingUtil.isTiffImage(inputImage)) { + resultList = getOcrResultForMultiPage(inputImage, + outputFormat); + } else { + resultList.add(getOcrResultForSinglePage(inputImage, + outputFormat, pageNumber)); + } + + // list of result strings is written to separate files + // (one for each page) + for (int i = 0; i < resultList.size(); i++) { + String result = resultList.get(i); + File outputFile = i >= outputFiles.size() + ? 
null : outputFiles.get(i); + if (result != null && outputFile != null) { + try (Writer writer = new OutputStreamWriter( + new FileOutputStream(outputFile.getAbsolutePath()), + StandardCharsets.UTF_8)) { + writer.write(result); + } catch (IOException e) { + LoggerFactory.getLogger(getClass()).error( + MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_WRITE_TO_FILE, + e.getMessage())); + throw new Tesseract4OcrException( + Tesseract4OcrException.TESSERACT_FAILED); + } + } + } + } catch (Tesseract4OcrException e) { + LoggerFactory.getLogger(getClass()) + .error(e.getMessage()); + throw new Tesseract4OcrException(e.getMessage(), e); + } finally { + if (tesseractInstance != null) { + TesseractOcrUtil.disposeTesseractInstance(tesseractInstance); + } + if (getTesseract4OcrEngineProperties().getPathToUserWordsFile() + != null + && getTesseract4OcrEngineProperties().isUserWordsFileTemporary()) { + TesseractHelper.deleteFile( + getTesseract4OcrEngineProperties() + .getPathToUserWordsFile()); + } + } + } + + /** + * Gets OCR result from provided multi-page image and returns result as + * list of strings for each page. This method is used for tiff images + * when preprocessing is not needed. 
+ * + * @param inputImage input image {@link java.io.File} + * @param outputFormat selected {@link OutputFormat} for tesseract + * @return list of result string that will be written to a temporary files + * later + */ + private List getOcrResultForMultiPage(final File inputImage, + final OutputFormat outputFormat) { + List resultList = new ArrayList(); + try { + initializeTesseract(outputFormat); + TesseractOcrUtil util = new TesseractOcrUtil(); + util.initializeImagesListFromTiff(inputImage); + int numOfPages = util.getListOfPages().size(); + for (int i = 0; i < numOfPages; i++) { + String result = util.getOcrResultAsString( + getTesseractInstance(), + util.getListOfPages().get(i), + outputFormat); + resultList.add(result); + } + } catch (TesseractException e) { + String msg = MessageFormatUtil + .format(Tesseract4LogMessageConstant.TESSERACT_FAILED, + e.getMessage()); + LoggerFactory.getLogger(getClass()) + .error(msg); + throw new Tesseract4OcrException( + Tesseract4OcrException + .TESSERACT_FAILED); + } finally { + TesseractOcrUtil + .disposeTesseractInstance(getTesseractInstance()); + } + return resultList; + } + + /** + * Gets OCR result from provided single page image and preprocesses it if + * it is needed. 
+ * + * @param inputImage input image {@link java.io.File} + * @param outputFormat selected {@link OutputFormat} for tesseract + * @param pageNumber number of page to be OCRed + * @return result as string that will be written to a temporary file later + */ + private String getOcrResultForSinglePage(final File inputImage, + final OutputFormat outputFormat, + final int pageNumber) { + String result = null; + try { + // preprocess if required + if (getTesseract4OcrEngineProperties().isPreprocessingImages()) { + // preprocess and try to ocr + result = new TesseractOcrUtil().getOcrResultAsString( + getTesseractInstance(), + ImagePreprocessingUtil + .preprocessImage(inputImage, pageNumber), + outputFormat); + } + if (result == null) { + BufferedImage bufferedImage = ImagePreprocessingUtil + .readImage(inputImage); + if (bufferedImage != null) { + try { + result = new TesseractOcrUtil() + .getOcrResultAsString(getTesseractInstance(), + bufferedImage, outputFormat); + } catch (Exception e) { // NOSONAR + LoggerFactory.getLogger(getClass()) + .info(MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_PROCESS_IMAGE, + e.getMessage())); + } + } + if (result == null) { + // perform ocr using original input image + result = new TesseractOcrUtil() + .getOcrResultAsString(getTesseractInstance(), + inputImage, outputFormat); + } + } + } catch (Exception e) { // NOSONAR + LoggerFactory.getLogger(getClass()) + .error(MessageFormatUtil + .format(Tesseract4LogMessageConstant + .TESSERACT_FAILED, + e.getMessage())); + throw new Tesseract4OcrException( + Tesseract4OcrException + .TESSERACT_FAILED); + } + + return result; + } +} diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java new file mode 100644 index 0000000..90bf76b --- /dev/null +++ 
b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LogMessageConstant.java
@@ -0,0 +1,69 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tesseract4;
+
+/**
+ * Log message templates of the tesseract4 module; tokens such as {0} and
+ * {1} are positional arguments substituted when a message is formatted.
+ */
+public class Tesseract4LogMessageConstant {
+    public static final String TESSERACT_FAILED =
+            "Tesseract failed: {0}";
+    public static final String COMMAND_FAILED =
+            "Command failed: {0}";
+    public static final String CANNOT_READ_FILE =
+            "Cannot read file {0}: {1}";
+    public static final String CANNOT_OCR_INPUT_FILE =
+            "Cannot ocr input file: {0}";
+    public static final String CANNOT_USE_USER_WORDS =
+            "Cannot use custom user words: {0}";
+    public static final String CANNOT_RETRIEVE_PAGES_FROM_IMAGE =
+            "Cannot get pages from image {0}: {1}";
+    public static final String PAGE_NUMBER_IS_INCORRECT =
+            "Provided number of page ({0}) is incorrect for {1}";
+    public static final String CANNOT_DELETE_FILE =
+            "File {0} cannot be deleted: {1}";
+    public static final String CANNOT_PROCESS_IMAGE = "Cannot process "
+            + "image: {0}";
+    public static final String CANNOT_WRITE_TO_FILE =
+            "Cannot write to file {0}: {1}";
+    public static final String CREATED_TEMPORARY_FILE =
+            "Created temp file {0}";
+    public static final String CANNOT_CONVERT_IMAGE_TO_GRAYSCALE =
+            "Cannot convert to gray image with depth {0}";
+    public static final String CANNOT_BINARIZE_IMAGE =
+            "Cannot binarize image with depth {0}";
+    public static final String CANNOT_CREATE_BUFFERED_IMAGE =
+            "Cannot create a buffered image from the input image: {0}";
+    public static final String START_OCR_FOR_IMAGES =
+            "Starting ocr for {0} image(s)";
+    public static final String CANNOT_READ_INPUT_IMAGE =
+            "Cannot read input image {0}";
+    public static final String CANNOT_GET_TEMPORARY_DIRECTORY = "Cannot get "
+            + "temporary directory: {0}";
+    public static final String CANNOT_CONVERT_IMAGE_TO_PIX =
+            "Cannot convert image to pix: {0}";
+
+    private Tesseract4LogMessageConstant() {
+    }
+}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrEngineProperties.java
b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrEngineProperties.java
new file mode 100644
index 0000000..de24193
--- /dev/null
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrEngineProperties.java
@@ -0,0 +1,398 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tesseract4;
+
+import com.itextpdf.io.util.FileUtil;
+import com.itextpdf.io.util.MessageFormatUtil;
+import com.itextpdf.pdfocr.IOcrEngine;
+import com.itextpdf.pdfocr.OcrEngineProperties;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Properties that will be used by the {@link IOcrEngine}.
+ */
+public class Tesseract4OcrEngineProperties extends OcrEngineProperties {
+
+    /**
+     * Default suffix for user-word file.
+     * (e.g. name: 'eng.user-words')
+     */
+    static final String DEFAULT_USER_WORDS_SUFFIX = "user-words";
+
+    /**
+     * Default language for OCR.
+     */
+    private static final String DEFAULT_LANGUAGE = "eng";
+
+    /**
+     * Path to directory with tess data.
+     */
+    private File tessDataDir;
+
+    /**
+     * Page Segmentation Mode.
+     */
+    private Integer pageSegMode = 3;
+
+    /**
+     * "True" - if images need to be preprocessed, otherwise - false.
+     * True by default.
+     */
+    private boolean preprocessingImages = true;
+
+    /**
+     * Defines the way text is retrieved from tesseract output.
+     * Default text positioning is by lines.
+     */
+    private TextPositioning textPositioning = TextPositioning.BY_LINES;
+
+    /**
+     * Path to the file containing user words.
+     * Each word should be on a new line,
+     * file should end with a newline character.
+     */
+    private String pathToUserWordsFile = null;
+
+    /**
+     * Indicates if user words file is temporary and has to be removed.
+     */
+    private boolean isUserWordsFileTemporary = false;
+
+    /**
+     * Creates a new {@link Tesseract4OcrEngineProperties} instance.
+     */
+    public Tesseract4OcrEngineProperties() {
+    }
+
+    /**
+     * Creates a new {@link Tesseract4OcrEngineProperties} instance
+     * based on another {@link Tesseract4OcrEngineProperties} instance (copy
+     * constructor).
+     *
+     * @param other the other {@link Tesseract4OcrEngineProperties} instance
+     */
+    public Tesseract4OcrEngineProperties(Tesseract4OcrEngineProperties other) {
+        super(other);
+        this.tessDataDir = other.tessDataDir;
+        this.pageSegMode = other.pageSegMode;
+        this.preprocessingImages = other.preprocessingImages;
+        this.textPositioning = other.textPositioning;
+        this.pathToUserWordsFile = other.pathToUserWordsFile;
+        // carry the flag over as well, otherwise the copy would never
+        // report its user words file as temporary
+        this.isUserWordsFileTemporary = other.isUserWordsFileTemporary;
+    }
+
+    /**
+     * Gets default language for ocr.
+     *
+     * @return default language - "eng"
+     */
+    public final String getDefaultLanguage() {
+        return DEFAULT_LANGUAGE;
+    }
+
+    /**
+     * Gets default user words suffix.
+     *
+     * @return default suffix for user words files
+     */
+    public final String getDefaultUserWordsSuffix() {
+        return DEFAULT_USER_WORDS_SUFFIX;
+    }
+
+    /**
+     * Gets path to directory with tess data.
+     *
+     * @return path to directory with tess data
+     */
+    public final File getPathToTessData() {
+        return tessDataDir;
+    }
+
+    /**
+     * Sets path to directory with tess data.
+     *
+     * @param tessData path to train directory as {@link java.io.File}
+     * @return the {@link Tesseract4OcrEngineProperties} instance
+     * @throws Tesseract4OcrException if path to tess data directory is
+     * null or empty, or provided directory does not exist, or it is not
+     * a directory
+     */
+    public final Tesseract4OcrEngineProperties setPathToTessData(
+            final File tessData) {
+        if (tessData == null
+                || !FileUtil.directoryExists(tessData.getAbsolutePath())) {
+            throw new Tesseract4OcrException(
+                    Tesseract4OcrException
+                            .PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID);
+        }
+        this.tessDataDir = tessData;
+
+        return this;
+    }
+
+    /**
+     * Gets Page Segmentation Mode.
+     *
+     * @return psm mode as {@link java.lang.Integer}
+     */
+    public final Integer getPageSegMode() {
+        return pageSegMode;
+    }
+
+    /**
+     * Sets Page Segmentation Mode.
+     * More detailed explanation about psm modes could be found
+     * here https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#options
+     * Note that in documentation it is stated that default value of PSM is 3.
+     * This is true for tesseract executable,
+     * but for tesseract lib it is -1 which has negative impact on some documents.
+     * That's why in the code we set it explicitly to 3.
+     *
+     * @param mode psm mode as {@link java.lang.Integer}
+     * @return the {@link Tesseract4OcrEngineProperties} instance
+     */
+    public final Tesseract4OcrEngineProperties setPageSegMode(
+            final Integer mode) {
+        pageSegMode = mode;
+        return this;
+    }
+
+    /**
+     * Checks whether image preprocessing is needed.
+     *
+     * @return true if images need to be preprocessed, otherwise - false
+     */
+    public final boolean isPreprocessingImages() {
+        return preprocessingImages;
+    }
+
+    /**
+     * Sets true if image preprocessing is needed.
+     *
+     * @param preprocess true if images need to be preprocessed,
+     *                   otherwise - false
+     * @return the {@link Tesseract4OcrEngineProperties} instance
+     */
+    public final Tesseract4OcrEngineProperties setPreprocessingImages(
+            final boolean preprocess) {
+        preprocessingImages = preprocess;
+        return this;
+    }
+
+    /**
+     * Defines the way text is retrieved from tesseract output using
+     * {@link TextPositioning}.
+     *
+     * @return the way text is retrieved
+     */
+    public final TextPositioning getTextPositioning() {
+        return textPositioning;
+    }
+
+    /**
+     * Defines the way text is retrieved from tesseract output
+     * using {@link TextPositioning}.
+     *
+     * @param positioning the way text is retrieved
+     * @return the {@link Tesseract4OcrEngineProperties} instance
+     */
+    public final Tesseract4OcrEngineProperties setTextPositioning(
+            final TextPositioning positioning) {
+        textPositioning = positioning;
+        return this;
+    }
+
+    /**
+     * Using provided list of words there will be created
+     * temporary file containing words (one per line) which
+     * ends with a new line character. Train data for provided language
+     * should exist in specified tess data directory.
+     *
+     * NOTE:
+     * User words dictionary doesn't work properly in tesseract4
+     * and hidden for public usage until fix is available
+     *
+     * @param language language as {@link java.lang.String}, tessdata for
+     *                 this languages has to exist in tess data directory
+     * @param userWords {@link java.util.List} of custom words
+     * @return the {@link Tesseract4OcrEngineProperties} instance
+     * @throws Tesseract4OcrException if one of given languages wasn't specified in the
+     * list of required languages for OCR (see {@link #setLanguages(List)})
+     */
+    Tesseract4OcrEngineProperties setUserWords(final String language,
+            final List<String> userWords)
+            throws Tesseract4OcrException {
+        setPathToUserWordsFile(null);
+        if (userWords != null && userWords.size() > 0) {
+            try {
+                ByteArrayOutputStream baos = new ByteArrayOutputStream();
+                for (String word : userWords) {
+                    // encode as UTF-8: the stream overload below decodes
+                    // the bytes as UTF-8, so both sides must agree
+                    byte[] bytesWord = word.getBytes(StandardCharsets.UTF_8);
+                    baos.write(bytesWord, 0, bytesWord.length);
+                    byte[] bytesSeparator = System.lineSeparator()
+                            .getBytes(StandardCharsets.UTF_8);
+                    baos.write(bytesSeparator, 0, bytesSeparator.length);
+                }
+                InputStream inputStream = new ByteArrayInputStream(
+                        baos.toByteArray());
+                baos.close();
+                setUserWords(language, inputStream);
+            } catch (IOException e) {
+                LoggerFactory.getLogger(getClass())
+                        .warn(MessageFormatUtil.format(
+                                Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS,
+                                e.getMessage()));
+            }
+        }
+        return this;
+    }
+
+    /**
+     * Using provided input stream there will be created
+     * temporary file (with name 'language.user-words')
+     * containing words (one per line) which ends with
+     * a new line character. Train data for provided language
+     * should exist in specified tess data directory.
+     *
+     * NOTE:
+     * User words dictionary doesn't work properly in tesseract4
+     * and hidden for public usage until fix is available
+     *
+     * @param language language as {@link java.lang.String}, tessdata for
+     *                 this languages has to exist in tess data directory
+     * @param inputStream custom user words as {@link java.io.InputStream}
+     * @throws Tesseract4OcrException if one of given languages wasn't specified
+     * in the list of required languages for OCR using
+     * {@link Tesseract4OcrEngineProperties#setLanguages(List)} method
+     * @return the {@link Tesseract4OcrEngineProperties} instance
+     */
+    Tesseract4OcrEngineProperties setUserWords(final String language,
+            final InputStream inputStream) throws Tesseract4OcrException {
+        setPathToUserWordsFile(null);
+        if (!getLanguages().contains(language)) {
+            // equalsIgnoreCase is null-safe, unlike language.toLowerCase()
+            if (DEFAULT_LANGUAGE.equalsIgnoreCase(language)) {
+                List<String> languagesList = getLanguages();
+                languagesList.add(language);
+                setLanguages(languagesList);
+            } else {
+                throw new Tesseract4OcrException(
+                        Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST)
+                        .setMessageParams(language);
+            }
+        }
+        String userWordsFileName = TesseractOcrUtil.getTempFilePath(language,
+                "." + DEFAULT_USER_WORDS_SUFFIX);
+        // write with an explicit charset instead of the platform default
+        try (OutputStreamWriter writer = new OutputStreamWriter(
+                new FileOutputStream(userWordsFileName),
+                StandardCharsets.UTF_8)) {
+            Reader reader = new InputStreamReader(inputStream,
+                    StandardCharsets.UTF_8);
+            int data;
+            while ((data = reader.read()) != -1) {
+                writer.write(data);
+            }
+            writer.write(System.lineSeparator());
+            setPathToUserWordsFile(userWordsFileName, true);
+        } catch (IOException e) {
+            setPathToUserWordsFile(null);
+            LoggerFactory.getLogger(getClass())
+                    .warn(MessageFormatUtil.format(
+                            Tesseract4LogMessageConstant.CANNOT_USE_USER_WORDS,
+                            e.getMessage()));
+        }
+        return this;
+    }
+
+    /**
+     * Returns path to the user words file.
+     *
+     * NOTE:
+     * User words dictionary doesn't work properly in tesseract4
+     * and hidden for public usage until fix is available
+     *
+     * @return path to user words file as {@link java.lang.String} if it
+     * exists, otherwise - null
+     */
+    final String getPathToUserWordsFile() {
+        return pathToUserWordsFile;
+    }
+
+    /**
+     * Sets path to the user words file.
+     *
+     * NOTE:
+     * User words dictionary doesn't work properly in tesseract4
+     * and hidden for public usage until fix is available
+     *
+     * @param pathToUserWordsFile path to user words file
+     *                            as {@link java.lang.String}
+     * @return the {@link Tesseract4OcrEngineProperties} instance
+     */
+    final Tesseract4OcrEngineProperties setPathToUserWordsFile(
+            String pathToUserWordsFile) {
+        return setPathToUserWordsFile(pathToUserWordsFile, false);
+    }
+
+    /**
+     * Sets path to the user words file.
+     *
+     * @param pathToUserWordsFile path to user words file
+     *                            as {@link java.lang.String}
+     * @param isTempFile indicates if user words file is temporary and has to be removed
+     * @return the {@link Tesseract4OcrEngineProperties} instance
+     */
+    final Tesseract4OcrEngineProperties setPathToUserWordsFile(
+            String pathToUserWordsFile, boolean isTempFile) {
+        this.pathToUserWordsFile = pathToUserWordsFile;
+        this.isUserWordsFileTemporary = isTempFile;
+        return this;
+    }
+
+    /**
+     * Indicates if user words file is temporary and has to be removed.
+     *
+     * @return true if the file is temporary, otherwise false.
+     */
+    final boolean isUserWordsFileTemporary() {
+        return isUserWordsFileTemporary;
+    }
+
+}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrException.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrException.java
new file mode 100644
index 0000000..21b7845
--- /dev/null
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4OcrException.java
@@ -0,0 +1,74 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tesseract4;
+
+import com.itextpdf.pdfocr.OcrException;
+
+public class Tesseract4OcrException extends OcrException {
+    public static final String INCORRECT_INPUT_IMAGE_FORMAT =
+            "{0} format is not supported.";
+    public static final String INCORRECT_LANGUAGE =
+            "{0} does not exist in {1}";
+    public static final String LANGUAGE_IS_NOT_IN_THE_LIST =
+            "Provided list of languages doesn't contain {0} language";
+    public static final String CANNOT_READ_PROVIDED_IMAGE =
+            "Cannot read input image {0}";
+    public static final String TESSERACT_FAILED = "Tesseract failed. "
+            + "Please check provided parameters";
+    public static final String TESSERACT_LIB_NOT_INSTALLED = "Tesseract failed. "
+            + "Please ensure you have tesseract library installed";
+    public static final String TESSERACT_LIB_NOT_INSTALLED_WIN = "Tesseract failed. "
+            + "Please ensure you have latest Visual C++ Redistributable installed";
+    public static final String TESSERACT_NOT_FOUND = "Tesseract failed. "
+            + "Please check that tesseract is installed and provided path to "
+            + "tesseract executable directory is correct";
+    public static final String CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE =
+            "Cannot find path to tesseract executable.";
+    public static final String PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID =
+            "Provided path to tess data directory does not exist or it is "
+                    + "an invalid directory";
+    public static final String PATH_TO_TESS_DATA_IS_NOT_SET =
+            "Path to tess data directory cannot be null and must be set "
+                    + "to a valid directory";
+
+    /**
+     * Creates a new {@link Tesseract4OcrException} with a cause.
+     *
+     * @param msg the detail message.
+     * @param e   the cause
+     *            (which is saved for later retrieval
+     *            by {@link #getCause()} method).
+     */
+    public Tesseract4OcrException(String msg, Throwable e) {
+        super(msg, e);
+    }
+
+    /**
+     * Creates a new {@link Tesseract4OcrException} without a cause.
+     *
+     * @param msg the detail message.
+     */
+    public Tesseract4OcrException(String msg) {
+        super(msg);
+    }
+}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java
new file mode 100644
index 0000000..c700f25
--- /dev/null
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractHelper.java
@@ -0,0 +1,254 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tesseract4;
+
+import com.itextpdf.io.util.MessageFormatUtil;
+import com.itextpdf.io.util.SystemUtil;
+import com.itextpdf.pdfocr.TextInfo;
+import com.itextpdf.styledxmlparser.jsoup.Jsoup;
+import com.itextpdf.styledxmlparser.jsoup.nodes.Document;
+import com.itextpdf.styledxmlparser.jsoup.nodes.Element;
+import com.itextpdf.styledxmlparser.jsoup.select.Elements;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Helper class.
+ */
+public class TesseractHelper {
+
+    /**
+     * The logger.
+     */
+    private static final Logger LOGGER = LoggerFactory
+            .getLogger(TesseractHelper.class);
+
+    /**
+     * Creates a new {@link TesseractHelper} instance.
+     */
+    private TesseractHelper() {
+    }
+
+    /**
+     * Parses each hocr file from the provided list, retrieves text, and
+     * returns data in the format described below.
+     *
+     * @param inputFiles list of input files
+     * @param textPositioning {@link TextPositioning}
+     * @return {@link java.util.Map} where key is {@link java.lang.Integer}
+     * representing the number of the page and value is
+     * {@link java.util.List} of {@link TextInfo} elements where each
+     * {@link TextInfo} element contains a word or a line and its 4
+     * coordinates(bbox)
+     * @throws IOException if error occurred during reading one of the provided
+     * files
+     */
+    public static Map<Integer, List<TextInfo>> parseHocrFile(
+            final List<File> inputFiles,
+            final TextPositioning textPositioning)
+            throws IOException {
+        Map<Integer, List<TextInfo>> imageData =
+                new LinkedHashMap<Integer, List<TextInfo>>();
+        // loop-invariant values: built once, not once per input file
+        Pattern bboxPattern = Pattern.compile(".*bbox(\\s+\\d+){4}.*");
+        Pattern bboxCoordinatePattern = Pattern
+                .compile(
+                        ".*\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+).*");
+        List<String> searchedClasses = TextPositioning.BY_LINES
+                .equals(textPositioning)
+                ? Arrays.asList("ocr_line", "ocr_caption")
+                : Collections.singletonList("ocrx_word");
+
+        for (File inputFile : inputFiles) {
+            if (inputFile != null
+                    && Files.exists(
+                    java.nio.file.Paths
+                            .get(inputFile.getAbsolutePath()))) {
+                Document doc; // stream is closed even if parsing throws
+                try (FileInputStream in = new FileInputStream(inputFile)) {
+                    doc = Jsoup.parse(in,
+                            java.nio.charset.StandardCharsets.UTF_8.name(),
+                            inputFile.getAbsolutePath());
+                }
+                Elements pages = doc.getElementsByClass("ocr_page");
+                for (Element page : pages) {
+                    String[] pageNum = page.id().split("page_");
+                    int pageNumber = Integer
+                            .parseInt(pageNum[pageNum.length - 1]);
+                    List<TextInfo> textData = new ArrayList<TextInfo>();
+                    if (searchedClasses.size() > 0) {
+                        Elements objects = page
+                                .getElementsByClass(searchedClasses.get(0));
+                        for (int i = 1; i < searchedClasses.size(); i++) {
+                            Elements foundElements = page
+                                    .getElementsByClass(
+                                            searchedClasses.get(i));
+                            for (int j = 0; j < foundElements.size(); j++) {
+                                objects.add(foundElements.get(j));
+                            }
+                        }
+                        for (Element obj : objects) {
+                            String value = obj.attr("title");
+                            Matcher bboxMatcher = bboxPattern.matcher(value);
+                            if (bboxMatcher.matches()) {
+                                Matcher bboxCoordinateMatcher =
+                                        bboxCoordinatePattern
+                                                .matcher(bboxMatcher.group());
+                                if (bboxCoordinateMatcher.matches()) {
+                                    List<Float> coordinates =
+                                            new ArrayList<Float>();
+                                    for (int i = 0; i < 4; i++) {
+                                        String coord = bboxCoordinateMatcher
+                                                .group(i + 1);
+                                        coordinates
+                                                .add(Float.parseFloat(coord));
+                                    }
+
+                                    textData.add(new TextInfo(obj.text(),
+                                            coordinates));
+                                }
+                            }
+                        }
+                    }
+                    if (textData.size() > 0) {
+                        if (imageData.containsKey(pageNumber)) {
+                            pageNumber = Collections.max(imageData.keySet())
+                                    + 1;
+                        }
+                        imageData.put(pageNumber, textData);
+                    }
+                }
+            }
+        }
+        return imageData;
+    }
+
+    /**
+     * Deletes file using provided path.
+     *
+     * @param pathToFile path to the file to be deleted
+     */
+    static void deleteFile(final String pathToFile) {
+        try {
+            if (pathToFile != null && !pathToFile.isEmpty()
+                    && Files.exists(java.nio.file.Paths.get(pathToFile))) {
+                Files.delete(java.nio.file.Paths.get(pathToFile));
+            }
+        } catch (IOException | SecurityException e) {
+            LOGGER.info(MessageFormatUtil.format(
+                    Tesseract4LogMessageConstant.CANNOT_DELETE_FILE,
+                    pathToFile,
+                    e.getMessage()));
+        }
+    }
+
+    /**
+     * Reads from text file to string.
+     *
+     * @param txtFile input {@link java.io.File} to be read
+     * @return result {@link java.lang.String} from provided text file
+     */
+    static String readTxtFile(final File txtFile) {
+        String content = null;
+        try {
+            content = new String(
+                    Files.readAllBytes(txtFile.toPath()),
+                    StandardCharsets.UTF_8);
+        } catch (IOException e) {
+            LOGGER.error(MessageFormatUtil.format(
+                    Tesseract4LogMessageConstant.CANNOT_READ_FILE,
+                    txtFile.getAbsolutePath(),
+                    e.getMessage()));
+        }
+        return content;
+    }
+
+    /**
+     * Writes provided {@link java.lang.String} to text file using
+     * provided path.
+     *
+     * @param path path as {@link java.lang.String} to file to be created
+     * @param data text data in required format as {@link java.lang.String}
+     */
+    static void writeToTextFile(final String path,
+            final String data) {
+        try (Writer writer = new OutputStreamWriter(new FileOutputStream(path),
+                StandardCharsets.UTF_8)) {
+            writer.write(data);
+        } catch (IOException e) {
+            LOGGER.error(MessageFormatUtil.format(
+                    Tesseract4LogMessageConstant.CANNOT_WRITE_TO_FILE,
+                    path,
+                    e.getMessage()));
+        }
+    }
+
+    /**
+     * Runs given command.
+     *
+     * @param execPath path to the executable
+     * @param paramsList {@link java.util.List} of command line arguments
+     * @throws Tesseract4OcrException if provided command failed
+     */
+    static void runCommand(final String execPath,
+            final List<String> paramsList) throws Tesseract4OcrException {
+        try {
+            String params = String.join(" ", paramsList);
+            boolean cmdSucceeded = SystemUtil
+                    .runProcessAndWait(execPath, params);
+            if (!cmdSucceeded) {
+                LOGGER.error(MessageFormatUtil
+                        .format(Tesseract4LogMessageConstant.COMMAND_FAILED,
+                                execPath + " " + params));
+                throw new Tesseract4OcrException(
+                        Tesseract4OcrException.TESSERACT_FAILED);
+            }
+        } catch (IOException | InterruptedException e) { // NOSONAR
+            if (e instanceof InterruptedException) {
+                Thread.currentThread().interrupt(); // keep interrupt status
+            }
+            LOGGER.error(MessageFormatUtil
+                    .format(Tesseract4LogMessageConstant.COMMAND_FAILED,
+                            e.getMessage()));
+            throw new Tesseract4OcrException(
+                    Tesseract4OcrException.TESSERACT_FAILED, e);
+        }
+    }
+}
diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtil.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtil.java
new file mode 100644
index 0000000..ffca409
--- /dev/null
+++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtil.java
@@ -0,0 +1,574 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.io.util.MessageFormatUtil; + +import com.ochafik.lang.jnaerator.runtime.NativeSize; +import com.ochafik.lang.jnaerator.runtime.NativeSizeByReference; +import com.sun.jna.ptr.PointerByReference; +import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import javax.imageio.ImageIO; +import net.sourceforge.lept4j.ILeptonica; +import net.sourceforge.lept4j.Leptonica; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.tess4j.ITesseract; +import net.sourceforge.tess4j.Tesseract; +import net.sourceforge.tess4j.Tesseract1; +import net.sourceforge.tess4j.TesseractException; +import org.apache.commons.imaging.ImageReadException; +import org.apache.commons.imaging.Imaging; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Utilities class to work with tesseract command line tool and image + * preprocessing using {@link net.sourceforge.lept4j.ILeptonica}. + * These all methods have to be ported to .Net manually. + */ +class TesseractOcrUtil { + + /** + * The logger. 
+ */ + private static final Logger LOGGER = LoggerFactory + .getLogger(TesseractOcrUtil.class); + + /** + * List of pages of the image that is being processed. + */ + private List imagePages = + Collections.emptyList(); + + /** + * Creates a new {@link TesseractOcrUtil} instance. + */ + TesseractOcrUtil() { + } + + /** + * Reads required page from provided tiff image. + * + * @param inputFile input image as {@link java.io.File} + * @param pageNumber number of page + * @return result {@link net.sourceforge.lept4j.Pix} object created from + * given image + */ + static Pix readPixPageFromTiff(final File inputFile, + final int pageNumber) { + Pix pix = null; + try { + BufferedImage img = TesseractOcrUtil + .getImagePage(inputFile, pageNumber); + pix = convertImageToPix(img); + } catch (IOException e) { + LOGGER.error(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, + e.getMessage())); + } + // return required page to be preprocessed + return pix; + } + + /** + * Performs default image preprocessing. + * It includes the following actions: + * converting to grayscale, + * thresholding. + * + * @param pix {@link net.sourceforge.lept4j.Pix} object to be processed + * @return preprocessed {@link net.sourceforge.lept4j.Pix} object + */ + static Pix preprocessPix(Pix pix) { + pix = convertToGrayscale(pix); + pix = otsuImageThresholding(pix); + return pix; + } + + /** + * Converts Leptonica {@link net.sourceforge.lept4j.Pix} to grayscale. + * In .Net image is converted only if this is 32bpp image. In java image is + * converted anyway using different Leptonica methods depending on + * image depth. 
+ * + * @param pix {@link net.sourceforge.lept4j.Pix} object to be processed + * @return preprocessed {@link net.sourceforge.lept4j.Pix} object + */ + static Pix convertToGrayscale(final Pix pix) { + Leptonica instance = Leptonica.INSTANCE; + if (pix != null) { + int depth = instance.pixGetDepth(pix); + + if (depth == 32) { + return instance.pixConvertRGBToLuminance(pix); + } else { + return instance.pixRemoveColormap(pix, + instance.REMOVE_CMAP_TO_GRAYSCALE); + } + } else { + return pix; + } + } + + /** + * Performs Leptonica Otsu adaptive image thresholding using + * {@link net.sourceforge.lept4j.Leptonica#pixOtsuAdaptiveThreshold} + * method. + * + * @param pix {@link net.sourceforge.lept4j.Pix} object to be processed + * @return {@link net.sourceforge.lept4j.Pix} object after thresholding + */ + static Pix otsuImageThresholding(final Pix pix) { + if (pix != null) { + Pix thresholdPix = null; + if (pix.d == 8) { + PointerByReference pointer = new PointerByReference(); + Leptonica.INSTANCE + .pixOtsuAdaptiveThreshold(pix, pix.w, pix.h, + 0, 0, 0, + null, pointer); + thresholdPix = new Pix(pointer.getValue()); + if (thresholdPix.w > 0 && thresholdPix.h > 0) { + // destroying original pix + destroyPix(pix); + return thresholdPix; + } else { + LOGGER.info(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_BINARIZE_IMAGE, + pix.d)); + // destroying created PointerByReference object + destroyPix(thresholdPix); + return pix; + } + } else { + LOGGER.info(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_BINARIZE_IMAGE, + pix.d)); + return pix; + } + } else { + return pix; + } + } + + /** + * Destroys {@link net.sourceforge.lept4j.Pix} object. + * + * @param pix {@link net.sourceforge.lept4j.Pix} object to be destroyed + */ + static void destroyPix(Pix pix) { + if (pix != null) { + Leptonica.INSTANCE.lept_free(pix.getPointer()); + } + } + + /** + * Sets tesseract properties. 
+ * The following properties are set in this method: + * In java: path to tess data, languages, psm + * In .Net: psm + * This means that other properties have been set during the + * initialization of tesseract instance previously or tesseract library + * doesn't provide such possibilities in api for .Net or java. + * The page segmentation mode is only applied when it is not {@code null}. + * The OCR engine mode is set to 0 when a user words file is provided + * and to 3 otherwise. + * + * @param tesseractInstance {@link net.sourceforge.tess4j.ITesseract} object + * @param tessData path to tess data directory + * @param languages list of languages in required format + * as {@link java.lang.String} + * @param pageSegMode page segmentation mode {@link java.lang.Integer}, + * may be {@code null} + * @param userWordsFilePath path to a temporary file with user words, + * may be {@code null} + */ + static void setTesseractProperties( + final ITesseract tesseractInstance, + final String tessData, final String languages, + final Integer pageSegMode, final String userWordsFilePath) { + final int ocrEngineMode; + if (userWordsFilePath == null) { + ocrEngineMode = 3; + } else { + ocrEngineMode = 0; + } + + tesseractInstance.setDatapath(tessData); + tesseractInstance.setLanguage(languages); + if (null != pageSegMode) { + tesseractInstance.setPageSegMode(pageSegMode); + } + tesseractInstance.setOcrEngineMode(ocrEngineMode); + } + + /** + * Creates tesseract instance with parameters. + * Method is used to initialize tesseract instance with parameters if it + * hasn't been initialized yet. + * In this method in java 'tessData', 'languages' and 'userWordsFilePath' + * properties are unused as they will be set using setters in + * {@link #setTesseractProperties} method. In .Net all these properties + * are needed to be provided in tesseract constructor in order to + * initialize tesseract instance. Thus, tesseract initialization takes + * place in {@link Tesseract4LibOcrEngine#Tesseract4LibOcrEngine} constructor in + * java, but in .Net it happens only after all properties are validated, + * i.e. just before OCR process. 
+ * + * @param isWindows true if current os is windows + * @param tessData path to tess data directory (unused in java) + * @param languages list of languages in required format as + * {@link java.lang.String} (unused in java) + * @param userWordsFilePath path to a temporary file with user words + * (unused in java) + * @return new {@link net.sourceforge.tess4j.ITesseract} object: + * {@code Tesseract1} on windows, {@code Tesseract} otherwise + */ + static ITesseract initializeTesseractInstance(final boolean isWindows, + final String tessData, final String languages, + final String userWordsFilePath) { + try { + return isWindows ? new Tesseract1() : new Tesseract(); + } catch (LinkageError linkageError) { + if (isWindows) { + throw new Tesseract4OcrException( + Tesseract4OcrException.TESSERACT_LIB_NOT_INSTALLED_WIN, + linkageError); + } + throw new Tesseract4OcrException( + Tesseract4OcrException.TESSERACT_LIB_NOT_INSTALLED, + linkageError); + } + } + + /** + * Tells whether the tesseract instance has been already disposed. + * (used in .net version; the java version always answers false) + * @param tesseractInstance {@link net.sourceforge.tess4j.ITesseract} + * object to check + * @return always {@code false} in java + */ + static boolean isTesseractInstanceDisposed( + final ITesseract tesseractInstance) { + return false; + } + + /** + * Disposes {@link net.sourceforge.tess4j.ITesseract} instance. + * (used in .net version; the java version does nothing) + * @param tesseractInstance {@link net.sourceforge.tess4j.ITesseract} + * object to dispose + */ + static void disposeTesseractInstance( + final ITesseract tesseractInstance) { + } + + /** + * Converts {@link java.awt.image.BufferedImage} to + * {@link net.sourceforge.lept4j.Pix}. 
+ * The image is encoded as PNG in memory and the encoded bytes are handed + * to Leptonica. + * + * @param bufferedImage input image as {@link java.awt.image.BufferedImage}, + * may be {@code null} + * @return converted {@link net.sourceforge.lept4j.Pix} object, or + * {@code null} if the input image is {@code null} + * @throws IOException if it's not possible to convert + */ + static Pix convertImageToPix( + final BufferedImage bufferedImage) + throws IOException { + if (bufferedImage == null) { + return null; + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ImageIO.write(bufferedImage, "png", baos); + + /* toByteArray() copies the whole buffer, so it is called only once */ + byte[] pngBytes = baos.toByteArray(); + ByteBuffer byteBuffer = ByteBuffer.wrap(pngBytes); + NativeSize nativeSize = new NativeSize(pngBytes.length); + return Leptonica.INSTANCE.pixReadMem(byteBuffer, nativeSize); + } + + /** + * Converts Leptonica {@link net.sourceforge.lept4j.Pix} + * to {@link java.awt.image.BufferedImage} with + * {@link net.sourceforge.lept4j.ILeptonica#IFF_PNG} image format. + * The native buffer filled by Leptonica is always released before + * returning. + * + * @param pix input {@link net.sourceforge.lept4j.Pix} object, + * may be {@code null} + * @return result {@link java.awt.image.BufferedImage} object, or + * {@code null} if the input is {@code null}, Leptonica produced no data + * or the data could not be decoded + * @throws IOException if it is not possible to convert + */ + static BufferedImage convertPixToImage(final Pix pix) + throws IOException { + if (pix == null) { + return null; + } + Leptonica instance = Leptonica.INSTANCE; + BufferedImage bi = null; + PointerByReference pdata = new PointerByReference(); + try { + NativeSizeByReference psize = new NativeSizeByReference(); + instance.pixWriteMem(pdata, psize, pix, ILeptonica.IFF_PNG); + /* no buffer is available if the image could not be encoded */ + if (pdata.getValue() != null) { + byte[] b = pdata.getValue().getByteArray(0, + psize.getValue().intValue()); + try (InputStream in = new ByteArrayInputStream(b)) { + bi = ImageIO.read(in); + } + } + } finally { + /* only release a buffer that was actually allocated */ + if (pdata.getValue() != null) { + instance.lept_free(pdata.getValue()); + } + } + return bi; + } + + /** + * Gets path to temp file in current system temporary directory. 
+ * + * @return path to temp file in the system temporary directory + */ + static String getTempFilePath(String name, String suffix) { + String tmpFileName = name + suffix; + try { + Path tempPath = Files.createTempFile(name, suffix); + tmpFileName = tempPath.toString(); + } catch (IOException | IllegalArgumentException e) { + LOGGER.info(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_GET_TEMPORARY_DIRECTORY, + e.getMessage())); + } + return tmpFileName; + } + + /** + * Gets requested image page from the provided image. + * + * @param inputFile input image + * @param page requested image page + * @return requested image page as a {@link java.awt.image.BufferedImage} + */ + static BufferedImage getImagePage(File inputFile, int page) + { + BufferedImage img = null; + try (InputStream is = + new FileInputStream(inputFile.getAbsolutePath())) { + List pages = Imaging.getAllBufferedImages(is, + inputFile.getAbsolutePath()); + if (page >= pages.size()) { + LOGGER.warn(MessageFormatUtil.format( + Tesseract4LogMessageConstant.PAGE_NUMBER_IS_INCORRECT, + page, + inputFile.getAbsolutePath())); + return null; + } + img = pages.get(page); + } catch (ImageReadException | IOException e) { + LOGGER.error(MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_RETRIEVE_PAGES_FROM_IMAGE, + inputFile.getAbsolutePath(), + e.getMessage())); + } + return img; + } + + /** + * Saves passed {@link java.awt.image.BufferedImage} to given path + * + * @param tmpFileName provided file path to save the + * {@link java.awt.image.BufferedImage} + * @param image provided {@link java.awt.image.BufferedImage} to be saved + */ + static void saveImageToTempPngFile(final String tmpFileName, + final BufferedImage image) { + if (image != null) { + try { + ImageIO.write(image, "png", new File(tmpFileName)); + } catch (Exception e) { // NOSONAR + LOGGER.error(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE, + e.getMessage())); + } + } + } + + /** + 
* Saves passed {@link net.sourceforge.lept4j.Pix} to given path + * as a PNG file. Nothing is written for a {@code null} object and any + * exception raised while writing is logged instead of being rethrown. + * + * @param tmpFileName provided file path to save the + * {@link net.sourceforge.lept4j.Pix} + * @param pix provided {@link net.sourceforge.lept4j.Pix} to be saved + */ + static void savePixToTempPngFile(final String tmpFileName, + final Pix pix) { + if (pix != null) { + try { + /* the third argument of pixWritePng is the gamma value, not an image format; 0.0 means that no gamma is defined */ + Leptonica.INSTANCE.pixWritePng(tmpFileName, pix, 0.0f); + } catch (Exception e) { // NOSONAR + LOGGER.info(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE, + e.getMessage())); + } + } + } + + /** + * Create temporary copy of input file to avoid issue with tesseract and + * different encodings in the path. + * An already existing destination file is replaced. + * + * @param src path to the source image + * @param dst destination path + * @throws IOException if the file cannot be copied + */ + static void createTempFileCopy(final String src, final String dst) + throws IOException { + Files.copy(Paths.get(src), Paths.get(dst), + StandardCopyOption.REPLACE_EXISTING); + } + + /** + * Returns parent directory for the passed path. + * + * @param path path to file + * @return parent directory where the file is located, as returned by + * {@link java.io.File#getParent()} + */ + static String getParentDirectory(final String path) { + return new File(path).getParent(); + } + + /** + * Retrieves list of pages from provided image as list of + * {@link java.awt.image.BufferedImage}, one per page and updates + * this list for the image using {@link #setListOfPages} method. 
+ * + * @param inputFile input image {@link java.io.File} + */ + void initializeImagesListFromTiff( + final File inputFile) { + try (InputStream is = + new FileInputStream(inputFile.getAbsolutePath())) { + setListOfPages(Imaging + .getAllBufferedImages(is, + inputFile.getAbsolutePath())); + } catch (Exception e) { // NOSONAR + LOGGER.error(MessageFormatUtil.format( + Tesseract4LogMessageConstant + .CANNOT_RETRIEVE_PAGES_FROM_IMAGE, + inputFile.getAbsolutePath(), + e.getMessage())); + } + } + + /** + * Gets list of page of processing image as list of + * {@link java.awt.image.BufferedImage}, one per page. + * + * @return result {@link java.util.List} of pages + */ + List getListOfPages() { + return new ArrayList(imagePages); + } + + /** + * Sets list of page of processing image as list of + * {@link java.awt.image.BufferedImage}, one per page. + * + * @param listOfPages list of {@link java.awt.image.BufferedImage} for + * each page. + */ + void setListOfPages(final List listOfPages) { + imagePages = Collections.unmodifiableList(listOfPages); + } + + /** + * Performs ocr for the provided image + * and returns result as string in required format. + * ({@link OutputFormat} is used in .Net version, + * in java output format should already be set) + * @param tesseractInstance {@link net.sourceforge.tess4j.ITesseract} + * object to perform OCR + * @param image input {@link java.awt.image.BufferedImage} to be processed + * @param outputFormat selected {@link OutputFormat} for tesseract + * @return result as {@link java.lang.String} in required format + * @throws TesseractException if tesseract recognition failed + */ + String getOcrResultAsString( + final ITesseract tesseractInstance, + final BufferedImage image, final OutputFormat outputFormat) + throws TesseractException { + if (image != null) { + return tesseractInstance.doOCR(image); + } else { + return null; + } + } + + /** + * Performs ocr for the provided image + * and returns result as string in required format. 
+ * ({@link OutputFormat} is used in .Net version, in java output format + * should already be set) + * + * @param tesseractInstance {@link net.sourceforge.tess4j.ITesseract} + * object to perform OCR + * @param image input image as {@link java.io.File} to be + * processed + * @param outputFormat selected {@link OutputFormat} for tesseract + * (not used by this java implementation) + * @return result as {@link java.lang.String} in required format, or + * {@code null} if the file is {@code null} + * @throws TesseractException if tesseract recognition failed + */ + String getOcrResultAsString( + final ITesseract tesseractInstance, + final File image, final OutputFormat outputFormat) + throws TesseractException { + if (image == null) { + return null; + } + return tesseractInstance.doOCR(image); + } + + /** + * Runs OCR on the provided {@link net.sourceforge.lept4j.Pix}: the object + * is converted with {@link #convertPixToImage} and the resulting image is + * passed to the {@link java.awt.image.BufferedImage} overload. + * ({@link OutputFormat} is used in .Net version, in java output format + * should already be set) + * + * @param tesseractInstance {@link net.sourceforge.tess4j.ITesseract} + * object to perform OCR + * @param pix input image as {@link net.sourceforge.lept4j.Pix} to be + * processed + * @param outputFormat selected {@link OutputFormat} for tesseract + * @return result as {@link java.lang.String} in required format, or + * {@code null} if the input is {@code null} + * @throws TesseractException if tesseract recognition failed + * @throws IOException if it is not possible to convert input image + */ + String getOcrResultAsString( + final ITesseract tesseractInstance, + final Pix pix, final OutputFormat outputFormat) + throws TesseractException, IOException { + if (pix == null) { + return null; + } + BufferedImage convertedImage = convertPixToImage(pix); + return getOcrResultAsString(tesseractInstance, + convertedImage, outputFormat); + } +} diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java new file mode 100644 index 0000000..c8edb07 --- /dev/null +++ 
b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TextPositioning.java @@ -0,0 +1,43 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ +package com.itextpdf.pdfocr.tesseract4; + +/** + * Enumeration of the possible types of text positioning. + * It is used when the selected Reader is able to process + * the text by lines or by words and to return coordinates for the + * selected type of item. + * For tesseract this value makes sense only if the selected + * {@link OutputFormat} is {@link OutputFormat#HOCR}. + */ +public enum TextPositioning { + /** + * Text will be located by lines retrieved from the hocr file. + * (default value) + */ + BY_LINES, + /** + * Text will be located by words retrieved from the hocr file. 
+ */ + BY_WORDS +} diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/events/PdfOcrTesseract4Event.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/events/PdfOcrTesseract4Event.java new file mode 100644 index 0000000..40dcf67 --- /dev/null +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/events/PdfOcrTesseract4Event.java @@ -0,0 +1,61 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ +package com.itextpdf.pdfocr.tesseract4.events; + +import com.itextpdf.kernel.counter.event.IGenericEvent; + +/** + * Event describing a pdfOCR Tesseract 4 operation; only the predefined constants exist. + */ +public class PdfOcrTesseract4Event implements IGenericEvent { + + public static final PdfOcrTesseract4Event TESSERACT4_IMAGE_OCR = new PdfOcrTesseract4Event("tesseract4-image-ocr"); + public static final PdfOcrTesseract4Event TESSERACT4_IMAGE_TO_PDF = new PdfOcrTesseract4Event("tesseract4-image-to-pdf"); + public static final PdfOcrTesseract4Event TESSERACT4_IMAGE_TO_PDFA = new PdfOcrTesseract4Event("tesseract4-image-to-pdfa"); + + private static final String PDF_OCR_TESSERACT4_ORIGIN_ID = "com.itextpdf.pdfocr.tesseract4"; + + private final String subtype; + + private PdfOcrTesseract4Event(String subtype) { + this.subtype = subtype; + } + + /** + * Gets the type of the event. + * @return the event type: {@code "pdfOcr-"} followed by the subtype of this event + */ + @Override + public String getEventType() { + return "pdfOcr-" + subtype; + } + + /** + * Gets the origin id of the event. + * @return the origin id, which is the same for every event of this class + */ + @Override + public String getOriginId() { + return PDF_OCR_TESSERACT4_ORIGIN_ID; + } +} diff --git a/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/package-info.java b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/package-info.java new file mode 100644 index 0000000..bfc5482 --- /dev/null +++ b/pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/package-info.java @@ -0,0 +1 @@ +package com.itextpdf.pdfocr.tesseract4; diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/metainfo/TestMetaInfo.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/metainfo/TestMetaInfo.java new file mode 100644 index 0000000..1f10d9f --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/metainfo/TestMetaInfo.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. 
+ For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ +package com.itextpdf.metainfo; + +import com.itextpdf.kernel.counter.event.IMetaInfo; + +/** + * This class is used for test purposes. + * Please be aware that it is deliberately placed in the com.itextpdf.metainfo package, + * so that it belongs to neither the com.itextpdf.pdfocr nor the com.itextpdf.pdfocr.tesseract4 package. + */ +public class TestMetaInfo implements IMetaInfo { + private static final long serialVersionUID = 5521060335175170386L; +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java new file mode 100644 index 0000000..394e3d6 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/IntegrationTestHelper.java @@ -0,0 +1,525 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr; + +import com.itextpdf.io.font.PdfEncodings; +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.font.PdfFont; +import com.itextpdf.kernel.pdf.DocumentProperties; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfName; +import com.itextpdf.kernel.pdf.PdfReader; +import com.itextpdf.kernel.pdf.PdfWriter; +import com.itextpdf.kernel.pdf.WriterProperties; +import com.itextpdf.kernel.pdf.canvas.CanvasTag; +import com.itextpdf.kernel.pdf.canvas.parser.EventType; +import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor; +import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData; +import com.itextpdf.kernel.pdf.canvas.parser.data.ImageRenderInfo; +import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo; +import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextChunkLocation; +import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy; +import com.itextpdf.kernel.pdf.canvas.parser.listener.TextChunk; +import com.itextpdf.layout.font.FontProvider; +import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine; +import com.itextpdf.pdfocr.tesseract4.Tesseract4ExecutableOcrEngine; +import com.itextpdf.pdfocr.tesseract4.Tesseract4LibOcrEngine; +import 
com.itextpdf.pdfocr.tesseract4.Tesseract4LogMessageConstant; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; +import com.itextpdf.test.ExtendedITextTest; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.junit.Assert; +import org.junit.experimental.categories.Category; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Category(IntegrationTest.class) +public class IntegrationTestHelper extends ExtendedITextTest { + + private static final Logger LOGGER = LoggerFactory + .getLogger(IntegrationTestHelper.class); + + // directory with test files + public static final String TEST_DIRECTORY = "./src/test/resources/com/itextpdf/pdfocr/"; + private static final String TARGET_FOLDER = "./target/test/resources/com/itextpdf/pdfocr/"; + + // directory with trained data for tests + protected static final String LANG_TESS_DATA_DIRECTORY = TEST_DIRECTORY + "tessdata"; + // directory with trained data for tests + protected static final String SCRIPT_TESS_DATA_DIRECTORY = TEST_DIRECTORY + "tessdata" + File.separator + "script"; + // directory with test image files + protected static final String TEST_IMAGES_DIRECTORY = TEST_DIRECTORY + "images" + File.separator; + // directory with fonts + protected static final String TEST_FONTS_DIRECTORY = TEST_DIRECTORY + "fonts" + File.separator; + // directory with fonts + protected static final String TEST_DOCUMENTS_DIRECTORY = TEST_DIRECTORY + "documents" + File.separator; + + // path to font for hindi + protected static final String NOTO_SANS_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSans-Regular.ttf"; + // path to font for japanese + protected static final String KOSUGI_FONT_PATH = 
TEST_FONTS_DIRECTORY + "Kosugi-Regular.ttf"; + // path to font for chinese + protected static final String NOTO_SANS_SC_FONT_PATH = TEST_FONTS_DIRECTORY + "NotoSansSC-Regular.otf"; + // path to font for arabic + protected static final String CAIRO_FONT_PATH = TEST_FONTS_DIRECTORY + "Cairo-Regular.ttf"; + // path to font for georgian + protected static final String FREE_SANS_FONT_PATH = TEST_FONTS_DIRECTORY + "FreeSans.ttf"; + + protected static final Map FONT_PATH_TO_FONT_NAME_MAP; + + static { + Map fontPathToNameMap = new HashMap<>(); + fontPathToNameMap.put(NOTO_SANS_FONT_PATH, "NotoSans"); + fontPathToNameMap.put(KOSUGI_FONT_PATH, "Kosugi"); + fontPathToNameMap.put(NOTO_SANS_SC_FONT_PATH, "NotoSansSC"); + fontPathToNameMap.put(CAIRO_FONT_PATH, "Cairo"); + fontPathToNameMap.put(FREE_SANS_FONT_PATH, "FreeSans"); + FONT_PATH_TO_FONT_NAME_MAP = Collections.unmodifiableMap(fontPathToNameMap); + } + + public enum ReaderType { + LIB, + EXECUTABLE + } + + private static Tesseract4LibOcrEngine tesseractLibReader = null; + private static Tesseract4ExecutableOcrEngine tesseractExecutableReader = null; + + public IntegrationTestHelper() { + Tesseract4OcrEngineProperties ocrEngineProperties = + new Tesseract4OcrEngineProperties(); + ocrEngineProperties.setPathToTessData(getTessDataDirectory()); + tesseractLibReader = new Tesseract4LibOcrEngine(ocrEngineProperties); + tesseractExecutableReader = new Tesseract4ExecutableOcrEngine( + getTesseractDirectory(), ocrEngineProperties); + } + + protected static AbstractTesseract4OcrEngine getTesseractReader(ReaderType type) { + if (type.equals(ReaderType.LIB)) { + return tesseractLibReader; + } else { + return tesseractExecutableReader; + } + } + + protected static Tesseract4LibOcrEngine getTesseract4LibOcrEngine() { + return tesseractLibReader; + } + + protected static String getTesseractDirectory() { + String tesseractDir = System.getProperty("tesseractDir"); + String os = System.getProperty("os.name") == null + ? 
System.getProperty("OS") : System.getProperty("os.name"); + return os.toLowerCase().contains("win") && tesseractDir != null + && !tesseractDir.isEmpty() + ? tesseractDir + "\\tesseract.exe" : "tesseract"; + } + + /** + * Returns target directory (because target/test could not exist). + */ + public static String getTargetDirectory() { + if (!Files.exists(java.nio.file.Paths.get(TARGET_FOLDER))) { + createDestinationFolder(TARGET_FOLDER); + } + return TARGET_FOLDER; + } + + protected static File getTessDataDirectory() { + return new File(LANG_TESS_DATA_DIRECTORY); + } + + /** + * Retrieve text from specified page from given PDF document. + */ + protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, + File file, int page, List languages, List fonts) { + String result = null; + String pdfPath = null; + try { + pdfPath = getTargetDirectory() + getImageName(file.getAbsolutePath(), languages) + ".pdf"; + doOcrAndSavePdfToPath(tesseractReader, file.getAbsolutePath(), + pdfPath, languages, fonts); + result = getTextFromPdfLayer(pdfPath, null, page); + } catch (IOException e) { + LOGGER.error(e.getMessage()); + } + + return result; + } + + /** + * Retrieve text from specified page from given PDF document. + */ + protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, + File file, int page, List languages, String fontPath) { + return getTextFromPdf(tesseractReader, file, page, languages, + Collections.singletonList(fontPath)); + } + + /** + * Retrieve text from the first page of given PDF document setting font. + */ + protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, + List languages, String fontPath) { + return getTextFromPdf(tesseractReader, file, 1, languages, fontPath); + } + + /** + * Retrieve text from the first page of given PDF document. 
+ */ + protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, + List languages) { + return getTextFromPdf(tesseractReader, file, 1, languages, + new ArrayList()); + } + + /** + * Retrieve text from the required page of given PDF document. + * No fonts are passed (an empty font list is used). + */ + protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file, int page, + List languages) { + return getTextFromPdf(tesseractReader, file, page, languages, new ArrayList()); + } + + /** + * Retrieve text from the first page of given PDF document. + * Neither languages nor fonts are passed. + */ + protected String getTextFromPdf(AbstractTesseract4OcrEngine tesseractReader, File file) { + return getTextFromPdf(tesseractReader, file, 1, null, new ArrayList()); + } + + /** + * Get text from layer specified by name from page. + * The document is closed even if processing of the page content fails. + * + * @param pdfPath path to the PDF document to read + * @param layerName name passed to the {@link ExtractionStrategy} + * @param page number of the page to process + * @param useActualText value passed to + * {@code ExtractionStrategy#setUseActualText} + * @return resultant text of the extraction strategy + * @throws IOException if the document cannot be read + */ + protected String getTextFromPdfLayer(String pdfPath, String layerName, + int page, boolean useActualText) throws IOException { + ExtractionStrategy textExtractionStrategy = new ExtractionStrategy( + layerName); + textExtractionStrategy.setUseActualText(useActualText); + + PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath), + new DocumentProperties().setEventCountingMetaInfo(new PdfOcrMetaInfo())); + try { + PdfCanvasProcessor processor = new PdfCanvasProcessor( + textExtractionStrategy); + processor.processPageContent(pdfDocument.getPage(page)); + } finally { + /* release the document even when the page cannot be processed */ + pdfDocument.close(); + } + return textExtractionStrategy.getResultantText(); + } + + /** + * Get text from layer specified by name from page. + * Actual text is not used. + */ + protected String getTextFromPdfLayer(String pdfPath, String layerName, + int page) throws IOException { + return getTextFromPdfLayer(pdfPath, layerName, page, false); + } + + /** + * Get text from layer specified by name from page + * removing unnecessary space that were added after each glyph in + * {@link LocationTextExtractionStrategy#getResultantText()}. 
+ */ + protected String getTextFromPdfLayerUsingActualText(String pdfPath, + String layerName, int page) throws IOException { + return getTextFromPdfLayer(pdfPath, layerName, page, true) + .replace(" ", ""); + } + + /** + * Perform OCR using provided path to image (imgPath), + * save to file and get text from file. + */ + protected String getRecognizedTextFromTextFile( + AbstractTesseract4OcrEngine tesseractReader, String input, + List languages) { + String result = null; + String txtPath = null; + try { + txtPath = getTargetDirectory() + + getImageName(input, languages) + ".txt"; + doOcrAndSaveToTextFile(tesseractReader, input, txtPath, languages); + result = getTextFromTextFile(new File(txtPath)); + } catch (Exception e) { + LOGGER.error(e.getMessage()); + } + + return result; + } + + /** + * Perform OCR using provided path to image (imgPath), + * save to file and get text from file. + */ + protected String getRecognizedTextFromTextFile( + AbstractTesseract4OcrEngine tesseractReader, String input) { + return getRecognizedTextFromTextFile(tesseractReader, input, null); + } + + /** + * Perform OCR using provided path to image (imgPath) + * and save result to text file. + */ + protected void doOcrAndSaveToTextFile( + AbstractTesseract4OcrEngine tesseractReader, String imgPath, + String txtPath, List languages) { + if (languages != null) { + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setLanguages(languages); + tesseractReader.setTesseract4OcrEngineProperties(properties); + } + + tesseractReader.createTxtFile(Collections.singletonList(new File(imgPath)), + new File(txtPath)); + + if (languages != null) { + Assert.assertEquals(languages.size(), + tesseractReader.getTesseract4OcrEngineProperties().getLanguages().size()); + } + } + + /** + * Perform OCR using provided path to image (imgPath) + * and save result PDF document to "pdfPath". 
+ * (Method is used for compare tool) + * Languages, fonts and text color are only applied when they are not + * {@code null} (the font list must also be non-empty). An + * {@link IOException} raised while writing the document is logged. + */ + protected void doOcrAndSavePdfToPath( + AbstractTesseract4OcrEngine tesseractReader, String imgPath, + String pdfPath, List languages, + List fonts, com.itextpdf.kernel.colors.Color color) { + if (languages != null) { + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setLanguages(languages); + tesseractReader.setTesseract4OcrEngineProperties(properties); + } + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setPdfLang("en-US"); + properties.setTitle(""); + + if (fonts != null && fonts.size() > 0) { + FontProvider fontProvider = new FontProvider(); + for (String fontPath : fonts) { + String name = FONT_PATH_TO_FONT_NAME_MAP.get(fontPath); + fontProvider.getFontSet().addFont(fontPath, PdfEncodings.IDENTITY_H, name); + } + properties.setFontProvider(fontProvider); + } + if (color != null) { + properties.setTextColor(color); + } + if (languages != null) { + Assert.assertEquals(languages.size(), + tesseractReader.getTesseract4OcrEngineProperties().getLanguages().size()); + } + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties); + try (PdfWriter pdfWriter = getPdfWriter(pdfPath)) { + PdfDocument doc = ocrPdfCreator.createPdf( + Collections.singletonList(new File(imgPath)), + pdfWriter); + + Assert.assertNotNull(doc); + doc.close(); + } catch (IOException e) { + LOGGER.error(e.getMessage()); + } + } + + /** + * Perform OCR using provided path to image (imgPath) + * and save result PDF document to "pdfPath". + * Delegates to the six-argument overload without fonts. + */ + protected void doOcrAndSavePdfToPath( + AbstractTesseract4OcrEngine tesseractReader, String imgPath, + String pdfPath, List languages, + com.itextpdf.kernel.colors.Color color) { + doOcrAndSavePdfToPath(tesseractReader, imgPath, pdfPath, + languages, null, color); + } + + /** + * Perform OCR using provided path to image (imgPath) + * and save result PDF document to "pdfPath". 
+ * (Text will be invisible) + */ + protected void doOcrAndSavePdfToPath(AbstractTesseract4OcrEngine tesseractReader, String imgPath, + String pdfPath, List languages, List fonts) { + doOcrAndSavePdfToPath(tesseractReader, imgPath, pdfPath, + languages, fonts, null); + } + + /** + * Perform OCR using provided path to image (imgPath) + * and save result PDF document to "pdfPath". + * (Method is used for compare tool) + */ + protected void doOcrAndSavePdfToPath( + AbstractTesseract4OcrEngine tesseractReader, String imgPath, + String pdfPath) { + doOcrAndSavePdfToPath(tesseractReader, imgPath, pdfPath, null, + null, null); + } + + /** + * Retrieve text from given txt file. + */ + protected String getTextFromTextFile(File file) { + String content = null; + try { + content = new String( + Files.readAllBytes(file.toPath()), + StandardCharsets.UTF_8); + } catch (IOException e) { + LOGGER.error(MessageFormatUtil.format( + Tesseract4LogMessageConstant.CANNOT_READ_FILE, + file.getAbsolutePath(), + e.getMessage())); + } + return content; + } + + /** + * Create pdfWriter using provided path to destination file. + */ + protected PdfWriter getPdfWriter(String pdfPath) throws FileNotFoundException { + return new PdfWriter(pdfPath, + new WriterProperties().addUAXmpMetadata()); + } + + /** + * Gets image name from path. + */ + protected String getImageName(String path, List languages) { + String lang = (languages != null && languages.size() > 0) ? 
+ "_" + String.join("", languages) : ""; + String img = path + .substring(path.lastIndexOf(java.io.File.separator)) + .substring(1) + .replace(".", "_"); + return img + lang; + } + + public static class ExtractionStrategy extends LocationTextExtractionStrategy { + private com.itextpdf.kernel.geom.Rectangle imageBBoxRectangle; + private com.itextpdf.kernel.colors.Color fillColor; + private String layerName; + private PdfFont pdfFont; + + public ExtractionStrategy(String name) { + super(); + layerName = name; + } + + public void setFillColor(com.itextpdf.kernel.colors.Color color) { + fillColor = color; + } + + public com.itextpdf.kernel.colors.Color getFillColor() { + return fillColor; + } + + public void setPdfFont(PdfFont font) { + pdfFont = font; + } + + public PdfFont getPdfFont() { + return pdfFont; + } + + public com.itextpdf.kernel.geom.Rectangle getImageBBoxRectangle() { return this.imageBBoxRectangle; } + + public void setImageBBoxRectangle(com.itextpdf.kernel.geom.Rectangle imageBBoxRectangle) { + this.imageBBoxRectangle = imageBBoxRectangle; + } + + @Override + protected boolean isChunkAtWordBoundary(TextChunk chunk, + TextChunk previousChunk) { + ITextChunkLocation curLoc = chunk.getLocation(); + ITextChunkLocation prevLoc = previousChunk.getLocation(); + + if (curLoc.getStartLocation().equals(curLoc.getEndLocation()) || + prevLoc.getEndLocation() + .equals(prevLoc.getStartLocation())) { + return false; + } + + return curLoc.distParallelEnd() - prevLoc.distParallelStart() > + (curLoc.getCharSpaceWidth() + prevLoc.getCharSpaceWidth()) + / 2.0f; + } + + @Override + public void eventOccurred(IEventData data, EventType type) { + if (type.equals(EventType.RENDER_TEXT) || type.equals(EventType.RENDER_IMAGE)) { + String tagName = getTagName(data, type); + if ((tagName == null && layerName == null) || (layerName != null && layerName.equals(tagName))) { + if (type.equals(EventType.RENDER_TEXT)) { + TextRenderInfo renderInfo = (TextRenderInfo) data; + 
setFillColor(renderInfo.getGraphicsState() + .getFillColor()); + setPdfFont(renderInfo.getGraphicsState().getFont()); + super.eventOccurred(data, type); + } + else if (type.equals(EventType.RENDER_IMAGE)) { + ImageRenderInfo renderInfo = (ImageRenderInfo) data; + com.itextpdf.kernel.geom.Matrix ctm = renderInfo.getImageCtm(); + setImageBBoxRectangle(new com.itextpdf.kernel.geom.Rectangle(ctm.get(6), ctm.get(7), + ctm.get(0), ctm.get(4))); + } + } + } + } + + private String getTagName(IEventData data, EventType type) { + java.util.List tagHierarchy = null; + if (type.equals(EventType.RENDER_TEXT)) { + TextRenderInfo textRenderInfo = (TextRenderInfo) data; + tagHierarchy = textRenderInfo.getCanvasTagHierarchy(); + } + else if (type.equals(EventType.RENDER_IMAGE)) { + ImageRenderInfo imageRenderInfo = (ImageRenderInfo) data; + tagHierarchy = imageRenderInfo.getCanvasTagHierarchy(); + } + return (tagHierarchy == null || tagHierarchy.size() == 0 + || tagHierarchy.get(0).getProperties().get(PdfName.Name) == null) + ? null + : tagHierarchy.get(0).getProperties().get(PdfName.Name).toString(); + } + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/TesseractExecutableIntegrationTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/TesseractExecutableIntegrationTest.java new file mode 100644 index 0000000..16b1d56 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/TesseractExecutableIntegrationTest.java @@ -0,0 +1,86 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr; + +import com.itextpdf.pdfocr.tesseract4.Tesseract4ExecutableOcrEngine; +import com.itextpdf.pdfocr.tesseract4.Tesseract4LogMessageConstant; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrException; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExpectedException; + +@Category(IntegrationTest.class) +public class TesseractExecutableIntegrationTest extends IntegrationTestHelper { + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + @LogMessages(messages = { + @LogMessage(messageTemplate = + Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE, count = 1) + }) + @Test + public void testNullPathToTesseractExecutable() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE); + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + Tesseract4ExecutableOcrEngine tesseractExecutableReader = + new 
Tesseract4ExecutableOcrEngine( + new Tesseract4OcrEngineProperties()); + tesseractExecutableReader.setPathToExecutable(null); + getTextFromPdf(tesseractExecutableReader, file); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = + Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE, count = 1) + }) + @Test + public void testEmptyPathToTesseractExecutable() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(Tesseract4OcrException.CANNOT_FIND_PATH_TO_TESSERACT_EXECUTABLE); + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + getTextFromPdf(new Tesseract4ExecutableOcrEngine("", new Tesseract4OcrEngineProperties()), file); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = + Tesseract4LogMessageConstant.COMMAND_FAILED, count = 1), + @LogMessage(messageTemplate = + Tesseract4OcrException.TESSERACT_NOT_FOUND, count = 1) + }) + @Test + public void testIncorrectPathToTesseractExecutable() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(Tesseract4OcrException.TESSERACT_NOT_FOUND); + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + getTextFromPdf(new Tesseract4ExecutableOcrEngine("path\\to\\executable\\", new Tesseract4OcrEngineProperties()), file); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingExecutableTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingExecutableTest.java new file mode 100644 index 0000000..d651e9c --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingExecutableTest.java @@ -0,0 +1,53 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. 
For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.events; + +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.pdfocr.tesseract4.Tesseract4LogMessageConstant; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrException; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class EventCountingExecutableTest extends EventCountingTest { + public EventCountingExecutableTest() { + super(ReaderType.EXECUTABLE); + } + + @Test + @LogMessages(messages = {@LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE)}) + public void testEventCountingCustomMetaInfoError() { + String imgPath = new File(TEST_IMAGES_DIRECTORY + "numbers_101.jpg").getAbsolutePath(); + + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException + .expectMessage(MessageFormatUtil.format(Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, imgPath)); + + super.testEventCountingCustomMetaInfoError(); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingLibTest.java 
b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingLibTest.java new file mode 100644 index 0000000..2fce3ba --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingLibTest.java @@ -0,0 +1,55 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.events; + +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.pdfocr.tesseract4.Tesseract4LogMessageConstant; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrException; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class EventCountingLibTest extends EventCountingTest { + public EventCountingLibTest() { + super(ReaderType.LIB); + } + + @Test + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.TESSERACT_FAILED), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED) + }) + public void testEventCountingCustomMetaInfoError() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_101.jpg"; + + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException + .expectMessage(MessageFormatUtil.format(Tesseract4OcrException.TESSERACT_FAILED, imgPath)); + + super.testEventCountingCustomMetaInfoError(); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingTest.java new file mode 100644 index 0000000..1540b42 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/EventCountingTest.java @@ -0,0 +1,290 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.events; + +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.counter.EventCounter; +import com.itextpdf.kernel.counter.EventCounterHandler; +import com.itextpdf.kernel.counter.IEventCounterFactory; +import com.itextpdf.kernel.counter.SimpleEventCounterFactory; +import com.itextpdf.kernel.counter.event.IEvent; +import com.itextpdf.kernel.counter.event.IMetaInfo; +import com.itextpdf.kernel.pdf.PdfOutputIntent; +import com.itextpdf.kernel.pdf.PdfWriter; +import com.itextpdf.metainfo.TestMetaInfo; +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.pdfocr.OcrPdfCreator; +import com.itextpdf.pdfocr.OcrPdfCreatorProperties; +import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine; +import com.itextpdf.pdfocr.tesseract4.Tesseract4LogMessageConstant; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrException; +import com.itextpdf.pdfocr.tesseract4.events.PdfOcrTesseract4Event; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.InputStream; +import java.util.ArrayList; +import 
java.util.Arrays; +import java.util.List; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public abstract class EventCountingTest extends IntegrationTestHelper { + + protected static final String PROFILE_FOLDER = "./src/test/resources/com/itextpdf/pdfocr/events/"; + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + AbstractTesseract4OcrEngine tesseractReader; + String testFileTypeName; + private boolean isExecutableReaderType; + + public EventCountingTest(ReaderType type) { + isExecutableReaderType = type.equals(ReaderType.EXECUTABLE); + if (isExecutableReaderType) { + testFileTypeName = "executable"; + } else { + testFileTypeName = "lib"; + } + tesseractReader = getTesseractReader(type); + } + + @Before + public void initTesseractProperties() { + Tesseract4OcrEngineProperties ocrEngineProperties = + new Tesseract4OcrEngineProperties(); + ocrEngineProperties.setPathToTessData(getTessDataDirectory()); + tesseractReader.setTesseract4OcrEngineProperties(ocrEngineProperties); + } + + @Test + public void testEventCountingPdfEvent() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File file = new File(imgPath); + + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + EventCounterHandler.getInstance().register(factory); + try { + doImageToPdfOcr(tesseractReader, Arrays.asList(file)); + + Assert.assertEquals(1, eventCounter.getEvents().size()); + Assert.assertSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, eventCounter.getEvents().get(0)); + Assert.assertNull(eventCounter.getMetaInfos().get(0)); + } finally { + EventCounterHandler.getInstance().unregister(factory); + } + } + + @Test + public void testEventCountingSeveralImagesOneImageToPdfEvent() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File file = new 
File(imgPath); + + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + EventCounterHandler.getInstance().register(factory); + try { + doImageToPdfOcr(tesseractReader, Arrays.asList(file, file)); + + Assert.assertEquals(1, eventCounter.getEvents().size()); + Assert.assertSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, eventCounter.getEvents().get(0)); + Assert.assertNull(eventCounter.getMetaInfos().get(0)); + } finally { + EventCounterHandler.getInstance().unregister(factory); + } + } + + @Test + public void testEventCountingPdfAEvent() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File file = new File(imgPath); + + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + EventCounterHandler.getInstance().register(factory); + try { + doImageToPdfAOcr(tesseractReader, Arrays.asList(file)); + + Assert.assertEquals(1, eventCounter.getEvents().size()); + Assert.assertSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDFA, eventCounter.getEvents().get(0)); + Assert.assertNull(eventCounter.getMetaInfos().get(0)); + } finally { + EventCounterHandler.getInstance().unregister(factory); + } + } + + @Test + public void testEventCountingTwoPdfEvents() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File file = new File(imgPath); + + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + EventCounterHandler.getInstance().register(factory); + try { + doImageToPdfOcr(tesseractReader, Arrays.asList(file)); + doImageToPdfOcr(tesseractReader, Arrays.asList(file)); + + Assert.assertEquals(2, eventCounter.getEvents().size()); + for (int i = 0; i < eventCounter.getEvents().size(); i++) { + Assert.assertSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, eventCounter.getEvents().get(i)); + 
Assert.assertNull(eventCounter.getMetaInfos().get(i)); + } + } finally { + EventCounterHandler.getInstance().unregister(factory); + } + } + + @Test + public void testEventCountingImageEvent() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File file = new File(imgPath); + + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + EventCounterHandler.getInstance().register(factory); + try { + doImageOcr(tesseractReader, file); + + Assert.assertEquals(1, eventCounter.getEvents().size()); + Assert.assertSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR, eventCounter.getEvents().get(0)); + Assert.assertNull(eventCounter.getMetaInfos().get(0)); + } finally { + EventCounterHandler.getInstance().unregister(factory); + } + } + + @Test + public void testEventCountingImageEventCustomMetaInfo() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File file = new File(imgPath); + + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + EventCounterHandler.getInstance().register(factory); + try { + tesseractReader.setThreadLocalMetaInfo(new TestMetaInfo()); + doImageOcr(tesseractReader, file); + + Assert.assertEquals(1, eventCounter.getEvents().size()); + Assert.assertSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR, eventCounter.getEvents().get(0)); + Assert.assertTrue(eventCounter.getMetaInfos().get(0) instanceof TestMetaInfo); + } finally { + EventCounterHandler.getInstance().unregister(factory); + tesseractReader.setThreadLocalMetaInfo(null); + } + } + + @Test + public void testEventCountingPdfEventCustomMetaInfo() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File file = new File(imgPath); + + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + 
EventCounterHandler.getInstance().register(factory); + try { + tesseractReader.setThreadLocalMetaInfo(new TestMetaInfo()); + doImageToPdfOcr(tesseractReader, Arrays.asList(file)); + + Assert.assertEquals(1, eventCounter.getEvents().size()); + Assert.assertSame(PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, eventCounter.getEvents().get(0)); + Assert.assertTrue(eventCounter.getMetaInfos().get(0) instanceof TestMetaInfo); + } finally { + EventCounterHandler.getInstance().unregister(factory); + tesseractReader.setThreadLocalMetaInfo(null); + } + } + + public void testEventCountingCustomMetaInfoError() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_101.jpg"; + File file = new File(imgPath); + + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + EventCounterHandler.getInstance().register(factory); + + IMetaInfo metaInfo = new TestMetaInfo(); + try { + tesseractReader.setThreadLocalMetaInfo(metaInfo); + doImageToPdfOcr(tesseractReader, Arrays.asList(file)); + } finally { + Assert.assertEquals(metaInfo, tesseractReader.getThreadLocalMetaInfo()); + EventCounterHandler.getInstance().unregister(factory); + tesseractReader.setThreadLocalMetaInfo(null); + } + } + + private static void doImageOcr(AbstractTesseract4OcrEngine tesseractReader, File imageFile) { + tesseractReader.doImageOcr(imageFile); + } + + private static void doImageToPdfOcr(AbstractTesseract4OcrEngine tesseractReader, List imageFiles) { + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); + ocrPdfCreator.createPdf(imageFiles, new PdfWriter(new ByteArrayOutputStream())); + } + + private static void doImageToPdfAOcr(AbstractTesseract4OcrEngine tesseractReader, List imageFiles) { + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, + new OcrPdfCreatorProperties().setPdfLang("en-US")); + InputStream is = null; + try { + is = new FileInputStream(PROFILE_FOLDER + "sRGB_CS_profile.icm"); + } catch 
(FileNotFoundException e) { + // No expected + } + PdfOutputIntent outputIntent = new PdfOutputIntent("Custom", "", "http://www.color.org", "sRGB IEC61966-2.1", + is); + + ocrPdfCreator.createPdfA(imageFiles, new PdfWriter(new ByteArrayOutputStream()), outputIntent); + } + + private static class TestEventCounter extends EventCounter { + private List events = new ArrayList<>(); + private List metaInfos = new ArrayList<>(); + + public List getEvents() { + return events; + } + + public List getMetaInfos() { + return metaInfos; + } + + @Override + protected void onEvent(IEvent event, IMetaInfo metaInfo) { + this.events.add(event); + this.metaInfos.add(metaInfo); + } + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/PdfOcrTesseract4EventTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/PdfOcrTesseract4EventTest.java new file mode 100644 index 0000000..1da58aa --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/PdfOcrTesseract4EventTest.java @@ -0,0 +1,59 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.events; + +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.pdfocr.tesseract4.events.PdfOcrTesseract4Event; +import com.itextpdf.test.annotations.type.UnitTest; + +import org.junit.Assert; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category(UnitTest.class) +public class PdfOcrTesseract4EventTest extends IntegrationTestHelper { + + private static final String PDF_OCR_TESSERACT4_ORIGIN_ID = "com.itextpdf.pdfocr.tesseract4"; + + @Test + public void testEventTypes() { + String[] expectedTypes = {"pdfOcr-tesseract4-image-ocr", "pdfOcr-tesseract4-image-to-pdf", "pdfOcr-tesseract4-image-to-pdfa"}; + PdfOcrTesseract4Event[] testedEvents = {PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR, + PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDFA}; + + for (int i = 0; i < testedEvents.length; i++) { + Assert.assertEquals(expectedTypes[i], testedEvents[i].getEventType()); + } + } + + @Test + public void testOriginId() { + String expected = PDF_OCR_TESSERACT4_ORIGIN_ID; + PdfOcrTesseract4Event[] testedEvents = {PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, + PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF, PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDFA}; + + for (PdfOcrTesseract4Event event : testedEvents) { + Assert.assertEquals(expected, event.getOriginId()); + } + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/DoImageOcrRunnable.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/DoImageOcrRunnable.java new file mode 100644 index 0000000..ea66eaa --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/DoImageOcrRunnable.java @@ -0,0 +1,63 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. 
+ For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.events.multithreading; + +import com.itextpdf.kernel.counter.event.IMetaInfo; +import com.itextpdf.kernel.pdf.PdfWriter; +import com.itextpdf.pdfocr.OcrPdfCreator; +import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine; +import com.itextpdf.pdfocr.tesseract4.OutputFormat; + +import java.io.File; +import java.util.Arrays; + +public class DoImageOcrRunnable implements Runnable { + private AbstractTesseract4OcrEngine tesseractReader; + private File imgFile; + private File outputFile; + private boolean createPdf; + private IMetaInfo metaInfo; + + DoImageOcrRunnable(AbstractTesseract4OcrEngine tesseractReader, IMetaInfo metaInfo, File imgFile, File outputFile, boolean createPdf) { + this.tesseractReader = tesseractReader; + this.metaInfo = metaInfo; + this.imgFile = imgFile; + this.outputFile = outputFile; + this.createPdf = createPdf; + } + + public void run() { + try { + tesseractReader.setThreadLocalMetaInfo(metaInfo); + if (createPdf) { + new OcrPdfCreator(tesseractReader).createPdf(Arrays.asList(imgFile), new PdfWriter(outputFile)); + } else { + tesseractReader.doTesseractOcr(imgFile, outputFile, OutputFormat.TXT); + } + // for test purposes + System.out.println(imgFile.getName()); + } 
catch (Exception e) { + throw new RuntimeException(e.getMessage()); + } + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingExecutableTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingExecutableTest.java new file mode 100644 index 0000000..2321017 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingExecutableTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.events.multithreading; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class MultiThreadingExecutableTest extends MultiThreadingTest { + public MultiThreadingExecutableTest() { + super(ReaderType.EXECUTABLE); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java new file mode 100644 index 0000000..3a36dee --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/events/multithreading/MultiThreadingTest.java @@ -0,0 +1,148 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.events.multithreading; + +import com.itextpdf.kernel.counter.EventCounter; +import com.itextpdf.kernel.counter.EventCounterHandler; +import com.itextpdf.kernel.counter.IEventCounterFactory; +import com.itextpdf.kernel.counter.SimpleEventCounterFactory; +import com.itextpdf.kernel.counter.event.IEvent; +import com.itextpdf.kernel.counter.event.IMetaInfo; +import com.itextpdf.metainfo.TestMetaInfo; +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; +import com.itextpdf.pdfocr.tesseract4.events.PdfOcrTesseract4Event; +import com.itextpdf.test.annotations.type.IntegrationTest; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExpectedException; + +@Category(IntegrationTest.class) +public abstract class MultiThreadingTest extends IntegrationTestHelper { + protected static final String destinationFolder = "./target/test/com/itextpdf/pdfocr/events/multithreading/"; + protected static final String sourceFolder = "./src/test/resources/com/itextpdf/pdfocr/events/multithreading/"; + + AbstractTesseract4OcrEngine tesseractReader; + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + public MultiThreadingTest(ReaderType type) { + tesseractReader = getTesseractReader(type); + } + + @BeforeClass + public static void beforeClass() { + createDestinationFolder(destinationFolder); + } + + @Before + public void initTesseractProperties() { + Tesseract4OcrEngineProperties ocrEngineProperties = + new Tesseract4OcrEngineProperties(); + ocrEngineProperties.setPathToTessData(new File(sourceFolder + "../../tessdata/")); + 
tesseractReader.setTesseract4OcrEngineProperties(ocrEngineProperties); + } + + @Test + public void testEventCountingPdfEvent() throws InterruptedException { + TestEventCounter eventCounter = new TestEventCounter(); + IEventCounterFactory factory = new SimpleEventCounterFactory(eventCounter); + EventCounterHandler.getInstance().register(factory); + try { + int n = 16; + IMetaInfo metainfo = new TestMetaInfo(); + Thread[] threads = new Thread[n]; + for (int i = 0; i < n; i++) { + // We do not use Runnable as the variable's type because of porting issues + DoImageOcrRunnable runnable = new DoImageOcrRunnable( + tesseractReader, + metainfo, + new File(sourceFolder + "numbers_01.jpg"), + new File(destinationFolder + "ocr-result-" + (i + 1) + ".txt"), + 0 == i % 2); + threads[i] = getThread(runnable); + } + for (int i = 0; i < n; i++) { + threads[i].start(); + + // The test will pass in sequential mode, i.e. if the following line is uncommented + //threads[i].join(); + } + for (int i = 0; i < n; i++) { + threads[i].join(); + } + + Assert.assertEquals(n, eventCounter.getEvents().size()); + int expectedPdfEvents = n / 2; + int expectedImageEvents = n - expectedPdfEvents; + int foundPdfEvents = 0; + int foundImageEvents = 0; + for (int i = 0; i < n; i++) { + if (PdfOcrTesseract4Event.TESSERACT4_IMAGE_TO_PDF == eventCounter.getEvents().get(i)) { + foundPdfEvents++; + } else if (PdfOcrTesseract4Event.TESSERACT4_IMAGE_OCR == eventCounter.getEvents().get(i)) { + foundImageEvents++; + } + Assert.assertEquals(metainfo, eventCounter.getMetaInfos().get(i)); + } + Assert.assertEquals(expectedImageEvents, foundImageEvents); + Assert.assertEquals(expectedPdfEvents, foundPdfEvents); + } finally { + EventCounterHandler.getInstance().unregister(factory); + } + } + + private static Thread getThread(DoImageOcrRunnable runnable) { + return new Thread(runnable); + } + + public static class TestEventCounter extends EventCounter { + private List events = new ArrayList<>(); + private List 
metaInfos = new ArrayList<>(); + + public List getEvents() { + return events; + } + + public List getMetaInfos() { + return metaInfos; + } + + @Override + protected void onEvent(IEvent event, IMetaInfo metaInfo) { + this.events.add(event); + this.metaInfos.add(metaInfo); + } + } + +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationExecutableTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationExecutableTest.java new file mode 100644 index 0000000..a6cb20f --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationExecutableTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.general; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class BasicTesseractIntegrationExecutableTest extends BasicTesseractIntegrationTest { + public BasicTesseractIntegrationExecutableTest() { + super(ReaderType.EXECUTABLE); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationLibTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationLibTest.java new file mode 100644 index 0000000..4d2b0f4 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationLibTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.general; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class BasicTesseractIntegrationLibTest extends BasicTesseractIntegrationTest { + public BasicTesseractIntegrationLibTest() { + super(ReaderType.LIB); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationTest.java new file mode 100644 index 0000000..0bf1cf2 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/general/BasicTesseractIntegrationTest.java @@ -0,0 +1,465 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.general; + +import com.itextpdf.io.source.ByteArrayOutputStream; +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.colors.DeviceCmyk; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfReader; +import com.itextpdf.kernel.pdf.PdfWriter; +import com.itextpdf.kernel.pdf.WriterProperties; +import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor; +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.pdfocr.IOcrEngine; +import com.itextpdf.pdfocr.OcrPdfCreator; +import com.itextpdf.pdfocr.OcrPdfCreatorProperties; +import com.itextpdf.pdfocr.TextInfo; +import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine; +import com.itextpdf.pdfocr.tesseract4.OutputFormat; +import com.itextpdf.pdfocr.tesseract4.Tesseract4LogMessageConstant; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrException; +import com.itextpdf.pdfocr.tesseract4.TesseractHelper; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public abstract class BasicTesseractIntegrationTest extends IntegrationTestHelper { + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + AbstractTesseract4OcrEngine tesseractReader; + + public BasicTesseractIntegrationTest(ReaderType type) { + tesseractReader = getTesseractReader(type); + } + + @Before + public void initTesseractProperties() { + Tesseract4OcrEngineProperties ocrEngineProperties = + new Tesseract4OcrEngineProperties(); + 
ocrEngineProperties.setPathToTessData(getTessDataDirectory()); + tesseractReader.setTesseract4OcrEngineProperties(ocrEngineProperties); + } + + @Test + public void testFontColorInMultiPagePdf() throws IOException { + String testName = "testFontColorInMultiPagePdf"; + String path = TEST_IMAGES_DIRECTORY + "multîpage.tiff"; + String pdfPath = getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPreprocessingImages(false)); + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setTextLayerName("Text1"); + com.itextpdf.kernel.colors.Color color = DeviceCmyk.MAGENTA; + ocrPdfCreatorProperties.setTextColor(color); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, + ocrPdfCreatorProperties); + PdfDocument doc = + ocrPdfCreator.createPdf(Collections.singletonList(file), getPdfWriter(pdfPath)); + + Assert.assertNotNull(doc); + doc.close(); + + PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath)); + + ExtractionStrategy strategy = new ExtractionStrategy("Text1"); + PdfCanvasProcessor processor = new PdfCanvasProcessor(strategy); + + processor.processPageContent(pdfDocument.getPage(1)); + + com.itextpdf.kernel.colors.Color fillColor = strategy.getFillColor(); + Assert.assertEquals(fillColor, color); + + pdfDocument.close(); + } + + @Test + public void testNoisyImage() { + String path = TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png"; + String expectedOutput1 = "Noisyimage to test Tesseract OCR"; + String expectedOutput2 = "Noisy image to test Tesseract OCR"; + + String realOutputHocr = getTextUsingTesseractFromImage(tesseractReader, + new File(path)); + Assert.assertTrue(realOutputHocr.equals(expectedOutput1) || + realOutputHocr.equals(expectedOutput2)); + } + + @Test + public void testPantoneImage() { + String filePath = TEST_IMAGES_DIRECTORY + "pantone_blue.jpg"; 
+ String expected = ""; + + String realOutputHocr = getTextUsingTesseractFromImage(tesseractReader, + new File(filePath)); + Assert.assertEquals(expected, realOutputHocr); + } + + @Test + public void testDifferentTextStyles() { + String path = TEST_IMAGES_DIRECTORY + "example_04.png"; + String expectedOutput = "How about a bigger font?"; + + testImageOcrText(tesseractReader, path, expectedOutput); + } + + @Test + public void testImageWithoutText() throws IOException { + String testName = "testImageWithoutText"; + String filePath = TEST_IMAGES_DIRECTORY + "pantone_blue.jpg"; + String pdfPath = getTargetDirectory() + testName + ".pdf"; + File file = new File(filePath); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); + + ocrPdfCreator.createPdf(Collections.singletonList(file), + new PdfWriter(pdfPath)).close(); + + PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath)); + + ExtractionStrategy strategy = new ExtractionStrategy("Text Layer"); + PdfCanvasProcessor processor = new PdfCanvasProcessor(strategy); + + processor.processPageContent(pdfDocument.getFirstPage()); + pdfDocument.close(); + + Assert.assertEquals("", strategy.getResultantText()); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, count = 1) + }) + @Test + public void testInputInvalidImage() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.INCORRECT_INPUT_IMAGE_FORMAT, + "txt")); + + File file1 = new File(TEST_IMAGES_DIRECTORY + "example.txt"); + File file2 = new File(TEST_IMAGES_DIRECTORY + + "example_05_corrupted.bmp"); + File file3 = new File(TEST_IMAGES_DIRECTORY + + "numbers_02.jpg"); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPathToTessData(getTessDataDirectory())); + OcrPdfCreator ocrPdfCreator = new 
OcrPdfCreator(tesseractReader); + + ocrPdfCreator.createPdf(Arrays.asList(file3, file1, file2, file3), getPdfWriter()); + } + + @Test + public void testNonAsciiImagePath() { + String path = TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png"; + String expectedOutput1 = "Noisyimage to test Tesseract OCR"; + String expectedOutput2 = "Noisy image to test Tesseract OCR"; + + String realOutputHocr = getTextUsingTesseractFromImage(tesseractReader, + new File(path)); + Assert.assertTrue(realOutputHocr.equals(expectedOutput1) || + realOutputHocr.equals(expectedOutput2)); + } + + @Test + public void testNonAsciiImageName() { + String path = TEST_IMAGES_DIRECTORY + "nümbérs.jpg"; + String expectedOutput = "619121"; + + String realOutputHocr = getTextUsingTesseractFromImage(tesseractReader, + new File(path)); + Assert.assertTrue(realOutputHocr.equals(expectedOutput)); + } + + @Test + public void testNullPathToTessData() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(Tesseract4OcrException.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID); + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPathToTessData(null)); + getTextFromPdf(tesseractReader, file, Collections.singletonList("eng")); + } + + @Test + public void testPathToTessDataWithoutData() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(Tesseract4OcrException.PATH_TO_TESS_DATA_DIRECTORY_IS_INVALID); + + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPathToTessData(new File("test/"))); + getTextFromPdf(tesseractReader, file, Collections.singletonList("eng")); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4OcrException.INCORRECT_LANGUAGE) + }) + @Test + 
public void testEmptyPathToTessData() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.INCORRECT_LANGUAGE, + "eng.traineddata", + new File(".").getAbsolutePath())); + + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + Tesseract4OcrEngineProperties properties = tesseractReader.getTesseract4OcrEngineProperties() + .setPathToTessData(new File(".")); + tesseractReader.setTesseract4OcrEngineProperties(properties); + getTextFromPdf(tesseractReader, file); + + Assert.assertEquals(new File("").getAbsolutePath(), + tesseractReader.getTesseract4OcrEngineProperties() + .getPathToTessData().getAbsolutePath()); + } + + @Test + public void testTxtStringOutput() { + File file = new File(TEST_IMAGES_DIRECTORY + "multîpage.tiff"); + List expectedOutput = Arrays.asList( + "Multipage\nTIFF\nExample\nPage 1", + "Multipage\nTIFF\nExample\nPage 2", + "Multipage\nTIFF\nExample\nPage 4", + "Multipage\nTIFF\nExample\nPage 5", + "Multipage\nTIFF\nExample\nPage 6", + "Multipage\nTIFF\nExample\nPage /", + "Multipage\nTIFF\nExample\nPage 8", + "Multipage\nTIFF\nExample\nPage 9" + ); + + String result = tesseractReader.doImageOcr(file, OutputFormat.TXT); + for (String line : expectedOutput) { + Assert.assertTrue(result.replaceAll("\r", "").contains(line)); + } + } + + @Test + public void testHocrStringOutput() { + File file = new File(TEST_IMAGES_DIRECTORY + "multîpage.tiff"); + List expectedOutput = Arrays.asList( + "Multipage\nTIFF\nExample\nPage 1", + "Multipage\nTIFF\nExample\nPage 2", + "Multipage\nTIFF\nExample\nPage 4", + "Multipage\nTIFF\nExample\nPage 5", + "Multipage\nTIFF\nExample\nPage 6", + "Multipage\nTIFF\nExample\nPage /", + "Multipage\nTIFF\nExample\nPage 8", + "Multipage\nTIFF\nExample\nPage 9" + ); + + String result = tesseractReader.doImageOcr(file, OutputFormat.HOCR); + for (String line : expectedOutput) { + Assert.assertTrue(result.replaceAll("\r", 
"").contains(line)); + } + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4OcrException.INCORRECT_LANGUAGE, + count = 1) + }) + @Test + public void testIncorrectLanguage() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.INCORRECT_LANGUAGE, + "spa_new.traineddata", + new File(LANG_TESS_DATA_DIRECTORY).getAbsolutePath())); + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + getTextFromPdf(tesseractReader, file, Collections.singletonList("spa_new")); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4OcrException.INCORRECT_LANGUAGE, + count = 1) + }) + @Test + public void testListOfLanguagesWithOneIncorrectLanguage() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.INCORRECT_LANGUAGE, + "spa_new.traineddata", + new File(LANG_TESS_DATA_DIRECTORY).getAbsolutePath())); + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + getTextFromPdf(tesseractReader, file, Arrays.asList("spa", "spa_new", "spa_old")); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4OcrException.INCORRECT_LANGUAGE, + count = 1) + }) + @Test + public void testIncorrectScriptsName() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.INCORRECT_LANGUAGE, + "English.traineddata", + new File(SCRIPT_TESS_DATA_DIRECTORY).getAbsolutePath())); + + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPathToTessData(new File(SCRIPT_TESS_DATA_DIRECTORY))); + getTextFromPdf(tesseractReader, file, Collections.singletonList("English")); + } + + @LogMessages(messages = { + 
@LogMessage(messageTemplate = Tesseract4OcrException.INCORRECT_LANGUAGE, count + = 1) + }) + @Test + public void testListOfScriptsWithOneIncorrect() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.INCORRECT_LANGUAGE, + "English.traineddata", + new File(SCRIPT_TESS_DATA_DIRECTORY).getAbsolutePath())); + + File file = new File(TEST_IMAGES_DIRECTORY + "spanish_01.jpg"); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPathToTessData(new File(SCRIPT_TESS_DATA_DIRECTORY))); + getTextFromPdf(tesseractReader, file, + Arrays.asList("Georgian", "Japanese", "English")); + } + + @Test + public void testTesseract4OcrForOnePageWithHocrFormat() + throws IOException { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + String expected = "619121"; + File imgFile = new File(path); + File outputFile = new File(getTargetDirectory() + + "testTesseract4OcrForOnePage.hocr"); + + tesseractReader.doTesseractOcr(imgFile, outputFile, OutputFormat.HOCR); + Map> pageData = TesseractHelper + .parseHocrFile(Collections.singletonList(outputFile), + tesseractReader + .getTesseract4OcrEngineProperties() + .getTextPositioning() + ); + + String result = getTextFromPage(pageData.get(1)); + Assert.assertEquals(expected, result.trim()); + } + + @Test + public void testTesseract4OcrForOnePageWithTxtFormat() { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + String expected = "619121"; + File imgFile = new File(path); + File outputFile = new File(getTargetDirectory() + + "testTesseract4OcrForOnePage.txt"); + + tesseractReader.doTesseractOcr(imgFile, outputFile, OutputFormat.TXT); + + String result = getTextFromTextFile(outputFile); + Assert.assertTrue(result.contains(expected)); + } + + @Test + public void testSimpleTextOutput() { + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + String expectedOutput = 
"619121"; + + Assert.assertTrue( + getRecognizedTextFromTextFile(tesseractReader, imgPath) + .contains(expectedOutput)); + } + + /** + * Parse text from image and compare with expected. + */ + private void testImageOcrText(AbstractTesseract4OcrEngine tesseractReader, String path, + String expectedOutput) { + File ex1 = new File(path); + + String realOutputHocr = getTextUsingTesseractFromImage(tesseractReader, + ex1); + Assert.assertTrue(realOutputHocr.contains(expectedOutput)); + } + + /** + * Parse text from given image using tesseract. + */ + private String getTextUsingTesseractFromImage(IOcrEngine tesseractReader, + File file) { + int page = 1; + Map> data = tesseractReader.doImageOcr(file); + List pageText = data.get(page); + + if (pageText == null || pageText.size() == 0) { + pageText = new ArrayList(); + TextInfo textInfo = new TextInfo(); + textInfo.setBbox(Arrays.asList(0f, 0f, 0f, 0f)); + textInfo.setText(""); + pageText.add(textInfo); + } + + return getTextFromPage(pageText); + } + + /** + * Concatenates provided text items to one string. + */ + private String getTextFromPage(List pageText) { + Assert.assertEquals(4, + pageText.get(0).getBbox().size()); + + StringBuilder stringBuilder = new StringBuilder(); + for (TextInfo text : pageText) { + stringBuilder.append(text.getText()); + stringBuilder.append(" "); + } + return stringBuilder.toString().trim(); + } + + /** + * Create pdfWriter. 
+ */ + private PdfWriter getPdfWriter() { + return new PdfWriter(new ByteArrayOutputStream(), new WriterProperties().addUAXmpMetadata()); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationExecutableTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationExecutableTest.java new file mode 100644 index 0000000..7eeaebb --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationExecutableTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.imageformats; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class ImageFormatIntegrationExecutableTest extends ImageFormatIntegrationTest { + public ImageFormatIntegrationExecutableTest() { + super(ReaderType.EXECUTABLE); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationLibTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationLibTest.java new file mode 100644 index 0000000..15c21cc --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationLibTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.imageformats; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class ImageFormatIntegrationLibTest extends ImageFormatIntegrationTest { + public ImageFormatIntegrationLibTest() { + super(ReaderType.LIB); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationTest.java new file mode 100644 index 0000000..3550bac --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/imageformats/ImageFormatIntegrationTest.java @@ -0,0 +1,318 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.imageformats; + +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.kernel.colors.DeviceCmyk; +import com.itextpdf.kernel.utils.CompareTool; +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine; +import com.itextpdf.pdfocr.tesseract4.Tesseract4LogMessageConstant; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrException; +import com.itextpdf.pdfocr.tesseract4.TextPositioning; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public abstract class ImageFormatIntegrationTest extends IntegrationTestHelper { + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + AbstractTesseract4OcrEngine tesseractReader; + String testType; + + public ImageFormatIntegrationTest(ReaderType type) { + tesseractReader = getTesseractReader(type); + this.testType = type.toString().toLowerCase(); + } + + @Before + public void initTesseractProperties() { + Tesseract4OcrEngineProperties ocrEngineProperties = + new Tesseract4OcrEngineProperties(); + ocrEngineProperties.setPathToTessData(getTessDataDirectory()); + tesseractReader.setTesseract4OcrEngineProperties(ocrEngineProperties); + } + + @Test + public void compareBmp() throws IOException, InterruptedException { + String testName = "compareBmp"; + String fileName = "example_01"; + String path = TEST_IMAGES_DIRECTORY + fileName + ".BMP"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + fileName + "_" + testType + ".pdf"; + String resultPdfPath = getTargetDirectory() + fileName + "_" + testName + "_" + testType + ".pdf"; + + 
doOcrAndSavePdfToPath(tesseractReader, path, resultPdfPath, + Collections.singletonList("eng"), DeviceCmyk.MAGENTA); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, getTargetDirectory(), "diff_")); + } + + @Test + public void testBMPText() { + String path = TEST_IMAGES_DIRECTORY + "example_01.BMP"; + String expectedOutput = "This is a test message for OCR Scanner Test"; + + String realOutputHocr = getTextFromPdf(tesseractReader, new File(path), + Collections.singletonList("eng")); + realOutputHocr = realOutputHocr.replaceAll("[\n]", " "); + realOutputHocr = realOutputHocr.replaceAll("[‘]", ""); + Assert.assertTrue(realOutputHocr.contains((expectedOutput))); + } + + @Test + public void compareBmp02() throws IOException, InterruptedException { + String testName = "compareBmp02"; + String fileName = "englishText"; + String path = TEST_IMAGES_DIRECTORY + fileName + ".bmp"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + fileName + "_" + testType + ".pdf"; + String resultPdfPath = getTargetDirectory() + fileName + "_" + testName + "_" + testType + ".pdf"; + + doOcrAndSavePdfToPath(tesseractReader, path, resultPdfPath, + Collections.singletonList("eng"), DeviceCmyk.MAGENTA); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, getTargetDirectory(), "diff_")); + } + + @Test + public void testBMPText02() { + String path = TEST_IMAGES_DIRECTORY + "englishText.bmp"; + String expectedOutput = "This is a test message for OCR Scanner Test BMPTest"; + + String realOutputHocr = getTextFromPdf(tesseractReader, new File(path), + Collections.singletonList("eng")); + realOutputHocr = realOutputHocr.replaceAll("[\n]", " "); + Assert.assertTrue(realOutputHocr.contains((expectedOutput))); + } + + @Test + public void compareJFIF() throws IOException, InterruptedException { + String testName = "compareJFIF"; + String filename = "example_02"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename 
+ ".pdf"; + String resultPdfPath = getTargetDirectory() + filename + "_" + testName + ".pdf"; + + doOcrAndSavePdfToPath(tesseractReader, + TEST_IMAGES_DIRECTORY + filename + ".JFIF", + resultPdfPath, null, DeviceCmyk.MAGENTA); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, getTargetDirectory(), "diff_")); + } + + @Test + public void compareJpg() throws IOException, InterruptedException { + String testName = "compareJpg"; + String fileName = "numbers_02"; + String path = TEST_IMAGES_DIRECTORY + fileName + ".jpg"; + String pdfName = fileName + "_" + testName + "_" + testType + ".pdf"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + pdfName; + String resultPdfPath = getTargetDirectory() + pdfName; + + doOcrAndSavePdfToPath(tesseractReader, path, resultPdfPath, + null, DeviceCmyk.BLACK); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, getTargetDirectory(), "diff_")); + } + + @Test + public void testTextFromJPG() { + String path = TEST_IMAGES_DIRECTORY + "numbers_02.jpg"; + String expectedOutput = "0123456789"; + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader + .getTesseract4OcrEngineProperties() + .setPreprocessingImages(false)); + String realOutputHocr = getTextFromPdf(tesseractReader, new File(path)); + Assert.assertTrue(realOutputHocr.contains(expectedOutput)); + } + + @Test + public void compareJpe() throws IOException, InterruptedException { + String testName = "compareJpe"; + String fileName = "numbers_01"; + String path = TEST_IMAGES_DIRECTORY + fileName + ".jpe"; + String pdfName = fileName + "_" + testName + "_" + testType + ".pdf"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + pdfName; + String resultPdfPath = getTargetDirectory() + pdfName; + + doOcrAndSavePdfToPath(tesseractReader, path, resultPdfPath, + null, DeviceCmyk.BLACK); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, 
getTargetDirectory(), "diff_")); + } + + @Test + public void testTextFromJPE() { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpe"; + String expectedOutput = "619121"; + + String realOutputHocr = getTextFromPdf(tesseractReader, new File(path)); + Assert.assertTrue(realOutputHocr.contains(expectedOutput)); + } + + @Test + public void compareTif() throws IOException, InterruptedException { + String testName = "compareTif"; + String fileName = "numbers_01"; + String path = TEST_IMAGES_DIRECTORY + fileName + ".tif"; + String pdfName = fileName + "_" + testName + "_" + testType + ".pdf"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + pdfName; + String resultPdfPath = getTargetDirectory() + pdfName; + + doOcrAndSavePdfToPath(tesseractReader, path, resultPdfPath, + null, DeviceCmyk.BLACK); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, getTargetDirectory(), "diff_")); + } + + @Test + public void testTextFromTIF() { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.tif"; + String expectedOutput = "619121"; + + String realOutputHocr = getTextFromPdf(tesseractReader, new File(path)); + Assert.assertTrue(realOutputHocr.contains(expectedOutput)); + } + + @Test + public void testBigTiffWithoutPreprocessing() { + String path = TEST_IMAGES_DIRECTORY + "example_03_10MB.tiff"; + String expectedOutput = "Image File Format"; + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPreprocessingImages(false) + .setPageSegMode(null)); + String realOutputHocr = getTextFromPdf(tesseractReader, new File(path), + Collections.singletonList("eng")); + Assert.assertTrue(realOutputHocr.contains(expectedOutput)); + } + + @Test + public void compareMultipagesTIFFWithPreprocessing() throws IOException, InterruptedException { + String testName = "compareMultipagesTIFFWithPreprocessing"; + String fileName = "multipage"; + String path = TEST_IMAGES_DIRECTORY + fileName + ".tiff"; + 
String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + fileName + "_" + testType + ".pdf"; + String resultPdfPath = getTargetDirectory() + fileName + "_" + testName + "_" + testType + ".pdf"; + + doOcrAndSavePdfToPath(tesseractReader, path, resultPdfPath, + Collections.singletonList("eng"), DeviceCmyk.BLACK); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, getTargetDirectory(), "diff_")); + } + + @Test + public void testInputMultipagesTIFFWithPreprocessing() { + String path = TEST_IMAGES_DIRECTORY + "multîpage.tiff"; + String expectedOutput = "Multipage\nTIFF\nExample\nPage 5"; + + File file = new File(path); + + String realOutputHocr = getTextFromPdf(tesseractReader, file, 5, + Collections.singletonList("eng")); + Assert.assertNotNull(realOutputHocr); + Assert.assertEquals(expectedOutput, realOutputHocr); + } + + @Test + public void testInputMultipagesTIFFWithoutPreprocessing() { + String path = TEST_IMAGES_DIRECTORY + "multîpage.tiff"; + String expectedOutput = "Multipage\nTIFF\nExample\nPage 3"; + + File file = new File(path); + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPreprocessingImages(false)); + String realOutputHocr = getTextFromPdf(tesseractReader, file, 3, + Collections.singletonList("eng")); + Assert.assertNotNull(realOutputHocr); + Assert.assertEquals(expectedOutput, realOutputHocr); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE, count = 1) + }) + @Test + public void testInputWrongFormat() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.INCORRECT_INPUT_IMAGE_FORMAT, + "txt")); + File file = new File(TEST_IMAGES_DIRECTORY + "example.txt"); + getTextFromPdf(tesseractReader, file); + } + + @Test + public void testJpgWithoutPreprocessing() { + String path = 
TEST_IMAGES_DIRECTORY + "nümbérs.jpg"; + String expectedOutput = "619121"; + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPreprocessingImages(false)); + String realOutputHocr = getTextFromPdf(tesseractReader, new File(path), + Collections.singletonList("eng")); + Assert.assertTrue(realOutputHocr.contains(expectedOutput)); + } + + @Test + public void compareNumbersJPG() throws IOException, InterruptedException { + String testName = "compareNumbersJPG"; + String filename = "nümbérs"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + "numbers_01.pdf"; + String resultPdfPath = getTargetDirectory() + "numbers_01_" + testName + ".pdf"; + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setTextPositioning(TextPositioning.BY_WORDS)); + doOcrAndSavePdfToPath(tesseractReader, + TEST_IMAGES_DIRECTORY + filename + ".jpg", + resultPdfPath); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setTextPositioning(TextPositioning.BY_LINES)); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, getTargetDirectory(), "diff_")); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationExecutableTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationExecutableTest.java new file mode 100644 index 0000000..6b1fe6b --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationExecutableTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.pdfa3u; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class PdfA3UIntegrationExecutableTest extends PdfA3UIntegrationTest { + public PdfA3UIntegrationExecutableTest() { + super(ReaderType.EXECUTABLE); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationLibTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationLibTest.java new file mode 100644 index 0000000..5e7cff5 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationLibTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.pdfa3u; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class PdfA3UIntegrationLibTest extends PdfA3UIntegrationTest { + public PdfA3UIntegrationLibTest() { + super(ReaderType.LIB); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationTest.java new file mode 100644 index 0000000..8868609 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdfa3u/PdfA3UIntegrationTest.java @@ -0,0 +1,158 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.pdfa3u; + +import com.itextpdf.kernel.colors.DeviceRgb; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfOutputIntent; +import com.itextpdf.kernel.utils.CompareTool; +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.pdfocr.OcrPdfCreator; +import com.itextpdf.pdfocr.OcrPdfCreatorProperties; +import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine; +import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties; +import com.itextpdf.pdfocr.tesseract4.TextPositioning; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public abstract class PdfA3UIntegrationTest extends IntegrationTestHelper { + + // path to default cmyk color profile + private static final String DEFAULT_CMYK_COLOR_PROFILE_PATH = TEST_DIRECTORY + "profiles/CoatedFOGRA27.icc"; + // path to default rgb color profile + private static final String DEFAULT_RGB_COLOR_PROFILE_PATH = TEST_DIRECTORY + "profiles/sRGB_CS_profile.icm"; + + AbstractTesseract4OcrEngine tesseractReader; + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + public PdfA3UIntegrationTest(ReaderType type) { + tesseractReader = getTesseractReader(type); + } + + @Test + public void comparePdfA3uCMYKColorSpaceJPG() throws IOException, + InterruptedException { + String testName = "comparePdfA3uCMYKColorSpaceJPG"; + String filename = "numbers_01"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename + "_a3u.pdf"; + String resultPdfPath = getTargetDirectory() + filename + "_" + testName + "_a3u.pdf"; + + try { + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + 
ocrPdfCreatorProperties.setTitle(""); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, + ocrPdfCreatorProperties); + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setTextPositioning(TextPositioning.BY_WORDS)); + Assert.assertEquals(tesseractReader, ocrPdfCreator.getOcrEngine()); + ocrPdfCreator.setOcrEngine(tesseractReader); + PdfDocument doc = + ocrPdfCreator.createPdfA( + Collections.singletonList( + new File(TEST_IMAGES_DIRECTORY + + filename + ".jpg")), + getPdfWriter(resultPdfPath), + getCMYKPdfOutputIntent()); + Assert.assertNotNull(doc); + doc.close(); + + Assert.assertNull(new CompareTool() + .compareByContent(resultPdfPath, expectedPdfPath, + getTargetDirectory(), "diff_")); + } finally { + Assert.assertEquals(TextPositioning.BY_WORDS, + tesseractReader.getTesseract4OcrEngineProperties().getTextPositioning()); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setTextPositioning(TextPositioning.BY_LINES)); + } + } + + @Test + public void comparePdfA3uRGBSpanishJPG() + throws IOException, InterruptedException { + String testName = "comparePdfA3uRGBSpanishJPG"; + String filename = "spanish_01"; + String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename + "_a3u.pdf"; + String resultPdfPath = getTargetDirectory() + filename + "_" + testName + "_a3u.pdf"; + + Tesseract4OcrEngineProperties properties = + new Tesseract4OcrEngineProperties(tesseractReader.getTesseract4OcrEngineProperties()); + properties.setPathToTessData(getTessDataDirectory()); + properties.setLanguages(Collections.singletonList("spa")); + tesseractReader.setTesseract4OcrEngineProperties(properties); + + OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties(); + ocrPdfCreatorProperties.setPdfLang("en-US"); + ocrPdfCreatorProperties.setTitle(""); + ocrPdfCreatorProperties.setTextColor(DeviceRgb.BLACK); + + OcrPdfCreator 
ocrPdfCreator = new OcrPdfCreator(tesseractReader, + ocrPdfCreatorProperties); + + PdfDocument doc = ocrPdfCreator.createPdfA( + Collections.singletonList( + new File(TEST_IMAGES_DIRECTORY + filename + + ".jpg")), getPdfWriter(resultPdfPath), + getRGBPdfOutputIntent()); + Assert.assertNotNull(doc); + doc.close(); + + Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, + expectedPdfPath, getTargetDirectory(), "diff_")); + } + + /** + * Creates PDF cmyk output intent for tests. + */ + protected PdfOutputIntent getCMYKPdfOutputIntent() throws FileNotFoundException { + InputStream is = new FileInputStream(DEFAULT_CMYK_COLOR_PROFILE_PATH); + return new PdfOutputIntent("Custom", + "","http://www.color.org", + "Coated FOGRA27 (ISO 12647 - 2:2004)", is); + } + + /** + * Creates PDF rgb output intent for tests. + */ + protected PdfOutputIntent getRGBPdfOutputIntent() throws FileNotFoundException { + InputStream is = new FileInputStream(DEFAULT_RGB_COLOR_PROFILE_PATH); + return new PdfOutputIntent("", "", + "", "sRGB IEC61966-2.1", is); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationExecutableTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationExecutableTest.java new file mode 100644 index 0000000..f4b8057 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationExecutableTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.pdflayers; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class PdfLayersIntegrationExecutableTest extends PdfLayersIntegrationTest { + public PdfLayersIntegrationExecutableTest() { + super(ReaderType.EXECUTABLE); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationLibTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationLibTest.java new file mode 100644 index 0000000..e725807 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationLibTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.pdflayers; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class PdfLayersIntegrationLibTest extends PdfLayersIntegrationTest { + public PdfLayersIntegrationLibTest() { + super(ReaderType.LIB); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationTest.java new file mode 100644 index 0000000..59270e3 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/pdflayers/PdfLayersIntegrationTest.java @@ -0,0 +1,174 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. 
+ + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.pdflayers; + +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfName; +import com.itextpdf.kernel.pdf.layer.PdfLayer; +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.pdfocr.OcrPdfCreator; +import com.itextpdf.pdfocr.OcrPdfCreatorProperties; +import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.junit.Assert; +import org.junit.Test; + +public abstract class PdfLayersIntegrationTest extends IntegrationTestHelper { + + AbstractTesseract4OcrEngine tesseractReader; + + public PdfLayersIntegrationTest(ReaderType type) { + tesseractReader = getTesseractReader(type); + } + + @Test + public void testTextFromPdfLayersFromMultiPageTiff() throws IOException { + String testName = "testTextFromPdfLayersFromMultiPageTiff"; + boolean preprocess = + tesseractReader.getTesseract4OcrEngineProperties().isPreprocessingImages(); + String path = TEST_IMAGES_DIRECTORY + "multîpage.tiff"; + String pdfPath = getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPreprocessingImages(false)); + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setTextLayerName("Text Layer"); + properties.setImageLayerName("Image Layer"); + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties); + PdfDocument doc = + ocrPdfCreator.createPdf(Collections.singletonList(file), getPdfWriter(pdfPath)); + + Assert.assertNotNull(doc); + int numOfPages = doc.getNumberOfPages(); + List layers = doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(numOfPages 
* 2, layers.size()); + Assert.assertEquals("Image Layer", + layers.get(2).getPdfObject().get(PdfName.Name).toString()); + Assert.assertEquals("Text Layer", + layers.get(3).getPdfObject().get(PdfName.Name).toString()); + + doc.close(); + + // Text layer should contain all text + // Image layer shouldn't contain any text + String expectedOutput = "Multipage\nTIFF\nExample\nPage 5"; + Assert.assertEquals(expectedOutput, + getTextFromPdfLayer(pdfPath, "Text Layer", 5)); + Assert.assertEquals("", + getTextFromPdfLayer(pdfPath, + "Image Layer", 5)); + Assert.assertFalse(tesseractReader.getTesseract4OcrEngineProperties().isPreprocessingImages()); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPreprocessingImages(preprocess)); + } + + @Test + public void testTextFromMultiPageTiff() throws IOException { + String testName = "testTextFromMultiPageTiff"; + boolean preprocess = + tesseractReader.getTesseract4OcrEngineProperties().isPreprocessingImages(); + String path = TEST_IMAGES_DIRECTORY + "multîpage.tiff"; + String pdfPath = getTargetDirectory() + testName + ".pdf"; + File file = new File(path); + + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPreprocessingImages(false)); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader); + PdfDocument doc = + ocrPdfCreator.createPdf(Collections.singletonList(file), getPdfWriter(pdfPath)); + + Assert.assertNotNull(doc); + int numOfPages = doc.getNumberOfPages(); + List layers = doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(0, layers.size()); + + + doc.close(); + + // Text layer should contain all text + // Image layer shouldn't contain any text + String expectedOutput = "Multipage\nTIFF\nExample\nPage 5"; + Assert.assertEquals(expectedOutput, + getTextFromPdfLayer(pdfPath, null, 5)); + 
Assert.assertFalse(tesseractReader.getTesseract4OcrEngineProperties().isPreprocessingImages()); + tesseractReader.setTesseract4OcrEngineProperties( + tesseractReader.getTesseract4OcrEngineProperties() + .setPreprocessingImages(preprocess)); + } + + @Test + public void testTextFromPdfLayersFromMultiPagePdf() throws IOException { + String testName = "testTextFromPdfLayersFromMultiPagePdf"; + String pdfPath = getTargetDirectory() + testName + ".pdf"; + + List files = Arrays.asList( + new File(TEST_IMAGES_DIRECTORY + "german_01.jpg"), + new File(TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png"), + new File(TEST_IMAGES_DIRECTORY + "nümbérs.jpg"), + new File(TEST_IMAGES_DIRECTORY + "example_04.png") + ); + + OcrPdfCreatorProperties properties = new OcrPdfCreatorProperties(); + properties.setImageLayerName("image"); + properties.setTextLayerName("text"); + + OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, properties); + PdfDocument doc = ocrPdfCreator.createPdf(files, getPdfWriter(pdfPath)); + + Assert.assertNotNull(doc); + int numOfPages = doc.getNumberOfPages(); + Assert.assertEquals(numOfPages, files.size()); + List layers = doc.getCatalog() + .getOCProperties(true).getLayers(); + + Assert.assertEquals(numOfPages * 2, layers.size()); + Assert.assertEquals("image", + layers.get(2).getPdfObject().get(PdfName.Name).toString()); + Assert.assertEquals("text", + layers.get(3).getPdfObject().get(PdfName.Name).toString()); + + doc.close(); + + // Text layer should contain all text + // Image layer shouldn't contain any text + String expectedOutput = "619121"; + Assert.assertEquals(expectedOutput, + getTextFromPdfLayer(pdfPath, "text", 3)); + Assert.assertEquals("", + getTextFromPdfLayer(pdfPath, "image", 3)); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationExecutableTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationExecutableTest.java new file mode 100644 index 
0000000..c67fc54 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationExecutableTest.java @@ -0,0 +1,34 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tessdata; + +import com.itextpdf.test.annotations.type.IntegrationTest; + +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class TessDataIntegrationExecutableTest extends TessDataIntegrationTest { + public TessDataIntegrationExecutableTest() { + super(ReaderType.EXECUTABLE); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java new file mode 100644 index 0000000..689a058 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationLibTest.java @@ -0,0 +1,112 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. 
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tessdata;
+
+import com.itextpdf.pdfocr.TextInfo;
+import com.itextpdf.pdfocr.tesseract4.OutputFormat;
+import com.itextpdf.pdfocr.tesseract4.TesseractHelper;
+import com.itextpdf.pdfocr.tesseract4.TextPositioning;
+import com.itextpdf.test.annotations.type.IntegrationTest;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.io.File;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+@Category(IntegrationTest.class)
+public class TessDataIntegrationLibTest extends TessDataIntegrationTest {
+    public TessDataIntegrationLibTest() {
+        super(ReaderType.LIB);
+    }
+
+    @Test(timeout = 50000)
+    public void textOutputFromHalftoneFile() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "halftone.jpg";
+        String expected01 = "Silliness Enablers";
+        String expected02 = "You dream it, we enable it";
+        String expected03 = "QUANTITY";
+
+        String result = getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                Collections.singletonList("eng"));
+
+        // correct result for a halftone input image
+        Assert.assertTrue(result.contains(expected01));
+        Assert.assertTrue(result.contains(expected02));
+        Assert.assertTrue(result.contains(expected03));
+    }
+
+    @Test(timeout = 50000)
+    public void hocrOutputFromHalftoneFile() throws java.io.IOException {
+        String path = TEST_IMAGES_DIRECTORY + "halftone.jpg";
+        String expected01 = "Silliness";
+        String expected02 = "Enablers";
+        String expected03 = "You";
+        String expected04 = "Middle";
+        String expected05 = "André";
+        String expected06 = "QUANTITY";
+        String expected07 = "DESCRIPTION";
+        String expected08 = "Silliness Enablers";
+        String expected09 = "QUANTITY DESCRIPTION UNIT PRICE TOTAL";
+
+        File imgFile = new File(path);
+        File outputFile = new File(getTargetDirectory()
+                + "hocrOutputFromHalftoneFile.hocr");
+
+        tesseractReader.doTesseractOcr(imgFile, outputFile, OutputFormat.HOCR);
+        Map<Integer, List<TextInfo>> pageData = TesseractHelper
+                .parseHocrFile(Collections.singletonList(outputFile),
+                        TextPositioning.BY_WORDS
+                );
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected01));
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected02));
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected03));
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected04));
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected05));
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected06));
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected07));
+
+        pageData = TesseractHelper
+                .parseHocrFile(Collections.singletonList(outputFile),
+                        TextPositioning.BY_LINES
+                );
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected08));
+        Assert.assertTrue(findTextInPageData(pageData, 1, expected09));
+    }
+
+    /**
+     * Returns {@code true} if any {@link TextInfo} stored for the given page has text exactly equal to {@code textToSearchFor}.
+     */
+    private boolean findTextInPageData(Map<Integer, List<TextInfo>> pageData, int page, String textToSearchFor) {
+        for (TextInfo textInfo : pageData.get(page)) {
+            if (textToSearchFor.equals(textInfo.getText())) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+}
diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationTest.java
new file mode 100644
index 0000000..7ff1b14
--- /dev/null
+++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tessdata/TessDataIntegrationTest.java
@@ -0,0 +1,673 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
+
+    AGPL licensing:
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+package com.itextpdf.pdfocr.tessdata;
+
+import com.itextpdf.kernel.colors.DeviceCmyk;
+import com.itextpdf.kernel.pdf.PdfWriter;
+import com.itextpdf.kernel.utils.CompareTool;
+import com.itextpdf.pdfocr.IntegrationTestHelper;
+import com.itextpdf.pdfocr.OcrPdfCreator;
+import com.itextpdf.pdfocr.OcrPdfCreatorProperties;
+import com.itextpdf.pdfocr.PdfOcrLogMessageConstant;
+import com.itextpdf.pdfocr.tesseract4.AbstractTesseract4OcrEngine;
+import com.itextpdf.pdfocr.tesseract4.Tesseract4OcrEngineProperties;
+import com.itextpdf.pdfocr.tesseract4.TextPositioning;
+import com.itextpdf.test.annotations.LogMessage;
+import com.itextpdf.test.annotations.LogMessages;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class TessDataIntegrationTest extends IntegrationTestHelper {
+
+    private static final Logger LOGGER = LoggerFactory
+            .getLogger(TessDataIntegrationTest.class);
+
+    @Rule
+    public ExpectedException junitExpectedException = ExpectedException.none();
+
+    AbstractTesseract4OcrEngine tesseractReader;
+    String testFileTypeName;
+    private boolean isExecutableReaderType;
+
+    public TessDataIntegrationTest(ReaderType type) {
+        isExecutableReaderType = type.equals(ReaderType.EXECUTABLE);
+        if (isExecutableReaderType) {
+            testFileTypeName = "executable";
+        } else {
+            testFileTypeName = "lib";
+        }
+        tesseractReader = getTesseractReader(type);
+    }
+
+    @Before
+    public void initTesseractProperties() {
+        Tesseract4OcrEngineProperties ocrEngineProperties =
+                new Tesseract4OcrEngineProperties();
+        ocrEngineProperties.setPathToTessData(getTessDataDirectory());
+        tesseractReader.setTesseract4OcrEngineProperties(ocrEngineProperties);
+    }
+
+    @Test
+    public void textGreekText() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "greek_01.jpg";
+        File file = new File(imgPath);
+        String expected = "ΟΜΟΛΟΓΙΑ";
+
+        if (isExecutableReaderType) {
+            tesseractReader.setTesseract4OcrEngineProperties(
+                    tesseractReader.getTesseract4OcrEngineProperties()
+                            .setPreprocessingImages(false));
+        }
+        String real = getTextFromPdf(tesseractReader, file,
+                Arrays.asList("ell"), NOTO_SANS_FONT_PATH);
+        // correct result with specified greek language
+        Assert.assertTrue(real.contains(expected));
+    }
+
+    @Test
+    public void textJapaneseText() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "japanese_01.png";
+        File file = new File(imgPath);
+        String expected = "日 本 語\n文法";
+
+        // correct result with specified japanese language
+        Assert.assertEquals(expected, getTextFromPdf(tesseractReader, file,
+                Arrays.asList("jpn"), KOSUGI_FONT_PATH));
+    }
+
+    @Test
+    public void testFrench() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "french_01.png";
+        File file = new File(imgPath);
+        String expectedFr = "RESTEZ\nCALME\nPARLEZ EN\nFRANÇAIS";
+
+        // correct result with specified french language
+        Assert.assertTrue(getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("fra")).endsWith(expectedFr));
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+        Assert.assertFalse(getTextFromPdf(tesseractReader,file,
+                Collections.singletonList("eng")).endsWith(expectedFr));
+        Assert.assertNotEquals(expectedFr,
+                getTextFromPdf(tesseractReader,file, Collections.singletonList("spa")));
+        Assert.assertNotEquals(expectedFr,
+                getTextFromPdf(tesseractReader,file, new ArrayList<String>()));
+    }
+
+    @Test
+    public void testSpanishPNG() throws IOException {
+        String testName = "compareSpanishPNG";
+        String filename = "scanned_spa_01";
+        String expectedText1 = "¿Y SI ENSAYARA COMO ACTUAR?";
+        String expectedText2 = "¿Y SI ENSAYARA ACTUAR?";
+        String resultPdfPath = getTargetDirectory() + filename + "_" + testName
+                + "_" + testFileTypeName + ".pdf";
+
+        List<String> languages = Arrays.asList("spa", "spa_old");
+        Tesseract4OcrEngineProperties properties =
+                tesseractReader.getTesseract4OcrEngineProperties();
+        if (isExecutableReaderType) {
+            properties.setPreprocessingImages(false);
+        }
+
+        // locate text by words
+        properties.setTextPositioning(TextPositioning.BY_WORDS);
+        properties.setLanguages(languages);
+        tesseractReader.setTesseract4OcrEngineProperties(properties);
+
+        OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties();
+        ocrPdfCreatorProperties.setTextColor(DeviceCmyk.BLACK);
+
+        OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, ocrPdfCreatorProperties);
+        try (PdfWriter pdfWriter = getPdfWriter(resultPdfPath)) {
+            ocrPdfCreator.createPdf(Collections.singletonList(
+                    new File(TEST_IMAGES_DIRECTORY + filename + ".png")),
+                    pdfWriter)
+                    .close();
+        }
+
+        try {
+            String result = getTextFromPdfLayer(resultPdfPath, null, 1)
+                    .replace("\n", " ");
+            Assert.assertTrue(result.contains(expectedText1)
+                    || result.contains(expectedText2));
+        } finally {
+            Assert.assertEquals(TextPositioning.BY_WORDS,
+                    tesseractReader.getTesseract4OcrEngineProperties().getTextPositioning());
+        }
+    }
+
+    @Test
+    public void textGreekOutputFromTxtFile() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "greek_01.jpg";
+        String expected = "ΟΜΟΛΟΓΙΑ";
+
+        if (isExecutableReaderType) {
+            tesseractReader.setTesseract4OcrEngineProperties(
+                    tesseractReader.getTesseract4OcrEngineProperties()
+                            .setPreprocessingImages(false));
+        }
+        String result = getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                Collections.singletonList("ell"));
+        // correct result with specified greek language
+        Assert.assertTrue(result.contains(expected));
+    }
+
+    @Test
+    public void textJapaneseOutputFromTxtFile() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "japanese_01.png";
+        String expected = "日本語文法";
+
+        String result = getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                Collections.singletonList("jpn"));
+
+        result = result.replaceAll("[\f\n]", "");
+        // correct result with specified japanese language
+        Assert.assertTrue(result.contains(expected));
+    }
+
+    @Test
+    public void testFrenchOutputFromTxtFile() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "french_01.png";
+        String expectedFr = "RESTEZ\nCALME\nPARLEZ EN\nFRANÇAIS";
+
+        String result = getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                Collections.singletonList("fra"));
+        result = result.replaceAll("(?:\\n\\f)+", "").trim();
+        result = result.replaceAll("\\n\\n", "\n").trim();
+        // correct result with specified french language
+        Assert.assertTrue(result.endsWith(expectedFr));
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+        Assert.assertFalse(
+                getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                        Collections.singletonList("eng")).endsWith(expectedFr));
+        Assert.assertNotEquals(expectedFr,
+                getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                        Collections.singletonList("spa")));
+        Assert.assertNotEquals(expectedFr,
+                getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                        new ArrayList<String>()));
+    }
+
+    @Test
+    public void testArabicOutputFromTxtFile() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "arabic_02.png";
+        // First sentence
+        String expected = "اللغة العربية";
+
+        String result = getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                Collections.singletonList("ara"));
+        // correct result with specified arabic language
+        Assert.assertTrue(result.startsWith(expected));
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+
+        String engResult = getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                Collections.singletonList("eng"));
+        Assert.assertFalse(engResult.startsWith(expected));
+        String spaResult = getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                Collections.singletonList("spa"));
+        Assert.assertFalse(spaResult.startsWith(expected));
+        String langNotSpecifiedResult = getRecognizedTextFromTextFile(tesseractReader, imgPath,
+                new ArrayList<String>());
+        Assert.assertFalse(langNotSpecifiedResult.startsWith(expected));
+    }
+
+    @Test
+    public void testGermanAndCompareTxtFiles() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "german_01.jpg";
+        String expectedTxt = TEST_DOCUMENTS_DIRECTORY + "german_01" + testFileTypeName + ".txt";
+
+        boolean result = doOcrAndCompareTxtFiles(tesseractReader, imgPath, expectedTxt,
+                Collections.singletonList("deu"));
+        Assert.assertTrue(result);
+    }
+
+    @Test
+    public void testMultipageTiffAndCompareTxtFiles() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "multîpage.tiff";
+        String expectedTxt = TEST_DOCUMENTS_DIRECTORY + "multipage_" + testFileTypeName + ".txt";
+
+        boolean result = doOcrAndCompareTxtFiles(tesseractReader, imgPath, expectedTxt,
+                Collections.singletonList("eng"));
+        Assert.assertTrue(result);
+    }
+
+    @Test
+    public void testGermanWithTessData() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "german_01.jpg";
+        File file = new File(imgPath);
+        String expectedGerman = "Das Geheimnis\ndes Könnens\nliegt im Wollen.";
+
+        String res = getTextFromPdf(tesseractReader, file, Collections.singletonList("deu"));
+        // correct result with specified german language
+        Assert.assertEquals(expectedGerman, res);
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+        Assert.assertNotEquals(expectedGerman,
+                getTextFromPdf(tesseractReader, file, Collections.singletonList("eng")));
+        Assert.assertNotEquals(expectedGerman,
+                getTextFromPdf(tesseractReader, file, Collections.singletonList("fra")));
+        Assert.assertNotEquals(expectedGerman,
+                getTextFromPdf(tesseractReader, file, new ArrayList<String>()));
+    }
+
+    @Test
+    public void testArabicTextWithEng() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "arabic_01.jpg";
+        File file = new File(imgPath);
+        String expected = "الحية. والضحك؛ والحب\nlive, laugh, love";
+
+        String result = getTextFromPdf(tesseractReader, file,
+                Arrays.asList("ara", "eng"), CAIRO_FONT_PATH);
+        // correct result with specified arabic+english languages
+        Assert.assertEquals(expected, result.replaceAll("[?]", ""));
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+        Assert.assertNotEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("eng"), CAIRO_FONT_PATH));
+        Assert.assertNotEquals(expected,
+                getTextFromPdf(tesseractReader, file, new ArrayList<String>(),
+                        CAIRO_FONT_PATH));
+    }
+
+    @Test
+    public void testArabicText() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "arabic_02.png";
+        File file = new File(imgPath);
+        // First sentence
+        String expected = "اللغة العربية";
+
+        // correct result with specified arabic language
+        Assert.assertEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("ara"), CAIRO_FONT_PATH));
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+        Assert.assertNotEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("eng"), CAIRO_FONT_PATH));
+        Assert.assertNotEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("spa"), CAIRO_FONT_PATH));
+        Assert.assertNotEquals(expected, getTextFromPdf(tesseractReader, file,
+                new ArrayList<String>(), CAIRO_FONT_PATH));
+    }
+
+    @Test
+    public void compareMultiLangImage() throws InterruptedException, java.io.IOException {
+        String testName = "compareMultiLangImage";
+        String filename = "multilang";
+        String expectedPdfPath = TEST_DOCUMENTS_DIRECTORY + filename + "_" + testFileTypeName + ".pdf";
+        String resultPdfPath = getTargetDirectory() + filename + "_" + testName + "_" + testFileTypeName + ".pdf";
+
+        try {
+            Tesseract4OcrEngineProperties properties =
+                    tesseractReader.getTesseract4OcrEngineProperties();
+            properties.setTextPositioning(TextPositioning.BY_WORDS);
+            properties.setPathToTessData(getTessDataDirectory());
+            properties.setPageSegMode(3);
+            tesseractReader.setTesseract4OcrEngineProperties(properties);
+            doOcrAndSavePdfToPath(tesseractReader,
+                    TEST_IMAGES_DIRECTORY + filename + ".jpg", resultPdfPath,
+                    Arrays.asList("eng", "deu", "spa"), DeviceCmyk.BLACK);
+
+            Assert.assertNull(new CompareTool().compareByContent(resultPdfPath, expectedPdfPath,
+                    TEST_DOCUMENTS_DIRECTORY, "diff_"));
+        } finally {
+            Assert.assertEquals(TextPositioning.BY_WORDS,
+                    tesseractReader.getTesseract4OcrEngineProperties().getTextPositioning());
+            Assert.assertEquals(3, tesseractReader
+                    .getTesseract4OcrEngineProperties().getPageSegMode().intValue());
+        }
+    }
+
+    @LogMessages(messages = {
+        @LogMessage(messageTemplate = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, count = 12)
+    })
+    @Test
+    public void testHindiTextWithUrdu() throws IOException {
+        String testName = "testHindiTextWithUrdu";
+        String imgPath = TEST_IMAGES_DIRECTORY + "hindi_01.jpg";
+        File file = new File(imgPath);
+        String pdfPath = getTargetDirectory() + testName + ".pdf";
+
+        String expectedHindi = "हिन्दुस्तानी";
+        String expectedUrdu = "وتالی";
+
+        doOcrAndSavePdfToPath(tesseractReader, file.getAbsolutePath(),
+                pdfPath, Arrays.asList("hin", "urd"),
+                Collections.singletonList(CAIRO_FONT_PATH));
+
+        String resultWithoutActualText = getTextFromPdfLayer(pdfPath, null, 1);
+        // because of provided font only urdu will be displayed correctly
+        Assert.assertTrue(resultWithoutActualText.contains(expectedUrdu));
+        Assert.assertFalse(resultWithoutActualText.contains(expectedHindi));
+
+        String resultWithActualText = getTextFromPdfLayerUsingActualText(pdfPath, null, 1);
+        // actual text should contain all text
+        Assert.assertTrue(resultWithActualText.contains(expectedUrdu));
+        Assert.assertTrue(resultWithActualText.contains(expectedHindi));
+    }
+
+    @LogMessages(messages = {
+        @LogMessage(messageTemplate = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER)
+    }, ignore = true)
+    @Test
+    public void testHindiTextWithUrduActualTextWithIncorrectFont() throws IOException {
+        String testName = "testHindiTextWithUrduActualTextWithIncorrectFont";
+        String imgPath = TEST_IMAGES_DIRECTORY + "hindi_01.jpg";
+        File file = new File(imgPath);
+        String pdfPath = getTargetDirectory() + testName + ".pdf";
+
+        String expectedHindi = "हिन्दुस्तानी";
+        String expectedUrdu = "وتالی";
+
+        doOcrAndSavePdfToPath(tesseractReader, file.getAbsolutePath(),
+                pdfPath, Arrays.asList("hin", "urd"), null, null);
+
+        String resultWithoutActualText = getTextFromPdfLayer(pdfPath, null, 1);
+        // no font is passed here, so neither urdu nor hindi is expected in the plain layer text
+        Assert.assertFalse(resultWithoutActualText.contains(expectedUrdu));
+        Assert.assertFalse(resultWithoutActualText.contains(expectedHindi));
+
+        String resultWithActualText = getTextFromPdfLayerUsingActualText(pdfPath, null, 1);
+        // actual text should contain all text
+        Assert.assertTrue(resultWithActualText.contains(expectedUrdu));
+        Assert.assertTrue(resultWithActualText.contains(expectedHindi));
+    }
+
+    @Test
+    public void testHindiTextWithEng() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "hindi_02.jpg";
+        File file = new File(imgPath);
+
+        String expected = "मानक हनिदी\nHindi";
+
+        // correct result with specified hindi+english languages
+        Assert.assertEquals(expected, getTextFromPdf(tesseractReader, file,
+                Arrays.asList("hin", "eng"), NOTO_SANS_FONT_PATH));
+
+        // incorrect result without specified english language
+        Assert.assertNotEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("hin"), NOTO_SANS_FONT_PATH));
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+        Assert.assertNotEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("eng"), NOTO_SANS_FONT_PATH));
+        Assert.assertNotEquals(expected, getTextFromPdf(tesseractReader, file));
+        Assert.assertNotEquals(expected, getTextFromPdf(tesseractReader, file,
+                new ArrayList<String>(), NOTO_SANS_FONT_PATH));
+    }
+
+    @Test
+    public void testGeorgianText() throws IOException {
+        String imgPath = TEST_IMAGES_DIRECTORY + "georgian_01.jpg";
+        File file = new File(imgPath);
+        // First sentence
+        String expected = "ღმერთი";
+
+        String result = getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("kat"), FREE_SANS_FONT_PATH);
+        // correct result with specified georgian language
+        Assert.assertEquals(expected, result);
+        result = getTextFromPdf(tesseractReader, file,
+                Arrays.asList("kat", "kat_old"), FREE_SANS_FONT_PATH);
+        Assert.assertEquals(expected, result);
+    }
+
+    @LogMessages(messages = {
+        @LogMessage(messageTemplate = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, count = 6)
+    })
+    @Test
+    public void testGeorgianActualTextWithDefaultFont() throws IOException {
+        String testName = "testGeorgianActualTextWithDefaultFont";
+        String pdfPath = getTargetDirectory() + testName + ".pdf";
+        String imgPath = TEST_IMAGES_DIRECTORY + "georgian_01.jpg";
+        File file = new File(imgPath);
+        // First sentence
+        String expected = "ღმერთი";
+
+        doOcrAndSavePdfToPath(tesseractReader, file.getAbsolutePath(),
+                pdfPath, Collections.singletonList("kat"), null, null);
+
+        String resultWithoutActualText = getTextFromPdfLayer(pdfPath, null, 1);
+        Assert.assertNotEquals(expected, resultWithoutActualText);
+
+        String resultWithActualText = getTextFromPdfLayerUsingActualText(pdfPath, null, 1);
+        Assert.assertEquals(expected, resultWithActualText);
+    }
+
+    @Test
+    public void testBengali() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "bengali_01.jpeg";
+        File file = new File(imgPath);
+        String expected = "ইংরজে\nশখো";
+
+        tesseractReader.setTesseract4OcrEngineProperties(
+                tesseractReader.getTesseract4OcrEngineProperties()
+                        .setTextPositioning(TextPositioning.BY_WORDS));
+        // correct result with specified bengali language
+        String result = getTextFromPdf(tesseractReader, file, 1,
+                Collections.singletonList("ben"),
+                Arrays.asList(FREE_SANS_FONT_PATH, KOSUGI_FONT_PATH));
+        Assert.assertEquals(expected, result);
+
+        Assert.assertEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("ben"), FREE_SANS_FONT_PATH));
+    }
+
+    @LogMessages(messages = {
+        @LogMessage(messageTemplate = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, count = 8)
+    })
+    @Test
+    public void testBengaliActualTextWithDefaultFont() throws IOException {
+        String testName = "testBengaliActualTextWithDefaultFont";
+        String pdfPath = getTargetDirectory() + testName + ".pdf";
+        String imgPath = TEST_IMAGES_DIRECTORY + "bengali_01.jpeg";
+        File file = new File(imgPath);
+        String expected = "ইংরজে\nশখো";
+
+        tesseractReader.setTesseract4OcrEngineProperties(
+                tesseractReader.getTesseract4OcrEngineProperties()
+                        .setTextPositioning(TextPositioning.BY_WORDS));
+
+        doOcrAndSavePdfToPath(tesseractReader, file.getAbsolutePath(),
+                pdfPath, Collections.singletonList("ben"), null, null);
+
+        String resultWithoutActualText = getTextFromPdfLayer(pdfPath, null, 1);
+        Assert.assertNotEquals(expected, resultWithoutActualText);
+
+        String resultWithActualText = getTextFromPdfLayerUsingActualText(pdfPath, null, 1);
+        Assert.assertEquals(expected, resultWithActualText);
+    }
+
+    @LogMessages(messages = {
+        @LogMessage(messageTemplate = PdfOcrLogMessageConstant.COULD_NOT_FIND_CORRESPONDING_GLYPH_TO_UNICODE_CHARACTER, count = 6)
+    })
+    @Test
+    public void testChinese() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "chinese_01.jpg";
+        File file = new File(imgPath);
+        String expected = "你 好\nni hao";
+
+        // correct result with specified chinese languages
+        Assert.assertEquals(expected, getTextFromPdf(tesseractReader, file,
+                Arrays.asList("chi_sim", "chi_tra"),
+                NOTO_SANS_SC_FONT_PATH));
+        Assert.assertEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("chi_sim"),
+                NOTO_SANS_SC_FONT_PATH));
+        Assert.assertEquals(expected, getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("chi_tra"),
+                NOTO_SANS_SC_FONT_PATH));
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+        Assert.assertNotEquals(expected,
+                getTextFromPdf(tesseractReader, file,
+                        Collections.singletonList("chi_sim")));
+        Assert.assertNotEquals(expected,
+                getTextFromPdf(tesseractReader, file, Collections.singletonList("chi_tra")));
+        Assert.assertNotEquals(expected,
+                getTextFromPdf(tesseractReader, file, Arrays.asList("chi_sim", "chi_tra")));
+        Assert.assertFalse(getTextFromPdf(tesseractReader, file, new ArrayList<String>())
+                .contains(expected));
+    }
+
+    @Test
+    public void testSpanishWithTessData() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "spanish_01.jpg";
+        File file = new File(imgPath);
+        String expectedSpanish = "Aquí\nhablamos\nespañol";
+
+        // correct result with specified spanish language
+        Assert.assertEquals(expectedSpanish,
+                getTextFromPdf(tesseractReader, file, Collections.singletonList("spa")));
+        Assert.assertEquals(expectedSpanish,
+                getTextFromPdf(tesseractReader, file, Arrays.asList("spa", "eng")));
+        Assert.assertEquals(expectedSpanish,
+                getTextFromPdf(tesseractReader, file, Arrays.asList("eng", "spa")));
+
+        // incorrect result when languages are not specified
+        // or languages were specified in the wrong order
+        Assert.assertNotEquals(expectedSpanish,
+                getTextFromPdf(tesseractReader, file, Collections.singletonList("eng")));
+        Assert.assertNotEquals(expectedSpanish,
+                getTextFromPdf(tesseractReader, file, new ArrayList<String>()));
+    }
+
+    @Test
+    public void testBengaliScript() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "bengali_01.jpeg";
+        File file = new File(imgPath);
+        String expected = "ইংরজে";
+
+        tesseractReader.setTesseract4OcrEngineProperties(
+                tesseractReader.getTesseract4OcrEngineProperties()
+                        .setPathToTessData(new File(SCRIPT_TESS_DATA_DIRECTORY)));
+        // correct result with specified bengali script
+        Assert.assertTrue(getTextFromPdf(tesseractReader, file, 1,
+                Collections.singletonList("Bengali"),
+                Arrays.asList(FREE_SANS_FONT_PATH, KOSUGI_FONT_PATH))
+                .startsWith(expected));
+    }
+
+    @Test
+    public void testGeorgianTextWithScript() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "georgian_01.jpg";
+        File file = new File(imgPath);
+        // First sentence
+        String expected = "ღმერთი";
+
+        tesseractReader.setTesseract4OcrEngineProperties(
+                tesseractReader.getTesseract4OcrEngineProperties()
+                        .setPathToTessData(new File(SCRIPT_TESS_DATA_DIRECTORY)));
+        // correct result with specified georgian script
+        Assert.assertTrue(getTextFromPdf(tesseractReader, file,
+                Collections.singletonList("Georgian"),
+                FREE_SANS_FONT_PATH)
+                .startsWith(expected));
+    }
+
+    @Test
+    public void testJapaneseScript() {
+        String imgPath = TEST_IMAGES_DIRECTORY + "japanese_01.png";
+        File file = new File(imgPath);
+        String expected = "日 本 語\n文法";
+
+        tesseractReader.setTesseract4OcrEngineProperties(
+                tesseractReader.getTesseract4OcrEngineProperties()
+                        .setPathToTessData(new File(SCRIPT_TESS_DATA_DIRECTORY)));
+        // correct result with specified japanese language
+        String result = getTextFromPdf(tesseractReader, file,
+                Arrays.asList("Japanese"), KOSUGI_FONT_PATH);
+        Assert.assertEquals(expected, result);
+    }
+
+    /**
+     * Runs OCR on the given image, saves the recognized text to a .txt file in the target directory and compares that file with the expected one.
+     */
+    private boolean doOcrAndCompareTxtFiles(AbstractTesseract4OcrEngine tesseractReader,
+            String imgPath, String expectedPath, List<String> languages) {
+        String resultTxtFile = getTargetDirectory() + getImageName(imgPath, languages) + ".txt";
+        doOcrAndSaveToTextFile(tesseractReader, imgPath, resultTxtFile, languages);
+        return compareTxtFiles(expectedPath, resultTxtFile);
+    }
+
+    /**
+     * Compares two text files line by line after removing form feeds and every character other than tab, LF, CR and printable ASCII; returns false if the line counts differ or a file cannot be read.
+     */
+    private boolean compareTxtFiles(String expectedFilePath, String resultFilePath) {
+        boolean areEqual = true;
+        try {
+            List<String> expected = Files.readAllLines(java.nio.file.Paths.get(expectedFilePath));
+            List<String> result = Files.readAllLines(java.nio.file.Paths.get(resultFilePath));
+
+            if (expected.size() != result.size()) {
+                return false;
+            }
+
+            for (int i = 0; i < expected.size(); i++) {
+                String exp = expected.get(i)
+                        .replace("\n", "")
+                        .replace("\f", "");
+                exp = exp.replaceAll("[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", "");
+                String res = result.get(i)
+                        .replace("\n", "")
+                        .replace("\f", "");
+                res = res.replaceAll("[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", "");
+                if (expected.get(i) == null || result.get(i) == null) {
+                    areEqual = false;
+                    break;
+                } else if (!exp.equals(res)) {
+                    areEqual = false;
+                    break;
+                }
+            }
+        } catch (IOException e) {
+            areEqual = false;
+            LOGGER.error(e.getMessage());
+        }
+
+        return areEqual;
+    }
+}
diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java
new file mode 100644
index 0000000..1f43e3e
--- /dev/null
+++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ApiTest.java
@@ -0,0 +1,178 @@
+/*
+    This file is part of the iText (R) project.
+    Copyright (c) 1998-2020 iText Group NV
+    Authors: iText Software.
+
+    This program is offered under a commercial and under the AGPL license.
+    For commercial licensing, contact us at https://itextpdf.com/sales.
For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; + +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.tess4j.TesseractException; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public class ApiTest extends IntegrationTestHelper { + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET) + }) + @Test + public void testDefaultTessDataPathValidationForLib() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET); + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File imgFile = new File(path); + + Tesseract4LibOcrEngine engine = + new Tesseract4LibOcrEngine(new Tesseract4OcrEngineProperties()); + engine.doImageOcr(imgFile); + } + + 
@LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET) + }) + @Test + public void testDefaultTessDataPathValidationForExecutable() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(Tesseract4OcrException.PATH_TO_TESS_DATA_IS_NOT_SET); + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File imgFile = new File(path); + + Tesseract4ExecutableOcrEngine engine = + new Tesseract4ExecutableOcrEngine(getTesseractDirectory(), + new Tesseract4OcrEngineProperties()); + engine.doImageOcr(imgFile); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE) + }) + @Test + public void testDoTesseractOcrForIncorrectImageForExecutable() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil.format( + Tesseract4OcrException.CANNOT_READ_PROVIDED_IMAGE, + new File(TEST_IMAGES_DIRECTORY + "numbers_01") + .getAbsolutePath())); + String path = TEST_IMAGES_DIRECTORY + "numbers_01"; + File imgFile = new File(path); + + Tesseract4ExecutableOcrEngine engine = + new Tesseract4ExecutableOcrEngine(getTesseractDirectory(), + new Tesseract4OcrEngineProperties() + .setPathToTessData(getTessDataDirectory())); + engine.doTesseractOcr(imgFile, null, OutputFormat.HOCR); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.TESSERACT_FAILED) + }) + @Test + public void testOcrResultForSinglePageForNullImage() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(Tesseract4OcrException.TESSERACT_FAILED); + Tesseract4LibOcrEngine tesseract4LibOcrEngine = getTesseract4LibOcrEngine(); + tesseract4LibOcrEngine.setTesseract4OcrEngineProperties( + new Tesseract4OcrEngineProperties() + 
.setPathToTessData(getTessDataDirectory())); + tesseract4LibOcrEngine.initializeTesseract(OutputFormat.TXT); + tesseract4LibOcrEngine.doTesseractOcr(null, null, OutputFormat.HOCR); + } + + @Test + public void testDoTesseractOcrForNonAsciiPathForExecutable() { + String path = TEST_IMAGES_DIRECTORY + "tèst/noisy_01.png"; + File imgFile = new File(path); + File outputFile = new File(TesseractOcrUtil.getTempFilePath("test", + ".hocr")); + + Tesseract4OcrEngineProperties properties = new Tesseract4OcrEngineProperties(); + properties.setPathToTessData(getTessDataDirectory()); + properties.setPreprocessingImages(false); + Tesseract4ExecutableOcrEngine engine = + new Tesseract4ExecutableOcrEngine(getTesseractDirectory(), + properties); + engine.doTesseractOcr(imgFile, outputFile, OutputFormat.HOCR); + Assert.assertTrue(Files.exists(Paths.get(outputFile.getAbsolutePath()))); + TesseractHelper.deleteFile(outputFile.getAbsolutePath()); + Assert.assertFalse(Files.exists(Paths.get(outputFile.getAbsolutePath()))); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND), + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED) + }, ignore = true) + @Test + public void testDoTesseractOcrForExecutableForWin() { + junitExpectedException.expect(Tesseract4OcrException.class); + testSettingOsName("win"); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_READ_INPUT_IMAGE), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_FAILED), + @LogMessage(messageTemplate = Tesseract4OcrException.TESSERACT_NOT_FOUND), + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.COMMAND_FAILED) + }, ignore = true) + @Test + public void testDoTesseractOcrForExecutableForLinux() { + 
junitExpectedException.expect(Tesseract4OcrException.class); + testSettingOsName("linux"); + } + + private void testSettingOsName(String osName) { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + File imgFile = new File(path); + + String tesseractDirectory = getTesseractDirectory(); + String osPropertyName = System.getProperty("os.name") == null ? "OS" : "os.name"; + String os = System.getProperty(osPropertyName); + System.setProperty(osPropertyName, osName); + + try { + Tesseract4OcrEngineProperties properties = new Tesseract4OcrEngineProperties(); + properties.setPathToTessData(getTessDataDirectory()); + Tesseract4ExecutableOcrEngine engine = + new Tesseract4ExecutableOcrEngine(tesseractDirectory, + properties); + + engine.doTesseractOcr(imgFile, null, OutputFormat.HOCR); + } finally { + System.setProperty(osPropertyName, os); + } + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtilTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtilTest.java new file mode 100644 index 0000000..5309494 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtilTest.java @@ -0,0 +1,52 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.pdfocr.IntegrationTestHelper; + +import java.io.File; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +public class ImagePreprocessingUtilTest extends IntegrationTestHelper{ + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + @Test + public void testCheckForInvalidTiff() { + String path = TEST_IMAGES_DIRECTORY + "example_03_10MB"; + File imgFile = new File(path); + Assert.assertFalse(ImagePreprocessingUtil.isTiffImage(imgFile)); + } + + @Test + public void testReadingInvalidImagePath() { + junitExpectedException.expect(Tesseract4OcrException.class); + String path = TEST_IMAGES_DIRECTORY + "numbers_02"; + File imgFile = new File(path); + ImagePreprocessingUtil.preprocessImage(imgFile, 1); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtilTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtilTest.java new file mode 100644 index 0000000..1287719 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtilTest.java @@ -0,0 +1,195 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.pdfocr.IntegrationTestHelper; +import com.itextpdf.test.annotations.LogMessage; +import com.itextpdf.test.annotations.LogMessages; + +import java.awt.image.BufferedImage; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import javax.imageio.ImageIO; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.tess4j.TesseractException; +import org.junit.Assert; +import org.junit.Test; + +public class TesseractOcrUtilTest extends IntegrationTestHelper { + + @Test + public void testTesseract4OcrForPix() + throws TesseractException, IOException { + String path = TEST_IMAGES_DIRECTORY + "numbers_02.jpg"; + String expected = "0123456789"; + File imgFile = new File(path); + + Pix pix = ImagePreprocessingUtil.readPix(imgFile); + Tesseract4LibOcrEngine tesseract4LibOcrEngine = getTesseract4LibOcrEngine(); + tesseract4LibOcrEngine.setTesseract4OcrEngineProperties( + new Tesseract4OcrEngineProperties() + .setPathToTessData(getTessDataDirectory())); + tesseract4LibOcrEngine.initializeTesseract(OutputFormat.TXT); + + String result = new TesseractOcrUtil().getOcrResultAsString( + tesseract4LibOcrEngine.getTesseractInstance(), + pix, OutputFormat.TXT); + 
Assert.assertTrue(result.contains(expected)); + } + + @Test + public void testGetOcrResultAsStringForFile() + throws TesseractException { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + String expected = "619121"; + File imgFile = new File(path); + + Tesseract4LibOcrEngine tesseract4LibOcrEngine = getTesseract4LibOcrEngine(); + tesseract4LibOcrEngine.setTesseract4OcrEngineProperties( + new Tesseract4OcrEngineProperties() + .setPathToTessData(getTessDataDirectory())); + tesseract4LibOcrEngine.initializeTesseract(OutputFormat.TXT); + + String result = new TesseractOcrUtil().getOcrResultAsString( + tesseract4LibOcrEngine.getTesseractInstance(), + imgFile, OutputFormat.TXT); + Assert.assertTrue(result.contains(expected)); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.PAGE_NUMBER_IS_INCORRECT) + }) + @Test + public void testReadingSecondPageFromOnePageTiff() { + String path = TEST_IMAGES_DIRECTORY + "example_03_10MB.tiff"; + File imgFile = new File(path); + Pix page = TesseractOcrUtil.readPixPageFromTiff(imgFile, 2); + Assert.assertNull(page); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_RETRIEVE_PAGES_FROM_IMAGE) + }) + @Test + public void testReadingPageFromInvalidTiff() { + String path = TEST_IMAGES_DIRECTORY + "example_03.tiff"; + File imgFile = new File(path); + Pix page = TesseractOcrUtil.readPixPageFromTiff(imgFile, 0); + Assert.assertNull(page); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_RETRIEVE_PAGES_FROM_IMAGE) + }) + @Test + public void testInitializeImagesListFromInvalidTiff() { + String path = TEST_IMAGES_DIRECTORY + "example_03.tiff"; + File imgFile = new File(path); + TesseractOcrUtil tesseractOcrUtil = new TesseractOcrUtil(); + tesseractOcrUtil.initializeImagesListFromTiff(imgFile); + Assert.assertEquals(0, tesseractOcrUtil.getListOfPages().size()); + } + + @Test + public void 
testPreprocessingConditions() throws IOException { + Pix pix = null; + Assert.assertNull(TesseractOcrUtil.convertToGrayscale(pix)); + Assert.assertNull(TesseractOcrUtil.otsuImageThresholding(pix)); + Assert.assertNull(TesseractOcrUtil.convertPixToImage(pix)); + TesseractOcrUtil.destroyPix(pix); + } + + @Test + public void testOcrResultConditions() throws IOException, + TesseractException { + Tesseract4LibOcrEngine tesseract4LibOcrEngine = getTesseract4LibOcrEngine(); + tesseract4LibOcrEngine.setTesseract4OcrEngineProperties( + new Tesseract4OcrEngineProperties() + .setPathToTessData(getTessDataDirectory())); + tesseract4LibOcrEngine.initializeTesseract(OutputFormat.HOCR); + + Pix pix = null; + Assert.assertNull(new TesseractOcrUtil() + .getOcrResultAsString( + tesseract4LibOcrEngine.getTesseractInstance(), + pix, OutputFormat.HOCR)); + File file = null; + Assert.assertNull(new TesseractOcrUtil() + .getOcrResultAsString( + tesseract4LibOcrEngine.getTesseractInstance(), + file, OutputFormat.HOCR)); + BufferedImage bi = null; + Assert.assertNull(new TesseractOcrUtil() + .getOcrResultAsString( + tesseract4LibOcrEngine.getTesseractInstance(), + bi, OutputFormat.HOCR)); + } + + @Test + public void testImageSavingAsPng() throws IOException { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + String tmpFileName = getTargetDirectory() + "testImageSavingAsPng.png"; + Assert.assertFalse(Files.exists(Paths.get(tmpFileName))); + BufferedImage bi = ImageIO.read(new FileInputStream(path)); + TesseractOcrUtil.saveImageToTempPngFile(tmpFileName, bi); + Assert.assertTrue(Files.exists(Paths.get(tmpFileName))); + TesseractHelper.deleteFile(tmpFileName); + Assert.assertFalse(Files.exists(Paths.get(tmpFileName))); + } + + @Test + public void testNullSavingAsPng() { + String tmpFileName = TesseractOcrUtil.getTempFilePath( + getTargetDirectory() + "/testNullSavingAsPng", ".png"); + TesseractOcrUtil.saveImageToTempPngFile(tmpFileName, null); + 
Assert.assertFalse(Files.exists(Paths.get(tmpFileName))); + + TesseractOcrUtil.savePixToTempPngFile(tmpFileName, null); + Assert.assertFalse(Files.exists(Paths.get(tmpFileName))); + } + + @Test + public void testPixSavingAsPng() { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + String tmpFileName = getTargetDirectory() + "testPixSavingAsPng.png"; + Assert.assertFalse(Files.exists(Paths.get(tmpFileName))); + Pix pix = ImagePreprocessingUtil.readPix(new File(path)); + TesseractOcrUtil.savePixToTempPngFile(tmpFileName, pix); + Assert.assertTrue(Files.exists(Paths.get(tmpFileName))); + TesseractHelper.deleteFile(tmpFileName); + Assert.assertFalse(Files.exists(Paths.get(tmpFileName))); + } + + @LogMessages(messages = { + @LogMessage(messageTemplate = Tesseract4LogMessageConstant.CANNOT_PROCESS_IMAGE) + }) + @Test + public void testImageSavingAsPngWithError() throws IOException { + String path = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + BufferedImage bi = ImageIO.read(new FileInputStream(path)); + TesseractOcrUtil.saveImageToTempPngFile(null, bi); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsExecutableTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsExecutableTest.java new file mode 100644 index 0000000..03b4daf --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsExecutableTest.java @@ -0,0 +1,33 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. 
+ + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.test.annotations.type.IntegrationTest; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class UserWordsExecutableTest extends UserWordsTest { + public UserWordsExecutableTest() { + super(ReaderType.EXECUTABLE); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsLibTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsLibTest.java new file mode 100644 index 0000000..c0b4541 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsLibTest.java @@ -0,0 +1,33 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.test.annotations.type.IntegrationTest; +import org.junit.experimental.categories.Category; + +@Category(IntegrationTest.class) +public class UserWordsLibTest extends UserWordsTest { + public UserWordsLibTest() { + super(ReaderType.LIB); + } +} diff --git a/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsTest.java b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsTest.java new file mode 100644 index 0000000..8f222e6 --- /dev/null +++ b/pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/UserWordsTest.java @@ -0,0 +1,157 @@ +/* + This file is part of the iText (R) project. + Copyright (c) 1998-2020 iText Group NV + Authors: iText Software. + + This program is offered under a commercial and under the AGPL license. + For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + + AGPL licensing: + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . 
+ */ +package com.itextpdf.pdfocr.tesseract4; + +import com.itextpdf.io.util.MessageFormatUtil; +import com.itextpdf.pdfocr.IntegrationTestHelper; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public abstract class UserWordsTest extends IntegrationTestHelper { + + @Rule + public ExpectedException junitExpectedException = ExpectedException.none(); + + AbstractTesseract4OcrEngine tesseractReader; + String testFileTypeName; + private boolean isExecutableReaderType; + + public UserWordsTest(ReaderType type) { + isExecutableReaderType = type.equals(ReaderType.EXECUTABLE); + if (isExecutableReaderType) { + testFileTypeName = "executable"; + } else { + testFileTypeName = "lib"; + } + tesseractReader = getTesseractReader(type); + } + + @Before + public void initTesseractProperties() { + Tesseract4OcrEngineProperties ocrEngineProperties = + new Tesseract4OcrEngineProperties(); + ocrEngineProperties.setPathToTessData(getTessDataDirectory()); + tesseractReader.setTesseract4OcrEngineProperties(ocrEngineProperties); + } + + @Test + public void testCustomUserWords() { + String imgPath = TEST_IMAGES_DIRECTORY + "wierdwords.png"; + List userWords = Arrays.asList("he23llo", "qwetyrtyqpwe-rty"); + + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setLanguages(Arrays.asList("fra")); + properties.setUserWords("fra", userWords); + tesseractReader.setTesseract4OcrEngineProperties(properties); + String result = getRecognizedTextFromTextFile(tesseractReader, imgPath); + Assert.assertTrue(result.contains(userWords.get(0)) + || result.contains(userWords.get(1))); + + Assert.assertTrue(tesseractReader.getTesseract4OcrEngineProperties() + 
.getPathToUserWordsFile().endsWith(".user-words")); + } + + @Test + public void testCustomUserWordsWithListOfLanguages() { + String imgPath = TEST_IMAGES_DIRECTORY + "bogusText.jpg"; + String expectedOutput = "B1adeb1ab1a"; + + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setLanguages(Arrays.asList("fra", "eng")); + properties.setUserWords("eng", Arrays.asList("b1adeb1ab1a")); + tesseractReader.setTesseract4OcrEngineProperties(properties); + + String result = getRecognizedTextFromTextFile(tesseractReader, imgPath); + result = result.replace("\n", "").replace("\f", ""); + result = result.replaceAll("[^\\u0009\\u000A\\u000D\\u0020-\\u007E]", ""); + Assert.assertTrue(result.startsWith(expectedOutput)); + + Assert.assertTrue(tesseractReader.getTesseract4OcrEngineProperties() + .getPathToUserWordsFile().endsWith(".user-words")); + } + + @Test + public void testUserWordsWithLanguageNotInList() throws FileNotFoundException { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST, + "spa")); + String userWords = TEST_DOCUMENTS_DIRECTORY + "userwords.txt"; + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setUserWords("spa", new FileInputStream(userWords)); + properties.setLanguages(new ArrayList()); + } + + @Test + public void testIncorrectLanguageForUserWordsAsList() { + junitExpectedException.expect(Tesseract4OcrException.class); + junitExpectedException.expectMessage(MessageFormatUtil + .format(Tesseract4OcrException.LANGUAGE_IS_NOT_IN_THE_LIST, + "eng1")); + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setUserWords("eng1", Arrays.asList("word1", "word2")); + properties.setLanguages(new ArrayList()); + } + + @Test + public void 
testUserWordsWithDefaultLanguageNotInList() + throws FileNotFoundException { + String userWords = TEST_DOCUMENTS_DIRECTORY + "userwords.txt"; + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setUserWords("eng", new FileInputStream(userWords)); + properties.setLanguages(new ArrayList()); + tesseractReader.setTesseract4OcrEngineProperties(properties); + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + String expectedOutput = "619121"; + String result = getRecognizedTextFromTextFile(tesseractReader, imgPath); + Assert.assertTrue(result.startsWith(expectedOutput)); + } + + @Test + public void testUserWordsFileNotDeleted() { + String userWords = TEST_DOCUMENTS_DIRECTORY + "userwords.txt"; + Tesseract4OcrEngineProperties properties = + tesseractReader.getTesseract4OcrEngineProperties(); + properties.setPathToUserWordsFile(userWords); + properties.setLanguages(Arrays.asList("eng")); + tesseractReader.setTesseract4OcrEngineProperties(properties); + String imgPath = TEST_IMAGES_DIRECTORY + "numbers_01.jpg"; + tesseractReader.doImageOcr(new File(imgPath)); + Assert.assertTrue(new File(userWords).exists()); + } +} diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/englishText_executable.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/englishText_executable.pdf new file mode 100644 index 0000000..5d706d0 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/englishText_executable.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/englishText_lib.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/englishText_lib.pdf new file mode 100644 index 0000000..004e5ef Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/englishText_lib.pdf differ diff --git 
a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_01_executable.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_01_executable.pdf new file mode 100644 index 0000000..d08a201 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_01_executable.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_01_lib.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_01_lib.pdf new file mode 100644 index 0000000..657e872 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_01_lib.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_02.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_02.pdf new file mode 100644 index 0000000..1a567f2 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/example_02.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/german_01executable.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/german_01executable.txt new file mode 100644 index 0000000..c15d2a9 --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/german_01executable.txt @@ -0,0 +1,4 @@ +Das Geheimnis +des Könnens +liegt im Wollen. + diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/german_01lib.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/german_01lib.txt new file mode 100644 index 0000000..72180eb --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/german_01lib.txt @@ -0,0 +1,3 @@ +Das Geheimnis +des Könnens +liegt im Wollen. 
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multilang_executable.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multilang_executable.pdf new file mode 100644 index 0000000..4441b9e Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multilang_executable.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multilang_lib.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multilang_lib.pdf new file mode 100644 index 0000000..7fa0333 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multilang_lib.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_executable.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_executable.pdf new file mode 100644 index 0000000..35b6538 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_executable.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_executable.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_executable.txt new file mode 100644 index 0000000..b73a392 --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_executable.txt @@ -0,0 +1,37 @@ +Multipage +TIFF +Example +Page 1 + Multipage +TIFF +Example +Page 2 + Multipage +TIFF +Example +Page 3 + Multipage +TIFF +Example +Page 4 + Multipage +TIFF +Example +Page 5 + Multipage +TIFF +Example +Page 6 + Multipage +TIFF +Example +Page / + Multipage +TIFF +Example +Page 8 + Multipage +TIFF +Example +Page 9 + \ No newline at end of file diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_lib.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_lib.pdf new 
file mode 100644 index 0000000..1a726fe Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_lib.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_lib.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_lib.txt new file mode 100644 index 0000000..61b7bbd --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/multipage_lib.txt @@ -0,0 +1,36 @@ +Multipage +TIFF +Example +Page 1 +Multipage +TIFF +Example +Page 2 +Multipage +TIFF +Example +Page 3 +Multipage +TIFF +Example +Page 4 +Multipage +TIFF +Example +Page 5 +Multipage +TIFF +Example +Page 6 +Multipage +TIFF +Example +Page / +Multipage +TIFF +Example +Page 8 +Multipage +TIFF +Example +Page 9 diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01.pdf new file mode 100644 index 0000000..882a45d Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_a3u.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_a3u.pdf new file mode 100644 index 0000000..7657a62 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_a3u.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareJpe_executable.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareJpe_executable.pdf new file mode 100644 index 0000000..0bbd75b Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareJpe_executable.pdf differ diff --git 
a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareJpe_lib.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareJpe_lib.pdf new file mode 100644 index 0000000..cafdbf5 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareJpe_lib.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareTif_executable.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareTif_executable.pdf new file mode 100644 index 0000000..f74a7df Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareTif_executable.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareTif_lib.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareTif_lib.pdf new file mode 100644 index 0000000..ea13a39 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_01_compareTif_lib.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_02_compareJpg_executable.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_02_compareJpg_executable.pdf new file mode 100644 index 0000000..c86db65 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_02_compareJpg_executable.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_02_compareJpg_lib.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_02_compareJpg_lib.pdf new file mode 100644 index 0000000..867a2f5 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/numbers_02_compareJpg_lib.pdf differ diff --git 
a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/spanish_01_a3u.pdf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/spanish_01_a3u.pdf new file mode 100644 index 0000000..68c24cf Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/spanish_01_a3u.pdf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/userwords.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/userwords.txt new file mode 100644 index 0000000..c91b117 --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/documents/userwords.txt @@ -0,0 +1,7 @@ +Items +hello +included +vat +cash +change +word diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/events/multithreading/numbers_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/events/multithreading/numbers_01.jpg new file mode 100644 index 0000000..f384caa Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/events/multithreading/numbers_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/Cairo-Regular.ttf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/Cairo-Regular.ttf new file mode 100644 index 0000000..f0920bd Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/Cairo-Regular.ttf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/FreeSans.ttf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/FreeSans.ttf new file mode 100644 index 0000000..2072cda Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/FreeSans.ttf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/Kosugi-Regular.ttf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/Kosugi-Regular.ttf new file mode 100644 index 0000000..67ee170 Binary 
files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/Kosugi-Regular.ttf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_APACHE.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_APACHE.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_APACHE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_GNU.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_GNU.txt new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_GNU.txt @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>. 
diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_OFL.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_OFL.txt new file mode 100644 index 0000000..77b1731 --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/LICENSE_OFL.txt @@ -0,0 +1,91 @@ +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +http://scripts.sil.org/OFL + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). 
+ +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. 
The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NOTICE.txt b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NOTICE.txt new file mode 100644 index 0000000..d340343 --- /dev/null +++ b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NOTICE.txt @@ -0,0 +1,7 @@ +Please note that the following fonts are used under the licenses listed below. + +* Cairo-Regular - SIL Open Font License, Version 1.1 +* FreeSans - GNU General Public License (GPL); the license text is available at https://www.gnu.org/licenses +* Kosugi-Regular - Apache License, Version 2.0. 
+* NotoSans-Regular - SIL Open Font License, Version 1.1 +* NotoSansSC-Regular - SIL Open Font License, Version 1.1 diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSans-Regular.ttf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSans-Regular.ttf new file mode 100644 index 0000000..10589e2 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSans-Regular.ttf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansSC-Regular.otf b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansSC-Regular.otf new file mode 100644 index 0000000..ee1e304 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/fonts/NotoSansSC-Regular.otf differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/arabic_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/arabic_01.jpg new file mode 100644 index 0000000..c41788f Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/arabic_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/arabic_02.png b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/arabic_02.png new file mode 100644 index 0000000..7b35925 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/arabic_02.png differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/bengali_01.jpeg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/bengali_01.jpeg new file mode 100644 index 0000000..d769282 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/bengali_01.jpeg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/bogusText.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/bogusText.jpg new file mode 
100644 index 0000000..7ee7f8a Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/bogusText.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/chinese_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/chinese_01.jpg new file mode 100644 index 0000000..ea1cb26 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/chinese_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/corrupted.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/corrupted.jpg new file mode 100644 index 0000000..2c0d56a Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/corrupted.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/englishText.bmp b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/englishText.bmp new file mode 100644 index 0000000..9e35ff4 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/englishText.bmp differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_01.BMP b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_01.BMP new file mode 100644 index 0000000..439962e Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_01.BMP differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_02.JFIF b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_02.JFIF new file mode 100644 index 0000000..b598806 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_02.JFIF differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_03_10MB.tiff b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_03_10MB.tiff new 
file mode 100644 index 0000000..c9d37ff Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_03_10MB.tiff differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_04.png b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_04.png new file mode 100644 index 0000000..d0243e0 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_04.png differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_05_corrupted.bmp b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_05_corrupted.bmp new file mode 100644 index 0000000..6540ad7 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/example_05_corrupted.bmp differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/french_01.png b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/french_01.png new file mode 100644 index 0000000..2d35846 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/french_01.png differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/georgian_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/georgian_01.jpg new file mode 100644 index 0000000..7e42f2d Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/georgian_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/german_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/german_01.jpg new file mode 100644 index 0000000..6f63dc2 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/german_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/greek_01.jpg 
b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/greek_01.jpg new file mode 100644 index 0000000..5e48756 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/greek_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/halftone.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/halftone.jpg new file mode 100644 index 0000000..a255024 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/halftone.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/hindi_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/hindi_01.jpg new file mode 100644 index 0000000..8c1477a Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/hindi_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/hindi_02.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/hindi_02.jpg new file mode 100644 index 0000000..1e2b01f Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/hindi_02.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/japanese_01.png b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/japanese_01.png new file mode 100644 index 0000000..9fa9fae Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/japanese_01.png differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/multilang.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/multilang.jpg new file mode 100644 index 0000000..9dc0004 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/multilang.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/multipage.tiff 
b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/multipage.tiff new file mode 100644 index 0000000..e8cc630 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/multipage.tiff differ diff --git "a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/mult\303\256page.tiff" "b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/mult\303\256page.tiff" new file mode 100644 index 0000000..e8cc630 Binary files /dev/null and "b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/mult\303\256page.tiff" differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/noisy_01.png b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/noisy_01.png new file mode 100644 index 0000000..2a91a3b Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/noisy_01.png differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpe b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpe new file mode 100644 index 0000000..e633b25 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpe differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpg new file mode 100644 index 0000000..f384caa Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.tif b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.tif new file mode 100644 index 0000000..60e70e9 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_01.tif differ diff --git 
a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_02.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_02.jpg new file mode 100644 index 0000000..5da603a Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/numbers_02.jpg differ diff --git "a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/n\303\274mb\303\251rs.jpg" "b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/n\303\274mb\303\251rs.jpg" new file mode 100644 index 0000000..f384caa Binary files /dev/null and "b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/n\303\274mb\303\251rs.jpg" differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/pantone_blue.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/pantone_blue.jpg new file mode 100644 index 0000000..431ca52 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/pantone_blue.jpg differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/scanned_spa_01.png b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/scanned_spa_01.png new file mode 100644 index 0000000..e0b46d5 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/scanned_spa_01.png differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/spanish_01.jpg b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/spanish_01.jpg new file mode 100644 index 0000000..ffe5bf9 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/spanish_01.jpg differ diff --git "a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/t\303\250st/noisy_01.png" "b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/t\303\250st/noisy_01.png" new file mode 100644 index 0000000..2a91a3b Binary files /dev/null and 
"b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/t\303\250st/noisy_01.png" differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/wierdwords.png b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/wierdwords.png new file mode 100644 index 0000000..40cc9d6 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/images/wierdwords.png differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/profiles/CoatedFOGRA27.icc b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/profiles/CoatedFOGRA27.icc new file mode 100644 index 0000000..086ac9d Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/profiles/CoatedFOGRA27.icc differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/profiles/sRGB_CS_profile.icm b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/profiles/sRGB_CS_profile.icm new file mode 100644 index 0000000..7f9d18d Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/profiles/sRGB_CS_profile.icm differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ara.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ara.traineddata new file mode 100644 index 0000000..4b687c7 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ara.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ben.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ben.traineddata new file mode 100644 index 0000000..7e9054d Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ben.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/chi_sim.traineddata 
b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/chi_sim.traineddata new file mode 100644 index 0000000..388bac2 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/chi_sim.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/chi_tra.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/chi_tra.traineddata new file mode 100644 index 0000000..1955cd8 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/chi_tra.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/deu.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/deu.traineddata new file mode 100644 index 0000000..97ed7b2 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/deu.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ell.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ell.traineddata new file mode 100644 index 0000000..ed98ae1 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/ell.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/eng.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/eng.traineddata new file mode 100644 index 0000000..f4744c2 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/eng.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/fra.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/fra.traineddata new file mode 100644 index 0000000..250c774 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/fra.traineddata differ diff --git 
a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/grc.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/grc.traineddata new file mode 100644 index 0000000..a306f3e Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/grc.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/hin.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/hin.traineddata new file mode 100644 index 0000000..a8f0aae Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/hin.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/jpn.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/jpn.traineddata new file mode 100644 index 0000000..c4178f8 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/jpn.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/jpn_vert.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/jpn_vert.traineddata new file mode 100644 index 0000000..43f38de Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/jpn_vert.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/kat.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/kat.traineddata new file mode 100644 index 0000000..1a3ae11 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/kat.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/kat_old.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/kat_old.traineddata new file mode 100644 index 0000000..f4ae5ab Binary files /dev/null and 
b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/kat_old.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/osd.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/osd.traineddata new file mode 100644 index 0000000..527457c Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/osd.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Bengali.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Bengali.traineddata new file mode 100644 index 0000000..a1b888e Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Bengali.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Georgian.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Georgian.traineddata new file mode 100644 index 0000000..7751150 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Georgian.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Japanese.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Japanese.traineddata new file mode 100644 index 0000000..89481f2 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/script/Japanese.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/spa.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/spa.traineddata new file mode 100644 index 0000000..72e901f Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/spa.traineddata differ diff --git 
a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/spa_old.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/spa_old.traineddata new file mode 100644 index 0000000..42b281f Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/spa_old.traineddata differ diff --git a/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/urd.traineddata b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/urd.traineddata new file mode 100644 index 0000000..715a159 Binary files /dev/null and b/pdfocr-tesseract4/src/test/resources/com/itextpdf/pdfocr/tessdata/urd.traineddata differ diff --git a/pom.xml b/pom.xml index f8907e6..e8ab2c5 100644 --- a/pom.xml +++ b/pom.xml @@ -5,21 +5,29 @@ com.itextpdf root - 7.1.0 - + 7.1.11 + - ocr - 1.0.0-SNAPSHOT + pdfocr-root + 1.0.0 + pom - OCR - OCR is an iText 7 add-on that lets you to parse text from provided images and adds it to PDF. + pdfOCR + pdfOCR is an iText 7 add-on for Java to recognize and extract text in scanned documents and images. 
It can also convert them into fully ISO-compliant PDF or PDF/A-3u files that are accessible, searchable, and suitable for archiving + + + pdfocr-api + pdfocr-tesseract4 + - ${project.parent.version} + 7.1.11 1.8 - 1.8 - 1.8 + ${java.version} + ${java.version} + ${java.version} + false @@ -41,86 +49,54 @@ - - - com.itextpdf - forms - ${itext.version} - - - com.itextpdf - io - ${itext.version} - - - com.itextpdf - kernel - ${itext.version} - - - com.itextpdf - layout - ${itext.version} - - - commons-io - commons-io - 2.6 - - - net.htmlparser.jericho - jericho-html - 3.3 - - - com.itextpdf - pdfa - ${itext.version} - test - - - com.itextpdf - pdftest - ${itext.version} - test - - - - + + + src/main/resources + + **/*.ttf + + + - external.atlassian.jgitflow - jgitflow-maven-plugin - 1.0-m5.1 + maven-compiler-plugin + + ${java.version} + ${java.version} + - org.apache.maven.plugins maven-failsafe-plugin - 2.19.1 + ${failsafe.version} - - **/*Test.java - - ${integrationtests} + ${skipTests} + 1 + false + false - org.apache.maven.plugins - maven-source-plugin - 3.0.0 + maven-javadoc-plugin - - ** - + ${java.version} + none + + + Ocr + com.itextpdf.ocr* + + - org.apache.maven.plugins maven-surefire-plugin - 2.19.1 + ${surefire.version} - ${unittests} + ${skipTests} + 1 + false + false @@ -131,15 +107,42 @@ true - - org.revapi - revapi-maven-plugin - 0.8.2 - - true - - - + + + + qa + + + + com.github.siom79.japicmp + japicmp-maven-plugin + 0.14.3 + + + none + + + + + org.owasp + dependency-check-core + 5.3.0 + + false + + + + org.revapi + revapi-maven-plugin + 0.11.1 + + ${project.version} + + + + + +