diff --git a/Dockerfile b/Dockerfile index 3618565d..f58112b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,11 @@ -FROM ocrd/core:latest -ENV VERSION="Mi 9. Okt 13:26:16 CEST 2019" +FROM ocrd/core:latest AS base +ENV VERSION="Di 12. Mai 13:26:35 CEST 2020" ENV GITURL="https://github.com/cisocrgroup" ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf" -ENV DATA="/apps/ocrd-cis-post-correction" # deps -COPY data/docker/deps.txt ${DATA}/deps.txt RUN apt-get update \ - && apt-get -y install --no-install-recommends $(cat ${DATA}/deps.txt) + && apt-get -y install --no-install-recommends locales # locales RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ @@ -15,40 +13,46 @@ RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ && update-locale LANG=en_US.UTF-8 # install the profiler -RUN git clone ${GITURL}/Profiler --branch devel --single-branch /tmp/profiler \ - && cd /tmp/profiler \ - && mkdir build \ - && cd build \ - && cmake -DCMAKE_BUILD_TYPE=release .. \ - && make compileFBDic trainFrequencyList profiler \ - && cp bin/compileFBDic bin/trainFrequencyList bin/profiler /apps/ \ +FROM base AS profiler +RUN apt-get update \ + && apt-get -y install --no-install-recommends cmake g++ libcppunit-dev libxerces-c-dev \ + && git clone ${GITURL}/Profiler --branch devel --single-branch /build \ + && cd /build \ + && cmake -DCMAKE_BUILD_TYPE=release . 
\ + && make compileFBDic trainFrequencyList runDictSearch profiler \ + && mkdir /apps \ + && cp bin/compileFBDic bin/trainFrequencyList bin/profiler bin/runDictSearch /apps/ \ && cd / \ - && rm -rf /tmp/profiler + && rm -rf /build +FROM profiler AS languagemodel # install the profiler's language backend -RUN git clone ${GITURL}/Resources --branch master --single-branch /tmp/resources \ - && cd /tmp/resources/lexica \ - && make FBDIC=/apps/compileFBDic TRAIN=/apps/trainFrequencyList \ - && mkdir -p /${DATA}/languages \ - && cp -r german latin greek german.ini latin.ini greek.ini /${DATA}/languages \ +COPY --from=profiler /apps/compileFBDic /apps/ +COPY --from=profiler /apps/trainFrequencyList /apps/ +COPY --from=profiler /apps/runDictSearch /apps/ +RUN apt-get update \ + && apt-get -y install --no-install-recommends icu-devtools \ + && git clone ${GITURL}/Resources --branch master --single-branch /build \ + && cd /build/lexica \ + && PATH=$PATH:/apps make \ + && PATH=$PATH:/apps make test \ + && PATH=$PATH:/apps make install \ && cd / \ - && rm -rf /tmp/resources + && rm -rf /build +FROM base AS postcorrection # install ocrd_cis (python) -COPY Manifest.in Makefile setup.py ocrd-tool.json /tmp/build/ -COPY ocrd_cis/ /tmp/build/ocrd_cis/ -COPY bashlib/ /tmp/build/bashlib/ -# COPY . /tmp/ocrd_cis -RUN cd /tmp/build \ +VOLUME ["/data"] +COPY --from=languagemodel /etc/profiler/languages /etc/profiler/languages +COPY --from=profiler /apps/profiler /apps/ +COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linux-gnu/ +COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/ +COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/ +COPY . 
/build +RUN apt-get update \ + && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ + && cd /build \ && make install \ + && make test \ && cd / \ - && rm -rf /tmp/build - -# download ocr models and pre-trainded post-correction model -RUN mkdir /apps/models \ - && cd /apps/models \ - && wget ${DOWNLOAD_URL}/model.zip >/dev/null 2>&1 \ - && wget ${DOWNLOAD_URL}/fraktur1-00085000.pyrnn.gz >/dev/null 2>&1 \ - && wget ${DOWNLOAD_URL}/fraktur2-00062000.pyrnn.gz >/dev/null 2>&1 - -VOLUME ["/data"] + && rm -rf /build diff --git a/Makefile b/Makefile index 33de67f2..730ba3f4 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,24 @@ PY ?= python3 PIP ?= pip3 +V ?= > /dev/null 2>&1 +PKG = ocrd_cis install: ${PIP} install --upgrade pip . install-devel: ${PIP} install --upgrade pip -e . +uninstall: + ${PIP} uninstall ${PKG} docker-build: Dockerfile docker build -t flobar/ocrd_cis:latest . docker-push: docker-build docker push flobar/ocrd_cis:latest -TEST_SCRIPTS=$(wildcard tests/run_*.sh) +TEST_SCRIPTS=$(sort $(wildcard tests/run_*.bash)) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): - bash $@ -# run test scripts + bash $@ $V test: $(TEST_SCRIPTS) - + echo $^ .PHONY: install test diff --git a/README.md b/README.md index 0568ff07..a2d7fb32 100644 --- a/README.md +++ b/README.md @@ -32,23 +32,39 @@ It is possible to install `ocrd_cis` in a custom directory using deactivate ``` +## Profiler +The post correction is dependent on the language +[profiler](https://github.com/cisocrgroup/Profiler) and its language +configurations to generate corrections for suspicious words. In order +to use the post correction a profiler with according language +configurations has to be present on the system. You can refer to our +[manuals](https://github.com/cisocrgroup/Resources/tree/master/manuals) +and our [lexical +resources](https://github.com/cisocrgroup/Resources/tree/master/lexica) +for more information.
+ +If you use docker you can use the preinstalled profiler from within +the docker-container. The profiler is installed to `/apps/profiler` +and the language configurations lie in `/etc/profiler/languages` in +the container image. + ## Usage Most tools follow the [OCR-D cli conventions](https://ocr-d.github.io/cli). They accept the `--input-file-grp`, `--output-file-grp`, `--parameter`, `--mets`, -`--log-level` command line arguments (short and long). For some tools -(most notably the alignment tool) expect a comma seperated list of -multiple input file groups. +`--log-level` command line arguments (short and long). Some of the +tools (most notably the alignment tool) expect a comma separated list +of multiple input file groups. The [ocrd-tool.json](ocrd_cis/ocrd-tool.json) contains a schema description of the parameter config file for the different tools that accept the `--parameter` argument. -### ocrd-cis-post-correct.sh -This bash script runs the post correction using a pre-trained -[model](http://cis.lmu.de/~finkf/model.zip). If additional support -OCRs should be used, models for these OCR steps are required and must -be configured in an according configuration file (see ocrd-tool.json). +### ocrd-cis-postcorrect +This command runs the post correction using a pre-trained model. If +additional support OCRs should be used, models for these OCR steps are +required and must be executed and aligned beforehand (see [the test +script](tests/run_postcorrection_test.bash) for an example). Arguments: * `--parameter` path to configuration file @@ -57,6 +73,20 @@ Arguments: * `--log-level` set log level * `--mets` path to METS file in workspace +As mentioned above in order to use the postcorrection with input from +multiple OCRs, some preprocessing steps are needed: firstly the +additional OCR recognition has to be done and secondly the multiple +OCRs have to be aligned (you can also take a look at the function +`ocrd_cis_align` in the [tests](tests/test_lib.bash)).
Assuming an +original recognition as file group `OCR1` on the segmented document of +file group `SEG`, the following commands can be used: + +```sh +ocrd-ocropus-recognize -I SEG -O OCR2 ... # additional OCR +ocrd-cis-align -I OCR1,OCR2 -O ALGN ... # align OCR1 and OCR2 +ocrd-cis-postcorrect -I ALGN -O PC ... # post correction +``` + ### ocrd-cis-align Aligns tokens of multiple input file groups to one output file group. This tool is used to align the master OCR with any additional support @@ -66,41 +96,26 @@ it aligns in order. Arguments: * `--parameter` path to configuration file * `--input-file-grp` comma seperated list of the input file groups; - first input file group is the master OCR + first input file group is the master OCR; if there is a ground + truth (for evaluation) it must be the last file group in the list * `--output-file-grp` name of the file group for the aligned result * `--log-level` set log level * `--mets` path to METS file in workspace -### ocrd-cis-train.sh -Script to train a model from a list of ground-truth archives (see -ocrd-tool.json) for the post correction. The tool somewhat mimics the -behaviour of other ocrd tools: - * `--mets` for the workspace - * `--log-level` is passed to other tools - * `--parameter` is used as configuration - * `--output-file-grp` defines the output file group for the model - ### ocrd-cis-data Helper tool to get the path of the installed data files. Usage: -`ocrd-cis-data [-jar|-3gs]` to get the path of the jar library or the -path to th default 3-grams language model file. - -### ocrd-cis-wer -Helper tool to calculate the word error rate aligned ocr files. It -writes a simple JSON-formated stats file to the given output file group. - -Arguments: - * `--input-file-grp` input file group of aligned ocr results with - their respective ground truth.
- * `--output-file-grp` name of the file group for the stats file - * `--log-level` set log level - * `--mets` path to METS file in workspace - -### ocrd-cis-profile -Run the profiler over the given files of the according the given input -file grp and adds a gzipped JSON-formatted profile to the output file -group of the workspace. This tools requires an installed [language -profiler](https://github.com/cisocrgroup/Profiler). +`ocrd-cis-data [-h|-jar|-3gs|-model|-config]` to get the path of the +jar library, the pre-trained post correction model, the path to the +default 3-grams language model file or the default training +configuration file. This tool does not follow the OCR-D conventions. + +### Training +There is no dedicated training script provided. Models are trained +using the Java implementation directly (check out the [training test +script](tests/run_training_test.bash) for an example). Training a +model requires a workspace containing one or more file groups +consisting of aligned OCR and ground-truth documents (the last file +group has to be the ground truth). Arguments: * `--parameter` path to configuration file @@ -114,11 +129,14 @@ Arguments: The `ocropy-train` tool can be used to train LSTM models. It takes ground truth from the workspace and saves (image+text) snippets from the corresponding pages. Then a model is trained on all snippets for 1 million (or the given number of) randomized iterations from the parameter file. + ```sh -ocrd-cis-ocropy-train \ - --input-file-grp OCR-D-GT-SEG-LINE \ - --mets mets.xml - --parameter file:///path/to/config.json +java -jar $(ocrd-cis-data -jar) \ + -c train \ + --input-file-grp OCR1,OCR2,GT \ + --log-level DEBUG \ + -m mets.xml \ + --parameter $(ocrd-cis-data -config) ``` ### ocrd-cis-ocropy-clip @@ -228,9 +246,8 @@ pip install .
``` Download and move tesseract models from: -https://github.com/tesseract-ocr/tesseract/wiki/Data-Files -or use your own models and -place them into: /usr/share/tesseract-ocr/4.00/tessdata +https://github.com/tesseract-ocr/tesseract/wiki/Data-Files or use your +own models and place them into: /usr/share/tesseract-ocr/4.00/tessdata ## Workflow configuration @@ -256,6 +273,7 @@ If GT is used, steps 1, 5 and 8 can be omitted. Else if a segmentation is used i To run a few basic tests type `make test` (`ocrd_cis` has to be installed in order to run any tests). +# Miscellaneous ## OCR-D workspace * Create a new (empty) workspace: `ocrd workspace init workspace-dir` diff --git a/bashlib/ocrd-cis-eval-all.sh b/bashlib/ocrd-cis-eval-all.sh deleted file mode 100755 index 3f0709da..00000000 --- a/bashlib/ocrd-cis-eval-all.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -e - -bdir=$(dirname "$0") -source "$bdir/ocrd-cis-lib.sh" - -get_eval_dir() { - case $1 in - bodenstein) echo "eval/1557-Bodenstein-WieSichMeniglich";; - grenzboten) echo "eval/1841-DieGrenzboten";; - *) exit 1;; - esac -} - -odir=$(date +%Y%m%d_%H_%M) -odir="results/$odir" - -for how in shuffle ocrd; do - for corpus in grenzboten bodenstein; do - dir="eval-$corpus-$how" - cmd="$bdir/ocrd-cis-eval-$how.sh" - # rm -rf "$dir" - ocrd-cis-log ./"$cmd" -P "config/config-$how-$corpus.json" $(get_eval_dir $corpus) "$dir" - ./"$cmd" -P "config/config-$how-$corpus.json" $(get_eval_dir $corpus) "$dir" - mkdir -p "$odir" - cp -r "$dir" "$odir" - done -done diff --git a/bashlib/ocrd-cis-eval-ocrd-self.sh b/bashlib/ocrd-cis-eval-ocrd-self.sh deleted file mode 100755 index 0067d0bc..00000000 --- a/bashlib/ocrd-cis-eval-ocrd-self.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash -set -e -bdir=$(dirname "$0") -source "$bdir/ocrd-cis-lib.sh" - -config=$(ocrd-cis-getopt -P --parameter $*) -odir=eval-ocrd-self -url=$(cat "$config" | jq --raw-output .gtlink) - -ocrd-cis-download-and-extract-ground-truth "$url" downloads - 
-############################################# -# shuffle files into eval and train folders # -############################################# -odirtrain="$odir/train-corpus" -odireval="$odir/eval-corpus" -mkdir -p "$odirtrain" "$odireval" -for dir in downloads/*; do - if [[ ! -d "$dir" ]]; then continue; fi - i=1 - for xml in $(find "$dir" -type f -name '*.xml' | grep -v 'alto' | shuf); do - echo "xml: $xml" - img=$(ocrd-cis-find-image-for-xml "$dir" "$xml") - echo "xml: $xml img: $img" - x=$((i%2)) - name=$(basename "$dir") - if [[ $x == 0 ]]; then - mkdir -p "$odireval/$name" - cp "$xml" "$img" "$odireval/$name" - else - mkdir -p "$odirtrain/$name" - cp "$xml" "$img" "$odirtrain/$name" - fi - i=$((i+1)) - done -done - -######### -# train # -######### -mkdir -p "$odir/trainws" -pushd "$odir/trainws" -ocrd workspace init . -popd -for dir in "$odirtrain/"*; do - name=$(basename "$dir") - ocrd-cis-run-ocr-and-align "$config" "$odir/trainws/mets.xml" "$dir" "train-ocrd-self-$name" GT -done -ocrd-cis-run-training "$config" "$odir/trainws/mets.xml" - -######### -# eval # -######### -mkdir -p "$odir/evalws" -pushd "$odir/evalws" -ocrd workspace init . -popd -for dir in "$odireval/"*; do - name=$(basename "$dir") - ocrd-cis-run-ocr-and-align "$config" "$odir/evalws/mets.xml" "$dir" "eval-ocrd-self-$name" GT -done -ocrd-cis-run-evaluation "$config" "$odir/evalws/mets.xml" diff --git a/bashlib/ocrd-cis-eval-ocrd.sh b/bashlib/ocrd-cis-eval-ocrd.sh deleted file mode 100755 index 943cebf5..00000000 --- a/bashlib/ocrd-cis-eval-ocrd.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -set -e -bdir=$(dirname "$0") -source "$bdir/ocrd-cis-lib.sh" - -if [[ $# != 4 ]]; then - echo "usage: $0 -P|--parameter config input-dir output-dir-basename" - exit 2 -fi -config=$(ocrd-cis-getopt -P --parameter $*) -idir=$3 -odir=$4 - -########################################## -# train post correction from ocrd corpus # -########################################## -if [[ ! 
-d "$odir/trainws" ]]; then - mkdir -p "$odir/trainws" - pushd "$odir/trainws" - ocrd workspace init . - popd -fi - -gtlink=$(cat "$config" | jq --raw-output '.gtlink') -ocrd-cis-download-and-extract-ground-truth "$gtlink" downloads -for dir in downloads/*; do - if [[ ! -d "$dir" ]]; then continue; fi - name=$(basename "$dir") - ocrd-cis-run-ocr-and-align "$config" "$odir/trainws/mets.xml" "$dir" "train-ocrd-corpus-$name" GT -done -ocrd-cis-run-training "$config" "$odir/trainws/mets.xml" - -############# -# evaluate # -############# -mkdir -p "$odir/evalws" -pushd "$odir/evalws" -ocrd workspace init . -popd -ocrd-cis-run-ocr-and-align "$config" "$odir/evalws/mets.xml" "$idir" eval-ocrd-corpus GT -ocrd-cis-run-evaluation "$config" "$odir/evalws/mets.xml" diff --git a/bashlib/ocrd-cis-eval-shuffle.sh b/bashlib/ocrd-cis-eval-shuffle.sh deleted file mode 100755 index f55b9821..00000000 --- a/bashlib/ocrd-cis-eval-shuffle.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash -set -e -bdir=$(dirname "$0") -source "$bdir/ocrd-cis-lib.sh" - -if [[ $# != 4 ]]; then - echo "usage: $0 -P|--parameter config input-dir output-dir-basename" - exit 2 -fi -config=$(ocrd-cis-getopt -P --parameter $*) -idir=$3 -odir=$4 - -############################################# -# shuffle files into eval and train folders # -############################################# -odirtrain="$odir/train-corpus" -odireval="$odir/eval-corpus" -if [[ ! -d "$odirtrain" ]]; then - mkdir -p "$odirtrain" "$odireval" - i=1 - for xml in $(find "$idir" -type f -name '*.xml' | sort); do - img=$(ocrd-cis-find-image-for-xml "$(dirname $xml)" "$xml") - x=$((i%2)) - if [[ $x == 0 ]]; then - cp "$xml" "$img" "$odirtrain" - else - cp "$xml" "$img" "$odireval" - fi - i=$((i+1)) - done -fi - -######### -# train # -######### -if [[ ! -d "$odir/trainws" ]]; then - mkdir -p "$odir/trainws" - pushd "$odir/trainws" - ocrd workspace init . 
- popd -fi -ocrd-cis-run-ocr-and-align "$config" "$odir/trainws/mets.xml" "$odirtrain" train-shuffle GT -ocrd-cis-run-training "$config" "$odir/trainws/mets.xml" - -######### -# eval # -######### -if [[ ! -d "$odir/evalws" ]]; then - mkdir -p "$odir/evalws" - pushd "$odir/evalws" - ocrd workspace init . - popd -fi -ocrd-cis-run-ocr-and-align "$config" "$odir/evalws/mets.xml" "$odireval" eval-shuffle GT -ocrd-cis-run-evaluation "$config" "$odir/evalws/mets.xml" diff --git a/bashlib/ocrd-cis-evaluate.sh b/bashlib/ocrd-cis-evaluate.sh deleted file mode 100755 index dc09772a..00000000 --- a/bashlib/ocrd-cis-evaluate.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -set -e - -source "$(dirname $0)/ocrd-cis-lib.sh" - -config=$(ocrd-cis-getopt -P --parameter $*) -ifg=$(ocrd-cis-getopt -I --input-file-grp $*) -mets=$(ocrd-cis-getopt -M --mets $*) -workspace=$(/usr/bin/dirname "$mets") -jar=$(cat "$config" | jq --raw-output '.jar') -LOG_LEVEL=DEBUG - -echo ifg $ifg -echo mets $mets -echo workspace $workspace -echo jar $jar -echo config $config - -######################## -# download newest jar # -######################## -ocrd-cis-download-jar "$jar" - -############################## -# create workspace (really?) # -############################## -if [[ ! -d "$workspace" ]]; then - mkdir -p "$workspace" - pushd "$workspace" - ocrd workspace init . - popd -fi - -########################################### -# search for intput file group directory # -########################################### -inputdir=$(find . 
-type d -name "$ifg") -if [[ -z $inputdir ]]; then - echo "cannot find input directory for $ifg" - exit 1 -fi - -####################################### -# add gt and image files to worksapce # -####################################### -max=-1 # set to -1 for all -for pxml in $(find "$inputdir" -type f -name '*.xml'); do - if [[ max -eq 0 ]]; then - break; - fi - max=$((max-1)) - img=$(ocrd-cis-find-image-for-xml "$inputdir" $(basename "$pxml")) - if [[ -z $img ]]; then - echo cannot find image file for $img - exit 1 - fi - if [[ -f $img ]]; then - ocrd-cis-add-pagexml-and-image-to-workspace \ - "$workspace" "OCR-D-GT-EVAL-$ifg" "$pxml" "OCR-D-IMG-EVAL-$ifg" "$img" - fi -done - -echo $OCRFILEGRPS -ocrd-cis-run-ocr "$config" "$mets" "OCR-D-GT-EVAL-$ifg" "OCR-D-OCR-EVAL-XXX-$ifg" -echo $OCRFILEGRPS -OCRFILEGRPS="$OCRFILEGRPS OCR-D-GT-EVAL-$ifg" -echo $OCRFILEGRPS -OCRFILEGRPS=$(ocrd-cis-join-by , $OCRFILEGRPS) -echo $OCRFILEGRPS -ocrd-cis-log ocrd-cis-align \ - --input-file-grp "$OCRFILEGRPS" \ - --output-file-grp "OCR-D-ALIGN-EVAL-$ifg" \ - --mets "$mets"\ - --parameter $(cat "$config" | jq --raw-output ".alignparampath") \ - --log-level $LOG_LEVEL - -ocrd-cis-align \ - --input-file-grp "$OCRFILEGRPS" \ - --output-file-grp "OCR-D-ALIGN-EVAL-$ifg" \ - --mets "$mets" \ - --parameter $(cat "$config" | jq --raw-output ".alignparampath") \ - --log-level $LOG_LEVEL - -main="de.lmu.cis.ocrd.cli.Main" -param=$(cat "$config" | jq --raw-output '.trainparampath') -for cmd in evaluate-dle evaluate-rrdm; do - java -Dfile.encoding=UTF-8 -Xmx3g -cp "$jar" "$main" -c $cmd \ - --mets "$mets" \ - --parameter "$param" \ - --input-file-grp "OCR-D-ALIGN-EVAL-$ifg" \ - --log-level $LOG_LEVEL -done diff --git a/bashlib/ocrd-cis-lib.sh b/bashlib/ocrd-cis-lib.sh deleted file mode 100644 index c2bff00c..00000000 --- a/bashlib/ocrd-cis-lib.sh +++ /dev/null @@ -1,469 +0,0 @@ -#!/bin/bash - -set -e -# global default log-level -LOG_LEVEL=DEBUG - -ocrd-cis-log() { - echo $(date +%R:%S.%N | sed -e 
's/.*\([0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9][0-9]\).*/\1/') $LOG_LEVEL - $* >&2 -} - -# Write a OCR-D debug log message to stderr. -ocrd-cis-debug() { - case $LOG_LEVEL in - DEBUG) echo $(date +%R:%S.%N | sed -e 's/.*\([0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9][0-9]\).*/\1/') DEBUG ocrd.cis.bashlib - $* >&2;; - esac -} - -# Write a OCR-D info log message to stderr. -ocrd-cis-info() { - case $LOG_LEVEL in - DEBUG|INFO) echo $(date +%R:%S.%N | sed -e 's/.*\([0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9][0-9]\).*/\1/') INFO ocrd.cis.bashlib - $* >&2;; - esac -} - -# Print error message to stderr and exit. -# Usage `ocrd-cis-fail "error message" [EXIT]` -function ocrd-cis-fail { - printf '%s\n' "$1" >&2 - exit "${2-1}" -} - -# utility function to join strings with a given string -function ocrd-cis-join-by { local IFS="$1"; shift; echo "$*"; } - -# Parse command line arguments for a given argument and -# SETS_CIS_OPTARG to the additional provided value. Usage: -# `ocrd-cis-getopt -P --parameter $*`. -ocrd-cis-getopt() { - OCRD_CIS_OPTARG="" - local short=$1 - shift - local long=$1 - shift - while [[ $# -gt 0 ]]; do - case $1 in - $short|$long) OCRD_CIS_OPTARG=$2; return 0;; - *) shift;; - esac - done - return 1; -} - -# Download the ocrd.jar if it does not exist. -ocrd-cis-download-jar() { - if [[ -f "$1" ]]; then - return 0 - fi - local jar=http://www.cis.lmu.de/~finkf/ocrd.jar - local dir=$(/usr/bin/dirname $1) - pushd $dir - wget -N $jar || true - popd -} - -# Get the file for a file path. Sets OCRD_CIS_FILE_ID to -# the file id. Usage: ocrd-cis-file-id path/to/file.xml -OCRD_CIS_FILE_ID="" -ocrd-cis-get-file-id() { - local path=$1 - local filename=$(basename "$path") - local ext="${filename##*.}" - local fileid="${filename%.*}" - OCRD_CIS_FILE_ID="${fileid}_${ext}" - echo $path $OCRD_CIS_FILE_ID -} - -# Add a zipped OCR-D ground truth zip to the workspace. The current -# directory must be a valid workspace with an according mets file. 
-# Sets OCR_D_CIS_GT_FILEGRP and OCR_D_CIS_IMG_FILEGRP to the according -# filegroups. Exits if the image file for a page-XML file in the zip -# archive cannot be found. -# Usage: `ocrd-cis-add-gt-zip ZIP TMP_DIR -# * ZIP: path to the gt-zip file -# * TMP_DIR: existing temporary directory for extracted files -ocrd-cis-add-gt-zip() { - local zip=$1 - local tmp=$2 - ocrd-cis-log ocrd-cis-add-gt-zip $zip $tmp - unzip -d "$tmp" "$zip" - - local base=$(echo $(basename $zip) | tr '_ \t' '-') - base=${base/.zip/} - local gtfg="OCR-D-GT-$base" - local imgfg="OCR-D-IMG-$base" - for xml in $(find "$tmp" -type f -name '*.xml' | grep -i 'page'); do - local imgname=$(sed -ne 's/.*imageFilename="\([^"]*\)".*/\1/p' "$xml") - local img=$(find "$tmp" -type f -name "$imgname") - if [[ ! -f "$img" ]]; then - echo "cannot find image: $imgname" - exit 1 - fi - # add image to workspace - local imgmimetype=$(ocrd-cis-get-mimetype-by-extension "$img") - ocrd-cis-get-file-id "$img" - ocrd workspace add \ - --file-grp "$imgfg" \ - --mimetype "$imgmimetype" \ - --file-id "$OCRD_CIS_FILE_ID" \ - "$img" - # get img path in workspace and set imageFilename in page xml accordingly. - img=$(ocrd workspace find -i "$OCRD_CIS_FILE_ID") - sed -i -e "s#imageFilename=\"[^\"]*\"#imageFilename=\"$img\"#" "$xml" - # add page xml file to workspace - ocrd-cis-get-file-id "$xml" - ocrd workspace add \ - --file-grp "$gtfg" \ - --mimetype "application/vnd.prima.page+xml" \ - --file-id "$OCRD_CIS_FILE_ID" \ - "$xml" - done - # set global filegroup variables - OCR_D_CIS_GT_FILEGRP=$gtfg - OCR_D_CIS_IMG_FILEGRP=$imgfg -} - -# Add a zipped OCR-D ground truth zip to the workspace. The current -# directory must be a valid workspace with an according mets file. 
-# Usage: `ocrd-cis-add-gt-zip URL TMP_DIR -# * URL: URL to the gt-zip file -# * TMP_DIR: existing temporary directory for downloaded (and -# extracted) files -ocrd-cis-download-and-add-gt-zip() { - local url=$1 - local tmp=$2 - ocrd-cis-log ocrd-cis-download-and-add-gt-zip $url $tmp - wget -P "$tmp" $url - local zip=$(find $tmp -type f -name '*.zip') - echo $zip - ocrd-cis-add-gt-zip "$zip" "$tmp" -} - -# Get the mimetype of a given path. The mimetype is determined using -# the file's extension. -ocrd-cis-get-mimetype-by-extension() { - case $(echo $1 | tr '[:upper:]' '[:lower:]') in - *.tif | *.tiff) echo "image/tif";; - *.jpg | *.jpeg) echo "image/jpeg";; - *.png) echo "image/png";; - *.xml) echo "application/vnd.prima.page+xml";; - *) echo "UNKWNON" - esac -} - -# Check if given file-id exists in the given mets file. Usage: -# `ocrd-cis-file-id-exists mets fileid`. -# * mets: path to the mets file -# * fileid: the file id -ocrd-cis-file-id-exists() { - local workspace=$(dirname "$1") - local fileid=$2 - pushd "$workspace" - local check=$(ocrd workspace find --file-id "$fileid") - popd - if [[ -z "$check" ]]; then return 1; fi - return 0 -} - -# Check if given file-grp exists in the given mets file. Usage: -# `ocrd-cis-file-grp-exists mets filegrp`. -# * mets: path to the mets file -# * filegrp: the file grp -ocrd-cis-file-grp-exists() { - local workspace=$(dirname "$1") - local filegrp=$2 - pushd "$workspace" - local check=$(ocrd workspace find --file-grp "$filegrp") - popd - if [[ -z "$check" ]]; then return 1; fi - return 0 -} - -# Run multiple OCRs over a file group. Usage: `ocrd-cis-run-ocr -# configfile mets ifg ofg`. A XXX in the ofg is replaced with the -# ocr-type and number. This function sets the global variable -# $OCRFILEGRPS to a space-separated list of the ocr output file -# groups. 
-ocrd-cis-run-ocr() { - local config=$1 - local mets=$2 - local ifg=$3 - local ofg=$4 - OCRFILEGRPS="" - - for i in $(seq 0 $(cat "$config" | jq ".ocr | length-1")); do - local type=$(cat "$config" | jq --raw-output ".ocr[$i].type") - local path=$(cat "$config" | jq --raw-output ".ocr[$i].path") - local utype=$(echo $type | tr '[:lower:]' '[:upper:]') - local xofg=${ofg/XXX/$utype-$((i+1))} - OCRFILEGRPS="$OCRFILEGRPS $xofg" - if ocrd-cis-file-grp-exists "$mets" "$xofg"; then - ocrd-cis-log skipping ocr for $xofg - continue - # else - # ocrd-cis-log $xofg does not exist. - # exit 1 - fi - case $utype in - "OCROPY") - ocrd-cis-log ocrd-cis-ocropy-recognize \ - --input-file-grp $ifg \ - --output-file-grp $xofg \ - --mets "$mets" \ - --parameter $path \ - --log-level $LOG_LEVEL - ocrd-cis-ocropy-recognize \ - --input-file-grp $ifg \ - --output-file-grp $xofg \ - --mets "$mets" \ - --parameter $path \ - --log-level $LOG_LEVEL - ;; - "TESSERACT") - ocrd-cis-log ocrd-tesserocr-recognize \ - --input-file-grp $ifg \ - --output-file-grp $xofg \ - --mets "$mets" \ - --parameter $path \ - --log-level $LOG_LEVEL - ocrd-tesserocr-recognize \ - --input-file-grp $ifg \ - --output-file-grp $xofg \ - --mets "$mets" \ - --parameter $path \ - --log-level $LOG_LEVEL - ;; - *) - echo "invalid ocr type: $utype" - exit 1 - ;; - esac - done -} - -# Search for the associated image file for the given xml file in the -# given directory. The given xml file must end with .xml. 
Usage: -# `ocrd-cis-find-image-for-xml dir xy.xml` -ocrd-cis-find-image-for-xml() { - local dir=$1 - local xml=$2 - - for pre in .bin .dew ""; do # prefer binary before descewed before normal images - for ext in .jpg .jpeg .JPG .JPEG .png .tiff .tif; do - # strict search based on the xml file's name - # try also using the xml file's number, e.g xyz_123.xml -> 123.bin.png - local name=$(basename "$xml") - local name=${name/.xml/$pre$ext} - local numname=$(echo $name | sed -e 's/.*[-_]\([0-9]*\.\)/\1/') - local file=$(find "$dir" -type f -name "$name" -o -type f -name "$numname") - # echo find "$dir" -type f -name "$name" -o -type f -name "$numname" - # echo file $file - if [[ ! -z "$file" ]]; then - ocrd-cis-log found image: $file for xml: $xml - echo $file - return 0 - fi - done - done - return 1 -} - -# Add a pagexml and image file pair to a workspace. The according -# imageFilename attribute of the page xml file is set accordingly. -# The basename of the given files are used as file ids. Usage: -# `ocrd-cis-add-xml-image-pair mets xml xmlfg img imgfg`. 
-# * mets: path to the workspace's mets file -# * xml: path to the page xml file -# * xmlfg: file group of the xml file -# * img: path to the imaage file -# * imgfg: file group of the image file -ocrd-cis-add-xml-image-pair() { - local mets=$1 - local xml=$2 - local xmlfg=$3 - local img=$4 - local imgfg=$5 - - local imgmt=$(ocrd-cis-get-mimetype-by-extension "$img") - local xmlmt=$(ocrd-cis-get-mimetype-by-extension "$xml") - local workspace=$(dirname "$mets") - local absxml=$(realpath "$xml") - local absimg=$(realpath "$img") - - if ocrd-cis-file-id-exists "$mets" "$(basename "$img")"; then - ocrd-cis-log skipping add to workspace for $img and $xml - return - fi - - pushd $workspace - # add image file - ocrd workspace add \ - --file-grp "$imgfg" \ - --mimetype "$imgmt" \ - --file-id "$(basename "$img")" \ - --force "$absimg" - # add xml file - ocrd workspace add \ - --file-grp "$xmlfg" \ - --mimetype "$xmlmt" \ - --file-id "$(basename "$xml")" \ - --force "$absxml" - # fix filepath - local relxml="$xmlfg/$(basename $xml)" - local relimg="$imgfg/$(basename $img)" - echo sed -i "s#imageFilename=\"\([^\"]*\)\"#imageFilename=\"$relimg\"#" "$relxml" - sed -i "s#imageFilename=\"\([^\"]*\)\"#imageFilename=\"$relimg\"#" "$relxml" - popd -} - -# Given a directory add image and base xml files, run additional ocrs -# and align them. Sets ALGINFILEGRP to the alignment file group. -# Usage: `ocrd-cis-run-ocr-and-align config mets dir fg gt`. 
-# * config : path to the main config file -# * mets : path to the mets file -# * dir : path to the directory -# * fg : base name of filegroups -# * gt : gt=GT if xml files are ground truth; anything else if not -ocrd-cis-run-ocr-and-align() { - local config=$1 - local mets=$2 - local dir=$3 - local fg=$4 - local gt=$5 - local workspace=$(dirname "$mets") - - for xml in $(find "$dir" -type f -name '*.xml'); do - if [[ "$xml" == *"alto"* ]]; then # skip alto xml files in gt archives - continue - fi - local img=$(ocrd-cis-find-image-for-xml "$dir" "$xml") - ocrd-cis-add-xml-image-pair "$mets" "$xml" "OCR-D-$gt-$fg" "$img" "OCR-D-IMG-$fg" - done - OCRFILEGRPS="" - ocrd-cis-run-ocr "$config" "$mets" "OCR-D-$gt-$fg" "OCR-D-XXX-$fg" - if [[ $(echo "$gt" | tr '[[:upper:]]' '[[:lower:]]') == "gt" ]]; then - OCRFILEGRPS="$OCRFILEGRPS OCR-D-$gt-$fg" - else - OCRFILEGRPS="OCR-D-$gt-$fg $OCRFILEGRPS" - fi - OCRFILEGRPS=$(ocrd-cis-join-by , $OCRFILEGRPS) - ALIGNFILEGRP="OCR-D-ALIGN-$fg" - if ocrd-cis-file-grp-exists "$mets" "$ALIGNFILEGRP"; then - ocrd-cis-log skipping aligning of $ALIGNFILEGRP - return - fi - ocrd-cis-log ocrd-cis-align \ - --input-file-grp "$OCRFILEGRPS" \ - --output-file-grp "$ALIGNFILEGRP" \ - --mets "$mets" \ - --parameter $(cat "$config" | jq --raw-output ".alignparampath") \ - --log-level $LOG_LEVEL - ocrd-cis-align \ - --input-file-grp "$OCRFILEGRPS" \ - --output-file-grp "$ALIGNFILEGRP" \ - --mets "$mets" \ - --parameter $(cat "$config" | jq --raw-output ".alignparampath") \ - --log-level $LOG_LEVEL - # (Cannot use non unicode chars if installing this) - # Change long s (\u017f) to normal s if the ground truth - # does not contain long s. 
- # fixlongs=$(cat "$config" | jq --raw-output '.fixLongS') - # if [[ "$fixlongs" == "true" ]]; then - # pushd "$workspace" - # ocrd-cis-log "fixing long s in file" - # for fg in $(ocrd workspace list-group | grep 'ALIGN'); do - # ocrd-cis-log "fixing long s in filegroup $fg" - # for xml in "$fg"/*; do - # ocrd-cis-log "fixing long s in file $xml" - # sed -i -e 's/\u017f/s/g' "$xml" - # done - # done - # popd - # fi -} - -# Run the training over the `-ALIGN-` filegroups in the workspace -# directory of the given mets.xml file. Usage: `ocrd-cis-run-training -# config mets`. -# * config: path to the configuration file -# * mets: path to the mets file -ocrd-cis-run-training() { - local config=$1 - local mets=$2 - local workspace=$(dirname "$mets") - local main="de.lmu.cis.ocrd.cli.Main" - local jar=$(cat "$config" | jq --raw-output '.jar') - local trainconfig=$(cat "$config" | jq --raw-output '.trainparampath') - - # get -ALIGN- filegroups - pushd "$workspace" - local trainfilegrps="" - for fg in $(ocrd workspace list-group); do - if [[ $fg == *"-ALIGN-"* ]]; then - trainfilegrps="$trainfilegrps -I $(basename $fg)" - fi - done - popd - # run training - ocrd-cis-log java -Dfile.encoding=UTF-8 -Xmx3g -cp $jar $main --log-level $LOG_LEVEL \ - -c train --mets "$mets" --parameter $trainconfig $trainfilegrps - java -Dfile.encoding=UTF-8 -Xmx3g -cp "$jar" "$main" --log-level "$LOG_LEVEL" \ - -c train --mets "$mets" --parameter "$trainconfig" $trainfilegrps -} - -# Run the evaluation over the `-ALIGN-` filegroups in the workspace -# directory of the given mets.xml file. Usage: -# `ocrd-cis-run-evaluation config mets`. 
-# * config: path to the configuration file -# * mets: path to the mets file -ocrd-cis-run-evaluation() { - local config=$1 - local mets=$2 - local workspace=$(dirname "$mets") - local main="de.lmu.cis.ocrd.cli.Main" - local jar=$(cat "$config" | jq --raw-output '.jar') - local evalconfig=$(cat "$config" | jq --raw-output '.evalparampath') - - # get -ALIGN- filegroups - pushd "$workspace" - local trainfilegrps="" - for fg in $(ocrd workspace list-group); do - if [[ $fg == *"-ALIGN-"* ]]; then - trainfilegrps="$trainfilegrps -I $(basename $fg)" - fi - done - popd - # run evaluation - for cmd in evaluate-dle evaluate-rrdm; do - ocrd-cis-log java -Dfile.encoding=UTF-8 -Xmx3g -cp "$jar" "$main" -c "$cmd" \ - --mets "$mets" \ - --parameter "$param" \ - $trainfilegrps \ - --log-level $LOG_LEVEL - java -Dfile.encoding=UTF-8 -Xmx3g -cp "$jar" "$main" -c "$cmd" \ - --mets "$mets" \ - --parameter "$evalconfig" \ - $trainfilegrps \ - --log-level $LOG_LEVEL - done -} - -# Download the ground truth archives and unzip them into a dedicated -# directory. Usage: `ocrd-cis-download-and-extract-ground-truth url -# dir`. 
-# * url: URL of the archives -# * dir: output directory for the extracted archives -ocrd-cis-download-and-extract-ground-truth() { - local url=$1 - local dir=$2 - mkdir -p "$dir" - pushd "$dir" - ocrd-cis-log "downloading $url" - wget -r -np -l1 -nd -N -A zip -erobots=off "$url" || true # ignore exit status of wget - for zip in *.zip; do - # this archive is broken - if [[ "$(basename $zip)" == $'bi\u00dfmarck_carmina_1657.zip' ]]; then continue; fi - unzip -u -o $zip - done - popd -} diff --git a/bashlib/ocrd-cis-pack-result-dir.sh b/bashlib/ocrd-cis-pack-result-dir.sh deleted file mode 100755 index 1d2f9b2f..00000000 --- a/bashlib/ocrd-cis-pack-result-dir.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -set -e - -# bdir=$(dirname "$0") -# source "$bdir/ocrd-cis-lib.sh" - -idir=$1 -odir=$2 -tar=$(basename "$idir") -tar="$odir/$tar.tar.bz2" - -GZIP=-9 # use best compression -pushd $idir -tar -cjf "$tar" **/train/*.txt -tar -tf "$tar" -echo $tar -popd diff --git a/bashlib/ocrd-cis-post-correct.sh b/bashlib/ocrd-cis-post-correct.sh deleted file mode 100644 index 267a5516..00000000 --- a/bashlib/ocrd-cis-post-correct.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -# set -x -source $(dirname $0)/ocrd-cis-lib.sh - -# Post correct OCR results. Args: -# * the option --mets for the workspace. -# * the option --log-level is passed to other tools. -# * the option --parameter is used as configuration. -# * the option --output-file-grp defines the output filegroup of the -# post-corrected page xml files. -# * the option --input-file-grp specifies the file group of the ocr -# results that will be processed. 
- -# -# tmp dir and cleanup -# -tmpdir=$(mktemp -d) -trap "rm -rfv $tmpdir" EXIT - -# -# command line arguments -# -LOG_LEVEL=INFO -if ocrd-cis-getopt -l --log-level $*; then - LOG_LEVEL=$OCRD_CIS_OPTARG -fi -ocrd-cis-debug "log-level: $LOG_LEVEL" -ocrd-cis-getopt -m --mets $* || ocrd-cis-fail "error: missing METS file (--mets)" -METS=$OCRD_CIS_OPTARG -METS=$(realpath $METS) -ocrd-cis-debug "mets: $METS" -ocrd-cis-getopt -p --parameter $* || ocrd-cis-fail "error: missing configuration file (--parameter)" -PARAMETER=$OCRD_CIS_OPTARG -PARAMETER=$(realpath $PARAMETER) -ocrd-cis-debug "parameter: $PARAMETER" -ocrd-cis-getopt -O --output-file-grp $* || ocrd-cis-fail "error: missing output file group (--output-file-grp)" -OUTPUT_FILE_GRP=$OCRD_CIS_OPTARG -ocrd-cis-debug "output file group: $OUTPUT_FILE_GRP" -ocrd-cis-getopt -I --input-file-grp $* || ocrd-cis-fail "error: missing input file group (--input-file-grp)" -INPUT_FILE_GRP=$OCRD_CIS_OPTARG -ocrd-cis-debug "input file group: $INPUT_FILE_GRP" - -# -# do additional ocrs and align -# -ocrd-cis-info "step: additional ocr and alignment" -# preset in case that there are no ocr steps -XML_INPUT_FILE_GRP="$INPUT_FILE_GRP" -XML_OUTPUT_FILE_GRP="$INPUT_FILE_GRP" -alignfgs="$XML_INPUT_FILE_GRP" # master ocr comes first -n=1 -for cmd in $(cat $PARAMETER | jq -r '.ocrSteps[] | @base64'); do - XML_OUTPUT_FILE_GRP="$XML_OUTPUT_FILE_GRP-OCR$n" - n=$((n+1)) - cmd=$(echo $cmd | base64 -d) - eval ocrd-cis-debug "$cmd" - eval $cmd || exit 1 - alignfgs="$alignfgs,$XML_OUTPUT_FILE_GRP" -done -alignfg="$XML_INPUT_FILE_GRP-ALIGN" -ocrd-cis-debug ocrd-cis-align --mets $METS \ - --input-file-grp "$alignfgs" \ - --output-file-grp "$alignfg" -ocrd-cis-align --mets $METS \ - --input-file-grp "$alignfgs" \ - --output-file-grp "$alignfg" - -# -# post correction -# -pcdir="$tmpdir/training" -mkdir -p "$pcdir" -main="de.lmu.cis.ocrd.cli.Main" -jar=$(ocrd-cis-data -jar) -nocr=$(jq ".ocrSteps | length+1" "$PARAMETER") -ocrd-cis-info "step: 
post-correction" -ocrd-cis-debug java -Dfile.encoding=UTF-8 -Xmx3g -cp $jar $main \ - --log-level $LOG_LEVEL \ - -c post-correct \ - --mets $METS \ - --parameter <(jq ".postCorrection.nOCR = \"$nocr\" | .postCorrection" "$PARAMETER") \ - --input-file-grp "$trainfgs" -java -Dfile.encoding=UTF-8 -Xmx3g -cp $jar $main \ - --log-level $LOG_LEVEL \ - -c post-correct \ - --mets $METS \ - --parameter <(jq ".postCorrection.nOCR = \"$nocr\" | .postCorrection" "$PARAMETER") \ - --input-file-grp "$alignfg" \ - --output-file-grp "$OUTPUT_FILE_GRP" - -# -# add protocols to the workspace -# -pushd $(dirname $METS) -if [[ -f "$pcdir/le-protocol.json" ]]; then - ocrd-cis-info "step: add lexicon extension protocol" - ocrd workspace add \ - --file-grp "$OUTPUT_FILE_GRP-LE-PROTOCOL" \ - --mimetype "application/json" \ - --file-id "ocrd-cis-le-protocol.json" \ - "$pcwdir/le-protocol.json" -fi -if [[ -f "$pcdir/dm-protocol.json" ]]; then - ocrd-cis-info "step: add desicion maker protocol" - ocrd workspace add \ - --file-grp "$OUTPUT_FILE_GRP-DM-PROTOCOL" \ - --mimetype "application/json" \ - --file-id "ocrd-cis-dm-protocol.json" \ - "$pcwdir/md-protocol.json" -fi -popd diff --git a/bashlib/ocrd-cis-synpage.sh b/bashlib/ocrd-cis-synpage.sh deleted file mode 100755 index 906f8237..00000000 --- a/bashlib/ocrd-cis-synpage.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -set -e - -idir=$1 -odir=$2 -n=$3 -if [[ -z "$n" ]]; then n=25; fi - -files="" -i=1 -j=1 -for f in $idir/*.gt.txt; do - if [[ -z "$files" ]]; then - files=$f - else - files="$files $f" - fi - if [[ $((i%n)) == 0 ]]; then - mkdir -p "$odir" - out=$(printf "$odir/%04d" $j) - gocrd synpage -o "$out" $files - files="" - j=$((j+1)) - fi - i=$((i+1)) -done -if [[ ! 
-z "$files" ]]; then - mkdir -p "$odir" - out=$(printf "$odir/%04d" $j) - gocrd synpage -o "$out" $files - files="" -fi diff --git a/bashlib/ocrd-cis-train.sh b/bashlib/ocrd-cis-train.sh deleted file mode 100755 index 21e4ee59..00000000 --- a/bashlib/ocrd-cis-train.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/bash -# set -x -source $(dirname $0)/ocrd-cis-lib.sh - -# Train a model. The tool somewhat mimics the behaviour of other ocrd -# tools: -# * the option --mets for the workspace. -# * the option --log-level is passed to other tools. -# * the option --parameter is used as configuration. -# * the option --output-file-grp defines the output filegroup of the -# model file. -# * the option --input-file-grp is ignored. - -# -# tmp dir and cleanup -# -tmpdir=$(mktemp -d) -trap "rm -rfv $tmpdir" EXIT - -# -# command line arguments -# -LOG_LEVEL=INFO -if ocrd-cis-getopt -l --log-level $*; then - LOG_LEVEL=$OCRD_CIS_OPTARG -fi -ocrd-cis-debug "log-level: $LOG_LEVEL" -ocrd-cis-getopt -m --mets $* || ocrd-cis-fail "error: missing METS file (--mets)" -METS=$OCRD_CIS_OPTARG -METS=$(realpath $METS) -ocrd-cis-debug "mets: $METS" -ocrd-cis-getopt -p --parameter $* || ocrd-cis-fail "error: missing configuration file (--parameter)" -PARAMETER=$OCRD_CIS_OPTARG -PARAMETER=$(realpath $PARAMETER) -ocrd-cis-debug "parameter: $PARAMETER" -ocrd-cis-getopt -O --output-file-grp $* || ocrd-cis-fail "error: missing output file group (--output-file-grp)" -OUTPUT_FILE_GRP=$OCRD_CIS_OPTARG -ocrd-cis-debug "output file group: $OUTPUT_FILE_GRP" - -# -# download and prepare ground truth from archives -# -ocrd-cis-info "step: ground truth" -pushd $(dirname $METS) -for archive in $(cat $PARAMETER | jq -r '.gtArchives[]'); do - archivedir="$tmpdir/$(basename $archive)" - mkdir -p "$archivedir" - if [[ $archive == http://* ]] || [[ $archive == https://* ]]; then - ocrd-cis-download-and-add-gt-zip "$archive" "$archivedir" - else - ocrd-cis-add-gt-zip "$archive" "$archivedir" - fi - if [[ -z $xmlfgs ]]; 
then - xmlfgs=$OCR_D_CIS_GT_FILEGRP - imgfgs=$OCR_D_CIS_IMG_FILEGRP - else - xmlfgs="$xmlfgs $OCR_D_CIS_GT_FILEGRP" - imgfgs="$imgfgs $OCR_D_CIS_IMG_FILEGRP" - fi -done -popd - -# -# do image pre-processing -# -ocrd-cis-info "step: image pre processing" -tmpimgfgs="" -tmpxmlfgs="" -for imgfg in $imgfgs; do - xmlfg=${imgfg/IMG/GT} - IMG_INPUT_FILE_GRP=$imgfg - XML_INPUT_FILE_GRP=$xmlfg - # preset in case that there are no image preprocessing steps - IMG_OUTPUT_FILE_GRP=$imgfg - XML_OUTPUT_FILE_GRP=$xmlfg - n=1 - for cmd in $(cat $PARAMETER | jq -r '.imagePreprocessingSteps[] | @base64'); do - IMG_OUTPUT_FILE_GRP="${imgfg/IMG/IMG-IPP$n}" - XML_OUTPUT_FILE_GRP="${xmlfg/GT/XML-IPP$n}" - - n=$((n+1)) - cmd=$(echo $cmd | base64 -d) - eval ocrd-cis-debug "$cmd" - eval $cmd - IMG_INPUT_FILE_GRP=$IMG_OUTPUT_FILE_GRP - XML_INPUT_FILE_GRP=$XML_OUTPUT_FILE_GRP - done - if [[ -z tmpimgfgs ]]; then - tmpimgfgs=$IMG_OUTPUT_FILE_GRP - tmpxmlfgs=$XML_OUTPUT_FILE_GRP - else - tmpimgfgs="$tmpimgfgs $IMG_OUTPUT_FILE_GRP" - tmpxmlfgs="$tmpxmlfgs $XML_OUTPUT_FILE_GRP" - fi -done -imgfgs=$tmpimgfgs -xmlfgs=$tmpxmlfgs - -# -# do the ocr and align -# -ocrd-cis-info "step: ocr and alignment" -for xmlfg in $xmlfgs; do - # preset in case that there are no ocr steps - XML_INPUT_FILE_GRP=$xmlfg - XML_OUTPUT_FILE_GRP=$xmlfg - alignfgs="" - n=1 - for cmd in $(cat $PARAMETER | jq -r '.ocrSteps[] | @base64'); do - XML_OUTPUT_FILE_GRP="${xmlfg/XML/OCR$n}" - n=$((n+1)) - cmd=$(echo $cmd | base64 -d) - eval ocrd-cis-debug "$cmd" - eval $cmd || exit 1 - if [[ -z $alignfgs ]]; then - alignfgs="$XML_OUTPUT_FILE_GRP" - else - alignfgs="$alignfgs,$XML_OUTPUT_FILE_GRP" - fi - done - alignfgs="$alignfgs,$xmlfg" # append gt filegroup - trainfg="${xmlfg/XML/ALIGN}" - ocrd-cis-align --mets $METS \ - --input-file-grp "$alignfgs" \ - --output-file-grp "$trainfg" - werfg="${xmlfg/XML/WER}" - ocrd-cis-wer --mets $METS \ - --input-file-grp "$trainfg" \ - --output-file-grp "$werfg" - # sadly we cannot use something 
like ocrd workspace find -G | grep ALIGN - if [[ -z $trainfgs ]]; then - trainfgs="$trainfg" - else - trainfgs="$trainfg,$trainfgs" - fi -done - -# -# training -# -traindir="$tmpdir/training" -mkdir -p "$traindir" -main="de.lmu.cis.ocrd.cli.Main" -nocr=$(jq ".ocrSteps | length" "$PARAMETER") -ocrd-cis-info "step: training" -# eval ocrd-cis-debug java -Dfile.encoding=UTF-8 -Xmx3g -cp $(ocrd-cis-data -jar) $main \ -# --log-level $LOG_LEVEL \ -# -c train \ -# --mets $METS \ -# --parameter <(jq ".training.dir = \"$traindir\"" "$PARAMETER") \ -# --input-file-grp "$trainfgs" -java -Dfile.encoding=UTF-8 -Xmx3g -cp $(ocrd-cis-data -jar) $main \ - --log-level $LOG_LEVEL \ - -c train \ - --mets $METS \ - --parameter <(jq ".training.dir = \"$traindir\" | .training.nOCR = \"$nocr\" | .training" "$PARAMETER") \ - --input-file-grp "$trainfgs" - -# -# add model and training resources to workspace -# -pushd $(dirname $METS) -ocrd-cis-info "step: cleanup" -ocrd workspace add \ - --file-grp "$OUTPUT_FILE_GRP" \ - --mimetype "application/zip" \ - --file-id "ocrd-cis-model.zip" \ - "$traindir/model.zip" -rm -rf "$traindir/model.zip" - -zip -r "$tmpdir/training.zip" "$traindir" -ocrd workspace add \ - --file-grp "$OUTPUT_FILE_GRP-TRAINING" \ - --mimetype "application/zip" \ - --file-id "ocrd-cis-training.zip" \ - "$tmpdir/training.zip" -popd diff --git a/bashlib/ocrd_cis.bash b/bashlib/ocrd_cis.bash deleted file mode 100644 index 82f35804..00000000 --- a/bashlib/ocrd_cis.bash +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -set -e - -PAGE_XML_MIME_TYPE="application/vnd.prima.page+xml" -CACHE_DIR="/tmp/ocrd_cis-py-cache" -TMP_DIR=${TMP_DIR:-$(mktemp -d -t ocrd_cis-tmp-XXXXXXXXX)} -PAGE_XML_FILES="" -PERSISTENT=${PERSISTENT:-no} -JAR="" - -function maybe_rmtd() { - if [[ "$PERSISTENT" = "yes" ]]; then - echo tmp dir = $TMP_DIR - else - echo removing $TMP_DIR - rm -rf $TMP_DIR - fi -} -trap maybe_rmtd EXIT - -function wget_cached() { - local url=$1 - local filename=$2 - local 
destdir=$TMP_DIR/downloads - - if test ! -f $CACHE_DIR/$filename; then - mkdir -p $CACHE_DIR - echo "downloading $url/$filename" - wget -P $CACHE_DIR $url/$filename - fi - mkdir -p $destdir - ln $CACHE_DIR/$filename $destdir/$filename ||\ - cp $CACHE_DIR/$filename $destdir/$filename -} - -function download_ocrd_gt_zip() { - local url="http://www.ocr-d.de/sites/all/GTDaten" - local filename=$1 - wget_cached $url $filename -} - -function unzip_ocrd_gt() { - local zip="$TMP_DIR/downloads/$1" - echo unziping $zip - unzip -d $TMP_DIR/downloads ${zip/.zip/} >/dev/null -} - -function download_and_unzip_ocrd_gt() { - download_ocrd_gt_zip $1 - unzip_ocrd_gt $1 -} - -function get_page_xml_files() { - PAGE_XML_FILES="" - for d in $@; do - for d in $(find $TMP_DIR/downloads -type d -name page); do - for f in $(find $d -type f | sort); do - PAGE_XML_FILES+=" $f" - done - done - done -} - -function download_ocrd_jar() { - local url='http://www.cis.lmu.de/~finkf' - wget_cached $url "ocrd-0.1.jar" - JAR="$TMP_DIR/downloads/ocrd-0.1.jar" -} - -# sets PERSISTENT and ARG variables -function parse_cmd_line_args() { - for arg in "$@"; do - case $arg in - -p|--persistent) - PERSISTENT=yes - ;; - *) - ARG=$arg - esac - done -} - -function setup_ocrd_test_environment() { - download_and_unzip_ocrd_gt $1 - download_ocrd_jar -} - -function assert_file_group_exists() { - pushd $TMP_DIR - ocrd workspace list-group | grep "$1" && true || exit 1 - popd -} diff --git a/data/docker/deps.txt b/data/docker/deps.txt deleted file mode 100644 index af8c76fa..00000000 --- a/data/docker/deps.txt +++ /dev/null @@ -1,7 +0,0 @@ -cmake -jq -libcppunit-dev -libxerces-c-dev -locales -maven -wget diff --git a/data/docker/ocrd-cis-ocropy-fraktur1.json b/data/docker/ocrd-cis-ocropy-fraktur1.json deleted file mode 100644 index 01ea4866..00000000 --- a/data/docker/ocrd-cis-ocropy-fraktur1.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "textequiv_level": "glyph", - "model": "${DATA}/models/fraktur1-00085000.pyrnn.gz" -} 
diff --git a/data/docker/ocrd-cis-ocropy-fraktur2.json b/data/docker/ocrd-cis-ocropy-fraktur2.json deleted file mode 100644 index 34e8d941..00000000 --- a/data/docker/ocrd-cis-ocropy-fraktur2.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "textequiv_level": "glyph", - "model": "${DATA}/models/fraktur2-00062000.pyrnn.gz" -} diff --git a/data/docker/ocrd-cis-post-correction.json b/data/docker/ocrd-cis-post-correction.json deleted file mode 100644 index eb1bed49..00000000 --- a/data/docker/ocrd-cis-post-correction.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "model": "${DATA}/models/model.zip", - "jar": "${DATA}/ocrd-cis.jar", - "nOCR": 2, - "trigrams": "${DATA}/models/character-trigrams.csv", - "lexiconExtensionProtocol": "", - "decisionMakerProtocol": "", - "additionalLexicon": "", - "runLexiconExtension": true, - "runDescisionMaker": true, - "profiler": { - "type": "local", - "executable": "/apps/profiler", - "config": "${DATA}/languages/german.ini", - "cacheDir": "${DATA}/cache" - }, - "ocr": [ - { - "type": "ocropy", - "path": "${DATA}/config/ocrd-cis-ocropy-fraktur1.json" - } - ] -} \ No newline at end of file diff --git a/ocrd_cis/__init__.py b/ocrd_cis/__init__.py index 67a2d879..6f37f4f7 100644 --- a/ocrd_cis/__init__.py +++ b/ocrd_cis/__init__.py @@ -1,7 +1,3 @@ -from .javaprocess import JavaProcess -from .javaprocess import JavaTrain from .javaprocess import JavaAligner -from .javaprocess import JavaProfiler -from .javaprocess import JavaEvalDLE -from .javaprocess import JavaEvalRRDM +from .javaprocess import JavaPostCorrector from .ocrd_tool import get_ocrd_tool diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 9bc384d6..c06f01fe 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -118,7 +118,7 @@ def align_words(self, lines): # self.log.info(json.dumps(lines[0].alignment)) mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] - for word in lines[0].alignment['words']: + for word in 
lines[0].alignment['wordAlignments']: self.log.debug("aligning word %s", word['master']) master, rest = self.find_word([word['master']], mregion, "master") mregion = rest diff --git a/ocrd_cis/data/__main__.py b/ocrd_cis/data/__main__.py index 1552e025..3d8ef735 100644 --- a/ocrd_cis/data/__main__.py +++ b/ocrd_cis/data/__main__.py @@ -2,12 +2,19 @@ import sys def main(): - if '-jar' in sys.argv: + usage = 'usage: ' + sys.argv[0] + ' -jar|-3gs|-model|-config' + if '-h' in sys.argv: + print(usage) + elif '-jar' in sys.argv: print(pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) elif '-3gs' in sys.argv: print(pkg_resources.resource_filename('ocrd_cis', 'data/3gs.csv.gz')) + elif '-model' in sys.argv: + print(pkg_resources.resource_filename('ocrd_cis', 'data/model.zip')) + elif '-config' in sys.argv: + print(pkg_resources.resource_filename('ocrd_cis', 'data/config.json')) else: - raise ValueError('usage: ' + sys.argv[0] + ' -jar|-3gs') + raise ValueError(usage) if __name__ == "__main__": main() diff --git a/ocrd_cis/data/config.json b/ocrd_cis/data/config.json new file mode 100644 index 00000000..6a6b57ba --- /dev/null +++ b/ocrd_cis/data/config.json @@ -0,0 +1,216 @@ +{ + "runLE": true, + "runDM": true, + "profiler": { + "path": "/path/to/profiler", + "config": "/path/to/language.ini" + }, + "nOCR": 2, + "maxCandidates": 10, + "dir": "/path/to/train.dir", + "trigrams": "/path/to/trigrams.csv", + "ocropusOCRExtensions": [], + "ocropusImageExtension": "", + "filterClasses": ["deactivate"], + "leFeatures": [ + { + "type": "de.lmu.cis.ocrd.ml.features.TokenLengthClassFeature", + "name": "TokenLengthClass", + "short": 3, + "medium": 8, + "long": 13 + }, + { + "type": "de.lmu.cis.ocrd.ml.features.TokenCaseClassFeature", + "name": "TokenCaseClass" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.TokenLengthFeature", + "name": "TokenLength" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.MaxOCRCharacterConfidenceFeature", + "name": "MaxOCRConfidence" + }, + { 
+ "type": "de.lmu.cis.ocrd.ml.features.MatchingOCRTokensFeature", + "name": "MatchingOCRTokens" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.LineOverlapWithMasterOCRFeature", + "name": "LineOverlap" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.LinePositionFeature", + "name": "LinePosition" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.OCRWordConfidenceFeature", + "name": "WordConfidence" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateMatchesOCRFeature", + "name": "HighestRankedCandidateMatches" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateHistoricalPatternsDistanceFeature", + "name": "HighestRankedCandidateHistPatternsDistance" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateOCRPatternsDistanceFeature", + "name": "HighestRankedCandidateOCRPatternsDistance" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateDistanceToNextFeature", + "name": "HighestRankedCandidateDistanceToNext" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateVoteWeightFeature", + "name": "HighestRankedCandidateVoteWeight" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.UnigramFeature", + "name": "UnigramOCRRelativeFrequency" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.LevenshteinDistanceFeature", + "name": "LevenshteinDistance", + "maxThreshold": 5 + } + ], + "rrFeatures": [ + { + "type": "de.lmu.cis.ocrd.ml.features.TokenLengthClassFeature", + "name": "TokenLengthClass", + "short": 3, + "medium": 7, + "long": 13 + }, + { + "type": "de.lmu.cis.ocrd.ml.features.TokenCaseClassFeature", + "name": "Tokenshape", + "class": "Tokenshape" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.TokenLengthFeature", + "name": "TokenLength" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.MaxCharNGramsFeature", + "name": "MaxCharTrigram" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.MinCharNGramsFeature", + "name": "MinCharTrigram" + }, + { + "type": 
"de.lmu.cis.ocrd.ml.features.MaxOCRCharacterConfidenceFeature", + "name": "MaxOCRConfidence" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.MatchingOCRTokensFeature", + "name": "MatchingOCRTokens" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.LineOverlapWithMasterOCRFeature", + "name": "LineOverlap" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.LinePositionFeature", + "name": "LinePosition" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.OCRWordConfidenceFeature", + "name": "WordConfidence" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateCaseClassFeature", + "name": "CandidateCaseClass" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateHistoricalPatternsDistanceFeature", + "name": "CandidateHistoricalPatternsDistance" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateLengthClassFeature", + "name": "CandidateLengthClass", + "short": 3, + "medium": 8, + "long": 13 + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateLengthFeature", + "name": "CandidateLength" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateMatchesOCRTokenFeature", + "name": "CandidateMatchesOCR" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateMatchingOCRsFeature", + "name": "CandidateMatchingOCRs" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateMaxCharNGramsFeature", + "name": "CandidateMaxCharNGram" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateMinCharNGramsFeature", + "name": "CandidateMinCharNGram" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateMaxHistoricalPatternConfidenceFeature", + "name": "CandidateMaxHistoricalPatternConfidence" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateMinHistoricalPatternConfidenceFeature", + "name": "CandidateMinHistoricalPatternConfidence" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateVoteWeightFeature", + "name": "CandidateVoteWeight" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateUnigramFeature", + "name": "CandidateUnigram" + }, + { + "type": 
"de.lmu.cis.ocrd.ml.features.UnigramFeature", + "name": "UnigramOCRRelativeFrequency" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateOCRPatternConfidenceFeature", + "name": "CandidateOCRPatternConfidenceFeature", + "classes": ["deactivate"] + } + ], + "dmFeatures": [ + { + "type": "de.lmu.cis.ocrd.ml.features.DMBestRankFeature", + "name": "BestRank" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.DMDifferenceToNextRankFeature", + "name": "BestRankDifferenceToNext" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateVoteWeightFeature", + "name": "CandidateVoteWeight" + }, + { + "type": "de.lmu.cis.ocrd.ml.features.CandidateOCRPatternConfidenceFeature", + "name": "OCRPatternConfidenceFeature", + "classes": ["deactivate"] + }, + { + "type": "de.lmu.cis.ocrd.ml.features.MinOCRCharacterConfidenceFeature", + "name": "MinOCRCharacterConfidence", + "classes": ["deactivate"] + }, + { + "type": "de.lmu.cis.ocrd.ml.features.AverageOCRCharacterConfidenceFeature", + "name": "AverageOCRCharacterConfidence", + "classes": ["deactivate"] + } + ] +} diff --git a/ocrd_cis/data/model.zip b/ocrd_cis/data/model.zip new file mode 100644 index 00000000..1b587274 Binary files /dev/null and b/ocrd_cis/data/model.zip differ diff --git a/ocrd_cis/data/ocrd-cis.jar b/ocrd_cis/data/ocrd-cis.jar index a21ecd20..0350d3b0 100644 Binary files a/ocrd_cis/data/ocrd-cis.jar and b/ocrd_cis/data/ocrd-cis.jar differ diff --git a/ocrd_cis/javaprocess.py b/ocrd_cis/javaprocess.py index 07ce0a2e..ce2f6bfd 100644 --- a/ocrd_cis/javaprocess.py +++ b/ocrd_cis/javaprocess.py @@ -14,8 +14,8 @@ def JavaAligner(n, loglvl): '--log-level', loglvl, '--parameter', '{}'.format(json.dumps({'n':n}))]) -def JavaProfiler(mets, ifg, ofg, params, loglvl): - return JavaProcess(JAR, ['-c', 'profile', +def JavaPostCorrector(mets, ifg, ofg, params, loglvl): + return JavaProcess(JAR, ['-c', 'post-correct', '--log-level', loglvl, '--input-file-grp', ifg, '--output-file-grp', ofg, @@ -23,45 +23,6 @@ def 
JavaProfiler(mets, ifg, ofg, params, loglvl): '-p', "{}".format(json.dumps(params))]) -def JavaTrain(jar, mets, ifgs, parameter, loglvl="DEBUG"): - args = [ - "-c", "train", - "--mets", mets, - "--log-level", loglvl, - "--parameter", parameter - ] - for ifg in ifgs: - args.append("-I") - args.append(ifg) - return JavaProcess(jar, args) - - -def JavaEvalDLE(jar, mets, ifgs, parameter, loglvl="DEBUG"): - args = [ - '-c', 'evaluate-dle', - '--mets', mets, - '--log-level', loglvl, - '--parameter', parameter - ] - for ifg in ifgs: - args.append('-I') - args.append(ifg) - return JavaProcess(jar, args) - - -def JavaEvalRRDM(jar, mets, ifgs, parameter, loglvl="DEBUG"): - args = [ - '-c', 'evaluate-rrdm', - '--mets', mets, - '--log-level', loglvl, - '--parameter', parameter - ] - for ifg in ifgs: - args.append('-I') - args.append(ifg) - return JavaProcess(jar, args) - - class JavaProcess: def __init__(self, jar, args): self.jar = jar @@ -106,17 +67,17 @@ def exe(self): """ cmd = self.get_cmd() self.log.info('command: %s', " ".join(cmd)) - ret = subprocess.run( - cmd, - stderr=subprocess.PIPE, - check=False, - universal_newlines=True, - ) - self.log.debug("%s: %i", " ".join(cmd), ret.returncode) - if ret.returncode != 0: - raise ValueError( - "cannot execute {}: {}\n{}" - .format(" ".join(cmd), ret.returncode, ret.stderr)) + with subprocess.Popen( + cmd, + stderr=subprocess.PIPE + ) as p: + sout, eout = p.communicate() + self.log_stderr(eout) + retval = p.wait() + if retval != 0: + raise ValueError( + "cannot execute {}: {}\n{}" + .format(" ".join(cmd), retval, eout.decode('utf-8'))) def log_stderr(self, err): for line in err.decode("utf-8").split("\n"): diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index ad4fe48d..c815b37e 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/cisocrgroup/ocrd_cis", - "version": "0.0.7", + "version": "0.0.8", "tools": { "ocrd-cis-ocropy-binarize": { "executable": 
"ocrd-cis-ocropy-binarize", @@ -281,6 +281,10 @@ "steps": [ "recognition/text-recognition" ], + "input_file_grp": [ + "OCR-D-GT-SEG-BLOCK", + "OCR-D-SEG-BLOCK" + ], "description": "Recognize text snippets", "parameters": { "model": { @@ -387,15 +391,20 @@ "ocrd-cis-ocropy-train": { "executable": "ocrd-cis-ocropy-train", "categories": [ - "lstm ocropy model training" + "Text recognition and optimization" ], "steps": [ - "training" + "recognition/text-recognition" + ], + "input_file_grp": [ + "OCR-D-GT-SEG-BLOCK", + "OCR-D-SEG-BLOCK" ], "description": "train model with ground truth from mets data", "parameters": { "textequiv_level": { "type": "string", + "description": "PAGE XML hierarchy level granularity", "enum": ["line", "word", "glyph"], "default": "line" }, @@ -404,7 +413,8 @@ "description": "load model or crate new one (e.g. fraktur.pyrnn)" }, "ntrain": { - "type": "integer", + "type": "number", + "format": "integer", "description": "lines to train before stopping", "default": 1000000 }, @@ -420,316 +430,65 @@ "Text recognition and optimization" ], "steps": [ - "postprocessing/alignment" + "recognition/post-correction" ], - "description": "Align multiple OCRs and/or GTs" - }, - "ocrd-cis-wer": { - "executable": "ocrd-cis-wer", - "categories": [ - "evaluation" + "input_file_grp": [ + "OCR-D-OCR-1", + "OCR-D-OCR-2", + "OCR-D-OCR-N" ], - "steps": [ - "evaluation" + "output_file_grp": [ + "OCR-D-ALIGNED" ], - "description": "calculate the word error rate for aligned page xml files", - "parameters": { - "testIndex": { - "description": "text equiv index for the test/ocr tokens", - "type": "integer", - "default": 0 - }, - "gtIndex": { - "type": "integer", - "description": "text equiv index for the gt tokens", - "default": -1 - } - } + "description": "Align multiple OCRs and/or GTs" }, - "ocrd-cis-jar": { - "executable": "ocrd-cis-jar", + "ocrd-cis-postcorrect": { + "executable": "ocrd-cis-postcorrect", "categories": [ "Text recognition and optimization" ], "steps": [ 
- "postprocessing/alignment" + "recognition/post-correction" ], - "description": "Output path to the ocrd-cis.jar file" - }, - "ocrd-cis-profile": { - "executable": "ocrd-cis-profile", - "categories": [ - "Text recognition and optimization" + "description": "Post correct OCR results", + "input_file_grp": [ + "OCR-D-LINE-ALIGNED" ], - "steps": [ - "postprocessing/alignment" + "output_file_grp": [ + "OCR-D-POST-CORRECTED" ], - "description": "Add a correction suggestions and suspicious tokens (profile)", "parameters": { - "executable": { - "type": "string", - "required": true + "maxCandidates": { + "description": "Maximum number of considered correction candidates per suspicious token", + "type": "number", + "format": "integer", + "default": 10 }, - "backend": { + "profilerPath": { + "description": "Path to the profiler executable", + "required": true, + "type": "string" + }, + "profilerConfig": { + "description": "Path to the profiler's language config file", + "required": true, + "type": "string" + }, + "model": { + "description": "Path to the post correction model file", "type": "string", "required": true }, - "language": { - "type": "string", - "required": false, - "default": "german" + "nOCR": { + "description": "Number of parallel OCR's to use for the post correction", + "type": "number", + "format": "integer", + "default": 1 }, - "additionalLexicon": { - "type": "string", - "required": false, - "default": "" - } - } - }, - "ocrd-cis-train": { - "executable": "ocrd-cis-train.sh", - "categories": [ - "Text recognition and optimization" - ], - "steps": [ - "postprocessing/alignment" - ], - "description": "Train post correction model", - "parameters": { - "gtArchives": { - "description": "List of ground truth archives", - "type": "array", - "required": true, - "items": { - "description": "Path (or URL) to a ground truth archive", - "type": "string" - } - }, - "imagePreprocessingSteps": { - "description": "List of image preprocessing steps", - "type": "array", - 
"required": true, - "items": { - "description": "Image preprocessing command that is evaled using the bash eval command (available parameters: $METS, $LOG_LEVEL, $XML_INPUT_FILE_GRP, $XML_OUTPUT_FILE_GRP, $IMG_OUTPUT_FILE_GRP, $IMG_INPUT_FILE_GRP, $PARAMETER)", - "type": "string" - } - }, - "ocrSteps": { - "description": "List of ocr steps", - "type": "array", - "required": true, - "items": { - "description": "OCR command that is evaled using the bash eval command (available parameters: $METS, $LOG_LEVEL, $XML_INPUT_FILE_GRP, $XML_OUTPUT_FILE_GRP, $PARAMETER)", - "type": "string" - } - }, - "training": { - "description": "Configuration of training command", - "type": "object", - "required": [ - "trigrams", - "maxCandidate", - "profiler", - "leFeatures", - "rrFeatures", - "dmFeatures" - ], - "properties": { - "trigrams": { - "description": "Path to character trigrams csv file (format: n,trigram)", - "type": "string", - "required": true - }, - "maxCandidate": { - "description": "Maximum number of considered profiler candidates per token", - "type": "integer", - "required": true - }, - "filterClasses": { - "description": "List of filtered feature classes", - "required": false, - "type": "array", - "items": { - "description": "Class name of feature class to filter", - "type": "string" - } - }, - "profiler": { - "description": "Profiler configuration", - "type": "object", - "required": [ - "path", - "config" - ], - "properties": { - "path": { - "description": "Path to the profiler executable", - "required": true, - "type": "string" - }, - "config": { - "description": "Path to the profiler language config file", - "required": true, - "type": "string" - } - } - }, - "leFeatures": { - "description": "List of the lexicon extension features", - "required": true, - "type": "array", - "items": { - "description": "Feature configuration", - "type": "object", - "required": [ - "type", - "name" - ], - "properties": { - "name": { - "description": "Name of the feature", - "type": 
"string" - }, - "type": { - "description": "Fully qualified java class name of the feature", - "type": "string" - }, - "class": { - "description": "Class name of the feature", - "type": "string" - } - } - } - }, - "rrFeatures": { - "description": "List of the reranker features", - "required": true, - "type": "array", - "items": { - "description": "Feature configuration", - "type": "object", - "required": [ - "type", - "name" - ], - "properties": { - "name": { - "description": "Name of the feature", - "type": "string" - }, - "type": { - "description": "Fully qualified java class name of the feature", - "type": "string" - }, - "class": { - "description": "Class name of the feature", - "type": "string" - } - } - } - }, - "dmFeatures": { - "description": "List of the desicion maker features", - "required": true, - "type": "array", - "items": { - "description": "Feature configuration", - "type": "object", - "required": [ - "type", - "name" - ], - "properties": { - "name": { - "description": "Name of the feature", - "type": "string" - }, - "type": { - "description": "Fully qualified java class name of the feature", - "type": "string" - }, - "class": { - "description": "Class name of the feature", - "type": "string" - } - } - } - } - } - } - } - }, - "ocrd-cis-post-correct": { - "executable": "ocrd-cis-post-correct.sh", - "categories": [ - "Text recognition and optimization" - ], - "steps": [ - "postprocessing/alignment" - ], - "description": "Post correct OCR results", - "parameters": { - "ocrSteps": { - "description": "List of additional ocr steps", - "type": "array", - "required": true, - "items": { - "description": "OCR command that is evaled using the bash eval command (available parameters: $METS, $LOG_LEVEL, $XML_INPUT_FILE_GRP, $XML_OUTPUT_FILE_GRP, $PARAMETER)", - "type": "string" - } - }, - "postCorrection": { - "description": "Configuration of post correction command", - "type": "object", - "required": [ - "maxCandidate", - "profiler", - "model", - "runLE", - 
"runDM" - ], - "properties": { - "maxCandidate": { - "description": "Maximum number of considered profiler candidates per token", - "type": "integer", - "required": true - }, - "profiler": { - "description": "Profiler configuration", - "type": "object", - "required": [ - "path", - "config" - ], - "properties": { - "path": { - "description": "Path to the profiler executable", - "required": true, - "type": "string" - }, - "config": { - "description": "Path to the profiler language config file", - "required": true, - "type": "string" - } - } - }, - "model": { - "description": "Path to the post correction model file", - "type": "string", - "required": true - }, - "runLE": { - "description": "Do run the lexicon extension step for the post correction", - "required": true, - "type": "boolean" - }, - "runDM": { - "description": "Do run the ranking and the decision step for the post correction", - "required": true, - "type": "boolean" - } - } + "runLE": { + "description": "Do run the lexicon extension step for the post correction", + "type": "boolean", + "default": false } } } diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 2b34abbc..14ac9563 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -146,7 +146,7 @@ def process(self): pcgts = page_from_file(self.workspace.download_file(input_file)) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() - + # add metadata about this operation and its runtime parameters: metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( @@ -159,7 +159,7 @@ def process(self): Label=[LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys()])])) - + page_image, page_coords, _ = self.workspace.image_from_page( page, page_id) @@ -169,7 +169,7 @@ def process(self): if not regions: LOG.warning("Page '%s' contains no text regions", page_id) self.process_regions(regions, maxlevel, 
page_image, page_coords) - + # update METS (add the PAGE file): file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) diff --git a/ocrd_cis/profile/__init__.py b/ocrd_cis/postcorrect/__init__.py similarity index 100% rename from ocrd_cis/profile/__init__.py rename to ocrd_cis/postcorrect/__init__.py diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py new file mode 100644 index 00000000..d9033981 --- /dev/null +++ b/ocrd_cis/postcorrect/cli.py @@ -0,0 +1,46 @@ +from __future__ import absolute_import +import click +import json +import os +from ocrd import Processor +from ocrd.decorators import ocrd_cli_options +from ocrd.decorators import ocrd_cli_wrap_processor +from ocrd_utils import getLogger +from ocrd_models.ocrd_mets import OcrdMets +from ocrd_cis import JavaPostCorrector +from ocrd_cis import get_ocrd_tool + +LOG_LEVEL = 'INFO' + +@click.command() +@ocrd_cli_options +def ocrd_cis_postcorrect(*args, **kwargs): + if 'log_level' in kwargs: + global LOG_LEVEL + LOG_LEVEL = kwargs['log_level'] + return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs) + +class PostCorrector(Processor): + def __init__(self, *args, **kwargs): + ocrd_tool = get_ocrd_tool() + kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect'] + kwargs['version'] = ocrd_tool['version'] + super(PostCorrector, self).__init__(*args, **kwargs) + self.log = getLogger('cis.Processor.PostCorrector') + + def process(self): + ifgs = self.input_file_grp.split(",") # input file groups + ofg = self.output_file_grp + profiler = {} + profiler["path"] = self.parameter["profilerPath"] + profiler["config"] = self.parameter["profilerConfig"] + profiler["noCache"] = True + self.parameter["profiler"] = profiler + self.parameter["runDM"] = True + metspath = os.path.join(self.workspace.directory, "mets.xml") + print(json.dumps(self.parameter, indent=4)) + p = JavaPostCorrector(metspath, ",".join(ifgs), ofg, self.parameter, LOG_LEVEL) + p.exe() + # reload the mets 
file to prevent it from overriding the + # updated version from the java process + self.workspace.mets = OcrdMets(filename=metspath) diff --git a/ocrd_cis/profile/cli.py b/ocrd_cis/profile/cli.py deleted file mode 100644 index 41d82ba7..00000000 --- a/ocrd_cis/profile/cli.py +++ /dev/null @@ -1,38 +0,0 @@ -import click - -from ocrd.decorators import ocrd_cli_options -from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd import Processor -from ocrd_utils import getLogger -from ocrd_cis import get_ocrd_tool -from ocrd_cis import JavaProfiler - -LOG_LEVEL = 'INFO' - -@click.command() -@ocrd_cli_options -def ocrd_cis_profile(*args, **kwargs): - global LOG_LEVEL - if 'log_level' in kwargs: - LOG_LEVEL = kwargs['log_level'] - return ocrd_cli_wrap_processor(Profiler, *args, **kwargs) - -class Profiler(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-profile'] - kwargs['version'] = ocrd_tool['version'] - super(Profiler, self).__init__(*args, **kwargs) - self.log = getLogger('cis.Processor.Profiler') - - def process(self): - global LOG_LEVEL - self.log.debug("starting java client") - self.log.debug("LOG_LEVEL = %s", LOG_LEVEL) - p = JavaProfiler(self.workspace.mets_target, self.input_file_grp, - self.output_file_grp, self.parameter, - LOG_LEVEL or 'INFO') - p.exe() - # Reload the updated METS file to make sure that run_processor - # does not overwrite the updated file with the old. 
- self.workspace.reload_mets() diff --git a/setup.py b/setup.py index fbd3d5fe..f582318c 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,7 @@ """ Installs: - ocrd-cis-align - - ocrd-cis-training - - ocrd-cis-profile - - ocrd-cis-wer + - ocrd-cis-postcorrect - ocrd-cis-data - ocrd-cis-ocropy-clip - ocrd-cis-ocropy-denoise @@ -25,7 +23,7 @@ setup( name='ocrd_cis', - version='0.0.7', + version='0.0.8', description='CIS OCR-D command line tools', long_description=README, long_description_content_type='text/markdown', @@ -49,18 +47,12 @@ 'calamari_ocr == 0.3.5' ], package_data={ - '': ['*.json', '*.yml', '*.yaml', '*.csv.gz', '*.jar'], + '': ['*.json', '*.yml', '*.yaml', '*.csv.gz', '*.jar', '*.zip'], }, - scripts=[ - 'bashlib/ocrd-cis-lib.sh', - 'bashlib/ocrd-cis-train.sh', - 'bashlib/ocrd-cis-post-correct.sh', - ], entry_points={ 'console_scripts': [ 'ocrd-cis-align=ocrd_cis.align.cli:ocrd_cis_align', - 'ocrd-cis-profile=ocrd_cis.profile.cli:ocrd_cis_profile', - 'ocrd-cis-wer=ocrd_cis.wer.cli:ocrd_cis_wer', + 'ocrd-cis-postcorrect=ocrd_cis.postcorrect.cli:ocrd_cis_postcorrect', 'ocrd-cis-data=ocrd_cis.data.__main__:main', 'ocrd-cis-ocropy-binarize=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_binarize', 'ocrd-cis-ocropy-clip=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_clip', diff --git a/tests/data/benner_herrnhuterey04_1748_0015-wer.xml b/tests/data/benner_herrnhuterey04_1748_0015-wer.xml deleted file mode 100644 index f95028d1..00000000 --- a/tests/data/benner_herrnhuterey04_1748_0015-wer.xml +++ /dev/null @@ -1,73 +0,0 @@ - - - - OCR-D - 2016-10-10T14:19:48.077+02:00 - 2017-01-04T10:32:27.651+01:00 - - - - - - - - - - - - - - - - - - - - - - - - - - - first - - - first - - - first - - - - - - second - - - second - - - secondx - - - - - - third - - - third - - - thirdx - - - - first second third - - - - - - diff --git a/tests/data/benner_herrnhuterey04_1748_0015.xml b/tests/data/benner_herrnhuterey04_1748_0015.xml deleted file mode 100644 index ae5e743f..00000000 --- 
a/tests/data/benner_herrnhuterey04_1748_0015.xml +++ /dev/null @@ -1,1499 +0,0 @@ - - - - OCR-D - 2016-10-10T14:19:48.077+02:00 - 2017-01-04T10:32:27.651+01:00 - - - - - - - - - - - - - - - - - - - - - - - - - - - vierter - - - - - - Theil. - - - - vierter Theil. - - - - - vierter Theil. - - - - - - - - - - - 3 - - - - 3 - - - - - 3 - - - - - - - - - - - geſtalt - - - - - - - vor - - - - - - - ihren - - - - - - - Stifter - - - - - - - erklaͤret, - - - - - - - daß - - - - - - - ſie - - - - - - - aus - - - - - geſtalt vor ihren Stifter erklaͤret, daß ſie aus - - - - - geſtalt vor ihren Stifter erklaͤret, daß ſie aus - - - - - - - - - - - deſen - - - - - deſen - - - - deſen - - - - - - - - - - - Buch, - - - - - - ſtehen - - - - - - ſ. - - - - - - 29. - - - - - - wo - - - - - - es - - - - - - heiſet: - - - - - - Zweyte - - - - Buch, ſtehen ſ. 29. wo es heiſet: Zweyte - - - - - - - - - Warheit - - - - - - Daß - - - - - - ich - - - - - - die - - - - - - Puͤncktlichkeit - - - - Warheit Daß ich die Puͤncktlichkeit - - - - - - - - - - meiner - - - - - - Lutheriſchen - - - - - - Verfaſſung - - - - - - ſoweit - - - - meiner Lutheriſchen Verfaſſung ſoweit - - - - - - - - - - pouſſiret/ - - - - - - - daß - - - - - - ich - - - - - - ſogar - - - - - - in - - - - - - Penſylvanien - - - - pouſſiret/ daß ich ſogar in Penſylvanien - - - - - - - - - alwo - - - - - - mein - - - - - - Haupt-Augenmerck - - - - - - war/ - - - - - - den - - - - alwo mein Haupt-Augenmerck war/ den - - - - - - - - - - Verfall - - - - - - der - - - - - - Religionen - - - - - - und - - - - - - aller - - - - - - Kirch- - - - - Verfall der Religionen und aller Kirch- - - - - - - - - - - lichkeit - - - - - - uͤberhaupt, - - - - - - zu - - - - - - redreſſiren, - - - - - - - we- - - - - lichkeit uͤberhaupt, zu redreſſiren, we- - - - - - - - - - gen - - - - - - der - - - - - - diverſen - - - - - - - Liturgien/ - - - - - - die - - - - - - ernſte - - - - gen der diverſen Liturgien/ die ernſte - - - - - - - - - 
Verfuͤgung - - - - - - getroffen/ - - - - - - daß - - - - - - weder - - - - - - ein - - - - Verfuͤgung getroffen/ daß weder ein - - - - - - - - - Reformirter/ - - - - - - noch - - - - - - ein - - - - - - Maͤhriſcher - - - - - - Bru- - - - - Reformirter/ noch ein Maͤhriſcher Bru- - - - - - - - - - - der/ - - - - - - bey - - - - - - unſerer - - - - - - dortigen - - - - - - Lutheriſchen - - - - der/ bey unſerer dortigen Lutheriſchen - - - - - - - - - - Communion - - - - - - admittiret - - - - - - - werde/ - - - - - - und - - - - - - auf - - - - Communion admittiret werde/ und auf - - - - - - - - - dem - - - - - - oͤffentlichen - - - - - - Confeſſu - - - - - - - aller - - - - - - proteſtan- - - - - dem oͤffentlichen Confeſſu aller proteſtan- - - - - - - - - - tiſchen - - - - - - Religionen - - - - - - daſelbſt/ - - - - - - mich - - - - - - vor - - - - tiſchen Religionen daſelbſt/ mich vor - - - - - - - - - - den - - - - - - Lutheriſchen - - - - - - teutſchen - - - - - - Pfarrer - - - - - - von - - - - den Lutheriſchen teutſchen Pfarrer von - - - - - - - - - - Philadelphia/ - - - - - - der - - - - - - ich - - - - - - war/ - - - - - - den - - - - - - erſten/ - - - - Philadelphia/ der ich war/ den erſten/ - - - - - - - - - - und - - - - - - ſo - - - - - - lange - - - - - - ich - - - - - - da - - - - - - war/ - - - - - - ohne - - - - - - einige - - - - - - Con- - - - - - und ſo lange ich da war/ ohne einige Con- - - - - - - - - - currenz - - - - - - - mit - - - - - - einigem - - - - - - andern/ - - - - - - und - - - - - - ſonſt - - - - currenz mit einigem andern/ und ſonſt - - - - - - - - - vor - - - - - - nichts/ - - - - - - angegeben - - - - - - und - - - - - - geriret. - - - - vor nichts/ angegeben und geriret. - - - - - Buch, ſtehen ſ. 29. 
wo es heiſet: Zweyte -Warheit Daß ich die Puͤncktlichkeit -meiner Lutheriſchen Verfaſſung ſoweit -pouſſiret/ daß ich ſogar in Penſylvanien -alwo mein Haupt-Augenmerck war/ den -Verfall der Religionen und aller Kirch- -lichkeit uͤberhaupt, zu redreſſiren, we- -gen der diverſen Liturgien/ die ernſte -Verfuͤgung getroffen/ daß weder ein -Reformirter/ noch ein Maͤhriſcher Bru- -der/ bey unſerer dortigen Lutheriſchen -Communion admittiret werde/ und auf -dem oͤffentlichen Confeſſu aller proteſtan- -tiſchen Religionen daſelbſt/ mich vor -den Lutheriſchen teutſchen Pfarrer von -Philadelphia/ der ich war/ den erſten/ -und ſo lange ich da war/ ohne einige Con- -currenz mit einigem andern/ und ſonſt -vor nichts/ angegeben und geriret. - - - - - - - - - - - Die - - - - - - - uͤbrige - - - - - - - Stellen - - - - - - - ſind - - - - - - - ſ. - - - - - - 179. - - - - - - wo - - - - - - - Zin- - - - - - Die uͤbrige Stellen ſind ſ. 179. wo Zin- - - - - - - - - - zendorf - - - - - - - gegen - - - - - - - den - - - - - - - teutſchen - - - - - - - Zeitungsſchrei- - - - - - zendorf gegen den teutſchen Zeitungsſchrei- - - - - - - - - - - ber - - - - - - - in - - - - - - - Penſylvanien - - - - - - - sich - - - - - - - reget, - - - - - - - aber - - - - - - - mit - - - - - ber in Penſylvanien sich reget, aber mit - - - - - - - - - - keinem - - - - - - - eintzigen - - - - - - - Grund - - - - - - - vertheidiget. - - - - - - - Jn- - - - - - keinem eintzigen Grund vertheidiget. Jn- - - - - - - - - - - gleichem - - - - - - - ſ. - - - - - - - 109. - - - - - - - f. - - - - - - - wo - - - - - - - er - - - - - - - ſeiner - - - - - - - in - - - - - - - Penſyl- - - - - - gleichem ſ. 109. f. 
wo er ſeiner in Penſyl- - - - - - - - - - - vanien - - - - - - - gehaltenen - - - - - - - ſieben - - - - - - - General-Versam- - - - - - vanien gehaltenen ſieben General-Versam- - - - - - - - - - - lungen - - - - - - - gedencket, - - - - - - - auch - - - - - - - ſogar - - - - - - - etwas - - - - - - - von - - - - - - - den - - - - - lungen gedencket, auch ſogar etwas von den - - - - - - - - - - Schluͤſſen - - - - - - - und - - - - - - - Decreten, - - - - - - - der - - - - - - - verſamleten - - - - - Schluͤſſen und Decreten, der verſamleten - - - - - - - - - - Vaͤter, - - - - - - - nicht - - - - - - - weniger - - - - - - - eine - - - - - - - lateiniſche - - - - - - - Rede - - - - - Vaͤter, nicht weniger eine lateiniſche Rede - - - - - Die uͤbrige Stellen ſind ſ. 179. wo Zin- -zendorf gegen den teutſchen Zeitungsſchrei- -ber in Penſylvanien sich reget, aber mit -keinem eintzigen Grund vertheidiget. Jn- -gleichem ſ. 109. f. wo er ſeiner in Penſyl- -vanien gehaltenen ſieben General-Versam- -lungen gedencket, auch ſogar etwas von den -Schluͤſſen und Decreten, der verſamleten -Vaͤter, nicht weniger eine lateiniſche Rede - - - - - - - - - - - A - - - - - - - 2 - - - - - A 2 - - - - - A 2 - - - - - - - - - - - mitthei- - - - - - mitthei- - - - - - mitthei- - - - - - - - diff --git a/tests/data/profiler b/tests/data/profiler deleted file mode 100755 index 88b2d589..00000000 --- a/tests/data/profiler +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -echo $0 $* > /dev/stderr -cat< $other1 - -other2=$tmpdir/other2.xml -cat $pagexmlfile | sed -e 's/ſ/f/g' > $other2 - -# add page xml files to align -pushd $tmpws -ocrd workspace add \ - -G OCR-D-CIS-TEST-1 \ - -i test01 \ - -m 'application/vnd.prima.page+xml' \ - "$pagexmlfile" -ocrd workspace add \ - -G OCR-D-CIS-TEST-2 \ - -i test02 \ - -m 'application/vnd.prima.page+xml' \ - "$other1" -ocrd workspace add \ - -G OCR-D-CIS-TEST-3 \ - -i test03 \ - -m 'application/vnd.prima.page+xml' \ - "$other2" -popd - -# align the three workspaces 
-ocrd-cis-align --log-level DEBUG \ - -I OCR-D-CIS-TEST-1,OCR-D-CIS-TEST-2,OCR-D-CIS-TEST-3 \ - -O OCR-D-CIS-ALIGN \ - -m $tmpws/mets.xml - -pushd $tmpws -if [[ ! -f $(ocrd workspace find -G OCR-D-CIS-ALIGN) ]]; then - echo "cannot find aligned file group workspace" - exit 1 -fi diff --git a/tests/run_data_test.sh b/tests/run_data_test.bash similarity index 50% rename from tests/run_data_test.sh rename to tests/run_data_test.bash index 8e9c57ad..4b7c6e59 100644 --- a/tests/run_data_test.sh +++ b/tests/run_data_test.bash @@ -1,4 +1,5 @@ #!/bin/bash +set -e if [[ ! -f $(ocrd-cis-data -jar) ]] ; then echo "jar file does not exist"; @@ -9,3 +10,13 @@ if [[ ! -f $(ocrd-cis-data -3gs) ]] ; then echo "three grams file does not exist"; exit 1 fi + +if [[ ! -f $(ocrd-cis-data -config) ]] ; then + echo "config file does not exist"; + exit 1 +fi + +if [[ ! -f $(ocrd-cis-data -model) ]] ; then + echo "model file does not exist"; + exit 1 +fi diff --git a/tests/run_image_preprocessing_test.sh b/tests/run_image_preprocessing_test.bash similarity index 68% rename from tests/run_image_preprocessing_test.sh rename to tests/run_image_preprocessing_test.bash index 741a9e32..4fd028e4 100644 --- a/tests/run_image_preprocessing_test.sh +++ b/tests/run_image_preprocessing_test.bash @@ -1,17 +1,13 @@ #!/bin/bash -source ocrd-cis-lib.sh -source $(dirname $0)/test_lib.sh +set -e +source $(dirname $0)/test_lib.bash -url="http://www.ocr-d.de/sites/all/GTDaten/blumenbach_anatomie_1805.zip" -mkdir -p "$tmpdir/download" -pushd "$tmpws" -ocrd-cis-download-and-add-gt-zip "$url" "$tmpdir/download" -popd +ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G "$OCR_D_CIS_GT_FILEGRP"); do +for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do if [[ ! 
-f "$file" ]]; then echo "cannot find ground truth file: $file" exit 1 @@ -24,18 +20,18 @@ if [[ $found_files != 3 ]]; then fi popd -ocrd-cis-ocropy-clip --log-level DEBUG \ - --input-file-grp "$OCR_D_CIS_GT_FILEGRP" \ - --output-file-grp OCR-D-CIS-IMG-CLIP \ - --mets "$tmpws/mets.xml" - ocrd-cis-ocropy-binarize --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-CLIP \ + --input-file-grp OCR-D-GT-SEG-LINE \ --output-file-grp OCR-D-CIS-IMG-BIN \ --mets "$tmpws/mets.xml" +ocrd-cis-ocropy-clip --log-level DEBUG \ + --input-file-grp OCR-D-CIS-IMG-BIN \ + --output-file-grp OCR-D-CIS-IMG-CLIP \ + --mets "$tmpws/mets.xml" + ocrd-cis-ocropy-denoise --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-BIN \ + --input-file-grp OCR-D-CIS-IMG-CLIP \ --output-file-grp OCR-D-CIS-IMG-DEN \ --mets "$tmpws/mets.xml" @@ -48,3 +44,8 @@ ocrd-cis-ocropy-dewarp --log-level DEBUG \ --input-file-grp OCR-D-CIS-IMG-DES \ --output-file-grp OCR-D-CIS-IMG-DEW \ --mets "$tmpws/mets.xml" + +ocrd-cis-ocropy-segment --log-level DEBUG \ + --input-file-grp OCR-D-CIS-IMG-DEW \ + --output-file-grp OCR-D-CIS-IMG-SEG \ + --mets "$tmpws/mets.xml" diff --git a/tests/run_ocr_test.sh b/tests/run_ocr_test.bash similarity index 55% rename from tests/run_ocr_test.sh rename to tests/run_ocr_test.bash index 4c9346ec..6de88a7b 100644 --- a/tests/run_ocr_test.sh +++ b/tests/run_ocr_test.bash @@ -1,20 +1,12 @@ #!/bin/bash -source ocrd-cis-lib.sh -source $(dirname $0)/test_lib.sh - -url="http://www.ocr-d.de/sites/all/GTDaten/blumenbach_anatomie_1805.zip" -mkdir -p "$tmpdir/download" -pushd "$tmpws" -ocrd-cis-download-and-add-gt-zip "$url" "$tmpdir/download" -popd -pushd "$tmpdir/download" -wget -N "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" -popd +set -e +source $(dirname $0)/test_lib.bash +ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G "$OCR_D_CIS_GT_FILEGRP"); do +for file in $(ocrd workspace find -G 
OCR-D-GT-SEG-LINE); do if [[ ! -f "$file" ]]; then echo "cannot find ground truth file: $file" exit 1 @@ -27,9 +19,12 @@ if [[ $found_files != 3 ]]; then fi popd +# download ocr model +wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" + # run ocr ocrd-cis-ocropy-recognize --log-level DEBUG \ - --input-file-grp "$OCR_D_CIS_GT_FILEGRP" \ + --input-file-grp "OCR-D-GT-SEG-LINE" \ --output-file-grp OCR-D-CIS-OCR \ --mets "$tmpws/mets.xml" \ --parameter <(cat < "$tmpdir/bin/profiler.bash" < /dev/null +echo '{}' +EOF +chmod a+x "$tmpdir/bin/profiler.bash" +ocrd-cis-postcorrect --log-level DEBUG \ + -I OCR-D-CIS-ALIGN \ + -O OCR-D-CIS-POSTCORRECT \ + -m $tmpws/mets.xml \ + --parameter <(cat <e.#Säugethiere.#' $f + sed -i -e 's#E#Säugethieren#' $f +done +popd + +mkdir "$tmpdir/bin" +cat > "$tmpdir/bin/profiler.bash" < /dev/null +echo '{"Säugethiere":{ +"Candidates": [{ +"Suggestion": "Säugethiere", +"Modern": "Säugetiere", +"Dict": "dict_modern_hypothetic_errors", +"HistPatterns": [{"Left":"t","Right":"th","Pos":5}], +"OCRPatterns": [], +"Distance": 0, +"Weight": 1.0 +}]}}' +EOF +chmod a+x "$tmpdir/bin/profiler.bash" + +java -jar $(ocrd-cis-data -jar) -c train \ + --log-level DEBUG \ + -I OCR-D-CIS-ALIGN \ + -m $tmpws/mets.xml \ + --parameter <( +cat $(ocrd-cis-data -config) \ + | sed -e "s#/path/to/profiler#$tmpdir/bin/profiler.bash#" \ + | sed -e "s#/path/to/trigrams.csv#$(ocrd-cis-data -3gs)#" \ + | sed -e "s#/path/to/train.dir#$tmpdir/train#" +) + +if [[ ! 
-f $tmpdir/train/model.zip ]]; then + echo $tmpdir/train/model.zip not found + exit 1 +fi diff --git a/tests/run_validation.bash b/tests/run_validation.bash new file mode 100644 index 00000000..0dc98777 --- /dev/null +++ b/tests/run_validation.bash @@ -0,0 +1,6 @@ +#!/bin/bash +set -e + +if ocrd ocrd-tool ocrd-tool.json validate | grep ''; then + exit 1 +fi diff --git a/tests/run_wer_test.sh b/tests/run_wer_test.sh deleted file mode 100644 index d85a4809..00000000 --- a/tests/run_wer_test.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -source $(dirname $0)/test_lib.sh - -pushd "$tmpws" -ocrd workspace add \ - --file-grp "WERFILEGRP" \ - --mimetype "application/vnd.prima.page+xml" \ - --file-id "WERFILEID" \ - "$werfile" -popd - -ocrd-cis-wer \ - --mets "$tmpws/mets.xml" \ - --output-file-grp "WER" \ - --input-file-grp "WERFILEGRP" - - -# tests -pushd "$tmpws" -if [[ ! -f $(ocrd workspace find -G "WER") ]]; then - echo "missing wer file" - exit 1 -fi -if [[ $(jq '.totalWords' $(ocrd workspace find -G "WER")) != 3 ]]; then - echo "invalid number of words" - exit 1 -fi -if [[ $(jq '.incorrectWords' $(ocrd workspace find -G "WER")) != 2 ]]; then - echo "invalid number of bad words" - exit 1 -fi -if [[ $(jq '.correctWords' $(ocrd workspace find -G "WER")) != 1 ]]; then - echo "invalid number of good words" - exit 1 -fi -popd diff --git a/tests/test_lib.bash b/tests/test_lib.bash new file mode 100644 index 00000000..5d38f482 --- /dev/null +++ b/tests/test_lib.bash @@ -0,0 +1,51 @@ +#/bin/bash + +tmpdir=$(mktemp -d) +trap "rm -rf $tmpdir" EXIT + +OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" +data_url="https://ocr-d-repo.scc.kit.edu/api/v1/dataresources/75ad9f94-dbaa-43e0-ab06-2ce24c497c61/data" +function ocrd_cis_download_bagit() { + local url="$data_url/$1" + mkdir -p "$tmpdir/download" + wget -P "$tmpdir/download" "$url" +} + +function ocrd_cis_init_ws() { + ocrd_cis_download_bagit "$1" + ocrd zip spill -d "$tmpdir" "$tmpdir/download/$1" + tmpws="$tmpdir/${1%.ocrd.zip}" +} + 
+function ocrd_cis_align() { + # download ocr models + wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" + wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur2-00062000.pyrnn.gz" + # run ocr + ocrd-cis-ocropy-recognize --log-level DEBUG \ + --input-file-grp "OCR-D-GT-SEG-LINE" \ + --output-file-grp OCR-D-CIS-OCR-1 \ + --mets "$tmpws/mets.xml" \ + --parameter <(cat <