diff --git a/Dockerfile b/Dockerfile
index 3618565d..f58112b8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,13 +1,11 @@
-FROM ocrd/core:latest
-ENV VERSION="Mi 9. Okt 13:26:16 CEST 2019"
+FROM ocrd/core:latest AS base
+ENV VERSION="Di 12. Mai 13:26:35 CEST 2020"
ENV GITURL="https://github.com/cisocrgroup"
ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf"
-ENV DATA="/apps/ocrd-cis-post-correction"
# deps
-COPY data/docker/deps.txt ${DATA}/deps.txt
RUN apt-get update \
- && apt-get -y install --no-install-recommends $(cat ${DATA}/deps.txt)
+ && apt-get -y install --no-install-recommends locales \
+ # drop the apt package index in the same layer; later stages run apt-get update themselves
+ && rm -rf /var/lib/apt/lists/*
# locales
RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \
@@ -15,40 +13,46 @@ RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \
&& update-locale LANG=en_US.UTF-8
# install the profiler
-RUN git clone ${GITURL}/Profiler --branch devel --single-branch /tmp/profiler \
- && cd /tmp/profiler \
- && mkdir build \
- && cd build \
- && cmake -DCMAKE_BUILD_TYPE=release .. \
- && make compileFBDic trainFrequencyList profiler \
- && cp bin/compileFBDic bin/trainFrequencyList bin/profiler /apps/ \
+FROM base AS profiler
+RUN apt-get update \
+ && apt-get -y install --no-install-recommends cmake g++ libcppunit-dev libxerces-c-dev \
+ && git clone ${GITURL}/Profiler --branch devel --single-branch /build \
+ && cd /build \
+ && cmake -DCMAKE_BUILD_TYPE=release . \
+ && make compileFBDic trainFrequencyList runDictSearch profiler \
+ # -p keeps the build working if the base image already provides /apps
+ && mkdir -p /apps \
+ && cp bin/compileFBDic bin/trainFrequencyList bin/profiler bin/runDictSearch /apps/ \
&& cd / \
- && rm -rf /tmp/profiler
+ && rm -rf /build
+FROM profiler AS languagemodel
# install the profiler's language backend
-RUN git clone ${GITURL}/Resources --branch master --single-branch /tmp/resources \
- && cd /tmp/resources/lexica \
- && make FBDIC=/apps/compileFBDic TRAIN=/apps/trainFrequencyList \
- && mkdir -p /${DATA}/languages \
- && cp -r german latin greek german.ini latin.ini greek.ini /${DATA}/languages \
+COPY --from=profiler /apps/compileFBDic /apps/
+COPY --from=profiler /apps/trainFrequencyList /apps/
+COPY --from=profiler /apps/runDictSearch /apps/
+RUN apt-get update \
+ && apt-get -y install --no-install-recommends icu-devtools \
+ && git clone ${GITURL}/Resources --branch master --single-branch /build \
+ && cd /build/lexica \
+ && PATH=$PATH:/apps make \
+ && PATH=$PATH:/apps make test \
+ && PATH=$PATH:/apps make install \
&& cd / \
- && rm -rf /tmp/resources
+ && rm -rf /build
+FROM base AS postcorrection
# install ocrd_cis (python)
-COPY Manifest.in Makefile setup.py ocrd-tool.json /tmp/build/
-COPY ocrd_cis/ /tmp/build/ocrd_cis/
-COPY bashlib/ /tmp/build/bashlib/
-# COPY . /tmp/ocrd_cis
-RUN cd /tmp/build \
+VOLUME ["/data"]
+COPY --from=languagemodel /etc/profiler/languages /etc/profiler/languages
+COPY --from=profiler /apps/profiler /apps/
+COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linux-gnu/
+COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/
+COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/
+COPY . /build
+RUN apt-get update \
+ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \
+ && cd /build \
&& make install \
+ && make test \
&& cd / \
- && rm -rf /tmp/build
-# download ocr models and pre-trainded post-correction model
-RUN mkdir /apps/models \
- && cd /apps/models \
- && wget ${DOWNLOAD_URL}/model.zip >/dev/null 2>&1 \
- && wget ${DOWNLOAD_URL}/fraktur1-00085000.pyrnn.gz >/dev/null 2>&1 \
- && wget ${DOWNLOAD_URL}/fraktur2-00062000.pyrnn.gz >/dev/null 2>&1
-VOLUME ["/data"]
+ # remove the build tree and the apt package index in the layer that created them
+ && rm -rf /build /var/lib/apt/lists/*
diff --git a/Makefile b/Makefile
index 33de67f2..730ba3f4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,21 +1,24 @@
PY ?= python3
PIP ?= pip3
+V ?= > /dev/null 2>&1
+PKG = ocrd_cis
${PIP} install --upgrade pip .
${PIP} install --upgrade pip -e .
+ ${PIP} uninstall ${PKG}
docker-build: Dockerfile
docker build -t flobar/ocrd_cis:latest .
docker-push: docker-build
docker push flobar/ocrd_cis:latest
-TEST_SCRIPTS=$(wildcard tests/run_*.sh)
+TEST_SCRIPTS=$(sort $(wildcard tests/run_*.bash))
- bash $@
-# run test scripts
+ bash $@ $V
+ echo $^
.PHONY: install test
diff --git a/README.md b/README.md
index 0568ff07..a2d7fb32 100644
--- a/README.md
+++ b/README.md
@@ -32,23 +32,39 @@ It is possible to install `ocrd_cis` in a custom directory using
+## Profiler
+The post correction is dependent on the language
+[profiler](https://github.com/cisocrgroup/Profiler) and its language
+configurations to generate corrections for suspicious words. In order
+to use the post correction a profiler with according language
+configurations has to be present on the system. You can refer to our
+and our [lexical
+for more information.
+If you use docker you can use the preinstalled profiler from within
+the docker-container. The profiler is installed to `/apps/profiler`
+and the language configurations lie in `/etc/profiler/languages` in
+the container image.
## Usage
Most tools follow the [OCR-D cli
conventions](https://ocr-d.github.io/cli). They accept the
`--input-file-grp`, `--output-file-grp`, `--parameter`, `--mets`,
-`--log-level` command line arguments (short and long). For some tools
-(most notably the alignment tool) expect a comma seperated list of
-multiple input file groups.
+`--log-level` command line arguments (short and long). Some of the
+tools (most notably the alignment tool) expect a comma separated list
+of multiple input file groups.
The [ocrd-tool.json](ocrd_cis/ocrd-tool.json) contains a schema
description of the parameter config file for the different tools that
accept the `--parameter` argument.
-### ocrd-cis-post-correct.sh
-This bash script runs the post correction using a pre-trained
-[model](http://cis.lmu.de/~finkf/model.zip). If additional support
-OCRs should be used, models for these OCR steps are required and must
-be configured in an according configuration file (see ocrd-tool.json).
+### ocrd-cis-postcorrect
+This command runs the post correction using a pre-trained model. If
+additional support OCRs should be used, models for these OCR steps are
+required and must be executed and aligned beforehand (see [the test
+script](tests/run_postcorrection_test.bash) for an example).
* `--parameter` path to configuration file
@@ -57,6 +73,20 @@ Arguments:
* `--log-level` set log level
* `--mets` path to METS file in workspace
+As mentioned above, in order to use the postcorrection with input from
+multiple OCRs, some preprocessing steps are needed: firstly the
+additional OCR recognition has to be done and secondly the multiple
+OCRs have to be aligned (you can also take a look at the function
+`ocrd_cis_align` in the [tests](tests/test_lib.bash)). Assuming an
+original recognition as file group `OCR1` on the segmented document of
+file group `SEG`, the following commands can be used:
+ocrd-ocropus-recognize -I SEG -O OCR2 ... # additional OCR
+ocrd-cis-align -I OCR1,OCR2 -O ALGN ... # align OCR1 and OCR2
+ocrd-cis-postcorrect -I ALGN -O PC ... # post correction
### ocrd-cis-align
Aligns tokens of multiple input file groups to one output file group.
This tool is used to align the master OCR with any additional support
@@ -66,41 +96,26 @@ it aligns in order.
* `--parameter` path to configuration file
* `--input-file-grp` comma seperated list of the input file groups;
- first input file group is the master OCR
+ first input file group is the master OCR; if there is a ground
+ truth (for evaluation) it must be the last file group in the list
* `--output-file-grp` name of the file group for the aligned result
* `--log-level` set log level
* `--mets` path to METS file in workspace
-### ocrd-cis-train.sh
-Script to train a model from a list of ground-truth archives (see
-ocrd-tool.json) for the post correction. The tool somewhat mimics the
-behaviour of other ocrd tools:
- * `--mets` for the workspace
- * `--log-level` is passed to other tools
- * `--parameter` is used as configuration
- * `--output-file-grp` defines the output file group for the model
### ocrd-cis-data
Helper tool to get the path of the installed data files. Usage:
-`ocrd-cis-data [-jar|-3gs]` to get the path of the jar library or the
-path to th default 3-grams language model file.
-### ocrd-cis-wer
-Helper tool to calculate the word error rate aligned ocr files. It
-writes a simple JSON-formated stats file to the given output file group.
- * `--input-file-grp` input file group of aligned ocr results with
- their respective ground truth.
- * `--output-file-grp` name of the file group for the stats file
- * `--log-level` set log level
- * `--mets` path to METS file in workspace
-### ocrd-cis-profile
-Run the profiler over the given files of the according the given input
-file grp and adds a gzipped JSON-formatted profile to the output file
-group of the workspace. This tools requires an installed [language
+`ocrd-cis-data [-h|-jar|-3gs|-model|-config]` to get the path of the
+jar library, the pre-trained post correction model, the path to the
+default 3-grams language model file or the default training
+configuration file. This tool does not follow the OCR-D conventions.
+### Training
+There is no dedicated training script provided. Models are trained
+using the java implementation directly (check out the [training test
+script](tests/run_training_test.bash) for an example). Training a
+model requires a workspace containing one or more file groups
+consisting of aligned OCR and ground-truth documents (the last file
+group has to be the ground truth).
* `--parameter` path to configuration file
@@ -114,11 +129,14 @@ Arguments:
The `ocropy-train` tool can be used to train LSTM models.
It takes ground truth from the workspace and saves (image+text) snippets from the corresponding pages.
Then a model is trained on all snippets for 1 million (or the given number of) randomized iterations from the parameter file.
-ocrd-cis-ocropy-train \
- --input-file-grp OCR-D-GT-SEG-LINE \
- --mets mets.xml
- --parameter file:///path/to/config.json
+java -jar $(ocrd-cis-data -jar) \
+ -c train \
+ --input-file-grp OCR1,OCR2,GT \
+ --log-level DEBUG \
+ -m mets.xml \
+ --parameter $(ocrd-cis-data -config)
### ocrd-cis-ocropy-clip
@@ -228,9 +246,8 @@ pip install .
Download and move tesseract models from:
-or use your own models and
-place them into: /usr/share/tesseract-ocr/4.00/tessdata
+https://github.com/tesseract-ocr/tesseract/wiki/Data-Files or use your
+own models and place them into: /usr/share/tesseract-ocr/4.00/tessdata
## Workflow configuration
@@ -256,6 +273,7 @@ If GT is used, steps 1, 5 and 8 can be omitted. Else if a segmentation is used i
To run a few basic tests type `make test` (`ocrd_cis` has to be
installed in order to run any tests).
+# Miscellaneous
## OCR-D workspace
* Create a new (empty) workspace: `ocrd workspace init workspace-dir`
diff --git a/bashlib/ocrd-cis-eval-all.sh b/bashlib/ocrd-cis-eval-all.sh
deleted file mode 100755
index 3f0709da..00000000
--- a/bashlib/ocrd-cis-eval-all.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-set -e
-bdir=$(dirname "$0")
-source "$bdir/ocrd-cis-lib.sh"
-get_eval_dir() {
- case $1 in
- bodenstein) echo "eval/1557-Bodenstein-WieSichMeniglich";;
- grenzboten) echo "eval/1841-DieGrenzboten";;
- *) exit 1;;
- esac
-odir=$(date +%Y%m%d_%H_%M)
-for how in shuffle ocrd; do
- for corpus in grenzboten bodenstein; do
- dir="eval-$corpus-$how"
- cmd="$bdir/ocrd-cis-eval-$how.sh"
- # rm -rf "$dir"
- ocrd-cis-log ./"$cmd" -P "config/config-$how-$corpus.json" $(get_eval_dir $corpus) "$dir"
- ./"$cmd" -P "config/config-$how-$corpus.json" $(get_eval_dir $corpus) "$dir"
- mkdir -p "$odir"
- cp -r "$dir" "$odir"
- done
diff --git a/bashlib/ocrd-cis-eval-ocrd-self.sh b/bashlib/ocrd-cis-eval-ocrd-self.sh
deleted file mode 100755
index 0067d0bc..00000000
--- a/bashlib/ocrd-cis-eval-ocrd-self.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-set -e
-bdir=$(dirname "$0")
-source "$bdir/ocrd-cis-lib.sh"
-config=$(ocrd-cis-getopt -P --parameter $*)
-url=$(cat "$config" | jq --raw-output .gtlink)
-ocrd-cis-download-and-extract-ground-truth "$url" downloads
-# shuffle files into eval and train folders #
-mkdir -p "$odirtrain" "$odireval"
-for dir in downloads/*; do
- if [[ ! -d "$dir" ]]; then continue; fi
- i=1
- for xml in $(find "$dir" -type f -name '*.xml' | grep -v 'alto' | shuf); do
- echo "xml: $xml"
- img=$(ocrd-cis-find-image-for-xml "$dir" "$xml")
- echo "xml: $xml img: $img"
- x=$((i%2))
- name=$(basename "$dir")
- if [[ $x == 0 ]]; then
- mkdir -p "$odireval/$name"
- cp "$xml" "$img" "$odireval/$name"
- else
- mkdir -p "$odirtrain/$name"
- cp "$xml" "$img" "$odirtrain/$name"
- fi
- i=$((i+1))
- done
-# train #
-mkdir -p "$odir/trainws"
-pushd "$odir/trainws"
-ocrd workspace init .
-for dir in "$odirtrain/"*; do
- name=$(basename "$dir")
- ocrd-cis-run-ocr-and-align "$config" "$odir/trainws/mets.xml" "$dir" "train-ocrd-self-$name" GT
-ocrd-cis-run-training "$config" "$odir/trainws/mets.xml"
-# eval #
-mkdir -p "$odir/evalws"
-pushd "$odir/evalws"
-ocrd workspace init .
-for dir in "$odireval/"*; do
- name=$(basename "$dir")
- ocrd-cis-run-ocr-and-align "$config" "$odir/evalws/mets.xml" "$dir" "eval-ocrd-self-$name" GT
-ocrd-cis-run-evaluation "$config" "$odir/evalws/mets.xml"
diff --git a/bashlib/ocrd-cis-eval-ocrd.sh b/bashlib/ocrd-cis-eval-ocrd.sh
deleted file mode 100755
index 943cebf5..00000000
--- a/bashlib/ocrd-cis-eval-ocrd.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-set -e
-bdir=$(dirname "$0")
-source "$bdir/ocrd-cis-lib.sh"
-if [[ $# != 4 ]]; then
- echo "usage: $0 -P|--parameter config input-dir output-dir-basename"
- exit 2
-config=$(ocrd-cis-getopt -P --parameter $*)
-# train post correction from ocrd corpus #
-if [[ ! -d "$odir/trainws" ]]; then
- mkdir -p "$odir/trainws"
- pushd "$odir/trainws"
- ocrd workspace init .
- popd
-gtlink=$(cat "$config" | jq --raw-output '.gtlink')
-ocrd-cis-download-and-extract-ground-truth "$gtlink" downloads
-for dir in downloads/*; do
- if [[ ! -d "$dir" ]]; then continue; fi
- name=$(basename "$dir")
- ocrd-cis-run-ocr-and-align "$config" "$odir/trainws/mets.xml" "$dir" "train-ocrd-corpus-$name" GT
-ocrd-cis-run-training "$config" "$odir/trainws/mets.xml"
-# evaluate #
-mkdir -p "$odir/evalws"
-pushd "$odir/evalws"
-ocrd workspace init .
-ocrd-cis-run-ocr-and-align "$config" "$odir/evalws/mets.xml" "$idir" eval-ocrd-corpus GT
-ocrd-cis-run-evaluation "$config" "$odir/evalws/mets.xml"
diff --git a/bashlib/ocrd-cis-eval-shuffle.sh b/bashlib/ocrd-cis-eval-shuffle.sh
deleted file mode 100755
index f55b9821..00000000
--- a/bashlib/ocrd-cis-eval-shuffle.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-set -e
-bdir=$(dirname "$0")
-source "$bdir/ocrd-cis-lib.sh"
-if [[ $# != 4 ]]; then
- echo "usage: $0 -P|--parameter config input-dir output-dir-basename"
- exit 2
-config=$(ocrd-cis-getopt -P --parameter $*)
-# shuffle files into eval and train folders #
-if [[ ! -d "$odirtrain" ]]; then
- mkdir -p "$odirtrain" "$odireval"
- i=1
- for xml in $(find "$idir" -type f -name '*.xml' | sort); do
- img=$(ocrd-cis-find-image-for-xml "$(dirname $xml)" "$xml")
- x=$((i%2))
- if [[ $x == 0 ]]; then
- cp "$xml" "$img" "$odirtrain"
- else
- cp "$xml" "$img" "$odireval"
- fi
- i=$((i+1))
- done
-# train #
-if [[ ! -d "$odir/trainws" ]]; then
- mkdir -p "$odir/trainws"
- pushd "$odir/trainws"
- ocrd workspace init .
- popd
-ocrd-cis-run-ocr-and-align "$config" "$odir/trainws/mets.xml" "$odirtrain" train-shuffle GT
-ocrd-cis-run-training "$config" "$odir/trainws/mets.xml"
-# eval #
-if [[ ! -d "$odir/evalws" ]]; then
- mkdir -p "$odir/evalws"
- pushd "$odir/evalws"
- ocrd workspace init .
- popd
-ocrd-cis-run-ocr-and-align "$config" "$odir/evalws/mets.xml" "$odireval" eval-shuffle GT
-ocrd-cis-run-evaluation "$config" "$odir/evalws/mets.xml"
diff --git a/bashlib/ocrd-cis-evaluate.sh b/bashlib/ocrd-cis-evaluate.sh
deleted file mode 100755
index dc09772a..00000000
--- a/bashlib/ocrd-cis-evaluate.sh
+++ /dev/null
@@ -1,93 +0,0 @@
-set -e
-source "$(dirname $0)/ocrd-cis-lib.sh"
-config=$(ocrd-cis-getopt -P --parameter $*)
-ifg=$(ocrd-cis-getopt -I --input-file-grp $*)
-mets=$(ocrd-cis-getopt -M --mets $*)
-workspace=$(/usr/bin/dirname "$mets")
-jar=$(cat "$config" | jq --raw-output '.jar')
-echo ifg $ifg
-echo mets $mets
-echo workspace $workspace
-echo jar $jar
-echo config $config
-# download newest jar #
-ocrd-cis-download-jar "$jar"
-# create workspace (really?) #
-if [[ ! -d "$workspace" ]]; then
- mkdir -p "$workspace"
- pushd "$workspace"
- ocrd workspace init .
- popd
-# search for intput file group directory #
-inputdir=$(find . -type d -name "$ifg")
-if [[ -z $inputdir ]]; then
- echo "cannot find input directory for $ifg"
- exit 1
-# add gt and image files to worksapce #
-max=-1 # set to -1 for all
-for pxml in $(find "$inputdir" -type f -name '*.xml'); do
- if [[ max -eq 0 ]]; then
- break;
- fi
- max=$((max-1))
- img=$(ocrd-cis-find-image-for-xml "$inputdir" $(basename "$pxml"))
- if [[ -z $img ]]; then
- echo cannot find image file for $img
- exit 1
- fi
- if [[ -f $img ]]; then
- ocrd-cis-add-pagexml-and-image-to-workspace \
- "$workspace" "OCR-D-GT-EVAL-$ifg" "$pxml" "OCR-D-IMG-EVAL-$ifg" "$img"
- fi
-ocrd-cis-run-ocr "$config" "$mets" "OCR-D-GT-EVAL-$ifg" "OCR-D-OCR-EVAL-XXX-$ifg"
-OCRFILEGRPS=$(ocrd-cis-join-by , $OCRFILEGRPS)
-ocrd-cis-log ocrd-cis-align \
- --input-file-grp "$OCRFILEGRPS" \
- --output-file-grp "OCR-D-ALIGN-EVAL-$ifg" \
- --mets "$mets"\
- --parameter $(cat "$config" | jq --raw-output ".alignparampath") \
- --log-level $LOG_LEVEL
-ocrd-cis-align \
- --input-file-grp "$OCRFILEGRPS" \
- --output-file-grp "OCR-D-ALIGN-EVAL-$ifg" \
- --mets "$mets" \
- --parameter $(cat "$config" | jq --raw-output ".alignparampath") \
- --log-level $LOG_LEVEL
-param=$(cat "$config" | jq --raw-output '.trainparampath')
-for cmd in evaluate-dle evaluate-rrdm; do
- java -Dfile.encoding=UTF-8 -Xmx3g -cp "$jar" "$main" -c $cmd \
- --mets "$mets" \
- --parameter "$param" \
- --input-file-grp "OCR-D-ALIGN-EVAL-$ifg" \
- --log-level $LOG_LEVEL
diff --git a/bashlib/ocrd-cis-lib.sh b/bashlib/ocrd-cis-lib.sh
deleted file mode 100644
index c2bff00c..00000000
--- a/bashlib/ocrd-cis-lib.sh
+++ /dev/null
@@ -1,469 +0,0 @@
-set -e
-# global default log-level
-ocrd-cis-log() {
- echo $(date +%R:%S.%N | sed -e 's/.*\([0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9][0-9]\).*/\1/') $LOG_LEVEL - $* >&2
-# Write a OCR-D debug log message to stderr.
-ocrd-cis-debug() {
- case $LOG_LEVEL in
- DEBUG) echo $(date +%R:%S.%N | sed -e 's/.*\([0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9][0-9]\).*/\1/') DEBUG ocrd.cis.bashlib - $* >&2;;
- esac
-# Write a OCR-D info log message to stderr.
-ocrd-cis-info() {
- case $LOG_LEVEL in
- DEBUG|INFO) echo $(date +%R:%S.%N | sed -e 's/.*\([0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9][0-9]\).*/\1/') INFO ocrd.cis.bashlib - $* >&2;;
- esac
-# Print error message to stderr and exit.
-# Usage `ocrd-cis-fail "error message" [EXIT]`
-function ocrd-cis-fail {
- printf '%s\n' "$1" >&2
- exit "${2-1}"
-# utility function to join strings with a given string
-function ocrd-cis-join-by { local IFS="$1"; shift; echo "$*"; }
-# Parse command line arguments for a given argument and
-# SETS_CIS_OPTARG to the additional provided value. Usage:
-# `ocrd-cis-getopt -P --parameter $*`.
-ocrd-cis-getopt() {
- local short=$1
- shift
- local long=$1
- shift
- while [[ $# -gt 0 ]]; do
- case $1 in
- $short|$long) OCRD_CIS_OPTARG=$2; return 0;;
- *) shift;;
- esac
- done
- return 1;
-# Download the ocrd.jar if it does not exist.
-ocrd-cis-download-jar() {
- if [[ -f "$1" ]]; then
- return 0
- fi
- local jar=http://www.cis.lmu.de/~finkf/ocrd.jar
- local dir=$(/usr/bin/dirname $1)
- pushd $dir
- wget -N $jar || true
- popd
-# Get the file for a file path. Sets OCRD_CIS_FILE_ID to
-# the file id. Usage: ocrd-cis-file-id path/to/file.xml
-ocrd-cis-get-file-id() {
- local path=$1
- local filename=$(basename "$path")
- local ext="${filename##*.}"
- local fileid="${filename%.*}"
- OCRD_CIS_FILE_ID="${fileid}_${ext}"
- echo $path $OCRD_CIS_FILE_ID
-# Add a zipped OCR-D ground truth zip to the workspace. The current
-# directory must be a valid workspace with an according mets file.
-# Sets OCR_D_CIS_GT_FILEGRP and OCR_D_CIS_IMG_FILEGRP to the according
-# filegroups. Exits if the image file for a page-XML file in the zip
-# archive cannot be found.
-# Usage: `ocrd-cis-add-gt-zip ZIP TMP_DIR
-# * ZIP: path to the gt-zip file
-# * TMP_DIR: existing temporary directory for extracted files
-ocrd-cis-add-gt-zip() {
- local zip=$1
- local tmp=$2
- ocrd-cis-log ocrd-cis-add-gt-zip $zip $tmp
- unzip -d "$tmp" "$zip"
- local base=$(echo $(basename $zip) | tr '_ \t' '-')
- base=${base/.zip/}
- local gtfg="OCR-D-GT-$base"
- local imgfg="OCR-D-IMG-$base"
- for xml in $(find "$tmp" -type f -name '*.xml' | grep -i 'page'); do
- local imgname=$(sed -ne 's/.*imageFilename="\([^"]*\)".*/\1/p' "$xml")
- local img=$(find "$tmp" -type f -name "$imgname")
- if [[ ! -f "$img" ]]; then
- echo "cannot find image: $imgname"
- exit 1
- fi
- # add image to workspace
- local imgmimetype=$(ocrd-cis-get-mimetype-by-extension "$img")
- ocrd-cis-get-file-id "$img"
- ocrd workspace add \
- --file-grp "$imgfg" \
- --mimetype "$imgmimetype" \
- --file-id "$OCRD_CIS_FILE_ID" \
- "$img"
- # get img path in workspace and set imageFilename in page xml accordingly.
- img=$(ocrd workspace find -i "$OCRD_CIS_FILE_ID")
- sed -i -e "s#imageFilename=\"[^\"]*\"#imageFilename=\"$img\"#" "$xml"
- # add page xml file to workspace
- ocrd-cis-get-file-id "$xml"
- ocrd workspace add \
- --file-grp "$gtfg" \
- --mimetype "application/vnd.prima.page+xml" \
- --file-id "$OCRD_CIS_FILE_ID" \
- "$xml"
- done
- # set global filegroup variables
-# Add a zipped OCR-D ground truth zip to the workspace. The current
-# directory must be a valid workspace with an according mets file.
-# Usage: `ocrd-cis-add-gt-zip URL TMP_DIR
-# * URL: URL to the gt-zip file
-# * TMP_DIR: existing temporary directory for downloaded (and
-# extracted) files
-ocrd-cis-download-and-add-gt-zip() {
- local url=$1
- local tmp=$2
- ocrd-cis-log ocrd-cis-download-and-add-gt-zip $url $tmp
- wget -P "$tmp" $url
- local zip=$(find $tmp -type f -name '*.zip')
- echo $zip
- ocrd-cis-add-gt-zip "$zip" "$tmp"
-# Get the mimetype of a given path. The mimetype is determined using
-# the file's extension.
-ocrd-cis-get-mimetype-by-extension() {
- case $(echo $1 | tr '[:upper:]' '[:lower:]') in
- *.tif | *.tiff) echo "image/tif";;
- *.jpg | *.jpeg) echo "image/jpeg";;
- *.png) echo "image/png";;
- *.xml) echo "application/vnd.prima.page+xml";;
- *) echo "UNKWNON"
- esac
-# Check if given file-id exists in the given mets file. Usage:
-# `ocrd-cis-file-id-exists mets fileid`.
-# * mets: path to the mets file
-# * fileid: the file id
-ocrd-cis-file-id-exists() {
- local workspace=$(dirname "$1")
- local fileid=$2
- pushd "$workspace"
- local check=$(ocrd workspace find --file-id "$fileid")
- popd
- if [[ -z "$check" ]]; then return 1; fi
- return 0
-# Check if given file-grp exists in the given mets file. Usage:
-# `ocrd-cis-file-grp-exists mets filegrp`.
-# * mets: path to the mets file
-# * filegrp: the file grp
-ocrd-cis-file-grp-exists() {
- local workspace=$(dirname "$1")
- local filegrp=$2
- pushd "$workspace"
- local check=$(ocrd workspace find --file-grp "$filegrp")
- popd
- if [[ -z "$check" ]]; then return 1; fi
- return 0
-# Run multiple OCRs over a file group. Usage: `ocrd-cis-run-ocr
-# configfile mets ifg ofg`. A XXX in the ofg is replaced with the
-# ocr-type and number. This function sets the global variable
-# $OCRFILEGRPS to a space-separated list of the ocr output file
-# groups.
-ocrd-cis-run-ocr() {
- local config=$1
- local mets=$2
- local ifg=$3
- local ofg=$4
- for i in $(seq 0 $(cat "$config" | jq ".ocr | length-1")); do
- local type=$(cat "$config" | jq --raw-output ".ocr[$i].type")
- local path=$(cat "$config" | jq --raw-output ".ocr[$i].path")
- local utype=$(echo $type | tr '[:lower:]' '[:upper:]')
- local xofg=${ofg/XXX/$utype-$((i+1))}
- if ocrd-cis-file-grp-exists "$mets" "$xofg"; then
- ocrd-cis-log skipping ocr for $xofg
- continue
- # else
- # ocrd-cis-log $xofg does not exist.
- # exit 1
- fi
- case $utype in
- ocrd-cis-log ocrd-cis-ocropy-recognize \
- --input-file-grp $ifg \
- --output-file-grp $xofg \
- --mets "$mets" \
- --parameter $path \
- --log-level $LOG_LEVEL
- ocrd-cis-ocropy-recognize \
- --input-file-grp $ifg \
- --output-file-grp $xofg \
- --mets "$mets" \
- --parameter $path \
- --log-level $LOG_LEVEL
- ;;
- ocrd-cis-log ocrd-tesserocr-recognize \
- --input-file-grp $ifg \
- --output-file-grp $xofg \
- --mets "$mets" \
- --parameter $path \
- --log-level $LOG_LEVEL
- ocrd-tesserocr-recognize \
- --input-file-grp $ifg \
- --output-file-grp $xofg \
- --mets "$mets" \
- --parameter $path \
- --log-level $LOG_LEVEL
- ;;
- *)
- echo "invalid ocr type: $utype"
- exit 1
- ;;
- esac
- done
-# Search for the associated image file for the given xml file in the
-# given directory. The given xml file must end with .xml. Usage:
-# `ocrd-cis-find-image-for-xml dir xy.xml`
-ocrd-cis-find-image-for-xml() {
- local dir=$1
- local xml=$2
- for pre in .bin .dew ""; do # prefer binary before descewed before normal images
- for ext in .jpg .jpeg .JPG .JPEG .png .tiff .tif; do
- # strict search based on the xml file's name
- # try also using the xml file's number, e.g xyz_123.xml -> 123.bin.png
- local name=$(basename "$xml")
- local name=${name/.xml/$pre$ext}
- local numname=$(echo $name | sed -e 's/.*[-_]\([0-9]*\.\)/\1/')
- local file=$(find "$dir" -type f -name "$name" -o -type f -name "$numname")
- # echo find "$dir" -type f -name "$name" -o -type f -name "$numname"
- # echo file $file
- if [[ ! -z "$file" ]]; then
- ocrd-cis-log found image: $file for xml: $xml
- echo $file
- return 0
- fi
- done
- done
- return 1
-# Add a pagexml and image file pair to a workspace. The according
-# imageFilename attribute of the page xml file is set accordingly.
-# The basename of the given files are used as file ids. Usage:
-# `ocrd-cis-add-xml-image-pair mets xml xmlfg img imgfg`.
-# * mets: path to the workspace's mets file
-# * xml: path to the page xml file
-# * xmlfg: file group of the xml file
-# * img: path to the imaage file
-# * imgfg: file group of the image file
-ocrd-cis-add-xml-image-pair() {
- local mets=$1
- local xml=$2
- local xmlfg=$3
- local img=$4
- local imgfg=$5
- local imgmt=$(ocrd-cis-get-mimetype-by-extension "$img")
- local xmlmt=$(ocrd-cis-get-mimetype-by-extension "$xml")
- local workspace=$(dirname "$mets")
- local absxml=$(realpath "$xml")
- local absimg=$(realpath "$img")
- if ocrd-cis-file-id-exists "$mets" "$(basename "$img")"; then
- ocrd-cis-log skipping add to workspace for $img and $xml
- return
- fi
- pushd $workspace
- # add image file
- ocrd workspace add \
- --file-grp "$imgfg" \
- --mimetype "$imgmt" \
- --file-id "$(basename "$img")" \
- --force "$absimg"
- # add xml file
- ocrd workspace add \
- --file-grp "$xmlfg" \
- --mimetype "$xmlmt" \
- --file-id "$(basename "$xml")" \
- --force "$absxml"
- # fix filepath
- local relxml="$xmlfg/$(basename $xml)"
- local relimg="$imgfg/$(basename $img)"
- echo sed -i "s#imageFilename=\"\([^\"]*\)\"#imageFilename=\"$relimg\"#" "$relxml"
- sed -i "s#imageFilename=\"\([^\"]*\)\"#imageFilename=\"$relimg\"#" "$relxml"
- popd
-# Given a directory add image and base xml files, run additional ocrs
-# and align them. Sets ALGINFILEGRP to the alignment file group.
-# Usage: `ocrd-cis-run-ocr-and-align config mets dir fg gt`.
-# * config : path to the main config file
-# * mets : path to the mets file
-# * dir : path to the directory
-# * fg : base name of filegroups
-# * gt : gt=GT if xml files are ground truth; anything else if not
-ocrd-cis-run-ocr-and-align() {
- local config=$1
- local mets=$2
- local dir=$3
- local fg=$4
- local gt=$5
- local workspace=$(dirname "$mets")
- for xml in $(find "$dir" -type f -name '*.xml'); do
- if [[ "$xml" == *"alto"* ]]; then # skip alto xml files in gt archives
- continue
- fi
- local img=$(ocrd-cis-find-image-for-xml "$dir" "$xml")
- ocrd-cis-add-xml-image-pair "$mets" "$xml" "OCR-D-$gt-$fg" "$img" "OCR-D-IMG-$fg"
- done
- ocrd-cis-run-ocr "$config" "$mets" "OCR-D-$gt-$fg" "OCR-D-XXX-$fg"
- if [[ $(echo "$gt" | tr '[[:upper:]]' '[[:lower:]]') == "gt" ]]; then
- else
- fi
- OCRFILEGRPS=$(ocrd-cis-join-by , $OCRFILEGRPS)
- if ocrd-cis-file-grp-exists "$mets" "$ALIGNFILEGRP"; then
- ocrd-cis-log skipping aligning of $ALIGNFILEGRP
- return
- fi
- ocrd-cis-log ocrd-cis-align \
- --input-file-grp "$OCRFILEGRPS" \
- --output-file-grp "$ALIGNFILEGRP" \
- --mets "$mets" \
- --parameter $(cat "$config" | jq --raw-output ".alignparampath") \
- --log-level $LOG_LEVEL
- ocrd-cis-align \
- --input-file-grp "$OCRFILEGRPS" \
- --output-file-grp "$ALIGNFILEGRP" \
- --mets "$mets" \
- --parameter $(cat "$config" | jq --raw-output ".alignparampath") \
- --log-level $LOG_LEVEL
- # (Cannot use non unicode chars if installing this)
- # Change long s (\u017f) to normal s if the ground truth
- # does not contain long s.
- # fixlongs=$(cat "$config" | jq --raw-output '.fixLongS')
- # if [[ "$fixlongs" == "true" ]]; then
- # pushd "$workspace"
- # ocrd-cis-log "fixing long s in file"
- # for fg in $(ocrd workspace list-group | grep 'ALIGN'); do
- # ocrd-cis-log "fixing long s in filegroup $fg"
- # for xml in "$fg"/*; do
- # ocrd-cis-log "fixing long s in file $xml"
- # sed -i -e 's/\u017f/s/g' "$xml"
- # done
- # done
- # popd
- # fi
-# Run the training over the `-ALIGN-` filegroups in the workspace
-# directory of the given mets.xml file. Usage: `ocrd-cis-run-training
-# config mets`.
-# * config: path to the configuration file
-# * mets: path to the mets file
-ocrd-cis-run-training() {
- local config=$1
- local mets=$2
- local workspace=$(dirname "$mets")
- local main="de.lmu.cis.ocrd.cli.Main"
- local jar=$(cat "$config" | jq --raw-output '.jar')
- local trainconfig=$(cat "$config" | jq --raw-output '.trainparampath')
- # get -ALIGN- filegroups
- pushd "$workspace"
- local trainfilegrps=""
- for fg in $(ocrd workspace list-group); do
- if [[ $fg == *"-ALIGN-"* ]]; then
- trainfilegrps="$trainfilegrps -I $(basename $fg)"
- fi
- done
- popd
- # run training
- ocrd-cis-log java -Dfile.encoding=UTF-8 -Xmx3g -cp $jar $main --log-level $LOG_LEVEL \
- -c train --mets "$mets" --parameter $trainconfig $trainfilegrps
- java -Dfile.encoding=UTF-8 -Xmx3g -cp "$jar" "$main" --log-level "$LOG_LEVEL" \
- -c train --mets "$mets" --parameter "$trainconfig" $trainfilegrps
-# Run the evaluation over the `-ALIGN-` filegroups in the workspace
-# directory of the given mets.xml file. Usage:
-# `ocrd-cis-run-evaluation config mets`.
-# * config: path to the configuration file
-# * mets: path to the mets file
-ocrd-cis-run-evaluation() {
- local config=$1
- local mets=$2
- local workspace=$(dirname "$mets")
- local main="de.lmu.cis.ocrd.cli.Main"
- local jar=$(cat "$config" | jq --raw-output '.jar')
- local evalconfig=$(cat "$config" | jq --raw-output '.evalparampath')
- # get -ALIGN- filegroups
- pushd "$workspace"
- local trainfilegrps=""
- for fg in $(ocrd workspace list-group); do
- if [[ $fg == *"-ALIGN-"* ]]; then
- trainfilegrps="$trainfilegrps -I $(basename $fg)"
- fi
- done
- popd
- # run evaluation
- for cmd in evaluate-dle evaluate-rrdm; do
- ocrd-cis-log java -Dfile.encoding=UTF-8 -Xmx3g -cp "$jar" "$main" -c "$cmd" \
- --mets "$mets" \
- --parameter "$param" \
- $trainfilegrps \
- --log-level $LOG_LEVEL
- java -Dfile.encoding=UTF-8 -Xmx3g -cp "$jar" "$main" -c "$cmd" \
- --mets "$mets" \
- --parameter "$evalconfig" \
- $trainfilegrps \
- --log-level $LOG_LEVEL
- done
-# Download the ground truth archives and unzip them into a dedicated
-# directory. Usage: `ocrd-cis-download-and-extract-ground-truth url
-# dir`.
-# * url: URL of the archives
-# * dir: output directory for the extracted archives
-ocrd-cis-download-and-extract-ground-truth() {
- local url=$1
- local dir=$2
- mkdir -p "$dir"
- pushd "$dir"
- ocrd-cis-log "downloading $url"
- wget -r -np -l1 -nd -N -A zip -erobots=off "$url" || true # ignore exit status of wget
- for zip in *.zip; do
- # this archive is broken
- if [[ "$(basename $zip)" == $'bi\u00dfmarck_carmina_1657.zip' ]]; then continue; fi
- unzip -u -o $zip
- done
- popd
diff --git a/bashlib/ocrd-cis-pack-result-dir.sh b/bashlib/ocrd-cis-pack-result-dir.sh
deleted file mode 100755
index 1d2f9b2f..00000000
--- a/bashlib/ocrd-cis-pack-result-dir.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-set -e
-# bdir=$(dirname "$0")
-# source "$bdir/ocrd-cis-lib.sh"
-tar=$(basename "$idir")
-GZIP=-9 # use best compression
-pushd $idir
-tar -cjf "$tar" **/train/*.txt
-tar -tf "$tar"
-echo $tar
diff --git a/bashlib/ocrd-cis-post-correct.sh b/bashlib/ocrd-cis-post-correct.sh
deleted file mode 100644
index 267a5516..00000000
--- a/bashlib/ocrd-cis-post-correct.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-# set -x
-source $(dirname $0)/ocrd-cis-lib.sh
-# Post correct OCR results. Args:
-# * the option --mets for the workspace.
-# * the option --log-level is passed to other tools.
-# * the option --parameter is used as configuration.
-# * the option --output-file-grp defines the output filegroup of the
-# post-corrected page xml files.
-# * the option --input-file-grp specifies the file group of the ocr
-# results that will be processed.
-# tmp dir and cleanup
-tmpdir=$(mktemp -d)
-trap "rm -rfv $tmpdir" EXIT
-# command line arguments
-if ocrd-cis-getopt -l --log-level $*; then
-ocrd-cis-debug "log-level: $LOG_LEVEL"
-ocrd-cis-getopt -m --mets $* || ocrd-cis-fail "error: missing METS file (--mets)"
-METS=$(realpath $METS)
-ocrd-cis-debug "mets: $METS"
-ocrd-cis-getopt -p --parameter $* || ocrd-cis-fail "error: missing configuration file (--parameter)"
-ocrd-cis-debug "parameter: $PARAMETER"
-ocrd-cis-getopt -O --output-file-grp $* || ocrd-cis-fail "error: missing output file group (--output-file-grp)"
-ocrd-cis-debug "output file group: $OUTPUT_FILE_GRP"
-ocrd-cis-getopt -I --input-file-grp $* || ocrd-cis-fail "error: missing input file group (--input-file-grp)"
-ocrd-cis-debug "input file group: $INPUT_FILE_GRP"
-# do additional ocrs and align
-ocrd-cis-info "step: additional ocr and alignment"
-# preset in case that there are no ocr steps
-alignfgs="$XML_INPUT_FILE_GRP" # master ocr comes first
-for cmd in $(cat $PARAMETER | jq -r '.ocrSteps[] | @base64'); do
- n=$((n+1))
- cmd=$(echo $cmd | base64 -d)
- eval ocrd-cis-debug "$cmd"
- eval $cmd || exit 1
- alignfgs="$alignfgs,$XML_OUTPUT_FILE_GRP"
-ocrd-cis-debug ocrd-cis-align --mets $METS \
- --input-file-grp "$alignfgs" \
- --output-file-grp "$alignfg"
-ocrd-cis-align --mets $METS \
- --input-file-grp "$alignfgs" \
- --output-file-grp "$alignfg"
-# post correction
-mkdir -p "$pcdir"
-jar=$(ocrd-cis-data -jar)
-nocr=$(jq ".ocrSteps | length+1" "$PARAMETER")
-ocrd-cis-info "step: post-correction"
-ocrd-cis-debug java -Dfile.encoding=UTF-8 -Xmx3g -cp $jar $main \
- --log-level $LOG_LEVEL \
- -c post-correct \
- --mets $METS \
- --parameter <(jq ".postCorrection.nOCR = \"$nocr\" | .postCorrection" "$PARAMETER") \
- --input-file-grp "$trainfgs"
-java -Dfile.encoding=UTF-8 -Xmx3g -cp $jar $main \
- --log-level $LOG_LEVEL \
- -c post-correct \
- --mets $METS \
- --parameter <(jq ".postCorrection.nOCR = \"$nocr\" | .postCorrection" "$PARAMETER") \
- --input-file-grp "$alignfg" \
- --output-file-grp "$OUTPUT_FILE_GRP"
-# add protocols to the workspace
-pushd $(dirname $METS)
-if [[ -f "$pcdir/le-protocol.json" ]]; then
- ocrd-cis-info "step: add lexicon extension protocol"
- ocrd workspace add \
- --mimetype "application/json" \
- --file-id "ocrd-cis-le-protocol.json" \
- "$pcwdir/le-protocol.json"
-if [[ -f "$pcdir/dm-protocol.json" ]]; then
- ocrd-cis-info "step: add desicion maker protocol"
- ocrd workspace add \
- --mimetype "application/json" \
- --file-id "ocrd-cis-dm-protocol.json" \
- "$pcwdir/md-protocol.json"
diff --git a/bashlib/ocrd-cis-synpage.sh b/bashlib/ocrd-cis-synpage.sh
deleted file mode 100755
index 906f8237..00000000
--- a/bashlib/ocrd-cis-synpage.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-set -e
-if [[ -z "$n" ]]; then n=25; fi
-for f in $idir/*.gt.txt; do
- if [[ -z "$files" ]]; then
- files=$f
- else
- files="$files $f"
- fi
- if [[ $((i%n)) == 0 ]]; then
- mkdir -p "$odir"
- out=$(printf "$odir/%04d" $j)
- gocrd synpage -o "$out" $files
- files=""
- j=$((j+1))
- fi
- i=$((i+1))
-if [[ ! -z "$files" ]]; then
- mkdir -p "$odir"
- out=$(printf "$odir/%04d" $j)
- gocrd synpage -o "$out" $files
- files=""
diff --git a/bashlib/ocrd-cis-train.sh b/bashlib/ocrd-cis-train.sh
deleted file mode 100755
index 21e4ee59..00000000
--- a/bashlib/ocrd-cis-train.sh
+++ /dev/null
@@ -1,177 +0,0 @@
-# set -x
-source $(dirname $0)/ocrd-cis-lib.sh
-# Train a model. The tool somewhat mimics the behaviour of other ocrd
-# tools:
-# * the option --mets for the workspace.
-# * the option --log-level is passed to other tools.
-# * the option --parameter is used as configuration.
-# * the option --output-file-grp defines the output filegroup of the
-# model file.
-# * the option --input-file-grp is ignored.
-# tmp dir and cleanup
-tmpdir=$(mktemp -d)
-trap "rm -rfv $tmpdir" EXIT
-# command line arguments
-if ocrd-cis-getopt -l --log-level $*; then
-ocrd-cis-debug "log-level: $LOG_LEVEL"
-ocrd-cis-getopt -m --mets $* || ocrd-cis-fail "error: missing METS file (--mets)"
-METS=$(realpath $METS)
-ocrd-cis-debug "mets: $METS"
-ocrd-cis-getopt -p --parameter $* || ocrd-cis-fail "error: missing configuration file (--parameter)"
-ocrd-cis-debug "parameter: $PARAMETER"
-ocrd-cis-getopt -O --output-file-grp $* || ocrd-cis-fail "error: missing output file group (--output-file-grp)"
-ocrd-cis-debug "output file group: $OUTPUT_FILE_GRP"
-# download and prepare ground truth from archives
-ocrd-cis-info "step: ground truth"
-pushd $(dirname $METS)
-for archive in $(cat $PARAMETER | jq -r '.gtArchives[]'); do
- archivedir="$tmpdir/$(basename $archive)"
- mkdir -p "$archivedir"
- if [[ $archive == http://* ]] || [[ $archive == https://* ]]; then
- ocrd-cis-download-and-add-gt-zip "$archive" "$archivedir"
- else
- ocrd-cis-add-gt-zip "$archive" "$archivedir"
- fi
- if [[ -z $xmlfgs ]]; then
- else
- xmlfgs="$xmlfgs $OCR_D_CIS_GT_FILEGRP"
- imgfgs="$imgfgs $OCR_D_CIS_IMG_FILEGRP"
- fi
-# do image pre-processing
-ocrd-cis-info "step: image pre processing"
-for imgfg in $imgfgs; do
- xmlfg=${imgfg/IMG/GT}
- # preset in case that there are no image preprocessing steps
- n=1
- for cmd in $(cat $PARAMETER | jq -r '.imagePreprocessingSteps[] | @base64'); do
- n=$((n+1))
- cmd=$(echo $cmd | base64 -d)
- eval ocrd-cis-debug "$cmd"
- eval $cmd
- done
- if [[ -z tmpimgfgs ]]; then
- tmpimgfgs=$IMG_OUTPUT_FILE_GRP
- tmpxmlfgs=$XML_OUTPUT_FILE_GRP
- else
- tmpimgfgs="$tmpimgfgs $IMG_OUTPUT_FILE_GRP"
- tmpxmlfgs="$tmpxmlfgs $XML_OUTPUT_FILE_GRP"
- fi
-# do the ocr and align
-ocrd-cis-info "step: ocr and alignment"
-for xmlfg in $xmlfgs; do
- # preset in case that there are no ocr steps
- alignfgs=""
- n=1
- for cmd in $(cat $PARAMETER | jq -r '.ocrSteps[] | @base64'); do
- n=$((n+1))
- cmd=$(echo $cmd | base64 -d)
- eval ocrd-cis-debug "$cmd"
- eval $cmd || exit 1
- if [[ -z $alignfgs ]]; then
- alignfgs="$XML_OUTPUT_FILE_GRP"
- else
- alignfgs="$alignfgs,$XML_OUTPUT_FILE_GRP"
- fi
- done
- alignfgs="$alignfgs,$xmlfg" # append gt filegroup
- trainfg="${xmlfg/XML/ALIGN}"
- ocrd-cis-align --mets $METS \
- --input-file-grp "$alignfgs" \
- --output-file-grp "$trainfg"
- werfg="${xmlfg/XML/WER}"
- ocrd-cis-wer --mets $METS \
- --input-file-grp "$trainfg" \
- --output-file-grp "$werfg"
- # sadly we cannot use something like ocrd workspace find -G | grep ALIGN
- if [[ -z $trainfgs ]]; then
- trainfgs="$trainfg"
- else
- trainfgs="$trainfg,$trainfgs"
- fi
-# training
-mkdir -p "$traindir"
-nocr=$(jq ".ocrSteps | length" "$PARAMETER")
-ocrd-cis-info "step: training"
-# eval ocrd-cis-debug java -Dfile.encoding=UTF-8 -Xmx3g -cp $(ocrd-cis-data -jar) $main \
-# --log-level $LOG_LEVEL \
-# -c train \
-# --mets $METS \
-# --parameter <(jq ".training.dir = \"$traindir\"" "$PARAMETER") \
-# --input-file-grp "$trainfgs"
-java -Dfile.encoding=UTF-8 -Xmx3g -cp $(ocrd-cis-data -jar) $main \
- --log-level $LOG_LEVEL \
- -c train \
- --mets $METS \
- --parameter <(jq ".training.dir = \"$traindir\" | .training.nOCR = \"$nocr\" | .training" "$PARAMETER") \
- --input-file-grp "$trainfgs"
-# add model and training resources to workspace
-pushd $(dirname $METS)
-ocrd-cis-info "step: cleanup"
-ocrd workspace add \
- --file-grp "$OUTPUT_FILE_GRP" \
- --mimetype "application/zip" \
- --file-id "ocrd-cis-model.zip" \
- "$traindir/model.zip"
-rm -rf "$traindir/model.zip"
-zip -r "$tmpdir/training.zip" "$traindir"
-ocrd workspace add \
- --file-grp "$OUTPUT_FILE_GRP-TRAINING" \
- --mimetype "application/zip" \
- --file-id "ocrd-cis-training.zip" \
- "$tmpdir/training.zip"
diff --git a/bashlib/ocrd_cis.bash b/bashlib/ocrd_cis.bash
deleted file mode 100644
index 82f35804..00000000
--- a/bashlib/ocrd_cis.bash
+++ /dev/null
@@ -1,93 +0,0 @@
-set -e
-TMP_DIR=${TMP_DIR:-$(mktemp -d -t ocrd_cis-tmp-XXXXXXXXX)}
-function maybe_rmtd() {
- if [[ "$PERSISTENT" = "yes" ]]; then
- echo tmp dir = $TMP_DIR
- else
- echo removing $TMP_DIR
- rm -rf $TMP_DIR
- fi
-trap maybe_rmtd EXIT
-function wget_cached() {
- local url=$1
- local filename=$2
- local destdir=$TMP_DIR/downloads
- if test ! -f $CACHE_DIR/$filename; then
- mkdir -p $CACHE_DIR
- echo "downloading $url/$filename"
- wget -P $CACHE_DIR $url/$filename
- fi
- mkdir -p $destdir
- ln $CACHE_DIR/$filename $destdir/$filename ||\
- cp $CACHE_DIR/$filename $destdir/$filename
-function download_ocrd_gt_zip() {
- local url="http://www.ocr-d.de/sites/all/GTDaten"
- local filename=$1
- wget_cached $url $filename
-function unzip_ocrd_gt() {
- local zip="$TMP_DIR/downloads/$1"
- echo unziping $zip
- unzip -d $TMP_DIR/downloads ${zip/.zip/} >/dev/null
-function download_and_unzip_ocrd_gt() {
- download_ocrd_gt_zip $1
- unzip_ocrd_gt $1
-function get_page_xml_files() {
- for d in $@; do
- for d in $(find $TMP_DIR/downloads -type d -name page); do
- for f in $(find $d -type f | sort); do
- done
- done
- done
-function download_ocrd_jar() {
- local url='http://www.cis.lmu.de/~finkf'
- wget_cached $url "ocrd-0.1.jar"
- JAR="$TMP_DIR/downloads/ocrd-0.1.jar"
-# sets PERSISTENT and ARG variables
-function parse_cmd_line_args() {
- for arg in "$@"; do
- case $arg in
- -p|--persistent)
- ;;
- *)
- ARG=$arg
- esac
- done
-function setup_ocrd_test_environment() {
- download_and_unzip_ocrd_gt $1
- download_ocrd_jar
-function assert_file_group_exists() {
- pushd $TMP_DIR
- ocrd workspace list-group | grep "$1" && true || exit 1
- popd
diff --git a/data/docker/deps.txt b/data/docker/deps.txt
deleted file mode 100644
index af8c76fa..00000000
--- a/data/docker/deps.txt
+++ /dev/null
@@ -1,7 +0,0 @@
diff --git a/data/docker/ocrd-cis-ocropy-fraktur1.json b/data/docker/ocrd-cis-ocropy-fraktur1.json
deleted file mode 100644
index 01ea4866..00000000
--- a/data/docker/ocrd-cis-ocropy-fraktur1.json
+++ /dev/null
@@ -1,4 +0,0 @@
- "textequiv_level": "glyph",
- "model": "${DATA}/models/fraktur1-00085000.pyrnn.gz"
diff --git a/data/docker/ocrd-cis-ocropy-fraktur2.json b/data/docker/ocrd-cis-ocropy-fraktur2.json
deleted file mode 100644
index 34e8d941..00000000
--- a/data/docker/ocrd-cis-ocropy-fraktur2.json
+++ /dev/null
@@ -1,4 +0,0 @@
- "textequiv_level": "glyph",
- "model": "${DATA}/models/fraktur2-00062000.pyrnn.gz"
diff --git a/data/docker/ocrd-cis-post-correction.json b/data/docker/ocrd-cis-post-correction.json
deleted file mode 100644
index eb1bed49..00000000
--- a/data/docker/ocrd-cis-post-correction.json
+++ /dev/null
@@ -1,23 +0,0 @@
- "model": "${DATA}/models/model.zip",
- "jar": "${DATA}/ocrd-cis.jar",
- "nOCR": 2,
- "trigrams": "${DATA}/models/character-trigrams.csv",
- "lexiconExtensionProtocol": "",
- "decisionMakerProtocol": "",
- "additionalLexicon": "",
- "runLexiconExtension": true,
- "runDescisionMaker": true,
- "profiler": {
- "type": "local",
- "executable": "/apps/profiler",
- "config": "${DATA}/languages/german.ini",
- "cacheDir": "${DATA}/cache"
- },
- "ocr": [
- {
- "type": "ocropy",
- "path": "${DATA}/config/ocrd-cis-ocropy-fraktur1.json"
- }
- ]
\ No newline at end of file
diff --git a/ocrd_cis/__init__.py b/ocrd_cis/__init__.py
index 67a2d879..6f37f4f7 100644
--- a/ocrd_cis/__init__.py
+++ b/ocrd_cis/__init__.py
@@ -1,7 +1,3 @@
-from .javaprocess import JavaProcess
-from .javaprocess import JavaTrain
from .javaprocess import JavaAligner
-from .javaprocess import JavaProfiler
-from .javaprocess import JavaEvalDLE
-from .javaprocess import JavaEvalRRDM
+from .javaprocess import JavaPostCorrector
from .ocrd_tool import get_ocrd_tool
diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py
index 9bc384d6..c06f01fe 100644
--- a/ocrd_cis/align/cli.py
+++ b/ocrd_cis/align/cli.py
@@ -118,7 +118,7 @@ def align_words(self, lines):
# self.log.info(json.dumps(lines[0].alignment))
mregion = lines[0].region.get_Word()
oregion = [lines[i].region.get_Word() for i in range(1, len(lines))]
- for word in lines[0].alignment['words']:
+ for word in lines[0].alignment['wordAlignments']:
self.log.debug("aligning word %s", word['master'])
master, rest = self.find_word([word['master']], mregion, "master")
mregion = rest
diff --git a/ocrd_cis/data/__main__.py b/ocrd_cis/data/__main__.py
index 1552e025..3d8ef735 100644
--- a/ocrd_cis/data/__main__.py
+++ b/ocrd_cis/data/__main__.py
@@ -2,12 +2,19 @@
import sys
def main():
- if '-jar' in sys.argv:
+ usage = 'usage: ' + sys.argv[0] + ' -jar|-3gs|-model|-config'
+ if '-h' in sys.argv:
+ print(usage)
+ elif '-jar' in sys.argv:
print(pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar'))
elif '-3gs' in sys.argv:
print(pkg_resources.resource_filename('ocrd_cis', 'data/3gs.csv.gz'))
+ elif '-model' in sys.argv:
+ print(pkg_resources.resource_filename('ocrd_cis', 'data/model.zip'))
+ elif '-config' in sys.argv:
+ print(pkg_resources.resource_filename('ocrd_cis', 'data/config.json'))
- raise ValueError('usage: ' + sys.argv[0] + ' -jar|-3gs')
+ raise ValueError(usage)
if __name__ == "__main__":
diff --git a/ocrd_cis/data/config.json b/ocrd_cis/data/config.json
new file mode 100644
index 00000000..6a6b57ba
--- /dev/null
+++ b/ocrd_cis/data/config.json
@@ -0,0 +1,216 @@
+ "runLE": true,
+ "runDM": true,
+ "profiler": {
+ "path": "/path/to/profiler",
+ "config": "/path/to/language.ini"
+ },
+ "nOCR": 2,
+ "maxCandidates": 10,
+ "dir": "/path/to/train.dir",
+ "trigrams": "/path/to/trigrams.csv",
+ "ocropusOCRExtensions": [],
+ "ocropusImageExtension": "",
+ "filterClasses": ["deactivate"],
+ "leFeatures": [
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.TokenLengthClassFeature",
+ "name": "TokenLengthClass",
+ "short": 3,
+ "medium": 8,
+ "long": 13
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.TokenCaseClassFeature",
+ "name": "TokenCaseClass"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.TokenLengthFeature",
+ "name": "TokenLength"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.MaxOCRCharacterConfidenceFeature",
+ "name": "MaxOCRConfidence"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.MatchingOCRTokensFeature",
+ "name": "MatchingOCRTokens"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.LineOverlapWithMasterOCRFeature",
+ "name": "LineOverlap"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.LinePositionFeature",
+ "name": "LinePosition"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.OCRWordConfidenceFeature",
+ "name": "WordConfidence"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateMatchesOCRFeature",
+ "name": "HighestRankedCandidateMatches"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateHistoricalPatternsDistanceFeature",
+ "name": "HighestRankedCandidateHistPatternsDistance"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateOCRPatternsDistanceFeature",
+ "name": "HighestRankedCandidateOCRPatternsDistance"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateDistanceToNextFeature",
+ "name": "HighestRankedCandidateDistanceToNext"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.HighestRankedCandidateVoteWeightFeature",
+ "name": "HighestRankedCandidateVoteWeight"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.UnigramFeature",
+ "name": "UnigramOCRRelativeFrequency"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.LevenshteinDistanceFeature",
+ "name": "LevenshteinDistance",
+ "maxThreshold": 5
+ }
+ ],
+ "rrFeatures": [
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.TokenLengthClassFeature",
+ "name": "TokenLengthClass",
+ "short": 3,
+ "medium": 7,
+ "long": 13
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.TokenCaseClassFeature",
+ "name": "Tokenshape",
+ "class": "Tokenshape"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.TokenLengthFeature",
+ "name": "TokenLength"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.MaxCharNGramsFeature",
+ "name": "MaxCharTrigram"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.MinCharNGramsFeature",
+ "name": "MinCharTrigram"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.MaxOCRCharacterConfidenceFeature",
+ "name": "MaxOCRConfidence"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.MatchingOCRTokensFeature",
+ "name": "MatchingOCRTokens"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.LineOverlapWithMasterOCRFeature",
+ "name": "LineOverlap"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.LinePositionFeature",
+ "name": "LinePosition"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.OCRWordConfidenceFeature",
+ "name": "WordConfidence"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateCaseClassFeature",
+ "name": "CandidateCaseClass"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateHistoricalPatternsDistanceFeature",
+ "name": "CandidateHistoricalPatternsDistance"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateLengthClassFeature",
+ "name": "CandidateLengthClass",
+ "short": 3,
+ "medium": 8,
+ "long": 13
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateLengthFeature",
+ "name": "CandidateLength"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateMatchesOCRTokenFeature",
+ "name": "CandidateMatchesOCR"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateMatchingOCRsFeature",
+ "name": "CandidateMatchingOCRs"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateMaxCharNGramsFeature",
+ "name": "CandidateMaxCharNGram"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateMinCharNGramsFeature",
+ "name": "CandidateMinCharNGram"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateMaxHistoricalPatternConfidenceFeature",
+ "name": "CandidateMaxHistoricalPatternConfidence"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateMinHistoricalPatternConfidenceFeature",
+ "name": "CandidateMinHistoricalPatternConfidence"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateVoteWeightFeature",
+ "name": "CandidateVoteWeight"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateUnigramFeature",
+ "name": "CandidateUnigram"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.UnigramFeature",
+ "name": "UnigramOCRRelativeFrequency"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateOCRPatternConfidenceFeature",
+ "name": "CandidateOCRPatternConfidenceFeature",
+ "classes": ["deactivate"]
+ }
+ ],
+ "dmFeatures": [
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.DMBestRankFeature",
+ "name": "BestRank"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.DMDifferenceToNextRankFeature",
+ "name": "BestRankDifferenceToNext"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateVoteWeightFeature",
+ "name": "CandidateVoteWeight"
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.CandidateOCRPatternConfidenceFeature",
+ "name": "OCRPatternConfidenceFeature",
+ "classes": ["deactivate"]
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.MinOCRCharacterConfidenceFeature",
+ "name": "MinOCRCharacterConfidence",
+ "classes": ["deactivate"]
+ },
+ {
+ "type": "de.lmu.cis.ocrd.ml.features.AverageOCRCharacterConfidenceFeature",
+ "name": "AverageOCRCharacterConfidence",
+ "classes": ["deactivate"]
+ }
+ ]
diff --git a/ocrd_cis/data/model.zip b/ocrd_cis/data/model.zip
new file mode 100644
index 00000000..1b587274
Binary files /dev/null and b/ocrd_cis/data/model.zip differ
diff --git a/ocrd_cis/data/ocrd-cis.jar b/ocrd_cis/data/ocrd-cis.jar
index a21ecd20..0350d3b0 100644
Binary files a/ocrd_cis/data/ocrd-cis.jar and b/ocrd_cis/data/ocrd-cis.jar differ
diff --git a/ocrd_cis/javaprocess.py b/ocrd_cis/javaprocess.py
index 07ce0a2e..ce2f6bfd 100644
--- a/ocrd_cis/javaprocess.py
+++ b/ocrd_cis/javaprocess.py
@@ -14,8 +14,8 @@ def JavaAligner(n, loglvl):
'--log-level', loglvl,
'--parameter', '{}'.format(json.dumps({'n':n}))])
-def JavaProfiler(mets, ifg, ofg, params, loglvl):
- return JavaProcess(JAR, ['-c', 'profile',
+def JavaPostCorrector(mets, ifg, ofg, params, loglvl):
+ return JavaProcess(JAR, ['-c', 'post-correct',
'--log-level', loglvl,
'--input-file-grp', ifg,
'--output-file-grp', ofg,
@@ -23,45 +23,6 @@ def JavaProfiler(mets, ifg, ofg, params, loglvl):
'-p', "{}".format(json.dumps(params))])
-def JavaTrain(jar, mets, ifgs, parameter, loglvl="DEBUG"):
- args = [
- "-c", "train",
- "--mets", mets,
- "--log-level", loglvl,
- "--parameter", parameter
- ]
- for ifg in ifgs:
- args.append("-I")
- args.append(ifg)
- return JavaProcess(jar, args)
-def JavaEvalDLE(jar, mets, ifgs, parameter, loglvl="DEBUG"):
- args = [
- '-c', 'evaluate-dle',
- '--mets', mets,
- '--log-level', loglvl,
- '--parameter', parameter
- ]
- for ifg in ifgs:
- args.append('-I')
- args.append(ifg)
- return JavaProcess(jar, args)
-def JavaEvalRRDM(jar, mets, ifgs, parameter, loglvl="DEBUG"):
- args = [
- '-c', 'evaluate-rrdm',
- '--mets', mets,
- '--log-level', loglvl,
- '--parameter', parameter
- ]
- for ifg in ifgs:
- args.append('-I')
- args.append(ifg)
- return JavaProcess(jar, args)
class JavaProcess:
def __init__(self, jar, args):
self.jar = jar
@@ -106,17 +67,17 @@ def exe(self):
cmd = self.get_cmd()
self.log.info('command: %s', " ".join(cmd))
- ret = subprocess.run(
- cmd,
- stderr=subprocess.PIPE,
- check=False,
- universal_newlines=True,
- )
- self.log.debug("%s: %i", " ".join(cmd), ret.returncode)
- if ret.returncode != 0:
- raise ValueError(
- "cannot execute {}: {}\n{}"
- .format(" ".join(cmd), ret.returncode, ret.stderr))
+ with subprocess.Popen(
+ cmd,
+ stderr=subprocess.PIPE
+ ) as p:
+ sout, eout = p.communicate()
+ self.log_stderr(eout)
+ retval = p.wait()
+ if retval != 0:
+ raise ValueError(
+ "cannot execute {}: {}\n{}"
+ .format(" ".join(cmd), retval, eout.decode('utf-8')))
def log_stderr(self, err):
for line in err.decode("utf-8").split("\n"):
diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json
index ad4fe48d..c815b37e 100644
--- a/ocrd_cis/ocrd-tool.json
+++ b/ocrd_cis/ocrd-tool.json
@@ -1,6 +1,6 @@
"git_url": "https://github.com/cisocrgroup/ocrd_cis",
- "version": "0.0.7",
+ "version": "0.0.8",
"tools": {
"ocrd-cis-ocropy-binarize": {
"executable": "ocrd-cis-ocropy-binarize",
@@ -281,6 +281,10 @@
"steps": [
+ "input_file_grp": [
+ ],
"description": "Recognize text snippets",
"parameters": {
"model": {
@@ -387,15 +391,20 @@
"ocrd-cis-ocropy-train": {
"executable": "ocrd-cis-ocropy-train",
"categories": [
- "lstm ocropy model training"
+ "Text recognition and optimization"
"steps": [
- "training"
+ "recognition/text-recognition"
+ ],
+ "input_file_grp": [
"description": "train model with ground truth from mets data",
"parameters": {
"textequiv_level": {
"type": "string",
+ "description": "PAGE XML hierarchy level granularity",
"enum": ["line", "word", "glyph"],
"default": "line"
@@ -404,7 +413,8 @@
"description": "load model or crate new one (e.g. fraktur.pyrnn)"
"ntrain": {
- "type": "integer",
+ "type": "number",
+ "format": "integer",
"description": "lines to train before stopping",
"default": 1000000
@@ -420,316 +430,65 @@
"Text recognition and optimization"
"steps": [
- "postprocessing/alignment"
+ "recognition/post-correction"
- "description": "Align multiple OCRs and/or GTs"
- },
- "ocrd-cis-wer": {
- "executable": "ocrd-cis-wer",
- "categories": [
- "evaluation"
+ "input_file_grp": [
+ "OCR-D-OCR-1",
+ "OCR-D-OCR-2",
- "steps": [
- "evaluation"
+ "output_file_grp": [
- "description": "calculate the word error rate for aligned page xml files",
- "parameters": {
- "testIndex": {
- "description": "text equiv index for the test/ocr tokens",
- "type": "integer",
- "default": 0
- },
- "gtIndex": {
- "type": "integer",
- "description": "text equiv index for the gt tokens",
- "default": -1
- }
- }
+ "description": "Align multiple OCRs and/or GTs"
- "ocrd-cis-jar": {
- "executable": "ocrd-cis-jar",
+ "ocrd-cis-postcorrect": {
+ "executable": "ocrd-cis-postcorrect",
"categories": [
"Text recognition and optimization"
"steps": [
- "postprocessing/alignment"
+ "recognition/post-correction"
- "description": "Output path to the ocrd-cis.jar file"
- },
- "ocrd-cis-profile": {
- "executable": "ocrd-cis-profile",
- "categories": [
- "Text recognition and optimization"
+ "description": "Post correct OCR results",
+ "input_file_grp": [
- "steps": [
- "postprocessing/alignment"
+ "output_file_grp": [
- "description": "Add a correction suggestions and suspicious tokens (profile)",
"parameters": {
- "executable": {
- "type": "string",
- "required": true
+ "maxCandidates": {
+ "description": "Maximum number of considered correction candidates per suspicious token",
+ "type": "number",
+ "format": "integer",
+ "default": 10
- "backend": {
+ "profilerPath": {
+ "description": "Path to the profiler executable",
+ "required": true,
+ "type": "string"
+ },
+ "profilerConfig": {
+ "description": "Path to the profiler's language config file",
+ "required": true,
+ "type": "string"
+ },
+ "model": {
+ "description": "Path to the post correction model file",
"type": "string",
"required": true
- "language": {
- "type": "string",
- "required": false,
- "default": "german"
+ "nOCR": {
+ "description": "Number of parallel OCR's to use for the post correction",
+ "type": "number",
+ "format": "integer",
+ "default": 1
- "additionalLexicon": {
- "type": "string",
- "required": false,
- "default": ""
- }
- }
- },
- "ocrd-cis-train": {
- "executable": "ocrd-cis-train.sh",
- "categories": [
- "Text recognition and optimization"
- ],
- "steps": [
- "postprocessing/alignment"
- ],
- "description": "Train post correction model",
- "parameters": {
- "gtArchives": {
- "description": "List of ground truth archives",
- "type": "array",
- "required": true,
- "items": {
- "description": "Path (or URL) to a ground truth archive",
- "type": "string"
- }
- },
- "imagePreprocessingSteps": {
- "description": "List of image preprocessing steps",
- "type": "array",
- "required": true,
- "items": {
- "description": "Image preprocessing command that is evaled using the bash eval command (available parameters: $METS, $LOG_LEVEL, $XML_INPUT_FILE_GRP, $XML_OUTPUT_FILE_GRP, $IMG_OUTPUT_FILE_GRP, $IMG_INPUT_FILE_GRP, $PARAMETER)",
- "type": "string"
- }
- },
- "ocrSteps": {
- "description": "List of ocr steps",
- "type": "array",
- "required": true,
- "items": {
- "description": "OCR command that is evaled using the bash eval command (available parameters: $METS, $LOG_LEVEL, $XML_INPUT_FILE_GRP, $XML_OUTPUT_FILE_GRP, $PARAMETER)",
- "type": "string"
- }
- },
- "training": {
- "description": "Configuration of training command",
- "type": "object",
- "required": [
- "trigrams",
- "maxCandidate",
- "profiler",
- "leFeatures",
- "rrFeatures",
- "dmFeatures"
- ],
- "properties": {
- "trigrams": {
- "description": "Path to character trigrams csv file (format: n,trigram)",
- "type": "string",
- "required": true
- },
- "maxCandidate": {
- "description": "Maximum number of considered profiler candidates per token",
- "type": "integer",
- "required": true
- },
- "filterClasses": {
- "description": "List of filtered feature classes",
- "required": false,
- "type": "array",
- "items": {
- "description": "Class name of feature class to filter",
- "type": "string"
- }
- },
- "profiler": {
- "description": "Profiler configuration",
- "type": "object",
- "required": [
- "path",
- "config"
- ],
- "properties": {
- "path": {
- "description": "Path to the profiler executable",
- "required": true,
- "type": "string"
- },
- "config": {
- "description": "Path to the profiler language config file",
- "required": true,
- "type": "string"
- }
- }
- },
- "leFeatures": {
- "description": "List of the lexicon extension features",
- "required": true,
- "type": "array",
- "items": {
- "description": "Feature configuration",
- "type": "object",
- "required": [
- "type",
- "name"
- ],
- "properties": {
- "name": {
- "description": "Name of the feature",
- "type": "string"
- },
- "type": {
- "description": "Fully qualified java class name of the feature",
- "type": "string"
- },
- "class": {
- "description": "Class name of the feature",
- "type": "string"
- }
- }
- }
- },
- "rrFeatures": {
- "description": "List of the reranker features",
- "required": true,
- "type": "array",
- "items": {
- "description": "Feature configuration",
- "type": "object",
- "required": [
- "type",
- "name"
- ],
- "properties": {
- "name": {
- "description": "Name of the feature",
- "type": "string"
- },
- "type": {
- "description": "Fully qualified java class name of the feature",
- "type": "string"
- },
- "class": {
- "description": "Class name of the feature",
- "type": "string"
- }
- }
- }
- },
- "dmFeatures": {
- "description": "List of the desicion maker features",
- "required": true,
- "type": "array",
- "items": {
- "description": "Feature configuration",
- "type": "object",
- "required": [
- "type",
- "name"
- ],
- "properties": {
- "name": {
- "description": "Name of the feature",
- "type": "string"
- },
- "type": {
- "description": "Fully qualified java class name of the feature",
- "type": "string"
- },
- "class": {
- "description": "Class name of the feature",
- "type": "string"
- }
- }
- }
- }
- }
- }
- }
- },
- "ocrd-cis-post-correct": {
- "executable": "ocrd-cis-post-correct.sh",
- "categories": [
- "Text recognition and optimization"
- ],
- "steps": [
- "postprocessing/alignment"
- ],
- "description": "Post correct OCR results",
- "parameters": {
- "ocrSteps": {
- "description": "List of additional ocr steps",
- "type": "array",
- "required": true,
- "items": {
- "description": "OCR command that is evaled using the bash eval command (available parameters: $METS, $LOG_LEVEL, $XML_INPUT_FILE_GRP, $XML_OUTPUT_FILE_GRP, $PARAMETER)",
- "type": "string"
- }
- },
- "postCorrection": {
- "description": "Configuration of post correction command",
- "type": "object",
- "required": [
- "maxCandidate",
- "profiler",
- "model",
- "runLE",
- "runDM"
- ],
- "properties": {
- "maxCandidate": {
- "description": "Maximum number of considered profiler candidates per token",
- "type": "integer",
- "required": true
- },
- "profiler": {
- "description": "Profiler configuration",
- "type": "object",
- "required": [
- "path",
- "config"
- ],
- "properties": {
- "path": {
- "description": "Path to the profiler executable",
- "required": true,
- "type": "string"
- },
- "config": {
- "description": "Path to the profiler language config file",
- "required": true,
- "type": "string"
- }
- }
- },
- "model": {
- "description": "Path to the post correction model file",
- "type": "string",
- "required": true
- },
- "runLE": {
- "description": "Do run the lexicon extension step for the post correction",
- "required": true,
- "type": "boolean"
- },
- "runDM": {
- "description": "Do run the ranking and the decision step for the post correction",
- "required": true,
- "type": "boolean"
- }
- }
+ "runLE": {
+ "description": "Do run the lexicon extension step for the post correction",
+ "type": "boolean",
+ "default": false
diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py
index 2b34abbc..14ac9563 100644
--- a/ocrd_cis/ocropy/recognize.py
+++ b/ocrd_cis/ocropy/recognize.py
@@ -146,7 +146,7 @@ def process(self):
pcgts = page_from_file(self.workspace.download_file(input_file))
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
page = pcgts.get_Page()
# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
@@ -159,7 +159,7 @@ def process(self):
for name in self.parameter.keys()])]))
page_image, page_coords, _ = self.workspace.image_from_page(
page, page_id)
@@ -169,7 +169,7 @@ def process(self):
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
self.process_regions(regions, maxlevel, page_image, page_coords)
# update METS (add the PAGE file):
file_id = input_file.ID.replace(self.input_file_grp,
diff --git a/ocrd_cis/profile/__init__.py b/ocrd_cis/postcorrect/__init__.py
similarity index 100%
rename from ocrd_cis/profile/__init__.py
rename to ocrd_cis/postcorrect/__init__.py
diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py
new file mode 100644
index 00000000..d9033981
--- /dev/null
+++ b/ocrd_cis/postcorrect/cli.py
@@ -0,0 +1,46 @@
+from __future__ import absolute_import
+import click
+import json
+import os
+from ocrd import Processor
+from ocrd.decorators import ocrd_cli_options
+from ocrd.decorators import ocrd_cli_wrap_processor
+from ocrd_utils import getLogger
+from ocrd_models.ocrd_mets import OcrdMets
+from ocrd_cis import JavaPostCorrector
+from ocrd_cis import get_ocrd_tool
+def ocrd_cis_postcorrect(*args, **kwargs):
+ if 'log_level' in kwargs:
+ global LOG_LEVEL
+ LOG_LEVEL = kwargs['log_level']
+ return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs)
+class PostCorrector(Processor):
+ def __init__(self, *args, **kwargs):
+ ocrd_tool = get_ocrd_tool()
+ kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect']
+ kwargs['version'] = ocrd_tool['version']
+ super(PostCorrector, self).__init__(*args, **kwargs)
+ self.log = getLogger('cis.Processor.PostCorrector')
+ def process(self):
+ ifgs = self.input_file_grp.split(",") # input file groups
+ ofg = self.output_file_grp
+ profiler = {}
+ profiler["path"] = self.parameter["profilerPath"]
+ profiler["config"] = self.parameter["profilerConfig"]
+ profiler["noCache"] = True
+ self.parameter["profiler"] = profiler
+ self.parameter["runDM"] = True
+ metspath = os.path.join(self.workspace.directory, "mets.xml")
+ print(json.dumps(self.parameter, indent=4))
+ p = JavaPostCorrector(metspath, ",".join(ifgs), ofg, self.parameter, LOG_LEVEL)
+ p.exe()
+ # reload the mets file to prevent it from overriding the
+ # updated version from the java process
+ self.workspace.mets = OcrdMets(filename=metspath)
diff --git a/ocrd_cis/profile/cli.py b/ocrd_cis/profile/cli.py
deleted file mode 100644
index 41d82ba7..00000000
--- a/ocrd_cis/profile/cli.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import click
-from ocrd.decorators import ocrd_cli_options
-from ocrd.decorators import ocrd_cli_wrap_processor
-from ocrd import Processor
-from ocrd_utils import getLogger
-from ocrd_cis import get_ocrd_tool
-from ocrd_cis import JavaProfiler
-def ocrd_cis_profile(*args, **kwargs):
- global LOG_LEVEL
- if 'log_level' in kwargs:
- LOG_LEVEL = kwargs['log_level']
- return ocrd_cli_wrap_processor(Profiler, *args, **kwargs)
-class Profiler(Processor):
- def __init__(self, *args, **kwargs):
- ocrd_tool = get_ocrd_tool()
- kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-profile']
- kwargs['version'] = ocrd_tool['version']
- super(Profiler, self).__init__(*args, **kwargs)
- self.log = getLogger('cis.Processor.Profiler')
- def process(self):
- global LOG_LEVEL
- self.log.debug("starting java client")
- self.log.debug("LOG_LEVEL = %s", LOG_LEVEL)
- p = JavaProfiler(self.workspace.mets_target, self.input_file_grp,
- self.output_file_grp, self.parameter,
- p.exe()
- # Reload the updated METS file to make sure that run_processor
- # does not overwrite the updated file with the old.
- self.workspace.reload_mets()
diff --git a/setup.py b/setup.py
index fbd3d5fe..f582318c 100644
--- a/setup.py
+++ b/setup.py
@@ -1,9 +1,7 @@
- ocrd-cis-align
- - ocrd-cis-training
- - ocrd-cis-profile
- - ocrd-cis-wer
+ - ocrd-cis-postcorrect
- ocrd-cis-data
- ocrd-cis-ocropy-clip
- ocrd-cis-ocropy-denoise
@@ -25,7 +23,7 @@
- version='0.0.7',
+ version='0.0.8',
description='CIS OCR-D command line tools',
@@ -49,18 +47,12 @@
'calamari_ocr == 0.3.5'
- '': ['*.json', '*.yml', '*.yaml', '*.csv.gz', '*.jar'],
+ '': ['*.json', '*.yml', '*.yaml', '*.csv.gz', '*.jar', '*.zip'],
- scripts=[
- 'bashlib/ocrd-cis-lib.sh',
- 'bashlib/ocrd-cis-train.sh',
- 'bashlib/ocrd-cis-post-correct.sh',
- ],
'console_scripts': [
- 'ocrd-cis-profile=ocrd_cis.profile.cli:ocrd_cis_profile',
- 'ocrd-cis-wer=ocrd_cis.wer.cli:ocrd_cis_wer',
+ 'ocrd-cis-postcorrect=ocrd_cis.postcorrect.cli:ocrd_cis_postcorrect',
diff --git a/tests/data/benner_herrnhuterey04_1748_0015-wer.xml b/tests/data/benner_herrnhuterey04_1748_0015-wer.xml
deleted file mode 100644
index f95028d1..00000000
--- a/tests/data/benner_herrnhuterey04_1748_0015-wer.xml
+++ /dev/null
@@ -1,73 +0,0 @@
- 2016-10-10T14:19:48.077+02:00
- 2017-01-04T10:32:27.651+01:00
- first
- first
- first
- second
- second
- secondx
- third
- third
- thirdx
- first second third
diff --git a/tests/data/benner_herrnhuterey04_1748_0015.xml b/tests/data/benner_herrnhuterey04_1748_0015.xml
deleted file mode 100644
index ae5e743f..00000000
--- a/tests/data/benner_herrnhuterey04_1748_0015.xml
+++ /dev/null
@@ -1,1499 +0,0 @@
- 2016-10-10T14:19:48.077+02:00
- 2017-01-04T10:32:27.651+01:00
- vierter
- Theil.
- vierter Theil.
- vierter Theil.
- 3
- 3
- 3
- geſtalt
- vor
- ihren
- Stifter
- erklaͤret,
- daß
- ſie
- aus
- geſtalt vor ihren Stifter erklaͤret, daß ſie aus
- geſtalt vor ihren Stifter erklaͤret, daß ſie aus
- deſen
- deſen
- deſen
- Buch,
- ſtehen
- ſ.
- 29.
- wo
- es
- heiſet:
- Zweyte
- Buch, ſtehen ſ. 29. wo es heiſet: Zweyte
- Warheit
- Daß
- ich
- die
- Puͤncktlichkeit
- Warheit Daß ich die Puͤncktlichkeit
- meiner
- Lutheriſchen
- Verfaſſung
- ſoweit
- meiner Lutheriſchen Verfaſſung ſoweit
- pouſſiret/
- daß
- ich
- ſogar
- in
- Penſylvanien
- pouſſiret/ daß ich ſogar in Penſylvanien
- alwo
- mein
- Haupt-Augenmerck
- war/
- den
- alwo mein Haupt-Augenmerck war/ den
- Verfall
- der
- Religionen
- und
- aller
- Kirch-
- Verfall der Religionen und aller Kirch-
- lichkeit
- uͤberhaupt,
- zu
- redreſſiren,
- we-
- lichkeit uͤberhaupt, zu redreſſiren, we-
- gen
- der
- diverſen
- Liturgien/
- die
- ernſte
- gen der diverſen Liturgien/ die ernſte
- Verfuͤgung
- getroffen/
- daß
- weder
- ein
- Verfuͤgung getroffen/ daß weder ein
- Reformirter/
- noch
- ein
- Maͤhriſcher
- Bru-
- Reformirter/ noch ein Maͤhriſcher Bru-
- der/
- bey
- unſerer
- dortigen
- Lutheriſchen
- der/ bey unſerer dortigen Lutheriſchen
- Communion
- admittiret
- werde/
- und
- auf
- Communion admittiret werde/ und auf
- dem
- oͤffentlichen
- Confeſſu
- aller
- proteſtan-
- dem oͤffentlichen Confeſſu aller proteſtan-
- tiſchen
- Religionen
- daſelbſt/
- mich
- vor
- tiſchen Religionen daſelbſt/ mich vor
- den
- Lutheriſchen
- teutſchen
- Pfarrer
- von
- den Lutheriſchen teutſchen Pfarrer von
- Philadelphia/
- der
- ich
- war/
- den
- erſten/
- Philadelphia/ der ich war/ den erſten/
- und
- ſo
- lange
- ich
- da
- war/
- ohne
- einige
- Con-
- und ſo lange ich da war/ ohne einige Con-
- currenz
- mit
- einigem
- andern/
- und
- ſonſt
- currenz mit einigem andern/ und ſonſt
- vor
- nichts/
- angegeben
- und
- geriret.
- vor nichts/ angegeben und geriret.
- Buch, ſtehen ſ. 29. wo es heiſet: Zweyte
-Warheit Daß ich die Puͤncktlichkeit
-meiner Lutheriſchen Verfaſſung ſoweit
-pouſſiret/ daß ich ſogar in Penſylvanien
-alwo mein Haupt-Augenmerck war/ den
-Verfall der Religionen und aller Kirch-
-lichkeit uͤberhaupt, zu redreſſiren, we-
-gen der diverſen Liturgien/ die ernſte
-Verfuͤgung getroffen/ daß weder ein
-Reformirter/ noch ein Maͤhriſcher Bru-
-der/ bey unſerer dortigen Lutheriſchen
-Communion admittiret werde/ und auf
-dem oͤffentlichen Confeſſu aller proteſtan-
-tiſchen Religionen daſelbſt/ mich vor
-den Lutheriſchen teutſchen Pfarrer von
-Philadelphia/ der ich war/ den erſten/
-und ſo lange ich da war/ ohne einige Con-
-currenz mit einigem andern/ und ſonſt
-vor nichts/ angegeben und geriret.
- Die
- uͤbrige
- Stellen
- ſind
- ſ.
- 179.
- wo
- Zin-
- Die uͤbrige Stellen ſind ſ. 179. wo Zin-
- zendorf
- gegen
- den
- teutſchen
- Zeitungsſchrei-
- zendorf gegen den teutſchen Zeitungsſchrei-
- ber
- in
- Penſylvanien
- sich
- reget,
- aber
- mit
- ber in Penſylvanien sich reget, aber mit
- keinem
- eintzigen
- Grund
- vertheidiget.
- Jn-
- keinem eintzigen Grund vertheidiget. Jn-
- gleichem
- ſ.
- 109.
- f.
- wo
- er
- ſeiner
- in
- Penſyl-
- gleichem ſ. 109. f. wo er ſeiner in Penſyl-
- vanien
- gehaltenen
- ſieben
- General-Versam-
- vanien gehaltenen ſieben General-Versam-
- lungen
- gedencket,
- auch
- ſogar
- etwas
- von
- den
- lungen gedencket, auch ſogar etwas von den
- Schluͤſſen
- und
- Decreten,
- der
- verſamleten
- Schluͤſſen und Decreten, der verſamleten
- Vaͤter,
- nicht
- weniger
- eine
- lateiniſche
- Rede
- Vaͤter, nicht weniger eine lateiniſche Rede
- Die uͤbrige Stellen ſind ſ. 179. wo Zin-
-zendorf gegen den teutſchen Zeitungsſchrei-
-ber in Penſylvanien sich reget, aber mit
-keinem eintzigen Grund vertheidiget. Jn-
-gleichem ſ. 109. f. wo er ſeiner in Penſyl-
-vanien gehaltenen ſieben General-Versam-
-lungen gedencket, auch ſogar etwas von den
-Schluͤſſen und Decreten, der verſamleten
-Vaͤter, nicht weniger eine lateiniſche Rede
- A
- 2
- A 2
- A 2
- mitthei-
- mitthei-
- mitthei-
diff --git a/tests/data/profiler b/tests/data/profiler
deleted file mode 100755
index 88b2d589..00000000
--- a/tests/data/profiler
+++ /dev/null
@@ -1,5 +0,0 @@
-echo $0 $* > /dev/stderr
-cat< $other1
-cat $pagexmlfile | sed -e 's/ſ/f/g' > $other2
-# add page xml files to align
-pushd $tmpws
-ocrd workspace add \
- -i test01 \
- -m 'application/vnd.prima.page+xml' \
- "$pagexmlfile"
-ocrd workspace add \
- -i test02 \
- -m 'application/vnd.prima.page+xml' \
- "$other1"
-ocrd workspace add \
- -i test03 \
- -m 'application/vnd.prima.page+xml' \
- "$other2"
-# align the three workspaces
-ocrd-cis-align --log-level DEBUG \
- -m $tmpws/mets.xml
-pushd $tmpws
-if [[ ! -f $(ocrd workspace find -G OCR-D-CIS-ALIGN) ]]; then
- echo "cannot find aligned file group workspace"
- exit 1
diff --git a/tests/run_data_test.sh b/tests/run_data_test.bash
similarity index 50%
rename from tests/run_data_test.sh
rename to tests/run_data_test.bash
index 8e9c57ad..4b7c6e59 100644
--- a/tests/run_data_test.sh
+++ b/tests/run_data_test.bash
@@ -1,4 +1,5 @@
+set -e
if [[ ! -f $(ocrd-cis-data -jar) ]] ; then
echo "jar file does not exist";
@@ -9,3 +10,13 @@ if [[ ! -f $(ocrd-cis-data -3gs) ]] ; then
echo "three grams file does not exist";
exit 1
+if [[ ! -f $(ocrd-cis-data -config) ]] ; then
+ echo "config file does not exist";
+ exit 1
+if [[ ! -f $(ocrd-cis-data -model) ]] ; then
+ echo "model file does not exist";
+ exit 1
diff --git a/tests/run_image_preprocessing_test.sh b/tests/run_image_preprocessing_test.bash
similarity index 68%
rename from tests/run_image_preprocessing_test.sh
rename to tests/run_image_preprocessing_test.bash
index 741a9e32..4fd028e4 100644
--- a/tests/run_image_preprocessing_test.sh
+++ b/tests/run_image_preprocessing_test.bash
@@ -1,17 +1,13 @@
-source ocrd-cis-lib.sh
-source $(dirname $0)/test_lib.sh
+set -e
+source $(dirname $0)/test_lib.bash
-mkdir -p "$tmpdir/download"
-pushd "$tmpws"
-ocrd-cis-download-and-add-gt-zip "$url" "$tmpdir/download"
+ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip"
# test if there are 3 gt files
pushd "$tmpws"
-for file in $(ocrd workspace find -G "$OCR_D_CIS_GT_FILEGRP"); do
+for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do
if [[ ! -f "$file" ]]; then
echo "cannot find ground truth file: $file"
exit 1
@@ -24,18 +20,18 @@ if [[ $found_files != 3 ]]; then
-ocrd-cis-ocropy-clip --log-level DEBUG \
- --input-file-grp "$OCR_D_CIS_GT_FILEGRP" \
- --output-file-grp OCR-D-CIS-IMG-CLIP \
- --mets "$tmpws/mets.xml"
ocrd-cis-ocropy-binarize --log-level DEBUG \
- --input-file-grp OCR-D-CIS-IMG-CLIP \
+ --input-file-grp OCR-D-GT-SEG-LINE \
--output-file-grp OCR-D-CIS-IMG-BIN \
--mets "$tmpws/mets.xml"
+ocrd-cis-ocropy-clip --log-level DEBUG \
+ --input-file-grp OCR-D-CIS-IMG-BIN \
+ --output-file-grp OCR-D-CIS-IMG-CLIP \
+ --mets "$tmpws/mets.xml"
ocrd-cis-ocropy-denoise --log-level DEBUG \
- --input-file-grp OCR-D-CIS-IMG-BIN \
+ --input-file-grp OCR-D-CIS-IMG-CLIP \
--output-file-grp OCR-D-CIS-IMG-DEN \
--mets "$tmpws/mets.xml"
@@ -48,3 +44,8 @@ ocrd-cis-ocropy-dewarp --log-level DEBUG \
--input-file-grp OCR-D-CIS-IMG-DES \
--output-file-grp OCR-D-CIS-IMG-DEW \
--mets "$tmpws/mets.xml"
+ocrd-cis-ocropy-segment --log-level DEBUG \
+ --input-file-grp OCR-D-CIS-IMG-DEW \
+ --output-file-grp OCR-D-CIS-IMG-SEG \
+ --mets "$tmpws/mets.xml"
diff --git a/tests/run_ocr_test.sh b/tests/run_ocr_test.bash
similarity index 55%
rename from tests/run_ocr_test.sh
rename to tests/run_ocr_test.bash
index 4c9346ec..6de88a7b 100644
--- a/tests/run_ocr_test.sh
+++ b/tests/run_ocr_test.bash
@@ -1,20 +1,12 @@
-source ocrd-cis-lib.sh
-source $(dirname $0)/test_lib.sh
-mkdir -p "$tmpdir/download"
-pushd "$tmpws"
-ocrd-cis-download-and-add-gt-zip "$url" "$tmpdir/download"
-pushd "$tmpdir/download"
-wget -N "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz"
+set -e
+source $(dirname $0)/test_lib.bash
+ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip
# test if there are 3 gt files
pushd "$tmpws"
-for file in $(ocrd workspace find -G "$OCR_D_CIS_GT_FILEGRP"); do
+for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do
if [[ ! -f "$file" ]]; then
echo "cannot find ground truth file: $file"
exit 1
@@ -27,9 +19,12 @@ if [[ $found_files != 3 ]]; then
+# download ocr model
+wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz"
# run ocr
ocrd-cis-ocropy-recognize --log-level DEBUG \
- --input-file-grp "$OCR_D_CIS_GT_FILEGRP" \
+ --input-file-grp "OCR-D-GT-SEG-LINE" \
--output-file-grp OCR-D-CIS-OCR \
--mets "$tmpws/mets.xml" \
--parameter <(cat < "$tmpdir/bin/profiler.bash" < /dev/null
+echo '{}'
+chmod a+x "$tmpdir/bin/profiler.bash"
+ocrd-cis-postcorrect --log-level DEBUG \
+ -m $tmpws/mets.xml \
+ --parameter <(cat <e.#Säugethiere.#' $f
+ sed -i -e 's#E#Säugethieren#' $f
+mkdir "$tmpdir/bin"
+cat > "$tmpdir/bin/profiler.bash" < /dev/null
+echo '{"Säugethiere":{
+"Candidates": [{
+"Suggestion": "Säugethiere",
+"Modern": "Säugetiere",
+"Dict": "dict_modern_hypothetic_errors",
+"HistPatterns": [{"Left":"t","Right":"th","Pos":5}],
+"OCRPatterns": [],
+"Distance": 0,
+"Weight": 1.0
+chmod a+x "$tmpdir/bin/profiler.bash"
+java -jar $(ocrd-cis-data -jar) -c train \
+ --log-level DEBUG \
+ -m $tmpws/mets.xml \
+ --parameter <(
+cat $(ocrd-cis-data -config) \
+ | sed -e "s#/path/to/profiler#$tmpdir/bin/profiler.bash#" \
+ | sed -e "s#/path/to/trigrams.csv#$(ocrd-cis-data -3gs)#" \
+ | sed -e "s#/path/to/train.dir#$tmpdir/train#"
+if [[ ! -f $tmpdir/train/model.zip ]]; then
+ echo $tmpdir/train/model.zip not found
+ exit 1
diff --git a/tests/run_validation.bash b/tests/run_validation.bash
new file mode 100644
index 00000000..0dc98777
--- /dev/null
+++ b/tests/run_validation.bash
@@ -0,0 +1,6 @@
+set -e
+if ocrd ocrd-tool ocrd-tool.json validate | grep ''; then
+ exit 1
diff --git a/tests/run_wer_test.sh b/tests/run_wer_test.sh
deleted file mode 100644
index d85a4809..00000000
--- a/tests/run_wer_test.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-source $(dirname $0)/test_lib.sh
-pushd "$tmpws"
-ocrd workspace add \
- --file-grp "WERFILEGRP" \
- --mimetype "application/vnd.prima.page+xml" \
- --file-id "WERFILEID" \
- "$werfile"
-ocrd-cis-wer \
- --mets "$tmpws/mets.xml" \
- --output-file-grp "WER" \
- --input-file-grp "WERFILEGRP"
-# tests
-pushd "$tmpws"
-if [[ ! -f $(ocrd workspace find -G "WER") ]]; then
- echo "missing wer file"
- exit 1
-if [[ $(jq '.totalWords' $(ocrd workspace find -G "WER")) != 3 ]]; then
- echo "invalid number of words"
- exit 1
-if [[ $(jq '.incorrectWords' $(ocrd workspace find -G "WER")) != 2 ]]; then
- echo "invalid number of bad words"
- exit 1
-if [[ $(jq '.correctWords' $(ocrd workspace find -G "WER")) != 1 ]]; then
- echo "invalid number of good words"
- exit 1
diff --git a/tests/test_lib.bash b/tests/test_lib.bash
new file mode 100644
index 00000000..5d38f482
--- /dev/null
+++ b/tests/test_lib.bash
@@ -0,0 +1,51 @@
+tmpdir=$(mktemp -d)
+trap "rm -rf $tmpdir" EXIT
+function ocrd_cis_download_bagit() {
+ local url="$data_url/$1"
+ mkdir -p "$tmpdir/download"
+ wget -P "$tmpdir/download" "$url"
+function ocrd_cis_init_ws() {
+ ocrd_cis_download_bagit "$1"
+ ocrd zip spill -d "$tmpdir" "$tmpdir/download/$1"
+ tmpws="$tmpdir/${1%.ocrd.zip}"
+function ocrd_cis_align() {
+ # download ocr models
+ wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz"
+ wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur2-00062000.pyrnn.gz"
+ # run ocr
+ ocrd-cis-ocropy-recognize --log-level DEBUG \
+ --input-file-grp "OCR-D-GT-SEG-LINE" \
+ --output-file-grp OCR-D-CIS-OCR-1 \
+ --mets "$tmpws/mets.xml" \
+ --parameter <(cat <