Merge branch 'fix/validation' into dev
finkf committed May 13, 2020
2 parents fe129fe + efa27ab commit 34201b6
Showing 48 changed files with 712 additions and 3,482 deletions.
72 changes: 38 additions & 34 deletions Dockerfile
@@ -1,54 +1,58 @@
FROM ocrd/core:latest
ENV VERSION="Mi 9. Okt 13:26:16 CEST 2019"
FROM ocrd/core:latest AS base
ENV VERSION="Di 12. Mai 13:26:35 CEST 2020"
ENV GITURL="https://github.com/cisocrgroup"
ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf"
ENV DATA="/apps/ocrd-cis-post-correction"

# deps
COPY data/docker/deps.txt ${DATA}/deps.txt
RUN apt-get update \
&& apt-get -y install --no-install-recommends $(cat ${DATA}/deps.txt)
&& apt-get -y install --no-install-recommends locales

# locales
RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \
&& dpkg-reconfigure --frontend=noninteractive locales \
&& update-locale LANG=en_US.UTF-8

# install the profiler
RUN git clone ${GITURL}/Profiler --branch devel --single-branch /tmp/profiler \
&& cd /tmp/profiler \
&& mkdir build \
&& cd build \
&& cmake -DCMAKE_BUILD_TYPE=release .. \
&& make compileFBDic trainFrequencyList profiler \
&& cp bin/compileFBDic bin/trainFrequencyList bin/profiler /apps/ \
FROM base AS profiler
RUN apt-get update \
&& apt-get -y install --no-install-recommends cmake g++ libcppunit-dev libxerces-c-dev \
&& git clone ${GITURL}/Profiler --branch devel --single-branch /build \
&& cd /build \
&& cmake -DCMAKE_BUILD_TYPE=release . \
&& make compileFBDic trainFrequencyList runDictSearch profiler \
&& mkdir /apps \
&& cp bin/compileFBDic bin/trainFrequencyList bin/profiler bin/runDictSearch /apps/ \
&& cd / \
&& rm -rf /tmp/profiler
&& rm -rf /build

FROM profiler AS languagemodel
# install the profiler's language backend
RUN git clone ${GITURL}/Resources --branch master --single-branch /tmp/resources \
&& cd /tmp/resources/lexica \
&& make FBDIC=/apps/compileFBDic TRAIN=/apps/trainFrequencyList \
&& mkdir -p /${DATA}/languages \
&& cp -r german latin greek german.ini latin.ini greek.ini /${DATA}/languages \
COPY --from=profiler /apps/compileFBDic /apps/
COPY --from=profiler /apps/trainFrequencyList /apps/
COPY --from=profiler /apps/runDictSearch /apps/
RUN apt-get update \
&& apt-get -y install --no-install-recommends icu-devtools \
&& git clone ${GITURL}/Resources --branch master --single-branch /build \
&& cd /build/lexica \
&& PATH=$PATH:/apps make \
&& PATH=$PATH:/apps make test \
&& PATH=$PATH:/apps make install \
&& cd / \
&& rm -rf /tmp/resources
&& rm -rf /build

FROM base AS postcorrection
# install ocrd_cis (python)
COPY Manifest.in Makefile setup.py ocrd-tool.json /tmp/build/
COPY ocrd_cis/ /tmp/build/ocrd_cis/
COPY bashlib/ /tmp/build/bashlib/
# COPY . /tmp/ocrd_cis
RUN cd /tmp/build \
VOLUME ["/data"]
COPY --from=languagemodel /etc/profiler/languages /etc/profiler/languages
COPY --from=profiler /apps/profiler /apps/
COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linux-gnu/
COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/
COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/
COPY . /build
RUN apt-get update \
&& apt-get -y install --no-install-recommends gcc wget default-jre-headless \
&& cd /build \
&& make install \
&& make test \
&& cd / \
&& rm -rf /tmp/build

# download OCR models and pre-trained post-correction model
RUN mkdir /apps/models \
&& cd /apps/models \
&& wget ${DOWNLOAD_URL}/model.zip >/dev/null 2>&1 \
&& wget ${DOWNLOAD_URL}/fraktur1-00085000.pyrnn.gz >/dev/null 2>&1 \
&& wget ${DOWNLOAD_URL}/fraktur2-00062000.pyrnn.gz >/dev/null 2>&1

VOLUME ["/data"]
&& rm -rf /build
11 changes: 7 additions & 4 deletions Makefile
@@ -1,21 +1,24 @@
PY ?= python3
PIP ?= pip3
V ?= > /dev/null 2>&1
PKG = ocrd_cis

install:
${PIP} install --upgrade pip .
install-devel:
${PIP} install --upgrade pip -e .
uninstall:
${PIP} uninstall ${PKG}

docker-build: Dockerfile
docker build -t flobar/ocrd_cis:latest .
docker-push: docker-build
docker push flobar/ocrd_cis:latest

TEST_SCRIPTS=$(wildcard tests/run_*.sh)
TEST_SCRIPTS=$(sort $(wildcard tests/run_*.bash))
.PHONY: $(TEST_SCRIPTS)
$(TEST_SCRIPTS):
bash $@
# run test scripts
bash $@ $V
test: $(TEST_SCRIPTS)

echo $^
.PHONY: install test
106 changes: 62 additions & 44 deletions README.md
@@ -32,23 +32,39 @@ It is possible to install `ocrd_cis` in a custom directory using
deactivate
```

## Profiler
The post correction depends on the language
[profiler](https://github.com/cisocrgroup/Profiler) and its language
configurations to generate corrections for suspicious words. In order
to use the post correction, a profiler with the corresponding language
configurations has to be present on the system. You can refer to our
[manuals](https://github.com/cisocrgroup/Resources/tree/master/manuals)
and our [lexical
resources](https://github.com/cisocrgroup/Resources/tree/master/lexica)
for more information.

If you use Docker, you can use the preinstalled profiler from within
the container. The profiler is installed to `/apps/profiler`, and the
language configurations lie in `/etc/profiler/languages` in the
container image.

## Usage
Most tools follow the [OCR-D cli
conventions](https://ocr-d.github.io/cli). They accept the
`--input-file-grp`, `--output-file-grp`, `--parameter`, `--mets`,
`--log-level` command line arguments (short and long). Some of the
tools (most notably the alignment tool) expect a comma separated list
of multiple input file groups.

The [ocrd-tool.json](ocrd_cis/ocrd-tool.json) contains a schema
description of the parameter config file for the different tools that
accept the `--parameter` argument.

### ocrd-cis-postcorrect
This command runs the post correction using a pre-trained model. If
additional support OCRs are to be used, models for these OCR steps are
required, and the support OCR steps must be executed and aligned
beforehand (see [the test
script](tests/run_postcorrection_test.bash) for an example).

Arguments:
* `--parameter` path to configuration file
@@ -57,6 +73,20 @@ Arguments:
* `--log-level` set log level
* `--mets` path to METS file in workspace

As mentioned above, in order to use the post correction with input
from multiple OCRs, some preprocessing steps are needed: first, the
additional OCR recognition has to be done, and second, the multiple
OCRs have to be aligned (you can also take a look at the function
`ocrd_cis_align` in the [tests](tests/test_lib.bash)). Assuming an
original recognition as file group `OCR1` on the segmented document of
file group `SEG`, the following commands can be used:

```sh
ocrd-ocropus-recognize -I SEG -O OCR2 ... # additional OCR
ocrd-cis-align -I OCR1,OCR2 -O ALGN ... # align OCR1 and OCR2
ocrd-cis-postcorrect -I ALGN -O PC ... # post correction
```

### ocrd-cis-align
Aligns tokens of multiple input file groups to one output file group.
This tool is used to align the master OCR with any additional support
@@ -66,41 +96,26 @@ it aligns in order.
Arguments:
* `--parameter` path to configuration file
* `--input-file-grp` comma separated list of the input file groups;
  the first input file group is the master OCR; if there is a ground
  truth (for evaluation) it must be the last file group in the list
* `--output-file-grp` name of the file group for the aligned result
* `--log-level` set log level
* `--mets` path to METS file in workspace
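
A minimal invocation might look as follows; the file group names here
(`OCR1`, `OCR2`, `GT`, `ALGN`) are placeholders for this sketch:

```shell
# Align the master OCR (OCR1) with a support OCR (OCR2);
# a ground-truth group (GT), if present, must come last.
ocrd-cis-align \
  --mets mets.xml \
  --input-file-grp OCR1,OCR2,GT \
  --output-file-grp ALGN \
  --log-level INFO
```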

### ocrd-cis-train.sh
Script to train a model from a list of ground-truth archives (see
ocrd-tool.json) for the post correction. The tool somewhat mimics the
behaviour of other ocrd tools:
* `--mets` for the workspace
* `--log-level` is passed to other tools
* `--parameter` is used as configuration
* `--output-file-grp` defines the output file group for the model

### ocrd-cis-data
Helper tool to get the path of the installed data files. Usage:
`ocrd-cis-data [-jar|-3gs]` to get the path of the jar library or the
path to the default 3-grams language model file.

### ocrd-cis-wer
Helper tool to calculate the word error rate of aligned OCR files. It
writes a simple JSON-formatted stats file to the given output file group.

Arguments:
* `--input-file-grp` input file group of aligned ocr results with
their respective ground truth.
* `--output-file-grp` name of the file group for the stats file
* `--log-level` set log level
* `--mets` path to METS file in workspace
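
For example (the file group names `ALGN` and `WER` are illustrative),
the stats for an aligned file group could be computed with:

```shell
# Compute the word error rate over aligned OCR/ground-truth results
# and write the JSON stats file to the WER file group.
ocrd-cis-wer \
  --mets mets.xml \
  --input-file-grp ALGN \
  --output-file-grp WER \
  --log-level INFO
```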

### ocrd-cis-profile
Runs the profiler over the files of the given input file group and
adds a gzipped JSON-formatted profile to the output file group of the
workspace. This tool requires an installed [language
profiler](https://github.com/cisocrgroup/Profiler).
`ocrd-cis-data [-h|-jar|-3gs|-model|-config]` to get the path of the
jar library, the pre-trained post-correction model, the default
3-grams language model file, or the default training configuration
file. This tool does not follow the OCR-D conventions.
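
Each flag prints a single plain path, so the tool composes well with
command substitution (as in the training example below); for example:

```shell
ocrd-cis-data -jar     # path of the jar library
ocrd-cis-data -model   # path of the pre-trained post-correction model
ocrd-cis-data -3gs     # path of the default 3-grams language model
ocrd-cis-data -config  # path of the default training configuration
```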

### Training
There is no dedicated training script provided. Models are trained
using the java implementation directly (check out the [training test
script](tests/run_training_test.bash) for an example). Training a
model requires a workspace containing one or more file groups
consisting of aligned OCR and ground-truth documents (the last file
group has to be the ground truth).

Arguments:
* `--parameter` path to configuration file
@@ -114,11 +129,14 @@ Arguments:
The `ocropy-train` tool can be used to train LSTM models.
It takes ground truth from the workspace and saves (image+text) snippets from the corresponding pages.
Then a model is trained on all snippets for 1 million (or the given number of) randomized iterations from the parameter file.

```sh
ocrd-cis-ocropy-train \
--input-file-grp OCR-D-GT-SEG-LINE \
--mets mets.xml
--parameter file:///path/to/config.json
java -jar $(ocrd-cis-data -jar) \
-c train \
--input-file-grp OCR1,OCR2,GT \
--log-level DEBUG \
-m mets.xml \
--parameter $(ocrd-cis-data -config)
```

### ocrd-cis-ocropy-clip
@@ -228,9 +246,8 @@ pip install .
```

Download and move tesseract models from:
https://github.com/tesseract-ocr/tesseract/wiki/Data-Files or use your
own models and place them into: /usr/share/tesseract-ocr/4.00/tessdata

## Workflow configuration

Expand All @@ -256,6 +273,7 @@ If GT is used, steps 1, 5 and 8 can be omitted. Else if a segmentation is used i
To run a few basic tests type `make test` (`ocrd_cis` has to be
installed in order to run any tests).
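
Since each test script is its own Makefile target, a single test can
also be run on its own; the `V` variable silences test output by
default and can be overridden to show it:

```shell
make test                                # run all test scripts
make tests/run_postcorrection_test.bash  # run one test script
make test V=                             # run all tests with visible output
```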

# Miscellaneous
## OCR-D workspace

* Create a new (empty) workspace: `ocrd workspace init workspace-dir`
28 changes: 0 additions & 28 deletions bashlib/ocrd-cis-eval-all.sh

This file was deleted.

62 changes: 0 additions & 62 deletions bashlib/ocrd-cis-eval-ocrd-self.sh

This file was deleted.
