-
Notifications
You must be signed in to change notification settings - Fork 2
/
pipeline.sh
109 lines (95 loc) · 2.51 KB
/
pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/bin/bash
#
# Each model directory is expected to contain a translate.sh script that looks something like this:
#
# #!/bin/bash
# set -euo pipefail
#
# cd $(dirname $0)
#
# $BPE_BIN \
# -c deen.bpe \
# | $MARIAN_BIN "$@" \
# -d $GPU \
# --vocabs train.bpe.de.json train.bpe.en.json \
# -m model.NMT104080k1060k1080k.npz \
# | sed -r 's/\@\@ //g' \
# | sed -r 's/\@\@$//g'
#
# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Number of jobs passed to `parallel -j` for the bleualign_cpp step.
# May be overridden from the environment; defaults to 4.
THREADS=${THREADS:-4}

# Directory containing this script, with symlinks resolved by realpath.
# Both command substitutions are quoted so that a checkout path containing
# whitespace or glob characters is not split into several words.
ROOT=$(dirname "$(realpath "$0")")

# Add the local bin to the path. This contains marian-decoder,
# binaries from https://github.com/jelmervdl/bitextor/tree/doctools
# bleualign_cpp from https://github.com/jelmervdl/bleualign-cpp/tree/processed-text-2
# and a self-contained jar with classbase.jar.
export PATH="$ROOT/bin:$PATH"

# Note: subword-nmt is not installed in bin, you'll have to
# pip3 install it yourself at the moment.
# Both variables are exported because the per-model translate.sh scripts
# (see the sample in the header comment) read them from the environment.
export BPE_BIN="subword-nmt apply-bpe"
export MARIAN_BIN=marian-decoder
# Positional arguments: <language> <gpu> [file.tab ...]
# Check the argument count up front so a missing argument produces a usage
# message rather than the bare "unbound variable" error raised by `set -u`.
if (( $# < 2 )); then
  echo "Usage: $0 <language> <gpu> [file.tab ...]" 1>&2
  exit 1
fi

# Both values are exported for child processes; the sample translate.sh in
# the header comment passes $GPU to the decoder's -d option.
export SLANG=$1
export GPU=$2
# Whatever remains in "$@" after this shift is the list of input files.
shift 2

# Select the translate.sh wrapper of the model for the requested language.
case "$SLANG" in
  de)
    MODEL="$ROOT/DC01311513/nmt/v1/translate.sh"
    ;;
  fr)
    MODEL="$ROOT/DC01311514/nmt/v1/translate.sh"
    ;;
  dummy)
    # Uses the debug model, but everything downstream sees SLANG=de.
    SLANG=de
    MODEL="$ROOT/debug-model/translate.sh"
    ;;
  *)
    echo "Supported languages: de, fr" 1>&2
    exit 1
    ;;
esac
# Print the selected tab-separated field(s) of every line on stdin.
# Arguments: $1 - field list in cut(1) -f syntax, e.g. 2 or 1,3
# Outputs:   the selected fields on stdout, still tab-separated
col() {
  # The field list is quoted so it reaches cut as one word regardless of
  # the current IFS or of glob characters in the argument.
  cut -d $'\t' -f "$1"
}
document_to_base64() {
  # Stage 1 (awk): re-emit the input with a newline only *between* lines, so
  # the last line has no trailing newline and no empty document appears at
  # the end.
  # Stage 2 (sed): append a NUL byte to every input line to mark the end of
  # that document, then turn <br/> and </p><p> into newlines inside the
  # document, and finally drop any remaining <p> and </p> tags.
  # Stage 3 (docenc -0): hand the NUL-delimited documents to docenc, which
  # groups them into base64 encoded chunks.
  awk 'BEGIN { ORS = "" } NR > 1 { print "\n" } { print }' \
    | sed -r \
      -e 's/$/\x0/g' \
      -e 's/<br\/>|<\/p><p>/\n/g' \
      -e 's/<\/?p>//g' \
    | docenc -0
}
# Run the classbase.jar tool over the stream on stdin by way of b64filter.
# Globals:   ROOT (read) - locates bin/classbase.jar and
#            resource/classbase.json
# Arguments: $1 - language code; upper-cased and passed to the jar as -L
# NOTE(review): b64filter presumably feeds each base64 document through the
# wrapped command and re-encodes the result - confirm against its docs.
preprocess() {
  # Paths and the language code are quoted so that an install directory
  # containing whitespace still reaches java as single arguments.
  b64filter java -jar "$ROOT/bin/classbase.jar" \
    -C "$ROOT/resource/classbase.json" \
    -I /dev/stdin \
    -L "${1^^}" \
    -S
}
# Pipe the stream on stdin through the selected model's translate.sh,
# wrapped in b64filter and foldfilter (-w 1000).
# Globals:   MODEL (read) - path to the translate.sh script to run
# NOTE(review): foldfilter presumably wraps over-long lines before they reach
# the model and restores them afterwards - confirm against its docs.
translate() {
  # MODEL is quoted so a path containing whitespace stays a single argument.
  b64filter foldfilter -w 1000 -- "$MODEL" --quiet-translation
}
# Process every input file named on the command line. "$@" (rather than an
# unquoted $*) keeps file names with whitespace or glob characters intact.
#
# For each file, paste builds a six-column table:
#   1-2: columns 1 and 3 of the input, passed through unchanged
#   3-4: columns 2 and 4 of the input, converted by document_to_base64
#   5:   column 2, converted, preprocessed as $SLANG and translated
#   6:   column 4, converted and preprocessed as English
# A gzipped copy of that table is kept as <name>-bleualign-input.tab.gz,
# while the table itself is fed one line per job to bleualign_cpp through
# GNU parallel (-k keeps output in input order); the aligned output is
# gzipped into <name>-aligned.gz. Both files land in the current directory.
for file in "$@"; do
  # Output prefix: the input file name without directory and .tab suffix.
  out_prefix=$(basename "$file" .tab)

  paste \
    <(col 1 < "$file") \
    <(col 3 < "$file") \
    <(col 2 < "$file" | document_to_base64) \
    <(col 4 < "$file" | document_to_base64) \
    <(col 2 < "$file" | document_to_base64 | preprocess "$SLANG" | translate) \
    <(col 4 < "$file" | document_to_base64 | preprocess en) \
    | tee >(gzip -9c > "${out_prefix}-bleualign-input.tab.gz") \
    | parallel \
      --halt 2 \
      --pipe \
      -k \
      -l 1 \
      -j "$THREADS" \
      bleualign_cpp --bleu-threshold 0.2 \
    | gzip \
    > "${out_prefix}-aligned.gz"
done