This repository has been archived by the owner on Apr 8, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Taskfile.yml
196 lines (179 loc) · 6.8 KB
/
Taskfile.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# https://github.com/opencultureconsulting/openrefine-task-runner
version: '3'
includes:
bielefeld: bielefeld
muenster: muenster
siegen: siegen
wuppertal: wuppertal
silent: true
output: prefixed
vars:
DATE: '{{ now | date "2006-01-02"}}'
env:
OPENREFINE:
sh: readlink -m .openrefine/refine
CLIENT:
sh: readlink -m .openrefine/client
tasks:
default:
desc: execute all projects in parallel
deps:
- task: bielefeld:main
- task: muenster:main
- task: siegen:main
- task: wuppertal:main
install:
desc: (re)install OpenRefine and openrefine-client into subdirectory .openrefine
cmds:
- | # delete existing install and recreate folder
rm -rf .openrefine
mkdir -p .openrefine
- > # download OpenRefine archive
wget --no-verbose -O openrefine.tar.gz
https://github.com/OpenRefine/OpenRefine/releases/download/3.4.1/openrefine-linux-3.4.1.tar.gz
- | # install OpenRefine into subdirectory .openrefine
tar -xzf openrefine.tar.gz -C .openrefine --strip 1
rm openrefine.tar.gz
- | # optimize OpenRefine for batch processing
sed -i 's/cd `dirname $0`/cd "$(dirname "$0")"/' ".openrefine/refine" # fix path issue in OpenRefine startup file
sed -i '$ a JAVA_OPTIONS=-Drefine.headless=true' ".openrefine/refine.ini" # do not try to open OpenRefine in browser
sed -i 's/#REFINE_AUTOSAVE_PERIOD=60/REFINE_AUTOSAVE_PERIOD=1440/' ".openrefine/refine.ini" # set autosave period from 5 minutes to 25 hours
- > # download openrefine-client into subdirectory .openrefine
wget --no-verbose -O .openrefine/client
https://github.com/opencultureconsulting/openrefine-client/releases/download/v0.3.10/openrefine-client_0-3-10_linux
- chmod +x .openrefine/client # make client executable
start:
dir: ./{{.PROJECT}}/refine
cmds:
- | # verify that OpenRefine is installed
if [ ! -f "$OPENREFINE" ]; then
echo 1>&2 "OpenRefine missing; try task install"; exit 1
fi
- | # delete temporary files and log file of previous run
rm -rf ./*.project* workspace.json
rm -rf "{{.PROJECT}}.log"
- > # launch OpenRefine with specific data directory and redirect its output to a log file
"$OPENREFINE" -v warn -p {{.PORT}} -m {{.RAM}}
-d ../{{.PROJECT}}/refine
>> "{{.PROJECT}}.log" 2>&1 &
- | # wait until OpenRefine API is available
timeout 30s bash -c "until
wget -q -O - http://localhost:{{.PORT}} | cat | grep -q -o OpenRefine
do sleep 1
done"
stop:
dir: ./{{.PROJECT}}/refine
cmds:
- | # shut down OpenRefine gracefully
PID=$(lsof -t -i:{{.PORT}})
kill $PID
while ps -p $PID > /dev/null; do sleep 1; done
- > # archive the OpenRefine project
tar cfz
"{{.PROJECT}}.openrefine.tar.gz"
-C $(grep -l "{{.PROJECT}}" *.project/metadata.json | cut -d '/' -f 1)
.
- rm -rf ./*.project* workspace.json # delete temporary files
kill:
dir: ./{{.PROJECT}}/refine
cmds:
- | # shut down OpenRefine immediately to save time and disk space
PID=$(lsof -t -i:{{.PORT}})
kill -9 $PID
while ps -p $PID > /dev/null; do sleep 1; done
- rm -rf ./*.project* workspace.json # delete temporary files
check:
dir: ./{{.PROJECT}}/refine
cmds:
- | # find log file(s) and check for "exception" or "error"
if grep -i 'exception\|error' $(find . -name '*.log'); then
echo 1>&2 "log contains warnings!"; exit 1
fi
- | # Prüfen, ob Mindestanzahl von Datensätzen generiert wurde
if (( {{.MINIMUM}} > $(grep -c recordIdentifier {{.PROJECT}}.txt) )); then
echo 1>&2 "Unerwartet geringe Anzahl an Datensätzen in $PWD/{{.PROJECT}}.txt!"; exit 1
fi
split:
label: '{{.TASK}}-{{.PROJECT}}'
dir: ./{{.PROJECT}}/split
cmds:
- test -n "{{.PROJECT}}"
# in Einzeldateien aufteilen
- csplit -s -z ../refine/{{.PROJECT}}.txt '/<mets:mets /' "{*}"
# ggf. vorhandene XML-Dateien löschen
- rm -f *.xml
# Identifier als Dateinamen
- for f in xx*; do mv "$f" "$(xmllint --xpath "//*[local-name(.) = 'recordIdentifier']/text()" "$f").xml"; done
sources:
- ../refine/{{.PROJECT}}.txt
generates:
- ./*.xml
validate:
label: '{{.TASK}}-{{.PROJECT}}'
dir: ./{{.PROJECT}}/validate
cmds:
- test -n "{{.PROJECT}}"
# Validierung gegen METS Schema
- wget -q -nc https://www.loc.gov/standards/mets/mets.xsd
- xmllint --schema mets.xsd --noout ../split/*.xml > validate.log 2>&1
sources:
- ../split/*.xml
generates:
- validate.log
zip:
label: '{{.TASK}}-{{.PROJECT}}'
dir: ./{{.PROJECT}}/zip
cmds:
- test -n "{{.PROJECT}}"
# ZIP-Archiv mit Zeitstempel erstellen
- zip -q -FS -j {{.PROJECT}}_{{.DATE}}.zip ../split/*.xml
sources:
- ../split/*.xml
generates:
- '{{.PROJECT}}_{{.DATE}}.zip'
diff:
label: '{{.TASK}}-{{.PROJECT}}'
dir: ./{{.PROJECT}}
cmds:
- test -n "{{.PROJECT}}"
# Inhalt der beiden letzten ZIP-Archive vergleichen
- if test -n "$(ls -t zip/*.zip | sed -n 2p)"; then unzip -q -d old $(ls -t zip/*.zip | sed -n 2p); unzip -q -d new $(ls -t zip/*.zip | sed -n 1p); fi
- diff -d old new > diff.log || exit 0
- rm -rf old new
# Diff prüfen, ob es weniger als 500 Zeilen enthält
- if (( 500 < $(wc -l <diff.log) )); then echo 1>&2 "Unerwartet große Änderungen in $PWD/diff.log!" && exit 1; fi
# Diff archivieren
- cp diff.log zip/{{.PROJECT}}_{{.DATE}}.diff
sources:
- split/*.xml
generates:
- diff.log
linkcheck:
label: '{{.TASK}}-{{.PROJECT}}'
dir: ./{{.PROJECT}}
cmds:
- test -n "{{.PROJECT}}"
# Links extrahieren
- grep -o 'href="[^"]*"' split/*.xml | sed 's/:href=/\t/' | tr -d '"' | sort -k 2 --unique > links.txt
# http status code ermitteln
- awk '{ print "url = " $2 "\noutput = /dev/null"; }' links.txt > curl.cfg
- curl --silent --head --location --write-out "%{http_code}\t%{url_effective}\n" --config curl.cfg > curl.log
# Tabelle mit status code, effektiver URL, Dateiname und start URL erstellen
- paste curl.log links.txt > linkcheck.log
- rm -rf curl.cfg curl.log links.txt
# Logdatei auf status code != 2XX prüfen
- if grep '^[^2]' linkcheck.log; then echo 1>&2 "Logdatei $PWD/linkcheck.log enthält problematische status codes!" && exit 1; fi
sources:
- split/*.xml
generates:
- linkcheck.log
delete:
label: '{{.TASK}}-{{.PROJECT}}'
dir: ./{{.PROJECT}}
cmds:
- test -n "{{.PROJECT}}"
- rm -rf harvest
- rm -rf refine
- rm -rf split
- rm -rf validate
- rm -f diff.log