-
Notifications
You must be signed in to change notification settings - Fork 9
/
run_mrt.sh
executable file
·374 lines (312 loc) · 12.1 KB
/
run_mrt.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/bin/bash
# Marian regression test script. Invocation examples:
# ./run_mrt.sh
# ./run_mrt.sh tests/training/basics
# ./run_mrt.sh tests/training/basics/test_valid_script.sh
# ./run_mrt.sh previous.log
# ./run_mrt.sh '#tag'
# where previous.log contains a list of test files, one per line.
# Environment variables:
# - MARIAN - path to Marian build directory
# - CUDA_VISIBLE_DEVICES - CUDA's variable specifying GPU device IDs
# - NUM_DEVICES - maximum number of GPU devices to be used
# - TIMEOUT - maximum duration for execution of a single test in the format
# accepted by the timeout command; set to 0 to disable
# - MODELS - path to the directory with models, default: ./models
# - DATA - path to the directory with data, default: ./data
# Shell used below to execute the setup, test and teardown scripts
SHELL=/bin/bash
export LC_ALL=C.UTF-8

# Absolute path of the directory that contains this script
export MRT_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
export MRT_TOOLS="$MRT_ROOT/tools"
# Marian build directory: $MARIAN if set and non-empty, otherwise ../build
# relative to this script
export MRT_MARIAN="$( realpath "${MARIAN:-$MRT_ROOT/../build}" )"

RUN_LOGS="$MRT_ROOT/previous.log.tmp" # Logging file for log and logn commands
# Quoted (and protected with --) so that a checkout path containing spaces or
# glob characters cannot be split into several unrelated rm operands
rm -f -- "$RUN_LOGS"
# The run log is written to a temporary file and only moved over previous.log
# when the script exits, so a previous.log passed as an argument can still be
# read during the run.
# Globals: RUN_LOGS (read), MRT_ROOT (read)
# Returns: 1 when the temporary log is missing or empty, else the status of mv
cleanup() {
  [[ -s "$RUN_LOGS" ]] || return 1
  mv "$RUN_LOGS" "$MRT_ROOT/previous.log"
}
trap cleanup EXIT
# Prints a timestamped message plus a newline to stdout and appends the same
# line to the run log.
# Globals:   RUN_LOGS (read) - path of the log file
# Arguments: the message; several arguments are joined with single spaces
function log {
  # "$*" joins the arguments explicitly instead of embedding $@ in a string.
  # RUN_LOGS is quoted so a path with spaces is not split into several files.
  printf '%s\n' "[$(date '+%m/%d/%Y %T')] $*" | tee -a "$RUN_LOGS"
}
# Like log, but without the trailing newline, so that a later loge call can
# finish the line.
# Globals:   RUN_LOGS (read) - path of the log file
# Arguments: the message; several arguments are joined with single spaces
function logn {
  # RUN_LOGS is quoted so a path with spaces is not split into several files
  printf '%s' "[$(date '+%m/%d/%Y %T')] $*" | tee -a "$RUN_LOGS"
}
# Echoes its arguments (no timestamp) to stdout and appends them to the run log.
# Globals:   RUN_LOGS (read) - path of the log file
# Arguments: passed on to echo, so a leading -n suppresses the newline
function loge {
  # $@ is deliberately left unquoted: callers pass -n as an echo option, and
  # word splitting drops leading/repeated whitespace (" OK" is printed as "OK").
  # Only the log path is quoted, so a path with spaces is not split.
  # shellcheck disable=SC2068
  echo $@ | tee -a "$RUN_LOGS"
}
log "Running on $(hostname) as process $$"

# Platform detection: MRT_BIN is the suffix appended to executable names
# (".exe" on Windows, empty on Linux) and MRT_OS names the platform.
# Any other uname value aborts the run.
UNAME=$(uname)
case "$UNAME" in
  Linux)
    log "Running on Linux machine"
    export MRT_BIN=
    export MRT_OS=linux
    ;;
  CYGWIN*|MINGW*)
    log "Running on Windows machine"
    export MRT_BIN=.exe
    export MRT_OS=windows
    ;;
  *)
    log "Unsupported or unrecognized machine with uname= $UNAME"
    exit 1
    ;;
esac
# Resolve and print the folders which contain models and data for regression
# tests; MODELS and DATA override the defaults under MRT_ROOT.
# The inner expansions are quoted so that paths containing spaces reach
# realpath as a single argument.
export MRT_MODELS="$( realpath "${MODELS:-$MRT_ROOT/models}" )"
export MRT_DATA="$( realpath "${DATA:-$MRT_ROOT/data}" )"
log "Using models from: $MRT_MODELS"
log "Using data from: $MRT_DATA"
# Backward compatibility: if the decoder is not directly inside MRT_MARIAN,
# fall back to its build/ subdirectory
[[ -e "$MRT_MARIAN/marian-decoder$MRT_BIN" ]] || MRT_MARIAN="$MRT_MARIAN/build"

# Abort on the first required Marian tool that is missing
for cmd in marian marian-decoder marian-scorer marian-vocab; do
  [ -e "$MRT_MARIAN/$cmd$MRT_BIN" ] && continue
  loge "Error: '$MRT_MARIAN/$cmd$MRT_BIN' not found. Do you need to compile the toolkit first?"
  exit 1
done

# Full paths of the common Marian executables
export MRT_MARIAN_TRAINER="$MRT_MARIAN/marian$MRT_BIN" \
       MRT_MARIAN_DECODER="$MRT_MARIAN/marian-decoder$MRT_BIN" \
       MRT_MARIAN_SCORER="$MRT_MARIAN/marian-scorer$MRT_BIN" \
       MRT_MARIAN_VOCAB="$MRT_MARIAN/marian-vocab$MRT_BIN"
log "Using Marian binary: $MRT_MARIAN_DECODER"
# Log Marian version (the binary path is quoted so spaces in it are preserved)
export MRT_MARIAN_VERSION=$("$MRT_MARIAN_TRAINER" --version 2>&1)
log "Version: $MRT_MARIAN_VERSION"

# Get CMake settings from the --build-info option
if ! grep -q "build-info" < <( "$MRT_MARIAN_TRAINER" --help ); then
  loge "Error: Marian does not have the required --build-info option. Use newer version of Marian"
  exit 1
fi
# The stderr output of --build-info is saved as cmake.log
"$MRT_MARIAN_TRAINER" --build-info all 2> "$MRT_ROOT/cmake.log"
if test ! -s "$MRT_ROOT/cmake.log" || grep -q "Error: build-info is not available" "$MRT_ROOT/cmake.log"; then
  loge "Warning: Marian does not set the required --build-info option. Tests may not work properly"
fi

# Check Marian compilation settings.
# BUILD_TYPE and COMPILER hold the text after '='. Each USE_* variable holds the
# matching cmake.log line(s) when the option is true/on/1 and is empty otherwise.
# grep reads the file directly, and grep -E replaces the obsolescent egrep.
export MRT_MARIAN_BUILD_TYPE=$(grep -i "CMAKE_BUILD_TYPE=" "$MRT_ROOT/cmake.log" | cut -f2 -d=)
export MRT_MARIAN_COMPILER=$(grep -i "CMAKE_CXX_COMPILER=" "$MRT_ROOT/cmake.log" | cut -f2 -d=)
export MRT_MARIAN_USE_CPU=$(grep -Ei "COMPILE_CPU=(true|on|1)" "$MRT_ROOT/cmake.log")
export MRT_MARIAN_USE_CUDA=$(grep -Ei "COMPILE_CUDA=(true|on|1)" "$MRT_ROOT/cmake.log")
export MRT_MARIAN_USE_CUDNN=$(grep -Ei "USE_CUDNN=(true|on|1)" "$MRT_ROOT/cmake.log")
export MRT_MARIAN_USE_SENTENCEPIECE=$(grep -Ei "USE_SENTENCEPIECE=(true|on|1)" "$MRT_ROOT/cmake.log")
export MRT_MARIAN_USE_FBGEMM=$(grep -Ei "USE_FBGEMM=(true|on|1)" "$MRT_ROOT/cmake.log")
export MRT_MARIAN_USE_UNITTESTS=$(grep -Ei "COMPILE_TESTS=(true|on|1)" "$MRT_ROOT/cmake.log")

log "Build type: $MRT_MARIAN_BUILD_TYPE"
log "Using compiler: $MRT_MARIAN_COMPILER"
log "Using CPU: $MRT_MARIAN_USE_CPU"
# The CUDA flag is reported like every other detected setting
log "Using CUDA: $MRT_MARIAN_USE_CUDA"
log "Using CUDNN: $MRT_MARIAN_USE_CUDNN"
log "Using SentencePiece: $MRT_MARIAN_USE_SENTENCEPIECE"
log "Using FBGEMM: $MRT_MARIAN_USE_FBGEMM"
log "Unit tests: $MRT_MARIAN_USE_UNITTESTS"
# Prints the number of entries in a comma-separated ID list such as the value
# of CUDA_VISIBLE_DEVICES.
# Arguments: $1 - comma-separated list of IDs (may be empty or omitted)
# Outputs:   the number of commas in $1 plus one, so an empty list yields 1
function count_listed_devices {
  local ids=${1-}
  # Keep only the commas and count them
  local commas=${ids//[^,]/}
  echo $(( ${#commas} + 1 ))
}

# Number of available devices.
# The commas themselves are counted: `grep -c` counts matching lines, so it can
# never report more than one match for a single-line list.
cuda_num_devices=$(count_listed_devices "$CUDA_VISIBLE_DEVICES")
export MRT_NUM_DEVICES=${NUM_DEVICES:-$cuda_num_devices}
log "Using CUDA visible devices: $CUDA_VISIBLE_DEVICES"
log "Using number of GPU devices: $MRT_NUM_DEVICES"
# CPU capabilities: /proc/cpuinfo is copied to cpuinfo.log only when that file
# does not exist yet; an existing cpuinfo.log is reused as-is
if [[ ! -e "$MRT_ROOT/cpuinfo.log" ]]; then
  cat /proc/cpuinfo > "$MRT_ROOT/cpuinfo.log"
fi

# Each flag is set to "true" only when the instruction set name occurs in the
# snapshot (case-insensitive); otherwise the variable keeps its current value
if grep -qi "avx2" "$MRT_ROOT/cpuinfo.log"; then
  MRT_CPU_AVX2=true
fi
if grep -qi "avx512" "$MRT_ROOT/cpuinfo.log"; then
  MRT_CPU_AVX512=true
fi
if grep -qi "avx512_vnni" "$MRT_ROOT/cpuinfo.log"; then
  MRT_CPU_AVX512VNNI=true
fi
export MRT_CPU_AVX2 MRT_CPU_AVX512 MRT_CPU_AVX512VNNI

log "CPU intrinsics: avx2=$MRT_CPU_AVX2 avx512=$MRT_CPU_AVX512 avx512vnni=$MRT_CPU_AVX512VNNI"
# Per-test time limit: TIMEOUT if set and non-empty, otherwise 5 minutes
# (format as accepted by the timeout command, see `man timeout`)
export MRT_TIMEOUT=${TIMEOUT:-5m}

# Command prefix that enforces the limit; stays empty when the limit is "0"
cmd_timeout=""
[ $MRT_TIMEOUT != "0" ] && cmd_timeout="timeout $MRT_TIMEOUT"

log "Using time out: $MRT_TIMEOUT"
# Exit statuses, exported to the environment of child processes.
# This script itself only checks SUCCESS, SKIP and TIMEOUT.
# NOTE(review): the EXIT_CODE_SKIP_* values are presumably returned by test
# scripts for specific skip reasons - confirm against the tests.
export \
  EXIT_CODE_SUCCESS=0 \
  EXIT_CODE_SKIP=100 \
  EXIT_CODE_SKIP_MISSING_FILE=101 \
  EXIT_CODE_SKIP_NO_FBGEMM=105 \
  EXIT_CODE_SKIP_NO_SENTENCEPIECE=106 \
  EXIT_CODE_SKIP_NO_AVX2=110 \
  EXIT_CODE_SKIP_NO_AVX512=111 \
  EXIT_CODE_TIMEOUT=124 # Exit code returned by the timeout command if timed out
#######################################
# Formats the time elapsed between two timestamps as HH:MM:SS.mmm followed
# by "s", e.g. 01:02:05.500s.
# Arguments: $1 - start time in seconds (fractions allowed, as printed by
#                 `date +%s.%N`)
#            $2 - end time in seconds
# Outputs:   the formatted duration on stdout, without a trailing newline
#######################################
function format_time {
  # The arithmetic is done in a single awk process, so no `python` executable
  # is required and no helper variables leak into the caller's scope.
  # LC_ALL=C keeps the decimal separator a dot.
  # %06.3f zero-pads the seconds to two integer digits.
  LC_ALL=C awk -v t0="$1" -v t1="$2" 'BEGIN {
    dt = t1 - t0
    dh = int(dt / 3600)
    dt -= 3600 * dh
    dm = int(dt / 60)
    ds = dt - 60 * dm
    printf "%02d:%02d:%06.3fs", dh, dm, ds
  }'
}
###############################################################################
# Default directory with all regression tests
test_prefixes=tests
if [ $# -ge 1 ]; then
  test_prefixes=
  for arg in "$@"; do
    # A log file with paths to test files
    if [[ "$arg" = *.log ]]; then
      # Extract tests from the .log file: drop lines starting with "[", keep
      # lines naming a /test_*.sh path that contains no "/_", strip the leading
      # "- " bullet and join everything on one line.
      # The file name is quoted and read by grep directly; a plain BRE is enough
      # for '^\[', so the GNU-only -P switch is not needed.
      args=$(grep -v '^\[' -- "$arg" | grep '/test_.*\.sh' | grep -v '/_' | sed 's/^ *- *//' | tr '\n' ' ' | sed 's/ *$//')
      test_prefixes="$test_prefixes $args"
    # A hash tag
    elif [[ "$arg" = '#'* ]]; then
      # Find all tests with the given hash tag on a "# TAGS:" line
      tag=${arg:1}
      args=$(find tests -name '*test_*.sh' | xargs -I{} grep -H "^ *# *TAGS:.* $tag" {} | cut -f1 -d:)
      test_prefixes="$test_prefixes $args"
    # A test file or directory name
    else
      test_prefixes="$test_prefixes $arg"
    fi
  done
fi

# Check if the variable is empty or contains only spaces
if [[ -z "${test_prefixes// }" ]]; then
  log "Error: no tests found in the specified input(s): $*"
  exit 1
fi
# Extract all subdirectories, which will be traversed to look for regression tests
# (paths containing "/_" are excluded)
test_dirs=$(find $test_prefixes -type d | grep -v "/_" | cat)

# When individual test files were requested, remember their file names and
# traverse only the directories that contain them
if grep -q "/test_.*\.sh" <<< "$test_prefixes"; then
  # 's!.*/!!' strips the directory part. A leading '*' is a literal character
  # in a basic regular expression, so 's!*/!!' would leave the paths unchanged.
  test_files=$(printf '%s\n' $test_prefixes | sed 's!.*/!!')
  test_dirs=$(printf '%s\n' $test_prefixes | xargs -I{} dirname {} | grep -v "/_" | sort | uniq)
fi
###############################################################################
# Overall verdict of the run; flipped to false on any failure
success=true

# Per-outcome tallies
count_all=0 count_failed=0 count_passed=0 count_skipped=0 count_timedout=0

# Paths of the tests that did not pass, grouped by outcome
declare -a tests_failed tests_skipped tests_timedout

# Start of the whole run as printed by `date +%s.%N`
time_start=$(date +%s.%N)
# Traverse test directories.
# For each directory: run setup.sh if present, run every test_*.sh from inside
# the directory, then run teardown.sh if present.
# $test_dirs and $cmd_timeout are intentionally expanded unquoted (word lists).
cd "$MRT_ROOT"
for test_dir in $test_dirs; do
  log "Checking directory: $test_dir"
  nosetup=false

  # Run setup script if exists; its log is kept only when it fails
  if [ -e "$test_dir/setup.sh" ]; then
    log "Running setup script"
    cd "$test_dir"
    $cmd_timeout $SHELL -v setup.sh &> setup.log
    if [ $? -ne 0 ]; then
      log "Warning: setup script returns a non-success exit code"
      success=false
      nosetup=true
    else
      rm setup.log
    fi
    cd "$MRT_ROOT"
  fi

  # Run tests (the glob replaces parsing the output of ls)
  for test_path in "$test_dir"/test_*.sh; do
    # An unmatched glob stays as the literal pattern; skip it
    [[ -e "$test_path" ]] || continue

    test_file=$(basename "$test_path")
    test_name="${test_file%.*}"

    # In non-traverse mode skip tests if not requested
    if [[ -n "$test_files" && $test_files != *"$test_file"* ]]; then
      continue
    fi

    test_time_start=$(date +%s.%N)
    ((++count_all))

    # Tests are executed from their directory
    cd "$test_dir"

    # Skip tests if setup failed
    logn "Running $test_path ... "
    if [ "$nosetup" = true ]; then
      ((++count_skipped))
      tests_skipped+=("$test_path")
      loge " skipped"
      cd "$MRT_ROOT"
      continue
    fi

    # Run test
    # Note: all output gets written to stderr (very very few cases write to stdout)
    $cmd_timeout $SHELL -x "$test_file" 2> "$test_file.log" 1>&2
    exit_code=$?

    # Check exit code
    # NOTE(review): only EXIT_CODE_SKIP counts as skipped here; any other
    # non-zero code, including the EXIT_CODE_SKIP_* values, is reported as
    # failed - confirm this is intended.
    if [ $exit_code -eq $EXIT_CODE_SUCCESS ]; then
      ((++count_passed))
      loge " OK"
    elif [ $exit_code -eq $EXIT_CODE_SKIP ]; then
      ((++count_skipped))
      tests_skipped+=("$test_path")
      loge " skipped"
    elif [ $exit_code -eq $EXIT_CODE_TIMEOUT ]; then
      ((++count_timedout))
      tests_timedout+=("$test_path")
      # Add a comment to the test log file that it timed out. MRT_TIMEOUT is
      # the effective limit; TIMEOUT is empty when the default applies.
      echo "The test timed out after $MRT_TIMEOUT" >> "$test_file.log"
      # A timed out test is a failed test
      ((++count_failed))
      loge " timed out"
      success=false
    else
      ((++count_failed))
      tests_failed+=("$test_path")
      loge " failed"
      success=false
    fi

    # Report time
    test_time_end=$(date +%s.%N)
    test_time=$(format_time $test_time_start $test_time_end)
    log "Test took $test_time"

    cd "$MRT_ROOT"
  done
  cd "$MRT_ROOT"

  # Run teardown script if exists; its log is kept only when it fails
  if [ -e "$test_dir/teardown.sh" ]; then
    log "Running teardown script"
    cd "$test_dir"
    $cmd_timeout $SHELL teardown.sh &> teardown.log
    if [ $? -ne 0 ]; then
      log "Warning: teardown script returns a non-success exit code"
      success=false
    else
      rm teardown.log
    fi
    cd "$MRT_ROOT"
  fi
done

time_end=$(date +%s.%N)
time_total=$(format_time $time_start $time_end)
###############################################################################
# Prints the path of the log file written for a test script: the script's
# resolved path with ".log" appended.
# Appending avoids rewriting an earlier ".sh" that may occur inside a directory
# name.
# Arguments: $1 - path to a test script
function test_log_path {
  echo "$(realpath "$1").log"
}

# Print skipped and failed tests.
# "$array" expands to the first element, so each check below is true as soon
# as the corresponding list has at least one entry.
if [ -n "$tests_skipped" ] || [ -n "$tests_failed" ] || [ -n "$tests_timedout" ]; then
  loge "---------------------"
fi

[[ -z "$tests_skipped" ]] || loge "Skipped:"
for test_name in "${tests_skipped[@]}"; do
  loge "- $test_name"
done

[[ -z "$tests_failed" ]] || loge "Failed:"
for test_name in "${tests_failed[@]}"; do
  loge "- $test_name"
done

[[ -z "$tests_timedout" ]] || loge "Timed out:"
for test_name in "${tests_timedout[@]}"; do
  loge "- $test_name"
done

# Log file locations are echoed to stdout only; they are not written to the
# run log
[[ -z "$tests_failed" ]] || echo "Logs:"
for test_name in "${tests_failed[@]}"; do
  echo "- $(test_log_path "$test_name")"
done
###############################################################################
# Print summary
loge "---------------------"
# The line is assembled first and logged with a single call: loge word-splits
# its arguments, so a separate fragment starting with a space would lose that
# space and be glued to the preceding word.
summary="Ran $count_all tests in $time_total, $count_passed passed, $count_skipped skipped, $count_failed failed"
if [ -n "$tests_timedout" ]; then
  summary+=" (incl. $count_timedout timed out)"
fi
loge "$summary"

# Return exit code: success requires that nothing failed and that at least one
# test was run
if $success && [ $count_all -gt 0 ]; then
  loge "OK"
  exit 0
else
  loge "FAILED"
  exit 1
fi