-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathtrain_pbs.sh
executable file
·117 lines (97 loc) · 3.47 KB
/
train_pbs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/bin/bash
# Copyright (c) 2021, Hitachi America Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage: train_pbs.sh GROUP_ID CONF MULTI_GPU [LOG_DIR]
#   GROUP_ID   passed to `qsub -g`
#   CONF       passed to train.py as its first argument
#   MULTI_GPU  1 = launch via torch.distributed.launch, 0 = plain `python train.py`
#   LOG_DIR    optional; defaults to <script dir>/work/log_<mmdd-HHMMSS>
set -eu

# Resolve the real location of this script (following symlinks) and work from
# there, so the git commands and relative paths below refer to this checkout.
script_dir="$(dirname "$(readlink -f "$0")")"
cd "$script_dir"

# Timestamp used in the default log directory name and in the job name.
date_id="$(date "+%m%d-%H%M%S")"

if [ "$#" -eq 3 ]; then
  logdir="$script_dir/work/log_$date_id"
elif [ "$#" -eq 4 ]; then
  # NOTE(review): because of the cd above, a relative LOG_DIR is resolved
  # against $script_dir, not the caller's directory - confirm this is intended.
  logdir="$4"
else
  # Wrong argument count: stop here instead of continuing with logdir unset.
  echo "usage: train_pbs.sh GROUP_ID CONF MULTI_GPU [LOG_DIR]" >&2
  exit 1
fi
# Choose the command the batch job will run (training, then test.py on
# ./data/test.json) and the resource string used for the job's `-l` request.
#
# COMMAND is later used as a sed replacement string. Inside double quotes
# `\\&` is stored as `\&`, which sed emits as a literal `&`, so the generated
# job script ends up containing a plain `&&`.
#
# stderr of `[` is silenced so a non-numeric MULTI_GPU only produces the
# explanatory message below rather than "integer expression expected".
if [ "$3" -eq 1 ] 2>/dev/null; then
  COMMAND="(python -m torch.distributed.launch --nproc_per_node=4 train.py $2 $logdir \\&\\& python test.py $logdir ./data/test.json $logdir/test_)"
  RESOURCE="rt_G.large=1"
elif [ "$3" -eq 0 ] 2>/dev/null; then
  COMMAND="(python train.py $2 $logdir \\&\\& python test.py $logdir ./data/test.json $logdir/test_)"
  RESOURCE="rt_G.small=1"
else
  # Abort: without this exit the script would go on with COMMAND/RESOURCE unset.
  echo "MULTI_GPU must be either 1 (multi-gpu training) or 0 (single GPU training)." >&2
  exit 1
fi
# Stage a snapshot of the project under $logdir/tmp/files so the job runs
# against the code as it was at submission time. work/ and .git/ are not
# copied.
tmppath="${logdir}/tmp"
mkdir -p "$tmppath"
echo "Copying all files from $script_dir/ to $tmppath/files"
rsync -ar --exclude='work/' --exclude='.git/' "$script_dir/" "$tmppath/files/"
# ln -s `realpath --relative-to=$tmppath/files/ ${script_dir}/data` $tmppath/files/data

# work/ was excluded from the copy; make it reachable from the snapshot through
# a relative symlink. The target is computed in its own assignment so that a
# realpath failure aborts the script under `set -e`.
work_link_target="$(realpath --relative-to="$tmppath/files/" "${script_dir}/work")"
# -f/-n: when LOG_DIR is reused and an old link is still present, replace it
# instead of following it and creating a stray link inside work/.
ln -sfn "$work_link_target" "$tmppath/files/work"

# Status marker; the generated job script renames it to running and then to
# succeeded, or creates failed on error.
touch "${logdir}/waiting"
# Start screen.log with a record of the source state being submitted:
# uncommitted diff, short status, dispatch timestamp, commit hash and the
# project directory.
{
  printf '\n\n##### Git diff ##############\n'
  git --no-pager diff
  printf '\n\n##### Git status ############\n'
  git status -s
  printf '\n\n##### Other info ############\n'
  # Values are passed as arguments, not as part of the format string, so a
  # `%` or backslash in them is printed literally.
  printf 'disptach date: %s\n' "$date_id"
  printf 'git hash: %s\n' "$(git rev-parse HEAD)"
  printf 'project dir: %s\n' "$script_dir"
} > "$logdir/screen.log" 2>&1
# The file redirection must come before 2>&1; in the opposite order stderr
# stays on the terminal and never reaches the log.
# Generate the batch job script at $tmppath/command.pbs.
#
# The quoted here-document below is a literal template (nothing in it is
# expanded by this script). sed then fills in the placeholders for the command,
# log directory, temp directory, resource string and scheduler log path.
#
# The values are inserted verbatim as sed replacement text with `|` as the
# delimiter, so a `|`, `&` or backslash inside a path would corrupt the
# substitution (COMMAND already carries its own `\&` escapes).
#
# The generated job moves the status marker in the log directory from waiting
# to running, then to succeeded; the ERR trap creates failed instead.
cat <<'__EOF__' |
#!/bin/bash
#$ -l __RESOURCE__
#$ -l h_rt=48:00:00
#$ -j y
#$ -o __PBS_LOG_PATH__
source /etc/profile.d/modules.sh
module load cuda/10.2/10.2.89 cudnn/7.6/7.6.5 nccl/2.6/2.6.4-1
export PYENV_VIRTUALENV_DISABLE_PROMPT=1
export PYENV_ROOT="$HOME/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init -)"
eval "$(pyenv virtualenv-init -)"
# Strict mode is switched on only after the environment setup above.
set -eu
cd "__TMPDIR__/files"
export PYTHONPATH="$(pwd)"
logdir="__LOGDIR__"
logfile="$logdir/screen.log"
# On any failing command: record the failure and drop the running marker.
trap 'touch "${logdir}/failed"; rm -f "${logdir}/running"' ERR
mv "${logdir}/waiting" "${logdir}/running"
# Append both stdout and stderr to the log file; the file redirection has to
# come before 2>&1 for stderr to follow it.
{
  printf 'temp workdir: %s\n' "$(pwd)"
  printf 'JOB_ID: %s\n' "$JOB_ID"
  printf 'PE_HOSTFILE: %s\n' "$PE_HOSTFILE"
  printf 'started date: %s\n' "$(date '+%m%d-%H%M%S')"
  printf 'HOST: %s\n' "$(hostname)"
  printf 'USER: %s\n' "$USER"
  printf 'log file: %s\n' "$logfile"
  printf '%s' "COMMAND: __COMMAND__"
  printf '\n\n\n\n\n\n\n-----------------------------\n\n'
} >> "$logfile" 2>&1
__COMMAND__ >> "$logfile" 2>&1
mv "${logdir}/running" "${logdir}/succeeded"
cd ~
rm -r "__TMPDIR__"
__EOF__
sed -e "s|__COMMAND__|$COMMAND|g" \
  -e "s|__LOGDIR__|${logdir}|g" \
  -e "s|__TMPDIR__|${tmppath}|g" \
  -e "s|__RESOURCE__|${RESOURCE}|g" \
  -e "s|__PBS_LOG_PATH__|${logdir}/pbs.log|g" \
  > "$tmppath/command.pbs"
# Tell the user where the generated script and logs are, then submit the job.
# Paths are passed as printf arguments so a `%` or backslash in them cannot be
# misread as a format directive.
printf 'Making temporary qsub script to %s\n' "${tmppath}/command.pbs"
printf 'Logging to %s with JOB id %s\n' "$logdir" "t${date_id}"
# $1 is GROUP_ID; the job is named t<date_id>.
qsub -g "$1" -N "t${date_id}" "$tmppath/command.pbs"