-
Notifications
You must be signed in to change notification settings - Fork 447
/
train-ec2
executable file
·71 lines (60 loc) · 1.74 KB
/
train-ec2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/bin/bash
set -e

#######################################
# Parse the command-line options of this script.
# Globals (written):
#   CONFIG_FILE   value given to --config-file
#   SAMPLES_FILE  value given to --samples-file
#   IMAGE_DIR     value given to --image-dir
#   MACHINE       value given to --docker-machine
#   POSITIONAL    array holding every argument that is not a known option
# Arguments:
#   the argument list to parse (normally "$@")
# Returns:
#   0 on success. Exits with status 1 and a message on stderr when a known
#   option is the last argument and therefore has no value (previously the
#   second `shift` failed and `set -e` ended the script without any message).
#######################################
parse_args() {
  POSITIONAL=()
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --config-file | --samples-file | --image-dir | --docker-machine)
        # Every known option takes exactly one value.
        if [[ $# -lt 2 ]]; then
          printf '%s: option %s requires a value\n' "${0##*/}" "$1" >&2
          exit 1
        fi
        case "$1" in
          --config-file) CONFIG_FILE="$2" ;;
          --samples-file) SAMPLES_FILE="$2" ;;
          --image-dir) IMAGE_DIR="$2" ;;
          --docker-machine) MACHINE="$2" ;;
        esac
        shift 2 # past option and value
        ;;
      *)
        # Unknown argument: keep it, in order, for later use.
        POSITIONAL+=("$1")
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Launch a training job on the docker-machine host "$MACHINE":
#   1. export the keys of the JSON config file as shell variables
#   2. log the remote Docker daemon in to ECR and pull the training image
#   3. create a timestamped job directory and copy config + samples into it
#   4. start the training container detached and stream its logs
# Reads CONFIG_FILE, SAMPLES_FILE, IMAGE_DIR and MACHINE (set by the argument
# parser above) and expects the config file to define at least `docker_image`;
# `s3_bucket` is forwarded to the container when present.

# Fail early with a clear message; an empty CONFIG_FILE would otherwise make
# jq read from stdin, and an empty MACHINE produces a malformed ssh command.
: "${CONFIG_FILE:?--config-file is required}"
: "${SAMPLES_FILE:?--samples-file is required}"
: "${IMAGE_DIR:?--image-dir is required}"
: "${MACHINE:?--docker-machine is required}"

# Run a command line on the remote machine. All arguments are handed to
# `docker-machine ssh` as-is, so the remote shell parses them exactly once
# (the previous `eval`-based calls were parsed locally first).
remote() {
  docker-machine ssh "$MACHINE" "$@"
}

# parse config file and assign parameters to variables
# `@sh` single-quotes each value, so spaces or shell metacharacters in the
# config cannot break or inject into the eval'd `export` lines. The jq call
# is a separate assignment so that `set -e` sees a jq failure.
config_exports=$(jq -r \
  'to_entries | map("export \(.key)=\(.value | tostring | @sh)") | .[]' \
  "$CONFIG_FILE")
eval "$config_exports"
: "${docker_image:?config file must define docker_image}"

# login to ECR on remote machine
# NOTE(review): `aws ecr get-login` only exists in AWS CLI v1; migrating to
# `get-login-password` needs the registry host, so it is left as-is here.
ecr_login_cmd=$(aws ecr get-login --no-include-email --region eu-west-1)
remote "sudo $ecr_login_cmd"

# pull docker image on remote machine
remote "sudo docker pull $docker_image"

# create job dir on remote machine
TIMESTAMP=$(date +%Y_%m_%d_%H_%M_%S)
TRAIN_JOB_DIR=train_jobs/$TIMESTAMP
# Same absolute path as the scp targets and the volume mount below.
remote_job_path=/home/ubuntu/$TRAIN_JOB_DIR
remote "sudo mkdir -p -m 777 $remote_job_path"

# copy config and labels file to remote MACHINE
# Quoted so local paths containing spaces survive.
docker-machine scp "$CONFIG_FILE" "$MACHINE:$remote_job_path/config.json"
docker-machine scp "$SAMPLES_FILE" "$MACHINE:$remote_job_path/samples.json"

# start training
docker_run_cmd=(
  sudo nvidia-docker run -d
  -v "$IMAGE_DIR:/src/images"
  -v "$remote_job_path:/src/$TRAIN_JOB_DIR"
  -e "S3_BUCKET=${s3_bucket:-}"
  -e "TRAIN_JOB_DIR=$TRAIN_JOB_DIR"
  "$docker_image"
)
# `run -d` prints the id of the container it created; using that id avoids
# attaching to an unrelated container that `docker ps -l` might return.
run_output=$(remote "${docker_run_cmd[*]}")
CONTAINER_ID=$(printf '%s\n' "$run_output" | tr -d '\r' | tail -n 1)
if [[ ! "$CONTAINER_ID" =~ ^[0-9a-f]+$ ]]; then
  # Unexpected output: fall back to the most recently created container.
  CONTAINER_ID=$(remote "sudo docker ps -l -q" | tr -d '\r' | tail -n 1)
fi
printf '%s\n' "$CONTAINER_ID"

# stream logs from container
remote "sudo docker logs $CONTAINER_ID --follow"