-
Notifications
You must be signed in to change notification settings - Fork 96
/
Copy patheasy-setup.sh
executable file
·369 lines (329 loc) · 11.1 KB
/
easy-setup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Shell options: stop at the first failing command (-e), trace each command
# as it is executed (-x), and let a pipeline fail when any of its stages fails.
set -e -x -o pipefail

# CloudFormation stack to read from; a non-empty value already present in the
# environment wins over the built-in default.
STACK_ID_VPC="${STACK_ID_VPC:-sagemaker-hyperpod}"

# Usage fragments; they are printed space-separated by the --help handler.
HELP=(
  "[-h|--help]"
  "[-r|--region]"
  "[-p|--profile]"
  "[-s|--stack-id-vpc]"
  "[-i|--instance-type]"
  "[-c|--instance-count]"
  "[-e|--head-node-ebs-volume-size]"
  "[-d|--dry-run]"
  "CLUSTER_NAME"
)

# Common arguments handed to every aws invocation; starts out empty.
aws_cli_args=()

# Set to 1 by -d|--dry-run.
DRY_RUN=0
#######################################
# Parse the script's command-line arguments into global variables.
# Globals:
#   HELP (read); AWS_REGION, AWS_PROFILE, STACK_ID_VPC, INSTANCE,
#   INSTANCE_COUNT, EBS_VOLUME_SIZE, DRY_RUN, CLUSTER_NAME (written)
# Arguments:
#   the script's positional parameters ("$@")
# Outputs:
#   usage text on stdout for -h|--help; error messages on stderr
# Returns:
#   0 on success; exits 0 after printing help; exits 1 when an option is
#   unknown or is missing its value
#######################################
parse_args() {
  local key
  while [[ $# -gt 0 ]]; do
    key="$1"
    case "${key}" in
      -h|--help)
        echo "Create a HyperPod Cluster with single partition."
        echo "It requires sagemaker-hyperpod CloudFormation stack to be deployed."
        echo "Usage: $(basename "${BASH_SOURCE[0]}") ${HELP[*]}"
        exit 0
        ;;
      -r|--region|-p|--profile|-s|--stack-id-vpc|-i|--instance-type|-c|--instance-count|-e|--head-node-ebs-volume-size)
        # "shift 2" fails and shifts nothing when only the option itself is
        # left, which would either spin forever or (under set -e) exit without
        # any explanation. Check explicitly instead.
        if [[ $# -lt 2 ]]; then
          echo "[ERROR] option ${key} requires a value" >&2
          exit 1
        fi
        case "${key}" in
          -r|--region) AWS_REGION="$2" ;;
          -p|--profile) AWS_PROFILE="$2" ;;
          -s|--stack-id-vpc) STACK_ID_VPC="$2" ;;
          -i|--instance-type) INSTANCE="$2" ;;
          -c|--instance-count) INSTANCE_COUNT="$2" ;;
          -e|--head-node-ebs-volume-size) EBS_VOLUME_SIZE="$2" ;;
        esac
        shift 2
        ;;
      -d|--dry-run)
        DRY_RUN=1
        shift
        ;;
      -*)
        # Unknown options are rejected rather than silently taken as the
        # cluster name.
        echo "[ERROR] unknown option: ${key}" >&2
        exit 1
        ;;
      *)
        # Any other word is the cluster name; the last one given wins.
        CLUSTER_NAME="${key}"
        shift
        ;;
    esac
  done
}
# Quote "$@" so arguments containing whitespace reach parse_args intact.
parse_args "$@"

# Fall back to the default name before the working directory is created: with
# an empty CLUSTER_NAME, mkdir would be called without an operand and abort
# the script.
if [[ -z "${CLUSTER_NAME:-}" ]]; then
  echo "[WARNING] CLUSTER_NAME environment variable is not set, automatically set to ml-cluster"
  CLUSTER_NAME=ml-cluster
fi

# All generated files are written inside a directory named after the cluster.
mkdir -p -- "${CLUSTER_NAME}"
cd -- "${CLUSTER_NAME}"
# Required command-line tools. Each check stops the script with exit code 1
# and an installation hint when the tool is not found on PATH.

# AWS CLI
command -v aws > /dev/null 2>&1 || {
  echo -e "please install aws..."
  echo -e "see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for the installation guide"
  exit 1
}

# jq
command -v jq > /dev/null 2>&1 || {
  echo -e "please install jq...\nsudo yum install -y jq or brew install jq"
  exit 1
}

# wget
command -v wget > /dev/null 2>&1 || {
  echo -e "please install wget...\nsudo yum install -y wget or brew install wget"
  exit 1
}
# Resolve every setting that was not supplied on the command line or through
# the environment, and record the results in ./env_vars.

# Define cluster name (this write truncates env_vars; later writes append)
if [[ -z "${CLUSTER_NAME:-}" ]]; then
  echo "[WARNING] CLUSTER_NAME environment variable is not set, automatically set to ml-cluster"
  CLUSTER_NAME=ml-cluster
fi
echo "export CLUSTER_NAME=${CLUSTER_NAME}" > env_vars

# Define stack name
if [[ -z "${STACK_ID_VPC:-}" ]]; then
  echo "[WARNING] STACK_ID_VPC environment variable is not set, automatically set to sagemaker-hyperpod"
  STACK_ID_VPC=sagemaker-hyperpod
fi
echo "export STACK_ID_VPC=${STACK_ID_VPC}" >> env_vars

# Define AWS Profile.
# Handled before the region so that the region lookup below already carries
# --profile and reads the region of the requested profile.
if [[ -z "${AWS_PROFILE:-}" ]]; then
  echo "[WARNING] AWS_PROFILE environment variable is not set, ignore if you are using default profile."
else
  echo "export AWS_PROFILE=${AWS_PROFILE}" >> env_vars
  echo "[INFO] AWS_PROFILE = ${AWS_PROFILE}"
  aws_cli_args+=(--profile "${AWS_PROFILE}")
fi

# Define AWS Region
if [[ -z "${AWS_REGION:-}" ]]; then
  echo "[WARNING] AWS_REGION environment variable is not set, automatically set depending on aws cli default region."
  # "aws configure get" exits non-zero when no value is configured; keep going
  # so the fallback and the explicit error below can run under set -e.
  AWS_REGION="$(aws "${aws_cli_args[@]}" configure get region)" || AWS_REGION=""
  if [[ -z "${AWS_REGION}" ]]; then
    AWS_REGION="${AWS_DEFAULT_REGION:-}"
  fi
  if [[ -z "${AWS_REGION}" ]]; then
    echo "[ERROR] could not determine the AWS region; use -r|--region or set AWS_REGION" >&2
    exit 1
  fi
  export AWS_REGION
fi
aws_cli_args+=(--region "${AWS_REGION}")
echo "export AWS_REGION=${AWS_REGION}" >> env_vars
echo "[INFO] AWS_REGION = ${AWS_REGION}"

# Define the worker instance type (written without the "ml." prefix).
if [[ -z "${INSTANCE:-}" ]]; then
  echo "[WARNING] INSTANCE environment variable is not set, automatically set to g5.12xlarge."
  export INSTANCE=g5.12xlarge
fi
echo "export INSTANCE=${INSTANCE}" >> env_vars
echo "[INFO] INSTANCE = ${INSTANCE}"

# Define the number of worker instances.
if [[ -z "${INSTANCE_COUNT:-}" ]]; then
  echo "[WARNING] INSTANCE_COUNT environment variable is not set, automatically set to 2."
  export INSTANCE_COUNT=2
fi
# The value is pasted unquoted into JSON, so anything but digits would yield
# an invalid document.
if [[ ! "${INSTANCE_COUNT}" =~ ^[0-9]+$ ]]; then
  echo "[ERROR] INSTANCE_COUNT must be a non-negative integer, got '${INSTANCE_COUNT}'" >&2
  exit 1
fi

# Define EBS_VOLUME_SIZE (in GB)
if [[ -z "${EBS_VOLUME_SIZE:-}" ]]; then
  echo "[WARNING] EBS_VOLUME_SIZE environment variable is not set, automatically set to 500GB."
  export EBS_VOLUME_SIZE=500
fi
# Same reason as for INSTANCE_COUNT: it becomes a bare JSON number.
if [[ ! "${EBS_VOLUME_SIZE}" =~ ^[0-9]+$ ]]; then
  echo "[ERROR] EBS_VOLUME_SIZE must be a non-negative integer, got '${EBS_VOLUME_SIZE}'" >&2
  exit 1
fi
#######################################
# Query one output of the CloudFormation stack named by STACK_ID_VPC.
# Globals:   aws_cli_args (read), STACK_ID_VPC (read)
# Arguments: $1 - OutputKey to look up
# Outputs:   the aws CLI's text output for that key on stdout
# Returns:   exit status of the aws CLI
#######################################
get_stack_output() {
  # JMESPath wants the key wrapped in literal backticks, hence the \` escapes.
  aws "${aws_cli_args[@]}" cloudformation describe-stacks \
    --stack-name "${STACK_ID_VPC}" \
    --query "Stacks[0].Outputs[?OutputKey==\`$1\`].OutputValue" \
    --output text
}

#######################################
# Export a stack output as an environment variable and append it to env_vars.
# Globals:   the variable named by $1 (written and exported)
# Arguments: $1 - variable name to export
#            $2 - CloudFormation OutputKey to read
#            $3 - human-readable label used in the error message
# Outputs:   "[INFO] NAME = value" or "[ERROR] failed to retrieve LABEL"
# Returns:   0 on success; exits the script with status 1 when the value is
#            empty or the lookup failed
#######################################
export_stack_output() {
  local var_name="$1" output_key="$2" label="$3" value
  # A failing aws call is folded into the same "empty value" error path.
  value="$(get_stack_output "${output_key}")" || value=""
  if [[ -z "${value}" ]]; then
    echo "[ERROR] failed to retrieve ${label}"
    # exit, not return: at the call sites below we are at the top level of an
    # executed script, where "return" is itself an error.
    exit 1
  fi
  export "${var_name}=${value}"
  echo "export ${var_name}=${value}" >> env_vars
  echo "[INFO] ${var_name} = ${value}"
}

# Retrieve VPC ID
export_stack_output VPC_ID VPC "VPC ID"
# Grab the private subnet id
export_stack_output SUBNET_ID PrimaryPrivateSubnet "SUBNET ID"
# Grab the public subnet id
export_stack_output PUBLIC_SUBNET_ID PublicSubnet "Public SUBNET ID"
# Get FSx Filesystem id from CloudFormation
export_stack_output FSX_ID FSxLustreFilesystemId "FSX ID"
# Get FSx Filesystem Mountname from CloudFormation
export_stack_output FSX_MOUNTNAME FSxLustreFilesystemMountname "FSX Mountname"
# Get FSx Security Group from CloudFormation
export_stack_output SECURITY_GROUP SecurityGroup "FSX Security Group"
# Get sagemaker role ARN
export_stack_output ROLE AmazonSagemakerClusterExecutionRoleArn "Role ARN"

# Get sagemaker role ROLENAME: the last path component of the role ARN
ROLENAME="$(basename "${ROLE}")"
export ROLENAME
if [[ -z "${ROLENAME}" ]]; then
  echo "[ERROR] failed to retrieve Role NAME"
  exit 1
fi
echo "export ROLENAME=${ROLENAME}" >> env_vars
echo "[INFO] ROLENAME = ${ROLENAME}"

# Get s3 bucket name
export_stack_output BUCKET AmazonS3BucketName "Bucket Name"
# Fetch the repository holding the lifecycle scripts unless a checkout is
# already present in the working directory.
if [[ ! -d "awsome-distributed-training" ]]; then
  echo "Cloning the repository..."
  git clone --depth=1 https://github.com/aws-samples/awsome-distributed-training/
else
  echo "Repository already exists..."
fi

# Use pushd and popd to navigate directories https://en.wikipedia.org/wiki/Pushd_and_popd
pushd awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/
# upload the base lifecycle configuration; the URI is quoted so the bucket
# name is never word-split or glob-expanded
aws "${aws_cli_args[@]}" s3 cp --recursive base-config/ "s3://${BUCKET}/src"
# move back to the previous directory
popd
# Generate provisioning_parameters.json in the current directory. The heredoc
# delimiter is unquoted, so ${INSTANCE}, ${FSX_ID}, ${AWS_REGION} and
# ${FSX_MOUNTNAME} are expanded when the file is written.
cat > provisioning_parameters.json << EOL
{
"version": "1.0.0",
"workload_manager": "slurm",
"controller_group": "controller-machine",
"worker_groups": [
{
"instance_group_name": "worker-group-1",
"partition_name": "${INSTANCE}"
}
],
"fsx_dns_name": "${FSX_ID}.fsx.${AWS_REGION}.amazonaws.com",
"fsx_mountname": "${FSX_MOUNTNAME}"
}
EOL
# copy to the S3 Bucket; the URI is quoted so the bucket name is never
# word-split or glob-expanded
aws "${aws_cli_args[@]}" s3 cp provisioning_parameters.json "s3://${BUCKET}/src/"
# Write cluster-config.json: one "controller-machine" group and one
# "worker-group-1" group. Shell variables inside the document are expanded
# at write time because the heredoc delimiter is not quoted.
cat << CLUSTER_CONFIG > cluster-config.json
{
"ClusterName": "${CLUSTER_NAME}",
"InstanceGroups": [
{
"InstanceGroupName": "controller-machine",
"InstanceType": "ml.m5.12xlarge",
"InstanceStorageConfigs": [
{
"EbsVolumeConfig": {
"VolumeSizeInGB": ${EBS_VOLUME_SIZE}
}
}
],
"InstanceCount": 1,
"LifeCycleConfig": {
"SourceS3Uri": "s3://${BUCKET}/src",
"OnCreate": "on_create.sh"
},
"ExecutionRole": "${ROLE}",
"ThreadsPerCore": 1
},
{
"InstanceGroupName": "worker-group-1",
"InstanceType": "ml.${INSTANCE}",
"InstanceCount": ${INSTANCE_COUNT},
"LifeCycleConfig": {
"SourceS3Uri": "s3://${BUCKET}/src",
"OnCreate": "on_create.sh"
},
"ExecutionRole": "${ROLE}",
"ThreadsPerCore": 1
}
],
"VpcConfig": {
"SecurityGroupIds": ["$SECURITY_GROUP"],
"Subnets":["$SUBNET_ID"]
}
}
CLUSTER_CONFIG
# Validate Cluster configuration
# --no-clobber keeps a validate-config.py that is already present.
wget --no-clobber https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/main/1.architectures/5.sagemaker-hyperpod/validate-config.py
# install boto3 (presumably needed by validate-config.py)
pip3 install boto3
# check config for known issues; the array is quoted so every region/profile
# value is passed as exactly one argument
python3 validate-config.py --cluster-config cluster-config.json --provisioning-parameters provisioning_parameters.json "${aws_cli_args[@]}"

# Print the command that creates the cluster; with --dry-run stop right after.
echo "aws ${aws_cli_args[*]} sagemaker create-cluster --cli-input-json file://cluster-config.json"
if [[ "${DRY_RUN}" -eq 1 ]]; then
  exit 0
fi
aws "${aws_cli_args[@]}" sagemaker create-cluster --cli-input-json "file://cluster-config.json"