-
Notifications
You must be signed in to change notification settings - Fork 96
/
Copy path1.distributed-training-llama2.sbatch
executable file
·84 lines (69 loc) · 2.53 KB
/
1.distributed-training-llama2.sbatch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#SBATCH --nodes=4 # number of nodes to use
#SBATCH --job-name=FSDP # name of your job
#SBATCH --exclusive # job has exclusive use of the resource, no sharing
# Stop at the first failing command (errexit) and echo every command before it
# runs (xtrace), so the job log records exactly what was launched.
set -o errexit -o xtrace

###########################
###### User Variables #####
###########################

# GPUs per node: 4 for G5.12x, 8 for P4/P5.
GPUS_PER_NODE=8

###########################
## Environment Variables ##
###########################

## Plenty of EFA level variables.
## For G4dn and other G5, comment out all.
FI_LOG_LEVEL=warn
FI_PROVIDER=efa
export FI_LOG_LEVEL FI_PROVIDER

# Uncomment the next line (value 0) when os.fork() fails with
# "OSError: Cannot allocate memory". Disabling huge pages causes a minor
# performance hit.
# export FI_EFA_USE_HUGE_PAGE=0

NCCL_DEBUG=INFO
export NCCL_DEBUG

## Switching SYNC_MEMOPS to zero can boost throughput with FSDP:
## it disables CU_POINTER_ATTRIBUTE_SYNC_MEMOPS and so reduces memory
## synchronizations.
## https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html
FI_EFA_SET_CUDA_SYNC_MEMOPS=0
export FI_EFA_SET_CUDA_SYNC_MEMOPS

## HuggingFace metadata timeout (in seconds), raised for large clusters.
HF_HUB_ETAG_TIMEOUT=60
export HF_HUB_ETAG_TIMEOUT
###########################
####### Torch Dist #######
###########################

# Launcher options passed to torchrun ahead of the training script.
#   --nproc_per_node : taken from GPUS_PER_NODE
#   --nnodes         : SLURM_JOB_NUM_NODES (expected to be set by Slurm for the job)
#   --rdzv_id        : SLURM_JOB_ID (expected to be set by Slurm for the job)
#   --rdzv_endpoint  : hostname of the machine executing this script
# Every element that contains an expansion is double-quoted so that it always
# becomes exactly one array element, with no word splitting or globbing.
declare -a TORCHRUN_ARGS=(
  "--nproc_per_node=${GPUS_PER_NODE}"
  "--nnodes=${SLURM_JOB_NUM_NODES}"
  "--rdzv_id=${SLURM_JOB_ID}"
  --rdzv_backend=c10d
  "--rdzv_endpoint=$(hostname)"
)

# Paths (relative to the working directory) of the torchrun launcher and the
# training entry point; exported so child processes can see them.
export TORCHRUN=./pt_fsdp/bin/torchrun
export TRAIN_SCRIPT=./train.py
############################
# Llama 2 Training Params ##
############################

# Arguments passed to the training script, assembled group by group.
# The element order is significant only for readability of the final command.
declare -a TRAINING_ARGS=()

# Model architecture. Values per model size:
#   option                  7b      13b     70b
#   num_key_value_heads     32      40      8
#   intermediate_size       11008   13824   28672
#   hidden_width            4096    5120    8192
#   num_layers              32      40      80
#   num_heads               32      40      64
TRAINING_ARGS+=(
  --max_context_width=4096
  --num_key_value_heads=32
  --intermediate_size=11008
  --hidden_width=4096
  --num_layers=32
  --num_heads=32
  --model_type=llama_v2
  --tokenizer="hf-internal-testing/llama-tokenizer"
)

# Checkpoint / validation cadence and total number of steps.
TRAINING_ARGS+=(
  --checkpoint_freq=5000
  --validation_freq=500
  --max_steps=5000
  --checkpoint_dir=./checkpoints
)

# Dataset selection and the checkpoint directory to resume from.
TRAINING_ARGS+=(
  --dataset='c4'
  --dataset_config_name='en'
  --resume_from_checkpoint=./checkpoints
)

# Batch sizes and FSDP behaviour.
# Sharding strategies: https://pytorch.org/docs/stable/fsdp.html
TRAINING_ARGS+=(
  --train_batch_size=1
  --val_batch_size=1
  --sharding_strategy="full"
  --offload_activations=1
)
# Optional extra srun options. Kept in an array so that "no extra options"
# expands to zero words without relying on an unquoted empty string.
declare -a AUTO_RESUME=()
if [[ -d "/opt/sagemaker_cluster" ]]; then
  # The presence of this directory is used as the marker for a Hyperpod cluster.
  echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
  AUTO_RESUME+=(--auto-resume=1)
fi

# Launch torchrun through srun. -l labels each output line with its task
# number. All expansions are quoted so every path and option stays a single
# argument.
srun "${AUTO_RESUME[@]}" -l "${TORCHRUN}" "${TORCHRUN_ARGS[@]}" "${TRAIN_SCRIPT}" "${TRAINING_ARGS[@]}"