bridge_cogact_finetune.sh
#!/bin/bash
#SBATCH --account=MST113264 # (-A) iService Project ID
#SBATCH --job-name=openvla_ft # (-J) Job name
#SBATCH --partition=normal # (-p) Slurm partition for H100 nodes
#SBATCH --nodes=1 # (-N) Maximum number of nodes to be allocated
#SBATCH --gpus-per-node=8 # GPUs per node
#SBATCH --cpus-per-task=12 # (-c) Number of cores per MPI task
#SBATCH --ntasks-per-node=8 # Maximum number of tasks on each node
#SBATCH --time=48:00:00 # (-t) Wall time limit (days-hrs:min:sec)
#SBATCH --output=cogact-finetune.out # (-o) Path to the standard output file
#SBATCH --error=cogact-finetune.err # (-e) Path to the standard error file
#SBATCH --mail-type=END,FAIL # Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=[email protected] # Where to send mail. Set this to your email address
# Load necessary modules and set up environment
module purge
module load cuda/12.2
# Set CUDA environment variables
export CUDA_HOME=/usr/local/cuda-12.2
export PATH=$CUDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
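# Optional sanity check (a sketch, assuming the cuda/12.2 module installs the toolkit
# under $CUDA_HOME): fail early if nvcc is not actually on PATH after the module load.
if ! command -v nvcc >/dev/null 2>&1; then
    echo "nvcc not found on PATH; check the cuda/12.2 module and CUDA_HOME=$CUDA_HOME"
    exit 1
fi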
# Initialize conda
source ~/.bashrc
eval "$(conda shell.bash hook)"
conda activate vla-rl
# Check if conda env is activated
echo "Checking conda environment activation..."
if [[ -n "$CONDA_DEFAULT_ENV" ]]; then
    echo "Conda environment is activated: $CONDA_DEFAULT_ENV"
else
    echo "Conda environment is NOT activated"
    exit 1
fi
# Verify CUDA is available
echo -e "\nChecking CUDA availability:"
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
python -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')"
python -c "import torch; print(f'CUDA version: {torch.version.cuda}')"
# List all installed packages
echo -e "\nListing installed packages:"
pip list
# Print Python path to verify we're using the correct interpreter
echo -e "\nPython interpreter path:"
which python
echo -e "\nPython version:"
python --version
# Set environment variables for distributed training
# Use a random port with retries
MAX_RETRIES=5
for i in $(seq 1 $MAX_RETRIES); do
    export MASTER_PORT=$(shuf -i 29500-65000 -n 1)
    # nc -z succeeds only if something is already listening on the port,
    # so break out of the loop as soon as we draw a free one.
    nc -z $HOSTNAME $MASTER_PORT || break
    if [ $i -eq $MAX_RETRIES ]; then
        echo "Could not find an available port after $MAX_RETRIES attempts"
        exit 1
    fi
done
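# Alternative sketch (left commented out): let the OS pick a free port by binding port 0.
# export MASTER_PORT=$(python -c "import socket; s=socket.socket(); s.bind(('', 0)); print(s.getsockname()[1]); s.close()")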
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export WORLD_SIZE=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
export LOCAL_RANK=$SLURM_LOCALID
export RANK=$SLURM_PROCID
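# Note: SLURM_LOCALID/SLURM_PROCID are only defined inside srun-launched tasks, so these
# may be empty here; torchrun sets RANK/LOCAL_RANK/WORLD_SIZE itself for each worker it spawns.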
echo "Distributed training configuration:"
echo "MASTER_ADDR: $MASTER_ADDR"
echo "MASTER_PORT: $MASTER_PORT"
echo "WORLD_SIZE: $WORLD_SIZE"
echo "LOCAL_RANK: $LOCAL_RANK"
echo "RANK: $RANK"
# Run the finetuning script
# Launch a single torchrun; torchrun itself spawns the 8 per-GPU worker processes, so srun
# must not start one copy per task (the job requests --ntasks-per-node=8 above).
srun --nodes=1 --ntasks=1 torchrun \
    --nnodes=1 \
    --nproc_per_node=8 \
    --rdzv_id=$SLURM_JOB_ID \
    --rdzv_backend=c10d \
    --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
    vla-scripts/cogact-train.py \
    --vla_path "openvla/openvla-7b" \
    --data_root_dir "/work/crlc112358/datasets/" \
    --dataset_name bridge_orig \
    --run_root_dir "runs" \
    --adapter_tmp_dir "adapters" \
    --max_steps 100000 \
    --lora_rank 32 \
    --batch_size 8 \
    --grad_accumulation_steps 1 \
    --learning_rate 5e-4 \
    --image_aug True \
    --wandb_project "bridge_orig_finetune" \
    --wandb_entity "elsa-vla" \
    --save_steps 500
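
# Propagate the training command's exit status so Slurm records failures correctly.
status=$?
echo -e "\ntorchrun exited with status $status"
exit $status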