Add multidataset example with deepspeed support (#316)
* Add multidataset example with deepspeed support

* Change base.json to follow GPS' requirement
licj15 authored Feb 5, 2025
1 parent 0906702 commit 69ee2e9
Showing 7 changed files with 760 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/multidataset_deepspeed/.gitignore
@@ -0,0 +1,2 @@
./logs
*.out
68 changes: 68 additions & 0 deletions examples/multidataset_deepspeed/base.json
@@ -0,0 +1,68 @@
{
    "Verbosity": {
        "level": 2
    },
    "NeuralNetwork": {
        "Architecture": {
            "mpnn_type": "EGNN",
            "pe_dim": 0,
            "global_attn_engine": "",
            "global_attn_type": "",
            "global_attn_heads": 0,
            "equivariance": true,
            "radius": 5.0,
            "max_neighbours": 100000,
            "num_gaussians": 50,
            "envelope_exponent": 5,
            "int_emb_size": 64,
            "basis_emb_size": 8,
            "out_emb_size": 128,
            "num_after_skip": 2,
            "num_before_skip": 1,
            "num_radial": 6,
            "num_spherical": 7,
            "num_filters": 126,
            "edge_features": ["length"],
            "hidden_dim": 50,
            "num_conv_layers": 3,
            "output_heads": {
                "graph": {
                    "num_sharedlayers": 2,
                    "dim_sharedlayers": 50,
                    "num_headlayers": 2,
                    "dim_headlayers": [50, 25]
                },
                "node": {
                    "num_headlayers": 2,
                    "dim_headlayers": [200, 200],
                    "type": "mlp"
                }
            },
            "task_weights": [1.0, 1.0]
        },
        "Variables_of_interest": {
            "input_node_features": [0, 1, 2, 3],
            "output_names": ["energy", "force"],
            "output_index": [0, 2],
            "output_dim": [1, 3],
            "type": ["graph", "node"]
        },
        "Training": {
            "num_epoch": 50,
            "EarlyStopping": true,
            "perc_train": 0.9,
            "loss_function_type": "mae",
            "batch_size": 32,
            "continue": 0,
            "Optimizer": {
                "type": "AdamW",
                "learning_rate": 1e-3
            }
        }
    },
    "Visualization": {
        "plot_init_solution": true,
        "plot_hist_solution": false,
        "create_plots": true
    }
}
116 changes: 116 additions & 0 deletions examples/multidataset_deepspeed/job-perlmutter-batch.sh
@@ -0,0 +1,116 @@
#!/bin/bash
#SBATCH -A m4716
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32

# Retrieve the number of nodes set via `sbatch -N` or in the script
echo "Number of nodes allocated: $SLURM_NNODES"

## Remove write permission for others on newly created files and dirs
umask 002

## Load Basic Envs
module reset
module load pytorch/2.0.1

module use -a /global/cfs/cdirs/m4133/jyc/perlmutter/sw/modulefiles
module load hydragnn/pytorch2.0.1-v2
module use -a /global/cfs/cdirs/m4133/c8l/sw/modulefiles
module load deepspeed

## MPI Envs
export MPICH_ENV_DISPLAY=0
export MPICH_VERSION_DISPLAY=0
export MPICH_GPU_SUPPORT_ENABLED=0

## HYDRAGNN Envs
HYDRAGNN_DIR=/global/cfs/cdirs/m4716/c8l/HydraGNN
export PYTHONPATH=$HYDRAGNN_DIR:$PYTHONPATH

export HYDRAGNN_NUM_WORKERS=0
export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
export HYDRAGNN_AGGR_BACKEND=mpi
export HYDRAGNN_VALTEST=1
export HYDRAGNN_TRACE_LEVEL=0

## Dataset Envs
DATASET_PATH="/global/cfs/projectdirs/m4716/mlupopa/HydraGNN/examples/multidataset_hpo/dataset"
DATASET_LIST="MPTrj-v3,ANI1x-v3,OC2020-20M-v3,OC2022-v3,qm7x-v3"

## Task 1: Outer loop WIDTH, Inner loop DEPTH, fixed DS, ZERO, and CKPT
for WIDTH in 800 1100 1700 2500; do
    for DEPTH in 4 5 6; do
        LOG_NAME="exp-${DEPTH}_depth-${WIDTH}_width-0.6_TB_data-${SLURM_NNODES}_nodes"

        ## Calculate batch size and num_samples
        BS=$((32 * 32 / SLURM_NNODES)) # Dynamic calculation of batch size
        NS=$(echo "scale=0; 285715 / 1.2 * 0.6 * 32 / $SLURM_NNODES" | bc) # Fixed DS=0.6

        ## Handle optional arguments
        EXTRA_ARGS="--zero_opt"

        ## Run script
        set -x

        srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
            python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
                --inputfile=base.json \
                --dataset_path=$DATASET_PATH \
                --multi \
                --multi_model_list=$DATASET_LIST \
                --num_epoch=10 \
                --everyone --ddstore \
                --log=$LOG_NAME \
                --hidden_dim=${WIDTH} \
                --num_conv_layers=${DEPTH} \
                --full_test \
                --batch_size=${BS} \
                --num_samples=${NS} \
                ${EXTRA_ARGS}

        set +x
    done
done

## Task 2: Outer loop WIDTH, Inner loop DS, fixed DEPTH and ZERO, varying CKPT
for WIDTH in 2500 5400; do
    for DS in 0.2 0.6 1.2; do
        LOG_NAME="exp-3_depth-${WIDTH}_width-${DS}_TB_data-${SLURM_NNODES}_nodes"

        ## Calculate batch size and num_samples
        BS=$((32 * 32 / SLURM_NNODES)) # Dynamic calculation of batch size
        NS=$(echo "scale=0; 285715 / 1.2 * ${DS} * 32 / $SLURM_NNODES" | bc) # Dynamic DS

        ## Handle optional arguments
        EXTRA_ARGS="--zero_opt"
        if [ "$WIDTH" = "5400" ]; then
            EXTRA_ARGS+=" --conv_checkpointing"
        fi

        ## Run script
        set -x

        srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
            python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
                --inputfile=base.json \
                --dataset_path=$DATASET_PATH \
                --multi \
                --multi_model_list=$DATASET_LIST \
                --num_epoch=10 \
                --everyone --ddstore \
                --log=$LOG_NAME \
                --hidden_dim=${WIDTH} \
                --num_conv_layers=3 \
                --full_test \
                --batch_size=${BS} \
                --num_samples=${NS} \
                ${EXTRA_ARGS}

        set +x
    done
done
86 changes: 86 additions & 0 deletions examples/multidataset_deepspeed/job-perlmutter.sh
@@ -0,0 +1,86 @@
#!/bin/bash
#SBATCH -A m4716
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32

# Retrieve the number of nodes set via `sbatch -N` or in the script
echo "Number of nodes allocated: $SLURM_NNODES"

WIDTH=${1:-50} # Default to 50 hidden_dim if not specified
DEPTH=${2:-3} # Default to 3 num_conv_layers if not specified
DS=${3:-1.2} # Default to 1.2TB data if not specified
ZERO=${4:-False} # Default to False if not specified
CKPT=${5:-False} # Default to False if not specified

## Remove write permission for others on newly created files and dirs
umask 002

## Load Basic Envs
module reset
module load pytorch/2.0.1

module use -a /global/cfs/cdirs/m4133/jyc/perlmutter/sw/modulefiles
module load hydragnn/pytorch2.0.1-v2
module use -a /global/cfs/cdirs/m4133/c8l/sw/modulefiles
module load deepspeed

## MPI Envs
export MPICH_ENV_DISPLAY=0
export MPICH_VERSION_DISPLAY=0
export MPICH_GPU_SUPPORT_ENABLED=0

## HYDRAGNN Envs
HYDRAGNN_DIR=/global/cfs/cdirs/m4716/c8l/HydraGNN
export PYTHONPATH=$HYDRAGNN_DIR:$PYTHONPATH

export HYDRAGNN_NUM_WORKERS=0
export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
export HYDRAGNN_AGGR_BACKEND=mpi
export HYDRAGNN_VALTEST=1
export HYDRAGNN_TRACE_LEVEL=0

## Dataset Envs
DATASET_PATH="/global/cfs/projectdirs/m4716/mlupopa/HydraGNN/examples/multidataset_hpo/dataset"
DATASET_LIST="MPTrj-v3,ANI1x-v3,OC2020-20M-v3,OC2022-v3,qm7x-v3"

## Log Envs
LOG_NAME="exp-${DEPTH}_depth-${WIDTH}_width-${DS}_TB_data-${SLURM_NNODES}_nodes"

## Calculate batch size and num_samples
BS=$((32 * 32 / SLURM_NNODES)) # Dynamic calculation of batch size, default setting: 32 nodes with 32 batch size per GPU
NS=$(echo "scale=0; 285715 / 1.2 * ${DS} * 32 / $SLURM_NNODES" | bc) # Calculate number of samples, default setting: 32 nodes with 285715 num_samples per GPU
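# Illustration of the scaling above (assumed values): with the default 32 nodes and DS=1.2,
#   BS = 32*32/32 = 32 samples per GPU and NS = 285715/1.2*1.2*32/32 = 285715 samples per GPU;
# halving the node count doubles both per-GPU values, so the global totals stay fixed.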

## Handle optional arguments
EXTRA_ARGS=""
if [ "$ZERO" = "True" ]; then
EXTRA_ARGS+=" --zero_opt"
fi
if [ "$CKPT" = "True" ]; then
EXTRA_ARGS+=" --conv_checkpointing"
fi

## Run script
set -x

srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
    python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
        --inputfile=base.json \
        --dataset_path=$DATASET_PATH \
        --multi \
        --multi_model_list=$DATASET_LIST \
        --num_epoch=10 \
        --everyone --ddstore \
        --log=$LOG_NAME \
        --hidden_dim=${WIDTH} \
        --num_conv_layers=${DEPTH} \
        --full_test \
        --batch_size=${BS} \
        --num_samples=${NS} \
        ${EXTRA_ARGS}

set +x
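
For reference, a hypothetical direct submission of this script with its five positional arguments (WIDTH DEPTH DS ZERO CKPT); the values below are illustrative only:

    sbatch -N 32 job-perlmutter.sh 2500 6 1.2 True True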
29 changes: 29 additions & 0 deletions examples/multidataset_deepspeed/launch_helper.py
@@ -0,0 +1,29 @@
import subprocess
import argparse

def submit_job(nodes, width, depth, dataset_size, zero=False, ckpt=False):
    # Command to execute
    command = ["sbatch", "-N", str(nodes), "job-perlmutter.sh", str(width), str(depth), str(dataset_size), str(zero), str(ckpt)]
    # Run the command and capture output
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = process.communicate()
    # Extract the job ID
    output = stdout.strip()
    job_id = int(output.split()[-1])
    return job_id

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Submit jobs with varying parameters.")
    parser.add_argument("--width", type=int, required=True, help="Width of the model.")
    parser.add_argument("--depth", type=int, required=True, help="Depth of the model.")
    parser.add_argument("--zero", action="store_true", help="enable zero optimizer with stage 1", default=False)
    parser.add_argument("--ckpt", action="store_true", help="enable checkpointing for conv layers", default=False)

    args = parser.parse_args()

    dataset_size_list = [0.1, 0.2, 0.4, 0.6]
    nodes_list = [8, 16, 32, 32]

    for dataset_size, nodes in zip(dataset_size_list, nodes_list):
        job_id = submit_job(nodes, args.width, args.depth, dataset_size, args.zero, args.ckpt)
        print(job_id)
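
A hypothetical invocation of the helper (run from this example directory so that sbatch can find job-perlmutter.sh); the width and depth values are illustrative:

    python launch_helper.py --width 2500 --depth 6 --zero --ckpt

This submits one job per (dataset_size, nodes) pair from the two lists above and prints each resulting Slurm job ID.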