Commit: Add multidataset example with deepspeed support (#316)

* Add multidataset example with deepspeed support
* Change base.json to follow GPS' requirement

Showing 7 changed files with 760 additions and 0 deletions.
New file (2 additions):

./logs
*.out
base.json (68 additions):

{
  "Verbosity": {
    "level": 2
  },
  "NeuralNetwork": {
    "Architecture": {
      "mpnn_type": "EGNN",
      "pe_dim": 0,
      "global_attn_engine": "",
      "global_attn_type": "",
      "global_attn_heads": 0,
      "equivariance": true,
      "radius": 5.0,
      "max_neighbours": 100000,
      "num_gaussians": 50,
      "envelope_exponent": 5,
      "int_emb_size": 64,
      "basis_emb_size": 8,
      "out_emb_size": 128,
      "num_after_skip": 2,
      "num_before_skip": 1,
      "num_radial": 6,
      "num_spherical": 7,
      "num_filters": 126,
      "edge_features": ["length"],
      "hidden_dim": 50,
      "num_conv_layers": 3,
      "output_heads": {
        "graph": {
          "num_sharedlayers": 2,
          "dim_sharedlayers": 50,
          "num_headlayers": 2,
          "dim_headlayers": [50, 25]
        },
        "node": {
          "num_headlayers": 2,
          "dim_headlayers": [200, 200],
          "type": "mlp"
        }
      },
      "task_weights": [1.0, 1.0]
    },
    "Variables_of_interest": {
      "input_node_features": [0, 1, 2, 3],
      "output_names": ["energy", "force"],
      "output_index": [0, 2],
      "output_dim": [1, 3],
      "type": ["graph", "node"]
    },
    "Training": {
      "num_epoch": 50,
      "EarlyStopping": true,
      "perc_train": 0.9,
      "loss_function_type": "mae",
      "batch_size": 32,
      "continue": 0,
      "Optimizer": {
        "type": "AdamW",
        "learning_rate": 1e-3
      }
    }
  },
  "Visualization": {
    "plot_init_solution": true,
    "plot_hist_solution": false,
    "create_plots": true
  }
}
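This config defines a two-headed EGNN: a graph-level head for energy and a node-level head for forces, weighted equally via task_weights. The job scripts below override hidden_dim, num_conv_layers, and batch_size on the command line; here is a minimal sketch of how such overrides can be layered on top of base.json (an assumption for illustration — the actual argument handling in train.py is not shown in this diff and may differ):

# Hypothetical override helper, for illustration only: load base.json and
# apply the same command-line overrides the job scripts pass to train.py.
import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument("--inputfile", default="base.json")
parser.add_argument("--hidden_dim", type=int, default=None)
parser.add_argument("--num_conv_layers", type=int, default=None)
parser.add_argument("--batch_size", type=int, default=None)
args = parser.parse_args()

with open(args.inputfile) as f:
    config = json.load(f)

arch = config["NeuralNetwork"]["Architecture"]
if args.hidden_dim is not None:
    arch["hidden_dim"] = args.hidden_dim            # width sweep
if args.num_conv_layers is not None:
    arch["num_conv_layers"] = args.num_conv_layers  # depth sweep
if args.batch_size is not None:
    config["NeuralNetwork"]["Training"]["batch_size"] = args.batch_size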
examples/multidataset_deepspeed/job-perlmutter-batch.sh (116 additions, 0 deletions):
#!/bin/bash
#SBATCH -A m4716
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32

# Retrieve the number of nodes set via `sbatch -N` or in the script
echo "Number of nodes allocated: $SLURM_NNODES"

## Remove write permission for others on newly created files and dirs
umask 002

## Load basic environment
module reset
module load pytorch/2.0.1

module use -a /global/cfs/cdirs/m4133/jyc/perlmutter/sw/modulefiles
module load hydragnn/pytorch2.0.1-v2
module use -a /global/cfs/cdirs/m4133/c8l/sw/modulefiles
module load deepspeed

## MPI envs
export MPICH_ENV_DISPLAY=0
export MPICH_VERSION_DISPLAY=0
export MPICH_GPU_SUPPORT_ENABLED=0

## HydraGNN envs
HYDRAGNN_DIR=/global/cfs/cdirs/m4716/c8l/HydraGNN
export PYTHONPATH=$HYDRAGNN_DIR:$PYTHONPATH

export HYDRAGNN_NUM_WORKERS=0
export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
export HYDRAGNN_AGGR_BACKEND=mpi
export HYDRAGNN_VALTEST=1
export HYDRAGNN_TRACE_LEVEL=0

## Dataset envs
DATASET_PATH="/global/cfs/projectdirs/m4716/mlupopa/HydraGNN/examples/multidataset_hpo/dataset"
DATASET_LIST="MPTrj-v3,ANI1x-v3,OC2020-20M-v3,OC2022-v3,qm7x-v3"

## Task 1: outer loop over WIDTH, inner loop over DEPTH; fixed DS, ZERO, and CKPT
for WIDTH in 800 1100 1700 2500; do
    for DEPTH in 4 5 6; do
        LOG_NAME="exp-${DEPTH}_depth-${WIDTH}_width-0.6_TB_data-${SLURM_NNODES}_nodes"

        ## Calculate batch size and num_samples
        BS=$((32 * 32 / SLURM_NNODES))  # dynamic calculation of batch size
        NS=$(echo "scale=0; 285715 / 1.2 * 0.6 * 32 / $SLURM_NNODES" | bc)  # fixed DS=0.6

        ## Handle optional arguments
        EXTRA_ARGS="--zero_opt"

        ## Run script
        set -x

        srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
            python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
                --inputfile=base.json \
                --dataset_path=$DATASET_PATH \
                --multi \
                --multi_model_list=$DATASET_LIST \
                --num_epoch=10 \
                --everyone --ddstore \
                --log=$LOG_NAME \
                --hidden_dim=${WIDTH} \
                --num_conv_layers=${DEPTH} \
                --full_test \
                --batch_size=${BS} \
                --num_samples=${NS} \
                ${EXTRA_ARGS}

        set +x
    done
done

## Task 2: outer loop over WIDTH, inner loop over DS; fixed DEPTH and ZERO, varying CKPT
for WIDTH in 2500 5400; do
    for DS in 0.2 0.6 1.2; do
        LOG_NAME="exp-3_depth-${WIDTH}_width-${DS}_TB_data-${SLURM_NNODES}_nodes"

        ## Calculate batch size and num_samples
        BS=$((32 * 32 / SLURM_NNODES))  # dynamic calculation of batch size
        NS=$(echo "scale=0; 285715 / 1.2 * ${DS} * 32 / $SLURM_NNODES" | bc)  # dynamic DS

        ## Handle optional arguments
        EXTRA_ARGS="--zero_opt"
        if [ "$WIDTH" = "5400" ]; then
            EXTRA_ARGS+=" --conv_checkpointing"  # checkpoint conv layers for the widest model
        fi

        ## Run script
        set -x

        srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
            python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
                --inputfile=base.json \
                --dataset_path=$DATASET_PATH \
                --multi \
                --multi_model_list=$DATASET_LIST \
                --num_epoch=10 \
                --everyone --ddstore \
                --log=$LOG_NAME \
                --hidden_dim=${WIDTH} \
                --num_conv_layers=3 \
                --full_test \
                --batch_size=${BS} \
                --num_samples=${NS} \
                ${EXTRA_ARGS}

        set +x
    done
done
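The BS and NS formulas rescale the per-GPU batch size and sample count so the global totals stay fixed as the node count changes; the calibration point is 32 nodes with batch size 32 and 285715 samples per GPU (per the comments in job-perlmutter.sh below). A small Python check of the same arithmetic (function names are illustrative, not from the repo):

# Sanity check of the BS/NS expressions, mirroring the bash/bc arithmetic.
def batch_size(nnodes):
    # BS=$((32 * 32 / SLURM_NNODES)) -- bash does truncating integer division
    return 32 * 32 // nnodes

def num_samples(nnodes, ds_tb):
    # NS = 285715 / 1.2 * DS * 32 / SLURM_NNODES, truncated (bc with scale=0);
    # 285715 samples corresponds to the full 1.2 TB dataset share
    return int(285715 / 1.2 * ds_tb * 32 / nnodes)

print(batch_size(32))        # 32     (the calibration point)
print(batch_size(8))         # 128    (fewer nodes -> larger per-GPU batch)
print(num_samples(32, 0.6))  # 142857 (samples for a 0.6 TB share on 32 nodes)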
job-perlmutter.sh (86 additions):

#!/bin/bash
#SBATCH -A m4716
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 48:00:00
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH -c 32

# Retrieve the number of nodes set via `sbatch -N` or in the script
echo "Number of nodes allocated: $SLURM_NNODES"

WIDTH=${1:-50}    # default hidden_dim: 50
DEPTH=${2:-3}     # default num_conv_layers: 3
DS=${3:-1.2}      # default dataset share: 1.2 TB
ZERO=${4:-False}  # default: ZeRO optimizer disabled
CKPT=${5:-False}  # default: conv checkpointing disabled

## Remove write permission for others on newly created files and dirs
umask 002

## Load basic environment
module reset
module load pytorch/2.0.1

module use -a /global/cfs/cdirs/m4133/jyc/perlmutter/sw/modulefiles
module load hydragnn/pytorch2.0.1-v2
module use -a /global/cfs/cdirs/m4133/c8l/sw/modulefiles
module load deepspeed

## MPI envs
export MPICH_ENV_DISPLAY=0
export MPICH_VERSION_DISPLAY=0
export MPICH_GPU_SUPPORT_ENABLED=0

## HydraGNN envs
HYDRAGNN_DIR=/global/cfs/cdirs/m4716/c8l/HydraGNN
export PYTHONPATH=$HYDRAGNN_DIR:$PYTHONPATH

export HYDRAGNN_NUM_WORKERS=0
export HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1
export HYDRAGNN_AGGR_BACKEND=mpi
export HYDRAGNN_VALTEST=1
export HYDRAGNN_TRACE_LEVEL=0

## Dataset envs
DATASET_PATH="/global/cfs/projectdirs/m4716/mlupopa/HydraGNN/examples/multidataset_hpo/dataset"
DATASET_LIST="MPTrj-v3,ANI1x-v3,OC2020-20M-v3,OC2022-v3,qm7x-v3"

## Log envs
LOG_NAME="exp-${DEPTH}_depth-${WIDTH}_width-${DS}_TB_data-${SLURM_NNODES}_nodes"

## Calculate batch size and num_samples
BS=$((32 * 32 / SLURM_NNODES))  # default calibration: 32 nodes with batch size 32 per GPU
NS=$(echo "scale=0; 285715 / 1.2 * ${DS} * 32 / $SLURM_NNODES" | bc)  # default calibration: 32 nodes with 285715 num_samples per GPU

## Handle optional arguments
EXTRA_ARGS=""
if [ "$ZERO" = "True" ]; then
    EXTRA_ARGS+=" --zero_opt"
fi
if [ "$CKPT" = "True" ]; then
    EXTRA_ARGS+=" --conv_checkpointing"
fi

## Run script
set -x

srun -N${SLURM_NNODES} -n$((SLURM_NNODES*4)) -c32 --ntasks-per-node=4 --gpus-per-task=1 \
    python -u $HYDRAGNN_DIR/examples/multidataset_deepspeed/train.py \
        --inputfile=base.json \
        --dataset_path=$DATASET_PATH \
        --multi \
        --multi_model_list=$DATASET_LIST \
        --num_epoch=10 \
        --everyone --ddstore \
        --log=$LOG_NAME \
        --hidden_dim=${WIDTH} \
        --num_conv_layers=${DEPTH} \
        --full_test \
        --batch_size=${BS} \
        --num_samples=${NS} \
        ${EXTRA_ARGS}

set +x
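The five positional arguments map to WIDTH, DEPTH, DS, ZERO, and CKPT, so for example `sbatch -N 32 job-perlmutter.sh 2500 6 0.6 True True` trains a width-2500, depth-6 model on the 0.6 TB share with both ZeRO and conv checkpointing enabled. The submission script below describes the ZeRO option as "enable zero optimizer with stage 1"; a minimal sketch of the kind of DeepSpeed config that corresponds to (the actual wiring inside train.py is not shown in this diff, so this is illustrative only):

# Illustrative ZeRO stage-1 DeepSpeed config: stage 1 partitions only the
# optimizer states across data-parallel ranks, cutting optimizer memory
# roughly by the number of ranks without sharding gradients or parameters.
ds_config = {
    "train_micro_batch_size_per_gpu": 32,  # matches BS at the 32-node calibration
    "zero_optimization": {
        "stage": 1,
    },
}

# A config like this is typically passed to deepspeed.initialize(...), which
# returns a wrapped engine handling forward/backward/step.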
New file (29 additions), the Python job-submission script:

import subprocess
import argparse


def submit_job(nodes, width, depth, dataset_size, zero=False, ckpt=False):
    # Command to execute
    command = ["sbatch", "-N", str(nodes), "job-perlmutter.sh", str(width), str(depth), str(dataset_size), str(zero), str(ckpt)]
    # Run the command and capture output
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    stdout, stderr = process.communicate()
    # Extract the job ID from sbatch's "Submitted batch job <id>" output
    output = stdout.strip()
    job_id = int(output.split()[-1])
    return job_id


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Submit jobs with varying parameters.")
    parser.add_argument("--width", type=int, required=True, help="Width of the model.")
    parser.add_argument("--depth", type=int, required=True, help="Depth of the model.")
    parser.add_argument("--zero", action="store_true", default=False, help="Enable zero optimizer with stage 1.")
    parser.add_argument("--ckpt", action="store_true", default=False, help="Enable checkpointing for conv layers.")

    args = parser.parse_args()

    dataset_size_list = [0.1, 0.2, 0.4, 0.6]
    nodes_list = [8, 16, 32, 32]

    for dataset_size, nodes in zip(dataset_size_list, nodes_list):
        job_id = submit_job(nodes, args.width, args.depth, dataset_size, args.zero, args.ckpt)
        print(job_id)
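Invoked with, for example, `--width 2500 --depth 6 --zero --ckpt`, the script submits one job per (dataset size, node count) pair, scaling the 0.1/0.2/0.4/0.6 TB shares across 8/16/32/32 nodes respectively, and prints each SLURM job ID parsed from sbatch's output.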