From 29d5f75e66c7f27bec0ad6b75d120106d83c597d Mon Sep 17 00:00:00 2001 From: gnadathur Date: Fri, 1 Mar 2024 18:04:56 -0800 Subject: [PATCH] 2DParallel test --- run_llama_train.sh | 2 +- train_configs/test/basic_integration_test.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/run_llama_train.sh b/run_llama_train.sh index 13b66aeac..37ac1e705 100755 --- a/run_llama_train.sh +++ b/run_llama_train.sh @@ -17,7 +17,7 @@ NGPU=${NGPU:-"8"} LOG_RANK=${LOG_RANK:-0} -CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"} +CONFIG_FILE=${CONFIG_FILE:-"./train_configs/test/basic_integration_test.toml"} torchrun --nproc_per_node=${NGPU} --rdzv_endpoint="localhost:5972" \ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \ diff --git a/train_configs/test/basic_integration_test.toml b/train_configs/test/basic_integration_test.toml index 0a51c090c..bc55d0a08 100644 --- a/train_configs/test/basic_integration_test.toml +++ b/train_configs/test/basic_integration_test.toml @@ -1,7 +1,7 @@ # TorchTrain Config.toml [job] dump_folder = "./outputs" -description = "1DParallel with debug model" +description = "2DParallel with debug model" [profiling] run_profiler = true @@ -31,7 +31,7 @@ warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps max_norm = 1.0 # grad norm clipping steps = 10 data_parallel_degree = -1 -sequence_parallel_degree = 1 +sequence_parallel_degree = 2 pipeline_parallel_degree = 1 compile = false checkpoint_interval = 3600