From 23c542d955cdc0b76c2e80b569e95646f38d93ae Mon Sep 17 00:00:00 2001
From: mrudat-iais
Date: Tue, 24 Sep 2024 14:55:57 +0200
Subject: [PATCH] feat(lora): wip: run e2e test on two GPUs

---
 .../config_lorem_ipsum_lora_training.yaml |  8 ++++----
 tests/fine_tuning/test_lora_training.py   | 20 +++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/fine_tuning/test_configs/config_lorem_ipsum_lora_training.yaml b/tests/fine_tuning/test_configs/config_lorem_ipsum_lora_training.yaml
index 7c5dba51..e5c0fd0e 100644
--- a/tests/fine_tuning/test_configs/config_lorem_ipsum_lora_training.yaml
+++ b/tests/fine_tuning/test_configs/config_lorem_ipsum_lora_training.yaml
@@ -44,7 +44,7 @@ train_dataset:
   component_key: dataset
   variant_key: packed_mem_map_dataset_continuous
   config:
-    raw_data_path: test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_train.7e71e5e.pbin
+    raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_train.7e71e5e.pbin
     sequence_length: ${settings.training.sequence_length}
     sample_key: ${settings.referencing_keys.sample_key}
     reuse_last_target: false
@@ -53,7 +53,7 @@ val_dataset:
   component_key: dataset
   variant_key: packed_mem_map_dataset_continuous
   config:
-    raw_data_path: test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
+    raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
     sequence_length: ${settings.training.sequence_length}
     sample_key: ${settings.referencing_keys.sample_key}
     reuse_last_target: false
@@ -62,7 +62,7 @@ test_dataset:
   component_key: dataset
   variant_key: packed_mem_map_dataset_continuous
   config:
-    raw_data_path: test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
+    raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
     sequence_length: ${settings.training.sequence_length}
     sample_key: ${settings.referencing_keys.sample_key}
     reuse_last_target: false
@@ -293,7 +293,7 @@ tokenizer:
   component_key: tokenizer
   variant_key: pretrained_hf_tokenizer
   config:
-    pretrained_model_name_or_path: ../data/tokenizer/hf_gpt2
+    pretrained_model_name_or_path: data/tokenizer/hf_gpt2
     padding: false
     truncation: false

diff --git a/tests/fine_tuning/test_lora_training.py b/tests/fine_tuning/test_lora_training.py
index b6d2a33f..b836d08f 100644
--- a/tests/fine_tuning/test_lora_training.py
+++ b/tests/fine_tuning/test_lora_training.py
@@ -10,14 +10,14 @@
 from modalities.running_env.cuda_env import CudaEnv
 from tests.conftest import _ROOT_DIR

-os.environ["LOCAL_RANK"] = "0"
-os.environ["RANK"] = "0"
-os.environ["WORLD_SIZE"] = "1"
-os.environ["NNODES"] = "1"
-os.environ["NPROC_PER_NODE"] = "1"
-os.environ["RDZV_ENDPOINT"] = "0.0.0.0:29502"
-os.environ["MASTER_ADDR"] = "localhost"
-os.environ["MASTER_PORT"] = "29500"
+# os.environ["LOCAL_RANK"] = "0"
+# os.environ["RANK"] = "0"
+# os.environ["WORLD_SIZE"] = "2"
+# os.environ["NNODES"] = "1"
+# os.environ["NPROC_PER_NODE"] = "2"
+# os.environ["RDZV_ENDPOINT"] = "0.0.0.0:29502"
+# os.environ["MASTER_ADDR"] = "localhost"
+# os.environ["MASTER_PORT"] = "29500"


 @pytest.fixture()
@@ -49,8 +49,8 @@ def main_obj(config_file_path, checkpointing_path):


 @pytest.mark.skipif(
-    "RANK" not in os.environ or torch.cuda.device_count() < 1,
-    reason="This e2e test requires 1 GPU and a torchrun distributed environment.",
+    "RANK" not in os.environ or torch.cuda.device_count() < 2,
+    reason="This e2e test requires 2 GPUs and a torchrun distributed environment.",
 )
 def test_lora_model_training_on_one_gpu(main_obj, checkpointing_path):
     with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
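Note: with the in-file environment bootstrap commented out, the skipif condition ("RANK" not in os.environ) means the test only runs when an external torchrun launcher provides the distributed environment. A possible invocation is sketched below; it is an assumption rather than part of this patch, and simply mirrors the commented-out values above (one node, two processes, rendezvous endpoint 0.0.0.0:29502):

    # hypothetical launch command for the 2-GPU e2e test; adjust GPU ids and port to your setup
    CUDA_VISIBLE_DEVICES=0,1 torchrun --nnodes 1 --nproc_per_node 2 --rdzv_endpoint 0.0.0.0:29502 \
        -m pytest tests/fine_tuning/test_lora_training.py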