Commit 23c542d

feat(lora): wip: run e2e test on two GPUs
mrudat-iais committed Sep 24, 2024
1 parent 0d22826 commit 23c542d
Showing 2 changed files with 14 additions and 14 deletions.

First changed file: a YAML training configuration (filename not shown in this capture; 4 additions & 4 deletions)

@@ -44,7 +44,7 @@ train_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
-    raw_data_path: test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_train.7e71e5e.pbin
+    raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_train.7e71e5e.pbin
    sequence_length: ${settings.training.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}
    reuse_last_target: false
@@ -53,7 +53,7 @@ val_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
-    raw_data_path: test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
+    raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
    sequence_length: ${settings.training.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}
    reuse_last_target: false
@@ -62,7 +62,7 @@ test_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
-    raw_data_path: test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
+    raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
    sequence_length: ${settings.training.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}
    reuse_last_target: false
@@ -293,7 +293,7 @@ tokenizer:
  component_key: tokenizer
  variant_key: pretrained_hf_tokenizer
  config:
-    pretrained_model_name_or_path: ../data/tokenizer/hf_gpt2
+    pretrained_model_name_or_path: data/tokenizer/hf_gpt2
    padding: false
    truncation: false

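Note on the config changes above: every edited value is a relative path, rewritten from a tests-directory-relative form (test_data/..., ../data/...) to a repository-root-relative form (tests/test_data/..., data/...). A minimal Python sketch of the assumption behind that rewrite, namely that these paths are resolved against the current working directory at launch time (the resolution rule and the existence check below are illustrative assumptions, not part of this commit):

# Illustrative only: if relative config paths are resolved against the current
# working directory, they are found only when that directory is the repository root.
from pathlib import Path

config_paths = [
    Path("tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_train.7e71e5e.pbin"),
    Path("data/tokenizer/hf_gpt2"),
]

for path in config_paths:
    # From the repository root these exist; from tests/ the old forms
    # ("test_data/..." and "../data/...") would have been needed instead.
    print(f"{path} exists: {path.exists()}")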

Second changed file: tests/fine_tuning/test_lora_training.py (20 changes: 10 additions & 10 deletions)

@@ -10,14 +10,14 @@
from modalities.running_env.cuda_env import CudaEnv
from tests.conftest import _ROOT_DIR

-os.environ["LOCAL_RANK"] = "0"
-os.environ["RANK"] = "0"
-os.environ["WORLD_SIZE"] = "1"
-os.environ["NNODES"] = "1"
-os.environ["NPROC_PER_NODE"] = "1"
-os.environ["RDZV_ENDPOINT"] = "0.0.0.0:29502"
-os.environ["MASTER_ADDR"] = "localhost"
-os.environ["MASTER_PORT"] = "29500"
+# os.environ["LOCAL_RANK"] = "0"
+# os.environ["RANK"] = "0"
+# os.environ["WORLD_SIZE"] = "2"
+# os.environ["NNODES"] = "1"
+# os.environ["NPROC_PER_NODE"] = "2"
+# os.environ["RDZV_ENDPOINT"] = "0.0.0.0:29502"
+# os.environ["MASTER_ADDR"] = "localhost"
+# os.environ["MASTER_PORT"] = "29500"


@pytest.fixture()
@@ -49,8 +49,8 @@ def main_obj(config_file_path, checkpointing_path):


@pytest.mark.skipif(
-    "RANK" not in os.environ or torch.cuda.device_count() < 1,
-    reason="This e2e test requires 1 GPU and a torchrun distributed environment.",
+    "RANK" not in os.environ or torch.cuda.device_count() < 2,
+    reason="This e2e test requires 2 GPUs and a torchrun distributed environment.",
)
def test_lora_model_training_on_one_gpu(main_obj, checkpointing_path):
    with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
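Note on the test changes above: the hard-coded single-process rendezvous variables are commented out, and the skip condition now requires RANK to be set and at least two visible GPUs. The intent, per the updated reason string, is that the distributed environment comes from a torchrun launch rather than from the test module itself. A minimal sketch of that expectation, with the launch line given only as an assumption about how the test would be invoked, not as part of this commit:

# check_dist_env.py (hypothetical helper, not part of this commit).
# When started with something like
#   torchrun --nproc_per_node=2 --rdzv-endpoint=0.0.0.0:29502 check_dist_env.py
# each worker process inherits the distributed environment from the launcher,
# which is why the test no longer needs to set these variables by hand.
import os

for var in ("RANK", "LOCAL_RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT"):
    # Each worker prints its own view: RANK differs per process, WORLD_SIZE is 2.
    print(f"{var}={os.environ.get(var, '<unset>')}")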
