feat(lora): Adapted the end-to-end test for lora training.
ajude2s committed Sep 26, 2024
1 parent 23c542d commit d12bc98
Showing 2 changed files with 270 additions and 9 deletions.
@@ -0,0 +1,257 @@
settings:
  experiment_id: ${modalities_env:experiment_id}
  referencing_keys:
    sample_key: input_ids
    target_key: target_ids
  training:
    training_log_interval_in_steps: 1
    checkpointing_interval_in_steps: 15
    evaluation_interval_in_steps: 1
    global_num_seen_tokens: 0
    do_apply_activation_checkpointing: false
    gradient_acc_steps: 1
    local_train_micro_batch_size: 1
    sequence_length: 256
  cuda_env:
    local_rank: ${cuda_env:LOCAL_RANK}
    global_rank: ${cuda_env:RANK}
    world_size: ${cuda_env:WORLD_SIZE}
  paths:
    checkpointing_path: /tmp/pytest-of-alex-jude/pytest-96

collate_fn:
  component_key: collate_fn
  variant_key: gpt_2_llm_collator
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    target_key: ${settings.referencing_keys.target_key}

train_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
    raw_data_path: tests/end2end_tests/lorem_ipsum.pbin
    sequence_length: ${settings.training.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}

train_dataloader:
  component_key: data_loader
  variant_key: default
  config:
    num_workers: 2
    pin_memory: true
    shuffle: false
    dataloader_tag: "train"
    skip_num_batches:
      component_key: number_conversion
      variant_key: local_num_batches_from_num_tokens
      config:
        num_ranks: ${settings.cuda_env.world_size}
        global_num_tokens: ${settings.training.global_num_seen_tokens}
        sequence_length: ${settings.training.sequence_length}
        local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
    dataset:
      instance_key: train_dataset
      pass_type: BY_REFERENCE
    batch_sampler:
      component_key: batch_sampler
      variant_key: default
      config:
        batch_size: ${settings.training.local_train_micro_batch_size}
        drop_last: true
        sampler:
          component_key: sampler
          variant_key: distributed_sampler
          config:
            rank: ${settings.cuda_env.global_rank}
            num_replicas: ${settings.cuda_env.world_size}
            shuffle: true
            dataset:
              instance_key: train_dataset
              pass_type: BY_REFERENCE
    collate_fn:
      instance_key: collate_fn
      pass_type: BY_REFERENCE

eval_dataloaders: []

checkpoint_saving:
  component_key: checkpoint_saving
  variant_key: default
  config:
    checkpoint_saving_strategy:
      component_key: checkpoint_saving_strategy
      variant_key: save_k_most_recent_checkpoints_strategy
      config:
        k: -1 # -1 to save all checkpoints
    checkpoint_saving_execution:
      component_key: checkpoint_saving_execution
      variant_key: fsdp
      config:
        checkpoint_path: ${settings.paths.checkpointing_path} # TODO <replaced_in_test>
        global_rank: ${settings.cuda_env.global_rank}
        experiment_id: ${settings.experiment_id}
        get_num_tokens_from_num_steps_callable:
          component_key: number_conversion
          variant_key: num_tokens_from_num_steps_callable
          config:
            num_ranks: ${settings.cuda_env.world_size}
            local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
            sequence_length: ${settings.training.sequence_length}
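
# Note on the number_conversion components (local_num_batches_from_num_tokens above,
# num_tokens_from_num_steps_callable here, num_steps_from_num_tokens further below):
# they translate between token counts and step/batch counts. For a packed dataset like
# this one, each optimizer step consumes roughly
#   num_ranks * local_train_micro_batch_size * sequence_length * gradient_acc_steps
# tokens, i.e. world_size * 1 * 256 * 1 tokens per step for this config; the exact
# formulas live in modalities' number conversion utilities.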


# resolving class types via different enums sucks...
loss_fn:
  component_key: loss
  variant_key: clm_cross_entropy_loss
  config:
    target_key: target_ids
    prediction_key: logits

lora_model:
  component_key: model
  variant_key: lora
  config:
    alpha: 1
    r: 2
    target_layers:
      - q_attn
      - k_attn
      - v_attn
      - c_proj
    model:
      instance_key: model
      pass_type: BY_REFERENCE

wrapped_model:
  component_key: model
  variant_key: fsdp_wrapped
  config:
    model:
      instance_key: lora_model
      pass_type: BY_REFERENCE
    sync_module_states: true
    mixed_precision_settings: BF_16
    sharding_strategy: FULL_SHARD
    block_names: [GPT2Block]

model:
  component_key: model
  variant_key: model_initialized
  config:
    model:
      instance_key: model_raw
      pass_type: BY_REFERENCE
    model_initializer:
      component_key: model_initialization
      variant_key: composed
      config:
        model_type: gpt2
        weight_init_type: scaled
        mean: 0.0
        std: 0.02
        num_layers: ${model_raw.config.n_layer}

model_raw:
  component_key: model
  variant_key: gpt2
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    poe_type: NOPE
    sequence_length: ${settings.training.sequence_length}
    prediction_key: ${loss_fn.config.prediction_key}
    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: 2
    n_head_q: 8
    n_head_kv: 8
    ffn_hidden: 128
    n_embd: 128
    dropout: 0.0
    bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    attention_config:
      qkv_transforms:
        - type_hint: RotaryTransform
          config:
            n_embd: ${model_raw.config.n_embd}
            n_head: ${model_raw.config.n_head_q} # it has to be head_q here
            seq_length_dim: -2
    attention_implementation: manual
    activation_type: gelu
    attention_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model_raw.config.n_embd}
        bias: true
        epsilon: 1e-5
    ffn_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model_raw.config.n_embd}
        bias: true
        epsilon: 1e-5
    lm_head_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model_raw.config.n_embd}
        bias: true
        epsilon: 1e-5

scheduler:
  component_key: scheduler
  variant_key: dummy_lr
  config:
    optimizer:
      instance_key: optimizer
      pass_type: BY_REFERENCE

optimizer:
  component_key: optimizer
  variant_key: adam_w
  config:
    lr: 0.0001
    betas: [0.9, 0.95]
    eps: 1e-8
    weight_decay: 1e-1
    weight_decay_groups_excluded: ["embedding", "layernorm"]
    wrapped_model:
      instance_key: wrapped_model
      pass_type: BY_REFERENCE

gradient_clipper:
  component_key: gradient_clipper
  variant_key: fsdp
  config:
    wrapped_model:
      instance_key: wrapped_model
      pass_type: BY_REFERENCE
    norm_type: P2_NORM
    max_norm: 1.0

batch_progress_subscriber:
  component_key: progress_subscriber
  variant_key: rich
  config:
    global_rank: ${settings.cuda_env.global_rank}
    global_num_seen_steps:
      component_key: number_conversion
      variant_key: num_steps_from_num_tokens
      config:
        num_ranks: ${settings.cuda_env.world_size}
        local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
        global_num_tokens: ${settings.training.global_num_seen_tokens}
        sequence_length: ${settings.training.sequence_length}
        gradient_acc_steps: ${settings.training.gradient_acc_steps}
    train_dataloader:
      instance_key: train_dataloader
      pass_type: BY_REFERENCE
    eval_dataloaders: []


evaluation_subscriber:
  component_key: results_subscriber
  variant_key: dummy
  config: {}
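For orientation, the lora_model component in the config above applies the standard LoRA reparameterization: with r: 2 and alpha: 1, each targeted projection (q_attn, k_attn, v_attn, c_proj) keeps its pretrained weight frozen and only learns a low-rank update. A minimal PyTorch sketch of that idea (illustrative only; the class and attribute names below are not taken from the modalities code base):

import torch
from torch import nn


class LoRALinear(nn.Module):
    """Wrap a frozen linear layer with a trainable low-rank update (illustrative sketch)."""

    def __init__(self, base: nn.Linear, r: int = 2, alpha: float = 1.0):
        super().__init__()
        self.base = base
        for param in self.base.parameters():
            param.requires_grad = False  # the pretrained projection stays frozen
        self.lora_a = nn.Linear(base.in_features, r, bias=False)   # A: d_in -> r
        self.lora_b = nn.Linear(r, base.out_features, bias=False)  # B: r -> d_out
        nn.init.zeros_(self.lora_b.weight)  # the adapter starts as a no-op
        self.scaling = alpha / r

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # y = W x + (alpha / r) * B A x; only A and B receive gradients
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(x))


# With n_embd = 128 as in the config, each adapted projection adds only
# 2 * r * n_embd = 512 trainable parameters.
adapted = LoRALinear(nn.Linear(128, 128), r=2, alpha=1.0)
trainable = sum(p.numel() for p in adapted.parameters() if p.requires_grad)
print(adapted(torch.randn(4, 128)).shape, trainable)  # torch.Size([4, 128]) 512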
22 changes: 13 additions & 9 deletions tests/fine_tuning/test_lora_training.py
@@ -12,7 +12,7 @@
 
 # os.environ["LOCAL_RANK"] = "0"
 # os.environ["RANK"] = "0"
-# os.environ["WORLD_SIZE"] = "2"
+# os.environ["WORLD_SIZE"] = "1"
 # os.environ["NNODES"] = "1"
 # os.environ["NPROC_PER_NODE"] = "2"
 # os.environ["RDZV_ENDPOINT"] = "0.0.0.0:29502"
@@ -22,7 +22,8 @@
 
 @pytest.fixture()
 def config_file_name() -> str:
-    return "config_lorem_ipsum_lora_training.yaml"
+    # return "config_lorem_ipsum_lora_training.yaml"
+    return "config_lorem_ipsum_lora_training_test.yaml"
 
 
 @pytest.fixture()
@@ -35,7 +36,7 @@ def config_file_path(config_file_name) -> Path:
 
 @pytest.fixture
 def checkpointing_path(tmp_path):
-    return tmp_path / "smol_lora_instruct/"
+    return tmp_path.parent
 
 
 @pytest.fixture
@@ -50,16 +51,19 @@ def main_obj(config_file_path, checkpointing_path):
 
 @pytest.mark.skipif(
     "RANK" not in os.environ or torch.cuda.device_count() < 2,
-    reason="This e2e test requires 2 GPUs and a torchrun distributed environment.",
+    reason="This e2e test requires 2 GPU and a torchrun distributed environment.",
 )
 def test_lora_model_training_on_one_gpu(main_obj, checkpointing_path):
     with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl):
         components = main_obj.build_components(components_model_type=TrainingComponentsInstantiationModel)
         main_obj.run(components)
 
     assert os.path.exists(checkpointing_path)
-    checkpoint_files = [
-        "model" in path.name or "optimizer" in path.name or path.suffix == ".yaml"
-        for path in list(checkpointing_path.glob("*"))[0].glob("*")
-    ]
-    assert sum(checkpoint_files) == 3, "Output of the test i.e. a model checkpoint was not created!"
+
+    checkpoint_files = []
+    for root, dirs, files in os.walk(checkpointing_path):
+        for file in files:
+            if "model" in file or "optimizer" in file or file.endswith('.yaml'):
+                checkpoint_files.append(file)
+    if torch.cuda.current_device() == 0:
+        assert len(checkpoint_files) >= 3, "Output of the test i.e. a model checkpoint was not created!"
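As the skipif guard indicates, the test only runs inside a torchrun-style distributed environment with at least two GPUs. Judging by the commented-out environment variables at the top of the file, a launch along the lines of

    torchrun --nproc_per_node=2 --rdzv_endpoint=0.0.0.0:29502 -m pytest tests/fine_tuning/test_lora_training.py

is the intended invocation; this command is inferred from those variables, not taken from the repository's documentation. The rewritten assertion block then walks the checkpointing path, collects model, optimizer, and config YAML files, and lets only the process on CUDA device 0 check that at least three such files were written.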
