
Commit

chore(lora): Added some comments and linting for the end-to-end test for lora training.
ajude2s committed Sep 30, 2024
1 parent d12bc98 commit 1bb3be6
Showing 3 changed files with 40 additions and 406 deletions.
tests/fine_tuning/test_configs/config_lorem_ipsum_lora_training.yaml: 175 changes (36 additions & 139 deletions)
@@ -1,71 +1,38 @@
settings:
experiment_id: ${modalities_env:experiment_id}
config_file_path: ${modalities_env:config_file_path}
referencing_keys:
sample_key: input_ids
target_key: target_ids
training:
training_log_interval_in_steps: 2
checkpointing_interval_in_steps: 2
evaluation_interval_in_steps: 40
training_log_interval_in_steps: 1
checkpointing_interval_in_steps: 15
evaluation_interval_in_steps: 1
global_num_seen_tokens: 0
activation_checkpointing_modules: [ ]
gradient_acc_steps: 10
local_train_micro_batch_size: 2
sequence_length: 2048
do_apply_activation_checkpointing: false
gradient_acc_steps: 1
local_train_micro_batch_size: 1
sequence_length: 256
cuda_env:
local_rank: ${cuda_env:LOCAL_RANK}
global_rank: ${cuda_env:RANK}
world_size: ${cuda_env:WORLD_SIZE}
paths:
checkpointing_path: xxx # is overwritten during test
checkpointing_path: /tmp/pytest-of-alex-jude/pytest-96

collate_fn:
component_key: collate_fn
variant_key: mask_loss_collator_wrapper
variant_key: gpt_2_llm_collator
config:
wrapped_collate_fn:
component_key: collate_fn
variant_key: gpt_2_llm_collator
config:
sample_key: ${settings.referencing_keys.sample_key}
target_key: ${settings.referencing_keys.target_key}
target_keys_to_mask:
- ${settings.referencing_keys.target_key}
loss_ignore_index: -100
mask_tokens:
b_include_to_loss_token: ^
e_include_to_loss_token: $
tokenizer:
instance_key: tokenizer
pass_type: BY_REFERENCE
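
This change switches the collator between the wrapped mask_loss_collator_wrapper and the plain gpt_2_llm_collator, which only needs the sample and target keys. As a point of reference, a minimal sketch of what such a causal-LM collator typically does (shift the token window by one to build targets); the function name and batching are illustrative, not the modalities implementation:

# Illustrative sketch only, not the modalities collator. A causal-LM collator
# stacks the tokenized samples and derives target_ids by shifting input_ids
# one position to the left.
import torch

def collate_gpt2_style(batch, sample_key="input_ids", target_key="target_ids"):
    tokens = torch.stack([torch.tensor(sample[sample_key]) for sample in batch])
    return {
        sample_key: tokens[:, :-1],  # model input
        target_key: tokens[:, 1:],   # next-token targets
    }

Judging by the key names, the wrapper variant additionally sets every target outside the span between the include-to-loss markers (^ and $) to the loss_ignore_index of -100, so only tokens inside those markers contribute to the loss.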

train_dataset:
component_key: dataset
variant_key: packed_mem_map_dataset_continuous
config:
raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_train.7e71e5e.pbin
sequence_length: ${settings.training.sequence_length}
sample_key: ${settings.referencing_keys.sample_key}
reuse_last_target: false

val_dataset:
component_key: dataset
variant_key: packed_mem_map_dataset_continuous
config:
raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
sequence_length: ${settings.training.sequence_length}
sample_key: ${settings.referencing_keys.sample_key}
reuse_last_target: false
target_key: ${settings.referencing_keys.target_key}

test_dataset:
train_dataset:
component_key: dataset
variant_key: packed_mem_map_dataset_continuous
config:
raw_data_path: tests/test_data/lorem_ipsum_sft_7e71e5e/lorem_ipsum_sft_converted_test.7e71e5e.pbin
raw_data_path: tests/end2end_tests/lorem_ipsum.pbin
sequence_length: ${settings.training.sequence_length}
sample_key: ${settings.referencing_keys.sample_key}
reuse_last_target: false
sample_key: ${settings.referencing_keys.sample_key}
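
Both versions of train_dataset use the packed_mem_map_dataset_continuous variant, i.e. a memory-mapped, packed token file that is cut into fixed-length samples of sequence_length tokens. A rough sketch of the slicing idea, under the simplifying assumption that the .pbin file boils down to one flat token array (the real packed format and its index handling live in the modalities code base):

# Rough sketch of continuous packing, assuming a flat token array;
# the actual .pbin layout and indexing are handled by modalities.
import numpy as np

def slice_continuous(tokens: np.ndarray, sequence_length: int):
    num_samples = len(tokens) // sequence_length  # trailing remainder is ignored here
    for i in range(num_samples):
        yield tokens[i * sequence_length : (i + 1) * sequence_length]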

train_dataloader:
component_key: data_loader
@@ -74,7 +41,16 @@ train_dataloader:
num_workers: 2
pin_memory: true
shuffle: false
dataloader_tag: train
dataloader_tag: "train"
skip_num_batches:
component_key: number_conversion
variant_key: local_num_batches_from_num_tokens
config:
num_ranks: ${settings.cuda_env.world_size}
global_num_tokens: ${settings.training.global_num_seen_tokens}
sequence_length: ${settings.training.sequence_length}
local_micro_batch_size: ${settings.training.local_train_micro_batch_size}

dataset:
instance_key: train_dataset
pass_type: BY_REFERENCE
@@ -98,73 +74,7 @@ train_dataloader:
instance_key: collate_fn
pass_type: BY_REFERENCE
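
The skip_num_batches entry added to the train_dataloader uses the local_num_batches_from_num_tokens conversion, i.e. it translates the globally seen token count into the number of per-rank batches to skip when resuming; with global_num_seen_tokens set to 0 this resolves to 0. A back-of-the-envelope version of that arithmetic, assuming tokens are split evenly across ranks (the exact rounding in modalities may differ):

# Back-of-the-envelope version of the number conversion; assumes an even
# split of tokens across ranks, so the exact rounding in modalities may differ.
def local_num_batches_from_num_tokens(global_num_tokens: int,
                                      num_ranks: int,
                                      sequence_length: int,
                                      local_micro_batch_size: int) -> int:
    # tokens consumed per dataloader step, summed over all ranks
    tokens_per_batch_step = num_ranks * local_micro_batch_size * sequence_length
    return global_num_tokens // tokens_per_batch_step

# With the values from this config (0 seen tokens, sequence_length 256,
# micro batch size 1, world size e.g. 2): nothing is skipped.
assert local_num_batches_from_num_tokens(0, 2, 256, 1) == 0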

val_dataloader:
component_key: data_loader
variant_key: default
config:
num_workers: 2
pin_memory: true
shuffle: false
dataloader_tag: "val"
dataset:
instance_key: val_dataset
pass_type: BY_REFERENCE
batch_sampler:
component_key: batch_sampler
variant_key: default
config:
batch_size: 2
drop_last: true
sampler:
component_key: sampler
variant_key: distributed_sampler
config:
rank: ${settings.cuda_env.global_rank}
num_replicas: ${settings.cuda_env.world_size}
shuffle: false
dataset:
instance_key: val_dataset
pass_type: BY_REFERENCE
collate_fn:
instance_key: collate_fn
pass_type: BY_REFERENCE

test_dataloader:
component_key: data_loader
variant_key: default
config:
num_workers: 2
pin_memory: true
shuffle: false
dataloader_tag: "test"
dataset:
instance_key: test_dataset
pass_type: BY_REFERENCE
batch_sampler:
component_key: batch_sampler
variant_key: default
config:
batch_size: 2
drop_last: true
sampler:
component_key: sampler
variant_key: distributed_sampler
config:
rank: ${settings.cuda_env.global_rank}
num_replicas: ${settings.cuda_env.world_size}
shuffle: false
dataset:
instance_key: test_dataset
pass_type: BY_REFERENCE
collate_fn:
instance_key: collate_fn
pass_type: BY_REFERENCE

eval_dataloaders:
- instance_key: val_dataloader
pass_type: BY_REFERENCE
- instance_key: test_dataloader
pass_type: BY_REFERENCE
eval_dataloaders: []

checkpoint_saving:
component_key: checkpoint_saving
@@ -174,7 +84,7 @@ checkpoint_saving:
component_key: checkpoint_saving_strategy
variant_key: save_k_most_recent_checkpoints_strategy
config:
k: 1
k: -1 # -1 to save all checkpoints
checkpoint_saving_execution:
component_key: checkpoint_saving_execution
variant_key: fsdp
@@ -190,14 +100,15 @@ checkpoint_saving:
local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
sequence_length: ${settings.training.sequence_length}
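
The retention setting in the checkpoint_saving block above toggles between k: 1 and k: -1, and the inline comment notes that -1 saves all checkpoints. A small sketch of how such a "keep the k most recent" strategy typically behaves, with -1 as the keep-everything sentinel (illustrative, not the modalities class):

# Illustrative "keep the k most recent checkpoints" policy; k = -1 keeps all.
def checkpoints_to_delete(saved_checkpoints: list[str], k: int) -> list[str]:
    if k == -1 or len(saved_checkpoints) <= k:
        return []
    # saved_checkpoints is assumed to be ordered oldest to newest
    return saved_checkpoints[: len(saved_checkpoints) - k]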


# resolving class types via different enums sucks...
loss_fn:
component_key: loss
variant_key: clm_cross_entropy_loss
config:
target_key: target_ids
prediction_key: logits
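
The loss_fn block defines a plain causal-language-modelling cross entropy over the prediction and target keys named above. A sketch of what that amounts to in PyTorch terms (standard cross entropy, not the modalities class; F.cross_entropy's default ignore_index of -100 matches the loss_ignore_index used by the mask-loss collator wrapper):

# Sketch of a CLM cross entropy over the configured keys; standard PyTorch,
# not the modalities implementation. ignore_index defaults to -100.
import torch.nn.functional as F

def clm_cross_entropy(batch, prediction_key="logits", target_key="target_ids"):
    logits = batch[prediction_key]   # (batch, seq_len, vocab_size)
    targets = batch[target_key]      # (batch, seq_len), dtype long
    return F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))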


lora_model:
component_key: model
variant_key: lora
@@ -253,7 +164,7 @@ model_raw:
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 2
n_head_q: 8
n_head_kv: 4
n_head_kv: 8
ffn_hidden: 128
n_embd: 128
dropout: 0.0
@@ -266,7 +177,7 @@ model_raw:
n_head: ${model_raw.config.n_head_q} #it has to be head_q here
seq_length_dim: -2
attention_implementation: manual
activation_type: swiglu
activation_type: gelu
attention_norm:
component_key: layer_norm
variant_key: rms_norm
@@ -289,36 +200,23 @@ model_raw:
bias: true
epsilon: 1e-5
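
The lora_model section above (its body is collapsed in this view) applies the lora model variant, presumably on top of model_raw, while the visible model_raw hunks only touch n_head_kv and the activation type. As a reminder of what a LoRA layer does, a minimal sketch of the standard low-rank update; this is generic LoRA math with illustrative rank and alpha values, not the modalities implementation:

# Generic LoRA sketch: a frozen base projection plus a trainable low-rank
# update scaled by alpha / r. Not the modalities implementation; r and alpha
# are illustrative defaults.
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False         # base weights stay frozen
        self.lora_a = nn.Linear(base.in_features, r, bias=False)
        self.lora_b = nn.Linear(r, base.out_features, bias=False)
        nn.init.zeros_(self.lora_b.weight)  # adapter starts as a no-op
        self.scaling = alpha / r

    def forward(self, x):
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(x))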

tokenizer:
component_key: tokenizer
variant_key: pretrained_hf_tokenizer
config:
pretrained_model_name_or_path: data/tokenizer/hf_gpt2
padding: false
truncation: false

scheduler:
component_key: scheduler
variant_key: onecycle_lr
variant_key: dummy_lr
config:
optimizer:
instance_key: optimizer
pass_type: BY_REFERENCE
max_lr: 6e-4
div_factor: 10
final_div_factor: 1
total_steps: 276
pct_start: 0.01
anneal_strategy: cos
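
One side of this change uses the onecycle_lr scheduler variant, whose settings (max_lr, div_factor, final_div_factor, total_steps, pct_start, anneal_strategy) mirror the arguments of PyTorch's OneCycleLR; the other side uses dummy_lr. Roughly how those settings would map onto torch, with a stand-in model and optimizer rather than the ones built from this config:

# Rough mapping of the onecycle settings onto torch's OneCycleLR.
# The model and optimizer here are stand-ins, not the ones from this config.
import torch

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=6e-4,
    div_factor=10,        # initial lr = max_lr / div_factor
    final_div_factor=1,   # final lr = initial lr / final_div_factor
    total_steps=276,
    pct_start=0.01,       # ~1% of the steps spent warming up
    anneal_strategy="cos",
)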

optimizer:
component_key: optimizer
variant_key: adam_w
config:
lr: 0.0001
betas: [ 0.9, 0.95 ]
betas: [0.9, 0.95]
eps: 1e-8
weight_decay: 1e-1
weight_decay_groups_excluded: [ ]
weight_decay_groups_excluded: ["embedding", "layernorm"]
wrapped_model:
instance_key: wrapped_model
pass_type: BY_REFERENCE
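
The optimizer is an adam_w variant; one side of this change lists "embedding" and "layernorm" in weight_decay_groups_excluded, the common pattern of giving those parameter groups a weight decay of 0. A sketch of how such a grouping is usually passed to AdamW; the substring matching on parameter names is illustrative, not how modalities resolves the groups:

# Sketch of the usual "no weight decay for embeddings / norms" grouping;
# the substring matching is illustrative, not how modalities builds its groups.
import torch

def build_param_groups(model: torch.nn.Module, weight_decay: float = 0.1):
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "embedding" in name.lower() or "norm" in name.lower():
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]

# optimizer = torch.optim.AdamW(build_param_groups(model), lr=1e-4, betas=(0.9, 0.95), eps=1e-8)
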
@@ -350,11 +248,10 @@ batch_progress_subscriber:
train_dataloader:
instance_key: train_dataloader
pass_type: BY_REFERENCE
eval_dataloaders:
instance_key: eval_dataloaders
pass_type: BY_REFERENCE
eval_dataloaders: []


evaluation_subscriber:
component_key: results_subscriber
variant_key: dummy
config: {}
config: {}