[feat, SFT] Support LigerKernel for SFT (#173)
xingyaoww authored Jan 31, 2025
1 parent fb3793a commit 25fc194
Showing 5 changed files with 75 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/e2e_sft.yml
@@ -54,3 +54,7 @@ jobs:
run: |
ray stop --force
bash tests/sft/run_sft_sp_loss_match.sh
- name: Running gsm8k e2e training tests on 8 L20 GPUs with sequence parallelism and Liger
run: |
ray stop --force
bash tests/sft/run_sft_qwen05_sp2_liger.sh 8 $HOME/ckpts/
32 changes: 32 additions & 0 deletions examples/sft/gsm8k/run_qwen_05_sp2_liger.sh
@@ -0,0 +1,32 @@
set -x

if [ "$#" -lt 2 ]; then
echo "Usage: run_qwen_05_sp2.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest
shift 2

torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=extra_info \
data.response_key=extra_info \
optim.lr=1e-4 \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size=4 \
model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
model.use_liger=True \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct-sp2-liger \
trainer.logger=['console'] \
trainer.default_hdfs_dir=null "$@" \
ulysses_sequence_parallel_size=2 \
use_remove_padding=true
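
For reference, the script takes the GPU count and checkpoint directory as positional arguments and forwards anything after them to the trainer as extra config overrides, e.g. bash examples/sft/gsm8k/run_qwen_05_sp2_liger.sh 8 $HOME/ckpts/ (mirroring the CI invocation of the test variant below).
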
33 changes: 33 additions & 0 deletions tests/sft/run_sft_qwen05_sp2_liger.sh
@@ -0,0 +1,33 @@
set -x

if [ "$#" -lt 2 ]; then
echo "Usage: run_qwen_05_sp2.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so $@ refers to the rest
shift 2

torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=extra_info \
data.response_key=extra_info \
optim.lr=1e-4 \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size=4 \
model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
model.use_liger=True \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct-sp2-liger \
trainer.logger=['console'] \
trainer.total_training_steps=1 \
trainer.default_hdfs_dir=null "$@" \
ulysses_sequence_parallel_size=2 \
use_remove_padding=true
1 change: 1 addition & 0 deletions verl/trainer/config/sft_trainer.yaml
@@ -23,6 +23,7 @@ model:
lora_rank: 0 # Set to positive value to enable LoRA (e.g., 32)
lora_alpha: 16 # LoRA scaling factor
target_modules: all-linear # Target modules for LoRA adaptation
use_liger: False # Patch the model instance with Liger kernels (requires the liger-kernel package)
optim:
lr: 1e-5
betas: [0.9, 0.95]
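
The new flag defaults to False, so existing configs are unaffected; a run opts in with model.use_liger=True on the command line, as the example and test scripts above do.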
5 changes: 5 additions & 0 deletions verl/trainer/fsdp_sft_trainer.py
@@ -213,6 +213,11 @@ def _build_model_optimizer(self):
attn_implementation='flash_attention_2',
trust_remote_code=trust_remote_code)

# Apply Liger kernel if use_liger is enabled
if self.config.model.get('use_liger', False):
from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance
_apply_liger_kernel_to_instance(model=self.model)

if self.config.model.get('lora_rank', 0) > 0:
self.model.enable_input_require_grads()
# Convert config to regular Python types before creating PEFT model
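
For context, a minimal standalone sketch of what the new use_liger branch does, outside the trainer. This is a sketch only: it assumes the liger-kernel package is installed and reuses the Qwen checkpoint from the scripts above.

from transformers import AutoModelForCausalLM
from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance

# Load the HF model exactly as the trainer would.
model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')

# Patch this model instance in place with Liger's fused Triton kernels;
# this is the same call the trainer makes when model.use_liger is True.
_apply_liger_kernel_to_instance(model=model)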
