commit 74a6fa4 (parent 530b4f3)
liushixuan committed Mar 28, 2024

Showing 1 changed file with 140 additions and 152 deletions.
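Although the commit replaces the file wholesale, the removed and added versions below differ only in save_zero_three_model: the rewrite drops the unused index_dict / total_size bookkeeping, the WEIGHTS_NAME output path, and the non-ZeRO-3 torch.save branch (weights are now written only when zero_stage == 3, via save_pretrained), and it creates the checkpoint directory on rank 0 only.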
@@ -1,152 +1,140 @@

Removed (the previous version of the file):

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

import os
import torch
import random
import numpy as np
from transformers import set_seed, AutoTokenizer
import deepspeed
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import re
import shutil

def print_rank_0(msg, rank=0):
    if rank <= 0:
        print(msg)


def to_device(batch, device):
    output = {}
    for k, v in batch.items():
        try:
            output[k] = v.to(device)
        except Exception:  # non-tensor values stay as-is
            output[k] = v
    return output


def get_dtype_size(dtype):
    if dtype == torch.bool:
        return 1 / 8
    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    return bit_size // 8

def save_hf_format(model, tokenizer, args, sub_folder=""):
    model_to_save = model.module if hasattr(model, 'module') else model
    CONFIG_NAME = "config.json"
    output_dir = os.path.join(args.output_dir, sub_folder)
    os.makedirs(output_dir, exist_ok=True)
    output_config_file = os.path.join(output_dir, CONFIG_NAME)
    save_dict = model_to_save.state_dict()
    for key in list(save_dict.keys()):
        if "lora_" in key:
            del save_dict[key]
    model_to_save.save_pretrained(output_dir, state_dict=save_dict)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_pretrained(output_dir)
    # for models not in AutoModel, copy python module files
    train_from_model_path = model_to_save.config._name_or_path
    if os.path.exists(train_from_model_path):
        for filename in os.listdir(train_from_model_path):
            if filename.endswith(".py"):
                shutil.copy(os.path.join(train_from_model_path, filename),
                            os.path.join(output_dir, filename))

def set_random_seed(seed):
    if seed is not None:
        set_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


def get_all_reduce_mean(tensor):
    torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
    tensor = tensor / torch.distributed.get_world_size()
    return tensor

def get_optimizer_grouped_parameters(model,
                                     weight_decay,
                                     no_decay_name_list=[
                                         "bias", "LayerNorm.weight"
                                     ]):
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if (not any(nd in n
                            for nd in no_decay_name_list) and p.requires_grad)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if (any(nd in n
                        for nd in no_decay_name_list) and p.requires_grad)
            ],
            "weight_decay": 0.0,
        },
    ]
    return optimizer_grouped_parameters

def _z3_params_to_fetch(param_list):
    return [
        p for p in param_list
        if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE
    ]

def save_zero_three_model(model, tokenizer, args, sub_folder=""):
    zero_stage_3 = (args.zero_stage == 3)
    os.makedirs(args.output_dir, exist_ok=True)
    WEIGHTS_NAME = "pytorch_model_00001-of-00001.bin"
    output_dir = os.path.join(args.output_dir, sub_folder)
    os.makedirs(output_dir, exist_ok=True)
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    model_to_save = model.module if hasattr(model, 'module') else model

    index_dict = {"weight_map": {}, "metadata": {}}
    total_size = 0

    if not zero_stage_3:
        if args.global_rank == 0:
            torch.save(model_to_save.state_dict(), output_model_file)
            for k, v in model_to_save.named_parameters():
                if "lora_" not in k:
                    index_dict["weight_map"][k] = WEIGHTS_NAME
                    total_size += v.numel() * get_dtype_size(v.dtype)
    else:
        output_state_dict = {}
        for k, v in model_to_save.named_parameters():
            if hasattr(v, 'ds_id'):
                with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v]),
                                                       enabled=zero_stage_3):
                    v_p = v.data.cpu()
            else:
                v_p = v.cpu()
            if args.global_rank == 0 and "lora" not in k:
                output_state_dict[k] = v_p
        if args.global_rank == 0:
            model_to_save.save_pretrained(output_dir, state_dict=output_state_dict)
        del output_state_dict

    if args.global_rank == 0:
        output_config_file = os.path.join(output_dir, "config.json")
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_pretrained(output_dir)
        # for models not in AutoModel, copy python module files
        train_from_model_path = model_to_save.config._name_or_path
        if os.path.exists(train_from_model_path):
            for filename in os.listdir(train_from_model_path):
                if filename.endswith(".py"):
                    shutil.copy(os.path.join(train_from_model_path, filename),
                                os.path.join(output_dir, filename))

Added (the new version of the file):

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

import os
import torch
import random
import numpy as np
from transformers import set_seed, AutoTokenizer
import deepspeed
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import re
import shutil

def print_rank_0(msg, rank=0):
    if rank <= 0:
        print(msg)


def to_device(batch, device):
    output = {}
    for k, v in batch.items():
        try:
            output[k] = v.to(device)
        except Exception:  # non-tensor values stay as-is
            output[k] = v
    return output


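# get_dtype_size returns the per-element size in bytes: for example,
# str(torch.float16) is "torch.float16", the regex captures the trailing "16",
# and the function returns 16 // 8 == 2; torch.bool is special-cased as 1/8.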
def get_dtype_size(dtype):
    if dtype == torch.bool:
        return 1 / 8
    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    return bit_size // 8


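# save_hf_format writes a Hugging Face-style checkpoint (weights, config,
# tokenizer) from whichever rank calls it, stripping LoRA tensors ("lora_"
# keys) from the saved state dict.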
def save_hf_format(model, tokenizer, args, sub_folder=""):
    model_to_save = model.module if hasattr(model, 'module') else model
    CONFIG_NAME = "config.json"
    output_dir = os.path.join(args.output_dir, sub_folder)
    os.makedirs(output_dir, exist_ok=True)
    output_config_file = os.path.join(output_dir, CONFIG_NAME)
    save_dict = model_to_save.state_dict()
    for key in list(save_dict.keys()):
        if "lora_" in key:
            del save_dict[key]
    model_to_save.save_pretrained(output_dir, state_dict=save_dict)
    model_to_save.config.to_json_file(output_config_file)
    tokenizer.save_pretrained(output_dir)
    # for models not in AutoModel, copy python module files
    train_from_model_path = model_to_save.config._name_or_path
    if os.path.exists(train_from_model_path):
        for filename in os.listdir(train_from_model_path):
            if filename.endswith(".py"):
                shutil.copy(os.path.join(train_from_model_path, filename),
                            os.path.join(output_dir, filename))

def set_random_seed(seed):
    if seed is not None:
        set_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


def get_all_reduce_mean(tensor):
    torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
    tensor = tensor / torch.distributed.get_world_size()
    return tensor


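# Builds the two standard parameter groups expected by torch.optim-style
# optimizers: weight decay for most parameters, and no decay for biases and
# LayerNorm weights.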
def get_optimizer_grouped_parameters(model,
                                     weight_decay,
                                     no_decay_name_list=[
                                         "bias", "LayerNorm.weight"
                                     ]):
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if (not any(nd in n
                            for nd in no_decay_name_list) and p.requires_grad)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if (any(nd in n
                        for nd in no_decay_name_list) and p.requires_grad)
            ],
            "weight_decay": 0.0,
        },
    ]
    return optimizer_grouped_parameters


def _z3_params_to_fetch(param_list):
    return [
        p for p in param_list
        if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE
    ]


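# Under ZeRO stage 3 each rank holds only a shard of every parameter, so all
# ranks must call this function together: GatheredParameters briefly
# materializes each full parameter so that rank 0 can copy it to CPU and save.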
def save_zero_three_model(model, tokenizer, args, sub_folder=""):
    zero_stage_3 = (args.zero_stage == 3)
    os.makedirs(args.output_dir, exist_ok=True)
    if args.global_rank == 0:
        output_dir = os.path.join(args.output_dir, sub_folder)
        os.makedirs(output_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model

    if zero_stage_3:
        output_state_dict = {}
        for k, v in model_to_save.named_parameters():
            if hasattr(v, 'ds_id'):
                with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v]),
                                                       enabled=zero_stage_3):
                    v_p = v.data.cpu()
            else:
                v_p = v.cpu()
            if args.global_rank == 0 and "lora" not in k:
                output_state_dict[k] = v_p
        if args.global_rank == 0:
            model_to_save.save_pretrained(output_dir, state_dict=output_state_dict)
        del output_state_dict

    if args.global_rank == 0:
        output_config_file = os.path.join(output_dir, "config.json")
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_pretrained(output_dir)
        # for models not in AutoModel, copy python module files
        train_from_model_path = model_to_save.config._name_or_path
        if os.path.exists(train_from_model_path):
            for filename in os.listdir(train_from_model_path):
                if filename.endswith(".py"):
                    shutil.copy(os.path.join(train_from_model_path, filename),
                                os.path.join(output_dir, filename))
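
For orientation, here is a minimal sketch of how these helpers might be wired into a DeepSpeed training script; it is an illustration, not part of the commit. Only the args fields output_dir, zero_stage, and global_rank mirror what the functions above expect; the module name utils, the placeholder checkpoint facebook/opt-125m, and the DeepSpeed config are assumptions.

# Hypothetical usage sketch (not part of this commit). Assumes the file above
# is saved as utils.py; model name and DeepSpeed config are placeholders.
import argparse

import deepspeed
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils import (get_optimizer_grouped_parameters, save_hf_format,
                   save_zero_three_model, set_random_seed, to_device)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, default="./output")
    parser.add_argument("--zero_stage", type=int, default=3)
    parser.add_argument("--local_rank", type=int, default=-1)
    args = parser.parse_args()

    deepspeed.init_distributed()
    args.global_rank = torch.distributed.get_rank()
    set_random_seed(1234)

    model_name = "facebook/opt-125m"  # placeholder checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    ds_config = {
        "train_micro_batch_size_per_gpu": 2,
        "optimizer": {"type": "AdamW", "params": {"lr": 1e-5}},
        "zero_optimization": {"stage": args.zero_stage},
    }
    engine, _, _, _ = deepspeed.initialize(
        model=model,
        model_parameters=get_optimizer_grouped_parameters(model,
                                                          weight_decay=0.1),
        config=ds_config)

    # ... training loop: move each batch with to_device(batch, engine.device),
    # then engine(...), engine.backward(loss), engine.step() ...

    # ZeRO-3 shards every parameter across ranks, so all ranks must enter
    # save_zero_three_model; without ZeRO-3 a plain rank-0 save suffices.
    if args.zero_stage == 3:
        save_zero_three_model(engine, tokenizer, args, sub_folder="final")
    elif args.global_rank == 0:
        save_hf_format(engine, tokenizer, args, sub_folder="final")


if __name__ == "__main__":
    main()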