From bc22fccae930c303964b28e852214165b15fd824 Mon Sep 17 00:00:00 2001 From: neavo Date: Tue, 24 Sep 2024 22:57:02 +0800 Subject: [PATCH] update --- 00.py | 72 +++++++++-------------- 01.py | 35 +++++------- 99.py | 111 ++++++++++++++++++++++++++++++++++++ 99_export.bat | 17 ++++++ model/NERDataset.py | 42 ++++++++------ model/NERTrainerCallback.py | 7 ++- 6 files changed, 200 insertions(+), 84 deletions(-) create mode 100644 99.py create mode 100644 99_export.bat diff --git a/00.py b/00.py index e3fd41e..378f5dc 100644 --- a/00.py +++ b/00.py @@ -10,6 +10,7 @@ import numpy import wandb +import torch import evaluate import bitsandbytes from transformers import Trainer @@ -18,9 +19,6 @@ from transformers import AutoTokenizer from transformers import AutoModelForTokenClassification -import torch -from torch.utils.data import DataLoader - from seqeval.metrics import f1_score from seqeval.metrics import recall_score from seqeval.metrics import accuracy_score @@ -32,22 +30,27 @@ from model.NERTrainerCallback import NERTrainerCallback # 参数设置 -MODEL_NAME = "facebookai_xlm_roberta_base_pretrain_20240826" +MODEL_NAME = "microsoft_mdeberta_v3_base_pretrain_20240916_e2" MODEL_PATH = f"assets/{MODEL_NAME}" OUTPUT_PATH = "output" DATASET_PATH = "dataset/ner" -EPOCHS = 24 -PATIENCE = 12 -PATIENCE_KEEPER = 0 +EPOCHS = 16 +PATIENCE = 16 +PATIENCE_KEEPER = 3 BATCH_SIZE = 32 -GRADIENT_ACCUMULATION_SIZE = 32 +GRADIENT_CHECKPOINTING = False +GRADIENT_ACCUMULATION_SIZE = 128 FROZEN_LAYER = 0 -LEARNING_RATE = 2 * 1e-5 +LEARNING_RATE = 5 * 1e-5 DO_LOWER_CASE = False INTERVAL_STEPS = 200 -# 工作模式 -MODE_MEASUREMENT = False +DATASET_PATH = [ + ("dataset/ner/zh_1.json", 1 * 10000), + ("dataset/ner/en_1.json", 1 * 10000), + ("dataset/ner/jp_1.json", 1 * 10000), + ("dataset/ner/ko_1.json", 1 * 10000), +] # 加载分词器 def load_tokenizer(): @@ -61,33 +64,27 @@ def load_tokenizer(): def load_dataset(tokenizer): count = 0 datas = [] - for file in os.scandir(DATASET_PATH): - if file.name.endswith(".json"): - with open(file.path, "r", encoding = "utf-8") as file: - count = count + 1 - datas.extend(random.sample(json.load(file), 10000)) + for path, num in DATASET_PATH: + with open(path, "r", encoding = "utf-8") as file: + count = count + 1 + datas_input = json.load(file) + datas.extend(random.sample(datas_input, min(int(num), len(datas_input)))) print(f"") print(f"找到数据文件 {count} 个,共 {len(datas)} 条数据 ...") # 分割数据集 - train_datas, test_datas = train_test_split(datas, test_size = 0.025, shuffle = True, random_state = 42) + train_datas, test_datas = train_test_split(datas, test_size = 1000.0/len(datas), shuffle = True, random_state = 42) # 创建数据集和数据加载器 print(f"") - test_dataset = NERDataset(test_datas, tokenizer, MODE_MEASUREMENT) - train_dataset = NERDataset(train_datas, tokenizer, MODE_MEASUREMENT) + test_dataset = NERDataset(test_datas, tokenizer) + train_dataset = NERDataset(train_datas, tokenizer) print(f"") - print(f"[green]test_dataset[/] 中最长条目为 {test_dataset.max_lenght} ...") - print(f"[green]train_dataloader[/] 中最长条目为 {train_dataset.max_lenght} ...") + print(f"[green]test_dataset[/] 中最长条目为 {test_dataset.max_lenght},长度阈值已设置为 {test_dataset.token_length_threshold} ...") + print(f"[green]train_dataloader[/] 中最长条目为 {train_dataset.max_lenght},长度阈值已设置为 {train_dataset.token_length_threshold} ...") print(f"") - # for sample in train_dataloader: - # for k, v in sample.items(): - # print(f"{k} : {v}") - # raise - # raise - return test_dataset, train_dataset # 加载模型 @@ -169,23 +166,22 @@ def compute_metrics(eval_prediction, test_dataset, 
train_dataset): # 开始训练 def start_training(model, tokenizer, test_dataset, train_dataset): training_args = TrainingArguments( - optim = "adamw_8bit", + # optim = "adamw_8bit", output_dir = OUTPUT_PATH, warmup_ratio = 0.1, weight_decay = 0.01, learning_rate = LEARNING_RATE, - logging_dir = "logs", - logging_steps = INTERVAL_STEPS / 10, + logging_dir = "logs", + logging_steps = INTERVAL_STEPS / 10, eval_steps = INTERVAL_STEPS, eval_strategy = "steps", save_strategy = "no", - save_safetensors = False, num_train_epochs = EPOCHS, bf16 = True, bf16_full_eval = True, per_device_eval_batch_size = min(128, BATCH_SIZE * 4), per_device_train_batch_size = BATCH_SIZE, - gradient_checkpointing = False, + gradient_checkpointing = GRADIENT_CHECKPOINTING, gradient_accumulation_steps = max(1, int(GRADIENT_ACCUMULATION_SIZE / BATCH_SIZE)), ) @@ -198,14 +194,6 @@ def start_training(model, tokenizer, test_dataset, train_dataset): patience_keeper = PATIENCE_KEEPER, )], tokenizer = tokenizer, - optimizers = ( - bitsandbytes.optim.Adam8bit( - model.parameters(), - lr = LEARNING_RATE, - weight_decay = 0.01, - ), - None, - ), eval_dataset = test_dataset, train_dataset = train_dataset, compute_metrics = functools.partial(compute_metrics, test_dataset = test_dataset, train_dataset = train_dataset), @@ -226,10 +214,6 @@ def main(): # 加载数据集 test_dataset, train_dataset = load_dataset(tokenizer) - # 测量模式时不继续后续流程 - if MODE_MEASUREMENT: - return - # 加载模型 model = load_model(test_dataset, train_dataset) diff --git a/01.py b/01.py index 0529f21..422bc84 100644 --- a/01.py +++ b/01.py @@ -24,13 +24,14 @@ from model.PreTrainerCallback import PreTrainerCallback # 参数设置 -MODEL_NAME = "facebookai_xlm_roberta_base" +MODEL_NAME = "microsoft_mdeberta_v3_base_pretrain_20240916_e1" MODEL_PATH = f"assets/{MODEL_NAME}" OUTPUT_PATH = f"output/{MODEL_NAME}_pretrain" -EPOCHS = 2 +EPOCHS = 1 LENGTH_THRESHOLD = 256 -BATCH_SIZE = 11 -GRADIENT_ACCUMULATION_SIZE = 128 +BATCH_SIZE = 8 +GRADIENT_CHECKPOINTING = False +GRADIENT_ACCUMULATION_SIZE = 256 DO_LOWER_CASE = False LEARNING_RATE = 2 * 1e-5 INTERVAL_STEPS = 200 @@ -41,9 +42,9 @@ ("dataset/pretrain/en_r18_visual_novels", 20 * 10000), ("dataset/pretrain/zh", 20 * 10000), ("dataset/pretrain/zh_r18_pixiv", 20 * 10000), - ("dataset/pretrain/jp", 15 * 10000), - ("dataset/pretrain/jp_r18", 15 * 10000), - ("dataset/pretrain/jp_r18_rpgmaker", 10 * 10000), + ("dataset/pretrain/jp", 40 * 10000), + ("dataset/pretrain/jp_r18", 20 * 10000), + ("dataset/pretrain/jp_r18_rpg", 20 * 10000), ("dataset/pretrain/kr", 40 * 10000), ] @@ -90,6 +91,7 @@ def cleanup(line): # 移除开头结尾的符号 line = TextHelper.strip_punctuation(line) + return line # 生成数据 @@ -151,7 +153,7 @@ def map_function(tokenizer, samples): truncation = True, max_length = LENGTH_THRESHOLD, return_attention_mask = True, - return_offsets_mapping = True, + return_offsets_mapping = True if tokenizer.is_fast else False, # 只有快速 tokenizer 才有这个功能 return_special_tokens_mask = True, ) @@ -195,7 +197,7 @@ def load_dataset(tokenizer): datas_by_type = random.sample(datas_by_type, min(int(num), len(datas_by_type))) with open(f"{dir_path}/{MODEL_NAME}_{dir_name}.txt", "w", encoding = "utf-8") as file: - file.writelines([f"{line}\n" for line in datas_by_type]) + file.writelines("\n".join(datas_by_type)) datas.extend(datas_by_type) @@ -266,17 +268,16 @@ def start_training(model, tokenizer, dataset_train_tokenized): warmup_ratio = 0.1, weight_decay = 0.01, learning_rate = LEARNING_RATE, - logging_dir = "logs", - logging_steps = INTERVAL_STEPS / 10, + logging_dir = "logs", + 
logging_steps = INTERVAL_STEPS / 10, eval_strategy = "no", save_strategy = "steps", save_steps = INTERVAL_STEPS, save_total_limit = 3, - save_safetensors = False, num_train_epochs = EPOCHS, bf16 = True, per_device_train_batch_size = BATCH_SIZE, - gradient_checkpointing = True, + gradient_checkpointing = GRADIENT_CHECKPOINTING, gradient_accumulation_steps = max(1, int(GRADIENT_ACCUMULATION_SIZE / BATCH_SIZE)), ) @@ -287,14 +288,6 @@ def start_training(model, tokenizer, dataset_train_tokenized): callbacks = [ PreTrainerCallback(), ], - optimizers = ( - bitsandbytes.optim.Adam8bit( - model.parameters(), - lr = LEARNING_RATE, - weight_decay = 0.01, - ), - None, - ), data_collator = DataCollatorForWholeWordMask( tokenizer = tokenizer, mlm = True, diff --git a/99.py b/99.py new file mode 100644 index 0000000..3d10112 --- /dev/null +++ b/99.py @@ -0,0 +1,111 @@ +import os +import sys +import shutil +import subprocess + +from tqdm import tqdm +from rich import print + +from transformers import BitsAndBytesConfig +from transformers import AutoTokenizer +from transformers import AutoModelForTokenClassification + +def export_fp16(tag): + path = f"{tag}_fp16" + + print(f"") + print(f"正在导出 {path} ...") + shutil.rmtree(f"{path}", ignore_errors = True) + shutil.copytree(tag, f"{path}", dirs_exist_ok = True) + os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None + os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None + + model = AutoModelForTokenClassification.from_pretrained( + tag, + local_files_only = True, + low_cpu_mem_usage = True, + ) + + model = model.half() + model.save_pretrained(f"{path}") + +def export_bnb_4bit(tag): + path = f"{tag}_bnb_4bit" + + print(f"") + print(f"正在导出 {path} ...") + shutil.rmtree(f"{path}", ignore_errors = True) + shutil.copytree(tag, f"{path}", dirs_exist_ok = True) + os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None + os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None + + model = AutoModelForTokenClassification.from_pretrained( + tag, + quantization_config = BitsAndBytesConfig(load_in_4bit = True), + local_files_only = True, + low_cpu_mem_usage = True, + ) + model.save_pretrained(f"{path}") + +def export_bnb_8bit(tag): + path = f"{tag}_bnb_8bit" + + print(f"") + print(f"正在导出 {path} ...") + shutil.rmtree(f"{path}", ignore_errors = True) + shutil.copytree(tag, f"{path}", dirs_exist_ok = True) + os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None + os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None + + model = AutoModelForTokenClassification.from_pretrained( + tag, + quantization_config = BitsAndBytesConfig(load_in_8bit = True), + local_files_only = True, + low_cpu_mem_usage = True, + ) + model.save_pretrained(f"{path}") + +def export_onnx(tag: str): + path = f"{tag}_onnx" + + print(f"") + print(f"正在导出 {path} ...") + shutil.rmtree(f"{path}", ignore_errors = True) + shutil.copytree(tag, f"{path}", dirs_exist_ok = True) + os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None + os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None + + subprocess.run( + f"optimum-cli export onnx --task token-classification -m {tag} {path}", + shell = True, + check = True, + ) + +def export_onnx_avx512(tag: str): + path = f"{tag}_onnx_avx512" + + 
print(f"") + print(f"正在导出 {path} ...") + shutil.rmtree(f"{path}", ignore_errors = True) + shutil.copytree(tag, f"{path}", dirs_exist_ok = True) + os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None + os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None + + subprocess.run( + f"optimum-cli onnxruntime quantize --avx512 --per_channel --onnx_model {tag}_onnx -o {path}", + shell = True, + check = True, + ) + +# 运行主函数 +def main(tag): + export_fp16(tag) + export_bnb_4bit(tag) + export_bnb_8bit(tag) + + export_onnx(tag) + export_onnx_avx512(tag) + +# 运行主函数 +if __name__ == "__main__": + main(sys.argv[1]) \ No newline at end of file diff --git a/99_export.bat b/99_export.bat new file mode 100644 index 0000000..d681527 --- /dev/null +++ b/99_export.bat @@ -0,0 +1,17 @@ +@echo off +@chcp 65001 > nul + +@REM 设置工作目录 +cd /d %~dp0 + +@REM 检查是否有参数传递给脚本 +if "%~1"=="" ( + echo 请拖放一个文件到此批处理文件上 ... + goto :END +) + +@REM 执行python脚本 +call python 99.py %1 + +:END +pause \ No newline at end of file diff --git a/model/NERDataset.py b/model/NERDataset.py index e162d78..fd8b137 100644 --- a/model/NERDataset.py +++ b/model/NERDataset.py @@ -14,14 +14,11 @@ class NERDataset(Dataset): CHUNK_SIZE = 2048 - TOKEN_LENGTH_THRESHOLD = 256 - def __init__(self, datas, tokenizer, mode_measurement): + def __init__(self, datas, tokenizer): self.tokenizer = tokenizer - self.mode_measurement = mode_measurement - self.id2label, self.label2id = self.generate_id_label_map(datas) - self.encodings, self.max_lenght = self.generate_encodings(datas) + self.encodings, self.max_lenght, self.token_length_threshold = self.generate_encodings(datas) def __len__(self): return len(self.encodings) @@ -46,10 +43,13 @@ def char_to_token(self, encoding, char_start, char_end): return token_start, token_end + # 获取 Token 长度阈值 + def get_token_length_threshold(self, datas): + return max(len(self.tokenizer(data.get("sentence", "")).input_ids) for data in datas) + # 生成数据块 - def generate_chunks(self, datas): + def generate_chunks(self, datas, token_length_threshold): encodings = [] - max_lenght = 0 for data in datas: sentence = data.get("sentence", "") @@ -66,14 +66,11 @@ def generate_chunks(self, datas): sentence, padding = "max_length", truncation = True, - max_length = 0 if self.mode_measurement else self.TOKEN_LENGTH_THRESHOLD, + max_length = token_length_threshold, return_offsets_mapping = True if self.tokenizer.is_fast else False, # 只有快速 tokenizer 才有这个功能 return_special_tokens_mask = True ) - # 记录最长条目 - max_lenght = max(max_lenght, len(encoding.input_ids)) - # 根据特殊标记设置 attention_mask for token_i, is_special_token in enumerate(encoding.special_tokens_mask): encoding.attention_mask[token_i] = 0 if is_special_token == 1 else 1 @@ -93,6 +90,7 @@ def generate_chunks(self, datas): # print(f"{names}") # print(f"{encoding.labels}") # print(f"{self.tokenizer.convert_ids_to_tokens(encoding.input_ids)}") + # raise # Trainer 会自动将数据移动到 GPU,不需要手动显式移动 data = {} @@ -106,23 +104,35 @@ def generate_chunks(self, datas): data["attention_mask"] = torch.tensor(encoding.attention_mask) encodings.append(data) - return encodings, max_lenght + return encodings # 生成编码数据 def generate_encodings(self, datas): encodings = [] max_lenght = 0 + # 分割数据 datas = [datas[i:(i + self.CHUNK_SIZE)] for i in range(0, len(datas), self.CHUNK_SIZE)] + + # 获取 Token 长度阈值 + results = Parallel(n_jobs = -1, prefer = "processes", return_as = "generator_unordered")( + delayed(self.get_token_length_threshold)(v) for v in 
datas + ) + + for v in tqdm(results, total = len(datas)): + max_lenght = max(max_lenght, v) + + token_length_threshold = max_lenght + 4 + + # 生成 Token 编码数据 results = Parallel(n_jobs = -1, prefer = "processes", return_as = "generator_unordered")( - delayed(self.generate_chunks)(v) for v in datas + delayed(self.generate_chunks)(v, token_length_threshold) for v in datas ) for v in tqdm(results, total = len(datas)): - encodings.extend(v[0]) - max_lenght = max(max_lenght, v[1]) + encodings.extend(v) - return encodings, max_lenght + return encodings, max_lenght, token_length_threshold # 生成 ID-Label 映射表 def generate_id_label_map(self, datas): diff --git a/model/NERTrainerCallback.py b/model/NERTrainerCallback.py index ea148ac..5210613 100644 --- a/model/NERTrainerCallback.py +++ b/model/NERTrainerCallback.py @@ -215,7 +215,7 @@ def check_early_stopping(self, args, state, control, metrics, **kwargs): + f"在本次评估中,最佳评估指标已更新 " + f"{key_metrics_f1:.4f} / {self.best_metric_for_f1:.4f} ..." ) - self.wait_for_early_stop = 0 + # self.wait_for_early_stop = 0 self.best_metric_for_f1 = key_metrics_f1 if eval_loss_improved: @@ -236,7 +236,8 @@ def check_early_stopping(self, args, state, control, metrics, **kwargs): if f1_improved or eval_loss_improved or train_loss_improved: print(f"") - if not f1_improved and not eval_loss_improved: + # if not f1_improved and not eval_loss_improved: + if not eval_loss_improved: self.wait_for_early_stop += 1 print("" + f"在本次评估中," @@ -255,5 +256,5 @@ def check_early_stopping(self, args, state, control, metrics, **kwargs): ): control.should_training_stop = True self.wait_for_early_stop = self.wait_for_early_stop - 1 - print(f"在连续 {self.patience} 次的评估中,各项指标均未改善,训练已中止 ...") + print(f"在连续 {self.patience} 次的评估中,目标指标均未改善,训练已中止 ...") print(f"") \ No newline at end of file
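
Usage sketch: a minimal, unvalidated example of loading the artifacts that 99.py exports. The tag "ner_model" and the derived directories "ner_model_bnb_8bit" / "ner_model_onnx" are hypothetical placeholders for whatever tag is actually passed to 99.py, and the snippet assumes bitsandbytes, a CUDA device, and optimum[onnxruntime] are installed.

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Hypothetical tag; 99.py derives each export directory by appending a suffix to it.
TAG = "ner_model"

# The bnb 8-bit export keeps its quantization settings in config.json,
# so a plain from_pretrained reloads it quantized (CUDA + bitsandbytes required).
# The tokenizer files are present because 99.py copies the whole tag directory first.
tokenizer = AutoTokenizer.from_pretrained(f"{TAG}_bnb_8bit")
model = AutoModelForTokenClassification.from_pretrained(f"{TAG}_bnb_8bit", local_files_only = True)
ner = pipeline("token-classification", model = model, tokenizer = tokenizer, aggregation_strategy = "simple")
print(ner("John Smith flew from Tokyo to Seoul."))

# The ONNX export can be served without PyTorch weights through optimum.onnxruntime.
from optimum.onnxruntime import ORTModelForTokenClassification

onnx_model = ORTModelForTokenClassification.from_pretrained(f"{TAG}_onnx")
onnx_ner = pipeline("token-classification", model = onnx_model, tokenizer = tokenizer, aggregation_strategy = "simple")
print(onnx_ner("John Smith flew from Tokyo to Seoul."))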