
Commit

update
neavo committed Sep 24, 2024
1 parent 186fd7b commit bc22fcc
Showing 6 changed files with 200 additions and 84 deletions.
72 changes: 28 additions & 44 deletions 00.py
@@ -10,6 +10,7 @@

import numpy
import wandb
import torch
import evaluate
import bitsandbytes
from transformers import Trainer
@@ -18,9 +19,6 @@
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification

import torch
from torch.utils.data import DataLoader

from seqeval.metrics import f1_score
from seqeval.metrics import recall_score
from seqeval.metrics import accuracy_score
@@ -32,22 +30,27 @@
from model.NERTrainerCallback import NERTrainerCallback

# Parameter settings
MODEL_NAME = "facebookai_xlm_roberta_base_pretrain_20240826"
MODEL_NAME = "microsoft_mdeberta_v3_base_pretrain_20240916_e2"
MODEL_PATH = f"assets/{MODEL_NAME}"
OUTPUT_PATH = "output"
DATASET_PATH = "dataset/ner"
EPOCHS = 24
PATIENCE = 12
PATIENCE_KEEPER = 0
EPOCHS = 16
PATIENCE = 16
PATIENCE_KEEPER = 3
BATCH_SIZE = 32
GRADIENT_ACCUMULATION_SIZE = 32
GRADIENT_CHECKPOINTING = False
GRADIENT_ACCUMULATION_SIZE = 128
FROZEN_LAYER = 0
LEARNING_RATE = 2 * 1e-5
LEARNING_RATE = 5 * 1e-5
DO_LOWER_CASE = False
INTERVAL_STEPS = 200

# Operating mode
MODE_MEASUREMENT = False
DATASET_PATH = [
("dataset/ner/zh_1.json", 1 * 10000),
("dataset/ner/en_1.json", 1 * 10000),
("dataset/ner/jp_1.json", 1 * 10000),
("dataset/ner/ko_1.json", 1 * 10000),
]
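# each entry above: (path to a JSON data file, maximum number of samples drawn from it)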

# Load the tokenizer
def load_tokenizer():
@@ -61,33 +64,27 @@ def load_tokenizer():
def load_dataset(tokenizer):
count = 0
datas = []
for file in os.scandir(DATASET_PATH):
if file.name.endswith(".json"):
with open(file.path, "r", encoding = "utf-8") as file:
count = count + 1
datas.extend(random.sample(json.load(file), 10000))
for path, num in DATASET_PATH:
with open(path, "r", encoding = "utf-8") as file:
count = count + 1
datas_input = json.load(file)
datas.extend(random.sample(datas_input, min(int(num), len(datas_input))))

print(f"")
print(f"找到数据文件 {count} 个,共 {len(datas)} 条数据 ...")

# Split the dataset
train_datas, test_datas = train_test_split(datas, test_size = 0.025, shuffle = True, random_state = 42)
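# test_size = 1000 / len(datas): hold out roughly 1000 samples for evaluation, regardless of corpus size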
train_datas, test_datas = train_test_split(datas, test_size = 1000.0/len(datas), shuffle = True, random_state = 42)

# Create the datasets and data loaders
print(f"")
test_dataset = NERDataset(test_datas, tokenizer, MODE_MEASUREMENT)
train_dataset = NERDataset(train_datas, tokenizer, MODE_MEASUREMENT)
test_dataset = NERDataset(test_datas, tokenizer)
train_dataset = NERDataset(train_datas, tokenizer)
print(f"")
print(f"[green]test_dataset[/] 中最长条目为 {test_dataset.max_lenght} ...")
print(f"[green]train_dataloader[/] 中最长条目为 {train_dataset.max_lenght} ...")
print(f"[green]test_dataset[/] 中最长条目为 {test_dataset.max_lenght},长度阈值已设置为 {test_dataset.token_length_threshold} ...")
print(f"[green]train_dataloader[/] 中最长条目为 {train_dataset.max_lenght},长度阈值已设置为 {train_dataset.token_length_threshold} ...")
print(f"")

# for sample in train_dataloader:
# for k, v in sample.items():
# print(f"{k} : {v}")
# raise
# raise

return test_dataset, train_dataset

# Load the model
@@ -169,23 +166,22 @@ def compute_metrics(eval_prediction, test_dataset, train_dataset):
# Start training
def start_training(model, tokenizer, test_dataset, train_dataset):
training_args = TrainingArguments(
optim = "adamw_8bit",
# optim = "adamw_8bit",
output_dir = OUTPUT_PATH,
warmup_ratio = 0.1,
weight_decay = 0.01,
learning_rate = LEARNING_RATE,
logging_dir = "logs",
logging_steps = INTERVAL_STEPS / 10,
logging_dir = "logs",
logging_steps = INTERVAL_STEPS / 10,
eval_steps = INTERVAL_STEPS,
eval_strategy = "steps",
save_strategy = "no",
save_safetensors = False,
num_train_epochs = EPOCHS,
bf16 = True,
bf16_full_eval = True,
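# evaluation runs without gradient state, so a larger per-device batch fits in memory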
per_device_eval_batch_size = min(128, BATCH_SIZE * 4),
per_device_train_batch_size = BATCH_SIZE,
gradient_checkpointing = False,
gradient_checkpointing = GRADIENT_CHECKPOINTING,
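# accumulate gradients so the effective batch size stays close to GRADIENT_ACCUMULATION_SIZE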
gradient_accumulation_steps = max(1, int(GRADIENT_ACCUMULATION_SIZE / BATCH_SIZE)),
)

@@ -198,14 +194,6 @@ def start_training(model, tokenizer, test_dataset, train_dataset):
patience_keeper = PATIENCE_KEEPER,
)],
tokenizer = tokenizer,
optimizers = (
bitsandbytes.optim.Adam8bit(
model.parameters(),
lr = LEARNING_RATE,
weight_decay = 0.01,
),
None,
),
eval_dataset = test_dataset,
train_dataset = train_dataset,
compute_metrics = functools.partial(compute_metrics, test_dataset = test_dataset, train_dataset = train_dataset),
@@ -226,10 +214,6 @@ def main():
# Load the dataset
test_dataset, train_dataset = load_dataset(tokenizer)

# In measurement mode, stop before the remaining steps
if MODE_MEASUREMENT:
return

# Load the model
model = load_model(test_dataset, train_dataset)

35 changes: 14 additions & 21 deletions 01.py
@@ -24,13 +24,14 @@
from model.PreTrainerCallback import PreTrainerCallback

# Parameter settings
MODEL_NAME = "facebookai_xlm_roberta_base"
MODEL_NAME = "microsoft_mdeberta_v3_base_pretrain_20240916_e1"
MODEL_PATH = f"assets/{MODEL_NAME}"
OUTPUT_PATH = f"output/{MODEL_NAME}_pretrain"
EPOCHS = 2
EPOCHS = 1
LENGTH_THRESHOLD = 256
BATCH_SIZE = 11
GRADIENT_ACCUMULATION_SIZE = 128
BATCH_SIZE = 8
GRADIENT_CHECKPOINTING = False
GRADIENT_ACCUMULATION_SIZE = 256
DO_LOWER_CASE = False
LEARNING_RATE = 2 * 1e-5
INTERVAL_STEPS = 200
@@ -41,9 +42,9 @@
("dataset/pretrain/en_r18_visual_novels", 20 * 10000),
("dataset/pretrain/zh", 20 * 10000),
("dataset/pretrain/zh_r18_pixiv", 20 * 10000),
("dataset/pretrain/jp", 15 * 10000),
("dataset/pretrain/jp_r18", 15 * 10000),
("dataset/pretrain/jp_r18_rpgmaker", 10 * 10000),
("dataset/pretrain/jp", 40 * 10000),
("dataset/pretrain/jp_r18", 20 * 10000),
("dataset/pretrain/jp_r18_rpg", 20 * 10000),
("dataset/pretrain/kr", 40 * 10000),
]
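# each entry above: (corpus directory, maximum number of samples drawn from it)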

@@ -90,6 +91,7 @@ def cleanup(line):
# Strip leading and trailing punctuation
line = TextHelper.strip_punctuation(line)


return line

# Generate data
@@ -151,7 +153,7 @@ def map_function(tokenizer, samples):
truncation = True,
max_length = LENGTH_THRESHOLD,
return_attention_mask = True,
return_offsets_mapping = True,
return_offsets_mapping = True if tokenizer.is_fast else False, # only fast tokenizers support offset mapping
return_special_tokens_mask = True,
)

@@ -195,7 +197,7 @@ def load_dataset(tokenizer):

datas_by_type = random.sample(datas_by_type, min(int(num), len(datas_by_type)))
with open(f"{dir_path}/{MODEL_NAME}_{dir_name}.txt", "w", encoding = "utf-8") as file:
file.writelines([f"{line}\n" for line in datas_by_type])
file.writelines("\n".join(datas_by_type))

datas.extend(datas_by_type)

@@ -266,17 +268,16 @@ def start_training(model, tokenizer, dataset_train_tokenized):
warmup_ratio = 0.1,
weight_decay = 0.01,
learning_rate = LEARNING_RATE,
logging_dir = "logs",
logging_steps = INTERVAL_STEPS / 10,
logging_dir = "logs",
logging_steps = INTERVAL_STEPS / 10,
eval_strategy = "no",
save_strategy = "steps",
save_steps = INTERVAL_STEPS,
save_total_limit = 3,
save_safetensors = False,
num_train_epochs = EPOCHS,
bf16 = True,
per_device_train_batch_size = BATCH_SIZE,
gradient_checkpointing = True,
gradient_checkpointing = GRADIENT_CHECKPOINTING,
gradient_accumulation_steps = max(1, int(GRADIENT_ACCUMULATION_SIZE / BATCH_SIZE)),
)

@@ -287,14 +288,6 @@
callbacks = [
PreTrainerCallback(),
],
optimizers = (
bitsandbytes.optim.Adam8bit(
model.parameters(),
lr = LEARNING_RATE,
weight_decay = 0.01,
),
None,
),
data_collator = DataCollatorForWholeWordMask(
tokenizer = tokenizer,
mlm = True,
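# whole-word masking: all sub-word tokens of a chosen word are masked together for the MLM objective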
111 changes: 111 additions & 0 deletions 99.py
@@ -0,0 +1,111 @@
import os
import sys
import shutil
import subprocess

from tqdm import tqdm
from rich import print

from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification

def export_fp16(tag):
path = f"{tag}_fp16"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

model = AutoModelForTokenClassification.from_pretrained(
tag,
local_files_only = True,
low_cpu_mem_usage = True,
)

model = model.half()
model.save_pretrained(f"{path}")

def export_bnb_4bit(tag):
path = f"{tag}_bnb_4bit"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

model = AutoModelForTokenClassification.from_pretrained(
tag,
quantization_config = BitsAndBytesConfig(load_in_4bit = True),
local_files_only = True,
low_cpu_mem_usage = True,
)
model.save_pretrained(f"{path}")

def export_bnb_8bit(tag):
path = f"{tag}_bnb_8bit"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

model = AutoModelForTokenClassification.from_pretrained(
tag,
quantization_config = BitsAndBytesConfig(load_in_8bit = True),
local_files_only = True,
low_cpu_mem_usage = True,
)
model.save_pretrained(f"{path}")

def export_onnx(tag: str):
path = f"{tag}_onnx"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

subprocess.run(
f"optimum-cli export onnx --task token-classification -m {tag} {path}",
shell = True,
check = True,
)

def export_onnx_avx512(tag: str):
path = f"{tag}_onnx_avx512"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

subprocess.run(
f"optimum-cli onnxruntime quantize --avx512 --per_channel --onnx_model {tag}_onnx -o {path}",
shell = True,
check = True,
)

# Main function
def main(tag):
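# tag: path to a trained checkpoint directory; exported variants are written to sibling {tag}_* directories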
export_fp16(tag)
export_bnb_4bit(tag)
export_bnb_8bit(tag)

export_onnx(tag)
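# the AVX512 export reads the {tag}_onnx directory produced by export_onnx, so it must run after it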
export_onnx_avx512(tag)

# Run the main function
if __name__ == "__main__":
main(sys.argv[1])
17 changes: 17 additions & 0 deletions 99_export.bat
@@ -0,0 +1,17 @@
@echo off
@chcp 65001 > nul

@REM Set the working directory
cd /d %~dp0

@REM Check whether an argument was passed to the script
if "%~1"=="" (
echo Please drag and drop a file onto this batch file ...
goto :END
)

@REM Run the Python script
call python 99.py %1

:END
pause
(2 more changed files not shown)