
Commit

update
neavo committed Sep 24, 2024
1 parent 186fd7b commit bc22fcc
Showing 6 changed files with 200 additions and 84 deletions.
72 changes: 28 additions & 44 deletions 00.py
@@ -10,6 +10,7 @@

import numpy
import wandb
import torch
import evaluate
import bitsandbytes
from transformers import Trainer
@@ -18,9 +19,6 @@
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification

import torch
from torch.utils.data import DataLoader

from seqeval.metrics import f1_score
from seqeval.metrics import recall_score
from seqeval.metrics import accuracy_score
@@ -32,22 +30,27 @@
from model.NERTrainerCallback import NERTrainerCallback

# Parameter settings
MODEL_NAME = "facebookai_xlm_roberta_base_pretrain_20240826"
MODEL_NAME = "microsoft_mdeberta_v3_base_pretrain_20240916_e2"
MODEL_PATH = f"assets/{MODEL_NAME}"
OUTPUT_PATH = "output"
DATASET_PATH = "dataset/ner"
EPOCHS = 24
PATIENCE = 12
PATIENCE_KEEPER = 0
EPOCHS = 16
PATIENCE = 16
PATIENCE_KEEPER = 3
BATCH_SIZE = 32
GRADIENT_ACCUMULATION_SIZE = 32
GRADIENT_CHECKPOINTING = False
GRADIENT_ACCUMULATION_SIZE = 128
FROZEN_LAYER = 0
LEARNING_RATE = 2 * 1e-5
LEARNING_RATE = 5 * 1e-5
DO_LOWER_CASE = False
INTERVAL_STEPS = 200

# Operating mode
MODE_MEASUREMENT = False
DATASET_PATH = [
("dataset/ner/zh_1.json", 1 * 10000),
("dataset/ner/en_1.json", 1 * 10000),
("dataset/ner/jp_1.json", 1 * 10000),
("dataset/ner/ko_1.json", 1 * 10000),
]
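# each entry above: (path to a JSON data file, maximum number of samples drawn from it)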

# Load the tokenizer
def load_tokenizer():
@@ -61,33 +64,27 @@ def load_tokenizer():
def load_dataset(tokenizer):
count = 0
datas = []
for file in os.scandir(DATASET_PATH):
if file.name.endswith(".json"):
with open(file.path, "r", encoding = "utf-8") as file:
count = count + 1
datas.extend(random.sample(json.load(file), 10000))
for path, num in DATASET_PATH:
with open(path, "r", encoding = "utf-8") as file:
count = count + 1
datas_input = json.load(file)
datas.extend(random.sample(datas_input, min(int(num), len(datas_input))))

print(f"")
print(f"找到数据文件 {count} 个,共 {len(datas)} 条数据 ...")

# Split the dataset
train_datas, test_datas = train_test_split(datas, test_size = 0.025, shuffle = True, random_state = 42)
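# test_size = 1000 / len(datas): hold out roughly 1000 samples for evaluation, regardless of corpus size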
train_datas, test_datas = train_test_split(datas, test_size = 1000.0/len(datas), shuffle = True, random_state = 42)

# Create the datasets and data loaders
print(f"")
test_dataset = NERDataset(test_datas, tokenizer, MODE_MEASUREMENT)
train_dataset = NERDataset(train_datas, tokenizer, MODE_MEASUREMENT)
test_dataset = NERDataset(test_datas, tokenizer)
train_dataset = NERDataset(train_datas, tokenizer)
print(f"")
print(f"[green]test_dataset[/] 中最长条目为 {test_dataset.max_lenght} ...")
print(f"[green]train_dataloader[/] 中最长条目为 {train_dataset.max_lenght} ...")
print(f"[green]test_dataset[/] 中最长条目为 {test_dataset.max_lenght},长度阈值已设置为 {test_dataset.token_length_threshold} ...")
print(f"[green]train_dataloader[/] 中最长条目为 {train_dataset.max_lenght},长度阈值已设置为 {train_dataset.token_length_threshold} ...")
print(f"")

# for sample in train_dataloader:
# for k, v in sample.items():
# print(f"{k} : {v}")
# raise
# raise

return test_dataset, train_dataset

# Load the model
@@ -169,23 +166,22 @@ def compute_metrics(eval_prediction, test_dataset, train_dataset):
# Start training
def start_training(model, tokenizer, test_dataset, train_dataset):
training_args = TrainingArguments(
optim = "adamw_8bit",
# optim = "adamw_8bit",
output_dir = OUTPUT_PATH,
warmup_ratio = 0.1,
weight_decay = 0.01,
learning_rate = LEARNING_RATE,
logging_dir = "logs",
logging_steps = INTERVAL_STEPS / 10,
logging_dir = "logs",
logging_steps = INTERVAL_STEPS / 10,
eval_steps = INTERVAL_STEPS,
eval_strategy = "steps",
save_strategy = "no",
save_safetensors = False,
num_train_epochs = EPOCHS,
bf16 = True,
bf16_full_eval = True,
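# evaluation runs without gradient state, so a larger per-device batch fits in memory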
per_device_eval_batch_size = min(128, BATCH_SIZE * 4),
per_device_train_batch_size = BATCH_SIZE,
gradient_checkpointing = False,
gradient_checkpointing = GRADIENT_CHECKPOINTING,
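# accumulate gradients so the effective batch size stays close to GRADIENT_ACCUMULATION_SIZE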
gradient_accumulation_steps = max(1, int(GRADIENT_ACCUMULATION_SIZE / BATCH_SIZE)),
)

@@ -198,14 +194,6 @@ def start_training(model, tokenizer, test_dataset, train_dataset):
patience_keeper = PATIENCE_KEEPER,
)],
tokenizer = tokenizer,
optimizers = (
bitsandbytes.optim.Adam8bit(
model.parameters(),
lr = LEARNING_RATE,
weight_decay = 0.01,
),
None,
),
eval_dataset = test_dataset,
train_dataset = train_dataset,
compute_metrics = functools.partial(compute_metrics, test_dataset = test_dataset, train_dataset = train_dataset),
@@ -226,10 +214,6 @@ def main():
# Load the dataset
test_dataset, train_dataset = load_dataset(tokenizer)

# In measurement mode, stop before the remaining steps
if MODE_MEASUREMENT:
return

# Load the model
model = load_model(test_dataset, train_dataset)

35 changes: 14 additions & 21 deletions 01.py
@@ -24,13 +24,14 @@
from model.PreTrainerCallback import PreTrainerCallback

# Parameter settings
MODEL_NAME = "facebookai_xlm_roberta_base"
MODEL_NAME = "microsoft_mdeberta_v3_base_pretrain_20240916_e1"
MODEL_PATH = f"assets/{MODEL_NAME}"
OUTPUT_PATH = f"output/{MODEL_NAME}_pretrain"
EPOCHS = 2
EPOCHS = 1
LENGTH_THRESHOLD = 256
BATCH_SIZE = 11
GRADIENT_ACCUMULATION_SIZE = 128
BATCH_SIZE = 8
GRADIENT_CHECKPOINTING = False
GRADIENT_ACCUMULATION_SIZE = 256
DO_LOWER_CASE = False
LEARNING_RATE = 2 * 1e-5
INTERVAL_STEPS = 200
@@ -41,9 +42,9 @@
("dataset/pretrain/en_r18_visual_novels", 20 * 10000),
("dataset/pretrain/zh", 20 * 10000),
("dataset/pretrain/zh_r18_pixiv", 20 * 10000),
("dataset/pretrain/jp", 15 * 10000),
("dataset/pretrain/jp_r18", 15 * 10000),
("dataset/pretrain/jp_r18_rpgmaker", 10 * 10000),
("dataset/pretrain/jp", 40 * 10000),
("dataset/pretrain/jp_r18", 20 * 10000),
("dataset/pretrain/jp_r18_rpg", 20 * 10000),
("dataset/pretrain/kr", 40 * 10000),
]
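# each entry above: (corpus directory, maximum number of samples drawn from it)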

@@ -90,6 +91,7 @@ def cleanup(line):
# Strip leading and trailing punctuation
line = TextHelper.strip_punctuation(line)


return line

# Generate data
@@ -151,7 +153,7 @@ def map_function(tokenizer, samples):
truncation = True,
max_length = LENGTH_THRESHOLD,
return_attention_mask = True,
return_offsets_mapping = True,
return_offsets_mapping = True if tokenizer.is_fast else False, # only fast tokenizers support offset mapping
return_special_tokens_mask = True,
)

@@ -195,7 +197,7 @@ def load_dataset(tokenizer):

datas_by_type = random.sample(datas_by_type, min(int(num), len(datas_by_type)))
with open(f"{dir_path}/{MODEL_NAME}_{dir_name}.txt", "w", encoding = "utf-8") as file:
file.writelines([f"{line}\n" for line in datas_by_type])
file.writelines("\n".join(datas_by_type))

datas.extend(datas_by_type)

@@ -266,17 +268,16 @@ def start_training(model, tokenizer, dataset_train_tokenized):
warmup_ratio = 0.1,
weight_decay = 0.01,
learning_rate = LEARNING_RATE,
logging_dir = "logs",
logging_steps = INTERVAL_STEPS / 10,
logging_dir = "logs",
logging_steps = INTERVAL_STEPS / 10,
eval_strategy = "no",
save_strategy = "steps",
save_steps = INTERVAL_STEPS,
save_total_limit = 3,
save_safetensors = False,
num_train_epochs = EPOCHS,
bf16 = True,
per_device_train_batch_size = BATCH_SIZE,
gradient_checkpointing = True,
gradient_checkpointing = GRADIENT_CHECKPOINTING,
gradient_accumulation_steps = max(1, int(GRADIENT_ACCUMULATION_SIZE / BATCH_SIZE)),
)

@@ -287,14 +288,6 @@
callbacks = [
PreTrainerCallback(),
],
optimizers = (
bitsandbytes.optim.Adam8bit(
model.parameters(),
lr = LEARNING_RATE,
weight_decay = 0.01,
),
None,
),
data_collator = DataCollatorForWholeWordMask(
tokenizer = tokenizer,
mlm = True,
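# whole-word masking: all sub-word tokens of a chosen word are masked together for the MLM objective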
111 changes: 111 additions & 0 deletions 99.py
@@ -0,0 +1,111 @@
import os
import sys
import shutil
import subprocess

from tqdm import tqdm
from rich import print

from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification

def export_fp16(tag):
path = f"{tag}_fp16"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

model = AutoModelForTokenClassification.from_pretrained(
tag,
local_files_only = True,
low_cpu_mem_usage = True,
)

model = model.half()
model.save_pretrained(f"{path}")

def export_bnb_4bit(tag):
path = f"{tag}_bnb_4bit"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

model = AutoModelForTokenClassification.from_pretrained(
tag,
quantization_config = BitsAndBytesConfig(load_in_4bit = True),
local_files_only = True,
low_cpu_mem_usage = True,
)
model.save_pretrained(f"{path}")

def export_bnb_8bit(tag):
path = f"{tag}_bnb_8bit"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

model = AutoModelForTokenClassification.from_pretrained(
tag,
quantization_config = BitsAndBytesConfig(load_in_8bit = True),
local_files_only = True,
low_cpu_mem_usage = True,
)
model.save_pretrained(f"{path}")

def export_onnx(tag: str):
path = f"{tag}_onnx"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

subprocess.run(
f"optimum-cli export onnx --task token-classification -m {tag} {path}",
shell = True,
check = True,
)

def export_onnx_avx512(tag: str):
path = f"{tag}_onnx_avx512"

print(f"")
print(f"正在导出 {path} ...")
shutil.rmtree(f"{path}", ignore_errors = True)
shutil.copytree(tag, f"{path}", dirs_exist_ok = True)
os.remove(f"{path}/model.safetensors") if os.path.exists(f"{path}/model.safetensors") else None
os.remove(f"{path}/pytorch_model.bin") if os.path.exists(f"{path}/pytorch_model.bin") else None

subprocess.run(
f"optimum-cli onnxruntime quantize --avx512 --per_channel --onnx_model {tag}_onnx -o {path}",
shell = True,
check = True,
)

# Main function
def main(tag):
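# tag: path to a trained checkpoint directory; exported variants are written to sibling {tag}_* directories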
export_fp16(tag)
export_bnb_4bit(tag)
export_bnb_8bit(tag)

export_onnx(tag)
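# the AVX512 export reads the {tag}_onnx directory produced by export_onnx, so it must run after it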
export_onnx_avx512(tag)

# Run the main function
if __name__ == "__main__":
main(sys.argv[1])
17 changes: 17 additions & 0 deletions 99_export.bat
@@ -0,0 +1,17 @@
@echo off
@chcp 65001 > nul

@REM Set the working directory
cd /d %~dp0

@REM Check whether an argument was passed to the script
if "%~1"=="" (
echo Please drag and drop a file onto this batch file ...
goto :END
)

@REM Run the Python script
call python 99.py %1

:END
pause
(2 more changed files not shown)