train_ebae.py

import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType  # LoRA utilities (imported but not applied below)
from loss_calculation import ebae_ebar_loss, ebae_loss
# import torch.distributed as dist
# from torch.nn.parallel import DistributedDataParallel as DDP
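
# Note on the project-local `loss_calculation` module: based on how it is used
# below, `ebae_ebar_loss(model, input_ids, next_input_ids, tokenizer, device)`
# is expected to return a scalar loss tensor, or None when a batch cannot be
# scored. `ebae_loss` is imported but not called in this script.
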
class EBAE_EBARDataset(Dataset):
    """
    Custom PyTorch Dataset for tokenized EBAE and EBAR data.
    Separates the input prompt and the next sentence.
    """

    def __init__(self, prompts, next_sentences, tokenizer, seq_length):
        """
        Initialize the dataset.

        :param prompts: List of input prompts for EBAE.
        :param next_sentences: List of next sentences for EBAR.
        :param tokenizer: Tokenizer to process the text.
        :param seq_length: Maximum sequence length for tokenization.
        """
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        # Tokenize the prompts
        self.prompt_data = tokenizer(
            prompts,
            max_length=seq_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Tokenize the next sentences separately
        self.next_sentence_data = tokenizer(
            next_sentences,
            max_length=seq_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    def __len__(self):
        return len(self.prompt_data["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.prompt_data["input_ids"][idx],
            "attention_mask": self.prompt_data["attention_mask"][idx],
            "next_input_ids": self.next_sentence_data["input_ids"][idx],
            "next_attention_mask": self.next_sentence_data["attention_mask"][idx],
        }
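
# Minimal usage sketch for EBAE_EBARDataset (illustrative only; the tokenizer
# name and toy strings below are assumptions, not the real training corpus):
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("gpt2")
#     tok.add_special_tokens({"pad_token": "[PAD]"})
#     ds = EBAE_EBARDataset(
#         prompts=["A toy chunk. The input text is: " + tok.eos_token],
#         next_sentences=["A toy next sentence."],
#         tokenizer=tok,
#         seq_length=32,
#     )
#     item = ds[0]  # dict of tensors: input_ids, attention_mask,
#                   # next_input_ids, next_attention_mask, each of shape (32,)
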
class ChunkDataset(Dataset):
    """Custom PyTorch Dataset for tokenized chunks."""

    def __init__(self, tokenized_chunks):
        self.input_ids = tokenized_chunks["input_ids"]
        self.attention_mask = tokenized_chunks["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
        }
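
# ChunkDataset is not referenced by train_steps below. A possible usage sketch
# (illustrative; reuses the `tok` tokenizer from the sketch above):
#
#     tokenized = tok(["chunk one", "chunk two"], max_length=32,
#                     padding="max_length", truncation=True, return_tensors="pt")
#     loader = DataLoader(ChunkDataset(tokenized), batch_size=2, shuffle=True)
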
def train_steps(
    model_name: str,
    chunks: list,
    next_sentences: list,
    seq_length: int = 1024,
    batch_size: int = 256,
    learning_rate: float = 1e-5,
    epochs: int = 3,
    device: str = "cuda:1",
):
    """
    Fine-tune a causal language model on the joint EBAE/EBAR objective for a
    given number of epochs. (The LoRA utilities imported above are not applied
    here; all model parameters are updated.)
    """
    print(f"Model: {model_name}")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Add a padding token
    model = AutoModelForCausalLM.from_pretrained(model_name)
    # Resize model embeddings to match the enlarged tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Build the EBAE/EBAR prompts, each ending in the EOS token
    print("Tokenizing chunks with EBAE prompts and EOS token...")
    # ebae_chunks = [f"{chunk} El texto de entrada es: {tokenizer.eos_token}" for chunk in chunks]  # EBAE-only prompt (Spanish for "The input text is:")
    ebae_ebar_chunks = [
        f"{chunk} The input text is: {tokenizer.eos_token} The next sentence is: {tokenizer.eos_token}"
        for chunk in chunks
    ]  # Construct the prompt

    # Optional sanity checks (currently disabled):
    # for idx, chunk in enumerate(ebae_ebar_chunks):
    #     tokenized_chunk = tokenizer(chunk)["input_ids"]
    #     if len(tokenized_chunk) > seq_length:
    #         print(f"Chunk {idx} is too long: {len(tokenized_chunk)} tokens.")
    # tokenized_chunks = tokenizer(
    #     ebae_ebar_chunks,
    #     max_length=seq_length,
    #     padding="max_length",
    #     truncation=True,
    #     return_tensors="pt",
    # )
    # if not all([tokenizer.eos_token_id in seq for seq in tokenized_chunks["input_ids"]]):
    #     raise ValueError("Some tokenized sequences are missing the EOS token.")
    # print("Tokenization complete.")

    # Prepare dataset and dataloader
    dataset = EBAE_EBARDataset(ebae_ebar_chunks, next_sentences, tokenizer, seq_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Define optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Bookkeeping for losses and gradient norms, keyed by epoch
    averaged_losses = {}
    loss_buffer = {}
    gradient_norms = {}
    max_grad_norm = 1500.0  # Maximum gradient norm for clipping

    # Accumulate gradients to simulate a larger batch size:
    # effective batch size = batch_size * gradient_accumulation_steps
    # (e.g. 4 * 8 = 32 when batch_size=4)
    gradient_accumulation_steps = 8

    # Training loop
    model.train()
    for e in range(epochs):
        print(f"Epoch {e + 1}/{epochs}")
        averaged_losses[e] = []
        loss_buffer[e] = []
        gradient_norms[e] = []
        step_count = 0

        for step, batch in enumerate(dataloader):
            # Extract inputs and masks
            input_ids = batch["input_ids"]
            # attention_mask = batch["attention_mask"]
            next_input_ids = batch["next_input_ids"]
            # next_attention_mask = batch["next_attention_mask"]

            # Compute the custom loss for the last token
            loss = ebae_ebar_loss(model, input_ids, next_input_ids, tokenizer, device)

            # Handle `None` loss before using it
            if loss is None:
                print(f"Skipping batch {step} due to an error.")
                gradient_norms[e].append(float("nan"))  # Log NaN for skipped batch
                continue  # Skip this batch and move to the next one

            # Normalize the loss by the number of accumulation steps
            loss = loss / gradient_accumulation_steps
            loss.backward()  # Accumulate gradients

            # Record the (normalized) loss
            loss_buffer[e].append(loss.item())

            # Update parameters after every `gradient_accumulation_steps` batches
            if (step + 1) % gradient_accumulation_steps == 0:
                # Clip gradients and record the total gradient norm
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                gradient_norms[e].append(grad_norm.item())  # store as a plain float
                optimizer.step()  # Perform optimizer step (update model's parameters)
                optimizer.zero_grad()  # Reset gradients

                # The buffered losses are already divided by the number of
                # accumulation steps, so their sum is this cycle's average loss
                avg_loss = sum(loss_buffer[e])
                averaged_losses[e].append(avg_loss)
                loss_buffer[e] = []  # Reset buffer for the next cycle

                # Increment effective step count
                step_count += 1
                print(f"Effective Step: {step_count}, Averaged Loss: {avg_loss}")

        # Perform a final optimizer step if the number of batches in this epoch
        # is not a multiple of `gradient_accumulation_steps`
        if (step + 1) % gradient_accumulation_steps != 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()

    # Save the fine-tuned model and tokenizer
    output_dir = "./ebar-ebae-model"
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"EBAR-EBAE-adapted model and tokenizer saved to {output_dir}")

    # Save results for plotting
    torch.save({"losses": averaged_losses, "gradient_norms": gradient_norms}, "training_metrics.pth")
    print("Training metrics saved to training_metrics.pth")
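

# Minimal, illustrative invocation sketch: the model name, toy data, and device
# below are placeholder assumptions; a real run would use a full corpus of
# chunk / next-sentence pairs.
if __name__ == "__main__":
    toy_chunks = [
        "This is a toy chunk of text.",
        "Here is another toy chunk.",
    ]
    toy_next_sentences = [
        "This sentence follows the first chunk.",
        "This sentence follows the second chunk.",
    ]
    train_steps(
        model_name="gpt2",  # placeholder; any causal LM checkpoint should work
        chunks=toy_chunks,
        next_sentences=toy_next_sentences,
        seq_length=64,
        batch_size=2,
        epochs=1,
        device="cuda:0" if torch.cuda.is_available() else "cpu",
    )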