LLM tutorial #40

Closed · wants to merge 4 commits
77 changes: 77 additions & 0 deletions llm-inference.py
@@ -0,0 +1,77 @@
#%%
from transformers.models.llama import LlamaForCausalLM, LlamaTokenizer
from transformers import GenerationConfig

#path = '/home/oamontoy/workspace/weights-llama-2-7B'
path = '/home/oamontoy/workspace/weights-llama-2-7B-chat'
tokenizer = LlamaTokenizer.from_pretrained(path)
model = LlamaForCausalLM.from_pretrained(path)
# %%
from peft import PeftModel
model = PeftModel.from_pretrained(model, "dominguesm/alpaca-lora-ptbr-7b")
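# %%
# (Added note, not in the original) switch the adapter-wrapped model to inference mode so
# that LoRA dropout and any other dropout layers are disabled during generation.
model.eval()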
# %%

def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides more context. Write a response that appropriately completes the request.

### instruction:
{instruction}

### input:
{input}

### response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### instruction:
{instruction}

### response:"""

# %%
from pprint import pprint
# %%
generation_config = GenerationConfig(
    temperature=0.1,  # temperature/top_p only take effect when sampling is enabled;
    top_p=0.75,       # with num_beams=4 and do_sample left at False this is plain beam search
    num_beams=4,
)

def evaluate(instruction, input=None):
    # Build the Alpaca-style prompt, generate with the config above, and print only
    # the text that follows the "### response:" marker.
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    for s in generation_output.sequences:
        output = tokenizer.decode(s)
        pprint("response: " + output.split("### response:")[1].strip())
# %%
#evaluate(input("instruction: "))
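# %%
# (Example added for illustration, not in the original) call the helper above with a
# fixed instruction instead of reading one from stdin:
evaluate("Summarize what a LoRA adapter does in one sentence.")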
# %%
instruction = 'print a long paragraph of gibberish'
prompt = generate_prompt(instruction, None)
inputs = tokenizer(prompt, return_tensors="pt")
inputs
#%%
input_ids = inputs["input_ids"]
# %%
generation_output = model.generate(
    input_ids=input_ids,
    generation_config=generation_config,
    return_dict_in_generate=True,
    output_scores=True,
    max_new_tokens=256,
)
# %%
for s in generation_output.sequences:
    output = tokenizer.decode(s)
    pprint("response: " + output.split("### response:")[1].strip())
# %%
178 changes: 178 additions & 0 deletions llm-lora_classification copy.py
@@ -0,0 +1,178 @@
#%%
import numpy as np
import torch
from datasets import load_dataset, load_metric
from peft import LoraConfig, TaskType, get_peft_model
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments,
                          get_scheduler)

# %%
task = "mrpc"
dataset = load_dataset("glue", task)
padding_side = "right"

metric = load_metric('glue', task)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

#%%
model_checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)
    return outputs

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["idx", "sentence1", "sentence2"],
)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
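# %%
# (Added sanity check, not in the original) collate a few tokenized examples to confirm
# the dynamic padding produces rectangular tensors plus a 1-D labels tensor:
sample_batch = data_collator([tokenized_datasets["train"][i] for i in range(4)])
print({k: v.shape for k, v in sample_batch.items()})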

# %%
trainer_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, return_dict=True)

trainer_peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)


model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, return_dict=True)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
)

trainer_model = get_peft_model(trainer_model, trainer_peft_config)
trainer_model.print_trainable_parameters()

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

#%%

BATCH_SIZE = 32
NUM_EPOCHS = 5
LR = 1e-3
WEIGHT_DECAY = 0.01
#%%

training_args = TrainingArguments(
    output_dir="roberta-large-lora-seq1_tests",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=trainer_model,
    args=training_args,
    #train_dataset=tokenized_datasets["train"],
    train_dataset=tokenized_datasets["train"].select(range(1000)),
    # GLUE's MRPC test split ships with hidden labels (-1), so evaluate on the validation split (408 examples)
    #eval_dataset=tokenized_datasets["test"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()
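#%%
# (Added, not in the original) report the GLUE/MRPC metrics on the evaluation split once
# the Trainer run above finishes:
trainer.evaluate()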
#%%
#model.save_pretrained('saved_model')

#%%

#accelerator = Accelerator()
#accelerator.wait_for_everyone()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

from torch import nn

from transformers.trainer_pt_utils import get_parameter_names

ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS)
decay_parameters = [name for name in decay_parameters if "bias" not in name]

optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in model.named_parameters() if (n in decay_parameters and p.requires_grad)
        ],
        "weight_decay": WEIGHT_DECAY,
    },
    {
        "params": [
            p for n, p in model.named_parameters() if (n not in decay_parameters and p.requires_grad)
        ],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR)

#%%
#train_dataloader = DataLoader(tokenized_datasets["train"].select(range(1500)), shuffle=True, collate_fn=data_collator, batch_size=BATCH_SIZE)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, collate_fn=data_collator, batch_size=BATCH_SIZE)
# GLUE's MRPC test split ships with hidden labels (-1), so validate on the validation split
#valid_dataloader = DataLoader(tokenized_datasets["test"].select(range(500)), collate_fn=data_collator, batch_size=BATCH_SIZE)
valid_dataloader = DataLoader(tokenized_datasets["validation"], collate_fn=data_collator, batch_size=BATCH_SIZE)

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * NUM_EPOCHS
)
def eval_metrics(model, dataloader, device='cpu'):
    model.eval()
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    model.train()
    return metric.compute()

#model, optimizer, train_dataloader, valid_dataloader, lr_scheduler = accelerator.prepare(
# model, optimizer, train_dataloader, valid_dataloader, lr_scheduler
# )
#%%
for epoch in range(NUM_EPOCHS):
    model.train()
    losses = []
    for batch in tqdm(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        #model.zero_grad()
        losses.append(loss.detach())

    accuracy = eval_metrics(model, valid_dataloader)
    print(accuracy, 'loss: ', np.mean(losses))
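# %%
# (Added sketch, not in the original) persist only the LoRA adapter weights from the manual
# training loop; "roberta-base-lora-mrpc" is a placeholder output directory.
model.save_pretrained("roberta-base-lora-mrpc")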
# %%