# eval.py — prompt-based sentiment evaluation script.
# (Web-page scrape residue — navigation chrome and gutter line numbers — removed.)
# Code adapted from https://huggingface.co/docs/transformers/v4.25.1/en/tasks/language_modeling#language-modeling
# - Evaluation:
# - Task 1: sentiment classification. For test set, evaluate accuracy of “This does suggest that it is _”.
# - Plot:
# - Matrix: x-axis: model size; y-axis: X%; cell: task 1/2 accuracy
import click
import datasets
import numpy as np
from tqdm import trange
from transformers import AutoModelForCausalLM, AutoTokenizer
from colors import red, blue
@click.command()
@click.option(
    "--model_name", default="sherryycxie/interpolated_pre_trained_fine_tuned_model", help="Model name"
)
def infer(model_name: str):
    """Greedy-decode one continuation token for a fixed sentiment prompt.

    Loads the tokenizer and causal-LM identified by ``model_name``, encodes
    the hard-coded prompt, generates exactly one new token with greedy
    decoding, and prints the decoded prompt-plus-token.
    """
    text = "it is a terrible movie. this is not"
    tok = AutoTokenizer.from_pretrained(model_name)
    token_ids = tok(text, return_tensors="pt").input_ids
    lm = AutoModelForCausalLM.from_pretrained(model_name)
    generated = lm.generate(token_ids, max_new_tokens=1, do_sample=False)
    decoded = tok.batch_decode(generated, skip_special_tokens=True)
    print(decoded)
@click.command()
@click.option(
    "--model_name", default="sherryycxie/interpolated_pre_trained_fine_tuned_model", help="Model name"
)
def evaluate(model_name: str):
    """Evaluate prompt-based sentiment accuracy on the SST-2 validation split.

    For each validation sentence, appends the probe
    "This does suggest that it is", greedily decodes a single token, and
    counts a hit when the last decoded word exactly equals "good"
    (label == 1) or "bad" (label == 0). Prints the resulting accuracy.
    """
    dataset = datasets.load_dataset("sst2", split="validation")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    preds_original, labels = [], []
    for i in trange(len(dataset["sentence"])):
        sentence = dataset["sentence"][i].strip()
        # Terminate the sentence so the probe reads naturally.
        # Bug fix: endswith on a tuple also handles an empty/whitespace-only
        # sentence, where the original `sentence[-1]` raised IndexError.
        if not sentence.endswith((".", "?", "!")):
            sentence += "."
        processed_sentence_original = f"{sentence} This does suggest that it is"
        inputs = tokenizer(
            processed_sentence_original, return_tensors="pt"
        ).input_ids
        outputs = model.generate(inputs, max_new_tokens=1, do_sample=False)
        # Keep only the newly generated word (the last whitespace token of
        # the decoded prompt + 1 generated token).
        pred_original = tokenizer.batch_decode(outputs, skip_special_tokens=True)[
            0
        ].split()[-1]
        preds_original.append(pred_original)
        labels.append(dataset["label"][i])
    # Map gold labels to the two probe words: 1 -> "good", 0 -> "bad".
    labels_original = ["good" if label == 1 else "bad" for label in labels]
    print(preds_original[:20], labels_original[:20])
    # Exact string match between the generated word and the mapped label.
    acc_original = (np.array(preds_original) == np.array(labels_original)).mean()
    print(
        f"Original accuracy: {acc_original * 100:.2f}%\n"
    )
# Script entry point: runs only the `evaluate` click command.
# NOTE(review): `infer` is defined above but never wired into a click group,
# so it is unreachable from this CLI — confirm whether that is intended.
if __name__ == "__main__":
    evaluate()