Add mrpc robustness task #2

Open
wants to merge 12 commits into base: add-mrpc-robustness-task
25,703 changes: 25,703 additions & 0 deletions MRPC Confirmation.ipynb

Large diffs are not rendered by default.

893 changes: 893 additions & 0 deletions MRPC Negative.ipynb

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions What_Remains_To_Add.md
@@ -0,0 +1,15 @@
# Datasets:


- WMT19
- IMDB
- AG News
- RTE
- MRPC

# Models

- T0 (and its descendants)
- XGLM (and its descendants): https://huggingface.co/facebook/xglm-564M
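
A minimal loading sketch for the checkpoints listed above (only `facebook/xglm-564M` and the `bigscience/T0` names come from this PR; the snippet is an illustration, not project code):

```python
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

# XGLM is a decoder-only model, so it loads with the causal-LM auto class.
xglm_tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
xglm_model = AutoModelForCausalLM.from_pretrained("facebook/xglm-564M")

# T0 and its variants are encoder-decoder models, so they use the seq2seq auto class
# (the same split that evaluation/eval.py applies further below).
t0_tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B")
t0_model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B")
```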


21 changes: 21 additions & 0 deletions classification_experiement.sh
@@ -0,0 +1,21 @@
export HF_DATASETS_CACHE="/gpfswork/rech/tts/unm25jp/datasets"
for dataset in "mnli" "rte" "mrpc" "wmt" "imdb" "emotion" "ag-news"; do
for exp in "two-sentences-classification" "single-sentence-classification"; do
for MODEL_NAME in t5-small t5-base t5-large t5-3b bigscience/T0_3B bigscience/T0pp bigscience/T0p bigscience/T0 gpt gpt2 distilgpt2 EleutherAI/gpt-neo-125M EleutherAI/gpt-neo-1.3B EleutherAI/gpt-j-6B EleutherAI/gpt-neo-2.7B; do
sbatch --job-name=${MODEL_NAME}${exp}${dataset} \
--gres=gpu:1 \
--account=six@gpu \
--no-requeue \
--cpus-per-task=10 \
--hint=nomultithread \
--time=5:00:00 \
-C v100-32g \
--output=jobinfo/${MODEL_NAME}${exp}${dataset}_%j.out \
--error=jobinfo/${MODEL_NAME}${exp}${dataset}_%j.err \
--qos=qos_gpu-t3 \
--wrap="module purge; module load pytorch-gpu/py3/1.7.0 ; python evaluation/eval.py --model_name_or_path ${MODEL_NAME} --eval_tasks $exp --dataset_name $dataset --output_dir outputs --tag ${MODEL_NAME}${exp}${dataset}"

done

done
done
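
The flags passed through `--wrap` map onto the dataclass fields declared in `evaluation/eval.py` (next diff). As a rough sketch of how such flags are typically parsed, assuming the usual `HfArgumentParser` pattern (the parser call itself is not visible in the diff, and the shape of `eval_tasks` is inferred from the CLI flag):

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser, TrainingArguments

# Assumed argument shape: only dataset_name and model_name_or_path appear in the
# eval.py diff below; eval_tasks is inferred from the --eval_tasks CLI flag.
@dataclass
class EvaluationArguments:
    dataset_name: str = field(metadata={"help": "Dataset to evaluate on, e.g. mrpc or wmt."})
    model_name_or_path: str = field(metadata={"help": "Checkpoint name or path."})
    eval_tasks: Optional[str] = field(default=None, metadata={"help": "Evaluation task(s) to run."})

parser = HfArgumentParser((EvaluationArguments, TrainingArguments))
eval_args, train_args = parser.parse_args_into_dataclasses()
print(eval_args.model_name_or_path, eval_args.dataset_name)
```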
19 changes: 12 additions & 7 deletions evaluation/eval.py
@@ -9,11 +9,9 @@

import sys

sys.path.append('/home/infres/pcolombo/evaluation-robustness-consistency/evaluation/')
sys.path.append('/home/infres/pcolombo/evaluation-robustness-consistency/')
sys.path.append('/gpfswork/rech/tts/unm25jp/evaluation-robustness-consistency/')
sys.path.append('/gpfswork/rech/tts/unm25jp/evaluation-robustness-consistency/evaluation/')

# make the repo-local packages importable when running from the repository root
sys.path.append(os.path.join(os.getcwd(), 'evaluation'))
sys.path.append(os.path.join(os.getcwd(), 'single-sentence-classification'))
sys.path.append(os.getcwd())
import evaluation.tasks # noqa: F401
from evaluation.tasks.auto_task import AutoTask
from evaluation.utils.log import get_logger
@@ -25,6 +23,9 @@ class EvaluationArguments:
Arguments for any adjustable params in this evaluation script
"""

dataset_name: str = field(
metadata={"help": "The dataset to evaluate on, e.g. mrpc or wmt."}
)
model_name_or_path: str = field(
metadata={"help": "The model checkpoint that we want to evaluate, could be name or the path."}
)
@@ -41,6 +42,7 @@ class EvaluationArguments:
data_dir: Optional[str] = field(default=None, metadata={"help": "Path to the local dataset folder"})

do_sample: Optional[bool] = field(default=False, metadata={"help": "Whether to use sampling instead of greedy."})
use_multi_gpu: Optional[bool] = field(default=False, metadata={"help": "Whether to spread the model across multiple GPUs (model parallelism)."})
early_stopping: Optional[bool] = field(default=False,
metadata={"help": "Whether to stop beam search once num_beams finished candidates are found."})
min_length: Optional[int] = field(
@@ -98,14 +100,17 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(eval_args.tokenizer_name or eval_args.model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
if "T0" in eval_args.model_name_or_path: # in ["bigscience/T0_3B", "bigscience/T0"]:
if ("t5" in eval_args.model_name_or_path.lower()) or "t0" in (
eval_args.model_name_or_path.lower()): # in ["bigscience/T0_3B", "bigscience/T0"]:
MODEL_TYPE = AutoModelForSeq2SeqLM
else:
MODEL_TYPE = AutoModelForCausalLM
model = MODEL_TYPE.from_pretrained(
eval_args.model_name_or_path,
pad_token_id=tokenizer.eos_token_id,
)
if eval_args.use_multi_gpu:
model.parallelize()
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(tokenizer))
model.to(device)
@@ -128,7 +133,7 @@ def main():
data_dir=eval_args.data_dir,
)
set_seed(train_args.seed)
task.evaluate()
task.evaluate(dataset_name=eval_args.dataset_name)
task.save_metrics(output_dir, logger)


Empty file.
5 changes: 5 additions & 0 deletions evaluation/tasks/generation-consistency/english.json
@@ -0,0 +1,5 @@
{
"pair": "kk-en",
"stride": 512,
"batch_size": 8
}
280 changes: 280 additions & 0 deletions evaluation/tasks/generation-consistency/generation-consistency.py
@@ -0,0 +1,280 @@
# Generation-consistency task: prompt a model to paraphrase a sentence, then ask it
# to confirm that the paraphrase keeps the original meaning (MRPC and WMT19 data).
# HuggingFace dataset link: https://huggingface.co/datasets/wmt19
import torch
from datasets import load_dataset
from jinja2 import Template
from torch.utils.data import Dataset
from tqdm import tqdm

from evaluation.tasks.auto_task import AutoTask
from evaluation.utils.log import get_logger

TEMPLATE_PARAPHRASE = Template(
"""Sentence: {{sent1}} How would you rephrase the sentence with different words?"""
)

TEMPLATE_CONFIRMATION = Template(
"""Sentence 1: {{sent1}} Sentence 2: {{sent2}} Do Sentence 1 and Sentence 2 convey the same meaning? Yes or No?"""
)

class MRPCDataset(Dataset):
def __init__(self, tokenizer):
super().__init__()
mrpc = load_dataset("glue", "mrpc", split="validation")
self.items = []
self.labels2id = {
"Yes": 0, 'No': 1
}
self.id2labels = {v: k for k, v in self.labels2id.items()}

for sample in mrpc:
prompt = TEMPLATE_PARAPHRASE.render(
sent1=sample["sentence1"],
)

# Tokenize and construct this sample
inputs = tokenizer(
prompt,
return_tensors="pt",
truncation=True,
)
self.items.append(
{
"prompt": prompt,
"label": "Yes",
"sentence1": sample["sentence1"],
"sentence2": sample["sentence2"],
"input_ids": inputs["input_ids"],
"attention_mask": inputs["attention_mask"],
"input_len": inputs["attention_mask"].shape[1],
}
)

def __len__(self):
return len(self.items)

def __getitem__(self, index):
return self.items[index]


class WMTEnglishDataset(Dataset):
def __init__(self, tokenizer, pair="kk-en"):
super().__init__()

self.languages = ['cs-en', 'kk-en', 'fi-en'] # , 'gu-en','de-en', 'kk-en', 'lt-en', 'ru-en', 'zh-en', 'fr-en']
self.filter = 150

assert "en" in pair, f"Expected `pair` to contain English, but got {pair} instead"
wmt_ds = dict()
for pair in self.languages:
print('Loading', pair)
wmt_ds[pair] = load_dataset("wmt19", pair, split="validation")["translation"]

self.items = []
self.labels2id = {
"Yes": 0, 'No': 1
}
self.id2labels = {v: k for k, v in self.labels2id.items()}
for key, wmt in wmt_ds.items():
key_1 = key.split('-')[0]
key_2 = key.split('-')[1]
for index, sample in enumerate(wmt):
if index == self.filter:
break
# NOTE: WMT19 "translation" samples are dicts keyed by language code, so there is
# no "text" field; paraphrase the English side (key_2), mirroring MRPCDataset above.
prompt = TEMPLATE_PARAPHRASE.render(
sent1=sample[key_2],
)

# Tokenize and construct this sample
inputs = tokenizer(
prompt,
return_tensors="pt",
truncation=True,
)
self.items.append(
{"sentence1": sample[key_1],
"sentence2": sample[key_2],
"prompt": prompt,
"pair": key,
"input_ids": inputs["input_ids"],
"attention_mask": inputs["attention_mask"],
"input_len": inputs["attention_mask"].shape[1],
"label": "Yes", # TODO voir les details
}
)

def __len__(self):
return len(self.items)

def __getitem__(self, index):
return self.items[index]


class GenerationTask(AutoTask):
@staticmethod
def get_display_name() -> str:
return "generation-consistency"

def evaluate(self, dataset_name='wmt') -> None:
if dataset_name == "wmt":
dataset = WMTEnglishDataset(self.tokenizer)
elif dataset_name == "mrpc":
dataset = MRPCDataset(self.tokenizer)
else:
raise NotImplementedError

is_first = True
LABELS_LIST = dict()
for k, v in dataset.labels2id.items():
assert len(self.tokenizer.tokenize(k)) == 1, "Consider changing label {}: it must map to a single token".format(
self.tokenizer.tokenize(k))
LABELS_LIST[k] = self.tokenizer.vocab[self.tokenizer.tokenize(k)[0]]

def get_classification_output(sample):
with torch.no_grad():
if ('t5' in self.model.name_or_path.lower()) or ('t0' in self.model.name_or_path.lower()):
output = self.model(labels=sample["input_ids"].to(self.device),
input_ids=sample["input_ids"].to(self.device),
attention_mask=sample["attention_mask"].to(self.device))

elif ('gpt' in self.model.name_or_path.lower()):
output = self.model(input_ids=sample["input_ids"].to(self.device),
attention_mask=sample["attention_mask"].to(self.device))
else:
raise NotImplementedError
logits = output['logits']
# softmax over the label tokens' logits at the last position (batch size is 1)
softmax_results = torch.nn.Softmax(dim=-1)(
torch.tensor([logits[:, -1, label_id] for label_id in list(LABELS_LIST.values())])).tolist()
return softmax_results

def get_sequences(sample): # TODO adapt for nmt
with torch.no_grad():
output = self.model.generate(
input_ids=sample["input_ids"].to(self.device), output_scores=True,
attention_mask=sample["attention_mask"].to(self.device),
max_length=min(sample["input_len"] * 2, 1024),
# capped at 1024 since each model uses a different name for its maximum length
min_length=self.args.min_length,
do_sample=self.args.do_sample,  # must be True to sample instead of decoding greedily
early_stopping=True,
# stop beam search once num_beams candidate sentences are finished
num_beams=self.args.num_beams,
temperature=self.args.temperature,  # < 1 is more conservative, > 1 more diverse
top_k=self.args.top_k, num_return_sequences=self.args.num_beams,
# number of highest-probability vocabulary tokens kept for top-k filtering
top_p=self.args.top_p,  # nucleus (top-p) filtering threshold
repetition_penalty=self.args.repetition_penalty,
length_penalty=self.args.length_penalty  # 1.0 = no penalty, > 1.0 favours longer sequences
)
# Optionally strip everything that follows a special token from each generation
# (currently disabled; kept for reference).
if False:
outputs = []
for untok_output in output:
stop_appening = False
start_appening = False
current_output = []
for token in untok_output.tolist(): # skip two first token
if token not in self.tokenizer.all_special_ids:
start_appening = True
if not stop_appening and start_appening:
current_output.append(token)
else:
stop_appening = True if start_appening else False
outputs.append(current_output)
seq = [self.tokenizer.decode(torch.tensor(output), skip_special_tokens=False) for output in
outputs]
seq = self.tokenizer.batch_decode(output, skip_special_tokens=True)
logger.info(
" ************************** Raw sentences ************************** \n{}".format(
'\n'.join(seq)))
return seq

logger = get_logger()
count = 0
l_samples, l_samples_golden, l_confirmation_prompts, l_prompts = [], [], [], []
l_y_label, l_stry_label, l_soft_labels, l_paraphrases = [], [], [], []
for sample in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"):
count += 1
if count == 4:  # debug limit: only evaluate the first few samples
break

paraphrases = get_sequences(sample)

# Iterate over the generated paraphrases
confirmation_prompts, samples, samples_golden, y_predicted, = [], [], [], []
y_label, stry_label, soft_labels, paraphrases_c, prompts = [], [], [], [], []
for index, paraphrase in enumerate(paraphrases):
if index == 2:  # only keep the first two paraphrases per sample
break

# use the output to confirm
prompt = TEMPLATE_CONFIRMATION.render(
sent1=sample["sentence1"],
sent2=paraphrase
)
# Tokenize and construct this sample
inputs = self.tokenizer(
prompt,
return_tensors="pt",
truncation=True,
)

sample_confirmation = {
"prompt": prompt,
"input_ids": inputs["input_ids"],
"attention_mask": inputs["attention_mask"],
"input_len": inputs["attention_mask"].shape[1],
}
soft_confirmations = get_classification_output(sample_confirmation)
if is_first:
is_first = False
log_msg = "Evaluation example for MRPC-Negative\nLabels\t{}\n".format(LABELS_LIST)

log_msg += "\nprompt#1 (Standard):\n" + sample["prompt"]
log_msg += "\nmodel output:\n" + paraphrase

log_msg += "\n\nprompt:\n" + sample_confirmation["prompt"]
log_msg += "\nsoft model output:\n" + str(soft_confirmations)
log_msg += "\ngolden:\n" + str(sample["label"])
log_msg += "\ngolden:\n" + str(dataset.labels2id[sample["label"]])
logger.info(log_msg)

# log the prompts and the outputs
prompts.append(sample["prompt"])
samples.append(sample["sentence1"])
samples_golden.append(sample["sentence2"])

confirmation_prompts.append(prompt)

paraphrases_c.append(sample["prompt"])

y_label.append(dataset.labels2id[sample["label"]])
stry_label.append(sample["label"])
soft_labels.append(soft_confirmations)

l_paraphrases.append(paraphrases_c)
l_prompts.append(prompts)
l_samples.append(samples)
l_samples_golden.append(samples_golden)
l_confirmation_prompts.append(confirmation_prompts)
l_y_label.append(y_label)
l_stry_label.append(stry_label)
l_soft_labels.append(soft_labels)

self.metrics = {
"l_prompts": l_prompts,
"l_samples": l_samples,
"l_paraphrases": l_paraphrases,
"l_samples_golden": l_samples_golden,
"l_confirmation_prompts": l_confirmation_prompts,
"l_y_label": l_y_label,
"l_stry_label": l_stry_label,
"l_soft_labels": l_soft_labels
}
logger.info("Metrics : {}".format(self.metrics))