
Commit

Merge branch 'main' into build-apply_project_boilerplate
jaketae authored Aug 19, 2021
2 parents 0381bcc + aae25bc commit 8425a26
Showing 9 changed files with 176 additions and 11 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -75,5 +75,5 @@ A [simple benchmark](https://github.com/bigscience-workshop/Megatron-DeepSpeed/i
[WMT](https://huggingface.co/datasets/wmt19) and [TyDi QA](https://huggingface.co/datasets/tydiqa)
E.g.
```shell
python3 -m evaluation.eval --model_name_or_path=gpt2 --eval_tasks tydiqa_secondary
python3 -m evaluation.eval --model_name_or_path=gpt2 --eval_tasks tydiqa_secondary --output_dir outputs
```
17 changes: 15 additions & 2 deletions evaluation/eval.py
@@ -27,7 +27,14 @@ class EvaluationArguments:
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."}
)
tag: Optional[str] = field(default=None, metadata={"help": "Identifier for the evaluation run."})
tag: Optional[str] = field(
default=None,
metadata={"help": "Identifier for the evaluation run."}
)
english_only: Optional[bool] = field(
default=True,
metadata={"help": "Whether to run evaluation in English only."}
)


def main():
@@ -64,7 +71,13 @@ def main():

for eval_task in eval_args.eval_tasks:
logger.info(f"Benchmarking {eval_task}...")
task = AutoTask.from_task_name(eval_task, tokenizer=tokenizer, model=model, device=device)
task = AutoTask.from_task_name(
eval_task,
model=model,
tokenizer=tokenizer,
device=device,
english_only=eval_args.english_only,
)
set_seed(train_args.seed)
task.evaluate()
task.save_metrics(output_dir, logger)
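Note on the new flag: `english_only` defaults to `True` and is threaded from `EvaluationArguments` into every task via `from_task_name`. Purely as an illustration (not part of the commit), the arguments can be constructed in code the same way `HfArgumentParser` would build them from the CLI:

```python
# Illustrative only: build the new arguments programmatically; on the command
# line the same values come from HfArgumentParser. The checkpoint name is an example.
from evaluation.eval import EvaluationArguments

eval_args = EvaluationArguments(
    model_name_or_path="gpt2",
    eval_tasks=["tydiqa_secondary"],
    english_only=True,  # False would load the task's multilingual config instead
)
```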
Empty file added evaluation/models/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions evaluation/models/loader.py
@@ -0,0 +1,4 @@
from transformers import AutoModelForCausalLM

def load_model(model_name_or_path):
return AutoModelForCausalLM.from_pretrained(model_name_or_path)
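`load_model` is a thin wrapper around `AutoModelForCausalLM.from_pretrained`, kept in its own module so `AutoTask.from_spec` (below) can build a model from a name or path. A minimal usage sketch; "gpt2" stands in for any causal-LM checkpoint name or local path:

```python
# Illustrative usage of the new helper; the eval/train scripts move the model
# to a device and configure padding themselves.
from evaluation.models.loader import load_model

model = load_model("gpt2")
model.eval()
```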
70 changes: 63 additions & 7 deletions evaluation/tasks/auto_task.py
@@ -1,25 +1,77 @@
import os
from abc import ABC, abstractmethod
from typing import Dict

from evaluation.utils.io import save_json
import torch
from transformers import AutoTokenizer, PreTrainedModel, PreTrainedTokenizerFast

from evaluation.utils.io import save_json, load_json
from evaluation.models.loader import load_model


class AutoTask(ABC):
def __init__(self, tokenizer, model, device):
self.tokenizer = tokenizer
def __init__(
self,
model: PreTrainedModel,
tokenizer: PreTrainedTokenizerFast,
device: torch.device,
english_only: bool,
):
self.model = model
self.tokenizer = tokenizer
self.device = device
self.metrics = {}

self.task_config = self.load_task_args(english_only)

@classmethod
def from_task_name(cls, task_name: str, tokenizer, model, device):
def _get_task(cls, task_name):
all_tasks = cls.__subclasses__()
for task in all_tasks:
if task.get_display_name() == task_name:
return task(tokenizer=tokenizer, model=model, device=device)
return task
raise ValueError(f'Invalid task: {task_name}')

raise ValueError(f"Invalid task: {task_name}")
@classmethod
def from_task_name(
cls,
task_name: str,
model: PreTrainedModel,
tokenizer: PreTrainedTokenizerFast,
device: torch.device,
english_only: bool,
):
task = cls._get_task(task_name)
return task(
model=model,
tokenizer=tokenizer,
device=device,
english_only=english_only,
)

@classmethod
def from_spec(
cls,
task_name: str,
model_name_or_path: str,
tokenizer_name: str,
device: torch.device,
english_only: bool,
):
task = cls._get_task(task_name)
model = load_model(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name_or_path)
return task(
model=model,
tokenizer=tokenizer,
device=device,
english_only=english_only,
)

def load_task_args(self, english_only) -> Dict:
task_root = os.path.join("evaluation", "tasks", self.get_display_name())
config_filename = "english.json" if english_only else "multilingual.json"
return load_json(os.path.join(task_root, config_filename))

@staticmethod
@abstractmethod
def get_display_name() -> str:
@@ -29,6 +81,10 @@ def get_display_name() -> str:
def evaluate(self) -> None:
pass

def train(self) -> None:
# TODO: convert to `abstractmethod` once simple_benchmark is ready
raise NotImplementedError

def save_metrics(self, output_dir, logger=None) -> str:
output_filename = os.path.join(output_dir, f"{self.get_display_name()}.json")
save_json(self.metrics, output_filename)
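Task discovery still goes through `cls.__subclasses__()`, so adding a benchmark means subclassing `AutoTask`, implementing `get_display_name` and `evaluate`, and making sure the module gets imported. A hypothetical minimal task, purely for illustration (the `dummy` name and its config file are not part of this commit):

```python
# Hypothetical task showing the registry contract: _get_task() finds subclasses
# via __subclasses__(), so this class is discoverable once its module is
# imported (the scripts import evaluation.tasks for exactly that reason).
from evaluation.tasks.auto_task import AutoTask


class DummyTask(AutoTask):
    @staticmethod
    def get_display_name() -> str:
        return "dummy"

    def evaluate(self) -> None:
        # self.task_config was read from evaluation/tasks/dummy/english.json
        # (or the multilingual config) by load_task_args in __init__.
        self.metrics["num_target_langs"] = len(self.task_config["target_langs"])
```

With that in place, `AutoTask.from_task_name("dummy", model=model, tokenizer=tokenizer, device=device, english_only=True)` would resolve and instantiate it, provided the matching config file exists.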
3 changes: 3 additions & 0 deletions evaluation/tasks/tydiqa_secondary/english.json
@@ -0,0 +1,3 @@
{
"target_langs": ["english"]
}
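Only the English config ships in this commit; the multilingual counterpart referenced by `load_task_args` is not added yet. A small sketch of the lookup this file feeds, run from the repository root so the relative path resolves:

```python
# Sketch of the lookup AutoTask.load_task_args performs for tydiqa_secondary;
# run from the repository root it returns {"target_langs": ["english"]}.
import os

from evaluation.utils.io import load_json

task_root = os.path.join("evaluation", "tasks", "tydiqa_secondary")
config = load_json(os.path.join(task_root, "english.json"))
print(config["target_langs"])  # ['english']
```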
3 changes: 2 additions & 1 deletion evaluation/tasks/tydiqa_secondary/tydiqa_secondary.py
@@ -27,6 +27,7 @@
class TyDiQADataset(Dataset):
def __init__(self, tokenizer, target_langs):
super().__init__()
assert tokenizer.pad_token == tokenizer.eos_token
tydiqa = load_dataset("tydiqa", "secondary_task", split="validation")
self.items = []

@@ -71,7 +72,7 @@ def get_display_name() -> str:
return "tydiqa_secondary"

def evaluate(self) -> None:
dataset = TyDiQADataset(self.tokenizer, target_langs=["english"])
dataset = TyDiQADataset(self.tokenizer, target_langs=self.task_config["target_langs"])

substring_matches = 0
for sample in tqdm(dataset, desc=f"Evaluating {self.get_display_name()}"):
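The body of the loop is elided above. Purely as a hedged sketch of the substring-match idea (the sample keys, generation settings, and metric key below are assumptions, not the commit's code), the metric can be computed along these lines:

```python
# Rough sketch only: greedy generation plus exact-substring matching against
# gold answers. All sample fields and generation arguments are assumed here.
from typing import Dict, Iterable

import torch


def substring_match_rate(model, tokenizer, samples: Iterable[Dict], device: torch.device) -> float:
    matches, total = 0, 0
    for sample in samples:
        with torch.no_grad():
            output = model.generate(
                sample["input_ids"].to(device),
                attention_mask=sample["attention_mask"].to(device),
                max_new_tokens=32,
            )
        prediction = tokenizer.decode(output[0], skip_special_tokens=True)
        matches += any(target in prediction for target in sample["targets"])
        total += 1
    return matches / max(total, 1)
```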
84 changes: 84 additions & 0 deletions evaluation/train.py
@@ -0,0 +1,84 @@
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional, List
import os

import torch
from transformers import (
HfArgumentParser,
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
set_seed,
)
import evaluation.tasks # needed for AutoTask.__subclasses__() to work correctly
from evaluation.tasks.auto_task import AutoTask
from evaluation.utils.log import get_logger


@dataclass
class EvaluationArguments:
"""
Arguments for any adjustable params in this evaluation script
"""
model_name_or_path: str = field(
metadata={"help": "The model checkpoint that we want to evaluate, could be name or the path."}
)
eval_tasks: List[str] = field(
metadata={"help": "A list of tasks to run the evaluation on, e.g. tydiqa_secondary"}
)
config_name: Optional[str] = field(
default=None,
metadata={"help": "Pretrained config name or path if not the same as model_name."}
)
tokenizer_name: Optional[str] = field(
default=None,
metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."}
)
tag: Optional[str] = field(
default=None,
metadata={"help": "Identifier for the evaluation run."}
)


def main():
parser = HfArgumentParser((EvaluationArguments, TrainingArguments))
eval_args, train_args = parser.parse_args_into_dataclasses()

if not eval_args.eval_tasks:
raise ValueError('Must provide at least one eval task!')

# initialize device
device = torch.device(train_args.device)

logger = get_logger()
logger.info(f"Beginning evaluation on device {train_args.device}")

# Load model & tokenizer
logger.info("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(eval_args.tokenizer_name or eval_args.model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
eval_args.model_name_or_path, pad_token_id=tokenizer.eos_token_id,
)
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Exporting results
tag = eval_args.tag or datetime.now().strftime("%y%m%d_%H%M%S")
output_dir = os.path.join(train_args.output_dir, tag)
os.makedirs(output_dir, exist_ok=True)

for eval_task in eval_args.eval_tasks:
logger.info(f"Benchmarking {eval_task}...")
task = AutoTask.from_task_name(
eval_task,
model=model,
tokenizer=tokenizer,
device=device,
english_only=True,  # train.py does not expose an english_only flag yet
)
set_seed(train_args.seed)
task.train()
task.save_metrics(output_dir, logger)


if __name__ == "__main__":
main()
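Neither script exercises the new `from_spec` classmethod. A minimal sketch of calling it directly, with illustrative values; note that the padding setup done in `main()` has to be mirrored, since `from_spec` builds its own tokenizer:

```python
# Illustrative only: build a task from names/paths via from_spec instead of
# pre-loaded objects. Checkpoint, device, and output directory are examples;
# run from the repository root so the task config resolves.
import torch

import evaluation.tasks  # registers AutoTask subclasses
from evaluation.tasks.auto_task import AutoTask

task = AutoTask.from_spec(
    task_name="tydiqa_secondary",
    model_name_or_path="gpt2",
    tokenizer_name=None,  # falls back to model_name_or_path
    device=torch.device("cpu"),
    english_only=True,
)
# from_spec does not configure padding, so mirror what main() does above.
task.tokenizer.pad_token = task.tokenizer.eos_token
task.tokenizer.padding_side = "left"
task.evaluate()
task.save_metrics("outputs")  # assumes the outputs/ directory already exists
```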
4 changes: 4 additions & 0 deletions evaluation/utils/io.py
@@ -5,3 +5,7 @@
def save_json(content: Dict, path: str, indent: int = 4, **kwargs) -> None:
with open(path, "w") as f:
json.dump(content, f, indent=indent, sort_keys=True, **kwargs)

def load_json(path: str) -> Dict:
with open(path, "r") as f:
return json.load(f)
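`load_json` complements the existing `save_json`; a quick round-trip sketch with an illustrative path:

```python
# Illustrative round trip through the two helpers.
from evaluation.utils.io import load_json, save_json

save_json({"target_langs": ["english"]}, "/tmp/example.json")
assert load_json("/tmp/example.json") == {"target_langs": ["english"]}
```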
