
Experimentation with DistillCARP #14

Open
Wants to merge 16 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -14,3 +14,5 @@ carp/dataset/dataset_dict.json
carp/dataset/train/cache-a5b0849dd9416bdb.arrow
carp/dataset/train/dataset_info.json
carp/dataset/train/state.json
*.csv
/carp/experiments/distil_carp/20B_tokenizer.json
8 changes: 8 additions & 0 deletions .idea/.gitignore

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

15 changes: 15 additions & 0 deletions .idea/magiCARP.iml

17 changes: 17 additions & 0 deletions .idea/misc.xml

8 changes: 8 additions & 0 deletions .idea/modules.xml

6 changes: 6 additions & 0 deletions .idea/vcs.xml
File renamed without changes.
File renamed without changes.
46 changes: 46 additions & 0 deletions carp/experiments/distil_carp/examine_data.py
@@ -0,0 +1,46 @@
import pandas as pd
import os
import csv

def read_dataset_component(filepath):
    data = list()
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data.append(row[1])
    return data

def read_paraphrase_component(filepath):
    data = list()
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data.append(row)
    return data

path = 'distil_data'

crit_data = []
crit_datapath = path + '/paraphrase_train_crits'
files = os.listdir(crit_datapath)
files.sort()
for file in files:
    print(file)
    filepath = os.path.join(crit_datapath, file)
    crit_data_chunk = read_paraphrase_component(filepath)
    crit_data += crit_data_chunk

story_file = 'train_stories.csv'
story_datapath = os.path.join(path, story_file)
story_data = read_dataset_component(story_datapath)

orig_crit_file = 'train_crits.csv'
orig_crit_datapath = os.path.join(path, orig_crit_file)
orig_crit_data = read_dataset_component(orig_crit_datapath)

print("NUM STORIES: ", len(story_data))
print("NUM CRITIQUE LISTS: ", len(crit_data))
print("NUM ORIG CRITS: ", len(orig_crit_data))
print("NUM CRITIQUES PER: ", len(crit_data[1]))
print(story_data[1])
print(crit_data[1])
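Not part of the diff: a small follow-up check that could be appended to examine_data.py, reusing the variables defined above, to make the alignment assumption explicit; the assertion message is illustrative.

# Hypothetical extension: fail loudly if the paraphrase chunks do not line up
# one-to-one with the original critiques and stories.
assert len(crit_data) == len(orig_crit_data) == len(story_data), \
    "paraphrase chunks, original critiques, and stories are misaligned"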
75 changes: 75 additions & 0 deletions carp/experiments/distil_carp/generate_fuzzing_data.py
@@ -0,0 +1,75 @@
import csv
import numpy as np
import openai
from neox_tokenizer import *

bad_word_dict = \
    {"0": -1000, "50256": -1000, "3353": -1000}
# add all line-break tokens to the bad word dict
for idx in line_break_token_ids:
    bad_word_dict[str(idx)] = -1000

key = ""
with open("../../pytorch/data/utils/api_key.txt") as f:
    key = f.read()

# nice try ;)
openai.api_key = key
openai.api_base = "https://api.goose.ai/v1"

prompt = "You are an editor of stories. Below is a set of stories and the criticisms you have written for each " \
         "manuscript.\n\n "


def read_dataset_component(filepath):
    data = list()
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data.append(row[1])
    return data


val_stories = read_dataset_component("../../pytorch/data/utils/val_stories.csv")
val_crits = read_dataset_component("../../pytorch/data/utils/val_crits.csv")

# Keep only stories that are longer than seven words.
stories_crits = list(filter(lambda x: (len(x[0].split()) > 7), zip(val_stories, val_crits)))
stories_crits = [[story for story, _ in stories_crits],
                 [crit for _, crit in stories_crits]]
val_stories = stories_crits[0]
val_crits = stories_crits[1]

train_stories = read_dataset_component("../../pytorch/data/utils/train_stories.csv")[0:10]
examples_n = 5
indices = np.random.choice(len(val_stories), examples_n, replace=False)
print("[" + ", ".join(list(map(str, indices))) + "]")
# Fixed indices for the few-shot examples (overrides the random draw above).
indices = [56, 23, 14, 65, 60]

for i in range(1, examples_n + 1, 1):
    idx = indices[i - 1]
    story_example = str(i) + ". Story: " + val_stories[idx] + "\nCriticism: " + val_crits[idx] + "\n\n"
    prompt += story_example

for input_story in train_stories:
    story_prompt = prompt + str(examples_n + 1) + ". Story: " + input_story + "\nCriticism:"
    print(story_prompt)
    # Debug early-exit: stop before any API calls are made.
    import sys
    sys.exit()
    # Logit bias penalizes EOT (0, 50256) and every line-break token.
    completion = openai.Completion.create(
        engine="gpt-neo-20b",
        prompt=story_prompt,
        max_tokens=40,
        typical_p=0.5,
        logit_bias=bad_word_dict,
        logprobs=30,
        stream=True)
    print(input_story)
    print("\n")

    # Print each token as it is returned
    for c in completion:
        print(c.choices[0].text, end='')

    print("\n=============\n")
8 changes: 8 additions & 0 deletions carp/experiments/distil_carp/neox_tokenizer.py
@@ -0,0 +1,8 @@
import json

tokenizer_data = None
with open("20B_tokenizer.json") as f:
    tokenizer_data = json.load(f)

# "\u010a" ("Ċ") is the byte-level BPE spelling of a newline, so any vocab
# entry containing it corresponds to a line break.
line_break_token_ids = list()
for token, token_id in tokenizer_data['model']['vocab'].items():
    if u"\u010a" in token:
        line_break_token_ids.append(token_id)
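Not part of the diff: a quick sanity check of the "\u010a" assumption, assuming the tokenizers package that produced 20B_tokenizer.json is installed.

from tokenizers import Tokenizer

tok = Tokenizer.from_file("20B_tokenizer.json")
newline_ids = tok.encode("\n", add_special_tokens=False).ids
# Every id the tokenizer actually emits for "\n" should appear in line_break_token_ids.
print(newline_ids, all(i in line_break_token_ids for i in newline_ids))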
55 changes: 55 additions & 0 deletions carp/experiments/distil_carp/paraphrase_critiques.py
@@ -0,0 +1,55 @@
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import csv
import sys
from tqdm import tqdm

tokenizer_pegasus = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')
model_pegasus = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase').half().to("cuda")

def read_dataset_component(filepath):
    data = list()
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data.append(row[1])
    return data

num_beams = 5
def get_review_ensemble(input_text):
    batch = tokenizer_pegasus(input_text, truncation=True, padding='longest', max_length=60, return_tensors="pt").to("cuda")
    translated = model_pegasus.generate(**batch, max_length=60, num_beams=num_beams, num_return_sequences=num_beams, temperature=1.5)
    return tokenizer_pegasus.batch_decode(translated, skip_special_tokens=True)

def write_dataset_csv(data, filepath):
    with open(filepath, mode='w') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        writer.writerows(data)

filepath = '/mnt/raid/users/AlexH/magiCARP/carp/pytorch/data/utils/train_crits.csv'
data = read_dataset_component(filepath)
batch_size = 100
num_batches = (len(data) + batch_size - 1) // batch_size
output_file = 'paraphrase_train_crits.csv'
write_thresh = 1000
temp_csv = []
print(len(data))
for i in tqdm(range(num_batches)):
    cur_batch_size = min(batch_size, len(data) - batch_size * i)
    batch = data[i * batch_size:i * batch_size + cur_batch_size]
    num_paraphrases = 5
    paraphrases = get_review_ensemble(batch)
    # Regroup the flat beam output into one list of num_paraphrases per critique.
    reshaped_paraphrases = []
    for j in range(len(paraphrases) // num_paraphrases):
        reshaped_paraphrases.append([])
        for k in range(num_paraphrases):
            reshaped_paraphrases[-1].append(paraphrases[j * num_paraphrases + k])
    temp_csv += reshaped_paraphrases
    if (i + 1) % write_thresh == 0:
        print("WRITING TO CSV")
        cur_output_file = output_file + f"_{i}"
        write_dataset_csv(temp_csv, cur_output_file)
        temp_csv = []
write_dataset_csv(temp_csv, output_file)

24 changes: 14 additions & 10 deletions carp/pytorch/data/__init__.py
@@ -14,8 +14,9 @@
# specifies a dictionary of architectures
_DATAPIPELINE: Dict[str, any] = {}  # registry


def register_datapipeline(name):
    """Decorator used to register a CARP architecture

    Args:
        name: Name of the architecture
@@ -29,7 +30,7 @@ def register_class(cls, name):
    if isinstance(name, str):
        name = name.lower()
        return lambda c: register_class(c, name)

    cls = name
    name = cls.__name__
    register_class(cls, name.lower())
@@ -42,9 +43,9 @@ class BaseDataPipeline(Dataset):
    """Dataset wrapper class to ease working with the CARP dataset and Pytorch data utilities."""

    def __init__(
        self,
        dupe_protection: bool = True,
        path: str = "dataset",
    ):
        dataset = load_from_disk(path)
        train = dataset["train"]
@@ -70,8 +71,7 @@ def __len__(self) -> int:
        return len(self.passages)

    @staticmethod
    def create_tokenizer_factory(call_tokenizer: Callable, tokenizer_factory: Callable, context_len: int) -> Callable:
        """Function creates a callable tokenizer subroutine and uses it to curry the tokenizer factory

        Args:
@@ -85,7 +85,7 @@ def create_tokenizer_factory(call_tokenizer : Callable, tokenizer_factory : Call
        return partial(tokenizer_factory, tok_func)

    @staticmethod
    def tokenizer_factory(_tok: Callable, encoder: BaseEncoder) -> Callable:

        """Function factory that creates a collate function for use with a torch.util.data.Dataloader

@@ -95,9 +95,10 @@ def tokenizer_factory(_tok : Callable, encoder: BaseEncoder) -> Callable:
        Returns:
            Callable: A function that will take a batch of string tuples and tokenize them properly.
        """

        @typechecked
        def collate(
            data: Iterable[Tuple[str, str]]
        ) -> Tuple[BatchElement, BatchElement]:
            passages, reviews = zip(*data)
            pass_tokens, rev_tokens = _tok(list(passages)), _tok(list(reviews))
@@ -111,12 +112,15 @@ def collate(
            )

        return collate

from carp.pytorch.data.mlm_pipeline import MLMDataPipeline
from carp.pytorch.data.scarecrow_pipeline import ScarecrowDataPipeline
from carp.pytorch.data.distill_pipeline import DistillDataPipeline


def get_datapipeline(name):
    return _DATAPIPELINE[name.lower()]


def get_datapipeline_names():
    return _DATAPIPELINE.keys()
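Not part of the diff: a minimal sketch of how the registry touched here is meant to be used, assuming the hidden tail of register_datapipeline returns the class as the existing pipelines imply; MyDistillPipeline is a made-up name.

from carp.pytorch.data import BaseDataPipeline, register_datapipeline, get_datapipeline

@register_datapipeline
class MyDistillPipeline(BaseDataPipeline):
    # Registered under the lowercased class name, like DistillDataPipeline above.
    pass

assert get_datapipeline("MyDistillPipeline") is MyDistillPipeline  # lookup lowercases the name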