
Experimentation with DistillCARP #14

Open
Wants to merge 16 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -14,3 +14,5 @@ carp/dataset/dataset_dict.json
carp/dataset/train/cache-a5b0849dd9416bdb.arrow
carp/dataset/train/dataset_info.json
carp/dataset/train/state.json
*.csv
/carp/experiments/distil_carp/20B_tokenizer.json
8 changes: 8 additions & 0 deletions .idea/.gitignore

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

15 changes: 15 additions & 0 deletions .idea/magiCARP.iml

17 changes: 17 additions & 0 deletions .idea/misc.xml

8 changes: 8 additions & 0 deletions .idea/modules.xml

6 changes: 6 additions & 0 deletions .idea/vcs.xml
File renamed without changes.
File renamed without changes.
46 changes: 46 additions & 0 deletions carp/experiments/distil_carp/examine_data.py
@@ -0,0 +1,46 @@
import pandas as pd
import os
import csv

def read_dataset_component(filepath):
    data = list()
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data.append(row[1])
    return data

def read_paraphrase_component(filepath):
    data = list()
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data.append(row)
    return data

path = 'distil_data'

crit_data = []
crit_datapath = path + '/paraphrase_train_crits'
files = os.listdir(crit_datapath)
files.sort()
for file in files:
    print(file)
    filepath = os.path.join(crit_datapath, file)
    crit_data_chunk = read_paraphrase_component(filepath)
    crit_data += crit_data_chunk

story_file = 'train_stories.csv'
story_datapath = os.path.join(path, story_file)
story_data = read_dataset_component(story_datapath)

orig_crit_file = 'train_crits.csv'
orig_crit_datapath = os.path.join(path, orig_crit_file)
orig_crit_data = read_dataset_component(orig_crit_datapath)

print("NUM STORIES: ", len(story_data))
print("NUM CRITIQUE LISTS: ", len(crit_data))
print("NUM ORIG CRITS: ", len(orig_crit_data))
print("NUM CRITIQUES PER: ", len(crit_data[1]))
print(story_data[1])
print(crit_data[1])
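Not part of the diff: a small follow-up check that could be appended to examine_data.py, reusing the variables defined above, to make the alignment assumption explicit; the assertion message is illustrative.

# Hypothetical extension: fail loudly if the paraphrase chunks do not line up
# one-to-one with the original critiques and stories.
assert len(crit_data) == len(orig_crit_data) == len(story_data), \
    "paraphrase chunks, original critiques, and stories are misaligned"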
75 changes: 75 additions & 0 deletions carp/experiments/distil_carp/generate_fuzzing_data.py
@@ -0,0 +1,75 @@
import csv
import numpy as np
import openai
from neox_tokenizer import *

bad_word_dict = \
    {"0": -1000, "50256": -1000, "3353": -1000}
# add all line-break tokens to the bad word dict
for idx in line_break_token_ids:
    bad_word_dict[str(idx)] = -1000

key = ""
with open("../../pytorch/data/utils/api_key.txt") as f:
    key = f.read()

# nice try ;)
openai.api_key = key
openai.api_base = "https://api.goose.ai/v1"

prompt = "You are an editor of stories. Below is a set of stories and the criticisms you have written for each " \
         "manuscript.\n\n "


def read_dataset_component(filepath):
    data = list()
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data.append(row[1])
    return data


val_stories = read_dataset_component("../../pytorch/data/utils/val_stories.csv")
val_crits = read_dataset_component("../../pytorch/data/utils/val_crits.csv")

# Keep only stories that are longer than seven words.
stories_crits = list(filter(lambda x: (len(x[0].split()) > 7), zip(val_stories, val_crits)))
stories_crits = [[story for story, _ in stories_crits],
                 [crit for _, crit in stories_crits]]
val_stories = stories_crits[0]
val_crits = stories_crits[1]

train_stories = read_dataset_component("../../pytorch/data/utils/train_stories.csv")[0:10]
examples_n = 5
indices = np.random.choice(len(val_stories), examples_n, replace=False)
print("[" + ", ".join(list(map(str, indices))) + "]")
# Fixed indices for the few-shot examples (overrides the random draw above).
indices = [56, 23, 14, 65, 60]

for i in range(1, examples_n + 1, 1):
    idx = indices[i - 1]
    story_example = str(i) + ". Story: " + val_stories[idx] + "\nCriticism: " + val_crits[idx] + "\n\n"
    prompt += story_example

for input_story in train_stories:
    story_prompt = prompt + str(examples_n + 1) + ". Story: " + input_story + "\nCriticism:"
    print(story_prompt)
    # Debug early-exit: stop before any API calls are made.
    import sys
    sys.exit()
    # Logit bias penalizes EOT (0, 50256) and every line-break token.
    completion = openai.Completion.create(
        engine="gpt-neo-20b",
        prompt=story_prompt,
        max_tokens=40,
        typical_p=0.5,
        logit_bias=bad_word_dict,
        logprobs=30,
        stream=True)
    print(input_story)
    print("\n")

    # Print each token as it is returned
    for c in completion:
        print(c.choices[0].text, end='')

    print("\n=============\n")
8 changes: 8 additions & 0 deletions carp/experiments/distil_carp/neox_tokenizer.py
@@ -0,0 +1,8 @@
import json

tokenizer_data = None
with open("20B_tokenizer.json") as f:
    tokenizer_data = json.load(f)

# "\u010a" ("Ċ") is the byte-level BPE spelling of a newline, so any vocab
# entry containing it corresponds to a line break.
line_break_token_ids = list()
for token, token_id in tokenizer_data['model']['vocab'].items():
    if u"\u010a" in token:
        line_break_token_ids.append(token_id)
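Not part of the diff: a quick sanity check of the "\u010a" assumption, assuming the tokenizers package that produced 20B_tokenizer.json is installed.

from tokenizers import Tokenizer

tok = Tokenizer.from_file("20B_tokenizer.json")
newline_ids = tok.encode("\n", add_special_tokens=False).ids
# Every id the tokenizer actually emits for "\n" should appear in line_break_token_ids.
print(newline_ids, all(i in line_break_token_ids for i in newline_ids))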
55 changes: 55 additions & 0 deletions carp/experiments/distil_carp/paraphrase_critiques.py
@@ -0,0 +1,55 @@
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import csv
import sys
from tqdm import tqdm

tokenizer_pegasus = PegasusTokenizer.from_pretrained('tuner007/pegasus_paraphrase')
model_pegasus = PegasusForConditionalGeneration.from_pretrained('tuner007/pegasus_paraphrase').half().to("cuda")

def read_dataset_component(filepath):
    data = list()
    with open(filepath, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            data.append(row[1])
    return data

num_beams = 5
def get_review_ensemble(input_text):
    batch = tokenizer_pegasus(input_text, truncation=True, padding='longest', max_length=60, return_tensors="pt").to("cuda")
    translated = model_pegasus.generate(**batch, max_length=60, num_beams=num_beams, num_return_sequences=num_beams, temperature=1.5)
    return tokenizer_pegasus.batch_decode(translated, skip_special_tokens=True)

def write_dataset_csv(data, filepath):
    with open(filepath, mode='w') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        writer.writerows(data)

filepath = '/mnt/raid/users/AlexH/magiCARP/carp/pytorch/data/utils/train_crits.csv'
data = read_dataset_component(filepath)
batch_size = 100
num_batches = (len(data) + batch_size - 1) // batch_size
output_file = 'paraphrase_train_crits.csv'
write_thresh = 1000
temp_csv = []
print(len(data))
for i in tqdm(range(num_batches)):
    cur_batch_size = min(batch_size, len(data) - batch_size * i)
    batch = data[i * batch_size:i * batch_size + cur_batch_size]
    num_paraphrases = 5
    paraphrases = get_review_ensemble(batch)
    # Regroup the flat beam output into one list of num_paraphrases per critique.
    reshaped_paraphrases = []
    for j in range(len(paraphrases) // num_paraphrases):
        reshaped_paraphrases.append([])
        for k in range(num_paraphrases):
            reshaped_paraphrases[-1].append(paraphrases[j * num_paraphrases + k])
    temp_csv += reshaped_paraphrases
    if (i + 1) % write_thresh == 0:
        print("WRITING TO CSV")
        cur_output_file = output_file + f"_{i}"
        write_dataset_csv(temp_csv, cur_output_file)
        temp_csv = []
write_dataset_csv(temp_csv, output_file)

24 changes: 14 additions & 10 deletions carp/pytorch/data/__init__.py
@@ -14,8 +14,9 @@
# specifies a dictionary of architectures
_DATAPIPELINE: Dict[str, any] = {}  # registry


def register_datapipeline(name):
    """Decorator used to register a CARP architecture

    Args:
        name: Name of the architecture
@@ -29,7 +30,7 @@ def register_class(cls, name):
    if isinstance(name, str):
        name = name.lower()
        return lambda c: register_class(c, name)

    cls = name
    name = cls.__name__
    register_class(cls, name.lower())
@@ -42,9 +43,9 @@ class BaseDataPipeline(Dataset):
    """Dataset wrapper class to ease working with the CARP dataset and Pytorch data utilities."""

    def __init__(
        self,
        dupe_protection: bool = True,
        path: str = "dataset",
    ):
        dataset = load_from_disk(path)
        train = dataset["train"]
@@ -70,8 +71,7 @@ def __len__(self) -> int:
        return len(self.passages)

    @staticmethod
    def create_tokenizer_factory(call_tokenizer: Callable, tokenizer_factory: Callable, context_len: int) -> Callable:
        """Function creates a callable tokenizer subroutine and uses it to curry the tokenizer factory

        Args:
@@ -85,7 +85,7 @@ def create_tokenizer_factory(call_tokenizer : Callable, tokenizer_factory : Call
        return partial(tokenizer_factory, tok_func)

    @staticmethod
    def tokenizer_factory(_tok: Callable, encoder: BaseEncoder) -> Callable:

        """Function factory that creates a collate function for use with a torch.util.data.Dataloader

@@ -95,9 +95,10 @@ def tokenizer_factory(_tok : Callable, encoder: BaseEncoder) -> Callable:
        Returns:
            Callable: A function that will take a batch of string tuples and tokenize them properly.
        """

        @typechecked
        def collate(
            data: Iterable[Tuple[str, str]]
        ) -> Tuple[BatchElement, BatchElement]:
            passages, reviews = zip(*data)
            pass_tokens, rev_tokens = _tok(list(passages)), _tok(list(reviews))
@@ -111,12 +112,15 @@ def collate(
            )

        return collate

from carp.pytorch.data.mlm_pipeline import MLMDataPipeline
from carp.pytorch.data.scarecrow_pipeline import ScarecrowDataPipeline
from carp.pytorch.data.distill_pipeline import DistillDataPipeline


def get_datapipeline(name):
    return _DATAPIPELINE[name.lower()]


def get_datapipeline_names():
    return _DATAPIPELINE.keys()
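Not part of the diff: a minimal sketch of how the registry touched here is meant to be used, assuming the hidden tail of register_datapipeline returns the class as the existing pipelines imply; MyDistillPipeline is a made-up name.

from carp.pytorch.data import BaseDataPipeline, register_datapipeline, get_datapipeline

@register_datapipeline
class MyDistillPipeline(BaseDataPipeline):
    # Registered under the lowercased class name, like DistillDataPipeline above.
    pass

assert get_datapipeline("MyDistillPipeline") is MyDistillPipeline  # lookup lowercases the name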