From 1735cba249f99c743db96277f01846d6be6fab68 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 03:19:45 +0000 Subject: [PATCH 01/22] yolo refactor --- .github/workflows/deploy.yaml | 14 - .github/workflows/test.yaml | 7 - LICENSE | 201 ------- MANIFEST.in | 5 - __init__.py | 0 examples/textbooks_A2YN/gpt_labeling.py | 49 -- gpt_labeling.py | 40 ++ nbs/00_core.ipynb | 513 ------------------ nbs/02_tutorial.ipynb | 104 ---- nbs/_quarto.yml | 20 - nbs/index.ipynb | 96 ---- nbs/nbdev.yml | 9 - nbs/styles.css | 37 -- requirements.txt | 8 + settings.ini | 43 -- .../train_labeler.py => train_labeler.py | 0 treasure_trove/__init__.py | 1 - treasure_trove/_modidx.py | 11 - treasure_trove/core.py | 146 ++++- 19 files changed, 167 insertions(+), 1137 deletions(-) delete mode 100644 .github/workflows/deploy.yaml delete mode 100644 .github/workflows/test.yaml delete mode 100644 LICENSE delete mode 100644 MANIFEST.in create mode 100644 __init__.py delete mode 100644 examples/textbooks_A2YN/gpt_labeling.py create mode 100644 gpt_labeling.py delete mode 100644 nbs/00_core.ipynb delete mode 100644 nbs/02_tutorial.ipynb delete mode 100644 nbs/_quarto.yml delete mode 100644 nbs/index.ipynb delete mode 100644 nbs/nbdev.yml delete mode 100644 nbs/styles.css create mode 100644 requirements.txt delete mode 100644 settings.ini rename examples/textbooks_A2YN/train_labeler.py => train_labeler.py (100%) delete mode 100644 treasure_trove/_modidx.py diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml deleted file mode 100644 index 29bfc57..0000000 --- a/.github/workflows/deploy.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: Deploy to GitHub Pages - -permissions: - contents: write - pages: write - -on: - push: - branches: [ "main", "master" ] - workflow_dispatch: -jobs: - deploy: - runs-on: ubuntu-latest - steps: [uses: fastai/workflows/quarto-ghp@master] diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml deleted file mode 100644 index 5608592..0000000 --- a/.github/workflows/test.yaml +++ /dev/null @@ -1,7 +0,0 @@ -name: CI -on: [workflow_dispatch, pull_request, push] - -jobs: - test: - runs-on: ubuntu-latest - steps: [uses: fastai/workflows/nbdev-ci@master] diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 3b106e8..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2022, fastai - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 5c0e7ce..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,5 +0,0 @@ -include settings.ini -include LICENSE -include CONTRIBUTING.md -include README.md -recursive-exclude * __pycache__ diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/textbooks_A2YN/gpt_labeling.py b/examples/textbooks_A2YN/gpt_labeling.py deleted file mode 100644 index b19c72e..0000000 --- a/examples/textbooks_A2YN/gpt_labeling.py +++ /dev/null @@ -1,49 +0,0 @@ -import os - -from datasets import concatenate_datasets, load_dataset -from squeakily.helpers import LLMLabeler -from treasure_trove.core import label_dataset - -instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. 
-High quality code has the following: -* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. -* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. -* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. -* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. -Medium quality code has the following: -* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. -* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. -* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. -* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. -Low quality code has the following: -* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. -* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. -* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. -* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. 
- -Output nothing other than one of the following labels: -""" - -labels = ["high quality", "medium quality", "low quality"] -api_key = os.environ["OPENAI_KEY"] -labeler = LLMLabeler(instruction, labels, model_name="gpt-4", api_key=api_key) # gpt-3.5-turbo - -languages = ["python", "go", "java", "javascript", "c", "c++"] -subsets = [] -for lang in languages: - ds = load_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"] - sample = 50 / len(ds) - subset = label_dataset(ds, "content", labeler, labels, sample=sample, num_workers=8) - new_column = [lang] * len(subset) - subset = subset.add_column("language", new_column) - subsets.append(subset) - -labeled_ds = concatenate_datasets(subsets) - -# upload to huggingface -labeled_ds.push_to_hub("CarperAI/textbooks_A2YN_labeled_six_languages", private=True) - -# print number of each class -print(f"Number of {labels[0]}: {len(labeled_ds.filter(lambda x: x['label'] == 0))}") -print(f"Number of {labels[1]}: {len(labeled_ds.filter(lambda x: x['label'] == 1))}") -print(f"Number of {labels[2]}: {len(labeled_ds.filter(lambda x: x['label'] == 2))}") diff --git a/gpt_labeling.py b/gpt_labeling.py new file mode 100644 index 0000000..1340a43 --- /dev/null +++ b/gpt_labeling.py @@ -0,0 +1,40 @@ +import os + +from pydantic import BaseModel, Field + +from datasets import concatenate_datasets, load_dataset +from typing import List +from langchain.output_parsers import PydanticOutputParser +from langchain.chat_models import AzureChatOpenAI, ChatOpenAI +from langchain.prompts import PromptTemplate +from langchain.prompts.chat import ( + ChatPromptTemplate, + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, +) +from dotenv import load_dotenv +import time + +from treasure_trove.core import label_dataset + +load_dotenv(".env") +labels = ["high quality", "medium quality", "low quality"] +languages = ["python", "javascript"] +subsets = [] +for lang in languages: + ds = load_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"] + sample = 50 / len(ds) + subset = label_dataset(ds, "content", labels, sample=sample, num_workers=1) + new_column = [lang] * len(subset) + subset = subset.add_column("language", new_column) + subsets.append(subset) + +labeled_ds = concatenate_datasets(subsets) + +# upload to huggingface +labeled_ds.push_to_hub("CarperAI/textbooks_A2YN_labeled_six_languages", private=True) + +# print number of each class +print(f"Number of {labels[0]}: {len(labeled_ds.filter(lambda x: x['label'] == 0))}") +print(f"Number of {labels[1]}: {len(labeled_ds.filter(lambda x: x['label'] == 1))}") +print(f"Number of {labels[2]}: {len(labeled_ds.filter(lambda x: x['label'] == 2))}") diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb deleted file mode 100644 index 7e7aea7..0000000 --- a/nbs/00_core.ipynb +++ /dev/null @@ -1,513 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# core\n", - "\n", - "> Fill in a module description here" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | default_exp core" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | export\n", - "import evaluate\n", - "import time\n", - "\n", - "import numpy as np\n", - "\n", - "from transformers import (\n", - " AutoModelForSequenceClassification,\n", - " AutoTokenizer,\n", - " DataCollatorWithPadding,\n", - " Trainer,\n", - ")" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | hide\n", - "from nbdev.showdoc import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | export\n", - "def classify(x, labels, llm_labeler, max_failures=5, default_label=0):\n", - " failures = 0\n", - " while failures < max_failures:\n", - " try:\n", - " label = labels.index(llm_labeler(x)[0])\n", - " time.sleep(1)\n", - " return label\n", - " except Exception as e:\n", - " failures += 1\n", - " print(e)\n", - " time.sleep(1)\n", - " pass\n", - " if failures == max_failures:\n", - " return default_label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | export\n", - "def label_dataset(\n", - " dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4, max_chars=4_096\n", - "):\n", - " \"\"\"\n", - " Filters a dataset using a labeler model.\n", - "\n", - " Args:\n", - " dataset (datasets.Dataset): Dataset to filter\n", - " text_column (str): Name of the column containing the text to classify\n", - " labeler_model (Any): Model to use for labeling\n", - " labels (List[str]): List of labels\n", - " sample (float): The fraction of the dataset to label and use for filtering\n", - " batch_size (int): Batch size for labeling\n", - " num_workers (int): Number of workers for labeling\n", - " max_chars (int): Maximum number of characters to truncate the text to before labeling (reduces rate limiting errors)\n", - " \"\"\"\n", - "\n", - " # Get a subset of the dataset\n", - " subset = dataset.shuffle(seed=115).select(range(int(len(dataset) * sample)))\n", - "\n", - " # Label the subset\n", - " subset = subset.map(\n", - " lambda x: {\"label\": classify(x[text_column][:max_chars], labels, labeler_model)},\n", - " batched=False,\n", - " num_proc=num_workers,\n", - " )\n", - "\n", - " return subset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using custom data configuration bigcode--the-stack-smol-8f8055c3a4e4b4e3\n", - "Found cached dataset json (/home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-8f8055c3a4e4b4e3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cfb95116fc20477bb047848972658d69", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 Find the treasure in your trove of data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "from datasets import load_dataset\n", - "from squeakily.helpers import LLMLabeler\n", - "from transformers import pipeline, TrainingArguments\n", - "from treasure_trove.core import filter_dataset, label_dataset, train_labeler\n", - "\n", - "instruction = \"\"\"Please label the following code as either educational or non-educational.\n", - "Educational code is code that is well written, follows best practices, has documentation such that it might be found in a textbook.\n", - "Non-educational code is code that is poorly written, lacks documentation, contain bugs, or is not idiomatic.\n", - "Labels:\n", - "\"\"\"\n", - "labels = [\"educational\", \"non-educational\"]\n", - "api_key = \"\"\n", - "labeler = LLMLabeler(instruction, labels, model_name=\"gpt-4\", 
api_key=api_key)\n", - "\n", - "ds = load_dataset(\"bigcode/the-stack-smol\", data_dir=\"data/python\")[\"train\"]\n", - "\n", - "# Get the training arguments\n", - "batch_size=4,\n", - "training_args = TrainingArguments(\n", - " output_dir=\"./code_edu\",\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=batch_size,\n", - " per_device_eval_batch_size=batch_size,\n", - " warmup_steps=500,\n", - " weight_decay=0.01,\n", - " logging_dir=\"./logs\",\n", - " logging_steps=10,\n", - " evaluation_strategy=\"epoch\",\n", - " save_strategy=\"epoch\",\n", - " load_best_model_at_end=True,\n", - " metric_for_best_model=\"accuracy\",\n", - " greater_is_better=True,\n", - " seed=42,\n", - " push_to_hub=True,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "subset = label_dataset(ds, \"content\", labeler, labels, sample=0.001)\n", - "base_model_name = \"bigcode/starencoder\"\n", - "model, tokenizer = train_labeler(\n", - " subset,\n", - " \"content\",\n", - " base_model_name,\n", - " n_labels=len(labels),\n", - " training_args=training_args,\n", - " num_workers=4,\n", - " max_length=512,\n", - " push_to_hub=True,\n", - ")\n", - "pipe = pipeline(\n", - " \"text-classification\", model=model, tokenizer=tokenizer, device=model.device\n", - ")\n", - "filtered_ds = filter_dataset(ds, \"content\", model, labels.index(\"educational\"))\n", - "filtered_ds.push_to_hub(\"ncoop57/code_edu\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml deleted file mode 100644 index 0a6dfcb..0000000 --- a/nbs/_quarto.yml +++ /dev/null @@ -1,20 +0,0 @@ -project: - type: website - -format: - html: - theme: cosmo - css: styles.css - toc: true - -website: - twitter-card: true - open-graph: true - repo-actions: [issue] - navbar: - background: primary - search: true - sidebar: - style: floating - -metadata-files: [nbdev.yml, sidebar.yml] \ No newline at end of file diff --git a/nbs/index.ipynb b/nbs/index.ipynb deleted file mode 100644 index 5e9fc26..0000000 --- a/nbs/index.ipynb +++ /dev/null @@ -1,96 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | hide\n", - "from treasure_trove.core import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# treasure_trove\n", - "\n", - "> Find the treasure in your trove of data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This file will become your README and also the index of your documentation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Install" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```sh\n", - "pip install treasure_trove\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How to use" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fill me in please! 
Don't forget code examples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "1 + 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/nbs/nbdev.yml b/nbs/nbdev.yml deleted file mode 100644 index 8264f3b..0000000 --- a/nbs/nbdev.yml +++ /dev/null @@ -1,9 +0,0 @@ -project: - output-dir: _docs - -website: - title: "treasure_trove" - site-url: "https://CarperAI.github.io/treasure_trove" - description: "Find the treasure in your trove of data" - repo-branch: main - repo-url: "https://github.com/CarperAI/treasure_trove" diff --git a/nbs/styles.css b/nbs/styles.css deleted file mode 100644 index 66ccc49..0000000 --- a/nbs/styles.css +++ /dev/null @@ -1,37 +0,0 @@ -.cell { - margin-bottom: 1rem; -} - -.cell > .sourceCode { - margin-bottom: 0; -} - -.cell-output > pre { - margin-bottom: 0; -} - -.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { - margin-left: 0.8rem; - margin-top: 0; - background: none; - border-left: 2px solid lightsalmon; - border-top-left-radius: 0; - border-top-right-radius: 0; -} - -.cell-output > .sourceCode { - border: none; -} - -.cell-output > .sourceCode { - background: none; - margin-top: 0; -} - -div.description { - padding-left: 2px; - padding-top: 5px; - font-style: italic; - font-size: 135%; - opacity: 70%; -} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d730a99 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +accelerate +datasets +evaluate +fastcore +langchain +openai +transformers +python-dotenv \ No newline at end of file diff --git a/settings.ini b/settings.ini deleted file mode 100644 index 3e8da59..0000000 --- a/settings.ini +++ /dev/null @@ -1,43 +0,0 @@ -[DEFAULT] -# All sections below are required unless otherwise specified. -# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples. 
- -### Python library ### -repo = treasure_trove -lib_name = %(repo)s -version = 0.0.1 -min_python = 3.7 -license = apache2 -black_formatting = False - -### nbdev ### -doc_path = _docs -lib_path = treasure_trove -nbs_path = nbs -recursive = True -tst_flags = notest -put_version_in_init = True - -### Docs ### -branch = main -custom_sidebar = False -doc_host = https://%(user)s.github.io -doc_baseurl = /%(repo)s -git_url = https://github.com/%(user)s/%(repo)s -title = %(lib_name)s - -### PyPI ### -audience = Developers -author = ncoop57 -author_email = nacooper01@email.wm.edu -copyright = 2023 onwards, %(author)s -description = Find the treasure in your trove of data -keywords = nbdev jupyter notebook python -language = English -status = 3 -user = CarperAI - -### Optional ### -requirements = accelerate datasets evaluate fastcore langchain openai squeakily transformers -dev_requirements = black[jupyter] ipykernel -# console_scripts = \ No newline at end of file diff --git a/examples/textbooks_A2YN/train_labeler.py b/train_labeler.py similarity index 100% rename from examples/textbooks_A2YN/train_labeler.py rename to train_labeler.py diff --git a/treasure_trove/__init__.py b/treasure_trove/__init__.py index f102a9c..e69de29 100644 --- a/treasure_trove/__init__.py +++ b/treasure_trove/__init__.py @@ -1 +0,0 @@ -__version__ = "0.0.1" diff --git a/treasure_trove/_modidx.py b/treasure_trove/_modidx.py deleted file mode 100644 index 79d02e9..0000000 --- a/treasure_trove/_modidx.py +++ /dev/null @@ -1,11 +0,0 @@ -# Autogenerated by nbdev - -d = { 'settings': { 'branch': 'main', - 'doc_baseurl': '/treasure_trove', - 'doc_host': 'https://CarperAI.github.io', - 'git_url': 'https://github.com/CarperAI/treasure_trove', - 'lib_path': 'treasure_trove'}, - 'syms': { 'treasure_trove.core': { 'treasure_trove.core.classify': ('core.html#classify', 'treasure_trove/core.py'), - 'treasure_trove.core.filter_dataset': ('core.html#filter_dataset', 'treasure_trove/core.py'), - 'treasure_trove.core.label_dataset': ('core.html#label_dataset', 'treasure_trove/core.py'), - 'treasure_trove.core.train_labeler': ('core.html#train_labeler', 'treasure_trove/core.py')}}} diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 0fc06ac..fe92666 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,11 +1,5 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb. - -# %% auto 0 -__all__ = ['classify', 'label_dataset', 'train_labeler', 'filter_dataset'] - -# %% ../nbs/00_core.ipynb 2 -import evaluate import time +import os import numpy as np @@ -15,15 +9,116 @@ DataCollatorWithPadding, Trainer, ) +from langchain.prompts.chat import ( + ChatPromptTemplate, + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, +) +from dotenv import load_dotenv +import time + + +from pydantic import BaseModel, Field + +from datasets import concatenate_datasets, load_dataset +from typing import List +from langchain.output_parsers import PydanticOutputParser +from langchain.chat_models import AzureChatOpenAI, ChatOpenAI +from langchain.prompts import PromptTemplate + +class LLMLabelerParser(BaseModel): + labels: List = Field( + ..., title="Labels", description="Labels that the LLM classifies the text as" + ) + + +class LLMLabeler: + def __init__( + self, + instruction: str, + labels: List, + model_name: str = "gpt-3.5-turbo", + api_key: str = None, + model_type: str = "openai", + ): + self.instruction = instruction + self.labels = labels + # Set up a parser + inject instructions into the prompt template. 
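+        # The parser's format instructions are filled into the system prompt
+        # below via the {format_instructions} partial variable; the human
+        # message carries the text to classify.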
+ self.parser = PydanticOutputParser(pydantic_object=LLMLabelerParser) + prompt = PromptTemplate( + template="{instruction}\n{labels}\n{format_instructions}\n", + input_variables=["instruction", "labels"], + partial_variables={ + "format_instructions": self.parser.get_format_instructions() + }, + ) + system_message_prompt = SystemMessagePromptTemplate(prompt=prompt) + human_template = "{text}" + human_message_prompt = HumanMessagePromptTemplate.from_template(human_template) + self.chat_template = ChatPromptTemplate.from_messages( + [system_message_prompt, human_message_prompt] + ) + + if model_type == "azure": + raise NotImplementedError("Azure models are not supported yet") + elif model_type == "openai": + self.model = ChatOpenAI( + openai_api_key=api_key, model_name=model_name, temperature=0 + ) + else: + raise ValueError(f"Model type {model_type} is not supported") + + def __call__(self, text: str): + messages = self.chat_template.format_prompt( + instruction=self.instruction, labels=self.labels, text=text + ).to_messages() + output = self.model(messages) + print('model output', output.content) + if output.content in self.labels: + return [output] + predicted_labels = self.parser.parse(output.content) + print('pred labels', predicted_labels) + # check if all the predicted tags are in the list of tags + assert all( + [label in self.labels for label in predicted_labels.labels] + ), f"Predicted labels {predicted_labels.labels} are not in the list of tags {self.labels}" + return predicted_labels.labels + -# %% ../nbs/00_core.ipynb 4 -def classify(x, labels, llm_labeler, max_failures=5, default_label=0): +instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. +* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. +* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. +* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. +* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. +* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. +* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. +* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. 
+* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. +* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. + +Output nothing other than one of the following labels: +""" + + +def classify(x, labels, max_failures=5, default_label=0): failures = 0 + api_key = os.environ["OPENAI_KEY"] + labeler = LLMLabeler( + instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key + ) + while failures < max_failures: try: - label = labels.index(llm_labeler(x)[0]) - time.sleep(1) - return label + label = labeler(x)[0] + label_idx = labels.index(label) + print(label, label_idx) + return label_idx except Exception as e: failures += 1 print(e) @@ -32,9 +127,14 @@ def classify(x, labels, llm_labeler, max_failures=5, default_label=0): if failures == max_failures: return default_label -# %% ../nbs/00_core.ipynb 5 + def label_dataset( - dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4, max_chars=4_096 + dataset, + text_column, + labels, + sample=0.1, + num_workers=4, + max_chars=4_096, ): """ Filters a dataset using a labeler model. @@ -55,14 +155,16 @@ def label_dataset( # Label the subset subset = subset.map( - lambda x: {"label": classify(x[text_column][:max_chars], labels, labeler_model)}, + lambda x: { + "label": classify(x[text_column][:max_chars], labels) + }, batched=False, num_proc=num_workers, ) return subset -# %% ../nbs/00_core.ipynb 7 + def train_labeler( dataset, text_column, @@ -112,15 +214,6 @@ def train_labeler( # Get the data collator data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - def compute_metrics(eval_preds): - metric = evaluate.load("glue", "mrpc") - logits, labels = eval_preds - if isinstance(logits, tuple): # Some models return tuples - logits = logits[0] - print(logits.shape, labels) - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - # Get the trainer trainer = Trainer( model=model, @@ -128,7 +221,6 @@ def compute_metrics(eval_preds): train_dataset=dataset["train"], eval_dataset=dataset["test"], data_collator=data_collator, - compute_metrics=compute_metrics, ) # Train the model @@ -141,7 +233,7 @@ def compute_metrics(eval_preds): # Return the model return model, tokenizer -# %% ../nbs/00_core.ipynb 9 + def filter_dataset( dataset, text_column, labeler_model, labels_to_keep, batch_size=32, num_workers=4 ): From 59251f957e674656f6582708cb42f216a8286bee Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 16:21:40 +0000 Subject: [PATCH 02/22] streaming pile subset --- .gitignore | 2 ++ gpt_labeling.py | 69 ++++++++++++++++++++++------------------- requirements.txt | 3 +- treasure_trove/core.py | 70 +++++++++++++----------------------------- view_dataset.py | 10 ++++++ 5 files changed, 73 insertions(+), 81 deletions(-) create mode 100644 view_dataset.py diff --git a/.gitignore b/.gitignore index 900add7..7b78dfe 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,5 @@ checklink/cookies.txt # Quarto .quarto + +checkpoints/ \ No newline at end of file diff --git a/gpt_labeling.py b/gpt_labeling.py index 1340a43..fc0e86c 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -1,40 +1,45 @@ import os - -from pydantic import BaseModel, Field +from pathlib import Path from datasets import concatenate_datasets, load_dataset -from typing import List -from langchain.output_parsers import PydanticOutputParser -from 
langchain.chat_models import AzureChatOpenAI, ChatOpenAI -from langchain.prompts import PromptTemplate -from langchain.prompts.chat import ( - ChatPromptTemplate, - SystemMessagePromptTemplate, - HumanMessagePromptTemplate, -) from dotenv import load_dotenv -import time -from treasure_trove.core import label_dataset +from treasure_trove.core import classify load_dotenv(".env") labels = ["high quality", "medium quality", "low quality"] -languages = ["python", "javascript"] -subsets = [] -for lang in languages: - ds = load_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"] - sample = 50 / len(ds) - subset = label_dataset(ds, "content", labels, sample=sample, num_workers=1) - new_column = [lang] * len(subset) - subset = subset.add_column("language", new_column) - subsets.append(subset) - -labeled_ds = concatenate_datasets(subsets) - -# upload to huggingface -labeled_ds.push_to_hub("CarperAI/textbooks_A2YN_labeled_six_languages", private=True) - -# print number of each class -print(f"Number of {labels[0]}: {len(labeled_ds.filter(lambda x: x['label'] == 0))}") -print(f"Number of {labels[1]}: {len(labeled_ds.filter(lambda x: x['label'] == 1))}") -print(f"Number of {labels[2]}: {len(labeled_ds.filter(lambda x: x['label'] == 2))}") +lang = "python" +processed_subsets = [] +max_chars = 4_096 +num_workers = 8 +epochs = 2 +buffer_size = 1000 +dataset = load_dataset( + "bigcode/the-stack-dedup", data_dir=f"data/{lang}", streaming=True +)["train"] +subset = dataset.shuffle(seed=115, buffer_size=buffer_size) + +for epoch in range(epochs): + subset.set_epoch(epoch) + + procesed = subset.map( + lambda x: {"label": classify(x["content"][:max_chars], labels)}, + batched=False, + ) + + lang_column = [lang] * buffer_size + procesed = procesed.add_column("language", lang_column) + processed_subsets.append(procesed) + + processed_ds = concatenate_datasets(processed_subsets) + + # upload to huggingface + ckpt_dir = "./checkpoints" + Path(ckpt_dir).mkdir(exist_ok=True) + processed_ds.save_to_disk(ckpt_dir + "/latest") + processed_ds.push_to_hub("roborovski/phi-1", private=True) + + # print number of each class + print(f"Number of {labels[0]}: {len(processed_ds.filter(lambda x: x['label'] == 0))}") + print(f"Number of {labels[1]}: {len(processed_ds.filter(lambda x: x['label'] == 1))}") + print(f"Number of {labels[2]}: {len(processed_ds.filter(lambda x: x['label'] == 2))}") diff --git a/requirements.txt b/requirements.txt index d730a99..73081f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ fastcore langchain openai transformers -python-dotenv \ No newline at end of file +python-dotenv +pandas \ No newline at end of file diff --git a/treasure_trove/core.py b/treasure_trove/core.py index fe92666..adb8058 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,5 +1,6 @@ import time import os +import re import numpy as np @@ -26,6 +27,7 @@ from langchain.chat_models import AzureChatOpenAI, ChatOpenAI from langchain.prompts import PromptTemplate + class LLMLabelerParser(BaseModel): labels: List = Field( ..., title="Labels", description="Labels that the LLM classifies the text as" @@ -68,21 +70,29 @@ def __init__( else: raise ValueError(f"Model type {model_type} is not supported") + def parse(self, text: str): + for label in self.labels: + match = re.search( + r"\{.*\}", text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL + ) + match = re.search(label, text) + if bool(match): + return label + return None + def __call__(self, text: str): messages = 
self.chat_template.format_prompt( instruction=self.instruction, labels=self.labels, text=text ).to_messages() output = self.model(messages) - print('model output', output.content) - if output.content in self.labels: - return [output] - predicted_labels = self.parser.parse(output.content) - print('pred labels', predicted_labels) - # check if all the predicted tags are in the list of tags - assert all( - [label in self.labels for label in predicted_labels.labels] - ), f"Predicted labels {predicted_labels.labels} are not in the list of tags {self.labels}" - return predicted_labels.labels + print("model output", output.content) + print(output) + label = self.parse(output.content) + if not label: + print("label not found!") + raise Exception("Label not found") + print("get label", label) + return label instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. @@ -115,9 +125,10 @@ def classify(x, labels, max_failures=5, default_label=0): while failures < max_failures: try: - label = labeler(x)[0] + label = labeler(x) label_idx = labels.index(label) print(label, label_idx) + time.sleep(1) return label_idx except Exception as e: failures += 1 @@ -128,43 +139,6 @@ def classify(x, labels, max_failures=5, default_label=0): return default_label -def label_dataset( - dataset, - text_column, - labels, - sample=0.1, - num_workers=4, - max_chars=4_096, -): - """ - Filters a dataset using a labeler model. - - Args: - dataset (datasets.Dataset): Dataset to filter - text_column (str): Name of the column containing the text to classify - labeler_model (Any): Model to use for labeling - labels (List[str]): List of labels - sample (float): The fraction of the dataset to label and use for filtering - batch_size (int): Batch size for labeling - num_workers (int): Number of workers for labeling - max_chars (int): Maximum number of characters to truncate the text to before labeling (reduces rate limiting errors) - """ - - # Get a subset of the dataset - subset = dataset.shuffle(seed=115).select(range(int(len(dataset) * sample))) - - # Label the subset - subset = subset.map( - lambda x: { - "label": classify(x[text_column][:max_chars], labels) - }, - batched=False, - num_proc=num_workers, - ) - - return subset - - def train_labeler( dataset, text_column, diff --git a/view_dataset.py b/view_dataset.py new file mode 100644 index 0000000..864889f --- /dev/null +++ b/view_dataset.py @@ -0,0 +1,10 @@ +import os +from pathlib import Path +from collections import Counter + +from datasets import load_dataset + +ds = load_dataset("roborovski/phi-1")["train"] +print(ds) +print(Counter(ds['label'])) +print(Counter(ds['language'])) From 745bba67d754128d4ab80872b86031539ef6a46a Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 19:32:32 +0000 Subject: [PATCH 03/22] wip --- gpt_labeling.py | 84 +++++++++++++++++++++++++++++++----------- treasure_trove/core.py | 50 +++++++++---------------- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index fc0e86c..f566a7c 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -1,45 +1,87 @@ import os from pathlib import Path -from datasets import concatenate_datasets, load_dataset +from datasets import concatenate_datasets, load_dataset, IterableDataset, Dataset from dotenv import load_dotenv -from treasure_trove.core import classify +import time +from treasure_trove.core import LLMLabeler, instruction load_dotenv(".env") +api_key = os.environ["OPENAI_KEY"] 
labels = ["high quality", "medium quality", "low quality"] lang = "python" processed_subsets = [] max_chars = 4_096 num_workers = 8 -epochs = 2 -buffer_size = 1000 + +buffer_size = 1_000 +chunk_size = 50 + +print("Loading dataset..") dataset = load_dataset( - "bigcode/the-stack-dedup", data_dir=f"data/{lang}", streaming=True + "bigcode/the-stack-dedup", + data_dir=f"data/{lang}", + streaming=True, )["train"] +print("Loaded dataset.") + subset = dataset.shuffle(seed=115, buffer_size=buffer_size) -for epoch in range(epochs): - subset.set_epoch(epoch) +chunks_to_process = buffer_size // chunk_size - procesed = subset.map( - lambda x: {"label": classify(x["content"][:max_chars], labels)}, - batched=False, - ) +total_cost = 0 +max_failures = 5 +failures = 0 +labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) + +for chunk in range(chunks_to_process): + print(f"Chunk {chunk} / {chunks_to_process} starting...") - lang_column = [lang] * buffer_size - procesed = procesed.add_column("language", lang_column) - processed_subsets.append(procesed) + processed_rows = [] + subset.set_epoch(chunk) - processed_ds = concatenate_datasets(processed_subsets) + for i, x in enumerate(subset): + failures = 0 + label_idx, cost_info = 0, {} + while failures < max_failures: + try: + label, cost_info = labeler(x["content"][:max_chars]) + label_idx = labels.index(label) + print(label, label_idx) + time.sleep(1) + break + except Exception as e: + failures += 1 + print(e) + time.sleep(1) + if failures != max_failures: + total_cost += cost_info["total_cost"] + print( + f"classified {i}: {label} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" + ) + processed_rows.append({**x, "label": label, "language": lang}) + else: + print(f"Max failures hit on idx {i}, continuing.") - # upload to huggingface + subset_ds = Dataset.from_list(processed_rows) + processed_subsets.append(subset_ds) + + # Save all processed data + all_datasets: Dataset = concatenate_datasets(processed_subsets) ckpt_dir = "./checkpoints" Path(ckpt_dir).mkdir(exist_ok=True) - processed_ds.save_to_disk(ckpt_dir + "/latest") - processed_ds.push_to_hub("roborovski/phi-1", private=True) + all_datasets.save_to_disk(ckpt_dir + "/latest") + all_datasets.push_to_hub("roborovski/phi-1", private=True) # print number of each class - print(f"Number of {labels[0]}: {len(processed_ds.filter(lambda x: x['label'] == 0))}") - print(f"Number of {labels[1]}: {len(processed_ds.filter(lambda x: x['label'] == 1))}") - print(f"Number of {labels[2]}: {len(processed_ds.filter(lambda x: x['label'] == 2))}") + print( + f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" + ) + print( + f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" + ) + print( + f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" + ) + print(f"Cost so far: {total_cost}") diff --git a/treasure_trove/core.py b/treasure_trove/core.py index adb8058..6ecc02d 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,8 +1,6 @@ import time -import os import re - -import numpy as np +from dotenv import load_dotenv from transformers import ( AutoModelForSequenceClassification, @@ -17,7 +15,7 @@ ) from dotenv import load_dotenv import time - +from langchain.callbacks import get_openai_callback, OpenAICallbackHandler from pydantic import BaseModel, Field @@ -65,7 +63,10 @@ def __init__( raise NotImplementedError("Azure models are not supported yet") elif model_type 
== "openai": self.model = ChatOpenAI( - openai_api_key=api_key, model_name=model_name, temperature=0 + openai_api_key=api_key, + model_name=model_name, + temperature=0, + max_tokens=50, ) else: raise ValueError(f"Model type {model_type} is not supported") @@ -80,19 +81,26 @@ def parse(self, text: str): return label return None + def cost_info(self, cb: OpenAICallbackHandler): + return dict( + prompt_tokens=cb.prompt_tokens, + completion_tokens=cb.completion_tokens, + total_cost=cb.total_cost, + ) + def __call__(self, text: str): messages = self.chat_template.format_prompt( instruction=self.instruction, labels=self.labels, text=text ).to_messages() - output = self.model(messages) - print("model output", output.content) - print(output) + cost_info = None + with get_openai_callback() as cb: + output = self.model(messages) + cost_info = self.cost_info(cb) label = self.parse(output.content) if not label: print("label not found!") raise Exception("Label not found") - print("get label", label) - return label + return label, cost_info instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. @@ -116,28 +124,6 @@ def __call__(self, text: str): """ -def classify(x, labels, max_failures=5, default_label=0): - failures = 0 - api_key = os.environ["OPENAI_KEY"] - labeler = LLMLabeler( - instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key - ) - - while failures < max_failures: - try: - label = labeler(x) - label_idx = labels.index(label) - print(label, label_idx) - time.sleep(1) - return label_idx - except Exception as e: - failures += 1 - print(e) - time.sleep(1) - pass - if failures == max_failures: - return default_label - def train_labeler( dataset, From 358e91551d9d293723cccd8ce5f726b352c1e5fc Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 20:10:22 +0000 Subject: [PATCH 04/22] save on an interval per subset --- gpt_labeling.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index f566a7c..6b39732 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -30,11 +30,16 @@ chunks_to_process = buffer_size // chunk_size +subset_save_interval = 100 + total_cost = 0 max_failures = 5 failures = 0 labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) +ckpt_dir = "./checkpoints" +Path(ckpt_dir).mkdir(exist_ok=True) + for chunk in range(chunks_to_process): print(f"Chunk {chunk} / {chunks_to_process} starting...") @@ -63,15 +68,15 @@ processed_rows.append({**x, "label": label, "language": lang}) else: print(f"Max failures hit on idx {i}, continuing.") + if i % subset_save_interval == 0: + subset_ds = Dataset.from_list(processed_rows) + subset_ds.save_to_disk(os.path.join(ckpt_dir, f"chunk_{chunk}_subset_{i}")) + subset_ds.push_to_hub("roborovski/phi-1", private=True) - subset_ds = Dataset.from_list(processed_rows) - processed_subsets.append(subset_ds) - + processed_subsets.append(processed_rows) # Save all processed data all_datasets: Dataset = concatenate_datasets(processed_subsets) - ckpt_dir = "./checkpoints" - Path(ckpt_dir).mkdir(exist_ok=True) - all_datasets.save_to_disk(ckpt_dir + "/latest") + all_datasets.save_to_disk(os.path.join(ckpt_dir, "latest")) all_datasets.push_to_hub("roborovski/phi-1", private=True) # print number of each class From 5ae930b152f48ade9f1129ca7af31e4f7d5dd416 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 20:35:48 +0000 Subject: [PATCH 05/22] rm langchain --- 
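Note: a minimal usage sketch of the labeler as this patch leaves it (illustrative
only; assumes OPENAI_KEY is provided via .env, and uses the new two-argument
__init__ and the (label, cost_info) return value of __call__):

    from dotenv import load_dotenv

    load_dotenv(".env")  # must run before importing core, which reads OPENAI_KEY at import time

    from treasure_trove.core import LLMLabeler, instruction

    labels = ["high quality", "medium quality", "low quality"]
    labeler = LLMLabeler(instruction, labels)
    label, cost_info = labeler("def add(a, b):\n    return a + b")
    print(label, cost_info["total_cost"])
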
gpt_labeling.py | 12 ++-- requirements.txt | 1 - treasure_trove/core.py | 136 ++++++++++++++--------------------------- 3 files changed, 53 insertions(+), 96 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 6b39732..39a2933 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -14,9 +14,12 @@ processed_subsets = [] max_chars = 4_096 num_workers = 8 +labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) +res = labeler("def create()") +print(res) -buffer_size = 1_000 -chunk_size = 50 +buffer_size = 10_000 +chunks_to_process = 10 print("Loading dataset..") dataset = load_dataset( @@ -26,16 +29,13 @@ )["train"] print("Loaded dataset.") -subset = dataset.shuffle(seed=115, buffer_size=buffer_size) - -chunks_to_process = buffer_size // chunk_size +subset = dataset.shuffle(seed=100, buffer_size=buffer_size) subset_save_interval = 100 total_cost = 0 max_failures = 5 failures = 0 -labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) ckpt_dir = "./checkpoints" Path(ckpt_dir).mkdir(exist_ok=True) diff --git a/requirements.txt b/requirements.txt index 73081f4..d659a60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ accelerate datasets evaluate fastcore -langchain openai transformers python-dotenv diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 6ecc02d..6e76aae 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,6 +1,5 @@ -import time import re -from dotenv import load_dotenv +import os from transformers import ( AutoModelForSequenceClassification, @@ -8,28 +7,34 @@ DataCollatorWithPadding, Trainer, ) -from langchain.prompts.chat import ( - ChatPromptTemplate, - SystemMessagePromptTemplate, - HumanMessagePromptTemplate, -) -from dotenv import load_dotenv import time -from langchain.callbacks import get_openai_callback, OpenAICallbackHandler +import openai + +openai.api_key = os.getenv("OPENAI_KEY") -from pydantic import BaseModel, Field -from datasets import concatenate_datasets, load_dataset from typing import List -from langchain.output_parsers import PydanticOutputParser -from langchain.chat_models import AzureChatOpenAI, ChatOpenAI -from langchain.prompts import PromptTemplate +instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. +* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. +* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. +* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. +* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. 
+* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. +* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. +* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. +* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. +* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. -class LLMLabelerParser(BaseModel): - labels: List = Field( - ..., title="Labels", description="Labels that the LLM classifies the text as" - ) +Output nothing other than one of the following labels: +{0} +""" class LLMLabeler: @@ -37,94 +42,47 @@ def __init__( self, instruction: str, labels: List, - model_name: str = "gpt-3.5-turbo", - api_key: str = None, - model_type: str = "openai", ): self.instruction = instruction self.labels = labels - # Set up a parser + inject instructions into the prompt template. - self.parser = PydanticOutputParser(pydantic_object=LLMLabelerParser) - prompt = PromptTemplate( - template="{instruction}\n{labels}\n{format_instructions}\n", - input_variables=["instruction", "labels"], - partial_variables={ - "format_instructions": self.parser.get_format_instructions() - }, - ) - system_message_prompt = SystemMessagePromptTemplate(prompt=prompt) - human_template = "{text}" - human_message_prompt = HumanMessagePromptTemplate.from_template(human_template) - self.chat_template = ChatPromptTemplate.from_messages( - [system_message_prompt, human_message_prompt] - ) - if model_type == "azure": - raise NotImplementedError("Azure models are not supported yet") - elif model_type == "openai": - self.model = ChatOpenAI( - openai_api_key=api_key, - model_name=model_name, - temperature=0, - max_tokens=50, - ) - else: - raise ValueError(f"Model type {model_type} is not supported") - - def parse(self, text: str): + def parse_label(self, text: str): for label in self.labels: - match = re.search( - r"\{.*\}", text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL - ) - match = re.search(label, text) + pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) + match = re.search(pattern, text) if bool(match): return label return None - def cost_info(self, cb: OpenAICallbackHandler): + def cost_info(self, oai_response): + prompt_tokens = oai_response["usage"]["prompt_tokens"] + completion_tokens = oai_response["usage"]["completion_tokens"] + total_cost=0.0015 * prompt_tokens + 0.0002 * completion_tokens + return dict( - prompt_tokens=cb.prompt_tokens, - completion_tokens=cb.completion_tokens, - total_cost=cb.total_cost, + total_cost=total_cost, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, ) def __call__(self, text: str): - messages = self.chat_template.format_prompt( - instruction=self.instruction, labels=self.labels, text=text - ).to_messages() - cost_info = None - with get_openai_callback() as cb: - output = self.model(messages) - cost_info = self.cost_info(cb) - label = self.parse(output.content) + formatted_instruction = 
instruction.format(self.labels) + completion = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + temperature=0, + messages=[ + {"role": "system", "content": formatted_instruction}, + {"role": "user", "content": text}, + ], + ) + output_text = completion["choices"][0]["message"]["content"] + label = self.parse_label(output_text) + cost_info = self.cost_info(completion) if not label: - print("label not found!") - raise Exception("Label not found") + raise Exception(f"Label not found in text: {output_text}") return label, cost_info -instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. -High quality code has the following: -* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. -* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. -* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. -* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. -Medium quality code has the following: -* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. -* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. -* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. -* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. -Low quality code has the following: -* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. -* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. -* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. -* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. 
- -Output nothing other than one of the following labels: -""" - - - def train_labeler( dataset, text_column, From 5d381fe30f785d22900ceef77375cb1e5e6bab69 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 20:41:41 +0000 Subject: [PATCH 06/22] bugfixes --- gpt_labeling.py | 2 +- treasure_trove/core.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 39a2933..b22ec30 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -29,7 +29,7 @@ )["train"] print("Loaded dataset.") -subset = dataset.shuffle(seed=100, buffer_size=buffer_size) +subset = dataset.shuffle(seed=110, buffer_size=buffer_size) subset_save_interval = 100 diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 6e76aae..5912163 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -41,13 +41,15 @@ class LLMLabeler: def __init__( self, instruction: str, - labels: List, + labels: List[str], + secondary_labels: List[str], ): self.instruction = instruction self.labels = labels + self.secondary_labels = secondary_labels - def parse_label(self, text: str): - for label in self.labels: + def find_label(self, text: str, labels: List[str]): + for label in labels: pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) match = re.search(pattern, text) if bool(match): @@ -57,7 +59,7 @@ def parse_label(self, text: str): def cost_info(self, oai_response): prompt_tokens = oai_response["usage"]["prompt_tokens"] completion_tokens = oai_response["usage"]["completion_tokens"] - total_cost=0.0015 * prompt_tokens + 0.0002 * completion_tokens + total_cost = 0.0015 * prompt_tokens + 0.0002 * completion_tokens return dict( total_cost=total_cost, @@ -76,7 +78,9 @@ def __call__(self, text: str): ], ) output_text = completion["choices"][0]["message"]["content"] - label = self.parse_label(output_text) + label = self.find_label(output_text, self.labels) + if not label: + label = self.find_label(output_text, self.secondary_labels) cost_info = self.cost_info(completion) if not label: raise Exception(f"Label not found in text: {output_text}") From a94bdbd65dd5033fb7438abc64424c0168f4f6a2 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 20:54:09 +0000 Subject: [PATCH 07/22] secondary labeling --- gpt_labeling.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index b22ec30..82c8327 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -10,11 +10,16 @@ load_dotenv(".env") api_key = os.environ["OPENAI_KEY"] labels = ["high quality", "medium quality", "low quality"] +secondary_labels = ["high", "medium", "low"] lang = "python" processed_subsets = [] max_chars = 4_096 num_workers = 8 -labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) +labeler = LLMLabeler( + instruction, + labels, + secondary_labels=secondary_labels, +) res = labeler("def create()") print(res) From 5ccfd26a49f93f65ac410a649f55aa14f986f9df Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 22:37:53 +0000 Subject: [PATCH 08/22] workign --- gpt_labeling.py | 96 ++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 49 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 82c8327..ee77b60 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -8,11 +8,9 @@ from treasure_trove.core import LLMLabeler, instruction load_dotenv(".env") -api_key = os.environ["OPENAI_KEY"] labels = ["high quality", "medium quality", "low quality"] 
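+# Fallback labels: find_label() first searches the reply for the full strings in
+# `labels`, then for these short forms, so a terse answer like "high" still matches.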
secondary_labels = ["high", "medium", "low"] lang = "python" -processed_subsets = [] max_chars = 4_096 num_workers = 8 labeler = LLMLabeler( @@ -22,67 +20,68 @@ ) res = labeler("def create()") print(res) +dataset_chunks = [] -buffer_size = 10_000 -chunks_to_process = 10 +buffer_size = 500 +chunks_to_process = 20 print("Loading dataset..") -dataset = load_dataset( - "bigcode/the-stack-dedup", - data_dir=f"data/{lang}", - streaming=True, -)["train"] +dataset = load_dataset("parquet", data_files={"train": "data-00000-of-00144.parquet"})[ + "train" +] print("Loaded dataset.") -subset = dataset.shuffle(seed=110, buffer_size=buffer_size) +api_key = os.environ["OPENAI_KEY"] subset_save_interval = 100 -total_cost = 0 max_failures = 5 failures = 0 ckpt_dir = "./checkpoints" Path(ckpt_dir).mkdir(exist_ok=True) -for chunk in range(chunks_to_process): - print(f"Chunk {chunk} / {chunks_to_process} starting...") - - processed_rows = [] - subset.set_epoch(chunk) - - for i, x in enumerate(subset): - failures = 0 - label_idx, cost_info = 0, {} - while failures < max_failures: - try: - label, cost_info = labeler(x["content"][:max_chars]) - label_idx = labels.index(label) - print(label, label_idx) - time.sleep(1) - break - except Exception as e: - failures += 1 - print(e) - time.sleep(1) - if failures != max_failures: - total_cost += cost_info["total_cost"] - print( - f"classified {i}: {label} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" - ) - processed_rows.append({**x, "label": label, "language": lang}) - else: - print(f"Max failures hit on idx {i}, continuing.") - if i % subset_save_interval == 0: - subset_ds = Dataset.from_list(processed_rows) - subset_ds.save_to_disk(os.path.join(ckpt_dir, f"chunk_{chunk}_subset_{i}")) - subset_ds.push_to_hub("roborovski/phi-1", private=True) - - processed_subsets.append(processed_rows) - # Save all processed data - all_datasets: Dataset = concatenate_datasets(processed_subsets) - all_datasets.save_to_disk(os.path.join(ckpt_dir, "latest")) + +def process(x): + failures = 0 + label_idx, cost_info = 0, {} + while failures < max_failures: + try: + label, cost_info = labeler(x["content"][:max_chars]) + label_idx = labels.index(label) + print(label, label_idx) + time.sleep(1) + break + except Exception as e: + failures += 1 + print(e) + time.sleep(1) + print( + f"classified {i}: {label} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" + ) + return {"label": label_idx, "cost": cost_info["total_cost"]} + + +processed_chunk_datasets = [] +start_idx = 1 + +for i in range(start_idx, start_idx + buffer_size, 1): + print(f"Chunk {i} / {chunks_to_process + start_idx} starting...") + + subset = dataset[i : i + buffer_size] + + # Label the subset + subset = dataset.map(process, batched=False, num_proc=8) + + processed_chunk_datasets.append(subset) + + all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet( + os.path.join( + ckpt_dir, f"processed_{start_idx}_to_{chunks_to_process+start_idx}" + ) + ) # print number of each class print( @@ -94,4 +93,3 @@ print( f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" ) - print(f"Cost so far: {total_cost}") From 38b98e93f56cdd7d1d3bf54c4900b05bc3a8e7f6 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 00:05:05 +0000 Subject: [PATCH 09/22] chunking working --- gpt_labeling.py | 25 ++++++++++++------------- treasure_trove/core.py | 5 ++++- 2 
files changed, 16 insertions(+), 14 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index ee77b60..684c2ec 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -1,7 +1,7 @@ import os from pathlib import Path -from datasets import concatenate_datasets, load_dataset, IterableDataset, Dataset +from datasets import concatenate_datasets, load_dataset, IterableDataset, Dataset, ReadInstruction from dotenv import load_dotenv import time @@ -26,9 +26,6 @@ chunks_to_process = 20 print("Loading dataset..") -dataset = load_dataset("parquet", data_files={"train": "data-00000-of-00144.parquet"})[ - "train" -] print("Loaded dataset.") api_key = os.environ["OPENAI_KEY"] @@ -47,31 +44,33 @@ def process(x): label_idx, cost_info = 0, {} while failures < max_failures: try: - label, cost_info = labeler(x["content"][:max_chars]) - label_idx = labels.index(label) - print(label, label_idx) + label_idx, cost_info = labeler(x["content"][:max_chars]) time.sleep(1) break except Exception as e: failures += 1 print(e) time.sleep(1) - print( - f"classified {i}: {label} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" - ) + if cost_info: + print( + f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" + ) + else: + print("row not classified.") return {"label": label_idx, "cost": cost_info["total_cost"]} processed_chunk_datasets = [] -start_idx = 1 +start_idx = 0 for i in range(start_idx, start_idx + buffer_size, 1): print(f"Chunk {i} / {chunks_to_process + start_idx} starting...") - subset = dataset[i : i + buffer_size] + split = ReadInstruction("train", from_=start_idx*buffer_size, to=start_idx*1+buffer_size, unit="abs") + subset = load_dataset("parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"}) # Label the subset - subset = dataset.map(process, batched=False, num_proc=8) + subset = subset.map(process, batched=False, num_proc=4) processed_chunk_datasets.append(subset) diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 5912163..2eeb2ee 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -77,6 +77,8 @@ def __call__(self, text: str): {"role": "user", "content": text}, ], ) + if "error" in completion: + return 0, None output_text = completion["choices"][0]["message"]["content"] label = self.find_label(output_text, self.labels) if not label: @@ -84,7 +86,8 @@ def __call__(self, text: str): cost_info = self.cost_info(completion) if not label: raise Exception(f"Label not found in text: {output_text}") - return label, cost_info + label_idx = self.labels.index(label) + return label_idx, cost_info def train_labeler( From e28d55f78d6a8858edb651a5fbb9ce060b15cdd0 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 00:11:17 +0000 Subject: [PATCH 10/22] dumb bugs --- gpt_labeling.py | 2 +- treasure_trove/core.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 684c2ec..82d491b 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -53,7 +53,7 @@ def process(x): time.sleep(1) if cost_info: print( - f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" + f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']} | {cost_info['total_cost']}" ) else: print("row not classified.") diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 2eeb2ee..88d79a2 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -49,11 +49,11 @@ def 
__init__( self.secondary_labels = secondary_labels def find_label(self, text: str, labels: List[str]): - for label in labels: + for i, label in enumerate(labels): pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) match = re.search(pattern, text) if bool(match): - return label + return i return None def cost_info(self, oai_response): @@ -72,6 +72,7 @@ def __call__(self, text: str): completion = openai.ChatCompletion.create( model="gpt-3.5-turbo", temperature=0, + max_tokens=4, messages=[ {"role": "system", "content": formatted_instruction}, {"role": "user", "content": text}, @@ -80,13 +81,12 @@ def __call__(self, text: str): if "error" in completion: return 0, None output_text = completion["choices"][0]["message"]["content"] - label = self.find_label(output_text, self.labels) - if not label: - label = self.find_label(output_text, self.secondary_labels) + label_idx = self.find_label(output_text, self.labels) + if not label_idx: + label_idx = self.find_label(output_text, self.secondary_labels) cost_info = self.cost_info(completion) - if not label: + if not label_idx: raise Exception(f"Label not found in text: {output_text}") - label_idx = self.labels.index(label) return label_idx, cost_info From cb4a59bddd0af4d0fd671d13a9026789a653f866 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 00:35:01 +0000 Subject: [PATCH 11/22] more dumb bugs --- gpt_labeling.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 82d491b..6525b7e 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -1,7 +1,13 @@ import os from pathlib import Path -from datasets import concatenate_datasets, load_dataset, IterableDataset, Dataset, ReadInstruction +from datasets import ( + concatenate_datasets, + load_dataset, + IterableDataset, + Dataset, + ReadInstruction, +) from dotenv import load_dotenv import time @@ -23,15 +29,13 @@ dataset_chunks = [] buffer_size = 500 -chunks_to_process = 20 +num_chunks = 20 print("Loading dataset..") print("Loaded dataset.") api_key = os.environ["OPENAI_KEY"] -subset_save_interval = 100 - max_failures = 5 failures = 0 @@ -41,6 +45,7 @@ def process(x): failures = 0 + total_cost = 0 label_idx, cost_info = 0, {} while failures < max_failures: try: @@ -52,22 +57,25 @@ def process(x): print(e) time.sleep(1) if cost_info: + total_cost = cost_info["total_cost"] print( f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']} | {cost_info['total_cost']}" ) else: print("row not classified.") - return {"label": label_idx, "cost": cost_info["total_cost"]} + return {"label": label_idx, "cost": total_cost} processed_chunk_datasets = [] -start_idx = 0 - -for i in range(start_idx, start_idx + buffer_size, 1): - print(f"Chunk {i} / {chunks_to_process + start_idx} starting...") - split = ReadInstruction("train", from_=start_idx*buffer_size, to=start_idx*1+buffer_size, unit="abs") - subset = load_dataset("parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"}) +for i in range(num_chunks): + split = ReadInstruction( + "train", from_=i * buffer_size, to=(i + 1) * buffer_size, unit="abs" + ) + print(f"processing chunk {i}: {split}") + subset = load_dataset( + "parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"} + ) # Label the subset subset = subset.map(process, batched=False, num_proc=4) @@ -77,9 +85,7 @@ def process(x): all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) 
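+    # Push the cumulative labeled set after every chunk (and checkpoint it to
+    # parquet below) so partial progress survives an interrupted run.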
all_datasets.push_to_hub("roborovski/phi-1", private=True) all_datasets.to_parquet( - os.path.join( - ckpt_dir, f"processed_{start_idx}_to_{chunks_to_process+start_idx}" - ) + os.path.join(ckpt_dir, f"processed_{start_idx}_to_{num_chunks+start_idx}") ) # print number of each class From bfcebb22dedd85b6c2d40afc2470945120a84a35 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 01:38:04 +0000 Subject: [PATCH 12/22] working --- gpt_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 6525b7e..5e95107 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -85,7 +85,7 @@ def process(x): all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) all_datasets.push_to_hub("roborovski/phi-1", private=True) all_datasets.to_parquet( - os.path.join(ckpt_dir, f"processed_{start_idx}_to_{num_chunks+start_idx}") + os.path.join(ckpt_dir, f"processed_{i}") ) # print number of each class From 505a56bbb77bd816d5e717f6253162f148bd6bce Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 03:32:05 +0000 Subject: [PATCH 13/22] skip exc --- gpt_labeling.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 5e95107..717038d 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -83,10 +83,13 @@ def process(x): processed_chunk_datasets.append(subset) all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) - all_datasets.push_to_hub("roborovski/phi-1", private=True) - all_datasets.to_parquet( - os.path.join(ckpt_dir, f"processed_{i}") - ) + try: + all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet( + os.path.join(ckpt_dir, f"processed_{i}") + ) + except Exception as e: + print(e) # print number of each class print( From dfa71e84744e1dd4786e77cedaf472c137eec2d6 Mon Sep 17 00:00:00 2001 From: Brian Date: Mon, 10 Jul 2023 19:04:17 +0000 Subject: [PATCH 14/22] bump chunks --- gpt_labeling.py | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 717038d..f7d55af 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -29,7 +29,7 @@ dataset_chunks = [] buffer_size = 500 -num_chunks = 20 +num_chunks = 100 print("Loading dataset..") print("Loaded dataset.") @@ -68,10 +68,15 @@ def process(x): processed_chunk_datasets = [] +first_save_idx = 8000 + for i in range(num_chunks): split = ReadInstruction( "train", from_=i * buffer_size, to=(i + 1) * buffer_size, unit="abs" ) + # if i < first_save_idx // buffer_size: + # print(f"skipping chunk {i}: {split}") + # continue print(f"processing chunk {i}: {split}") subset = load_dataset( "parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"} @@ -82,22 +87,21 @@ def process(x): processed_chunk_datasets.append(subset) - all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) - try: - all_datasets.push_to_hub("roborovski/phi-1", private=True) - all_datasets.to_parquet( - os.path.join(ckpt_dir, f"processed_{i}") - ) - except Exception as e: - print(e) + if i > first_save_idx // buffer_size: + all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) + try: + all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet(os.path.join(ckpt_dir, f"processed_{i}")) + except Exception as e: + print(e) - # print number of each class - print( - f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" - ) - 
print( - f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" - ) - print( - f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" - ) + # print number of each class + print( + f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" + ) + print( + f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" + ) + print( + f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" + ) From bc5bd0c0661cdd55675a66a27f2e9f5e424f9a50 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 12 Jul 2023 00:21:34 +0000 Subject: [PATCH 15/22] labeler training --- .gitignore | 4 ++- code_edu | 1 + train_labeler.py | 69 ++++++++++++++++++++++++++++++++--------- treasure_trove/core.py | 70 ------------------------------------------ 4 files changed, 59 insertions(+), 85 deletions(-) create mode 160000 code_edu diff --git a/.gitignore b/.gitignore index 7b78dfe..b7ff8a1 100644 --- a/.gitignore +++ b/.gitignore @@ -154,4 +154,6 @@ checklink/cookies.txt # Quarto .quarto -checkpoints/ \ No newline at end of file +checkpoints/ + +wandb/* \ No newline at end of file diff --git a/code_edu b/code_edu new file mode 160000 index 0000000..e9a28b1 --- /dev/null +++ b/code_edu @@ -0,0 +1 @@ +Subproject commit e9a28b101f91ed62ee3d6c52db2fe1e2edacfbd9 diff --git a/train_labeler.py b/train_labeler.py index 1249930..d97e3a6 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -1,10 +1,46 @@ from datasets import load_dataset from transformers import pipeline, TrainingArguments -from treasure_trove.core import filter_dataset, label_dataset, train_labeler + +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + Trainer, +) -ds = load_dataset("CarperAI/textbooks_A2YN_labeled")["train"] +dataset = load_dataset("roborovski/phi-1")["train"] batch_size = 32 +num_workers = 4 +max_length = 512 +push_to_hub = True +n_labels = 3 +text_column = "content" + +id2label = {0: "HIGH_QUALITY", 1: "MEDIUM_QUALITY", 2: "LOW_QUALITY"} +label2id = {"HIGH_QUALITY": 0, "MEDIUM_QUALITY": 1, "LOW_QUALITY": 2} + +base_model_name = "bigcode/starencoder" +tokenizer = AutoTokenizer.from_pretrained(base_model_name, max_length=max_length) +if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + +model = AutoModelForSequenceClassification.from_pretrained( + base_model_name, num_labels=n_labels, max_length=max_length, id2label=id2label, label2id=label2id +) + +dataset = dataset.map( + lambda x: tokenizer( + x[text_column], padding="max_length", truncation=True, max_length=max_length + ), + batched=True, + num_proc=num_workers, +) + +dataset = dataset.train_test_split(test_size=0.1, seed=42) + +data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + training_args = TrainingArguments( output_dir="./code_edu", num_train_epochs=3, @@ -21,17 +57,22 @@ greater_is_better=True, seed=42, push_to_hub=True, - hub_model_id="CarperAI/code_edu_classifier_py", + hub_model_id="roborovski/phi-2-classifier", hub_private_repo=True, ) -base_model_name = "bigcode/starencoder" -model, tokenizer = train_labeler( - ds, - "content", - base_model_name, - n_labels=2, - training_args=training_args, - num_workers=4, - max_length=512, - push_to_hub=True, -) \ No newline at end of file + + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + data_collator=data_collator, +) + +breakpoint() + +trainer.train() + +if 
push_to_hub: + trainer.push_to_hub() diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 88d79a2..1fa950e 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -89,76 +89,6 @@ def __call__(self, text: str): raise Exception(f"Label not found in text: {output_text}") return label_idx, cost_info - -def train_labeler( - dataset, - text_column, - base_model_name, - n_labels, - training_args, - num_workers=4, - max_length=512, - push_to_hub=True, -): - """ - Trains a labeler model on a labeled dataset. - - Args: - dataset (datasets.Dataset): Dataset to train on - text_column (str): Name of the text column - base_model_name (str): Name of the base model to use - n_labels (int): Number of labels - epochs (int): Number of epochs to train - batch_size (int): Batch size for training - num_workers (int): Number of workers for training - max_length (int): Maximum length of the input - """ - # Load the tokenizer - tokenizer = AutoTokenizer.from_pretrained(base_model_name, max_length=max_length) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - # Load the model - model = AutoModelForSequenceClassification.from_pretrained( - base_model_name, num_labels=n_labels, max_length=max_length - ) - model.config.id2label = {i: i for i in range(n_labels)} - - # Preprocess the dataset - dataset = dataset.map( - lambda x: tokenizer( - x[text_column], padding="max_length", truncation=True, max_length=max_length - ), - batched=True, - num_proc=num_workers, - ) - - # Split the dataset - dataset = dataset.train_test_split(test_size=0.1, seed=42) - - # Get the data collator - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Get the trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=dataset["train"], - eval_dataset=dataset["test"], - data_collator=data_collator, - ) - - # Train the model - trainer.train() - - # Push the model to the hub - if push_to_hub: - trainer.push_to_hub() - - # Return the model - return model, tokenizer - - def filter_dataset( dataset, text_column, labeler_model, labels_to_keep, batch_size=32, num_workers=4 ): From 2e7f54cbd7aa7adf3074b195d5cb50955349b2fc Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 12 Jul 2023 00:22:40 +0000 Subject: [PATCH 16/22] rm --- code_edu | 1 - 1 file changed, 1 deletion(-) delete mode 160000 code_edu diff --git a/code_edu b/code_edu deleted file mode 160000 index e9a28b1..0000000 --- a/code_edu +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e9a28b101f91ed62ee3d6c52db2fe1e2edacfbd9 From 05472af3eaab20a48db3e53fec75649cb2eedf94 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 12 Jul 2023 12:04:34 +0000 Subject: [PATCH 17/22] wandb config and working metrics --- requirements.txt | 3 ++- train_labeler.py | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index d659a60..4976ba5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ fastcore openai transformers python-dotenv -pandas \ No newline at end of file +pandas +wandb \ No newline at end of file diff --git a/train_labeler.py b/train_labeler.py index d97e3a6..ac37f21 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -1,5 +1,8 @@ from datasets import load_dataset from transformers import pipeline, TrainingArguments +import evaluate +import numpy as np +import wandb from transformers import ( AutoModelForSequenceClassification, @@ -41,14 +44,25 @@ data_collator = DataCollatorWithPadding(tokenizer=tokenizer) +metric = 
evaluate.load("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +wandb.login() + +wandb.init(project="phi-2-classifier") + training_args = TrainingArguments( - output_dir="./code_edu", + output_dir="checkpoints", num_train_epochs=3, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, warmup_steps=500, weight_decay=0.01, - logging_dir="./logs", + logging_dir="logs", logging_steps=10, evaluation_strategy="epoch", save_strategy="epoch", @@ -68,10 +82,9 @@ train_dataset=dataset["train"], eval_dataset=dataset["test"], data_collator=data_collator, + compute_metrics=compute_metrics, ) -breakpoint() - trainer.train() if push_to_hub: From 87ec8c26fa22380dee7d78f5e36e4b9ffb1dae3b Mon Sep 17 00:00:00 2001 From: Brian Date: Thu, 13 Jul 2023 00:58:03 +0000 Subject: [PATCH 18/22] eval batch size --- train_labeler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/train_labeler.py b/train_labeler.py index ac37f21..8c6fa3c 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -40,7 +40,9 @@ num_proc=num_workers, ) -dataset = dataset.train_test_split(test_size=0.1, seed=42) +dataset = dataset.train_test_split(test_size=0.05, seed=42) + +eval_dataset = dataset["test"].shuffle(seed=42).select(range(200)) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) @@ -59,7 +61,7 @@ def compute_metrics(eval_pred): output_dir="checkpoints", num_train_epochs=3, per_device_train_batch_size=batch_size, - per_device_eval_batch_size=batch_size, + per_device_eval_batch_size=2, warmup_steps=500, weight_decay=0.01, logging_dir="logs", @@ -73,6 +75,7 @@ def compute_metrics(eval_pred): push_to_hub=True, hub_model_id="roborovski/phi-2-classifier", hub_private_repo=True, + eval_accumulation_steps=2 ) @@ -80,7 +83,7 @@ def compute_metrics(eval_pred): model=model, args=training_args, train_dataset=dataset["train"], - eval_dataset=dataset["test"], + eval_dataset=eval_dataset, data_collator=data_collator, compute_metrics=compute_metrics, ) From c0a30ccd41824b8c1fd349949ac0bba0e9fbc89d Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 16 Jul 2023 01:04:14 +0000 Subject: [PATCH 19/22] generate embeddings --- generate_embeddings.py | 192 +++++++++++++++++++++++++++++++++++++ gpt_labeling.py | 2 - train_labeler.py | 208 +++++++++++++++++++++++++---------------- 3 files changed, 320 insertions(+), 82 deletions(-) create mode 100644 generate_embeddings.py diff --git a/generate_embeddings.py b/generate_embeddings.py new file mode 100644 index 0000000..213cc73 --- /dev/null +++ b/generate_embeddings.py @@ -0,0 +1,192 @@ +from abc import ABC +from datasets import ( + load_dataset, +) +from dotenv import load_dotenv +import torch +from typing import Union, List, Dict + +from train_labeler import EncoderParams + +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + AutoModel, +) + +load_dotenv(".env") + +# https://huggingface.co/bigcode/starencoder/discussions/3 +# https://github.com/bigcode-project/bigcode-encoder/blob/master/embedding_sandbox.ipynb + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/utils.py#L152 +def pooling(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Pools a batch of vector sequences into a batch of vector global representations. + It does so by taking the last vector in the sequence, as indicated by the mask. 
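+    Note: `mask.sum(1) - 1` picks out the final real token only when the 1s in
+    `mask` are contiguous from position 0, i.e. for right-padded batches.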
+ + Args: + x (torch.Tensor): Batch of vector sequences with shape [B, T, F]. + mask (torch.Tensor): Batch of masks with shape [B, T]. + + Returns: + torch.Tensor: Pooled version of the input batch with shape [B, F]. + """ + + eos_idx = mask.sum(1) - 1 + batch_idx = torch.arange(len(eos_idx), device=x.device) + + mu = x[batch_idx, eos_idx, :] + + return mu + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/utils.py#L121 +def pool_and_normalize( + features_sequence: torch.Tensor, + attention_masks: torch.Tensor, + return_norms: bool = False, +) -> Union[torch.Tensor, List[torch.Tensor]]: + """Temporal pooling of sequences of vectors and projection onto the unit sphere. + + Args: + features_sequence (torch.Tensor): Inpute features with shape [B, T, F]. + attention_masks (torch.Tensor): Pooling masks with shape [B, T, F]. + return_norms (bool, optional): Whether to additionally return the norms. Defaults to False. + + Returns: + Union[torch.Tensor, List[torch.Tensor]]: Pooled and normalized vectors with shape [B, F]. + """ + + pooled_embeddings = pooling(features_sequence, attention_masks) + embedding_norms = pooled_embeddings.norm(dim=1) + + normalizing_factor = torch.where( # Only normalize embeddings with norm > 1.0. + embedding_norms > 1.0, embedding_norms, torch.ones_like(embedding_norms) + ) + + pooled_normalized_embeddings = pooled_embeddings / normalizing_factor[:, None] + + if return_norms: + return pooled_normalized_embeddings, embedding_norms + else: + return pooled_normalized_embeddings + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/constants.py + + +def set_device(inputs: Dict[str, torch.Tensor], device: str) -> Dict[str, torch.Tensor]: + output_data = {} + for k, v in inputs.items(): + output_data[k] = v.to(device) + + return output_data + + +def prepare_tokenizer(tokenizer_path): + try: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + except OSError: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_auth_token=True) + + tokenizer.add_special_tokens({"pad_token": EncoderParams.PAD_TOKEN}) + tokenizer.add_special_tokens({"sep_token": EncoderParams.SEPARATOR_TOKEN}) + tokenizer.add_special_tokens({"cls_token": EncoderParams.CLS_TOKEN}) + tokenizer.add_special_tokens({"mask_token": EncoderParams.MASK_TOKEN}) + return tokenizer + + +def truncate_sentences( + sentence_list: List[str], maximum_length: Union[int, float] +) -> List[str]: + truncated_sentences = [] + + for sentence in sentence_list: + truncated_sentences.append(sentence[:maximum_length]) + + return truncated_sentences + + +class StarEncoder(torch.nn.Module): + def __init__(self, device): + super().__init__() + + self.tokenizer = prepare_tokenizer(EncoderParams.base_model_name) + self.encoder = ( + AutoModel.from_pretrained( + EncoderParams.base_model_name, use_auth_token=True + ) + .to(device) + .eval() + ) + self.device = device + self.max_input_len = EncoderParams.max_input_length + self.maximum_token_len = EncoderParams.max_token_length + + def forward(self, input_sentences): + inputs = self.tokenizer( + [ + f"{EncoderParams.CLS_TOKEN}{sentence}{EncoderParams.SEPARATOR_TOKEN}" + for sentence in input_sentences + ], + padding="longest", + max_length=self.maximum_token_len, + truncation=True, + return_tensors="pt", + ) + + outputs = self.encoder(**set_device(inputs, self.device)) + embedding = pool_and_normalize(outputs.hidden_states[-1], inputs.attention_mask) + + return embedding + + def encode(self, input_sentences, batch_size=32, **kwargs): 
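+        """Embed a list of code strings in mini-batches.
+
+        Each string is truncated to `max_input_len` characters, passed through the
+        frozen encoder in eval mode under `torch.no_grad()`, and the pooled,
+        normalized embeddings are collected on CPU and concatenated into a single
+        tensor of shape [len(input_sentences), hidden_size]. Extra kwargs are ignored.
+        """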
+ truncated_input_sentences = truncate_sentences( + input_sentences, self.max_input_len + ) + + n_batches = len(truncated_input_sentences) // batch_size + int( + len(truncated_input_sentences) % batch_size > 0 + ) + + embedding_batch_list = [] + + for i in range(n_batches): + start_idx = i * batch_size + end_idx = min((i + 1) * batch_size, len(truncated_input_sentences)) + + with torch.no_grad(): + embedding_batch_list.append( + self.forward(truncated_input_sentences[start_idx:end_idx]) + .detach() + .cpu() + ) + + input_sentences_embedding = torch.cat(embedding_batch_list) + + return input_sentences_embedding + + +tokenizer = AutoTokenizer.from_pretrained( + EncoderParams.base_model_name, max_length=EncoderParams.max_token_length +) +if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + +dataset = load_dataset("roborovski/phi-1") + +device = torch.device("cuda") +model = StarEncoder(device) + + +def process(x): + content = x["content"] + embedding = model.encode(content) + return {"embedding": embedding} + + +# process(dataset["train"][0]) + +processed_dataset = dataset.map(process, batched=True) +processed_dataset.push_to_hub("roborovski/phi-2-embeddings") diff --git a/gpt_labeling.py b/gpt_labeling.py index f7d55af..1599310 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -24,8 +24,6 @@ labels, secondary_labels=secondary_labels, ) -res = labeler("def create()") -print(res) dataset_chunks = [] buffer_size = 500 diff --git a/train_labeler.py b/train_labeler.py index 8c6fa3c..3e63c15 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from datasets import load_dataset from transformers import pipeline, TrainingArguments import evaluate @@ -12,83 +13,130 @@ ) -dataset = load_dataset("roborovski/phi-1")["train"] -batch_size = 32 -num_workers = 4 -max_length = 512 -push_to_hub = True -n_labels = 3 -text_column = "content" - -id2label = {0: "HIGH_QUALITY", 1: "MEDIUM_QUALITY", 2: "LOW_QUALITY"} -label2id = {"HIGH_QUALITY": 0, "MEDIUM_QUALITY": 1, "LOW_QUALITY": 2} - -base_model_name = "bigcode/starencoder" -tokenizer = AutoTokenizer.from_pretrained(base_model_name, max_length=max_length) -if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - -model = AutoModelForSequenceClassification.from_pretrained( - base_model_name, num_labels=n_labels, max_length=max_length, id2label=id2label, label2id=label2id -) - -dataset = dataset.map( - lambda x: tokenizer( - x[text_column], padding="max_length", truncation=True, max_length=max_length - ), - batched=True, - num_proc=num_workers, -) - -dataset = dataset.train_test_split(test_size=0.05, seed=42) - -eval_dataset = dataset["test"].shuffle(seed=42).select(range(200)) - -data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - -metric = evaluate.load("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -wandb.login() - -wandb.init(project="phi-2-classifier") - -training_args = TrainingArguments( - output_dir="checkpoints", - num_train_epochs=3, - per_device_train_batch_size=batch_size, - per_device_eval_batch_size=2, - warmup_steps=500, - weight_decay=0.01, - logging_dir="logs", - logging_steps=10, - evaluation_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - metric_for_best_model="accuracy", - greater_is_better=True, - seed=42, - push_to_hub=True, - 
hub_model_id="roborovski/phi-2-classifier", - hub_private_repo=True, - eval_accumulation_steps=2 -) - - -trainer = Trainer( - model=model, - args=training_args, - train_dataset=dataset["train"], - eval_dataset=eval_dataset, - data_collator=data_collator, - compute_metrics=compute_metrics, -) - -trainer.train() - -if push_to_hub: - trainer.push_to_hub() +@dataclass +class EncoderParams: + batch_size = 32 + num_workers = 4 + push_to_hub = True + n_labels = 3 + text_column = "content" + labels = ["high quality", "medium quality", "low quality"] + base_model_name = "bigcode/starencoder" + id2label = {0: "HIGH_QUALITY", 1: "MEDIUM_QUALITY", 2: "LOW_QUALITY"} + label2id = {"HIGH_QUALITY": 0, "MEDIUM_QUALITY": 1, "LOW_QUALITY": 2} + MASK_TOKEN = "" + SEPARATOR_TOKEN = "" + PAD_TOKEN = "" + CLS_TOKEN = "" + max_input_length = 10000 + max_token_length = 1024 + + +def train(): + + dataset = load_dataset("roborovski/phi-1")["train"] + + + tokenizer = AutoTokenizer.from_pretrained( + EncoderParams.base_model_name, max_length=EncoderParams.max_token_length + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForSequenceClassification.from_pretrained( + EncoderParams.base_model_name, + num_labels=EncoderParams.n_labels, + max_length=EncoderParams.max_token_length, + id2label=EncoderParams.id2label, + label2id=EncoderParams.label2id, + ) + + + def compute_metrics(eval_pred): + logits, labels = eval_pred + if isinstance(logits, tuple): # Some models return tuples + logits = logits[0] + predictions = np.argmax(logits, axis=-1) + acc = acc_metric.compute(predictions=predictions, references=labels) + precision = precision_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + recall = recall_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + f1 = f1_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + + return {**acc, **precision, **recall, **f1} + + dataset = dataset.map( + lambda x: tokenizer( + x[EncoderParams.text_column], + padding="max_length", + truncation=True, + max_length=EncoderParams.max_input_length, + ), + batched=True, + num_proc=EncoderParams.num_workers, + ) + + dataset = dataset.train_test_split(test_size=0.05, seed=42) + + train_dataset = dataset["train"].shuffle(seed=42) + eval_dataset = dataset["test"].shuffle(seed=42).select(range(200)) + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + acc_metric = evaluate.load("accuracy") + precision_metric = evaluate.load("precision") + recall_metric = evaluate.load("recall") + f1_metric = evaluate.load("f1") + + wandb.login() + + wandb.init(project="phi-2-classifier") + + training_args = TrainingArguments( + output_dir="checkpoints", + num_train_epochs=100, + per_device_train_batch_size=EncoderParams.batch_size, + per_device_eval_batch_size=2, + warmup_steps=500, + weight_decay=0.01, + logging_dir="logs", + logging_steps=50, + eval_steps=5000, + evaluation_strategy="steps", + save_strategy="epoch", + save_steps=5, + seed=42, + push_to_hub=True, + hub_model_id="roborovski/phi-2-classifier", + hub_private_repo=True, + eval_accumulation_steps=1, + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + trainer.train() + + if 
EncoderParams.push_to_hub: + trainer.push_to_hub() + + +if __name__ == "__main__": + train() From a1637fe3451605903b30d6b0bb81c40381c1b76b Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 16 Jul 2023 02:05:57 +0000 Subject: [PATCH 20/22] set batch size --- generate_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_embeddings.py b/generate_embeddings.py index 213cc73..0e814d5 100644 --- a/generate_embeddings.py +++ b/generate_embeddings.py @@ -188,5 +188,5 @@ def process(x): # process(dataset["train"][0]) -processed_dataset = dataset.map(process, batched=True) +processed_dataset = dataset.map(process, batched=True, batch_size=128) processed_dataset.push_to_hub("roborovski/phi-2-embeddings") From 3a6f668869483d19e64cbea0690365f69fb158df Mon Sep 17 00:00:00 2001 From: brian Date: Wed, 19 Jul 2023 02:40:59 +0000 Subject: [PATCH 21/22] llabeling with llama --- .gitignore | 4 +- gpt_labeling.py | 4 +- llama_inference.py | 49 +++++++++++++ llama_labeling.py | 155 +++++++++++++++++++++++++++++++++++++++++ treasure_trove/core.py | 27 ++++++- 5 files changed, 235 insertions(+), 4 deletions(-) create mode 100644 llama_inference.py create mode 100644 llama_labeling.py diff --git a/.gitignore b/.gitignore index b7ff8a1..8d86c7b 100644 --- a/.gitignore +++ b/.gitignore @@ -156,4 +156,6 @@ checklink/cookies.txt checkpoints/ -wandb/* \ No newline at end of file +wandb/* + +*.parquet \ No newline at end of file diff --git a/gpt_labeling.py b/gpt_labeling.py index 1599310..9f5eac1 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -11,7 +11,7 @@ from dotenv import load_dotenv import time -from treasure_trove.core import LLMLabeler, instruction +from treasure_trove.core import ChatGPTLabeler, instruction load_dotenv(".env") labels = ["high quality", "medium quality", "low quality"] @@ -19,7 +19,7 @@ lang = "python" max_chars = 4_096 num_workers = 8 -labeler = LLMLabeler( +labeler = ChatGPTLabeler( instruction, labels, secondary_labels=secondary_labels, diff --git a/llama_inference.py b/llama_inference.py new file mode 100644 index 0000000..27daa08 --- /dev/null +++ b/llama_inference.py @@ -0,0 +1,49 @@ +from transformers import AutoTokenizer +import transformers +import torch + +model = "../llama-7bf-hf" + +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. +* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. 
+ +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + +tokenizer = AutoTokenizer.from_pretrained(model) +pipeline = transformers.pipeline( + "conversational", + model=model, + torch_dtype=torch.float16, + device_map="auto", +) + +sequences = pipeline( + instruction_simple, + do_sample=True, + top_k=10, + num_return_sequences=1, + eos_token_id=tokenizer.eos_token_id, + max_length=200, +) +for seq in sequences: + print(f"Result: {seq['generated_text']}") + diff --git a/llama_labeling.py b/llama_labeling.py new file mode 100644 index 0000000..0cffef9 --- /dev/null +++ b/llama_labeling.py @@ -0,0 +1,155 @@ +from typing import Optional, List + +import fire +import re + +from llama import Llama + + +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. +* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. 
+ +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + +def find_label(text: str, labels: List[str]): + for i, label in enumerate(labels): + pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) + match = re.search(pattern, text) + if bool(match): + return i + return None + + +import os +from pathlib import Path + +from datasets import ( + concatenate_datasets, + load_dataset, + IterableDataset, + Dataset, + ReadInstruction, +) +from dotenv import load_dotenv + +import time + +load_dotenv(".env") +labels = ["high quality", "medium quality", "low quality"] +secondary_labels = ["high", "medium", "low"] +lang = "python" +max_chars = 4_096 +num_workers = 8 +dataset_chunks = [] + +buffer_size = 500 +num_chunks = 100 + +print("Loading dataset..") +print("Loaded dataset.") + +max_failures = 5 +failures = 0 + +max_gen_len = 512 +max_seq_len = 1024 +temperature = 0.1 +top_p = 0.2 +max_batch_size = 4 + + +ckpt_dir = "../llama/7Bf" +tokenizer_path = "../llama/tokenizer.model" + +generator = Llama.build( + ckpt_dir=ckpt_dir, + tokenizer_path=tokenizer_path, + max_seq_len=max_seq_len, + max_batch_size=max_batch_size, +) + + +def process(x): + total_cost = 0 + label_idx = 0 + dialogs = [] + for i in range(len(x["content"])): + code_sample = x["content"][i][:max_gen_len] + dialogs.append( + [ + {"role": "system", "content": instruction_simple}, + {"role": "user", "content": code_sample}, + ] + ) + results = generator.chat_completion( + dialogs, # type: ignore + max_gen_len=max_gen_len, + temperature=temperature, + top_p=top_p, + ) + batch_labels = [] + for i in range(len(dialogs)): + completion_text = results[i]["generation"]["content"] + label = find_label(completion_text, labels) + batch_labels.append(label) + return {"label": batch_labels} + + +processed_chunk_datasets = [] + +first_save_idx = 8000 + +for i in range(num_chunks): + split = ReadInstruction( + "train", from_=i * buffer_size, to=(i + 1) * buffer_size, unit="abs" + ) + # if i < first_save_idx // buffer_size: + # print(f"skipping chunk {i}: {split}") + # continue + print(f"processing chunk {i}: {split}") + subset = load_dataset( + "parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"} + ) + + # Label the subset + subset = subset.map(process, batched=True, batch_size=max_batch_size, num_proc=1) + + processed_chunk_datasets.append(subset) + + if i > first_save_idx // buffer_size: + all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) + try: + all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet(os.path.join(ckpt_dir, f"processed_{i}")) + except Exception as e: + print(e) + + # print number of each class + print( + f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" + ) + print( + f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" + ) + print( + f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" + ) diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 1fa950e..372a8d9 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -36,8 +36,33 @@ {0} """ +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. 
+* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. + +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + + -class LLMLabeler: +class ChatGPTLabeler: def __init__( self, instruction: str, From 4ea38843cf3dd8b5cdeafc3674ad52bac03948e4 Mon Sep 17 00:00:00 2001 From: brian Date: Wed, 19 Jul 2023 12:25:58 +0000 Subject: [PATCH 22/22] log sample --- requirements.txt | 3 ++- train_labeler.py | 24 +++++++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4976ba5..89399b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ openai transformers python-dotenv pandas -wandb \ No newline at end of file +wandb +huggingface_hub \ No newline at end of file diff --git a/train_labeler.py b/train_labeler.py index 3e63c15..860c003 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -4,6 +4,9 @@ import evaluate import numpy as np import wandb +from dotenv import load_dotenv +from huggingface_hub import login +import os from transformers import ( AutoModelForSequenceClassification, @@ -12,11 +15,14 @@ Trainer, ) +load_dotenv(".env") + +login(token=os.environ["HF_KEY"], add_to_git_credential=True) @dataclass class EncoderParams: batch_size = 32 - num_workers = 4 + num_workers = 16 push_to_hub = True n_labels = 3 text_column = "content" @@ -28,14 +34,13 @@ class EncoderParams: SEPARATOR_TOKEN = "" PAD_TOKEN = "" CLS_TOKEN = "" - max_input_length = 10000 + max_input_length = 1024 max_token_length = 1024 def train(): - dataset = load_dataset("roborovski/phi-1")["train"] - + dataset = load_dataset("roborovski/phi-2-labeled")["train"] tokenizer = AutoTokenizer.from_pretrained( EncoderParams.base_model_name, max_length=EncoderParams.max_token_length @@ -51,10 +56,11 @@ def train(): label2id=EncoderParams.label2id, ) + sample_table_data = [] def compute_metrics(eval_pred): logits, labels = eval_pred - if isinstance(logits, tuple): # Some models return tuples + if isinstance(logits, tuple): logits = logits[0] predictions = np.argmax(logits, axis=-1) acc = acc_metric.compute(predictions=predictions, references=labels) @@ -74,6 +80,14 @@ def compute_metrics(eval_pred): average="macro" if len(labels) > 2 else "binary", ) + decoded_sample = tokenizer.decode(predictions) + sample_table_data.append([decoded_sample, labels[0]]) + sample_table = wandb.Table( + columns=["sample", "label"], + data=sample_table_data, + ) + wandb.log({"sample": sample_table}) + return {**acc, **precision, **recall, **f1} dataset = dataset.map(