From 1735cba249f99c743db96277f01846d6be6fab68 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 03:19:45 +0000 Subject: [PATCH 01/22] yolo refactor --- .github/workflows/deploy.yaml | 14 - .github/workflows/test.yaml | 7 - LICENSE | 201 ------- MANIFEST.in | 5 - __init__.py | 0 examples/textbooks_A2YN/gpt_labeling.py | 49 -- gpt_labeling.py | 40 ++ nbs/00_core.ipynb | 513 ------------------ nbs/02_tutorial.ipynb | 104 ---- nbs/_quarto.yml | 20 - nbs/index.ipynb | 96 ---- nbs/nbdev.yml | 9 - nbs/styles.css | 37 -- requirements.txt | 8 + settings.ini | 43 -- .../train_labeler.py => train_labeler.py | 0 treasure_trove/__init__.py | 1 - treasure_trove/_modidx.py | 11 - treasure_trove/core.py | 146 ++++- 19 files changed, 167 insertions(+), 1137 deletions(-) delete mode 100644 .github/workflows/deploy.yaml delete mode 100644 .github/workflows/test.yaml delete mode 100644 LICENSE delete mode 100644 MANIFEST.in create mode 100644 __init__.py delete mode 100644 examples/textbooks_A2YN/gpt_labeling.py create mode 100644 gpt_labeling.py delete mode 100644 nbs/00_core.ipynb delete mode 100644 nbs/02_tutorial.ipynb delete mode 100644 nbs/_quarto.yml delete mode 100644 nbs/index.ipynb delete mode 100644 nbs/nbdev.yml delete mode 100644 nbs/styles.css create mode 100644 requirements.txt delete mode 100644 settings.ini rename examples/textbooks_A2YN/train_labeler.py => train_labeler.py (100%) delete mode 100644 treasure_trove/_modidx.py diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml deleted file mode 100644 index 29bfc57..0000000 --- a/.github/workflows/deploy.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: Deploy to GitHub Pages - -permissions: - contents: write - pages: write - -on: - push: - branches: [ "main", "master" ] - workflow_dispatch: -jobs: - deploy: - runs-on: ubuntu-latest - steps: [uses: fastai/workflows/quarto-ghp@master] diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml deleted file mode 100644 index 5608592..0000000 --- a/.github/workflows/test.yaml +++ /dev/null @@ -1,7 +0,0 @@ -name: CI -on: [workflow_dispatch, pull_request, push] - -jobs: - test: - runs-on: ubuntu-latest - steps: [uses: fastai/workflows/nbdev-ci@master] diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 3b106e8..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2022, fastai - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 5c0e7ce..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,5 +0,0 @@ -include settings.ini -include LICENSE -include CONTRIBUTING.md -include README.md -recursive-exclude * __pycache__ diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/textbooks_A2YN/gpt_labeling.py b/examples/textbooks_A2YN/gpt_labeling.py deleted file mode 100644 index b19c72e..0000000 --- a/examples/textbooks_A2YN/gpt_labeling.py +++ /dev/null @@ -1,49 +0,0 @@ -import os - -from datasets import concatenate_datasets, load_dataset -from squeakily.helpers import LLMLabeler -from treasure_trove.core import label_dataset - -instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. 
-High quality code has the following: -* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. -* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. -* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. -* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. -Medium quality code has the following: -* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. -* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. -* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. -* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. -Low quality code has the following: -* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. -* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. -* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. -* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. 
- -Output nothing other than one of the following labels: -""" - -labels = ["high quality", "medium quality", "low quality"] -api_key = os.environ["OPENAI_KEY"] -labeler = LLMLabeler(instruction, labels, model_name="gpt-4", api_key=api_key) # gpt-3.5-turbo - -languages = ["python", "go", "java", "javascript", "c", "c++"] -subsets = [] -for lang in languages: - ds = load_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"] - sample = 50 / len(ds) - subset = label_dataset(ds, "content", labeler, labels, sample=sample, num_workers=8) - new_column = [lang] * len(subset) - subset = subset.add_column("language", new_column) - subsets.append(subset) - -labeled_ds = concatenate_datasets(subsets) - -# upload to huggingface -labeled_ds.push_to_hub("CarperAI/textbooks_A2YN_labeled_six_languages", private=True) - -# print number of each class -print(f"Number of {labels[0]}: {len(labeled_ds.filter(lambda x: x['label'] == 0))}") -print(f"Number of {labels[1]}: {len(labeled_ds.filter(lambda x: x['label'] == 1))}") -print(f"Number of {labels[2]}: {len(labeled_ds.filter(lambda x: x['label'] == 2))}") diff --git a/gpt_labeling.py b/gpt_labeling.py new file mode 100644 index 0000000..1340a43 --- /dev/null +++ b/gpt_labeling.py @@ -0,0 +1,40 @@ +import os + +from pydantic import BaseModel, Field + +from datasets import concatenate_datasets, load_dataset +from typing import List +from langchain.output_parsers import PydanticOutputParser +from langchain.chat_models import AzureChatOpenAI, ChatOpenAI +from langchain.prompts import PromptTemplate +from langchain.prompts.chat import ( + ChatPromptTemplate, + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, +) +from dotenv import load_dotenv +import time + +from treasure_trove.core import label_dataset + +load_dotenv(".env") +labels = ["high quality", "medium quality", "low quality"] +languages = ["python", "javascript"] +subsets = [] +for lang in languages: + ds = load_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"] + sample = 50 / len(ds) + subset = label_dataset(ds, "content", labels, sample=sample, num_workers=1) + new_column = [lang] * len(subset) + subset = subset.add_column("language", new_column) + subsets.append(subset) + +labeled_ds = concatenate_datasets(subsets) + +# upload to huggingface +labeled_ds.push_to_hub("CarperAI/textbooks_A2YN_labeled_six_languages", private=True) + +# print number of each class +print(f"Number of {labels[0]}: {len(labeled_ds.filter(lambda x: x['label'] == 0))}") +print(f"Number of {labels[1]}: {len(labeled_ds.filter(lambda x: x['label'] == 1))}") +print(f"Number of {labels[2]}: {len(labeled_ds.filter(lambda x: x['label'] == 2))}") diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb deleted file mode 100644 index 7e7aea7..0000000 --- a/nbs/00_core.ipynb +++ /dev/null @@ -1,513 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# core\n", - "\n", - "> Fill in a module description here" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | default_exp core" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | export\n", - "import evaluate\n", - "import time\n", - "\n", - "import numpy as np\n", - "\n", - "from transformers import (\n", - " AutoModelForSequenceClassification,\n", - " AutoTokenizer,\n", - " DataCollatorWithPadding,\n", - " Trainer,\n", - ")" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | hide\n", - "from nbdev.showdoc import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | export\n", - "def classify(x, labels, llm_labeler, max_failures=5, default_label=0):\n", - " failures = 0\n", - " while failures < max_failures:\n", - " try:\n", - " label = labels.index(llm_labeler(x)[0])\n", - " time.sleep(1)\n", - " return label\n", - " except Exception as e:\n", - " failures += 1\n", - " print(e)\n", - " time.sleep(1)\n", - " pass\n", - " if failures == max_failures:\n", - " return default_label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | export\n", - "def label_dataset(\n", - " dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4, max_chars=4_096\n", - "):\n", - " \"\"\"\n", - " Filters a dataset using a labeler model.\n", - "\n", - " Args:\n", - " dataset (datasets.Dataset): Dataset to filter\n", - " text_column (str): Name of the column containing the text to classify\n", - " labeler_model (Any): Model to use for labeling\n", - " labels (List[str]): List of labels\n", - " sample (float): The fraction of the dataset to label and use for filtering\n", - " batch_size (int): Batch size for labeling\n", - " num_workers (int): Number of workers for labeling\n", - " max_chars (int): Maximum number of characters to truncate the text to before labeling (reduces rate limiting errors)\n", - " \"\"\"\n", - "\n", - " # Get a subset of the dataset\n", - " subset = dataset.shuffle(seed=115).select(range(int(len(dataset) * sample)))\n", - "\n", - " # Label the subset\n", - " subset = subset.map(\n", - " lambda x: {\"label\": classify(x[text_column][:max_chars], labels, labeler_model)},\n", - " batched=False,\n", - " num_proc=num_workers,\n", - " )\n", - "\n", - " return subset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using custom data configuration bigcode--the-stack-smol-8f8055c3a4e4b4e3\n", - "Found cached dataset json (/home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-8f8055c3a4e4b4e3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cfb95116fc20477bb047848972658d69", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 Find the treasure in your trove of data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "from datasets import load_dataset\n", - "from squeakily.helpers import LLMLabeler\n", - "from transformers import pipeline, TrainingArguments\n", - "from treasure_trove.core import filter_dataset, label_dataset, train_labeler\n", - "\n", - "instruction = \"\"\"Please label the following code as either educational or non-educational.\n", - "Educational code is code that is well written, follows best practices, has documentation such that it might be found in a textbook.\n", - "Non-educational code is code that is poorly written, lacks documentation, contain bugs, or is not idiomatic.\n", - "Labels:\n", - "\"\"\"\n", - "labels = [\"educational\", \"non-educational\"]\n", - "api_key = \"\"\n", - "labeler = LLMLabeler(instruction, labels, model_name=\"gpt-4\", 
api_key=api_key)\n", - "\n", - "ds = load_dataset(\"bigcode/the-stack-smol\", data_dir=\"data/python\")[\"train\"]\n", - "\n", - "# Get the training arguments\n", - "batch_size=4,\n", - "training_args = TrainingArguments(\n", - " output_dir=\"./code_edu\",\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=batch_size,\n", - " per_device_eval_batch_size=batch_size,\n", - " warmup_steps=500,\n", - " weight_decay=0.01,\n", - " logging_dir=\"./logs\",\n", - " logging_steps=10,\n", - " evaluation_strategy=\"epoch\",\n", - " save_strategy=\"epoch\",\n", - " load_best_model_at_end=True,\n", - " metric_for_best_model=\"accuracy\",\n", - " greater_is_better=True,\n", - " seed=42,\n", - " push_to_hub=True,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| eval: false\n", - "subset = label_dataset(ds, \"content\", labeler, labels, sample=0.001)\n", - "base_model_name = \"bigcode/starencoder\"\n", - "model, tokenizer = train_labeler(\n", - " subset,\n", - " \"content\",\n", - " base_model_name,\n", - " n_labels=len(labels),\n", - " training_args=training_args,\n", - " num_workers=4,\n", - " max_length=512,\n", - " push_to_hub=True,\n", - ")\n", - "pipe = pipeline(\n", - " \"text-classification\", model=model, tokenizer=tokenizer, device=model.device\n", - ")\n", - "filtered_ds = filter_dataset(ds, \"content\", model, labels.index(\"educational\"))\n", - "filtered_ds.push_to_hub(\"ncoop57/code_edu\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml deleted file mode 100644 index 0a6dfcb..0000000 --- a/nbs/_quarto.yml +++ /dev/null @@ -1,20 +0,0 @@ -project: - type: website - -format: - html: - theme: cosmo - css: styles.css - toc: true - -website: - twitter-card: true - open-graph: true - repo-actions: [issue] - navbar: - background: primary - search: true - sidebar: - style: floating - -metadata-files: [nbdev.yml, sidebar.yml] \ No newline at end of file diff --git a/nbs/index.ipynb b/nbs/index.ipynb deleted file mode 100644 index 5e9fc26..0000000 --- a/nbs/index.ipynb +++ /dev/null @@ -1,96 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# | hide\n", - "from treasure_trove.core import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# treasure_trove\n", - "\n", - "> Find the treasure in your trove of data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This file will become your README and also the index of your documentation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Install" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```sh\n", - "pip install treasure_trove\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## How to use" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fill me in please! 
Don't forget code examples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "1 + 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "python3", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/nbs/nbdev.yml b/nbs/nbdev.yml deleted file mode 100644 index 8264f3b..0000000 --- a/nbs/nbdev.yml +++ /dev/null @@ -1,9 +0,0 @@ -project: - output-dir: _docs - -website: - title: "treasure_trove" - site-url: "https://CarperAI.github.io/treasure_trove" - description: "Find the treasure in your trove of data" - repo-branch: main - repo-url: "https://github.com/CarperAI/treasure_trove" diff --git a/nbs/styles.css b/nbs/styles.css deleted file mode 100644 index 66ccc49..0000000 --- a/nbs/styles.css +++ /dev/null @@ -1,37 +0,0 @@ -.cell { - margin-bottom: 1rem; -} - -.cell > .sourceCode { - margin-bottom: 0; -} - -.cell-output > pre { - margin-bottom: 0; -} - -.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { - margin-left: 0.8rem; - margin-top: 0; - background: none; - border-left: 2px solid lightsalmon; - border-top-left-radius: 0; - border-top-right-radius: 0; -} - -.cell-output > .sourceCode { - border: none; -} - -.cell-output > .sourceCode { - background: none; - margin-top: 0; -} - -div.description { - padding-left: 2px; - padding-top: 5px; - font-style: italic; - font-size: 135%; - opacity: 70%; -} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d730a99 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +accelerate +datasets +evaluate +fastcore +langchain +openai +transformers +python-dotenv \ No newline at end of file diff --git a/settings.ini b/settings.ini deleted file mode 100644 index 3e8da59..0000000 --- a/settings.ini +++ /dev/null @@ -1,43 +0,0 @@ -[DEFAULT] -# All sections below are required unless otherwise specified. -# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples. 
- -### Python library ### -repo = treasure_trove -lib_name = %(repo)s -version = 0.0.1 -min_python = 3.7 -license = apache2 -black_formatting = False - -### nbdev ### -doc_path = _docs -lib_path = treasure_trove -nbs_path = nbs -recursive = True -tst_flags = notest -put_version_in_init = True - -### Docs ### -branch = main -custom_sidebar = False -doc_host = https://%(user)s.github.io -doc_baseurl = /%(repo)s -git_url = https://github.com/%(user)s/%(repo)s -title = %(lib_name)s - -### PyPI ### -audience = Developers -author = ncoop57 -author_email = nacooper01@email.wm.edu -copyright = 2023 onwards, %(author)s -description = Find the treasure in your trove of data -keywords = nbdev jupyter notebook python -language = English -status = 3 -user = CarperAI - -### Optional ### -requirements = accelerate datasets evaluate fastcore langchain openai squeakily transformers -dev_requirements = black[jupyter] ipykernel -# console_scripts = \ No newline at end of file diff --git a/examples/textbooks_A2YN/train_labeler.py b/train_labeler.py similarity index 100% rename from examples/textbooks_A2YN/train_labeler.py rename to train_labeler.py diff --git a/treasure_trove/__init__.py b/treasure_trove/__init__.py index f102a9c..e69de29 100644 --- a/treasure_trove/__init__.py +++ b/treasure_trove/__init__.py @@ -1 +0,0 @@ -__version__ = "0.0.1" diff --git a/treasure_trove/_modidx.py b/treasure_trove/_modidx.py deleted file mode 100644 index 79d02e9..0000000 --- a/treasure_trove/_modidx.py +++ /dev/null @@ -1,11 +0,0 @@ -# Autogenerated by nbdev - -d = { 'settings': { 'branch': 'main', - 'doc_baseurl': '/treasure_trove', - 'doc_host': 'https://CarperAI.github.io', - 'git_url': 'https://github.com/CarperAI/treasure_trove', - 'lib_path': 'treasure_trove'}, - 'syms': { 'treasure_trove.core': { 'treasure_trove.core.classify': ('core.html#classify', 'treasure_trove/core.py'), - 'treasure_trove.core.filter_dataset': ('core.html#filter_dataset', 'treasure_trove/core.py'), - 'treasure_trove.core.label_dataset': ('core.html#label_dataset', 'treasure_trove/core.py'), - 'treasure_trove.core.train_labeler': ('core.html#train_labeler', 'treasure_trove/core.py')}}} diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 0fc06ac..fe92666 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,11 +1,5 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb. - -# %% auto 0 -__all__ = ['classify', 'label_dataset', 'train_labeler', 'filter_dataset'] - -# %% ../nbs/00_core.ipynb 2 -import evaluate import time +import os import numpy as np @@ -15,15 +9,116 @@ DataCollatorWithPadding, Trainer, ) +from langchain.prompts.chat import ( + ChatPromptTemplate, + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, +) +from dotenv import load_dotenv +import time + + +from pydantic import BaseModel, Field + +from datasets import concatenate_datasets, load_dataset +from typing import List +from langchain.output_parsers import PydanticOutputParser +from langchain.chat_models import AzureChatOpenAI, ChatOpenAI +from langchain.prompts import PromptTemplate + +class LLMLabelerParser(BaseModel): + labels: List = Field( + ..., title="Labels", description="Labels that the LLM classifies the text as" + ) + + +class LLMLabeler: + def __init__( + self, + instruction: str, + labels: List, + model_name: str = "gpt-3.5-turbo", + api_key: str = None, + model_type: str = "openai", + ): + self.instruction = instruction + self.labels = labels + # Set up a parser + inject instructions into the prompt template. 
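+        # The parser's format instructions are filled into the system prompt
+        # below via the {format_instructions} partial variable; the human
+        # message carries the text to classify.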
+ self.parser = PydanticOutputParser(pydantic_object=LLMLabelerParser) + prompt = PromptTemplate( + template="{instruction}\n{labels}\n{format_instructions}\n", + input_variables=["instruction", "labels"], + partial_variables={ + "format_instructions": self.parser.get_format_instructions() + }, + ) + system_message_prompt = SystemMessagePromptTemplate(prompt=prompt) + human_template = "{text}" + human_message_prompt = HumanMessagePromptTemplate.from_template(human_template) + self.chat_template = ChatPromptTemplate.from_messages( + [system_message_prompt, human_message_prompt] + ) + + if model_type == "azure": + raise NotImplementedError("Azure models are not supported yet") + elif model_type == "openai": + self.model = ChatOpenAI( + openai_api_key=api_key, model_name=model_name, temperature=0 + ) + else: + raise ValueError(f"Model type {model_type} is not supported") + + def __call__(self, text: str): + messages = self.chat_template.format_prompt( + instruction=self.instruction, labels=self.labels, text=text + ).to_messages() + output = self.model(messages) + print('model output', output.content) + if output.content in self.labels: + return [output] + predicted_labels = self.parser.parse(output.content) + print('pred labels', predicted_labels) + # check if all the predicted tags are in the list of tags + assert all( + [label in self.labels for label in predicted_labels.labels] + ), f"Predicted labels {predicted_labels.labels} are not in the list of tags {self.labels}" + return predicted_labels.labels + -# %% ../nbs/00_core.ipynb 4 -def classify(x, labels, llm_labeler, max_failures=5, default_label=0): +instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. +* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. +* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. +* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. +* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. +* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. +* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. +* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. 
+* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. +* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. + +Output nothing other than one of the following labels: +""" + + +def classify(x, labels, max_failures=5, default_label=0): failures = 0 + api_key = os.environ["OPENAI_KEY"] + labeler = LLMLabeler( + instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key + ) + while failures < max_failures: try: - label = labels.index(llm_labeler(x)[0]) - time.sleep(1) - return label + label = labeler(x)[0] + label_idx = labels.index(label) + print(label, label_idx) + return label_idx except Exception as e: failures += 1 print(e) @@ -32,9 +127,14 @@ def classify(x, labels, llm_labeler, max_failures=5, default_label=0): if failures == max_failures: return default_label -# %% ../nbs/00_core.ipynb 5 + def label_dataset( - dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4, max_chars=4_096 + dataset, + text_column, + labels, + sample=0.1, + num_workers=4, + max_chars=4_096, ): """ Filters a dataset using a labeler model. @@ -55,14 +155,16 @@ def label_dataset( # Label the subset subset = subset.map( - lambda x: {"label": classify(x[text_column][:max_chars], labels, labeler_model)}, + lambda x: { + "label": classify(x[text_column][:max_chars], labels) + }, batched=False, num_proc=num_workers, ) return subset -# %% ../nbs/00_core.ipynb 7 + def train_labeler( dataset, text_column, @@ -112,15 +214,6 @@ def train_labeler( # Get the data collator data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - def compute_metrics(eval_preds): - metric = evaluate.load("glue", "mrpc") - logits, labels = eval_preds - if isinstance(logits, tuple): # Some models return tuples - logits = logits[0] - print(logits.shape, labels) - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - # Get the trainer trainer = Trainer( model=model, @@ -128,7 +221,6 @@ def compute_metrics(eval_preds): train_dataset=dataset["train"], eval_dataset=dataset["test"], data_collator=data_collator, - compute_metrics=compute_metrics, ) # Train the model @@ -141,7 +233,7 @@ def compute_metrics(eval_preds): # Return the model return model, tokenizer -# %% ../nbs/00_core.ipynb 9 + def filter_dataset( dataset, text_column, labeler_model, labels_to_keep, batch_size=32, num_workers=4 ): From 59251f957e674656f6582708cb42f216a8286bee Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 16:21:40 +0000 Subject: [PATCH 02/22] streaming pile subset --- .gitignore | 2 ++ gpt_labeling.py | 69 ++++++++++++++++++++++------------------- requirements.txt | 3 +- treasure_trove/core.py | 70 +++++++++++++----------------------------- view_dataset.py | 10 ++++++ 5 files changed, 73 insertions(+), 81 deletions(-) create mode 100644 view_dataset.py diff --git a/.gitignore b/.gitignore index 900add7..7b78dfe 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,5 @@ checklink/cookies.txt # Quarto .quarto + +checkpoints/ \ No newline at end of file diff --git a/gpt_labeling.py b/gpt_labeling.py index 1340a43..fc0e86c 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -1,40 +1,45 @@ import os - -from pydantic import BaseModel, Field +from pathlib import Path from datasets import concatenate_datasets, load_dataset -from typing import List -from langchain.output_parsers import PydanticOutputParser -from 
langchain.chat_models import AzureChatOpenAI, ChatOpenAI -from langchain.prompts import PromptTemplate -from langchain.prompts.chat import ( - ChatPromptTemplate, - SystemMessagePromptTemplate, - HumanMessagePromptTemplate, -) from dotenv import load_dotenv -import time -from treasure_trove.core import label_dataset +from treasure_trove.core import classify load_dotenv(".env") labels = ["high quality", "medium quality", "low quality"] -languages = ["python", "javascript"] -subsets = [] -for lang in languages: - ds = load_dataset("bigcode/the-stack-smol", data_dir=f"data/{lang}")["train"] - sample = 50 / len(ds) - subset = label_dataset(ds, "content", labels, sample=sample, num_workers=1) - new_column = [lang] * len(subset) - subset = subset.add_column("language", new_column) - subsets.append(subset) - -labeled_ds = concatenate_datasets(subsets) - -# upload to huggingface -labeled_ds.push_to_hub("CarperAI/textbooks_A2YN_labeled_six_languages", private=True) - -# print number of each class -print(f"Number of {labels[0]}: {len(labeled_ds.filter(lambda x: x['label'] == 0))}") -print(f"Number of {labels[1]}: {len(labeled_ds.filter(lambda x: x['label'] == 1))}") -print(f"Number of {labels[2]}: {len(labeled_ds.filter(lambda x: x['label'] == 2))}") +lang = "python" +processed_subsets = [] +max_chars = 4_096 +num_workers = 8 +epochs = 2 +buffer_size = 1000 +dataset = load_dataset( + "bigcode/the-stack-dedup", data_dir=f"data/{lang}", streaming=True +)["train"] +subset = dataset.shuffle(seed=115, buffer_size=buffer_size) + +for epoch in range(epochs): + subset.set_epoch(epoch) + + procesed = subset.map( + lambda x: {"label": classify(x["content"][:max_chars], labels)}, + batched=False, + ) + + lang_column = [lang] * buffer_size + procesed = procesed.add_column("language", lang_column) + processed_subsets.append(procesed) + + processed_ds = concatenate_datasets(processed_subsets) + + # upload to huggingface + ckpt_dir = "./checkpoints" + Path(ckpt_dir).mkdir(exist_ok=True) + processed_ds.save_to_disk(ckpt_dir + "/latest") + processed_ds.push_to_hub("roborovski/phi-1", private=True) + + # print number of each class + print(f"Number of {labels[0]}: {len(processed_ds.filter(lambda x: x['label'] == 0))}") + print(f"Number of {labels[1]}: {len(processed_ds.filter(lambda x: x['label'] == 1))}") + print(f"Number of {labels[2]}: {len(processed_ds.filter(lambda x: x['label'] == 2))}") diff --git a/requirements.txt b/requirements.txt index d730a99..73081f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ fastcore langchain openai transformers -python-dotenv \ No newline at end of file +python-dotenv +pandas \ No newline at end of file diff --git a/treasure_trove/core.py b/treasure_trove/core.py index fe92666..adb8058 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,5 +1,6 @@ import time import os +import re import numpy as np @@ -26,6 +27,7 @@ from langchain.chat_models import AzureChatOpenAI, ChatOpenAI from langchain.prompts import PromptTemplate + class LLMLabelerParser(BaseModel): labels: List = Field( ..., title="Labels", description="Labels that the LLM classifies the text as" @@ -68,21 +70,29 @@ def __init__( else: raise ValueError(f"Model type {model_type} is not supported") + def parse(self, text: str): + for label in self.labels: + match = re.search( + r"\{.*\}", text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL + ) + match = re.search(label, text) + if bool(match): + return label + return None + def __call__(self, text: str): messages = 
self.chat_template.format_prompt( instruction=self.instruction, labels=self.labels, text=text ).to_messages() output = self.model(messages) - print('model output', output.content) - if output.content in self.labels: - return [output] - predicted_labels = self.parser.parse(output.content) - print('pred labels', predicted_labels) - # check if all the predicted tags are in the list of tags - assert all( - [label in self.labels for label in predicted_labels.labels] - ), f"Predicted labels {predicted_labels.labels} are not in the list of tags {self.labels}" - return predicted_labels.labels + print("model output", output.content) + print(output) + label = self.parse(output.content) + if not label: + print("label not found!") + raise Exception("Label not found") + print("get label", label) + return label instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. @@ -115,9 +125,10 @@ def classify(x, labels, max_failures=5, default_label=0): while failures < max_failures: try: - label = labeler(x)[0] + label = labeler(x) label_idx = labels.index(label) print(label, label_idx) + time.sleep(1) return label_idx except Exception as e: failures += 1 @@ -128,43 +139,6 @@ def classify(x, labels, max_failures=5, default_label=0): return default_label -def label_dataset( - dataset, - text_column, - labels, - sample=0.1, - num_workers=4, - max_chars=4_096, -): - """ - Filters a dataset using a labeler model. - - Args: - dataset (datasets.Dataset): Dataset to filter - text_column (str): Name of the column containing the text to classify - labeler_model (Any): Model to use for labeling - labels (List[str]): List of labels - sample (float): The fraction of the dataset to label and use for filtering - batch_size (int): Batch size for labeling - num_workers (int): Number of workers for labeling - max_chars (int): Maximum number of characters to truncate the text to before labeling (reduces rate limiting errors) - """ - - # Get a subset of the dataset - subset = dataset.shuffle(seed=115).select(range(int(len(dataset) * sample))) - - # Label the subset - subset = subset.map( - lambda x: { - "label": classify(x[text_column][:max_chars], labels) - }, - batched=False, - num_proc=num_workers, - ) - - return subset - - def train_labeler( dataset, text_column, diff --git a/view_dataset.py b/view_dataset.py new file mode 100644 index 0000000..864889f --- /dev/null +++ b/view_dataset.py @@ -0,0 +1,10 @@ +import os +from pathlib import Path +from collections import Counter + +from datasets import load_dataset + +ds = load_dataset("roborovski/phi-1")["train"] +print(ds) +print(Counter(ds['label'])) +print(Counter(ds['language'])) From 745bba67d754128d4ab80872b86031539ef6a46a Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 19:32:32 +0000 Subject: [PATCH 03/22] wip --- gpt_labeling.py | 84 +++++++++++++++++++++++++++++++----------- treasure_trove/core.py | 50 +++++++++---------------- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index fc0e86c..f566a7c 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -1,45 +1,87 @@ import os from pathlib import Path -from datasets import concatenate_datasets, load_dataset +from datasets import concatenate_datasets, load_dataset, IterableDataset, Dataset from dotenv import load_dotenv -from treasure_trove.core import classify +import time +from treasure_trove.core import LLMLabeler, instruction load_dotenv(".env") +api_key = os.environ["OPENAI_KEY"] 
labels = ["high quality", "medium quality", "low quality"] lang = "python" processed_subsets = [] max_chars = 4_096 num_workers = 8 -epochs = 2 -buffer_size = 1000 + +buffer_size = 1_000 +chunk_size = 50 + +print("Loading dataset..") dataset = load_dataset( - "bigcode/the-stack-dedup", data_dir=f"data/{lang}", streaming=True + "bigcode/the-stack-dedup", + data_dir=f"data/{lang}", + streaming=True, )["train"] +print("Loaded dataset.") + subset = dataset.shuffle(seed=115, buffer_size=buffer_size) -for epoch in range(epochs): - subset.set_epoch(epoch) +chunks_to_process = buffer_size // chunk_size - procesed = subset.map( - lambda x: {"label": classify(x["content"][:max_chars], labels)}, - batched=False, - ) +total_cost = 0 +max_failures = 5 +failures = 0 +labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) + +for chunk in range(chunks_to_process): + print(f"Chunk {chunk} / {chunks_to_process} starting...") - lang_column = [lang] * buffer_size - procesed = procesed.add_column("language", lang_column) - processed_subsets.append(procesed) + processed_rows = [] + subset.set_epoch(chunk) - processed_ds = concatenate_datasets(processed_subsets) + for i, x in enumerate(subset): + failures = 0 + label_idx, cost_info = 0, {} + while failures < max_failures: + try: + label, cost_info = labeler(x["content"][:max_chars]) + label_idx = labels.index(label) + print(label, label_idx) + time.sleep(1) + break + except Exception as e: + failures += 1 + print(e) + time.sleep(1) + if failures != max_failures: + total_cost += cost_info["total_cost"] + print( + f"classified {i}: {label} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" + ) + processed_rows.append({**x, "label": label, "language": lang}) + else: + print(f"Max failures hit on idx {i}, continuing.") - # upload to huggingface + subset_ds = Dataset.from_list(processed_rows) + processed_subsets.append(subset_ds) + + # Save all processed data + all_datasets: Dataset = concatenate_datasets(processed_subsets) ckpt_dir = "./checkpoints" Path(ckpt_dir).mkdir(exist_ok=True) - processed_ds.save_to_disk(ckpt_dir + "/latest") - processed_ds.push_to_hub("roborovski/phi-1", private=True) + all_datasets.save_to_disk(ckpt_dir + "/latest") + all_datasets.push_to_hub("roborovski/phi-1", private=True) # print number of each class - print(f"Number of {labels[0]}: {len(processed_ds.filter(lambda x: x['label'] == 0))}") - print(f"Number of {labels[1]}: {len(processed_ds.filter(lambda x: x['label'] == 1))}") - print(f"Number of {labels[2]}: {len(processed_ds.filter(lambda x: x['label'] == 2))}") + print( + f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" + ) + print( + f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" + ) + print( + f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" + ) + print(f"Cost so far: {total_cost}") diff --git a/treasure_trove/core.py b/treasure_trove/core.py index adb8058..6ecc02d 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,8 +1,6 @@ import time -import os import re - -import numpy as np +from dotenv import load_dotenv from transformers import ( AutoModelForSequenceClassification, @@ -17,7 +15,7 @@ ) from dotenv import load_dotenv import time - +from langchain.callbacks import get_openai_callback, OpenAICallbackHandler from pydantic import BaseModel, Field @@ -65,7 +63,10 @@ def __init__( raise NotImplementedError("Azure models are not supported yet") elif model_type 
== "openai": self.model = ChatOpenAI( - openai_api_key=api_key, model_name=model_name, temperature=0 + openai_api_key=api_key, + model_name=model_name, + temperature=0, + max_tokens=50, ) else: raise ValueError(f"Model type {model_type} is not supported") @@ -80,19 +81,26 @@ def parse(self, text: str): return label return None + def cost_info(self, cb: OpenAICallbackHandler): + return dict( + prompt_tokens=cb.prompt_tokens, + completion_tokens=cb.completion_tokens, + total_cost=cb.total_cost, + ) + def __call__(self, text: str): messages = self.chat_template.format_prompt( instruction=self.instruction, labels=self.labels, text=text ).to_messages() - output = self.model(messages) - print("model output", output.content) - print(output) + cost_info = None + with get_openai_callback() as cb: + output = self.model(messages) + cost_info = self.cost_info(cb) label = self.parse(output.content) if not label: print("label not found!") raise Exception("Label not found") - print("get label", label) - return label + return label, cost_info instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. @@ -116,28 +124,6 @@ def __call__(self, text: str): """ -def classify(x, labels, max_failures=5, default_label=0): - failures = 0 - api_key = os.environ["OPENAI_KEY"] - labeler = LLMLabeler( - instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key - ) - - while failures < max_failures: - try: - label = labeler(x) - label_idx = labels.index(label) - print(label, label_idx) - time.sleep(1) - return label_idx - except Exception as e: - failures += 1 - print(e) - time.sleep(1) - pass - if failures == max_failures: - return default_label - def train_labeler( dataset, From 358e91551d9d293723cccd8ce5f726b352c1e5fc Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 20:10:22 +0000 Subject: [PATCH 04/22] save on an interval per subset --- gpt_labeling.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index f566a7c..6b39732 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -30,11 +30,16 @@ chunks_to_process = buffer_size // chunk_size +subset_save_interval = 100 + total_cost = 0 max_failures = 5 failures = 0 labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) +ckpt_dir = "./checkpoints" +Path(ckpt_dir).mkdir(exist_ok=True) + for chunk in range(chunks_to_process): print(f"Chunk {chunk} / {chunks_to_process} starting...") @@ -63,15 +68,15 @@ processed_rows.append({**x, "label": label, "language": lang}) else: print(f"Max failures hit on idx {i}, continuing.") + if i % subset_save_interval == 0: + subset_ds = Dataset.from_list(processed_rows) + subset_ds.save_to_disk(os.path.join(ckpt_dir, f"chunk_{chunk}_subset_{i}")) + subset_ds.push_to_hub("roborovski/phi-1", private=True) - subset_ds = Dataset.from_list(processed_rows) - processed_subsets.append(subset_ds) - + processed_subsets.append(processed_rows) # Save all processed data all_datasets: Dataset = concatenate_datasets(processed_subsets) - ckpt_dir = "./checkpoints" - Path(ckpt_dir).mkdir(exist_ok=True) - all_datasets.save_to_disk(ckpt_dir + "/latest") + all_datasets.save_to_disk(os.path.join(ckpt_dir, "latest")) all_datasets.push_to_hub("roborovski/phi-1", private=True) # print number of each class From 5ae930b152f48ade9f1129ca7af31e4f7d5dd416 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 20:35:48 +0000 Subject: [PATCH 05/22] rm langchain --- 
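Note: a minimal usage sketch of the labeler as this patch leaves it (illustrative
only; assumes OPENAI_KEY is provided via .env, and uses the new two-argument
__init__ and the (label, cost_info) return value of __call__):

    from dotenv import load_dotenv

    load_dotenv(".env")  # must run before importing core, which reads OPENAI_KEY at import time

    from treasure_trove.core import LLMLabeler, instruction

    labels = ["high quality", "medium quality", "low quality"]
    labeler = LLMLabeler(instruction, labels)
    label, cost_info = labeler("def add(a, b):\n    return a + b")
    print(label, cost_info["total_cost"])
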
gpt_labeling.py | 12 ++-- requirements.txt | 1 - treasure_trove/core.py | 136 ++++++++++++++--------------------------- 3 files changed, 53 insertions(+), 96 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 6b39732..39a2933 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -14,9 +14,12 @@ processed_subsets = [] max_chars = 4_096 num_workers = 8 +labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) +res = labeler("def create()") +print(res) -buffer_size = 1_000 -chunk_size = 50 +buffer_size = 10_000 +chunks_to_process = 10 print("Loading dataset..") dataset = load_dataset( @@ -26,16 +29,13 @@ )["train"] print("Loaded dataset.") -subset = dataset.shuffle(seed=115, buffer_size=buffer_size) - -chunks_to_process = buffer_size // chunk_size +subset = dataset.shuffle(seed=100, buffer_size=buffer_size) subset_save_interval = 100 total_cost = 0 max_failures = 5 failures = 0 -labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) ckpt_dir = "./checkpoints" Path(ckpt_dir).mkdir(exist_ok=True) diff --git a/requirements.txt b/requirements.txt index 73081f4..d659a60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ accelerate datasets evaluate fastcore -langchain openai transformers python-dotenv diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 6ecc02d..6e76aae 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -1,6 +1,5 @@ -import time import re -from dotenv import load_dotenv +import os from transformers import ( AutoModelForSequenceClassification, @@ -8,28 +7,34 @@ DataCollatorWithPadding, Trainer, ) -from langchain.prompts.chat import ( - ChatPromptTemplate, - SystemMessagePromptTemplate, - HumanMessagePromptTemplate, -) -from dotenv import load_dotenv import time -from langchain.callbacks import get_openai_callback, OpenAICallbackHandler +import openai + +openai.api_key = os.getenv("OPENAI_KEY") -from pydantic import BaseModel, Field -from datasets import concatenate_datasets, load_dataset from typing import List -from langchain.output_parsers import PydanticOutputParser -from langchain.chat_models import AzureChatOpenAI, ChatOpenAI -from langchain.prompts import PromptTemplate +instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. +* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. +* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. +* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. +* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. 
+* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. +* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. +* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. +* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. +* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. -class LLMLabelerParser(BaseModel): - labels: List = Field( - ..., title="Labels", description="Labels that the LLM classifies the text as" - ) +Output nothing other than one of the following labels: +{0} +""" class LLMLabeler: @@ -37,94 +42,47 @@ def __init__( self, instruction: str, labels: List, - model_name: str = "gpt-3.5-turbo", - api_key: str = None, - model_type: str = "openai", ): self.instruction = instruction self.labels = labels - # Set up a parser + inject instructions into the prompt template. - self.parser = PydanticOutputParser(pydantic_object=LLMLabelerParser) - prompt = PromptTemplate( - template="{instruction}\n{labels}\n{format_instructions}\n", - input_variables=["instruction", "labels"], - partial_variables={ - "format_instructions": self.parser.get_format_instructions() - }, - ) - system_message_prompt = SystemMessagePromptTemplate(prompt=prompt) - human_template = "{text}" - human_message_prompt = HumanMessagePromptTemplate.from_template(human_template) - self.chat_template = ChatPromptTemplate.from_messages( - [system_message_prompt, human_message_prompt] - ) - if model_type == "azure": - raise NotImplementedError("Azure models are not supported yet") - elif model_type == "openai": - self.model = ChatOpenAI( - openai_api_key=api_key, - model_name=model_name, - temperature=0, - max_tokens=50, - ) - else: - raise ValueError(f"Model type {model_type} is not supported") - - def parse(self, text: str): + def parse_label(self, text: str): for label in self.labels: - match = re.search( - r"\{.*\}", text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL - ) - match = re.search(label, text) + pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) + match = re.search(pattern, text) if bool(match): return label return None - def cost_info(self, cb: OpenAICallbackHandler): + def cost_info(self, oai_response): + prompt_tokens = oai_response["usage"]["prompt_tokens"] + completion_tokens = oai_response["usage"]["completion_tokens"] + total_cost=0.0015 * prompt_tokens + 0.0002 * completion_tokens + return dict( - prompt_tokens=cb.prompt_tokens, - completion_tokens=cb.completion_tokens, - total_cost=cb.total_cost, + total_cost=total_cost, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, ) def __call__(self, text: str): - messages = self.chat_template.format_prompt( - instruction=self.instruction, labels=self.labels, text=text - ).to_messages() - cost_info = None - with get_openai_callback() as cb: - output = self.model(messages) - cost_info = self.cost_info(cb) - label = self.parse(output.content) + formatted_instruction = 
instruction.format(self.labels) + completion = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + temperature=0, + messages=[ + {"role": "system", "content": formatted_instruction}, + {"role": "user", "content": text}, + ], + ) + output_text = completion["choices"][0]["message"]["content"] + label = self.parse_label(output_text) + cost_info = self.cost_info(completion) if not label: - print("label not found!") - raise Exception("Label not found") + raise Exception(f"Label not found in text: {output_text}") return label, cost_info -instruction = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. -High quality code has the following: -* Readability: The code is written in a way that is easy to understand and follow, with consistent detailed comments, formatting, meaningful variable names, and appropriate code structure. -* Modularity: The code is organized into reusable and independent modules or functions, making it easier to comprehend and reuse in other projects. -* Detailed explanations: The code is accompanied by thorough explanations of the concepts and techniques used, providing learners with a deeper understanding of the underlying principles. -* Good design principles: The code follows best practices for software design, such as encapsulation, separation of concerns, and adhering to design patterns, making it easier to understand and maintain. -Medium quality code has the following: -* Readability: The code is reasonably well-structured and readable, but there may be occasional inconsistencies, some comments, or less descriptive variable names. -* Partial modularity: The code contains some reusable components, but not all parts of the code are organized into separate modules or functions. -* Some explanations: The code may have limited explanations or comments that provide a general understanding of the code's logic and purpose. -* Adequate design principles: The code follows basic design principles, such as separation of concerns, but may not fully adhere to advanced design patterns or best practices. -Low quality code has the following: -* Poor readability: The code is poorly structured and difficult to follow, with little to no comments, inconsistent formatting and unclear variable names. -* No modularity: The code is written in a monolithic style, lacking any organization into reusable or independent modules or functions. -* Limited explanations: The code provides minimal or no explanations, leaving learners with little guidance on its logic or purpose. -* Neglects design principles: The code shows a lack of consideration for design principles, making it harder to comprehend, maintain, and extend. 
- -Output nothing other than one of the following labels: -""" - - - def train_labeler( dataset, text_column, From 5d381fe30f785d22900ceef77375cb1e5e6bab69 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 20:41:41 +0000 Subject: [PATCH 06/22] bugfixes --- gpt_labeling.py | 2 +- treasure_trove/core.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 39a2933..b22ec30 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -29,7 +29,7 @@ )["train"] print("Loaded dataset.") -subset = dataset.shuffle(seed=100, buffer_size=buffer_size) +subset = dataset.shuffle(seed=110, buffer_size=buffer_size) subset_save_interval = 100 diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 6e76aae..5912163 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -41,13 +41,15 @@ class LLMLabeler: def __init__( self, instruction: str, - labels: List, + labels: List[str], + secondary_labels: List[str], ): self.instruction = instruction self.labels = labels + self.secondary_labels = secondary_labels - def parse_label(self, text: str): - for label in self.labels: + def find_label(self, text: str, labels: List[str]): + for label in labels: pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) match = re.search(pattern, text) if bool(match): @@ -57,7 +59,7 @@ def parse_label(self, text: str): def cost_info(self, oai_response): prompt_tokens = oai_response["usage"]["prompt_tokens"] completion_tokens = oai_response["usage"]["completion_tokens"] - total_cost=0.0015 * prompt_tokens + 0.0002 * completion_tokens + total_cost = 0.0015 * prompt_tokens + 0.0002 * completion_tokens return dict( total_cost=total_cost, @@ -76,7 +78,9 @@ def __call__(self, text: str): ], ) output_text = completion["choices"][0]["message"]["content"] - label = self.parse_label(output_text) + label = self.find_label(output_text, self.labels) + if not label: + label = self.find_label(output_text, self.secondary_labels) cost_info = self.cost_info(completion) if not label: raise Exception(f"Label not found in text: {output_text}") From a94bdbd65dd5033fb7438abc64424c0168f4f6a2 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 20:54:09 +0000 Subject: [PATCH 07/22] secondary labeling --- gpt_labeling.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index b22ec30..82c8327 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -10,11 +10,16 @@ load_dotenv(".env") api_key = os.environ["OPENAI_KEY"] labels = ["high quality", "medium quality", "low quality"] +secondary_labels = ["high", "medium", "low"] lang = "python" processed_subsets = [] max_chars = 4_096 num_workers = 8 -labeler = LLMLabeler(instruction, labels, model_name="gpt-3.5-turbo", api_key=api_key) +labeler = LLMLabeler( + instruction, + labels, + secondary_labels=secondary_labels, +) res = labeler("def create()") print(res) From 5ccfd26a49f93f65ac410a649f55aa14f986f9df Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 8 Jul 2023 22:37:53 +0000 Subject: [PATCH 08/22] workign --- gpt_labeling.py | 96 ++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 49 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 82c8327..ee77b60 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -8,11 +8,9 @@ from treasure_trove.core import LLMLabeler, instruction load_dotenv(".env") -api_key = os.environ["OPENAI_KEY"] labels = ["high quality", "medium quality", "low quality"] 
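+# Fallback labels: find_label() first searches the reply for the full strings in
+# `labels`, then for these short forms, so a terse answer like "high" still matches.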
secondary_labels = ["high", "medium", "low"] lang = "python" -processed_subsets = [] max_chars = 4_096 num_workers = 8 labeler = LLMLabeler( @@ -22,67 +20,68 @@ ) res = labeler("def create()") print(res) +dataset_chunks = [] -buffer_size = 10_000 -chunks_to_process = 10 +buffer_size = 500 +chunks_to_process = 20 print("Loading dataset..") -dataset = load_dataset( - "bigcode/the-stack-dedup", - data_dir=f"data/{lang}", - streaming=True, -)["train"] +dataset = load_dataset("parquet", data_files={"train": "data-00000-of-00144.parquet"})[ + "train" +] print("Loaded dataset.") -subset = dataset.shuffle(seed=110, buffer_size=buffer_size) +api_key = os.environ["OPENAI_KEY"] subset_save_interval = 100 -total_cost = 0 max_failures = 5 failures = 0 ckpt_dir = "./checkpoints" Path(ckpt_dir).mkdir(exist_ok=True) -for chunk in range(chunks_to_process): - print(f"Chunk {chunk} / {chunks_to_process} starting...") - - processed_rows = [] - subset.set_epoch(chunk) - - for i, x in enumerate(subset): - failures = 0 - label_idx, cost_info = 0, {} - while failures < max_failures: - try: - label, cost_info = labeler(x["content"][:max_chars]) - label_idx = labels.index(label) - print(label, label_idx) - time.sleep(1) - break - except Exception as e: - failures += 1 - print(e) - time.sleep(1) - if failures != max_failures: - total_cost += cost_info["total_cost"] - print( - f"classified {i}: {label} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" - ) - processed_rows.append({**x, "label": label, "language": lang}) - else: - print(f"Max failures hit on idx {i}, continuing.") - if i % subset_save_interval == 0: - subset_ds = Dataset.from_list(processed_rows) - subset_ds.save_to_disk(os.path.join(ckpt_dir, f"chunk_{chunk}_subset_{i}")) - subset_ds.push_to_hub("roborovski/phi-1", private=True) - - processed_subsets.append(processed_rows) - # Save all processed data - all_datasets: Dataset = concatenate_datasets(processed_subsets) - all_datasets.save_to_disk(os.path.join(ckpt_dir, "latest")) + +def process(x): + failures = 0 + label_idx, cost_info = 0, {} + while failures < max_failures: + try: + label, cost_info = labeler(x["content"][:max_chars]) + label_idx = labels.index(label) + print(label, label_idx) + time.sleep(1) + break + except Exception as e: + failures += 1 + print(e) + time.sleep(1) + print( + f"classified {i}: {label} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" + ) + return {"label": label_idx, "cost": cost_info["total_cost"]} + + +processed_chunk_datasets = [] +start_idx = 1 + +for i in range(start_idx, start_idx + buffer_size, 1): + print(f"Chunk {i} / {chunks_to_process + start_idx} starting...") + + subset = dataset[i : i + buffer_size] + + # Label the subset + subset = dataset.map(process, batched=False, num_proc=8) + + processed_chunk_datasets.append(subset) + + all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet( + os.path.join( + ckpt_dir, f"processed_{start_idx}_to_{chunks_to_process+start_idx}" + ) + ) # print number of each class print( @@ -94,4 +93,3 @@ print( f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" ) - print(f"Cost so far: {total_cost}") From 38b98e93f56cdd7d1d3bf54c4900b05bc3a8e7f6 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 00:05:05 +0000 Subject: [PATCH 09/22] chunking working --- gpt_labeling.py | 25 ++++++++++++------------- treasure_trove/core.py | 5 ++++- 2 
files changed, 16 insertions(+), 14 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index ee77b60..684c2ec 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -1,7 +1,7 @@ import os from pathlib import Path -from datasets import concatenate_datasets, load_dataset, IterableDataset, Dataset +from datasets import concatenate_datasets, load_dataset, IterableDataset, Dataset, ReadInstruction from dotenv import load_dotenv import time @@ -26,9 +26,6 @@ chunks_to_process = 20 print("Loading dataset..") -dataset = load_dataset("parquet", data_files={"train": "data-00000-of-00144.parquet"})[ - "train" -] print("Loaded dataset.") api_key = os.environ["OPENAI_KEY"] @@ -47,31 +44,33 @@ def process(x): label_idx, cost_info = 0, {} while failures < max_failures: try: - label, cost_info = labeler(x["content"][:max_chars]) - label_idx = labels.index(label) - print(label, label_idx) + label_idx, cost_info = labeler(x["content"][:max_chars]) time.sleep(1) break except Exception as e: failures += 1 print(e) time.sleep(1) - print( - f"classified {i}: {label} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" - ) + if cost_info: + print( + f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" + ) + else: + print("row not classified.") return {"label": label_idx, "cost": cost_info["total_cost"]} processed_chunk_datasets = [] -start_idx = 1 +start_idx = 0 for i in range(start_idx, start_idx + buffer_size, 1): print(f"Chunk {i} / {chunks_to_process + start_idx} starting...") - subset = dataset[i : i + buffer_size] + split = ReadInstruction("train", from_=start_idx*buffer_size, to=start_idx*1+buffer_size, unit="abs") + subset = load_dataset("parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"}) # Label the subset - subset = dataset.map(process, batched=False, num_proc=8) + subset = subset.map(process, batched=False, num_proc=4) processed_chunk_datasets.append(subset) diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 5912163..2eeb2ee 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -77,6 +77,8 @@ def __call__(self, text: str): {"role": "user", "content": text}, ], ) + if "error" in completion: + return 0, None output_text = completion["choices"][0]["message"]["content"] label = self.find_label(output_text, self.labels) if not label: @@ -84,7 +86,8 @@ def __call__(self, text: str): cost_info = self.cost_info(completion) if not label: raise Exception(f"Label not found in text: {output_text}") - return label, cost_info + label_idx = self.labels.index(label) + return label_idx, cost_info def train_labeler( From e28d55f78d6a8858edb651a5fbb9ce060b15cdd0 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 00:11:17 +0000 Subject: [PATCH 10/22] dumb bugs --- gpt_labeling.py | 2 +- treasure_trove/core.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 684c2ec..82d491b 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -53,7 +53,7 @@ def process(x): time.sleep(1) if cost_info: print( - f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']}" + f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']} | {cost_info['total_cost']}" ) else: print("row not classified.") diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 2eeb2ee..88d79a2 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -49,11 +49,11 @@ def 
__init__( self.secondary_labels = secondary_labels def find_label(self, text: str, labels: List[str]): - for label in labels: + for i, label in enumerate(labels): pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) match = re.search(pattern, text) if bool(match): - return label + return i return None def cost_info(self, oai_response): @@ -72,6 +72,7 @@ def __call__(self, text: str): completion = openai.ChatCompletion.create( model="gpt-3.5-turbo", temperature=0, + max_tokens=4, messages=[ {"role": "system", "content": formatted_instruction}, {"role": "user", "content": text}, @@ -80,13 +81,12 @@ def __call__(self, text: str): if "error" in completion: return 0, None output_text = completion["choices"][0]["message"]["content"] - label = self.find_label(output_text, self.labels) - if not label: - label = self.find_label(output_text, self.secondary_labels) + label_idx = self.find_label(output_text, self.labels) + if not label_idx: + label_idx = self.find_label(output_text, self.secondary_labels) cost_info = self.cost_info(completion) - if not label: + if not label_idx: raise Exception(f"Label not found in text: {output_text}") - label_idx = self.labels.index(label) return label_idx, cost_info From cb4a59bddd0af4d0fd671d13a9026789a653f866 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 00:35:01 +0000 Subject: [PATCH 11/22] more dumb bugs --- gpt_labeling.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 82d491b..6525b7e 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -1,7 +1,13 @@ import os from pathlib import Path -from datasets import concatenate_datasets, load_dataset, IterableDataset, Dataset, ReadInstruction +from datasets import ( + concatenate_datasets, + load_dataset, + IterableDataset, + Dataset, + ReadInstruction, +) from dotenv import load_dotenv import time @@ -23,15 +29,13 @@ dataset_chunks = [] buffer_size = 500 -chunks_to_process = 20 +num_chunks = 20 print("Loading dataset..") print("Loaded dataset.") api_key = os.environ["OPENAI_KEY"] -subset_save_interval = 100 - max_failures = 5 failures = 0 @@ -41,6 +45,7 @@ def process(x): failures = 0 + total_cost = 0 label_idx, cost_info = 0, {} while failures < max_failures: try: @@ -52,22 +57,25 @@ def process(x): print(e) time.sleep(1) if cost_info: + total_cost = cost_info["total_cost"] print( f"{label_idx} - tokens used: {cost_info['prompt_tokens']} | {cost_info['completion_tokens']} | {cost_info['total_cost']}" ) else: print("row not classified.") - return {"label": label_idx, "cost": cost_info["total_cost"]} + return {"label": label_idx, "cost": total_cost} processed_chunk_datasets = [] -start_idx = 0 - -for i in range(start_idx, start_idx + buffer_size, 1): - print(f"Chunk {i} / {chunks_to_process + start_idx} starting...") - split = ReadInstruction("train", from_=start_idx*buffer_size, to=start_idx*1+buffer_size, unit="abs") - subset = load_dataset("parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"}) +for i in range(num_chunks): + split = ReadInstruction( + "train", from_=i * buffer_size, to=(i + 1) * buffer_size, unit="abs" + ) + print(f"processing chunk {i}: {split}") + subset = load_dataset( + "parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"} + ) # Label the subset subset = subset.map(process, batched=False, num_proc=4) @@ -77,9 +85,7 @@ def process(x): all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) 
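+    # Push the cumulative labeled set after every chunk (and checkpoint it to
+    # parquet below) so partial progress survives an interrupted run.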
all_datasets.push_to_hub("roborovski/phi-1", private=True) all_datasets.to_parquet( - os.path.join( - ckpt_dir, f"processed_{start_idx}_to_{chunks_to_process+start_idx}" - ) + os.path.join(ckpt_dir, f"processed_{start_idx}_to_{num_chunks+start_idx}") ) # print number of each class From bfcebb22dedd85b6c2d40afc2470945120a84a35 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 01:38:04 +0000 Subject: [PATCH 12/22] working --- gpt_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 6525b7e..5e95107 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -85,7 +85,7 @@ def process(x): all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) all_datasets.push_to_hub("roborovski/phi-1", private=True) all_datasets.to_parquet( - os.path.join(ckpt_dir, f"processed_{start_idx}_to_{num_chunks+start_idx}") + os.path.join(ckpt_dir, f"processed_{i}") ) # print number of each class From 505a56bbb77bd816d5e717f6253162f148bd6bce Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 9 Jul 2023 03:32:05 +0000 Subject: [PATCH 13/22] skip exc --- gpt_labeling.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 5e95107..717038d 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -83,10 +83,13 @@ def process(x): processed_chunk_datasets.append(subset) all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) - all_datasets.push_to_hub("roborovski/phi-1", private=True) - all_datasets.to_parquet( - os.path.join(ckpt_dir, f"processed_{i}") - ) + try: + all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet( + os.path.join(ckpt_dir, f"processed_{i}") + ) + except Exception as e: + print(e) # print number of each class print( From dfa71e84744e1dd4786e77cedaf472c137eec2d6 Mon Sep 17 00:00:00 2001 From: Brian Date: Mon, 10 Jul 2023 19:04:17 +0000 Subject: [PATCH 14/22] bump chunks --- gpt_labeling.py | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/gpt_labeling.py b/gpt_labeling.py index 717038d..f7d55af 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -29,7 +29,7 @@ dataset_chunks = [] buffer_size = 500 -num_chunks = 20 +num_chunks = 100 print("Loading dataset..") print("Loaded dataset.") @@ -68,10 +68,15 @@ def process(x): processed_chunk_datasets = [] +first_save_idx = 8000 + for i in range(num_chunks): split = ReadInstruction( "train", from_=i * buffer_size, to=(i + 1) * buffer_size, unit="abs" ) + # if i < first_save_idx // buffer_size: + # print(f"skipping chunk {i}: {split}") + # continue print(f"processing chunk {i}: {split}") subset = load_dataset( "parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"} @@ -82,22 +87,21 @@ def process(x): processed_chunk_datasets.append(subset) - all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) - try: - all_datasets.push_to_hub("roborovski/phi-1", private=True) - all_datasets.to_parquet( - os.path.join(ckpt_dir, f"processed_{i}") - ) - except Exception as e: - print(e) + if i > first_save_idx // buffer_size: + all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) + try: + all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet(os.path.join(ckpt_dir, f"processed_{i}")) + except Exception as e: + print(e) - # print number of each class - print( - f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" - ) - 
print( - f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" - ) - print( - f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" - ) + # print number of each class + print( + f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" + ) + print( + f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" + ) + print( + f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" + ) From bc5bd0c0661cdd55675a66a27f2e9f5e424f9a50 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 12 Jul 2023 00:21:34 +0000 Subject: [PATCH 15/22] labeler training --- .gitignore | 4 ++- code_edu | 1 + train_labeler.py | 69 ++++++++++++++++++++++++++++++++--------- treasure_trove/core.py | 70 ------------------------------------------ 4 files changed, 59 insertions(+), 85 deletions(-) create mode 160000 code_edu diff --git a/.gitignore b/.gitignore index 7b78dfe..b7ff8a1 100644 --- a/.gitignore +++ b/.gitignore @@ -154,4 +154,6 @@ checklink/cookies.txt # Quarto .quarto -checkpoints/ \ No newline at end of file +checkpoints/ + +wandb/* \ No newline at end of file diff --git a/code_edu b/code_edu new file mode 160000 index 0000000..e9a28b1 --- /dev/null +++ b/code_edu @@ -0,0 +1 @@ +Subproject commit e9a28b101f91ed62ee3d6c52db2fe1e2edacfbd9 diff --git a/train_labeler.py b/train_labeler.py index 1249930..d97e3a6 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -1,10 +1,46 @@ from datasets import load_dataset from transformers import pipeline, TrainingArguments -from treasure_trove.core import filter_dataset, label_dataset, train_labeler + +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + Trainer, +) -ds = load_dataset("CarperAI/textbooks_A2YN_labeled")["train"] +dataset = load_dataset("roborovski/phi-1")["train"] batch_size = 32 +num_workers = 4 +max_length = 512 +push_to_hub = True +n_labels = 3 +text_column = "content" + +id2label = {0: "HIGH_QUALITY", 1: "MEDIUM_QUALITY", 2: "LOW_QUALITY"} +label2id = {"HIGH_QUALITY": 0, "MEDIUM_QUALITY": 1, "LOW_QUALITY": 2} + +base_model_name = "bigcode/starencoder" +tokenizer = AutoTokenizer.from_pretrained(base_model_name, max_length=max_length) +if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + +model = AutoModelForSequenceClassification.from_pretrained( + base_model_name, num_labels=n_labels, max_length=max_length, id2label=id2label, label2id=label2id +) + +dataset = dataset.map( + lambda x: tokenizer( + x[text_column], padding="max_length", truncation=True, max_length=max_length + ), + batched=True, + num_proc=num_workers, +) + +dataset = dataset.train_test_split(test_size=0.1, seed=42) + +data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + training_args = TrainingArguments( output_dir="./code_edu", num_train_epochs=3, @@ -21,17 +57,22 @@ greater_is_better=True, seed=42, push_to_hub=True, - hub_model_id="CarperAI/code_edu_classifier_py", + hub_model_id="roborovski/phi-2-classifier", hub_private_repo=True, ) -base_model_name = "bigcode/starencoder" -model, tokenizer = train_labeler( - ds, - "content", - base_model_name, - n_labels=2, - training_args=training_args, - num_workers=4, - max_length=512, - push_to_hub=True, -) \ No newline at end of file + + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + data_collator=data_collator, +) + +breakpoint() + +trainer.train() + +if 
push_to_hub: + trainer.push_to_hub() diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 88d79a2..1fa950e 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -89,76 +89,6 @@ def __call__(self, text: str): raise Exception(f"Label not found in text: {output_text}") return label_idx, cost_info - -def train_labeler( - dataset, - text_column, - base_model_name, - n_labels, - training_args, - num_workers=4, - max_length=512, - push_to_hub=True, -): - """ - Trains a labeler model on a labeled dataset. - - Args: - dataset (datasets.Dataset): Dataset to train on - text_column (str): Name of the text column - base_model_name (str): Name of the base model to use - n_labels (int): Number of labels - epochs (int): Number of epochs to train - batch_size (int): Batch size for training - num_workers (int): Number of workers for training - max_length (int): Maximum length of the input - """ - # Load the tokenizer - tokenizer = AutoTokenizer.from_pretrained(base_model_name, max_length=max_length) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - # Load the model - model = AutoModelForSequenceClassification.from_pretrained( - base_model_name, num_labels=n_labels, max_length=max_length - ) - model.config.id2label = {i: i for i in range(n_labels)} - - # Preprocess the dataset - dataset = dataset.map( - lambda x: tokenizer( - x[text_column], padding="max_length", truncation=True, max_length=max_length - ), - batched=True, - num_proc=num_workers, - ) - - # Split the dataset - dataset = dataset.train_test_split(test_size=0.1, seed=42) - - # Get the data collator - data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - - # Get the trainer - trainer = Trainer( - model=model, - args=training_args, - train_dataset=dataset["train"], - eval_dataset=dataset["test"], - data_collator=data_collator, - ) - - # Train the model - trainer.train() - - # Push the model to the hub - if push_to_hub: - trainer.push_to_hub() - - # Return the model - return model, tokenizer - - def filter_dataset( dataset, text_column, labeler_model, labels_to_keep, batch_size=32, num_workers=4 ): From 2e7f54cbd7aa7adf3074b195d5cb50955349b2fc Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 12 Jul 2023 00:22:40 +0000 Subject: [PATCH 16/22] rm --- code_edu | 1 - 1 file changed, 1 deletion(-) delete mode 160000 code_edu diff --git a/code_edu b/code_edu deleted file mode 160000 index e9a28b1..0000000 --- a/code_edu +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e9a28b101f91ed62ee3d6c52db2fe1e2edacfbd9 From 05472af3eaab20a48db3e53fec75649cb2eedf94 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 12 Jul 2023 12:04:34 +0000 Subject: [PATCH 17/22] wandb config and working metrics --- requirements.txt | 3 ++- train_labeler.py | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index d659a60..4976ba5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ fastcore openai transformers python-dotenv -pandas \ No newline at end of file +pandas +wandb \ No newline at end of file diff --git a/train_labeler.py b/train_labeler.py index d97e3a6..ac37f21 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -1,5 +1,8 @@ from datasets import load_dataset from transformers import pipeline, TrainingArguments +import evaluate +import numpy as np +import wandb from transformers import ( AutoModelForSequenceClassification, @@ -41,14 +44,25 @@ data_collator = DataCollatorWithPadding(tokenizer=tokenizer) +metric = 
evaluate.load("accuracy") + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +wandb.login() + +wandb.init(project="phi-2-classifier") + training_args = TrainingArguments( - output_dir="./code_edu", + output_dir="checkpoints", num_train_epochs=3, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, warmup_steps=500, weight_decay=0.01, - logging_dir="./logs", + logging_dir="logs", logging_steps=10, evaluation_strategy="epoch", save_strategy="epoch", @@ -68,10 +82,9 @@ train_dataset=dataset["train"], eval_dataset=dataset["test"], data_collator=data_collator, + compute_metrics=compute_metrics, ) -breakpoint() - trainer.train() if push_to_hub: From 87ec8c26fa22380dee7d78f5e36e4b9ffb1dae3b Mon Sep 17 00:00:00 2001 From: Brian Date: Thu, 13 Jul 2023 00:58:03 +0000 Subject: [PATCH 18/22] eval batch size --- train_labeler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/train_labeler.py b/train_labeler.py index ac37f21..8c6fa3c 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -40,7 +40,9 @@ num_proc=num_workers, ) -dataset = dataset.train_test_split(test_size=0.1, seed=42) +dataset = dataset.train_test_split(test_size=0.05, seed=42) + +eval_dataset = dataset["test"].shuffle(seed=42).select(range(200)) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) @@ -59,7 +61,7 @@ def compute_metrics(eval_pred): output_dir="checkpoints", num_train_epochs=3, per_device_train_batch_size=batch_size, - per_device_eval_batch_size=batch_size, + per_device_eval_batch_size=2, warmup_steps=500, weight_decay=0.01, logging_dir="logs", @@ -73,6 +75,7 @@ def compute_metrics(eval_pred): push_to_hub=True, hub_model_id="roborovski/phi-2-classifier", hub_private_repo=True, + eval_accumulation_steps=2 ) @@ -80,7 +83,7 @@ def compute_metrics(eval_pred): model=model, args=training_args, train_dataset=dataset["train"], - eval_dataset=dataset["test"], + eval_dataset=eval_dataset, data_collator=data_collator, compute_metrics=compute_metrics, ) From c0a30ccd41824b8c1fd349949ac0bba0e9fbc89d Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 16 Jul 2023 01:04:14 +0000 Subject: [PATCH 19/22] generate embeddings --- generate_embeddings.py | 192 +++++++++++++++++++++++++++++++++++++ gpt_labeling.py | 2 - train_labeler.py | 208 +++++++++++++++++++++++++---------------- 3 files changed, 320 insertions(+), 82 deletions(-) create mode 100644 generate_embeddings.py diff --git a/generate_embeddings.py b/generate_embeddings.py new file mode 100644 index 0000000..213cc73 --- /dev/null +++ b/generate_embeddings.py @@ -0,0 +1,192 @@ +from abc import ABC +from datasets import ( + load_dataset, +) +from dotenv import load_dotenv +import torch +from typing import Union, List, Dict + +from train_labeler import EncoderParams + +from transformers import ( + AutoModelForSequenceClassification, + AutoTokenizer, + AutoModel, +) + +load_dotenv(".env") + +# https://huggingface.co/bigcode/starencoder/discussions/3 +# https://github.com/bigcode-project/bigcode-encoder/blob/master/embedding_sandbox.ipynb + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/utils.py#L152 +def pooling(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Pools a batch of vector sequences into a batch of vector global representations. + It does so by taking the last vector in the sequence, as indicated by the mask. 
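+    Note: `mask.sum(1) - 1` picks out the final real token only when the 1s in
+    `mask` are contiguous from position 0, i.e. for right-padded batches.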
+ + Args: + x (torch.Tensor): Batch of vector sequences with shape [B, T, F]. + mask (torch.Tensor): Batch of masks with shape [B, T]. + + Returns: + torch.Tensor: Pooled version of the input batch with shape [B, F]. + """ + + eos_idx = mask.sum(1) - 1 + batch_idx = torch.arange(len(eos_idx), device=x.device) + + mu = x[batch_idx, eos_idx, :] + + return mu + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/utils.py#L121 +def pool_and_normalize( + features_sequence: torch.Tensor, + attention_masks: torch.Tensor, + return_norms: bool = False, +) -> Union[torch.Tensor, List[torch.Tensor]]: + """Temporal pooling of sequences of vectors and projection onto the unit sphere. + + Args: + features_sequence (torch.Tensor): Inpute features with shape [B, T, F]. + attention_masks (torch.Tensor): Pooling masks with shape [B, T, F]. + return_norms (bool, optional): Whether to additionally return the norms. Defaults to False. + + Returns: + Union[torch.Tensor, List[torch.Tensor]]: Pooled and normalized vectors with shape [B, F]. + """ + + pooled_embeddings = pooling(features_sequence, attention_masks) + embedding_norms = pooled_embeddings.norm(dim=1) + + normalizing_factor = torch.where( # Only normalize embeddings with norm > 1.0. + embedding_norms > 1.0, embedding_norms, torch.ones_like(embedding_norms) + ) + + pooled_normalized_embeddings = pooled_embeddings / normalizing_factor[:, None] + + if return_norms: + return pooled_normalized_embeddings, embedding_norms + else: + return pooled_normalized_embeddings + + +# https://github.com/bigcode-project/bigcode-encoder/blob/master/src/constants.py + + +def set_device(inputs: Dict[str, torch.Tensor], device: str) -> Dict[str, torch.Tensor]: + output_data = {} + for k, v in inputs.items(): + output_data[k] = v.to(device) + + return output_data + + +def prepare_tokenizer(tokenizer_path): + try: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + except OSError: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_auth_token=True) + + tokenizer.add_special_tokens({"pad_token": EncoderParams.PAD_TOKEN}) + tokenizer.add_special_tokens({"sep_token": EncoderParams.SEPARATOR_TOKEN}) + tokenizer.add_special_tokens({"cls_token": EncoderParams.CLS_TOKEN}) + tokenizer.add_special_tokens({"mask_token": EncoderParams.MASK_TOKEN}) + return tokenizer + + +def truncate_sentences( + sentence_list: List[str], maximum_length: Union[int, float] +) -> List[str]: + truncated_sentences = [] + + for sentence in sentence_list: + truncated_sentences.append(sentence[:maximum_length]) + + return truncated_sentences + + +class StarEncoder(torch.nn.Module): + def __init__(self, device): + super().__init__() + + self.tokenizer = prepare_tokenizer(EncoderParams.base_model_name) + self.encoder = ( + AutoModel.from_pretrained( + EncoderParams.base_model_name, use_auth_token=True + ) + .to(device) + .eval() + ) + self.device = device + self.max_input_len = EncoderParams.max_input_length + self.maximum_token_len = EncoderParams.max_token_length + + def forward(self, input_sentences): + inputs = self.tokenizer( + [ + f"{EncoderParams.CLS_TOKEN}{sentence}{EncoderParams.SEPARATOR_TOKEN}" + for sentence in input_sentences + ], + padding="longest", + max_length=self.maximum_token_len, + truncation=True, + return_tensors="pt", + ) + + outputs = self.encoder(**set_device(inputs, self.device)) + embedding = pool_and_normalize(outputs.hidden_states[-1], inputs.attention_mask) + + return embedding + + def encode(self, input_sentences, batch_size=32, **kwargs): 
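+        """Embed a list of code strings in mini-batches.
+
+        Each string is truncated to `max_input_len` characters, passed through the
+        frozen encoder in eval mode under `torch.no_grad()`, and the pooled,
+        normalized embeddings are collected on CPU and concatenated into a single
+        tensor of shape [len(input_sentences), hidden_size]. Extra kwargs are ignored.
+        """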
+ truncated_input_sentences = truncate_sentences( + input_sentences, self.max_input_len + ) + + n_batches = len(truncated_input_sentences) // batch_size + int( + len(truncated_input_sentences) % batch_size > 0 + ) + + embedding_batch_list = [] + + for i in range(n_batches): + start_idx = i * batch_size + end_idx = min((i + 1) * batch_size, len(truncated_input_sentences)) + + with torch.no_grad(): + embedding_batch_list.append( + self.forward(truncated_input_sentences[start_idx:end_idx]) + .detach() + .cpu() + ) + + input_sentences_embedding = torch.cat(embedding_batch_list) + + return input_sentences_embedding + + +tokenizer = AutoTokenizer.from_pretrained( + EncoderParams.base_model_name, max_length=EncoderParams.max_token_length +) +if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + +dataset = load_dataset("roborovski/phi-1") + +device = torch.device("cuda") +model = StarEncoder(device) + + +def process(x): + content = x["content"] + embedding = model.encode(content) + return {"embedding": embedding} + + +# process(dataset["train"][0]) + +processed_dataset = dataset.map(process, batched=True) +processed_dataset.push_to_hub("roborovski/phi-2-embeddings") diff --git a/gpt_labeling.py b/gpt_labeling.py index f7d55af..1599310 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -24,8 +24,6 @@ labels, secondary_labels=secondary_labels, ) -res = labeler("def create()") -print(res) dataset_chunks = [] buffer_size = 500 diff --git a/train_labeler.py b/train_labeler.py index 8c6fa3c..3e63c15 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from datasets import load_dataset from transformers import pipeline, TrainingArguments import evaluate @@ -12,83 +13,130 @@ ) -dataset = load_dataset("roborovski/phi-1")["train"] -batch_size = 32 -num_workers = 4 -max_length = 512 -push_to_hub = True -n_labels = 3 -text_column = "content" - -id2label = {0: "HIGH_QUALITY", 1: "MEDIUM_QUALITY", 2: "LOW_QUALITY"} -label2id = {"HIGH_QUALITY": 0, "MEDIUM_QUALITY": 1, "LOW_QUALITY": 2} - -base_model_name = "bigcode/starencoder" -tokenizer = AutoTokenizer.from_pretrained(base_model_name, max_length=max_length) -if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - -model = AutoModelForSequenceClassification.from_pretrained( - base_model_name, num_labels=n_labels, max_length=max_length, id2label=id2label, label2id=label2id -) - -dataset = dataset.map( - lambda x: tokenizer( - x[text_column], padding="max_length", truncation=True, max_length=max_length - ), - batched=True, - num_proc=num_workers, -) - -dataset = dataset.train_test_split(test_size=0.05, seed=42) - -eval_dataset = dataset["test"].shuffle(seed=42).select(range(200)) - -data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - -metric = evaluate.load("accuracy") - -def compute_metrics(eval_pred): - logits, labels = eval_pred - predictions = np.argmax(logits, axis=-1) - return metric.compute(predictions=predictions, references=labels) - -wandb.login() - -wandb.init(project="phi-2-classifier") - -training_args = TrainingArguments( - output_dir="checkpoints", - num_train_epochs=3, - per_device_train_batch_size=batch_size, - per_device_eval_batch_size=2, - warmup_steps=500, - weight_decay=0.01, - logging_dir="logs", - logging_steps=10, - evaluation_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - metric_for_best_model="accuracy", - greater_is_better=True, - seed=42, - push_to_hub=True, - 
hub_model_id="roborovski/phi-2-classifier", - hub_private_repo=True, - eval_accumulation_steps=2 -) - - -trainer = Trainer( - model=model, - args=training_args, - train_dataset=dataset["train"], - eval_dataset=eval_dataset, - data_collator=data_collator, - compute_metrics=compute_metrics, -) - -trainer.train() - -if push_to_hub: - trainer.push_to_hub() +@dataclass +class EncoderParams: + batch_size = 32 + num_workers = 4 + push_to_hub = True + n_labels = 3 + text_column = "content" + labels = ["high quality", "medium quality", "low quality"] + base_model_name = "bigcode/starencoder" + id2label = {0: "HIGH_QUALITY", 1: "MEDIUM_QUALITY", 2: "LOW_QUALITY"} + label2id = {"HIGH_QUALITY": 0, "MEDIUM_QUALITY": 1, "LOW_QUALITY": 2} + MASK_TOKEN = "" + SEPARATOR_TOKEN = "" + PAD_TOKEN = "" + CLS_TOKEN = "" + max_input_length = 10000 + max_token_length = 1024 + + +def train(): + + dataset = load_dataset("roborovski/phi-1")["train"] + + + tokenizer = AutoTokenizer.from_pretrained( + EncoderParams.base_model_name, max_length=EncoderParams.max_token_length + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForSequenceClassification.from_pretrained( + EncoderParams.base_model_name, + num_labels=EncoderParams.n_labels, + max_length=EncoderParams.max_token_length, + id2label=EncoderParams.id2label, + label2id=EncoderParams.label2id, + ) + + + def compute_metrics(eval_pred): + logits, labels = eval_pred + if isinstance(logits, tuple): # Some models return tuples + logits = logits[0] + predictions = np.argmax(logits, axis=-1) + acc = acc_metric.compute(predictions=predictions, references=labels) + precision = precision_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + recall = recall_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + f1 = f1_metric.compute( + predictions=predictions, + references=labels, + average="macro" if len(labels) > 2 else "binary", + ) + + return {**acc, **precision, **recall, **f1} + + dataset = dataset.map( + lambda x: tokenizer( + x[EncoderParams.text_column], + padding="max_length", + truncation=True, + max_length=EncoderParams.max_input_length, + ), + batched=True, + num_proc=EncoderParams.num_workers, + ) + + dataset = dataset.train_test_split(test_size=0.05, seed=42) + + train_dataset = dataset["train"].shuffle(seed=42) + eval_dataset = dataset["test"].shuffle(seed=42).select(range(200)) + + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + acc_metric = evaluate.load("accuracy") + precision_metric = evaluate.load("precision") + recall_metric = evaluate.load("recall") + f1_metric = evaluate.load("f1") + + wandb.login() + + wandb.init(project="phi-2-classifier") + + training_args = TrainingArguments( + output_dir="checkpoints", + num_train_epochs=100, + per_device_train_batch_size=EncoderParams.batch_size, + per_device_eval_batch_size=2, + warmup_steps=500, + weight_decay=0.01, + logging_dir="logs", + logging_steps=50, + eval_steps=5000, + evaluation_strategy="steps", + save_strategy="epoch", + save_steps=5, + seed=42, + push_to_hub=True, + hub_model_id="roborovski/phi-2-classifier", + hub_private_repo=True, + eval_accumulation_steps=1, + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) + + trainer.train() + + if 
EncoderParams.push_to_hub: + trainer.push_to_hub() + + +if __name__ == "__main__": + train() From a1637fe3451605903b30d6b0bb81c40381c1b76b Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 16 Jul 2023 02:05:57 +0000 Subject: [PATCH 20/22] set batch size --- generate_embeddings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_embeddings.py b/generate_embeddings.py index 213cc73..0e814d5 100644 --- a/generate_embeddings.py +++ b/generate_embeddings.py @@ -188,5 +188,5 @@ def process(x): # process(dataset["train"][0]) -processed_dataset = dataset.map(process, batched=True) +processed_dataset = dataset.map(process, batched=True, batch_size=128) processed_dataset.push_to_hub("roborovski/phi-2-embeddings") From 3a6f668869483d19e64cbea0690365f69fb158df Mon Sep 17 00:00:00 2001 From: brian Date: Wed, 19 Jul 2023 02:40:59 +0000 Subject: [PATCH 21/22] llabeling with llama --- .gitignore | 4 +- gpt_labeling.py | 4 +- llama_inference.py | 49 +++++++++++++ llama_labeling.py | 155 +++++++++++++++++++++++++++++++++++++++++ treasure_trove/core.py | 27 ++++++- 5 files changed, 235 insertions(+), 4 deletions(-) create mode 100644 llama_inference.py create mode 100644 llama_labeling.py diff --git a/.gitignore b/.gitignore index b7ff8a1..8d86c7b 100644 --- a/.gitignore +++ b/.gitignore @@ -156,4 +156,6 @@ checklink/cookies.txt checkpoints/ -wandb/* \ No newline at end of file +wandb/* + +*.parquet \ No newline at end of file diff --git a/gpt_labeling.py b/gpt_labeling.py index 1599310..9f5eac1 100644 --- a/gpt_labeling.py +++ b/gpt_labeling.py @@ -11,7 +11,7 @@ from dotenv import load_dotenv import time -from treasure_trove.core import LLMLabeler, instruction +from treasure_trove.core import ChatGPTLabeler, instruction load_dotenv(".env") labels = ["high quality", "medium quality", "low quality"] @@ -19,7 +19,7 @@ lang = "python" max_chars = 4_096 num_workers = 8 -labeler = LLMLabeler( +labeler = ChatGPTLabeler( instruction, labels, secondary_labels=secondary_labels, diff --git a/llama_inference.py b/llama_inference.py new file mode 100644 index 0000000..27daa08 --- /dev/null +++ b/llama_inference.py @@ -0,0 +1,49 @@ +from transformers import AutoTokenizer +import transformers +import torch + +model = "../llama-7bf-hf" + +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. +* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. 
+ +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + +tokenizer = AutoTokenizer.from_pretrained(model) +pipeline = transformers.pipeline( + "conversational", + model=model, + torch_dtype=torch.float16, + device_map="auto", +) + +sequences = pipeline( + instruction_simple, + do_sample=True, + top_k=10, + num_return_sequences=1, + eos_token_id=tokenizer.eos_token_id, + max_length=200, +) +for seq in sequences: + print(f"Result: {seq['generated_text']}") + diff --git a/llama_labeling.py b/llama_labeling.py new file mode 100644 index 0000000..0cffef9 --- /dev/null +++ b/llama_labeling.py @@ -0,0 +1,155 @@ +from typing import Optional, List + +import fire +import re + +from llama import Llama + + +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. +* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. 
+ +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + +def find_label(text: str, labels: List[str]): + for i, label in enumerate(labels): + pattern = re.compile(re.escape(label), re.IGNORECASE | re.MULTILINE) + match = re.search(pattern, text) + if bool(match): + return i + return None + + +import os +from pathlib import Path + +from datasets import ( + concatenate_datasets, + load_dataset, + IterableDataset, + Dataset, + ReadInstruction, +) +from dotenv import load_dotenv + +import time + +load_dotenv(".env") +labels = ["high quality", "medium quality", "low quality"] +secondary_labels = ["high", "medium", "low"] +lang = "python" +max_chars = 4_096 +num_workers = 8 +dataset_chunks = [] + +buffer_size = 500 +num_chunks = 100 + +print("Loading dataset..") +print("Loaded dataset.") + +max_failures = 5 +failures = 0 + +max_gen_len = 512 +max_seq_len = 1024 +temperature = 0.1 +top_p = 0.2 +max_batch_size = 4 + + +ckpt_dir = "../llama/7Bf" +tokenizer_path = "../llama/tokenizer.model" + +generator = Llama.build( + ckpt_dir=ckpt_dir, + tokenizer_path=tokenizer_path, + max_seq_len=max_seq_len, + max_batch_size=max_batch_size, +) + + +def process(x): + total_cost = 0 + label_idx = 0 + dialogs = [] + for i in range(len(x["content"])): + code_sample = x["content"][i][:max_gen_len] + dialogs.append( + [ + {"role": "system", "content": instruction_simple}, + {"role": "user", "content": code_sample}, + ] + ) + results = generator.chat_completion( + dialogs, # type: ignore + max_gen_len=max_gen_len, + temperature=temperature, + top_p=top_p, + ) + batch_labels = [] + for i in range(len(dialogs)): + completion_text = results[i]["generation"]["content"] + label = find_label(completion_text, labels) + batch_labels.append(label) + return {"label": batch_labels} + + +processed_chunk_datasets = [] + +first_save_idx = 8000 + +for i in range(num_chunks): + split = ReadInstruction( + "train", from_=i * buffer_size, to=(i + 1) * buffer_size, unit="abs" + ) + # if i < first_save_idx // buffer_size: + # print(f"skipping chunk {i}: {split}") + # continue + print(f"processing chunk {i}: {split}") + subset = load_dataset( + "parquet", split=split, data_files={"train": "data-00000-of-00144.parquet"} + ) + + # Label the subset + subset = subset.map(process, batched=True, batch_size=max_batch_size, num_proc=1) + + processed_chunk_datasets.append(subset) + + if i > first_save_idx // buffer_size: + all_datasets: Dataset = concatenate_datasets(processed_chunk_datasets) + try: + all_datasets.push_to_hub("roborovski/phi-1", private=True) + all_datasets.to_parquet(os.path.join(ckpt_dir, f"processed_{i}")) + except Exception as e: + print(e) + + # print number of each class + print( + f"Number of {labels[0]}: {len(all_datasets.filter(lambda x: x['label'] == 0))}" + ) + print( + f"Number of {labels[1]}: {len(all_datasets.filter(lambda x: x['label'] == 1))}" + ) + print( + f"Number of {labels[2]}: {len(all_datasets.filter(lambda x: x['label'] == 2))}" + ) diff --git a/treasure_trove/core.py b/treasure_trove/core.py index 1fa950e..372a8d9 100644 --- a/treasure_trove/core.py +++ b/treasure_trove/core.py @@ -36,8 +36,33 @@ {0} """ +instruction_simple = f"""Determine the following code's quality value for a software engineer whose goal is to improve their programming ability. +High quality code has the following: +* Readability: The code is written in a way that is easy to understand and follow. 
+* Modularity: The code is organized into reusable and independent modules or functions. +* Detailed explanations: The code is accompanied by explanations of the concepts used. +* Good design principles: The code follows best practices for software design. +Medium quality code has the following: +* Readability: The code is reasonably well-structured and readable. +* Partial modularity: The code contains some reusable components. +* Some explanations: The code may have limited explanations or comments. +* Adequate design principles: The code follows basic design principles. +Low quality code has the following: +* Poor readability: The code is poorly structured and difficult to follow. +* No modularity: The code is written in a monolithic style. +* Limited explanations: The code provides minimal or no explanations. +* Neglects design principles: The code shows a lack of consideration for design principles. + +Output nothing other than one of the following labels: +High quality +Medium quality +Low quality +""" + + + -class LLMLabeler: +class ChatGPTLabeler: def __init__( self, instruction: str, From 4ea38843cf3dd8b5cdeafc3674ad52bac03948e4 Mon Sep 17 00:00:00 2001 From: brian Date: Wed, 19 Jul 2023 12:25:58 +0000 Subject: [PATCH 22/22] log sample --- requirements.txt | 3 ++- train_labeler.py | 24 +++++++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4976ba5..89399b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ openai transformers python-dotenv pandas -wandb \ No newline at end of file +wandb +huggingface_hub \ No newline at end of file diff --git a/train_labeler.py b/train_labeler.py index 3e63c15..860c003 100644 --- a/train_labeler.py +++ b/train_labeler.py @@ -4,6 +4,9 @@ import evaluate import numpy as np import wandb +from dotenv import load_dotenv +from huggingface_hub import login +import os from transformers import ( AutoModelForSequenceClassification, @@ -12,11 +15,14 @@ Trainer, ) +load_dotenv(".env") + +login(token=os.environ["HF_KEY"], add_to_git_credential=True) @dataclass class EncoderParams: batch_size = 32 - num_workers = 4 + num_workers = 16 push_to_hub = True n_labels = 3 text_column = "content" @@ -28,14 +34,13 @@ class EncoderParams: SEPARATOR_TOKEN = "" PAD_TOKEN = "" CLS_TOKEN = "" - max_input_length = 10000 + max_input_length = 1024 max_token_length = 1024 def train(): - dataset = load_dataset("roborovski/phi-1")["train"] - + dataset = load_dataset("roborovski/phi-2-labeled")["train"] tokenizer = AutoTokenizer.from_pretrained( EncoderParams.base_model_name, max_length=EncoderParams.max_token_length @@ -51,10 +56,11 @@ def train(): label2id=EncoderParams.label2id, ) + sample_table_data = [] def compute_metrics(eval_pred): logits, labels = eval_pred - if isinstance(logits, tuple): # Some models return tuples + if isinstance(logits, tuple): logits = logits[0] predictions = np.argmax(logits, axis=-1) acc = acc_metric.compute(predictions=predictions, references=labels) @@ -74,6 +80,14 @@ def compute_metrics(eval_pred): average="macro" if len(labels) > 2 else "binary", ) + decoded_sample = tokenizer.decode(predictions) + sample_table_data.append([decoded_sample, labels[0]]) + sample_table = wandb.Table( + columns=["sample", "label"], + data=sample_table_data, + ) + wandb.log({"sample": sample_table}) + return {**acc, **precision, **recall, **f1} dataset = dataset.map(