From 5e5fc2c8ffb0098be0d7b4bb69117440b65e9aad Mon Sep 17 00:00:00 2001 From: sanchez-alex <141684261+sanchez-alex@users.noreply.github.com> Date: Mon, 30 Sep 2024 13:19:52 -0700 Subject: [PATCH] Distillation MS Learn Doc (#3375) * Initial draft of distillation mslearn readme * Small change * Fix link * Update paths * Update link to qa_math notebook * Fix paths * Fix paths again * Dummy rename * Lowercase folders * Remove tmp file --- ...tion-nli-distillation_chat_completion.yml} | 170 +++++++++--------- .../system/distillation/README.md | 72 ++++++++ .../distillation_conversational_task.ipynb | 0 .../math}/distillation_math.ipynb | 2 +- .../nli}/distillation_chat_completion.ipynb | 0 .../nlu_qa}/distillation_nlu_qa_task.ipynb | 0 .../nlu_qa}/distillation_qa_math.ipynb | 0 .../nlu_qa}/download_dataset.py | 98 +++++----- 8 files changed, 207 insertions(+), 135 deletions(-) rename .github/workflows/{sdk-foundation-models-system-finetune-Llama-notebooks-distillation-distillation_chat_completion.yml => sdk-foundation-models-system-distillation-nli-distillation_chat_completion.yml} (79%) create mode 100644 sdk/python/foundation-models/system/distillation/README.md rename sdk/python/foundation-models/system/{finetune/Llama-notebooks/distillation => distillation/conversation}/distillation_conversational_task.ipynb (100%) rename sdk/python/foundation-models/system/{finetune/Llama-notebooks/distillation => distillation/math}/distillation_math.ipynb (99%) rename sdk/python/foundation-models/system/{finetune/Llama-notebooks/distillation => distillation/nli}/distillation_chat_completion.ipynb (100%) rename sdk/python/foundation-models/system/{finetune/Llama-notebooks/distillation => distillation/nlu_qa}/distillation_nlu_qa_task.ipynb (100%) rename sdk/python/foundation-models/system/{finetune/Llama-notebooks/distillation => distillation/nlu_qa}/distillation_qa_math.ipynb (100%) rename sdk/python/foundation-models/system/{finetune/Llama-notebooks/distillation => 
distillation/nlu_qa}/download_dataset.py (96%) diff --git a/.github/workflows/sdk-foundation-models-system-finetune-Llama-notebooks-distillation-distillation_chat_completion.yml b/.github/workflows/sdk-foundation-models-system-distillation-nli-distillation_chat_completion.yml similarity index 79% rename from .github/workflows/sdk-foundation-models-system-finetune-Llama-notebooks-distillation-distillation_chat_completion.yml rename to .github/workflows/sdk-foundation-models-system-distillation-nli-distillation_chat_completion.yml index a5952bc5617..90a7492bd4a 100644 --- a/.github/workflows/sdk-foundation-models-system-finetune-Llama-notebooks-distillation-distillation_chat_completion.yml +++ b/.github/workflows/sdk-foundation-models-system-distillation-nli-distillation_chat_completion.yml @@ -1,85 +1,85 @@ -# This code is autogenerated. -# Code is generated by running custom script: python3 readme.py -# Any manual changes to this file may cause incorrect behavior. -# Any manual changes will be overwritten if the code is regenerated. - -name: sdk-foundation-models-system-finetune-Llama-notebooks-distillation-distillation_chat_completion -# This file is created by sdk/python/readme.py. -# Please do not edit directly. 
-on: - workflow_dispatch: - schedule: - - cron: "42 4/12 * * *" - pull_request: - branches: - - main - paths: - - sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/** - - .github/workflows/sdk-foundation-models-system-finetune-Llama-notebooks-distillation-distillation_chat_completion.yml - - sdk/python/dev-requirements.txt - - infra/bootstrapping/** - - sdk/python/setup.sh - -permissions: - id-token: write -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: check out repo - uses: actions/checkout@v2 - - name: setup python - uses: actions/setup-python@v2 - with: - python-version: "3.10" - - name: pip install notebook reqs - run: pip install --no-cache-dir -r sdk/python/dev-requirements.txt - - name: azure login - uses: azure/login@v1 - with: - client-id: ${{ secrets.OIDC_AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.OIDC_AZURE_TENANT_ID }} - subscription-id: ${{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }} - - name: bootstrap resources - run: | - echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; - bash bootstrap.sh - working-directory: infra/bootstrapping - continue-on-error: false - - name: setup SDK - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: sdk/python - continue-on-error: true - # - name: validate readme - # run: | - # python check-readme.py "${{ github.workspace }}" "${{ github.workspace }}/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation" - # working-directory: infra/bootstrapping - # continue-on-error: false - - name: setup-cli - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - 
working-directory: cli - continue-on-error: true - - name: run foundation-models/system/finetune/Llama-notebooks/distillation/distillation_chat_completion.ipynb - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; - bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "distillation_chat_completion.ipynb"; - [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; - papermill -k python distillation_chat_completion.ipynb distillation_chat_completion.output.ipynb - working-directory: sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation - - name: upload notebook's working folder as an artifact - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: distillation_chat_completion - path: sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: sdk-foundation-models-system-distillation-nli-distillation_chat_completion +# This file is created by sdk/python/readme.py. +# Please do not edit directly. 
+on: + workflow_dispatch: + schedule: + - cron: "42 4/12 * * *" + pull_request: + branches: + - main + paths: + - sdk/python/foundation-models/system/distillation/nli/ + - .github/workflows/sdk-foundation-models-system-distillation-nli-distillation_chat_completion.yml + - sdk/python/dev-requirements.txt + - infra/bootstrapping/** + - sdk/python/setup.sh + +permissions: + id-token: write +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.10" + - name: pip install notebook reqs + run: pip install --no-cache-dir -r sdk/python/dev-requirements.txt + - name: azure login + uses: azure/login@v1 + with: + client-id: ${{ secrets.OIDC_AZURE_CLIENT_ID }} + tenant-id: ${{ secrets.OIDC_AZURE_TENANT_ID }} + subscription-id: ${{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra/bootstrapping + continue-on-error: false + - name: setup SDK + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: sdk/python + continue-on-error: true + # - name: validate readme + # run: | + # python check-readme.py "${{ github.workspace }}" "${{ github.workspace }}/sdk/python/foundation-models/system/distillation" + # working-directory: infra/bootstrapping + # continue-on-error: false + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: run 
foundation-models/system/distillation/nli/distillation_chat_completion.ipynb + run: | + source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; + bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "distillation_chat_completion.ipynb"; + [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; + papermill -k python distillation_chat_completion.ipynb distillation_chat_completion.output.ipynb + working-directory: sdk/python/foundation-models/system/distillation/nli + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: distillation_chat_completion + path: sdk/python/foundation-models/system/distillation/nli \ No newline at end of file diff --git a/sdk/python/foundation-models/system/distillation/README.md b/sdk/python/foundation-models/system/distillation/README.md new file mode 100644 index 00000000000..974556cf551 --- /dev/null +++ b/sdk/python/foundation-models/system/distillation/README.md @@ -0,0 +1,72 @@ +--- +page_type: sample +languages: +- python +products: +- azure-machine-learning +description: An explanation of model distillation and a step-by-step guide on creating a distilled model. +--- + +# AzureML Model Distillation + +## Table of Contents +- [AzureML Model Distillation](#azureml-model-distillation) +- [Welcome](#welcome) +- [Getting Started](#getting-started) +- [Model Distillation](#model-distillation) +- [Examples](#examples) + + +## Welcome +This document is for anyone interested in exploring and performing model distillation in AzureML. We assume basic knowledge of Python. 
+ +## Getting Started +Please follow this [tutorial](https://learn.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources?view=azureml-api-2) in order to create all the necessary resources to get started. You'll then have everything you need to get started with Azure Machine Learning. The AzureML Workspace is the top-level resource for your machine learning activities, providing a centralized place to view and manage the artifacts you create when you use Azure Machine Learning. The compute resources provide a pre-configured cloud-based environment you can use to train, deploy, automate, manage, and track machine learning models. + +## Model Distillation +Large Language Models (LLMs) are gradually increasing in size. As their size increases, so does the compute resources needed to host these models. Consuming larger language models has the benefit of more accurate responses than smaller language models. With model distillation, we can improve the accuracy of smaller language models by leveraging larger language models. + +During distillation, a smaller LLM "student" learns from a larger LLM "teacher". The teacher teaches the student on specific tasks in order to transfer its knowledge onto the student. Through this distillation process, the student's accuracy improves and less computational resources are needed to host this smaller LLM. + + + +The process of model distillation is a two stage process as seen below. + +```mermaid +sequenceDiagram + participant TM as Teacher Model + participant D as Dataset (Training + Validation) + participant SM as Student Model + + + D->>TM: Generate Training Responses + activate TM + TM-->>D: Responses + deactivate TM + + activate D + D->>TM: Generate Validation Responses + deactivate D + activate TM + TM-->>D: Responses + deactivate TM + + note over D: Datasets Complete + + D->>+SM: Finetune + +``` + +1. The first stage is the synthetic data generation step. 
In this step, using a training dataset, the teacher model is asked to generate responses for the training data. If there is a validation dataset, the teacher model also generates responses for that dataset as well. +2. The second stage is finetuning. Once the synthetic data is collected, the student model is then finetuned on the training and validation data created from the teacher model. This transfers the knowledge from the teacher model to the student model. + + + +## Examples +We currently support numerous task types for model distillation. To view examples on how to distill and consume a model, click on the following task type of interest: +- [NLI (Natural Language Inference)](./nli/distillation_chat_completion.ipynb) +- [Conversation](./conversation/distillation_conversational_task.ipynb) +- [NLU QA (Natural Language Understanding Question and Answer)](./nlu_qa/distillation_nlu_qa_task.ipynb) +- [Math](./math/distillation_math.ipynb) +- Summarization + diff --git a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_conversational_task.ipynb b/sdk/python/foundation-models/system/distillation/conversation/distillation_conversational_task.ipynb similarity index 100% rename from sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_conversational_task.ipynb rename to sdk/python/foundation-models/system/distillation/conversation/distillation_conversational_task.ipynb diff --git a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_math.ipynb b/sdk/python/foundation-models/system/distillation/math/distillation_math.ipynb similarity index 99% rename from sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_math.ipynb rename to sdk/python/foundation-models/system/distillation/math/distillation_math.ipynb index 7fc9c9c7fb8..66c824f1774 100644 --- 
a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_math.ipynb +++ b/sdk/python/foundation-models/system/distillation/math/distillation_math.ipynb @@ -461,7 +461,7 @@ "\n", "For math datasets, such as MultiArith (current dataset), where the answer is numeric, select `MATH` as the `data_generation_task_type`. \n", "\n", - "There exists math datasets where the answer is expected to be a letter. For these datasets, use the [Math Q&A Notebook](distillation_qa_math.ipynb) instead." + "There exists math datasets where the answer is expected to be a letter. For these datasets, use the [Math Q&A Notebook](../nlu_qa/distillation_qa_math.ipynb) instead." ] }, { diff --git a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_chat_completion.ipynb b/sdk/python/foundation-models/system/distillation/nli/distillation_chat_completion.ipynb similarity index 100% rename from sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_chat_completion.ipynb rename to sdk/python/foundation-models/system/distillation/nli/distillation_chat_completion.ipynb diff --git a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_nlu_qa_task.ipynb b/sdk/python/foundation-models/system/distillation/nlu_qa/distillation_nlu_qa_task.ipynb similarity index 100% rename from sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_nlu_qa_task.ipynb rename to sdk/python/foundation-models/system/distillation/nlu_qa/distillation_nlu_qa_task.ipynb diff --git a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_qa_math.ipynb b/sdk/python/foundation-models/system/distillation/nlu_qa/distillation_qa_math.ipynb similarity index 100% rename from sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_qa_math.ipynb rename to 
sdk/python/foundation-models/system/distillation/nlu_qa/distillation_qa_math.ipynb diff --git a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/download_dataset.py b/sdk/python/foundation-models/system/distillation/nlu_qa/download_dataset.py similarity index 96% rename from sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/download_dataset.py rename to sdk/python/foundation-models/system/distillation/nlu_qa/download_dataset.py index 95f1d48a983..bb17a662747 100644 --- a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/download_dataset.py +++ b/sdk/python/foundation-models/system/distillation/nlu_qa/download_dataset.py @@ -1,49 +1,49 @@ -from datasets import load_dataset -from abc import ABC - - -class InputDataset(ABC): - def __init__(self): - super().__init__() - ( - self.train_data_file_name, - self.test_data_file_name, - self.eval_data_file_name, - ) = (None, None, None) - - -class CQnAHuggingFaceInputDataset(InputDataset): - """ - Loads the HuggingFace dataset - """ - - def __init__(self): - super().__init__() - - def load_hf_dataset( - self, - dataset_name, - train_sample_size=10, - val_sample_size=10, - test_sample_size=10, - train_split_name="train", - val_split_name="validation", - test_split_name="test", - ): - full_dataset = load_dataset(dataset_name) - - if val_split_name is not None: - train_data = full_dataset[train_split_name].select(range(train_sample_size)) - val_data = full_dataset[val_split_name].select(range(val_sample_size)) - test_data = full_dataset[test_split_name].select(range(test_sample_size)) - else: - train_val_data = full_dataset[train_split_name].select( - range(train_sample_size + val_sample_size) - ) - train_data = train_val_data.select(range(train_sample_size)) - val_data = train_val_data.select( - range(train_sample_size, train_sample_size + val_sample_size) - ) - test_data = full_dataset[test_split_name].select(range(test_sample_size)) - - return 
train_data, val_data, test_data +from datasets import load_dataset +from abc import ABC + + +class InputDataset(ABC): + def __init__(self): + super().__init__() + ( + self.train_data_file_name, + self.test_data_file_name, + self.eval_data_file_name, + ) = (None, None, None) + + +class CQnAHuggingFaceInputDataset(InputDataset): + """ + Loads the HuggingFace dataset + """ + + def __init__(self): + super().__init__() + + def load_hf_dataset( + self, + dataset_name, + train_sample_size=10, + val_sample_size=10, + test_sample_size=10, + train_split_name="train", + val_split_name="validation", + test_split_name="test", + ): + full_dataset = load_dataset(dataset_name) + + if val_split_name is not None: + train_data = full_dataset[train_split_name].select(range(train_sample_size)) + val_data = full_dataset[val_split_name].select(range(val_sample_size)) + test_data = full_dataset[test_split_name].select(range(test_sample_size)) + else: + train_val_data = full_dataset[train_split_name].select( + range(train_sample_size + val_sample_size) + ) + train_data = train_val_data.select(range(train_sample_size)) + val_data = train_val_data.select( + range(train_sample_size, train_sample_size + val_sample_size) + ) + test_data = full_dataset[test_split_name].select(range(test_sample_size)) + + return train_data, val_data, test_data