Merge branch 'main' into fix/damian/sparseautomodel

dbogunowicz authored Jul 2, 2024
2 parents 03a3939 + f0a3692 commit 6c0161b

Showing 18 changed files with 344 additions and 170 deletions.
57 changes: 57 additions & 0 deletions .github/workflows/build-release-wheel.yaml
@@ -0,0 +1,57 @@
name: Build release wheel

on:
  push:
    branches:
      - 'release/*'

  workflow_dispatch:
    inputs:
      gitref:
        description: "git tag, commit or branch name for the release"
        type: string
        required: true
        default: 'release/1.8'

jobs:
  build-release-wheel:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.gitref }}

      - name: Build PyPi Wheel
        id: build
        uses: neuralmagic/nm-actions/actions/pypi_build@main
        with:
          dev: false
          release: true

      - name: Set Env
        run: |
          pip3 install --upgrade pip && pip3 install --upgrade setuptools
          pip3 install virtualenv
          virtualenv venv
          source venv/bin/activate
      - name: upload whl
        uses: actions/upload-artifact@v4
        if: success() || failure()
        with:
          name: "wheel-sparseml"
          path: ${{ steps.build.outputs.whlname }}
          retention-days: 7

      - name: upload tar.gz
        uses: actions/upload-artifact@v4
        if: success() || failure()
        with:
          name: "tar-sparseml"
          path: ${{ steps.build.outputs.tarname }}
          retention-days: 7
19 changes: 0 additions & 19 deletions .github/workflows/build-release.yml

This file was deleted.

51 changes: 51 additions & 0 deletions examples/llama7b_one_shot_quantization.md
@@ -0,0 +1,51 @@
# Creating a Quantized Llama Model in One Shot

Quantizing a model to a lower precision can reduce both memory usage and latency at inference time.
This example demonstrates how to use the SparseML API to quantize a Llama model from 16 bits
to 4 bits and save it to a compressed-tensors format for inference with vLLM.

## Step 1: Select a model and dataset
For this example, we will use a TinyLlama model and the `open_platypus` dataset; however,
these can be swapped out for any huggingface-compatible model and dataset.

```python
model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
dataset = "open_platypus"
```

## Step 2: Configure a `GPTQModifier`
Modifiers in sparseml are used to apply optimizations to models. In this example we use a
`GPTQModifier` to apply the GPTQ algorithm to our model. We target all `Linear` layers
for 4-bit weight quantization. These options may be swapped out for any valid `QuantizationScheme`.

```python
from sparseml.modifiers.quantization.gptq import GPTQModifier

gptq = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=["lm_head"],
)
```


## Step 3: One-Shot Compression

The `oneshot` API applies the created modifier to the target model and dataset.
Setting `save_compressed` to `True` runs the model through `compressed_tensors` compression
after quantization is completed.

```python
from sparseml.transformers import oneshot

oneshot(
    model=model,
    dataset=dataset,
    recipe=gptq,
    save_compressed=True,
    output_dir="llama-compressed-example",
    overwrite_output_dir=True,
    max_seq_length=256,
    num_calibration_samples=256,
)
```
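
The saved `llama-compressed-example` directory can then be loaded for inference. Below is a minimal sketch, assuming a vLLM installation with compressed-tensors support; the prompt and sampling settings are illustrative only and not part of the original example.

```python
from vllm import LLM, SamplingParams

# Load the compressed-tensors checkpoint produced by the oneshot call above.
llm = LLM(model="llama-compressed-example")

# Run a short greedy generation as a quick sanity check of the quantized model.
outputs = llm.generate(
    ["Explain 4-bit weight quantization in one sentence."],
    SamplingParams(max_tokens=64, temperature=0.0),
)
print(outputs[0].outputs[0].text)
```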
@@ -23,6 +23,7 @@ quantization_stage:
    run_type: oneshot
    quantization_modifiers:
        GPTQModifier:
            ignore: [ "lm_head" ]
            sequential_update: false
            config_groups:
                group_0:
88 changes: 62 additions & 26 deletions examples/llama7b_sparse_quantized/README.md
@@ -1,52 +1,88 @@
# Creating a Sparse Quantized Llama7b Model

The example in this folder runs in multiple stages to create a Llama 7b model with
a 2:4 sparsity pattern and W4A16 post-training quantization (PTQ). The model is
calibrated and trained with the ultrachat200k dataset. At least 75GB of GPU memory is
required to run this example.
This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and quantized Llama2-7b model.
The model is calibrated and trained with the ultrachat200k dataset.
At least 85GB of GPU memory is required to run this example.

## Recipe Summary
Follow the steps below one by one in a code notebook, or run the full example script
as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py`

The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml). It contains 3 stages that are outlined below.
## Step 1: Select a model, dataset, and recipe
In this step, we select which model to use as a baseline for sparsification, a dataset to
use for calibration and finetuning, and a recipe.

Models can reference a local directory, a model in the huggingface hub, or a stub from the sparsezoo.

### Stage 1: Sparsification
Datasets can come from a compatible local directory or the huggingface hub.

Runs the SparseGPT one-shot algorithm to prune the model to 50% sparsity with a 2:4
sparsity pattern. This means that 2 weights out of every group of 4 weights are masked to 0.
Recipes are YAML files that describe how a model should be optimized during or after training.
The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml).
It contains instructions to prune the model to 2:4 sparsity, run one epoch of recovery finetuning,
and quantize to 4 bits in one shot using GPTQ.

### Stage 2: Finetuning Recovery

This stage runs a single epoch of training on the ultrachat200k dataset while maintaining
the sparsity mask from stage 1. The purpose of this stage is to recover any accuracy lost
during the sparsification process.
```python
import torch
from sparseml.transformers import SparseAutoModelForCausalLM

### Stage 3: Quantization
model_stub = "zoo:llama2-7b-ultrachat200k_llama2_pretrain-base"
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub, torch_dtype=torch.bfloat16, device_map="auto"
)

Finally, we run the GPTQ one-shot algorithm to quantize all linear weights to 4 bit
channelwise.
dataset = "ultrachat-200k"
splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}

## How to Run
recipe = "2:4_w4a16_recipe.yaml"
```

We can run the entire staged recipe with one call to SparseML's `apply` pathway. This
will save a checkpoint of the model after each stage.
## Step 2: Run sparsification using `apply`
The `apply` function applies the given recipe to our model and dataset.
The hardcoded kwargs may be altered based on each model's needs. This code snippet should
be run in the same Python instance as step 1.
After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`.

```python
from sparseml.transformers import apply

output_dir = "output_llama7b_2:4_w4a16_channel"

apply(
    model=model,
    dataset=dataset,
    recipe=recipe,
    bf16=False,  # use full precision for training
    output_dir=output_dir,
    splits=splits,
    max_seq_length=512,
    num_calibration_samples=512,
    num_train_epochs=0.5,
    logging_steps=500,
    save_steps=5000,
    gradient_checkpointing=True,
    learning_rate=0.0001,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
)
```

```python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py```

### Compression
### Step 3: Compression

The resulting model will be uncompressed. To save a final compressed copy of the model,
run the following:
run the following in the same Python instance as the previous steps.

```
```python
import torch
import os
from sparseml.transformers import SparseAutoModelForCausalLM

model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
compressed_output_dir = "output_llama7b_2:4_w4a16_channel_compressed"
uncompressed_path = os.path.join(output_dir, "stage_quantization")
model = SparseAutoModelForCausalLM.from_pretrained(uncompressed_path, torch_dtype=torch.bfloat16)
model.save_pretrained(compressed_output_dir, save_compressed=True)
```

### Custom Quantization
The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are `tensor`, `group` and `channel`.
The above recipe (`2:4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group.
To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. Group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml`
To quantize per tensor, change the strategy from `channel` to `tensor`. To use group-size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. A group-size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml` and sketched below.
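
For illustration only, here is a minimal sketch of what a group-size config group might look like. The `strategy: "group"` and `group_size: 128` keys come from the prose above; the surrounding keys mirror the w8a8 recipe elsewhere in this commit, so consult `2:4_w4a16_group-128_recipe.yaml` for the authoritative settings.

```yaml
config_groups:
  group_0:
    weights:
      num_bits: 4          # W4A16: 4-bit weights
      type: "int"
      symmetric: true
      strategy: "group"    # group-size quantization instead of "channel"
      group_size: 128      # number of weights sharing one scale/zero-point
    targets: ["Linear"]
```
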
6 changes: 3 additions & 3 deletions examples/llama7b_w8a8_quantization.py
@@ -16,12 +16,12 @@
        num_bits: 8
        type: "int"
        symmetric: true
        strategy: "channel"
        strategy: "tensor"
    input_activations:
        num_bits: 8
        type: "int"
        symmetric: true
        dynamic: True
        dynamic: true
        strategy: "token"
    targets: ["Linear"]
"""
@@ -37,7 +37,7 @@
dataset = "ultrachat-200k"

# save location of quantized model out
output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed"
output_dir = "./output_llama7b_w8a8_dynamic_compressed"

# set dataset config parameters
splits = {"calibration": "train_gen[:5%]"}
29 changes: 17 additions & 12 deletions setup.py
@@ -16,18 +16,25 @@
from typing import Dict, List, Tuple

from setuptools import find_packages, setup
from utils.artifacts import get_release_and_version


# default variables to be overwritten by the version.py file
is_release = None
is_dev = None
version = "unknown"
version_major_minor = version
package_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "src", "sparseml"
)
(
    is_release,
    is_dev,
    version,
    version_major,
    version_minor,
    version_bug,
) = get_release_and_version(package_path)

# load and overwrite version and release info from sparseml package
exec(open(os.path.join("src", "sparseml", "version.py")).read())
print(f"loaded version {version} from src/sparseml/version.py")
version_nm_deps = f"{version_major_minor}.0"
version_nm_deps = f"{version_major}.{version_minor}.0"

if is_release:
_PACKAGE_NAME = "sparseml"
@@ -38,7 +45,7 @@

_deps = [
"pyyaml>=5.0.0",
"numpy>=1.17.0",
"numpy>=1.17.0,<2.0",
"matplotlib>=3.0.0",
"merge-args>=0.1.0",
"onnx>=1.5.0,<1.15.0",
@@ -56,11 +63,9 @@
"protobuf>=3.12.2,<=3.20.3",
"click>=7.1.2,!=8.0.0", # latest version < 8.0 + blocked version with reported bug
]
_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}~={version_nm_deps}"]
_deepsparse_deps = [
f"{'deepsparse' if is_release else 'deepsparse-nightly'}~={version_nm_deps}"
]
_deepsparse_ent_deps = [f"deepsparse-ent~={version_nm_deps}"]
_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>=1.7.0"]
_deepsparse_deps = [f"{'deepsparse' if is_release else 'deepsparse-nightly'}>=1.7.0"]
_deepsparse_ent_deps = ["deepsparse-ent>=1.7.0"]

_onnxruntime_deps = ["onnxruntime>=1.0.0"]
_clip_deps = ["open_clip_torch==2.20.0"]
2 changes: 1 addition & 1 deletion src/sparseml/exporters/transforms/kv_cache/configs.py
@@ -84,7 +84,7 @@ class KeyValueCacheConfig(BaseModel):
"the kv cache. If this is not provided, no transpose will "
"be applied.",
)
model_config = ConfigDict(arbitrary_types_allowed=True)
model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=())


OPT_CONFIG = KeyValueCacheConfig(