diff --git a/.github/workflows/build-release-wheel.yaml b/.github/workflows/build-release-wheel.yaml new file mode 100644 index 00000000000..20457401a6a --- /dev/null +++ b/.github/workflows/build-release-wheel.yaml @@ -0,0 +1,57 @@ +name: Build release wheel + +on: + push: + branches: + - 'release/*' + + workflow_dispatch: + inputs: + gitref: + description: "git tag, commit or branch name for the release" + type: string + required: true + default: 'release/1.8' + +jobs: + build-release-wheel: + runs-on: ubuntu-20.04 + steps: + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.gitref }} + + - name: Build PyPi Wheel + id: build + uses: neuralmagic/nm-actions/actions/pypi_build@main + with: + dev: false + release: true + + - name: Set Env + run: | + pip3 install --upgrade pip && pip3 install --upgrade setuptools + pip3 install virtualenv + virtualenv venv + source venv/bin/activate + + - name: upload whl + uses: actions/upload-artifact@v4 + if: success() || failure() + with: + name: "wheel-sparseml" + path: ${{ steps.build.outputs.whlname }} + retention-days: 7 + + - name: upload tar.gz + uses: actions/upload-artifact@v4 + if: success() || failure() + with: + name: "tar-sparseml" + path: ${{ steps.build.outputs.tarname }} + retention-days: 7 diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml deleted file mode 100644 index 26d9eb31946..00000000000 --- a/.github/workflows/build-release.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: build-release -run-name: ${{ github.workflow }} is to create release wheel file for pypi -on: - push: - branches: - - 'release/[0-9]+.[0-9]+' - workflow_dispatch: - -jobs: - - BUILD-SPARSEML-RELEASE: - - uses: ./.github/workflows/util.yml - with: - runs_on: ubuntu-22.04 - run_id: ${{ github.run_id }} - build_type: release - testmo_project_id: 9 - secrets: inherit diff --git a/examples/llama7b_one_shot_quantization.md 
b/examples/llama7b_one_shot_quantization.md new file mode 100644 index 00000000000..af644897065 --- /dev/null +++ b/examples/llama7b_one_shot_quantization.md @@ -0,0 +1,51 @@ +# Creating a Quantized Llama Model in One Shot + +Quantizing a model to a lower precision can reduce memory usage and improve speed at inference time. +This example demonstrates how to use the SparseML API to quantize a Llama model from 16 bits +to 4 bits and save it to a compressed-tensors format for inference with vLLM. + +## Step 1: Select a model and dataset +For this example, we will use a TinyLlama model and the open platypus dataset; however, +these can be swapped out for any Hugging Face-compatible models and datasets. + +```python +model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" +dataset = "open_platypus" +``` + +## Step 2: Configure a `GPTQModifier` +Modifiers in sparseml are used to apply optimizations to models. In this example we use a +`GPTQModifier` to apply the GPTQ algorithm to our model. We target all `Linear` layers +for 4-bit weight quantization. These options may be swapped out for any valid `QuantizationScheme`. + +```python +from sparseml.modifiers.quantization.gptq import GPTQModifier + +gptq = GPTQModifier( + targets="Linear", + scheme="W4A16", + ignore=["lm_head"], +) +``` + + +## Step 3: One-Shot Compression + +The `oneshot` api applies the created modifier to the target model and dataset. +Setting `save_compressed` to True runs the model through `compressed_tensors` compression +after the quantization is completed. 
+ +```python +from sparseml.transformers import oneshot + +oneshot( + model=model, + dataset=dataset, + recipe=gptq, + save_compressed=True, + output_dir="llama-compressed-example", + overwrite_output_dir=True, + max_seq_length=256, + num_calibration_samples=256, +) +``` diff --git a/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml b/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml index aeddebb8cb3..6f35f511396 100644 --- a/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml +++ b/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml @@ -23,6 +23,7 @@ quantization_stage: run_type: oneshot quantization_modifiers: GPTQModifier: + ignore: [ "lm_head" ] sequential_update: false config_groups: group_0: diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md index 779696ba599..35183345d9c 100644 --- a/examples/llama7b_sparse_quantized/README.md +++ b/examples/llama7b_sparse_quantized/README.md @@ -1,52 +1,88 @@ # Creating a Sparse Quantized Llama7b Model -The example in this folder runs in multiple stages to create a Llama 7b model with -a 2:4 sparsity pattern and W4A16 post training quantization (PTW). The model is -calibrated and trained with the ultachat200k dataset. At least 75GB of GPU memory is -required to run this example. +This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and quantized Llama2-7b model. +The model is calibrated and trained with the ultrachat200k dataset. +At least 85GB of GPU memory is required to run this example. -## Recipe Summary +Follow the steps below one by one in a code notebook, or run the full example script +as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py` -The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml). It contains 3 stages that are outlined below. 
+## Step 1: Select a model, dataset, and recipe +In this step, we select which model to use as a baseline for sparsification, a dataset to +use for calibration and finetuning, and a recipe. +Models can reference a local directory, a model in the Hugging Face hub, or a model in the SparseZoo. -### Stage 1: Sparsification +Datasets can be from a local compatible directory or the huggingface hub. -Runs the SparseGPT one-shot algorithm to prune the model to 50% sparsity with a 2:4 -sparsity pattern. This means that 2 weights out of every group of 4 weights are masked to 0. +Recipes are YAML files that describe how a model should be optimized during or after training. +The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml). +It contains instructions to prune the model to 2:4 sparsity, run one epoch of recovery finetuning, +and quantize to 4 bits in one shot using GPTQ. -### Stage 2: Finetuning Recovery - -This stage runs a single epoch of training on the ultrachat200k dataset while maintaining -the sparsity mask from stage 1. The purpose of this stage is to recover any accuracy lost -during the sparsification process. +```python +import torch +from sparseml.transformers import SparseAutoModelForCausalLM -### Stage 3: Quantization +model_stub = "zoo:llama2-7b-ultrachat200k_llama2_pretrain-base" +model = SparseAutoModelForCausalLM.from_pretrained( + model_stub, torch_dtype=torch.bfloat16, device_map="auto" +) -Finally, we run the GPTQ one-shot algorithm to quantize all linear weights to 4 bit -channelwise. +dataset = "ultrachat-200k" +splits = {"calibration": "train_gen[:5%]", "train": "train_gen"} -## How to Run +recipe = "2:4_w4a16_recipe.yaml" +``` -We can run the entire staged recipe with one call to SparseML's `apply` pathway. This -will save a checkpoint of the model after each stage. +## Step 2: Run sparsification using `apply` +The `apply` function applies the given recipe to our model and dataset. 
+The hardcoded kwargs may be altered based on each model's needs. This code snippet should +be run in the same Python instance as step 1. +After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`. + +```python +from sparseml.transformers import apply + +output_dir = "output_llama7b_2:4_w4a16_channel" + +apply( + model=model, + dataset=dataset, + recipe=recipe, + bf16=False, # use full precision for training + output_dir=output_dir, + splits=splits, + max_seq_length=512, + num_calibration_samples=512, + num_train_epochs=0.5, + logging_steps=500, + save_steps=5000, + gradient_checkpointing=True, + learning_rate=0.0001, + lr_scheduler_type="cosine", + warmup_ratio=0.1, +) +``` -```python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py``` -### Compression +### Step 3: Compression The resulting model will be uncompressed. To save a final compressed copy of the model -run the following: +run the following in the same Python instance as the previous steps. -``` +```python import torch +import os from sparseml.transformers import SparseAutoModelForCausalLM -model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16) +compressed_output_dir = "output_llama7b_2:4_w4a16_channel_compressed" +uncompressed_path = os.path.join(output_dir, "stage_quantization") +model = SparseAutoModelForCausalLM.from_pretrained(uncompressed_path, torch_dtype=torch.bfloat16) model.save_pretrained(compressed_output_dir, save_compressed=True) ``` ### Custom Quantization The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are `tensor`, `group` and `channel`. The above recipe (`2:4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group. -To use quantize per tensor, change strategy from `channel` to `tensor`. 
To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. Group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml` +To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. A group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml`. diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py index c894613ffbb..702218f7db7 100644 --- a/examples/llama7b_w8a8_quantization.py +++ b/examples/llama7b_w8a8_quantization.py @@ -16,12 +16,12 @@ num_bits: 8 type: "int" symmetric: true - strategy: "channel" + strategy: "tensor" input_activations: num_bits: 8 type: "int" symmetric: true - dynamic: True + dynamic: true strategy: "token" targets: ["Linear"] """ @@ -37,7 +37,7 @@ dataset = "ultrachat-200k" # save location of quantized model out -output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed" +output_dir = "./output_llama7b_w8a8_dynamic_compressed" # set dataset config parameters splits = {"calibration": "train_gen[:5%]"} diff --git a/setup.py b/setup.py index 5e55ebec249..413e533aa4c 100644 --- a/setup.py +++ b/setup.py @@ -16,18 +16,25 @@ from typing import Dict, List, Tuple from setuptools import find_packages, setup +from utils.artifacts import get_release_and_version -# default variables to be overwritten by the version.py file -is_release = None -is_dev = None -version = "unknown" -version_major_minor = version +package_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "src", "sparseml" +) +( + is_release, + is_dev, + version, + version_major, + version_minor, + version_bug, +) = get_release_and_version(package_path) # load and overwrite version and release info from sparseml package exec(open(os.path.join("src", "sparseml", "version.py")).read()) 
print(f"loaded version {version} from src/sparseml/version.py") -version_nm_deps = f"{version_major_minor}.0" +version_nm_deps = f"{version_major}.{version_minor}.0" if is_release: _PACKAGE_NAME = "sparseml" @@ -38,7 +45,7 @@ _deps = [ "pyyaml>=5.0.0", - "numpy>=1.17.0", + "numpy>=1.17.0,<2.0", "matplotlib>=3.0.0", "merge-args>=0.1.0", "onnx>=1.5.0,<1.15.0", @@ -56,11 +63,9 @@ "protobuf>=3.12.2,<=3.20.3", "click>=7.1.2,!=8.0.0", # latest version < 8.0 + blocked version with reported bug ] -_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}~={version_nm_deps}"] -_deepsparse_deps = [ - f"{'deepsparse' if is_release else 'deepsparse-nightly'}~={version_nm_deps}" -] -_deepsparse_ent_deps = [f"deepsparse-ent~={version_nm_deps}"] +_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>=1.7.0"] +_deepsparse_deps = [f"{'deepsparse' if is_release else 'deepsparse-nightly'}>=1.7.0"] +_deepsparse_ent_deps = ["deepsparse-ent>=1.7.0"] _onnxruntime_deps = ["onnxruntime>=1.0.0"] _clip_deps = ["open_clip_torch==2.20.0"] diff --git a/src/sparseml/exporters/transforms/kv_cache/configs.py b/src/sparseml/exporters/transforms/kv_cache/configs.py index 686adf5c7d5..d0135a2c133 100644 --- a/src/sparseml/exporters/transforms/kv_cache/configs.py +++ b/src/sparseml/exporters/transforms/kv_cache/configs.py @@ -84,7 +84,7 @@ class KeyValueCacheConfig(BaseModel): "the kv cache. 
If this is not provided, no transpose will " "be applied.", ) - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=()) OPT_CONFIG = KeyValueCacheConfig( diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py index 004fce2ee7a..43bc596d849 100644 --- a/src/sparseml/modifiers/quantization/gptq/base.py +++ b/src/sparseml/modifiers/quantization/gptq/base.py @@ -18,9 +18,9 @@ from pydantic import Field from compressed_tensors.quantization import ( - QuantizationConfig, QuantizationScheme, is_preset_scheme, + preset_name_to_scheme, ) from sparseml.core import Modifier from sparseml.core.factory import ModifierFactory @@ -77,6 +77,7 @@ class GPTQModifier(Modifier): QuantizationScheme except targets, which will be set to the targets parameter set at the modifier level. Can also be set to a dictionary of the format `preset_scheme_name: targets` for example: `W8A8: ['Linear']` for weight 8 bit and activation 8 bit quantization on the Linear layers. + May also be a string naming a preset scheme, which is applied to `targets`. 
""" @@ -89,7 +90,7 @@ class GPTQModifier(Modifier): ignore: List[str] = Field(default_factory=list) disable_quantization_observer_epoch: Optional[float] = None num_calibration_steps: Optional[int] = None - scheme: Optional[Dict[str, Any]] = None + scheme: Optional[Union[str, Dict[str, Any]]] = None compressible_layers_: Optional[List] = None quantization_modifier_: Any = None @@ -167,32 +168,34 @@ def _build_quant_modifier(self, framework): if getattr(self, key, False) } + if isinstance(self.targets, str): + self.targets = [self.targets] + if self.scheme is not None: # takes precedence over config_groups - if any(is_preset_scheme(key) for key in self.scheme.keys()): - config_groups = QuantizationConfig( - config_groups=self.scheme - ).config_groups - quant_args["config_groups"] = config_groups - else: - targets = self.targets or ["Linear"] - config_group = QuantizationScheme.model_validate( - {"targets": targets, **self.scheme} - ) - quant_args["config_groups"] = {"config_group_0": config_group} + if isinstance(self.scheme, str) and is_preset_scheme(self.scheme): + # attach targets to scheme + self.scheme = {self.scheme: self.targets} - targets = self.targets or ["Linear"] - config_group = QuantizationScheme.model_validate( - {"targets": targets, **self.scheme} - ) - quant_args["config_groups"] = {"config_group_0": config_group} + quant_args["config_groups"] = {} + for idx, key in enumerate(self.scheme.keys()): + if is_preset_scheme(key): + scheme = preset_name_to_scheme(key, self.scheme[key]) + else: + scheme = QuantizationScheme.model_validate( + {"targets": self.scheme[key], **self.scheme} + ) + + group_name = f"group_{idx}" + quant_args["config_groups"][group_name] = scheme - if "config_groups" not in quant_args: + # fall back to the default scheme when no config groups were built (missing or empty) + if not quant_args.get("config_groups"): default_quant_scheme = QuantizationScheme.default_scheme( targets=self.targets ) - quant_args["config_groups"] = 
{"config_group_0": default_quant_scheme} + quant_args["config_groups"] 
= {"group_0": default_quant_scheme} _LOGGER.info(f"Building quantization modifier with args: {quant_args}") vllm_quant_config = {"QuantizationModifier": quant_args} self._build_quant_modifier_from_dict(vllm_quant_config, framework) diff --git a/src/sparseml/modifiers/quantization_legacy/base.py b/src/sparseml/modifiers/quantization_legacy/base.py index 9b9f1569f09..27856774c00 100644 --- a/src/sparseml/modifiers/quantization_legacy/base.py +++ b/src/sparseml/modifiers/quantization_legacy/base.py @@ -14,6 +14,8 @@ from typing import Any, Dict, List, Optional +from pydantic import ConfigDict + from sparseml.core import Event, Modifier @@ -81,6 +83,8 @@ class LegacyQuantizationModifier(Modifier): post_oneshot_calibration: Optional[bool] = False strict: bool = True + model_config = ConfigDict(protected_namespaces=()) + def __init__(self, **kwargs): super().__init__(**kwargs) if self.model_fuse_fn_kwargs is None: diff --git a/src/sparseml/version.py b/src/sparseml/version.py index ffe77da583a..1279a498a4b 100644 --- a/src/sparseml/version.py +++ b/src/sparseml/version.py @@ -16,16 +16,20 @@ Functionality for storing and setting the version info for SparseML """ -from datetime import date - - version_base = "1.8.0" is_release = False # change to True to set the generated version as a release version is_dev = False dev_number = None -def _generate_version(): +def _generate_version( + is_release: bool, + is_dev: bool, + version_base: str, + dev_number: str, +): + from datetime import date + if is_release: return version_base elif is_dev: @@ -45,7 +49,7 @@ def _generate_version(): "version_build", "version_major_minor", ] -__version__ = _generate_version() +__version__ = _generate_version(is_release, is_dev, version_base, dev_number) version = __version__ version_major, version_minor, version_bug, version_build = version.split(".") + ( diff --git a/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py 
b/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py index 0fcb66eee9c..1b9f365bebf 100644 --- a/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py +++ b/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py @@ -95,7 +95,7 @@ def test_create_default_quant_modifier(self): modifier.on_initialize_structure(testing_harness.get_state()) assert modifier.quantize assert isinstance(modifier.quantization_modifier_, QuantizationModifier) - default_config_group_name = "config_group_0" + default_config_group_name = "group_0" should_be_default_quant_scheme = modifier.quantization_modifier_.config_groups[ default_config_group_name ] diff --git a/tests/sparseml/transformers/gptq/test_oneshot.py b/tests/sparseml/transformers/gptq/test_oneshot.py index c7c14275df1..1d2e28cc303 100644 --- a/tests/sparseml/transformers/gptq/test_oneshot.py +++ b/tests/sparseml/transformers/gptq/test_oneshot.py @@ -16,11 +16,57 @@ import shutil import unittest +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from parameterized import parameterized_class +from sparseml.modifiers.quantization.gptq import GPTQModifier from sparseml.transformers.sparsification.sparse_model import SparseAutoModelForCausalLM from tests.testing_utils import requires_torch +recipe_str = """ +quant_stage: + quant_modifiers: + GPTQModifier: + sequential_update: false + ignore: ["lm_head"] + config_groups: + group_0: + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: "channel" + targets: ["Linear"] +""" + +recipe_modifier_full = GPTQModifier( + ignore=["lm_head"], + sequential_update=False, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], weights=QuantizationArgs(num_bits=4, strategy="channel") + ) + }, +) + +recipe_modifier_shorthand_a = GPTQModifier( + ignore=["lm_head"], sequential_update=False, targets="Linear", scheme="W4A16" +) + +recipe_modifier_shorthand_b = GPTQModifier( + ignore=["lm_head"], 
sequential_update=False, scheme={"W4A16": ["Linear"]} +) + + @requires_torch +@parameterized_class( + [ + {"recipe": recipe_str}, + {"recipe": recipe_modifier_full}, + {"recipe": recipe_modifier_shorthand_a}, + {"recipe": recipe_modifier_shorthand_b}, + ] +) class TestGPTQOneShotWithFullScheme(unittest.TestCase): def setUp(self): import torch @@ -30,26 +76,6 @@ def setUp(self): self.dataset = "open_platypus" self.device = "cuda:0" if torch.cuda.is_available() else "cpu" - self.recipe = """ - first_stage: - quant_modifiers: - GPTQModifier: - ignore: ["lm_head"] - sequential_update: True - dampening_frac: 0.001 - block_size: 128 - targets: ["Linear"] - scheme: - input_activations: null - output_activations: null - weights: - num_bits: 8 - type: "int" - symmetric: true - strategy: "tensor" - group_size: 128 - """ - def test_oneshot_application(self): from sparseml.transformers import oneshot @@ -68,9 +94,23 @@ def test_oneshot_application(self): # Check that the model is quantized assert model_loaded.quantization_config is not None + # check config is set properly + assert model_loaded.quantization_config.ignore == ["lm_head"] + assert len(model_loaded.quantization_config.config_groups) == 1 + quant_scheme = model_loaded.quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + assert quant_scheme.targets == ["Linear"] + weight_args = model_loaded.quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + # Check a specific layer is quantized targetted_linear_layer = model_loaded.transformer.h[0].attn.attention.k_proj assert hasattr(targetted_linear_layer, "quantization_scheme") + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") + def tearDown(self): shutil.rmtree(self.output) diff --git a/tests/sparseml/transformers/obcq/recipes/quant.yaml 
b/tests/sparseml/transformers/obcq/recipes/quant.yaml index f5436b3873f..9c5a6ac6209 100644 --- a/tests/sparseml/transformers/obcq/recipes/quant.yaml +++ b/tests/sparseml/transformers/obcq/recipes/quant.yaml @@ -6,32 +6,7 @@ test_stage: [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"], [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"] ] - LegacyQuantizationModifier: - ignore: - - LlamaRotaryEmbedding - - LlamaRMSNorm - - SiLU - - model.layers.0.mlp.down_proj - - model.layers.1.mlp.down_proj - - model.layers.2.mlp.down_proj - - model.layers.3.mlp.down_proj - - model.layers.4.mlp.down_proj - - model.layers.5.mlp.down_proj - scheme_overrides: - Embedding: - input_activations: null - weights: - num_bits: 8 - symmetric: False GPTQModifier: block_size: 128 sequential_update: False - percdamp: 0.01 - targets: [ - "model.layers.0", - "model.layers.1", - "model.layers.2", - "model.layers.3", - "model.layers.4", - "model.layers.5" - ] \ No newline at end of file + percdamp: 0.01 \ No newline at end of file diff --git a/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml b/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml index 198b32f0e3c..643ba175597 100644 --- a/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml +++ b/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml @@ -1,5 +1,11 @@ test_stage: obcq_modifiers: + SparseGPTModifier: + sparsity: 0.5 + block_size: 128 + sequential_update: False + percdamp: 0.01 + mask_structure: "0:0" SmoothQuantModifier: smoothing_strength: 0.5 mappings: [ @@ -11,13 +17,6 @@ test_stage: - LlamaRotaryEmbedding - LlamaRMSNorm - SiLU - - model.layers.0.mlp.down_proj - - model.layers.1.mlp.down_proj - - model.layers.2.mlp.down_proj - - model.layers.3.mlp.down_proj - - model.layers.4.mlp.down_proj - - model.layers.5.mlp.down_proj - post_oneshot_calibration: True scheme_overrides: Embedding: input_activations: null @@ -27,26 +26,4 @@ test_stage: 
GPTQModifier: block_size: 128 sequential_update: False - percdamp: 0.01 - targets: [ - "model.layers.0", - "model.layers.1", - "model.layers.2", - "model.layers.3", - "model.layers.4", - "model.layers.5" - ] - SparseGPTModifier: - sparsity: 0.5 - block_size: 128 - sequential_update: False - percdamp: 0.01 - mask_structure: "0:0" - targets: [ - "model.layers.0", - "model.layers.1", - "model.layers.2", - "model.layers.3", - "model.layers.4", - "model.layers.5" - ] \ No newline at end of file + percdamp: 0.01 \ No newline at end of file diff --git a/tests/sparseml/transformers/obcq/recipes/sparse.yaml b/tests/sparseml/transformers/obcq/recipes/sparse.yaml index 70ffc7bf784..4309a066377 100644 --- a/tests/sparseml/transformers/obcq/recipes/sparse.yaml +++ b/tests/sparseml/transformers/obcq/recipes/sparse.yaml @@ -5,9 +5,8 @@ test_stage: block_size: 128 sequential_update: False percdamp: 0.01 - mask_structure: "0:0" targets: [ "model.layers.0", "model.layers.1", - "lm_head" - ] \ No newline at end of file + ] + mask_structure: "0:0" \ No newline at end of file diff --git a/tests/sparseml/transformers/obcq/test_obcq_sparsity.py b/tests/sparseml/transformers/obcq/test_obcq_sparsity.py index c6c80b301aa..d8e25271c9b 100644 --- a/tests/sparseml/transformers/obcq/test_obcq_sparsity.py +++ b/tests/sparseml/transformers/obcq/test_obcq_sparsity.py @@ -60,8 +60,6 @@ def test_sparsities(self): model = get_session_model() - lm_head_sparsity = tensor_sparsity(model.lm_head.weight) - assert math.isclose(lm_head_sparsity.item(), self.sparsity, rel_tol=1e-4) layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight) assert math.isclose(layer_1_sparse.item(), self.sparsity, rel_tol=1e-4) layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight) @@ -118,8 +116,6 @@ def test_sparsities_gpu(self): model = get_session_model() - lm_head_sparsity = tensor_sparsity(model.lm_head.weight) - assert math.isclose(lm_head_sparsity.item(), self.sparsity, 
rel_tol=1e-4) layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight) assert math.isclose(layer_1_sparse.item(), self.sparsity, rel_tol=1e-4) layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight) diff --git a/utils/artifacts.py b/utils/artifacts.py new file mode 100644 index 00000000000..a93bda61122 --- /dev/null +++ b/utils/artifacts.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +from typing import Tuple + + +def get_release_and_version(package_path: str) -> Tuple[bool, bool, str, str, str, str]: + """ + Load version and release info from deepsparse package + """ + # deepsparse/src/deepsparse/version.py always exists, default source of truth + version_path = os.path.join(package_path, "version.py") + + # exec() cannot set local variables so need to manually + locals_dict = {} + exec(open(version_path).read(), globals(), locals_dict) + is_release = locals_dict.get("is_release", False) + is_dev = locals_dict.get("is_dev", False) + version = locals_dict.get("version", "unknown") + version_major = locals_dict.get("version_major", "unknown") + version_minor = locals_dict.get("version_minor", "unknown") + version_bug = locals_dict.get("version_bug", "unknown") + + print(f"Loaded version {version} from {version_path}") + + return ( + is_release, + is_dev, + version, + version_major, + version_minor, + version_bug, + )