
Commit

Merge branch 'main' into update-nightly-build
dsikka authored Jun 12, 2024
2 parents c9d85b5 + e255b17 commit 31e808b
Showing 10 changed files with 185 additions and 95 deletions.
4 changes: 2 additions & 2 deletions examples/llama7b_sparse_quantized/README.md
@@ -40,7 +40,7 @@ run the following:

```
import torch
from sparseml import SparseAutoModelForCausalLM
from sparseml.transformers import SparseAutoModelForCausalLM
model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
model.save_pretrained(compressed_output_dir, save_compressed=True)
```
@@ -49,4 +49,4 @@ model.save_pretrained(compressed_output_dir, save_compressed=True)
### Custom Quantization
The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are `tensor`, `group` and `channel`.
The above recipe (`2:4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group.
To quantize per tensor, change the strategy from `channel` to `tensor`. To use group-size quantization, change from `channel` to `group` and specify the group size, for example 128, by including `group_size: 128`. An example of group-size quantization is shown in `2:4_w4a16_group-128_recipe.yaml`.
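
For illustration, a group-size config group might look like the sketch below. Only the `strategy` and `group_size` keys come from the description above; the surrounding structure is an assumption and should be checked against the bundled `2:4_w4a16_group-128_recipe.yaml`:

```
# hypothetical fragment -- mirror the structure of the bundled recipes
config_groups:
    group_0:
        weights:
            num_bits: 4
            symmetric: true
            strategy: "group"
            group_size: 128
```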
2 changes: 1 addition & 1 deletion examples/llama7b_w4a16_quantization.ipynb
@@ -153,7 +153,7 @@
"metadata": {},
"outputs": [],
"source": [
"model.save_pretrained(\"/network/sadkins/llama1.1b_W4A16_channel_packed\", save_compressed=True)"
"model.save_pretrained(\"llama1.1b_W4A16_channel_packed\", save_compressed=True)"
]
}
],
43 changes: 20 additions & 23 deletions src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -103,6 +103,14 @@ def fasterprune(
W = W.t()
W = W.float()

sparsity = tensor_sparsity(W)
preserve_zeros = sparsity >= SPARSITY_THRESHOLD
W_nz_mask = (
(~torch.isclose(W, torch.zeros(1, device=W.device).float())).float()
if preserve_zeros
else None
)

tick = time.time()

dead = torch.diag(self.H) == 0
@@ -119,17 +127,6 @@ def fasterprune(
self.H = torch.linalg.cholesky(self.H, upper=True)
Hinv = self.H

sparsity = tensor_sparsity(W)
mask = (
torch.where(
W == 0,
torch.tensor(1, dtype=torch.bool),
torch.tensor(0, dtype=torch.bool),
)
if sparsity >= SPARSITY_THRESHOLD
else None
)

# See section 3.4 of https://arxiv.org/abs/2203.07259
for i1 in range(0, self.columns, blocksize):
i2 = min(i1 + blocksize, self.columns)
@@ -141,21 +138,13 @@ def fasterprune(
Losses1 = torch.zeros_like(W1)
Hinv1 = Hinv[i1:i2, i1:i2]

if sparsity >= SPARSITY_THRESHOLD:
tmp = (
(~mask[:, i1:i2])
* W1**2
/ (torch.diag(Hinv1).reshape((1, -1))) ** 2
)
thresh = torch.sort(tmp.flatten())[0][int(tmp.numel() * sparsity)]
mask1 = tmp <= thresh
if preserve_zeros:
W1_nz_mask = W_nz_mask[:, i1:i2]

for i in range(count):
w = W1[:, i]
d = Hinv1[i, i]
q = w.clone()
if sparsity >= SPARSITY_THRESHOLD:
q[mask1[:, i]] = 0

if hasattr(self.layer, "weight_fake_quant"):
scale = self.layer.weight_fake_quant.scale
@@ -216,13 +205,21 @@ def fasterprune(
Losses1[:, i] = (w - q) ** 2 / d**2

err1 = (w - q) / d
W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
w1_err = err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
if preserve_zeros:
W1[:, i:] -= w1_err * W1_nz_mask[:, i:]
else:
W1[:, i:] -= w1_err
Err1[:, i] = err1

W[:, i1:i2] = Q1
Losses += torch.sum(Losses1, 1) / 2

W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
w_err = Err1.matmul(Hinv[i1:i2, i2:])
if preserve_zeros:
W[:, i2:] -= w_err * W_nz_mask[:, i2:]
else:
W[:, i2:] -= w_err

_LOGGER.info("time %.2f" % (time.time() - tick))
_LOGGER.info("error %.2f" % torch.sum(Losses).item())
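The intent of the `preserve_zeros` / `W_nz_mask` change above is that weights already pruned to zero stay zero while the GPTQ quantization error is propagated. A minimal standalone sketch of that masking idea (toy tensors, not the wrapper's real inputs):

```python
import torch

# toy weight block in which half the entries were pruned to zero
W1 = torch.tensor([[0.0, 0.5],
                   [1.2, 0.0]])
err_update = torch.full_like(W1, 0.1)  # stand-in for the err1 * Hinv1 update

# mask of originally non-zero weights, analogous to W_nz_mask above
nz_mask = (~torch.isclose(W1, torch.zeros(1))).float()

# gate the error propagation so pruned (zero) weights stay exactly zero
W1 -= err_update * nz_mask
print(W1)  # zeros are preserved; only non-zero weights absorb the error
```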
31 changes: 9 additions & 22 deletions src/sparseml/pytorch/utils/helpers.py
@@ -20,7 +20,6 @@
import os
import random
import re
import warnings
from collections import OrderedDict, namedtuple
from contextlib import contextmanager
from copy import deepcopy
@@ -30,7 +29,7 @@
import torch
from packaging import version
from torch import Tensor
from torch.nn import Linear, Module, Parameter
from torch.nn import Embedding, Linear, Module, Parameter
from torch.nn.modules.conv import Conv2d, Conv3d, _ConvNd
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader
@@ -780,6 +779,7 @@ def get_prunable_layers(module: Module) -> List[Tuple[str, Module]]:
for (name, mod) in module.named_modules()
if (
isinstance(mod, Linear)
or isinstance(mod, Embedding)
or isinstance(mod, _ConvNd)
or (QATLinear and isinstance(mod, QATLinear))
or (QATConv2d and isinstance(mod, QATConv2d))
@@ -793,7 +793,7 @@ def get_quantizable_layers(module: Module) -> List[Tuple[str, Module]]:
"""
:param module: the module to get the quantizable layers from
:return: a list containing the names and modules of the quantizable layers
(Linear, Conv2d, Conv3d)
(Embedding, Linear, Conv2d, Conv3d)
"""
if QATLinear is None:
raise ImportError(
@@ -806,6 +806,7 @@ def get_quantizable_layers(module: Module) -> List[Tuple[str, Module]]:
for (name, mod) in module.named_modules()
if (
isinstance(mod, Linear)
or isinstance(mod, Embedding)
or isinstance(mod, Conv2d)
or (QATConv3d and isinstance(mod, Conv3d))
)
@@ -816,29 +817,15 @@ def get_quantized_layers(module: Module) -> List[Tuple[str, Module]]:
"""
:param module: the module to get the quantized layers from
:return: a list containing the names and modules of the quantized layers
(Linear, Conv2d, Conv3d)
(Embedding, Linear, Conv2d, Conv3d)
"""
if QATLinear is None:
raise ImportError(
"PyTorch version is not setup for Quantization. "
"Please install a QAT compatible version of PyTorch"
)

quantized_layers = []
for (name, mod) in module.named_modules():
if (
(QATLinear and isinstance(mod, QATLinear))
or (QATConv2d and isinstance(mod, QATConv2d))
or (QATConv3d and isinstance(mod, QATConv3d))
):
quantized_layers.append((name, mod))

elif isinstance(mod, Conv3d) and not QATConv3d:
warnings.warn(
"Pytorch version is not setup for Conv3D Quantization. "
"Quantization of Conv3D layers will be skipped",
UserWarning,
)
if hasattr(mod, "quantization_scheme"):
weight_scheme = getattr(mod.quantization_scheme, "weights", None)
if weight_scheme is not None and hasattr(mod, "weight"):
quantized_layers.append((name, mod))

return quantized_layers

101 changes: 101 additions & 0 deletions src/sparseml/transformers/compression/helpers.py
@@ -0,0 +1,101 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Optional

import torch
from tqdm import tqdm

from sparseml.pytorch.utils import get_linear_layers


__ALL__ = [
    "tensor_follows_mask_structure",
    "infer_sparsity_structure_from_stage_modifiers",
    "infer_sparsity_structure_from_model",
]


def tensor_follows_mask_structure(tensor, mask: str = "2:4") -> bool:
    """
    :param tensor: tensor to check
    :param mask: mask structure to check for, in the format "n:m"
    :return: True if the tensor follows the mask structure, False otherwise.
        Note that some weights can incidentally be zero, so we check for
        at least n zeros in each chunk of size m
    """

    n, m = tuple(map(int, mask.split(":")))
    # Reshape the tensor into chunks of size m
    tensor = tensor.view(-1, m)

    # Count the number of zeros in each chunk
    zero_counts = (tensor == 0).sum(dim=1)

    # Check that the number of zeros in each chunk is at least n.
    # A greater-than-or-equal comparison is needed because some weights
    # can incidentally be zero
    return torch.all(zero_counts >= n).item()


def infer_sparsity_structure_from_stage_modifiers(
    stage_modifiers: List["StageModifier"],  # noqa E501
) -> Optional[str]:
    """
    Determines the sparsity structure, if any exists, given the
    list of stage modifiers
    :param stage_modifiers: non-empty list of stage modifiers
    :return: sparsity structure as a string or None
    """
    for stage in stage_modifiers:
        if stage.applied:
            for modifier in stage.modifiers:
                if hasattr(modifier, "mask_structure"):
                    sparsity_structure = modifier.mask_structure
                    return sparsity_structure
    return None


def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str]:
    """
    Determines the sparsity structure, if any exists, given the model
    :param model: model to check for sparsity structure
    :return: sparsity structure as a string or None
    """

    # check for the common sparsity structures
    structures = {"2:4"}
    for sparsity_structure in structures:
        linear_modules = get_linear_layers(model)
        linear_modules_with_sparsity_structure = [
            tensor_follows_mask_structure(layer.weight)
            for layer in tqdm(
                linear_modules.values(),
                desc="Checking whether model follows "
                f"{sparsity_structure} sparsity structure",
            )
        ]
        # if the majority of the linear modules follow the sparsity structure
        # we can assume that the model follows the sparsity structure
        # (taking into consideration the fact that some Linear layers like the
        # embedding layer might not be sparse)
        if (
            sum(linear_modules_with_sparsity_structure)
            > len(linear_modules_with_sparsity_structure) * 0.8
        ):
            return sparsity_structure

    return None
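
A hypothetical usage sketch for the new helper (the import path is inferred from the file location, and the weight tensors are made up):

```python
import torch

from sparseml.transformers.compression.helpers import tensor_follows_mask_structure

# two zeros in every contiguous group of four satisfies the 2:4 structure
pruned = torch.tensor([[0.0, 0.3, 0.0, -1.1],
                       [0.7, 0.0, 0.0, 0.2]])
assert tensor_follows_mask_structure(pruned, mask="2:4")

# a dense tensor does not
dense = torch.ones(2, 4)
assert not tensor_follows_mask_structure(dense, mask="2:4")
```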
40 changes: 27 additions & 13 deletions src/sparseml/transformers/compression/sparsity_config.py
@@ -21,6 +21,10 @@
from compressed_tensors import CompressionFormat, SparsityCompressionConfig
from compressed_tensors.quantization.utils import is_model_quantized
from sparseml.pytorch.utils import ModuleSparsificationInfo
from sparseml.transformers.compression.helpers import (
infer_sparsity_structure_from_model,
infer_sparsity_structure_from_stage_modifiers,
)


class SparsityConfigMetadata:
@@ -47,26 +51,34 @@ def infer_global_sparsity(
return global_sparsity

@staticmethod
def infer_sparsity_structure() -> str:
def infer_sparsity_structure(model: Optional[Module] = None) -> str:
"""
Determines what sparsity structure, if any, was applied in the currently active
sparse session
Determines what sparsity structure, if any, was applied.
First, there is an attempt to deduce the sparsity structure
from the currently active sparse session.
If that fails, the sparsity structure is inferred from the
model (if provided).
Finally, if both fail, the sparsity structure is set to
"unstructured".
:return: sparsity structure as a string
"""
sparsity_structure = None

current_session = sparseml.active_session()
stage_modifiers = current_session.lifecycle.modifiers
sparsity_structure = "unstructured"
if stage_modifiers:
sparsity_structure = infer_sparsity_structure_from_stage_modifiers(
stage_modifiers
)

# check for applied pruning modifiers
for stage in stage_modifiers:
if stage.applied:
for modifier in stage.modifiers:
if hasattr(modifier, "mask_structure"):
sparsity_structure = modifier.mask_structure
break
if model and sparsity_structure is None:
sparsity_structure = infer_sparsity_structure_from_model(model)

return sparsity_structure
return sparsity_structure or "unstructured"

@staticmethod
def from_pretrained(
@@ -91,7 +103,9 @@ def from_pretrained(
if global_sparsity < 0.05:
return None

sparsity_structure = SparsityConfigMetadata.infer_sparsity_structure()
sparsity_structure = SparsityConfigMetadata.infer_sparsity_structure(
model=model
)
if is_model_quantized(model):
# compressing a sparse quantized model is not supported yet
format = CompressionFormat.dense.value
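With this change, sparsity-structure inference falls back from the active sparse session to the model weights and finally to "unstructured". A rough usage sketch (import path inferred from the file location; the toy model is dense, so it should fall through to "unstructured"):

```python
import torch

from sparseml.transformers.compression.sparsity_config import SparsityConfigMetadata

# stand-in model; a real call would pass the loaded SparseAutoModel instance
model = torch.nn.Sequential(torch.nn.Linear(8, 8))

structure = SparsityConfigMetadata.infer_sparsity_structure(model=model)
print(structure)  # expected "unstructured" for this dense toy model
```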
11 changes: 9 additions & 2 deletions src/sparseml/transformers/finetune/session_mixin.py
@@ -500,15 +500,22 @@ def log_model_sparsification(self):
f"Sparsification info for {type(self.model).__name__}: "
f"{sparsification_info.params_total} total params. "
)
sparsity_percent_formatted = "{:.2f}".format(
sparsification_info.params_prunable_sparse_percent
)
_LOGGER.info(
f"There are {sparsification_info.params_prunable_total} prunable "
f"params which have {sparsification_info.params_prunable_sparse_percent} "
f"params which have {sparsity_percent_formatted}% "
"avg sparsity."
)

quant_percent_formatted = "{:.2f}".format(
sparsification_info.params_quantized_percent
)
_LOGGER.info(
f"There are {sparsification_info.params_quantizable} quantizable "
f"params, with a quantization percentage of "
f"{sparsification_info.params_quantized_percent}."
f"{quant_percent_formatted}%."
)

def _prepare_model_for_fsdp(self):
8 changes: 8 additions & 0 deletions src/sparseml/transformers/sparsification/sparse_model.py
@@ -111,6 +111,14 @@ def skip(*args, **kwargs):
model = super(AutoModelForCausalLM, cls).from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
if model.dtype != model.config.torch_dtype:
_LOGGER.warning(
f"The dtype of the loaded model: {model.dtype} is different "
"from from the dtype specified in the model config: "
f"{model.config.torch_dtype}."
"To load the model in the format that it was previously saved in, "
"set torch_dtype=`auto` in the SparseAutoModel creation call."
)
logger.setLevel(level=restore_log_level)
# override the PreTrainedModel instance with compression save function
modify_save_pretrained(model)
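To avoid the new dtype-mismatch warning, the checkpoint can be loaded with `torch_dtype="auto"` as the message suggests. A brief sketch (the checkpoint path is a placeholder):

```python
from sparseml.transformers import SparseAutoModelForCausalLM

# "auto" keeps whatever dtype the checkpoint was saved in, so model.dtype
# matches model.config.torch_dtype and the warning above is not triggered
model = SparseAutoModelForCausalLM.from_pretrained(
    "path/to/compressed-model", torch_dtype="auto"
)
```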