[OneShot][Testing] Expand Integration tests to run for llama-7b; add gpu/auto cases (#2237)
dsikka authored May 7, 2024
1 parent f7cb678 commit 80ef58e
Showing 31 changed files with 957 additions and 443 deletions.
2 changes: 1 addition & 1 deletion tests/sparseml/transformers/finetune/test_finetune.py
@@ -98,7 +98,7 @@ def test_oneshot_and_finetune_with_tokenizer(tmp_path: Path):


def test_oneshot_then_finetune(tmp_path: Path):
recipe_str = "tests/sparseml/transformers/obcq/test_tiny2.yaml"
recipe_str = "tests/sparseml/transformers/obcq/recipes/test_tiny2.yaml"
model = "Xenova/llama2.c-stories15M"
device = "cuda:0"
if not torch.cuda.is_available():
@@ -26,7 +26,7 @@ def test_apply_recipe_structure():
model = AutoModelForCausalLM.from_pretrained(model_path)
assert not qat_active(model)

recipe_with_quant = "tests/sparseml/transformers/obcq/quant_and_sparse.yaml"
recipe_with_quant = "tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml"
apply_recipe_structure_to_model(model, recipe_with_quant, model_path)

assert qat_active(model)
@@ -0,0 +1,8 @@
cadence: "nightly"
test_type: "regression"
model: "zoo:llama2-7b-llama2_pretrain-base"
dataset: open_platypus
recipe: "tests/sparseml/transformers/obcq/recipes/quant.yaml"
device: "cuda:1"
num_samples: 512
perplexity: 20
@@ -0,0 +1,8 @@
cadence: "nightly"
test_type: "regression"
model: "zoo:llama2-7b-llama2_pretrain-base"
dataset: open_platypus
recipe: "tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml"
device: "cuda:0"
num_samples: 512
perplexity: 20
@@ -0,0 +1,7 @@
cadence: "commit"
test_type: "sanity"
model: "Xenova/llama2.c-stories15M"
dataset: open_platypus
recipe: "tests/sparseml/transformers/obcq/recipes/quant.yaml"
num_samples: 32
perplexity: 5000
@@ -0,0 +1,7 @@
cadence: "commit"
test_type: "sanity"
model: "Xenova/llama2.c-stories15M"
dataset: open_platypus
recipe: "tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml"
num_samples: 32
perplexity: 5000
@@ -0,0 +1,7 @@
cadence: "nightly"
test_type: "regression"
model: "zoo:llama2-7b-llama2_pretrain-base"
dataset: open_platypus
first_recipe: "tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml"
second_recipe: "tests/sparseml/transformers/obcq/recipes/additional_sparsity.yaml"
device: "auto"
@@ -0,0 +1,6 @@
cadence: "commit"
test_type: "sanity"
model: "Xenova/llama2.c-stories15M"
dataset: open_platypus
first_recipe: "tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml"
second_recipe: "tests/sparseml/transformers/obcq/recipes/additional_sparsity.yaml"
@@ -0,0 +1,25 @@
cadence: "commit"
test_type: "sanity"
model: "Xenova/llama2.c-stories15M"
dataset: open_platypus
first_recipe: |
first_stage:
quant_modifiers:
QuantizationModifier:
ignore:
- LlamaRotaryEmbedding
- LlamaRMSNorm
- SiLU
scheme_overrides:
Embedding:
input_activations: null
second_recipe: |
second_stage:
quant_modifiers:
QuantizationModifier:
ignore:
- LlamaRotaryEmbedding
- LlamaRMSNorm
- SiLU
- Embedding
@@ -0,0 +1,32 @@
cadence: "commit"
test_type: "sanity"
model: "Xenova/llama2.c-stories15M"
dataset: open_platypus
first_recipe: |
first_stage:
quant_modifiers:
QuantizationModifier:
ignore:
- LlamaRotaryEmbedding
- LlamaRMSNorm
- SiLU
- Linear
scheme_overrides:
Embedding:
input_activations: null
second_recipe: |
second_stage:
quant_modifiers:
QuantizationModifier:
ignore:
- LlamaRotaryEmbedding
- LlamaRMSNorm
- SiLU
- Embedding
- MatMulLeftInput_QK
- MatMulRightInput_QK
- MatMulOutput_QK
- MatMulLeftInput_PV
- MatMulRightInput_PV
- MatMulOutput_PV
- QuantizableMatMul
@@ -0,0 +1,7 @@
cadence: "nightly"
test_type: "regression"
model: "zoo:llama2-7b-llama2_pretrain-base"
dataset: open_platypus
recipe: "tests/sparseml/transformers/obcq/recipes/sparse.yaml"
sparsity: 0.3
device: "cuda:0"
@@ -0,0 +1,7 @@
cadence: "nightly"
test_type: "regression"
model: "zoo:llama2-7b-llama2_pretrain-base"
dataset: open_platypus
recipe: "tests/sparseml/transformers/obcq/recipes/sparse.yaml"
sparsity: 0.3
device: "auto"
@@ -0,0 +1,6 @@
cadence: "commit"
test_type: "sanity"
model: "Xenova/llama2.c-stories15M"
dataset: open_platypus
recipe: "tests/sparseml/transformers/obcq/recipes/sparse.yaml"
sparsity: 0.3
File renamed without changes.
145 changes: 145 additions & 0 deletions tests/sparseml/transformers/obcq/test_consecutive_runs.py
@@ -0,0 +1,145 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import shutil
import unittest
from pathlib import Path

import pytest
import yaml

from parameterized import parameterized_class
from tests.testing_utils import parse_params, requires_gpu, requires_torch


CONFIGS_DIRECTORY = "tests/sparseml/transformers/obcq/obcq_configs/consec_runs"
GPU_CONFIGS_DIRECTORY = "tests/sparseml/transformers/obcq/obcq_configs/consec_runs/gpu"


class TestConsecutiveRuns(unittest.TestCase):
def _test_consecutive_runs(
self, tolerance: float, num_calibration_samples: int = 16
):
import math

import sparseml.core.session as session_manager
from sparseml.pytorch.model_load.helpers import get_session_model
from sparseml.pytorch.utils.helpers import tensor_sparsity
from sparseml.transformers import oneshot
from sparseml.utils.pytorch import qat_active

# test recipe with 50% sparsity, quantization and smoothquant
oneshot(
model=self.model,
dataset=self.dataset,
num_calibration_samples=num_calibration_samples,
recipe=self.first_recipe,
output_dir=self.output_first,
oneshot_device=self.device,
clear_sparse_session=False,
)
first_tiny_model = get_session_model()
layer_0_sparse = tensor_sparsity(
first_tiny_model.model.layers[0].self_attn.k_proj.module.weight
)
assert math.isclose(layer_0_sparse.item(), 0.5, rel_tol=tolerance)
assert qat_active(first_tiny_model)

session = session_manager.active_session()
session_recipe = session.lifecycle.recipe_container.compiled_recipe
stages = [stage.group for stage in session_recipe.stages]
self.assertEqual(len(stages), 1)
session.reset()

# reload saved model and up sparsity to 0.7
oneshot(
model=self.output_first,
dataset=self.dataset,
num_calibration_samples=num_calibration_samples,
recipe=self.second_recipe,
output_dir=self.output_second,
oneshot_device=self.device,
clear_sparse_session=False,
)

second_tiny_model = get_session_model()
layer_0_sparse = tensor_sparsity(
second_tiny_model.model.layers[0].self_attn.k_proj.module.weight
)
assert math.isclose(layer_0_sparse.item(), 0.7, rel_tol=tolerance)
assert qat_active(second_tiny_model)

session = session_manager.active_session()
session_recipe = session.lifecycle.recipe_container.compiled_recipe
stages = [stage.group for stage in session_recipe.stages]
self.assertEqual(len(stages), 2)

recipe_path = self.output_second / "recipe.yaml"
recipe_data = yaml.safe_load(recipe_path.read_text())
stage_keys = recipe_data.keys()
self.assertEqual(len(stage_keys), 2)
self.assertIn("test_stage_0", stage_keys)
self.assertIn("test_stage_1", stage_keys)

def tearDown(self):
shutil.rmtree(self.output)


@requires_torch
@pytest.mark.integration
@parameterized_class(parse_params(CONFIGS_DIRECTORY))
class TestConsecutiveRunsSmall(TestConsecutiveRuns):
model = None
first_recipe = None
second_recipe = None
dataset = None

def setUp(self):
import torch

self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.output = "./oneshot_output"
self.output_first = Path(self.output) / "test_1"
self.output_second = Path(self.output) / "test_2"

def test_consecutive_runs_small(self):
self._test_consecutive_runs(tolerance=1e-3)


@requires_gpu
@requires_torch
@pytest.mark.integration
@parameterized_class(parse_params(GPU_CONFIGS_DIRECTORY))
class TestConsecutiveRunsGPU(TestConsecutiveRuns):
# Will be populated using the config files
model = None
first_recipe = None
second_recipe = None
dataset = None
device = None

def setUp(self):
from sparseml.transformers import SparseAutoModelForCausalLM

if "zoo:" in self.model:
self.model = SparseAutoModelForCausalLM.from_pretrained(
self.model, device_map=self.device
)

self.output = "./oneshot_output"
self.output_first = Path(self.output) / "test_1"
self.output_second = Path(self.output) / "test_2"

def test_consecutive_runs_gpu(self):
self._test_consecutive_runs(tolerance=1e-0, num_calibration_samples=16)
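
For reference, parse_params and the requires_gpu/requires_torch markers are imported from tests/testing_utils.py, which is not included in this excerpt. A minimal sketch of what parse_params plausibly does, assuming it simply loads each top-level YAML config in a directory into a dict that parameterized_class then applies as class attributes (illustrative only, not the repository's actual helper):

# Hypothetical sketch of tests/testing_utils.parse_params -- not the committed implementation.
import os
from typing import List

import yaml


def parse_params(configs_directory: str) -> List[dict]:
    # Collect each YAML config in the directory as a dict of test parameters
    # (model, dataset, recipe, device, ...); nested dirs such as consec_runs/gpu
    # are skipped here so GPU configs can be picked up by a separate directory constant.
    configs = []
    for file_name in sorted(os.listdir(configs_directory)):
        path = os.path.join(configs_directory, file_name)
        if os.path.isfile(path) and file_name.endswith(".yaml"):
            with open(path) as config_file:
                configs.append(yaml.safe_load(config_file))
    return configs

With this shape, parameterized_class(parse_params(CONFIGS_DIRECTORY)) generates one test subclass per config file, with model, dataset, first_recipe, and second_recipe filled in from the YAML keys.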