Fix sparsity arg in Engine/ModelArgs (#179)

neuralmagic · Apr 11, 2024 · dcd4973 · dcd4973 · github-actions · Apr 11, 2024
1 parent 5919679
commit dcd4973
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 9 deletions.
diff --git a/neuralmagic/tests/skip-for-remote-push.txt b/neuralmagic/tests/skip-for-remote-push.txt
@@ -12,7 +12,6 @@ tests/distributed/test_comm_ops.py
 tests/prefix_caching/test_prefix_caching.py
 tests/models/test_models_logprobs.py
 tests/models/test_models.py
-tests/models/test_compressed_memory.py
 tests/spec_decode/test_utils.py
 tests/spec_decode/test_spec_decode_worker.py
 tests/spec_decode/test_metrics.py

diff --git a/vllm/config.py b/vllm/config.py
@@ -88,9 +88,9 @@ def __init__(
         tokenizer_revision: Optional[str] = None,
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
+        quantization_param_path: Optional[str] = None,
         # UPSTREAM SYNC: keep sparsity
         sparsity: Optional[str] = None,
-        quantization_param_path: Optional[str] = None,
         enforce_eager: bool = False,
         max_context_len_to_capture: Optional[int] = None,
         max_logprobs: int = 5,
@@ -106,9 +106,9 @@ def __init__(
         self.code_revision = code_revision
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
+        self.quantization_param_path = quantization_param_path
         # UPSTREAM SYNC: keep sparsity
         self.sparsity = sparsity
-        self.quantization_param_path = quantization_param_path
         self.enforce_eager = enforce_eager
         self.max_context_len_to_capture = max_context_len_to_capture
         self.max_logprobs = max_logprobs

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -432,12 +432,25 @@ def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
     def create_engine_config(self, ) -> EngineConfig:
         device_config = DeviceConfig(self.device)
         model_config = ModelConfig(
-            self.model, self.tokenizer, self.tokenizer_mode,
-            self.trust_remote_code, self.download_dir, self.load_format,
-            self.dtype, self.seed, self.revision, self.code_revision,
-            self.tokenizer_revision, self.max_model_len, self.quantization,
-            self.quantization_param_path, self.enforce_eager,
-            self.max_context_len_to_capture, self.max_logprobs)
+            self.model,
+            self.tokenizer,
+            self.tokenizer_mode,
+            self.trust_remote_code,
+            self.download_dir,
+            self.load_format,
+            self.dtype,
+            self.seed,
+            self.revision,
+            self.code_revision,
+            self.tokenizer_revision,
+            self.max_model_len,
+            self.quantization,
+            self.quantization_param_path,
+            # UPSTREAM SYNC: keep sparsity argument
+            self.sparsity,
+            self.enforce_eager,
+            self.max_context_len_to_capture,
+            self.max_logprobs)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,