This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

stray issues
robertgshaw2-neuralmagic committed May 7, 2024
1 parent 8423620 commit 50c1029
Showing 10 changed files with 38 additions and 53 deletions.
5 changes: 3 additions & 2 deletions Dockerfile
@@ -102,6 +102,8 @@ WORKDIR /usr/src/flash-attention-v2
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
--no-build-isolation --no-deps --no-cache-dir

#################### FLASH_ATTENTION Build IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
@@ -124,6 +126,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
@@ -159,7 +162,5 @@ RUN --mount=type=cache,target=/root/.cache/pip \

ENV VLLM_USAGE_SOURCE production-docker-image

ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
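The production image's ENTRYPOINT starts vLLM's OpenAI-compatible API server, so a running container can be queried with any OpenAI client. The following is a minimal client sketch; the host, port, and model name are illustrative assumptions rather than values fixed by this Dockerfile.

# Minimal client sketch for the OpenAI-compatible server launched by the
# ENTRYPOINT above. Host, port, and model name are assumptions for
# illustration (e.g. the container was started with
# --model NousResearch/Llama-2-7b-chat-hf and port 8000 published).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.chat.completions.create(
    model="NousResearch/Llama-2-7b-chat-hf",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(completion.choices[0].message.content)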
1 change: 0 additions & 1 deletion csrc/cpu/cache.cpp
@@ -111,7 +111,6 @@ void copy_blocks(std::vector<torch::Tensor> &key_caches,
void reshape_and_cache(torch::Tensor &key, torch::Tensor &value,
torch::Tensor &key_cache, torch::Tensor &value_cache,
torch::Tensor &slot_mapping,
const std::string &kv_cache_dtype) {
const std::string &kv_cache_dtype, float kv_scale) {
TORCH_CHECK(kv_scale == 1.0f);

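For context, the kv_scale parameter on reshape_and_cache exists for FP8 KV-cache support, where cached values are dequantized by a per-cache scale factor; the CPU path accepts the argument for interface parity but, per the TORCH_CHECK above, only allows the identity scale of 1.0. A tiny self-contained sketch of what the scale means (dequantize_kv is a hypothetical helper, not vLLM API):

# Illustration of kv_scale semantics, not vLLM code: a quantized KV cache is
# dequantized as cached_value * kv_scale, and the CPU backend above only
# supports kv_scale == 1.0, which makes the scaling a no-op.
import torch

def dequantize_kv(cached: torch.Tensor, kv_scale: float) -> torch.Tensor:
    # hypothetical helper for illustration only
    return cached.to(torch.float32) * kv_scale

block = torch.randn(4, 8)  # stand-in for a cached KV block
assert torch.allclose(dequantize_kv(block, 1.0), block.float())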
Binary file removed docs/source/assets/kernel/v_vec.png
Binary file removed docs/source/assets/kernel/value.png
1 change: 1 addition & 0 deletions pyproject.toml
@@ -17,6 +17,7 @@ exclude = [
# External file, leaving license intact
"examples/fp8/quantizer/quantize.py"
]

[tool.ruff.lint]
select = [
# pycodestyle
10 changes: 4 additions & 6 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -14,7 +14,6 @@
]


@pytest.mark.skip(reason="drift")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@@ -60,8 +59,7 @@ def test_models(
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert (
hf_output_str == vllm_output_str
), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}"
assert (hf_output_ids == vllm_output_ids
), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}"
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
55 changes: 24 additions & 31 deletions tests/lora/test_quant_model.py
@@ -43,8 +43,7 @@ def format_prompt_tuples(prompt):
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None,
)
if lora_id else None)
# Print the outputs.
generated_texts = []
for output in outputs:
@@ -64,21 +63,19 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
# if torch.cuda.device_count() < tp_size:
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

llm = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_model_len=400,
tensor_parallel_size=tp_size,
quantization=model.quantization,
trust_remote_code=True,
)
llm = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_model_len=400,
tensor_parallel_size=tp_size,
quantization=model.quantization,
trust_remote_code=True)

if model.quantization is None:
expected_no_lora_output = [
"Here are some examples of orange-brown colors",
"I'm sorry, I don't have",
"I'm sorry, I don't have"
]
expected_lora_output = [
"#ff8050",
@@ -111,7 +108,7 @@ def expect_match(output, expected_output):
assert output != expected_no_lora_output
for i, o in enumerate(output):
assert o.startswith(
"#"), f"Expected example {i} to start with # but got {o}"
'#'), f"Expected example {i} to start with # but got {o}"
return
assert output == expected_output

@@ -158,28 +155,24 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model):
# if torch.cuda.device_count() < 2:
# pytest.skip(f"Not enough GPUs for tensor parallelism {2}")

llm_tp1 = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=1,
quantization=model.quantization,
trust_remote_code=True,
)
llm_tp1 = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=1,
quantization=model.quantization,
trust_remote_code=True)
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)

del llm_tp1
cleanup()

llm_tp2 = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=2,
quantization=model.quantization,
)
llm_tp2 = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=2,
quantization=model.quantization)
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)

del llm_tp2
10 changes: 1 addition & 9 deletions vllm/config.py
@@ -580,15 +580,7 @@ def __init__(
placement_group: Optional["PlacementGroup"] = None,
) -> None:
self.pipeline_parallel_size = pipeline_parallel_size
if is_neuron():
# For Neuron device support, here we assign TP=1 to avoid sharding
# within vLLM directly. Transformer-neuronx would take
# neuron_tp_degree attribute, and distribute the workload
# to multiple NeuronCores.
self.tensor_parallel_size = 1
self.neuron_tp_degree = tensor_parallel_size
else:
self.tensor_parallel_size = tensor_parallel_size
self.tensor_parallel_size = tensor_parallel_size
self.worker_use_ray = worker_use_ray
self.max_parallel_loading_workers = max_parallel_loading_workers
self.disable_custom_all_reduce = disable_custom_all_reduce
2 changes: 1 addition & 1 deletion vllm/model_executor/models/xverse.py
@@ -363,4 +363,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
weight_loader(param, loaded_weight)
7 changes: 4 additions & 3 deletions vllm/worker/model_runner.py
@@ -1033,11 +1033,9 @@ def vocab_size(self) -> int:
return self.model_config.get_vocab_size()


class CUDAGraphRunner(nn.Module):
class CUDAGraphRunner():

def __init__(self, model: nn.Module):
super().__init__()

self.model = model
self.input_buffers: Dict[str, torch.Tensor] = {}
self.output_buffers: Dict[str, torch.Tensor] = {}
@@ -1125,6 +1123,9 @@ def forward(
# Return the output tensor.
return self.output_buffers["hidden_states"]

def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)


@contextlib.contextmanager
def _maybe_pynccl():
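The model_runner.py change drops the nn.Module base class from CUDAGraphRunner and keeps instances callable by delegating __call__ to forward(), avoiding nn.Module's parameter and hook bookkeeping for what is effectively a plain wrapper. A minimal sketch of the pattern with a stand-in model (class and variable names here are illustrative, not vLLM's):

# Sketch of the pattern above: a plain class (no nn.Module base) that stays
# callable through an explicit __call__ -> forward delegation. The wrapped
# model is a stand-in; a real runner would replay a captured CUDA graph
# instead of running the model eagerly.
import torch
import torch.nn as nn

class GraphRunnerSketch:
    def __init__(self, model: nn.Module):
        self.model = model
        self.output_buffers = {}

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        self.output_buffers["hidden_states"] = self.model(x)
        return self.output_buffers["hidden_states"]

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

runner = GraphRunnerSketch(nn.Linear(4, 4))
print(runner(torch.randn(2, 4)).shape)  # call syntax unchanged: torch.Size([2, 4])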

1 comment on commit 50c1029

@github-actions


bigger_is_better

Benchmark suite: 50c1029 (current) vs. df1f1a0 (previous)

request_throughput: 3.834528548854599 prompts/s (current); previous value and ratio not shown
token_throughput: 1472.458962760166 tokens/s (current); previous value and ratio not shown

Configuration for both rows: VLLM Engine throughput, synthetic; model NousResearch/Llama-2-7b-chat-hf; max_model_len 4096; benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU NVIDIA A10G x 1; vllm_version 0.2.0; python_version 3.10.12 (GCC 9.4.0); torch_version 2.3.0+cu121

This comment was automatically generated by workflow using github-action-benchmark.
