This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

stray issues
robertgshaw2-neuralmagic committed May 7, 2024
1 parent 8423620 commit 50c1029
Showing 10 changed files with 38 additions and 53 deletions.
5 changes: 3 additions & 2 deletions Dockerfile
@@ -102,6 +102,8 @@ WORKDIR /usr/src/flash-attention-v2
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
--no-build-isolation --no-deps --no-cache-dir

#################### FLASH_ATTENTION Build IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
@@ -124,6 +126,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
--mount=type=cache,target=/root/.cache/pip \
pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
@@ -159,7 +162,5 @@ RUN --mount=type=cache,target=/root/.cache/pip \

ENV VLLM_USAGE_SOURCE production-docker-image

ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
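The production image's ENTRYPOINT starts vLLM's OpenAI-compatible API server, so a running container can be queried with any OpenAI client. The following is a minimal client sketch; the host, port, and model name are illustrative assumptions rather than values fixed by this Dockerfile.

# Minimal client sketch for the OpenAI-compatible server launched by the
# ENTRYPOINT above. Host, port, and model name are assumptions for
# illustration (e.g. the container was started with
# --model NousResearch/Llama-2-7b-chat-hf and port 8000 published).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.chat.completions.create(
    model="NousResearch/Llama-2-7b-chat-hf",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(completion.choices[0].message.content)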
1 change: 0 additions & 1 deletion csrc/cpu/cache.cpp
@@ -111,7 +111,6 @@ void copy_blocks(std::vector<torch::Tensor> &key_caches,
void reshape_and_cache(torch::Tensor &key, torch::Tensor &value,
torch::Tensor &key_cache, torch::Tensor &value_cache,
torch::Tensor &slot_mapping,
const std::string &kv_cache_dtype) {
const std::string &kv_cache_dtype, float kv_scale) {
TORCH_CHECK(kv_scale == 1.0f);

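For context, the kv_scale parameter on reshape_and_cache exists for FP8 KV-cache support, where cached values are dequantized by a per-cache scale factor; the CPU path accepts the argument for interface parity but, per the TORCH_CHECK above, only allows the identity scale of 1.0. A tiny self-contained sketch of what the scale means (dequantize_kv is a hypothetical helper, not vLLM API):

# Illustration of kv_scale semantics, not vLLM code: a quantized KV cache is
# dequantized as cached_value * kv_scale, and the CPU backend above only
# supports kv_scale == 1.0, which makes the scaling a no-op.
import torch

def dequantize_kv(cached: torch.Tensor, kv_scale: float) -> torch.Tensor:
    # hypothetical helper for illustration only
    return cached.to(torch.float32) * kv_scale

block = torch.randn(4, 8)  # stand-in for a cached KV block
assert torch.allclose(dequantize_kv(block, 1.0), block.float())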
Binary file removed docs/source/assets/kernel/v_vec.png
Binary file removed docs/source/assets/kernel/value.png
1 change: 1 addition & 0 deletions pyproject.toml
@@ -17,6 +17,7 @@ exclude = [
# External file, leaving license intact
"examples/fp8/quantizer/quantize.py"
]

[tool.ruff.lint]
select = [
# pycodestyle
10 changes: 4 additions & 6 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -14,7 +14,6 @@
]


@pytest.mark.skip(reason="drift")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@@ -60,8 +59,7 @@ def test_models(
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert (
hf_output_str == vllm_output_str
), f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}"
assert (hf_output_ids == vllm_output_ids
), f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}"
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
55 changes: 24 additions & 31 deletions tests/lora/test_quant_model.py
@@ -43,8 +43,7 @@ def format_prompt_tuples(prompt):
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None,
)
if lora_id else None)
# Print the outputs.
generated_texts = []
for output in outputs:
@@ -64,21 +63,19 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
# if torch.cuda.device_count() < tp_size:
# pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

llm = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_model_len=400,
tensor_parallel_size=tp_size,
quantization=model.quantization,
trust_remote_code=True,
)
llm = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_model_len=400,
tensor_parallel_size=tp_size,
quantization=model.quantization,
trust_remote_code=True)

if model.quantization is None:
expected_no_lora_output = [
"Here are some examples of orange-brown colors",
"I'm sorry, I don't have",
"I'm sorry, I don't have"
]
expected_lora_output = [
"#ff8050",
@@ -111,7 +108,7 @@ def expect_match(output, expected_output):
assert output != expected_no_lora_output
for i, o in enumerate(output):
assert o.startswith(
"#"), f"Expected example {i} to start with # but got {o}"
'#'), f"Expected example {i} to start with # but got {o}"
return
assert output == expected_output

@@ -158,28 +155,24 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model):
# if torch.cuda.device_count() < 2:
# pytest.skip(f"Not enough GPUs for tensor parallelism {2}")

llm_tp1 = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=1,
quantization=model.quantization,
trust_remote_code=True,
)
llm_tp1 = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=1,
quantization=model.quantization,
trust_remote_code=True)
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)

del llm_tp1
cleanup()

llm_tp2 = vllm.LLM(
model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=2,
quantization=model.quantization,
)
llm_tp2 = vllm.LLM(model=model.model_path,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=2,
quantization=model.quantization)
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)

del llm_tp2
10 changes: 1 addition & 9 deletions vllm/config.py
@@ -580,15 +580,7 @@ def __init__(
placement_group: Optional["PlacementGroup"] = None,
) -> None:
self.pipeline_parallel_size = pipeline_parallel_size
if is_neuron():
# For Neuron device support, here we assign TP=1 to avoid sharding
# within vLLM directly. Transformer-neuronx would take
# neuron_tp_degree attribute, and distribute the workload
# to multiple NeuronCores.
self.tensor_parallel_size = 1
self.neuron_tp_degree = tensor_parallel_size
else:
self.tensor_parallel_size = tensor_parallel_size
self.tensor_parallel_size = tensor_parallel_size
self.worker_use_ray = worker_use_ray
self.max_parallel_loading_workers = max_parallel_loading_workers
self.disable_custom_all_reduce = disable_custom_all_reduce
2 changes: 1 addition & 1 deletion vllm/model_executor/models/xverse.py
@@ -363,4 +363,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
weight_loader(param, loaded_weight)
7 changes: 4 additions & 3 deletions vllm/worker/model_runner.py
@@ -1033,11 +1033,9 @@ def vocab_size(self) -> int:
return self.model_config.get_vocab_size()


class CUDAGraphRunner(nn.Module):
class CUDAGraphRunner():

def __init__(self, model: nn.Module):
super().__init__()

self.model = model
self.input_buffers: Dict[str, torch.Tensor] = {}
self.output_buffers: Dict[str, torch.Tensor] = {}
@@ -1125,6 +1123,9 @@ def forward(
# Return the output tensor.
return self.output_buffers["hidden_states"]

def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)


@contextlib.contextmanager
def _maybe_pynccl():
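The model_runner.py change drops the nn.Module base class from CUDAGraphRunner and keeps instances callable by delegating __call__ to forward(), avoiding nn.Module's parameter and hook bookkeeping for what is effectively a plain wrapper. A minimal sketch of the pattern with a stand-in model (class and variable names here are illustrative, not vLLM's):

# Sketch of the pattern above: a plain class (no nn.Module base) that stays
# callable through an explicit __call__ -> forward delegation. The wrapped
# model is a stand-in; a real runner would replay a captured CUDA graph
# instead of running the model eagerly.
import torch
import torch.nn as nn

class GraphRunnerSketch:
    def __init__(self, model: nn.Module):
        self.model = model
        self.output_buffers = {}

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        self.output_buffers["hidden_states"] = self.model(x)
        return self.output_buffers["hidden_states"]

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

runner = GraphRunnerSketch(nn.Linear(4, 4))
print(runner(torch.randn(2, 4)).shape)  # call syntax unchanged: torch.Size([2, 4])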

1 comment on commit 50c1029

@github-actions


bigger_is_better

Benchmark suite: 50c1029 (current) vs. df1f1a0 (previous)

request_throughput: 3.834528548854599 prompts/s (current); previous value and ratio not shown
token_throughput: 1472.458962760166 tokens/s (current); previous value and ratio not shown

Configuration for both rows: VLLM Engine throughput, synthetic; model NousResearch/Llama-2-7b-chat-hf; max_model_len 4096; benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU NVIDIA A10G x 1; vllm_version 0.2.0; python_version 3.10.12 (GCC 9.4.0); torch_version 2.3.0+cu121

This comment was automatically generated by workflow using github-action-benchmark.
