Upstream sync v0.4.0.post1 (merged with upstream-v0.4.0.post1) (#157)

neuralmagic · Apr 4, 2024 · 3d151aa · 3d151aa · github-actions · Apr 4, 2024
1 parent 5d256f5
commit 3d151aa
Show file tree

Hide file tree

Showing 297 changed files with 16,469 additions and 2,466 deletions.
diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+
+# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
+mkdir -p images
+cd images
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
+
+cd -
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
@@ -23,8 +23,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 python3 benchmarks/benchmark_serving.py \
-    --backend openai \
-    --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
+    --backend vllm \
+    --dataset-name sharegpt \
+    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
     --model meta-llama/Llama-2-7b-chat-hf \
     --num-prompts 20 \
     --endpoint /v1/completions \
@@ -48,7 +49,9 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
 echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
-tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
+echo '```' >> benchmark_results.md
+tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
+echo '```' >> benchmark_results.md
 
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
@@ -0,0 +1,14 @@
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.cpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -12,35 +12,53 @@ steps:
   command: pytest -v -s async_engine
 
 - label: Basic Correctness Test
-  command: pytest -v -s --forked basic_correctness
+  command: pytest -v -s basic_correctness
 
 - label: Core Test
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
-  command: pytest -v -s --forked test_comm_ops.py
+  command: pytest -v -s test_comm_ops.py
   working_dir: "/vllm-workspace/tests/distributed"
   num_gpus: 2 # only support 1 or 2 for now.
 
-- label: Distributed Correctness Test
-  command: pytest -v -s --forked test_basic_distributed_correctness.py
+- label: Distributed Tests
   working_dir: "/vllm-workspace/tests/distributed"
   num_gpus: 2 # only support 1 or 2 for now.
+  commands:
+  - pytest -v -s test_pynccl.py
+  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
 
 - label: Engine Test
   command: pytest -v -s engine tokenization test_sequence.py test_config.py
 
 - label: Entrypoints Test
   command: pytest -v -s entrypoints
 
+- label: Examples Test
+  working_dir: "/vllm-workspace/examples"
+  commands:
+    # install aws cli for llava_example.py
+    - pip install awscli
+    - python3 offline_inference.py
+    - python3 offline_inference_with_prefix.py
+    - python3 llm_engine_example.py
+    - python3 llava_example.py
+
 - label: Kernels Test %N
   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
 - label: Models Test
   commands:
-    - pytest -v -s models --forked
-  soft_fail: true
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
+
+- label: Llava Test
+  commands:
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s models/test_llava.py
 
 - label: Prefix Caching Test
   commands:

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
@@ -8,6 +8,9 @@ steps:
       queue: amd
     command: bash .buildkite/run-amd-test.sh
 
+  - label: "CPU Test"
+    command: bash .buildkite/run-cpu-test.sh
+
   - label: ":docker: build image"
     commands:
       - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
@@ -53,6 +56,8 @@ steps:
                     nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                 {% endif %}
                 env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
                   - name: HF_TOKEN
                     valueFrom:
                       secretKeyRef:

diff --git a/.github/scripts/lm_eval_compare_hf_vs_vllm.py b/.github/scripts/lm_eval_compare_hf_vs_vllm.py
@@ -2,11 +2,10 @@
 import os
 from typing import Dict, List, Tuple
 
-import numpy as np
-import scipy.stats
-
 import lm_eval
 import lm_eval.models.utils
+import numpy as np
+import scipy.stats
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 

diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests
@@ -98,6 +98,9 @@ do
 done
 echo "..."
 
+# download required artifacts for testing
+(cd ${TEST_DIR} && bash ../.buildkite/download-images.sh)
+
 # run selected tests
 SUCCESS=0
 CC_PYTEST_FLAGS="--cov=${SRC_DIR} --cov=${TEST_DIR} --cov-report=html:cc-vllm-html --cov-append"
@@ -108,12 +111,12 @@ do
 
     # this is a bit messy and brittle, but certain tests
     # need to be run with specific options
-    if [[ "${TEST}" == *"kernels/test_pos_encoding"* ]]; then
-        CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --forked --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
-    elif [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
+    if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
         CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
-    elif [[ "${TEST}" == *"distributed"* || "${TEST}" == *"models_logprobs"* || "${TEST}" == *"basic_correctness"* ]]; then
-        pytest ${CC_PYTEST_FLAGS} --forked --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+    elif [[ "${TEST}" == *"test_basic_distributed_correctness"* ]]; then
+        CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+    elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
+        pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     else
         pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
     fi

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -21,7 +21,7 @@ jobs:
         uses: ./.github/workflows/build-test.yml
         with:
             build_label: aws-avx2-192G-4-a10g-96G
-            timeout: 240
+            timeout: 480
             gitref: ${{ github.ref }}
             Gi_per_thread: 4
             nvcc_threads: 8
@@ -33,7 +33,7 @@ jobs:
         uses: ./.github/workflows/build-test.yml
         with:
             build_label: aws-avx2-32G-a10g-24G
-            timeout: 300
+            timeout: 480
             gitref: ${{ github.ref }}
             Gi_per_thread: 12
             nvcc_threads: 1

diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml
@@ -19,7 +19,7 @@ jobs:
         uses: ./.github/workflows/build-test.yml
         with:
             build_label: aws-avx512-192G-4-T4-64G
-            timeout: 240
+            timeout: 360
             gitref: '${{ github.ref }}'
             Gi_per_thread: 4
             nvcc_threads: 8

diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
@@ -25,10 +25,13 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
+        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
     - name: Analysing the code with ruff
       run: |
         ruff .
     - name: Spelling check with codespell
       run: |
-        codespell --toml pyproject.toml
+        codespell --toml pyproject.toml
+    - name: Run isort
+      run: |
+        isort . --check-only
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
@@ -15,6 +15,7 @@ $python_executable -m pip install -r requirements.txt
 export MAX_JOBS=1
 # Make sure punica is built for the release (for LoRA)
 export VLLM_INSTALL_PUNICA_KERNELS=1
-
+# Make sure release wheels are built for the following architectures
+export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,7 +2,10 @@ cmake_minimum_required(VERSION 3.21)
 
 project(vllm_extensions LANGUAGES CXX)
 
+option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
+
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 
@@ -70,10 +73,24 @@ find_package(Torch REQUIRED)
 # config is used for standalone C++ binaries that link against torch).
 # The `libtorch_python.so` library defines some of the glue code between
 # torch/python via pybind and is required by VLLM extensions for this
-# reason. So, add it by manually using `append_torchlib_if_found` from
-# torch's cmake setup.
+# reason. So, add it by manually with `find_library` using torch's
+# installed library path.
+#
+find_library(torch_python_LIBRARY torch_python PATHS
+  "${TORCH_INSTALL_PREFIX}/lib")
+
 #
-append_torchlib_if_found(torch_python)
+# Forward the non-CUDA device extensions to external CMake scripts.
+#
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
+    NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
+    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
+        include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
+    else()
+        message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
+    endif()
+    return()
+endif()
 
 #
 # Set up GPU language and check the torch version and warn if it isn't

diff --git a/Dockerfile b/Dockerfile
@@ -35,6 +35,9 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-build.txt
 
+# install compiler cache to speed up compilation leveraging local or remote caching
+RUN apt-get update -y && apt-get install -y ccache
+
 # copy input files
 COPY csrc csrc
 COPY setup.py setup.py
@@ -56,7 +59,9 @@ ENV NVCC_THREADS=$nvcc_threads
 # make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
-RUN python3 setup.py build_ext --inplace
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    python3 setup.py build_ext --inplace
 #################### EXTENSION Build IMAGE ####################
 
 #################### FLASH_ATTENTION Build IMAGE ####################
@@ -97,7 +102,7 @@ RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip instal
 
 #################### RUNTIME BASE IMAGE ####################
 # We used base cuda image because pytorch installs its own cuda libraries.
-# However cupy depends on cuda libraries so we had to switch to the runtime image
+# However pynccl depends on cuda libraries so we had to switch to the runtime image
 # In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
 FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
 
@@ -131,5 +136,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 COPY --from=build /workspace/vllm/*.so /workspace/vllm/
 COPY vllm vllm
 
+ENV VLLM_USAGE_SOURCE production-docker-image
+
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 #################### OPENAI API SERVER ####################
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
@@ -0,0 +1,20 @@
+# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
+
+FROM ubuntu:22.04
+
+RUN apt-get update  -y \
+    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+RUN pip install --upgrade pip \
+    && pip install wheel packaging ninja setuptools>=49.4.0 numpy
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+CMD ["/bin/bash"]
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
@@ -78,23 +78,6 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
 RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
     rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
 
-# build cupy
-RUN if [ "$BUILD_CUPY" = "1" ]; then \
-    mkdir -p libs \
-    && cd libs \
-    && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \
-    && cd cupy \
-    && pip install mpi4py-mpich \
-    && pip install scipy==1.9.3 \
-    && pip install cython==0.29.* \
-    && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
-    && export CUPY_INSTALL_USE_HIP=1 \
-    && export ROCM_HOME=/opt/rocm \
-    && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \
-    && pip install . \
-    && cd ..; \
-    fi
-
 COPY ./ /app/vllm
 
 RUN python3 -m pip install --upgrade pip
@@ -110,6 +93,6 @@ RUN cd /app \
     && cd ..
 
 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]
+RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
 
 CMD ["/bin/bash"]
Benchmark suite	Current: `3d151aa`	Previous: `5d256f5`	Ratio
`{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`14154.261130999657` ms	`12107.788427500054` ms	`1.17`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`124.25997606894775` ms	`107.88996677468617` ms	`1.15`
`{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`112.44488433213928` ms	`96.48709741669009` ms	`1.17`
`{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`12421.510741999555` ms	`10733.578026000032` ms	`1.16`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`95.61047598259354` ms	`81.96528384593707` ms	`1.17`
`{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`98.40706089880393` ms	`83.47636292423746` ms	`1.18`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`19808.064829272684` ms	`17532.284571051987` ms	`1.13`
`{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}`	`14854.807530500693` ms	`12446.751371499886` ms	`1.19`