Fix OpenVINO vLLM build scripts and update unit test case (#446)
Signed-off-by: Zahidul Haque <[email protected]>
zahidulhaque authored Aug 12, 2024
1 parent c45f8f0 commit 91d825c
Showing 4 changed files with 155 additions and 12 deletions.
25 changes: 17 additions & 8 deletions comps/llms/text-generation/vllm-openvino/README.md
@@ -1,5 +1,10 @@
# Use vLLM with OpenVINO

vLLM powered by OpenVINO supports all LLM models from the [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on all x86-64 CPUs with at least AVX2 support. The OpenVINO vLLM backend supports the following advanced vLLM features (a brief usage sketch follows this list):

- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)

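For illustration, a sketch of how these flags could be appended to the OpenAI-compatible server command used later in this commit (the entrypoint and model name mirror the launch and test scripts below; the port and flag values are illustrative, not part of this change):

```bash
# Hypothetical server launch with both optional features enabled
python3 -m vllm.entrypoints.openai.api_server \
    --model "Intel/neural-chat-7b-v3-3" \
    --host 0.0.0.0 \
    --port 8000 \
    --enable-prefix-caching \
    --enable-chunked-prefill \
    --max-num-batched-tokens 256
```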
## Build Docker Image

To build the docker image, run the command
@@ -59,15 +64,19 @@ export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8000"
export LLM_MODEL=<model_name> # example: export LLM_MODEL="meta-llama/Llama-2-7b-hf"
```
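Once the service is running, the exported variables can be reused for a quick smoke test (a sketch; the `/v1/completions` route matches the OpenAI-compatible API exercised by the test script added in this commit):

```bash
# Hypothetical smoke test using the variables exported above
curl "${vLLM_LLM_ENDPOINT}/v1/completions" \
    -H "Content-Type: application/json" \
    -d "{\"model\": \"${LLM_MODEL}\", \"prompt\": \"What is deep learning?\", \"max_tokens\": 32}"
```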

-## Use Int-8 Weights Compression
## Performance tips

The vLLM OpenVINO backend uses the following environment variables to control its behavior:

- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV cache size (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB of space for the KV cache); a larger value allows vLLM to serve more requests in parallel. Set this parameter according to your hardware configuration and memory management pattern.

- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used, depending on the platform.

-Weights int-8 compression is disabled by default. For better performance and lower memory consumption, the weights compression can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
-To pass the variable in docker, use `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` as an additional argument to `docker run` command in the examples above.
-The variable enables weights compression logic described in [optimum-intel 8-bit weights quantization](https://huggingface.co/docs/optimum/intel/optimization_ov#8-bit).
-Hence, even if the variable is enabled, the compression is applied only for models starting with a certain size and avoids compression of too small models due to a significant accuracy drop.

- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during the model loading stage. By default, compression is turned off.

To achieve better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on experiments, the recommended batch size is `256` (`--max-num-batched-tokens`).

-## Use UInt-8 KV cache Compression
-KV cache uint-8 compression is disabled by default. For better performance and lower memory consumption, the KV cache compression can be enabled by setting the environment variable `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`.
-To pass the variable in docker, use `-e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` as an additional argument to `docker run` command in the examples above.

The OpenVINO best-known configuration is:

$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
    python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
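To carry these settings over to the containerized server used elsewhere in this repository, the variables can be forwarded with `-e` on `docker run` (a sketch modeled on the launch script below; the port mapping and model name are illustrative):

```bash
# Hypothetical: forward the OpenVINO performance variables into the vllm:openvino container
docker run -d --rm --name="vllm-openvino-server" \
    -p 8000:80 \
    --ipc=host \
    -e VLLM_OPENVINO_KVCACHE_SPACE=100 \
    -e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 \
    -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
    -v $HOME/.cache/huggingface:/root/.cache/huggingface \
    vllm:openvino /bin/bash -c \
    "python3 -m vllm.entrypoints.openai.api_server --model Intel/neural-chat-7b-v3-3 --host 0.0.0.0 --port 80"
```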
@@ -3,7 +3,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


-git clone --branch openvino-model-executor https://github.com/ilya-lavrenov/vllm.git
BASEDIR="$( cd "$( dirname "$0" )" && pwd )"
git clone https://github.com/vllm-project/vllm.git vllm
cd ./vllm/
docker build -t vllm:openvino -f Dockerfile.openvino . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
cd $BASEDIR && rm -rf vllm
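After the script completes, the build can be verified locally; the tag matches the `-t` argument above, and the import check is only a hypothetical sanity test:

```bash
# List the freshly built image
docker images vllm:openvino
# Optional: confirm the vLLM package is importable inside the image
docker run --rm --entrypoint python3 vllm:openvino -c "import vllm; print(vllm.__version__)"
```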
19 changes: 17 additions & 2 deletions comps/llms/text-generation/vllm-openvino/launch_model_server.sh
@@ -42,5 +42,20 @@ port_number=${port:-$default_port}
# Set the Huggingface cache directory variable
HF_CACHE_DIR=$HOME/.cache/huggingface

-# Start the model server using Openvino as the backend inference engine. Provide the container name that is unique and meaningful, typically one that includes the model name.
-docker run --rm --name="vllm-openvino-server" -p $port_number:$port_number -v $HF_CACHE_DIR:/root/.cache/huggingface vllm:openvino --model $model_name --port $port_number --disable-log-requests --swap-space $swap_space
# Start the model server using OpenVINO as the backend inference engine.
# Provide a container name that is unique and meaningful, typically one that includes the model name.

docker run -d --rm --name="vllm-openvino-server" \
    -p $port_number:80 \
    --ipc=host \
    -e HTTPS_PROXY=$https_proxy \
    -e HTTP_PROXY=$http_proxy \
    -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
    -v $HOME/.cache/huggingface:/root/.cache/huggingface \
    vllm:openvino /bin/bash -c "\
        cd / && \
        export VLLM_CPU_KVCACHE_SPACE=50 && \
        python3 -m vllm.entrypoints.openai.api_server \
            --model \"$model_name\" \
            --host 0.0.0.0 \
            --port 80"
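Once the container is started, readiness can be checked from the host before sending traffic (a sketch; container port 80 is published on `$port_number` above, and `/v1/models` is the same route probed by the unit test below):

```bash
# Hypothetical readiness check: poll the OpenAI-compatible /v1/models route until it responds
until curl -sf "http://localhost:${port_number}/v1/models" > /dev/null; do
    sleep 3
done
docker logs vllm-openvino-server 2>&1 | tail -n 5
```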
118 changes: 118 additions & 0 deletions tests/test_llms_text-generation_vllm-openvino.sh
@@ -0,0 +1,118 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -xe

WORKPATH="$( cd "$( dirname "$0" )" && pwd )"

# Define variables
port=8123
HF_CACHE_DIR=$HOME/.cache/huggingface
DOCKER_IMAGE="vllm:openvino"
CONTAINER_NAME="vllm-openvino-container"

function build_container() {
cd $WORKPATH
git clone https://github.com/vllm-project/vllm.git vllm-openvino
cd ./vllm-openvino/
docker build -t $DOCKER_IMAGE \
-f Dockerfile.openvino \
. \
--build-arg https_proxy=$https_proxy \
--build-arg http_proxy=$http_proxy
cd $WORKPATH
rm -rf vllm-openvino
}

# Function to start Docker container
start_container() {

docker run -d --rm --name=$CONTAINER_NAME \
    -p $port:$port \
    --ipc=host \
    -e HTTPS_PROXY=$https_proxy \
    -e HTTP_PROXY=$http_proxy \
    -v $HF_CACHE_DIR:/root/.cache/huggingface \
    vllm:openvino /bin/bash -c "\
        cd / && \
        export VLLM_CPU_KVCACHE_SPACE=50 && \
        python3 -m vllm.entrypoints.openai.api_server \
            --model \"Intel/neural-chat-7b-v3-3\" \
            --host 0.0.0.0 \
            --port $port"

# check whether service is fully ready
n=0
until [[ "$n" -ge 300 ]]; do
docker logs $CONTAINER_NAME > /tmp/$CONTAINER_NAME.log 2>&1
n=$((n+1))
if grep -q "Uvicorn running on" /tmp/$CONTAINER_NAME.log; then
break
fi
sleep 3s
done

}

# Cleanup Function
cleanup() {
# Stop and remove Docker container and images
cid=$(docker ps -aq --filter "name=$CONTAINER_NAME")
if [[ ! -z "$cid" ]]; then docker stop $cid || docker rm $cid && sleep 1s; fi
docker rmi -f $DOCKER_IMAGE
rm /tmp/$CONTAINER_NAME.log
}

# Function to test API endpoint
function test_api_endpoint {
local endpoint="$1"
local expected_status="$2"

# Make the HTTP request
if test "$1" = "v1/completions"
then
local response=$(curl "http://localhost:$port/$endpoint" \
-H "Content-Type: application/json" \
-d '{
"model": "Intel/neural-chat-7b-v3-3",
"prompt": "What is the key advantage of Openvino framework",
"max_tokens": 300,
"temperature": 0.7
}' \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
else
local response=$(curl "http://localhost:$port/$endpoint" \
--write-out '%{http_code}' \
--silent \
--output /dev/null)
fi

# Assert the response status code
if [[ "$response" -eq "$expected_status" ]]; then
echo "PASS: $endpoint returned expected status code: $expected_status"
else
echo "FAIL: $endpoint returned unexpected status code: $response (expected: $expected_status)"
fi
}
# Main function
main() {

build_container
start_container

# Sleep to allow the container to start up fully
sleep 10
# Test the /v1/models API
test_api_endpoint "v1/models" 200

# Test the /v1/completions API
test_api_endpoint "v1/completions" 200

cleanup
}

# Call main function
main
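To exercise the new test locally, running it from the repository root should suffice; this is a sketch that assumes Docker is installed and the machine can reach GitHub and Hugging Face (the script builds the image, starts the server, probes both endpoints, and cleans up after itself):

```bash
bash tests/test_llms_text-generation_vllm-openvino.sh
```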
