diff --git a/.github/workflows/_helm-e2e.yaml b/.github/workflows/_helm-e2e.yaml index 0062127b0..9deff4af6 100644 --- a/.github/workflows/_helm-e2e.yaml +++ b/.github/workflows/_helm-e2e.yaml @@ -65,7 +65,7 @@ jobs: echo "CHART_NAME=$CHART_NAME" >> $GITHUB_ENV echo "RELEASE_NAME=${CHART_NAME}$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV echo "NAMESPACE=${CHART_NAME}-$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV - echo "ROLLOUT_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV + echo "ROLLOUT_TIMEOUT_SECONDS=1200s" >> $GITHUB_ENV echo "TEST_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV echo "KUBECTL_TIMEOUT_SECONDS=60s" >> $GITHUB_ENV echo "should_cleanup=false" >> $GITHUB_ENV diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml index 7d22d2811..e9a0d00fa 100644 --- a/helm-charts/chatqna/Chart.yaml +++ b/helm-charts/chatqna/Chart.yaml @@ -18,6 +18,19 @@ dependencies: - name: tgi version: 1.0.0 repository: "file://../common/tgi" + condition: tgi.enabled + - name: vllm + version: 1.0.0 + repository: "file://../common/vllm" + condition: vllm.enabled + - name: llm-uservice + version: 1.0.0 + repository: "file://../common/llm-uservice" + condition: tgi.enabled + - name: llm-ctrl-uservice + version: 1.0.0 + repository: "file://../common/llm-ctrl-uservice" + condition: vllm.enabled - name: tei version: 1.0.0 repository: "file://../common/tei" diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index a591ff1e6..e329c105f 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -9,37 +9,91 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following servi - [redis-vector-db](../common/redis-vector-db/README.md) - [reranking-usvc](../common/reranking-usvc/README.md) - [teirerank](../common/teirerank/README.md) -- [llm-uservice](../common/llm-uservice/README.md) -- [tgi](../common/tgi/README.md) + +For LLM inference, two more microservices will be required. We can either use [TGI](https://github.com/huggingface/text-generation-inference) or [vLLM](https://github.com/vllm-project/vllm) as our LLM backend. Depending on that, we will have following microservices as part of dependencies for ChatQnA application. + +1. For using **TGI** as an inference service, following 2 microservices will be required: + + - [llm-uservice](../common/llm-uservice/README.md) + - [tgi](../common/tgi/README.md) + +2. For using **vLLM** as an inference service, following 2 microservices would be required: + + - [llm-ctrl-uservice](../common/llm-ctrl-uservice/README.md) + - [vllm](../common/vllm/README.md) + +> **_NOTE :_** We shouldn't have both inference engine deployed. It is required to only setup either of them. To achieve this, conditional flags are added in the chart dependency. We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. ## Installing the Chart -To install the chart, run the following: +Please follow the following steps to install the ChatQnA Chart: + +1. Clone the GenAIInfra repository: + +```bash +git clone https://github.com/opea-project/GenAIInfra.git +``` + +2. Setup the dependencies and required environment variables: -```console +```bash cd GenAIInfra/helm-charts/ ./update_dependency.sh helm dependency update chatqna export HFTOKEN="insert-your-huggingface-token-here" export MODELDIR="/mnt/opea-models" export MODELNAME="Intel/neural-chat-7b-v3-3" +``` + +3. 
Depending on the target device for running ChatQnA, use one of the following installation commands:
+
+```bash
+# Install the chart on a Xeon machine
+
 # If you would like to use the traditional UI, please change the image as well as the containerport within the values
 # append these at the end of the command "--set chatqna-ui.image.repository=opea/chatqna-ui,chatqna-ui.image.tag=latest,chatqna-ui.containerPort=5173"
+
 helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
+```
+
+```bash
 # To use Gaudi device
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml
+helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml
+```
+
+```bash
 # To use Nvidia GPU
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml
+helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml
+```
+
+```bash
 # To include guardrail component in chatqna on Xeon
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml
+helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml
+```
+
+```bash
 # To include guardrail component in chatqna on Gaudi
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml
+helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml
+```
+
+> **_NOTE :_** The default installation uses [TGI (Text Generation Inference)](https://github.com/huggingface/text-generation-inference) as the inference engine. To use vLLM as the inference engine instead, see below.
+
+```bash
+# To use vLLM inference engine on XEON device
+
+helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tgi.enabled=false --set vllm.enabled=true
+
+# To use OpenVINO optimized vLLM inference engine on XEON device
+
+helm install -f ./chatqna/vllm-openvino-values.yaml chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tgi.enabled=false --set vllm.enabled=true
 ```
 
 ### IMPORTANT NOTE
 
 1. Make sure your `MODELDIR` exists on the node where your workload is scheduled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model.
+2. Please set `http_proxy`, `https_proxy` and `no_proxy` values while installing the chart if you are behind a proxy.
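For example, the proxy settings can be passed through the chart's `global` values at install time. This is a minimal sketch; the proxy URLs below are placeholders for your own environment, and note that commas inside a `--set` value must be escaped with `\,`:

```bash
# Placeholder proxy endpoints; replace with the values used in your environment
helm install chatqna chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --set global.modelUseHostPath=${MODELDIR} \
  --set tgi.LLM_MODEL_ID=${MODELNAME} \
  --set global.http_proxy=http://proxy.example.com:8080 \
  --set global.https_proxy=http://proxy.example.com:8080 \
  --set global.no_proxy="localhost\,127.0.0.1\,.svc.cluster.local"
```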
+ ## Verify To verify the installation, run the command `kubectl get pod` to make sure all pods are running. @@ -52,8 +106,9 @@ Run the command `kubectl port-forward svc/chatqna 8888:8888` to expose the servi Open another terminal and run the following command to verify the service if working: -```console +```bash curl http://localhost:8888/v1/chatqna \ + -X POST \ -H "Content-Type: application/json" \ -d '{"messages": "What is the revenue of Nike in 2023?"}' ``` @@ -71,12 +126,13 @@ Open a browser to access `http://:${port}` to play with the ## Values -| Key | Type | Default | Description | -| ----------------- | ------ | ----------------------------- | -------------------------------------------------------------------------------------- | -| image.repository | string | `"opea/chatqna"` | | -| service.port | string | `"8888"` | | -| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.monitoring | bop; | false | Enable usage metrics for the service components. See ../monitoring.md before enabling! | +| Key | Type | Default | Description | +| -------------------------- | ------ | ----------------------------- | -------------------------------------------------------------------------------------- | +| image.repository | string | `"opea/chatqna"` | | +| service.port | string | `"8888"` | | +| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| vllm-openvino.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.monitoring | bop; | false | Enable usage metrics for the service components. See ../monitoring.md before enabling! | ## Troubleshooting diff --git a/helm-charts/chatqna/ci-vllm-openvino-values.yaml b/helm-charts/chatqna/ci-vllm-openvino-values.yaml new file mode 100644 index 000000000..653953d3d --- /dev/null +++ b/helm-charts/chatqna/ci-vllm-openvino-values.yaml @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: false + +vllm: + enabled: true + openvino_enabled: true + image: + repository: opea/vllm-openvino + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. 
+ tag: "latest" + + extraCmdArgs: [] + + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + + CUDA_GRAPHS: "0" + VLLM_CPU_KVCACHE_SPACE: 50 + VLLM_OPENVINO_KVCACHE_SPACE: 32 + OMPI_MCA_btl_vader_single_copy_mechanism: none + + ov_command: ["/bin/bash"] diff --git a/helm-charts/chatqna/ci-vllm-values.yaml b/helm-charts/chatqna/ci-vllm-values.yaml new file mode 100644 index 000000000..d16040d28 --- /dev/null +++ b/helm-charts/chatqna/ci-vllm-values.yaml @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: false + +vllm: + enabled: true diff --git a/helm-charts/chatqna/templates/deployment.yaml b/helm-charts/chatqna/templates/deployment.yaml index 812d38486..a1e5e11d3 100644 --- a/helm-charts/chatqna/templates/deployment.yaml +++ b/helm-charts/chatqna/templates/deployment.yaml @@ -33,12 +33,25 @@ spec: containers: - name: {{ .Release.Name }} env: + {{- if .Values.vllm.enabled }} + - name: LLM_SERVICE_HOST_IP + value: {{ .Release.Name }}-llm-ctrl-uservice + - name: LLM_SERVER_HOST_IP + value: {{ .Release.Name }}-vllm + - name: LLM_MODEL + value: {{ .Values.vllm.LLM_MODEL_ID | quote }} + {{- else }} + - name: LLM_SERVICE_HOST_IP + value: {{ .Release.Name }}-llm-uservice - name: LLM_SERVER_HOST_IP value: {{ .Release.Name }}-tgi - - name: LLM_SERVER_PORT - value: "80" - name: LLM_MODEL value: {{ .Values.tgi.LLM_MODEL_ID | quote }} + {{- end }} + - name: RERANK_SERVICE_HOST_IP + value: {{ .Release.Name }}-reranking-usvc + - name: LLM_SERVER_PORT + value: "80" - name: RERANK_SERVER_HOST_IP value: {{ .Release.Name }}-teirerank - name: RERANK_SERVER_PORT diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index d0f64f3e0..182cfc656 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -22,6 +22,14 @@ nginx: service: type: NodePort +imagePullSecrets: [] + +podAnnotations: {} + +podSecurityContext: {} + +resources: {} + securityContext: readOnlyRootFilesystem: true allowPrivilegeEscalation: false @@ -47,6 +55,14 @@ horizontalPodAutoscaler: # Override values in specific subcharts tgi: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + enabled: true + +vllm: + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + enabled: false + +llm-ctrl-uservice: + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 # disable guardrails-usvc by default # See guardrails-values.yaml for guardrail related options @@ -66,9 +82,9 @@ global: https_proxy: "" no_proxy: "" HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + # set modelUseHostPath or modelUsePVC to use model cache. modelUseHostPath: "" - # modelUseHostPath: /mnt/opea-models # modelUsePVC: model-volume # Install Prometheus serviceMonitors for service components diff --git a/helm-charts/chatqna/vllm-openvino-values.yaml b/helm-charts/chatqna/vllm-openvino-values.yaml new file mode 100644 index 000000000..4097b0ee4 --- /dev/null +++ b/helm-charts/chatqna/vllm-openvino-values.yaml @@ -0,0 +1,21 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +vllm: + openvino_enabled: true + image: + repository: opea/vllm-openvino + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. 
+ tag: "latest" + + extraCmdArgs: [] + + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + + CUDA_GRAPHS: "0" + VLLM_CPU_KVCACHE_SPACE: 50 + VLLM_OPENVINO_KVCACHE_SPACE: 32 + OMPI_MCA_btl_vader_single_copy_mechanism: none + + ov_command: ["/bin/bash"] diff --git a/helm-charts/common/llm-ctrl-uservice/.helmignore b/helm-charts/common/llm-ctrl-uservice/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts/common/llm-ctrl-uservice/Chart.yaml b/helm-charts/common/llm-ctrl-uservice/Chart.yaml new file mode 100644 index 000000000..bb10012ea --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/Chart.yaml @@ -0,0 +1,14 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: llm-ctrl-uservice +description: A Helm chart for LLM controller microservice which connects with vLLM microservice to provide inferences. +type: application +version: 1.0.0 +appVersion: "v1.0" +dependencies: + - name: vllm + version: 1.0.0 + repository: file://../vllm + condition: vllm.enabled diff --git a/helm-charts/common/llm-ctrl-uservice/README.md b/helm-charts/common/llm-ctrl-uservice/README.md new file mode 100644 index 000000000..b74190ca3 --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/README.md @@ -0,0 +1,91 @@ +# llm-ctrl Microservice + +Helm chart for deploying LLM controller microservice which facilitates connections and handles responses from OpenVINO vLLM microservice. + +`llm-ctrl-uservice` depends on vLLM microservice. You should properly set `vLLM_ENDPOINT` as the HOST URI of vLLM microservice. If not set, it will consider the default value : `http://-vllm:80` + +As this service depends on vLLM microservice, we can proceed in either of 2 ways: + +- Install both microservices individually. +- Install the vLLM microservice as dependency for `llm-ctrl-uservice` microservice. + +## (Option 1): Installing the charts individually: + +First, you need to install the `vllm` chart, please refer to the [vllm](../vllm) chart for more information. + +After you've deployed the `vllm` chart successfully, please run `kubectl get svc` to get the vLLM service name with port. We need to provide this to `llm-ctrl-uservice` as a value for vLLM_ENDPOINT for letting it discover and connect to the vLLM microservice. + +> **_NOTE:_** While installing charts separately, if you don't provide any vLLM endpoint explicitly, it will take the default endpoint as `http://-vllm:80`. So, if you are not providing the vLLM endpoint explicitly, please make sure to provide same helm release name to both the charts while installing. + +Get the service name for vLLM deployment by running: `kubectl get svc`. In the current case, service name would be `myvllm`. + +> **_NOTE:_** Please add the service name for vLLM to the value of no_proxy env var, if you are behind a proxy. 
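For instance, if the vLLM chart was installed under the release name `myvllm`, the service can be confirmed and appended to `no_proxy` roughly as follows (a sketch; adjust the release name to your own deployment):

```bash
# Confirm the service created by the vllm chart
kubectl get svc | grep vllm

# Add the vLLM service name to no_proxy so requests to it bypass the proxy
export no_proxy="${no_proxy},myvllm"
```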
+ +To install the chart, run the following: + +```bash +cd GenAIInfra/helm-charts/common/llm-ctrl-uservice +export HFTOKEN="insert-your-huggingface-token-here" +export vLLM_ENDPOINT="http://myvllm" +export MODELNAME="Intel/neural-chat-7b-v3-3" + +# If proxy is required, please export the appropriate proxy values. +export http_proxy= +export https_proxy= + +helm dependency update +helm install llm-ctrl-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vLLM_ENDPOINT=${vLLM_ENDPOINT} --set LLM_MODEL_ID=${MODELNAME} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +``` + +## (Option 2): Installing the chart with automatic installation of dependency: + +```bash +cd GenAIInfra/helm-charts/common/llm-ctrl-uservice +export HFTOKEN="insert-your-huggingface-token-here" +export MODELDIR="/mnt/opea-models" +export MODELNAME="Intel/neural-chat-7b-v3-3" + +# If proxy is required, please export the appropriate proxy values. +export http_proxy= +export https_proxy= + +helm dependency update +helm install llm-ctrl-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set vllm.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +``` + +`--wait` flag in above installation command will make sure that all the dependencies are resolved and all services are deployed. + +## Verify + +To verify the installation, run the following command to make sure all pods are running. + +```bash +kubectl get pod +``` + +Once you see `llm-ctrl-uservice` pod and `llm-ctrl-uservice-vllm` pod in ready and running state, run the following command: + +```bash +kubectl port-forward svc/llm-ctrl-uservice 9000:9000 +``` + +This exposes the port 9000, on which `llm-ctrl-uservice` is running inside the pod, at port 9000 on the host. + +Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `llm-ctrl-uservice` is working: + +```bash +curl http://localhost:9000/v1/chat/completions \ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -H 'Content-Type: application/json' +``` + +## Values + +| Key | Type | Default | Description | +| ------------------------------- | ------ | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, vLLM will not download if the model is cached here. 
The host path "modelUseHostPath" will be mounted to container as /data directory | +| image.repository | string | `"opea/llm-vllm"` | | +| service.port | string | `"9000"` | | +| vLLM_ENDPOINT | string | `""` | OpenVINO vLLM service endpoint | diff --git a/helm-charts/common/llm-ctrl-uservice/ci-values.yaml b/helm-charts/common/llm-ctrl-uservice/ci-values.yaml new file mode 100644 index 000000000..763f5c3f2 --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/ci-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +vllm: + enabled: true diff --git a/helm-charts/common/llm-ctrl-uservice/templates/_helpers.tpl b/helm-charts/common/llm-ctrl-uservice/templates/_helpers.tpl new file mode 100644 index 000000000..3cf82f83a --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "llm-ctrl-uservice.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "llm-ctrl-uservice.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "llm-ctrl-uservice.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "llm-ctrl-uservice.labels" -}} +helm.sh/chart: {{ include "llm-ctrl-uservice.chart" . }} +{{ include "llm-ctrl-uservice.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "llm-ctrl-uservice.selectorLabels" -}} +app.kubernetes.io/name: {{ include "llm-ctrl-uservice.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "llm-ctrl-uservice.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "llm-ctrl-uservice.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts/common/llm-ctrl-uservice/templates/configmap.yaml b/helm-charts/common/llm-ctrl-uservice/templates/configmap.yaml new file mode 100644 index 000000000..4bc0fcea4 --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/templates/configmap.yaml @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "llm-ctrl-uservice.fullname" . }}-config + labels: + {{- include "llm-ctrl-uservice.labels" . 
| nindent 4 }} +data: + {{- if .Values.vLLM_ENDPOINT }} + vLLM_ENDPOINT: {{ .Values.vLLM_ENDPOINT | quote}} + {{- else }} + vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm" + {{- end }} + LLM_MODEL: {{ .Values.LLM_MODEL_ID | quote }} + HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} + HF_HOME: "/tmp/.cache/huggingface" + {{- if .Values.global.HF_ENDPOINT }} + HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} + {{- end }} + http_proxy: {{ .Values.global.http_proxy | quote }} + https_proxy: {{ .Values.global.https_proxy | quote }} + {{- if and (not .Values.vLLM_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }} + no_proxy: "{{ .Release.Name }}-vllm,{{ .Values.global.no_proxy }}" + {{- else }} + no_proxy: "{{ .Values.global.no_proxy }},myvllm,vllm" + {{- end }} + LANGCHAIN_TRACING_V2: {{ .Values.global.LANGCHAIN_TRACING_V2 | quote }} + LANGCHAIN_API_KEY: {{ .Values.global.LANGCHAIN_API_KEY }} + LANGCHAIN_PROJECT: "opea-llm-uservice" + HF_HUB_DISABLE_PROGRESS_BARS: {{ .Values.HF_HUB_DISABLE_PROGRESS_BARS | quote }} + HF_HUB_ENABLE_HF_TRANSFER: {{ .Values.HF_HUB_ENABLE_HF_TRANSFER | quote }} diff --git a/helm-charts/common/llm-ctrl-uservice/templates/deployment.yaml b/helm-charts/common/llm-ctrl-uservice/templates/deployment.yaml new file mode 100644 index 000000000..9020a59de --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/templates/deployment.yaml @@ -0,0 +1,82 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-ctrl-uservice.fullname" . }} + labels: + {{- include "llm-ctrl-uservice.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "llm-ctrl-uservice.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "llm-ctrl-uservice.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + envFrom: + - configMapRef: + name: {{ include "llm-ctrl-uservice.fullname" . }}-config + {{- if .Values.global.extraEnvConfig }} + - configMapRef: + name: {{ .Values.global.extraEnvConfig }} + optional: true + {{- end }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: llm-vllm + containerPort: {{ .Values.service.targetPort }} + protocol: TCP + {{- if .Values.livenessProbe }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.readinessProbe }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + {{- end }} + {{- if .Values.startupProbe }} + startupProbe: + {{- toYaml .Values.startupProbe | nindent 12 }} + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - mountPath: /tmp + name: tmp + volumes: + - name: tmp + emptyDir: {} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helm-charts/common/llm-ctrl-uservice/templates/service.yaml b/helm-charts/common/llm-ctrl-uservice/templates/service.yaml new file mode 100644 index 000000000..ef5ae38fb --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/templates/service.yaml @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "llm-ctrl-uservice.fullname" . }} + labels: + {{- include "llm-ctrl-uservice.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} + protocol: TCP + name: llm-vllm + selector: + {{- include "llm-ctrl-uservice.selectorLabels" . | nindent 4 }} diff --git a/helm-charts/common/llm-ctrl-uservice/templates/tests/test-pod.yaml b/helm-charts/common/llm-ctrl-uservice/templates/tests/test-pod.yaml new file mode 100644 index 000000000..a0d86f3da --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/templates/tests/test-pod.yaml @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "llm-ctrl-uservice.fullname" . }}-testpod" + labels: + {{- include "llm-ctrl-uservice.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: curl + image: python:3.10.14 + command: ['bash', '-c'] + args: + - | + max_retry=20; + for ((i=1; i<=max_retry; i++)); do + curl http://{{ include "llm-ctrl-uservice.fullname" . }}:{{ .Values.service.port }}/v1/chat/completions -sS --fail-with-body \ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -H 'Content-Type: application/json' && break; + curlcode=$? + if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi; + done; + if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi + restartPolicy: Never diff --git a/helm-charts/common/llm-ctrl-uservice/values.yaml b/helm-charts/common/llm-ctrl-uservice/values.yaml new file mode 100644 index 000000000..b4f2521e8 --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/values.yaml @@ -0,0 +1,104 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for llm-ctrl-uservice. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 +vLLM_ENDPOINT: "" +HF_HUB_DISABLE_PROGRESS_BARS: 1 +HF_HUB_ENABLE_HF_TRANSFER: 0 + +image: + repository: opea/llm-vllm + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: + readOnlyRootFilesystem: false + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + + +service: + type: ClusterIP + port: 9000 + targetPort: 9000 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. 
This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: v1/health_check + port: llm-vllm + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 24 +readinessProbe: + httpGet: + path: v1/health_check + port: llm-vllm + initialDelaySeconds: 5 + periodSeconds: 5 +startupProbe: + httpGet: + path: v1/health_check + port: llm-vllm + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 + + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +# Model ID to be used by llm-vllm microservice +LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" + +# Overriding the Model ID being used by vllm-openvino service.(As llm-vllm microservice depends on vllm-openvino, these 2 values should be same.) +vllm: + enabled: false + LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" + +global: + http_proxy: "" + https_proxy: "" + no_proxy: "" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + LANGCHAIN_TRACING_V2: false + LANGCHAIN_API_KEY: "insert-your-langchain-key-here" + # set modelUseHostPath to host directory if you want to use hostPath volume for model storage + # comment out modeluseHostPath if you want to download the model from huggingface + # modelUseHostPath: "" + modelUseHostPath: "" diff --git a/helm-charts/common/vllm/README.md b/helm-charts/common/vllm/README.md index 0235a7443..d366667be 100644 --- a/helm-charts/common/vllm/README.md +++ b/helm-charts/common/vllm/README.md @@ -10,23 +10,43 @@ To install the chart, run the following: Note that you cannot use vllm as the service release name due to [environment variables conflict](https://docs.vllm.ai/en/stable/serving/env_vars.html#environment-variables). -```console +```bash cd GenAIInfra/helm-charts/common export MODELDIR=/mnt/opea-models export MODELNAME="Intel/neural-chat-7b-v3-3" export HFTOKEN="insert-your-huggingface-token-here" + +# If you are behind a proxy, please export the appropriate proxy values. +export http_proxy= +export https_proxy= + +``` + +- Deploy on XEON device: + +```bash helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -# To deploy on Gaudi enabled kubernetes cluster -# helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values gaudi-values.yaml ``` -By default, the vllm service will downloading the "Intel/neural-chat-7b-v3-3". +- To deploy on Gaudi enabled Kubernetes cluster: -If you already cached the model locally, you can pass it to container like this example: +```bash +helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values vllm/gaudi-values.yaml +``` + +- To deploy OpenVINO optimized vLLM on XEON device: + +```bash +helm -f vllm/openvino-values.yaml install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} +``` + +By default, the vLLM service will download "Intel/neural-chat-7b-v3-3" model. 
If you already cached the model locally, you can pass it to container like this example: +```bash MODELDIR=/mnt/opea-models MODELNAME="facebook/opt-125m" +``` ## Verify diff --git a/helm-charts/common/vllm/ci-openvino-values.yaml b/helm-charts/common/vllm/ci-openvino-values.yaml new file mode 120000 index 000000000..81b2b0484 --- /dev/null +++ b/helm-charts/common/vllm/ci-openvino-values.yaml @@ -0,0 +1 @@ +openvino-values.yaml \ No newline at end of file diff --git a/helm-charts/common/vllm/openvino-values.yaml b/helm-charts/common/vllm/openvino-values.yaml new file mode 100644 index 000000000..5e72d5b00 --- /dev/null +++ b/helm-charts/common/vllm/openvino-values.yaml @@ -0,0 +1,23 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Values for OpenVINO optimized vLLM. + +openvino_enabled: true + +image: + repository: opea/vllm-openvino + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "latest" + +extraCmdArgs: [] + +LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + +CUDA_GRAPHS: "0" +VLLM_CPU_KVCACHE_SPACE: 50 +VLLM_OPENVINO_KVCACHE_SPACE: 32 +OMPI_MCA_btl_vader_single_copy_mechanism: none + +ov_command: ["/bin/bash"] diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml index 80b9a97da..c794b6cb0 100644 --- a/helm-charts/common/vllm/templates/configmap.yaml +++ b/helm-charts/common/vllm/templates/configmap.yaml @@ -8,10 +8,17 @@ metadata: labels: {{- include "vllm.labels" . | nindent 4 }} data: + {{- if .Values.openvino_enabled }} + MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }} + PORT: {{ .Values.port | quote }} + {{- end }} HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} {{- if .Values.global.HF_ENDPOINT }} HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} {{- end }} + {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }} + OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote }} + {{- end}} http_proxy: {{ .Values.global.http_proxy | quote }} https_proxy: {{ .Values.global.https_proxy | quote }} no_proxy: {{ .Values.global.no_proxy | quote }} @@ -23,3 +30,9 @@ data: {{- if .Values.VLLM_CPU_KVCACHE_SPACE }} VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}} {{- end }} + {{- if .Values.VLLM_OPENVINO_KVCACHE_SPACE }} + VLLM_OPENVINO_KVCACHE_SPACE: {{ .Values.VLLM_OPENVINO_KVCACHE_SPACE | quote }} + {{- end }} + {{- if .Values.CUDA_GRAPHS }} + CUDA_GRAPHS: {{ .Values.CUDA_GRAPHS | quote }} + {{- end }} diff --git a/helm-charts/common/vllm/templates/deployment.yaml b/helm-charts/common/vllm/templates/deployment.yaml index 14c65f76b..799e32c83 100644 --- a/helm-charts/common/vllm/templates/deployment.yaml +++ b/helm-charts/common/vllm/templates/deployment.yaml @@ -45,7 +45,20 @@ spec: {{- end }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- if .Values.openvino_enabled }} + command: {{ .Values.ov_command }} + {{- end }} args: + {{- if .Values.openvino_enabled }} + - "-c" + - | + cd / && \ + python3 -m vllm.entrypoints.openai.api_server \ + --model {{ .Values.LLM_MODEL_ID | quote }} \ + --host 0.0.0.0 \ + --port {{ .Values.port | quote }} \ + --download-dir /data + {{- else }} {{- if .Values.extraCmdArgs }} {{- range .Values.extraCmdArgs }} - {{ . 
| quote }} @@ -59,6 +72,7 @@ spec: - {{ .Values.port | quote }} - "--download-dir" - "/data" + {{- end }} volumeMounts: - mountPath: /data name: model-volume
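As a quick sanity check of the new OpenVINO branch in the vLLM deployment template, the chart can be rendered locally to confirm that the `command`/`args` wiring resolves as expected. A hedged sketch, assuming it is run from `helm-charts/common` with a dummy token:

```bash
# Render the chart with the OpenVINO values and inspect the generated container spec
helm template myvllm vllm -f vllm/openvino-values.yaml \
  --set global.HUGGINGFACEHUB_API_TOKEN=dummy \
  | grep -A 15 "containers:"
```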