From df8c195a4429a8fbc405f717fb3e33b3d76d3a2d Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Thu, 5 Sep 2024 19:46:27 +0530 Subject: [PATCH 01/37] =?UTF-8?q?=E2=9C=A8=20Added=20chart=20for=20vllm-op?= =?UTF-8?q?envino?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/common/vllm-openvino/.helmignore | 23 ++++ helm-charts/common/vllm-openvino/Chart.yaml | 9 ++ .../vllm-openvino/templates/_helpers.tpl | 62 +++++++++ .../vllm-openvino/templates/configmap.yaml | 35 +++++ .../vllm-openvino/templates/deployment.yaml | 113 ++++++++++++++++ .../common/vllm-openvino/templates/hpa.yaml | 49 +++++++ .../vllm-openvino/templates/service.yaml | 18 +++ .../templates/servicemonitor.yaml | 18 +++ helm-charts/common/vllm-openvino/values.yaml | 128 ++++++++++++++++++ 9 files changed, 455 insertions(+) create mode 100644 helm-charts/common/vllm-openvino/.helmignore create mode 100644 helm-charts/common/vllm-openvino/Chart.yaml create mode 100644 helm-charts/common/vllm-openvino/templates/_helpers.tpl create mode 100644 helm-charts/common/vllm-openvino/templates/configmap.yaml create mode 100644 helm-charts/common/vllm-openvino/templates/deployment.yaml create mode 100644 helm-charts/common/vllm-openvino/templates/hpa.yaml create mode 100644 helm-charts/common/vllm-openvino/templates/service.yaml create mode 100644 helm-charts/common/vllm-openvino/templates/servicemonitor.yaml create mode 100644 helm-charts/common/vllm-openvino/values.yaml diff --git a/helm-charts/common/vllm-openvino/.helmignore b/helm-charts/common/vllm-openvino/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/helm-charts/common/vllm-openvino/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts/common/vllm-openvino/Chart.yaml b/helm-charts/common/vllm-openvino/Chart.yaml new file mode 100644 index 000000000..1c6ce08df --- /dev/null +++ b/helm-charts/common/vllm-openvino/Chart.yaml @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: vllm-openvino +description: A Helm chart for OpenVINO optimized vLLM Service +type: application +version: 0.9.0 +appVersion: "v0.9" diff --git a/helm-charts/common/vllm-openvino/templates/_helpers.tpl b/helm-charts/common/vllm-openvino/templates/_helpers.tpl new file mode 100644 index 000000000..7c16c1206 --- /dev/null +++ b/helm-charts/common/vllm-openvino/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "vllm-openvino.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "vllm-openvino.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "vllm-openvino.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "vllm-openvino.labels" -}} +helm.sh/chart: {{ include "vllm-openvino.chart" . }} +{{ include "vllm-openvino.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "vllm-openvino.selectorLabels" -}} +app.kubernetes.io/name: {{ include "vllm-openvino.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "vllm-openvino.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "vllm-openvino.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts/common/vllm-openvino/templates/configmap.yaml b/helm-charts/common/vllm-openvino/templates/configmap.yaml new file mode 100644 index 000000000..ec9a6b268 --- /dev/null +++ b/helm-charts/common/vllm-openvino/templates/configmap.yaml @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "vllm-openvino.fullname" . }}-config + labels: + {{- include "vllm-openvino.labels" . 
| nindent 4 }} +data: + MODEL_ID: {{ .Values.global.LLM_MODEL_ID | quote }} + PORT: {{ .Values.service.port | quote }} + HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} + VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote }} + HABANA_VISIBLE_DEVICES : {{ .Values.HABANA_VISIBLE_DEVICES | quote }} + OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote }} + {{- if .Values.global.HF_ENDPOINT }} + HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} + {{- end }} + http_proxy: {{ .Values.global.http_proxy | quote }} + https_proxy: {{ .Values.global.https_proxy | quote }} + no_proxy: {{ .Values.global.no_proxy | quote }} + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + TRANSFORMERS_CACHE: "/tmp/transformers_cache" + HF_HOME: "/tmp/.cache/huggingface" + {{- if .Values.MAX_INPUT_LENGTH }} + MAX_INPUT_LENGTH: {{ .Values.MAX_INPUT_LENGTH | quote }} + {{- end }} + {{- if .Values.MAX_TOTAL_TOKENS }} + MAX_TOTAL_TOKENS: {{ .Values.MAX_TOTAL_TOKENS | quote }} + {{- end }} + {{- if .Values.CUDA_GRAPHS }} + CUDA_GRAPHS: {{ .Values.CUDA_GRAPHS | quote }} + {{- end }} diff --git a/helm-charts/common/vllm-openvino/templates/deployment.yaml b/helm-charts/common/vllm-openvino/templates/deployment.yaml new file mode 100644 index 000000000..d8bde1a4c --- /dev/null +++ b/helm-charts/common/vllm-openvino/templates/deployment.yaml @@ -0,0 +1,113 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "vllm-openvino.fullname" . }} + labels: + {{- include "vllm-openvino.labels" . | nindent 4 }} +spec: + {{- if not .Values.global.horizontalPodAutoscaler.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "vllm-openvino.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "vllm-openvino.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + envFrom: + - configMapRef: + name: {{ include "vllm-openvino.fullname" . }}-config + {{- if .Values.global.extraEnvConfig }} + - configMapRef: + name: {{ .Values.global.extraEnvConfig }} + optional: true + {{- end }} + securityContext: + {{- if .Values.global.modelUseHostPath }} + {} + {{- else }} + {{- toYaml .Values.securityContext | nindent 12 }} + {{- end }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.targetPort }} + protocol: TCP + {{- if .Values.livenessProbe}} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + {{ end }} + {{- if .Values.readinessProbe}} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + {{ end }} + {{- if .Values.startupProbe }} + startupProbe: + {{- toYaml .Values.startupProbe | nindent 12 }} + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.volumeMounts }} + volumeMounts: + {{- toYaml . 
| nindent 12 }} + {{- end }} + command: ["/bin/bash"] + args: + - "-c" + - | + cd / && \ + python3 -m vllm.entrypoints.openai.api_server \ + --model "{{ .Values.global.LLM_MODEL_ID }}" \ + --host 0.0.0.0 \ + --port 80 + volumes: + - name: model-volume + {{- if .Values.global.modelUsePVC }} + persistentVolumeClaim: + claimName: {{ .Values.global.modelUsePVC }} + {{- else if .Values.global.modelUseHostPath }} + hostPath: + path: {{ .Values.global.modelUseHostPath }} + type: Directory + {{- else }} + emptyDir: {} + {{- end }} + - name: tmp + emptyDir: {} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if .Values.global.horizontalPodAutoscaler.enabled }} + # extra time to finish processing buffered requests before HPA forcibly terminates pod + terminationGracePeriodSeconds: 120 + {{- end }} diff --git a/helm-charts/common/vllm-openvino/templates/hpa.yaml b/helm-charts/common/vllm-openvino/templates/hpa.yaml new file mode 100644 index 000000000..62a2de8bf --- /dev/null +++ b/helm-charts/common/vllm-openvino/templates/hpa.yaml @@ -0,0 +1,49 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "vllm-openvino.fullname" . }} + labels: + {{- include "vllm-openvino.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "vllm-openvino.fullname" . }} + minReplicas: {{ .Values.horizontalPodAutoscaler.minReplicas }} + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # VLLM time metrics are in seconds + name: vllm_ov_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: {{ include "vllm-openvino.fullname" . }} + target: + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 15 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 50 + periodSeconds: 15 + - type: Pods + value: 2 + periodSeconds: 15 +{{- end }} diff --git a/helm-charts/common/vllm-openvino/templates/service.yaml b/helm-charts/common/vllm-openvino/templates/service.yaml new file mode 100644 index 000000000..89fe92e01 --- /dev/null +++ b/helm-charts/common/vllm-openvino/templates/service.yaml @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "vllm-openvino.fullname" . }} + labels: + {{- include "vllm-openvino.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} + protocol: TCP + name: http + selector: + {{- include "vllm-openvino.selectorLabels" . 
| nindent 4 }} diff --git a/helm-charts/common/vllm-openvino/templates/servicemonitor.yaml b/helm-charts/common/vllm-openvino/templates/servicemonitor.yaml new file mode 100644 index 000000000..fefa864b7 --- /dev/null +++ b/helm-charts/common/vllm-openvino/templates/servicemonitor.yaml @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +{{- if .Values.global.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "vllm-openvino.fullname" . }} +spec: + selector: + matchLabels: + {{- include "vllm-openvino.selectorLabels" . | nindent 6 }} + endpoints: + - interval: 4s + port: tgi + scheme: http +{{- end }} diff --git a/helm-charts/common/vllm-openvino/values.yaml b/helm-charts/common/vllm-openvino/values.yaml new file mode 100644 index 000000000..ec81ff14f --- /dev/null +++ b/helm-charts/common/vllm-openvino/values.yaml @@ -0,0 +1,128 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for vllm-openvino. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: vllm + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "openvino" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + + +securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + +service: + type: ClusterIP + port: 80 + targetPort: 80 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 24 +readinessProbe: + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 5 +startupProbe: + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 + +horizontalPodAutoscaler: + minReplicas: 1 + maxReplicas: 6 + # targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +# Additional volumes on the output Deployment definition. +volumes: [] +# - name: foo +# secret: +# secretName: mysecret +# optional: false + +# Additional volumeMounts on the output Deployment definition. +volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /tmp + name: tmp + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +MAX_INPUT_LENGTH: "" +MAX_TOTAL_TOKENS: "" +CUDA_GRAPHS: "0" +VLLM_CPU_KVCACHE_SPACE: 50 +HABANA_VISIBLE_DEVICES: all +OMPI_MCA_btl_vader_single_copy_mechanism: none + +global: + http_proxy: "" + https_proxy: "" + no_proxy: "" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + + # Choose where to save your downloaded models + # Set modelUseHostPath for local directory, this is good for one node test. 
Example: + # modelUseHostPath: /mnt/opea-models + # Set modelUsePVC for PersistentVolumeClaim(PVC), which is suitable for multinode deployment. Example: + # modelUsePVC: model-volume + # You can only set one of the following var, the behavior is not defined is both are set. + # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume. + modelUseHostPath: "" + modelUsePVC: "" + # Enabling HPA will: + # - Ignore above replica count, as it will be controlled by HPA + # - Add example HPA scaling rules with thresholds suitable for Xeon deployments + # - Require custom metrics ConfigMap available in the main application chart + horizontalPodAutoscaler: + enabled: false From d339c74bdf66090a46fa87899131267cf5cf206b Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Thu, 5 Sep 2024 19:53:01 +0530 Subject: [PATCH 02/37] =?UTF-8?q?=E2=9C=A8=20Added=20charts=20for=20llm-vl?= =?UTF-8?q?lm=20microservice?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- .../common/llm-vllm-uservice/.helmignore | 23 ++++ .../common/llm-vllm-uservice/Chart.yaml | 11 ++ .../llm-vllm-uservice/templates/_helpers.tpl | 62 +++++++++++ .../templates/configmap.yaml | 33 ++++++ .../templates/deployment.yaml | 79 ++++++++++++++ .../llm-vllm-uservice/templates/service.yaml | 15 +++ .../common/llm-vllm-uservice/values.yaml | 101 ++++++++++++++++++ 7 files changed, 324 insertions(+) create mode 100644 helm-charts/common/llm-vllm-uservice/.helmignore create mode 100644 helm-charts/common/llm-vllm-uservice/Chart.yaml create mode 100644 helm-charts/common/llm-vllm-uservice/templates/_helpers.tpl create mode 100644 helm-charts/common/llm-vllm-uservice/templates/configmap.yaml create mode 100644 helm-charts/common/llm-vllm-uservice/templates/deployment.yaml create mode 100644 helm-charts/common/llm-vllm-uservice/templates/service.yaml create mode 100644 helm-charts/common/llm-vllm-uservice/values.yaml diff --git a/helm-charts/common/llm-vllm-uservice/.helmignore b/helm-charts/common/llm-vllm-uservice/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts/common/llm-vllm-uservice/Chart.yaml b/helm-charts/common/llm-vllm-uservice/Chart.yaml new file mode 100644 index 000000000..67da3e01a --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +name: llm-vllm-uservice +description: A Helm chart for LLM microservice for which connects with vLLM microservice to recieve inferences. +type: application +version: 0.9.0 +appVersion: "v0.9" +dependencies: + - name: vllm-openvino + version: 0.9.0 + repository: file://../vllm-openvino + condition: autodependency.enabled \ No newline at end of file diff --git a/helm-charts/common/llm-vllm-uservice/templates/_helpers.tpl b/helm-charts/common/llm-vllm-uservice/templates/_helpers.tpl new file mode 100644 index 000000000..211968599 --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. 
+*/}} +{{- define "llm-vllm-uservice.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "llm-vllm-uservice.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "llm-vllm-uservice.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "llm-vllm-uservice.labels" -}} +helm.sh/chart: {{ include "llm-vllm-uservice.chart" . }} +{{ include "llm-vllm-uservice.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "llm-vllm-uservice.selectorLabels" -}} +app.kubernetes.io/name: {{ include "llm-vllm-uservice.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "llm-vllm-uservice.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "llm-vllm-uservice.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml b/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml new file mode 100644 index 000000000..98b8615b0 --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "llm-vllm-uservice.fullname" . }}-config + labels: + {{- include "llm-vllm-uservice.labels" . 
| nindent 4 }} +data: + {{- if .Values.vLLM_ENDPOINT }} + vLLM_ENDPOINT: {{ .Values.vLLM_ENDPOINT | quote}} + {{- else }} + vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm-openvino" + {{- end }} + LLM_MODEL: {{ .Values.global.LLM_MODEL_ID | quote }} + HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} + HF_HOME: "/tmp/.cache/huggingface" + {{- if .Values.global.HF_ENDPOINT }} + HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} + {{- end }} + http_proxy: {{ .Values.global.http_proxy | quote }} + https_proxy: {{ .Values.global.https_proxy | quote }} + {{- if and (not .Values.vLLM_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }} + no_proxy: "{{ .Release.Name }}-vllm-openvino,{{ .Values.global.no_proxy }}" + {{- else }} + no_proxy: "{{ .Values.global.no_proxy }}, vllm-openvino" + {{- end }} + LANGCHAIN_TRACING_V2: {{ .Values.global.LANGCHAIN_TRACING_V2 | quote }} + LANGCHAIN_API_KEY: {{ .Values.global.LANGCHAIN_API_KEY }} + LANGCHAIN_PROJECT: "opea-llm-uservice" + HF_HUB_DISABLE_PROGRESS_BARS: {{ .Values.HF_HUB_DISABLE_PROGRESS_BARS | quote }} + HF_HUB_ENABLE_HF_TRANSFER: {{ .Values.HF_HUB_ENABLE_HF_TRANSFER | quote }} diff --git a/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml b/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml new file mode 100644 index 000000000..d4b823c60 --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml @@ -0,0 +1,79 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-vllm-uservice.fullname" . }} + labels: + {{- include "llm-vllm-uservice.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "llm-vllm-uservice.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "llm-vllm-uservice.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + envFrom: + - configMapRef: + name: {{ include "llm-vllm-uservice.fullname" . }}-config + {{- if .Values.global.extraEnvConfig }} + - configMapRef: + name: {{ .Values.global.extraEnvConfig }} + optional: true + {{- end }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: llm-vllm + containerPort: {{ .Values.service.targetPort }} + protocol: TCP + {{- if .Values.livenessProbe }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.readinessProbe }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + {{- end }} + {{- if .Values.startupProbe }} + startupProbe: + {{- toYaml .Values.startupProbe | nindent 12 }} + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - mountPath: /tmp + name: tmp + volumes: + - name: tmp + emptyDir: {} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/helm-charts/common/llm-vllm-uservice/templates/service.yaml b/helm-charts/common/llm-vllm-uservice/templates/service.yaml new file mode 100644 index 000000000..ec4758e59 --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "llm-vllm-uservice.fullname" . }} + labels: + {{- include "llm-vllm-uservice.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} + protocol: TCP + name: llm-vllm + selector: + {{- include "llm-vllm-uservice.selectorLabels" . | nindent 4 }} diff --git a/helm-charts/common/llm-vllm-uservice/values.yaml b/helm-charts/common/llm-vllm-uservice/values.yaml new file mode 100644 index 000000000..529c0d689 --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/values.yaml @@ -0,0 +1,101 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for llm-vllm-uservice. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +autodependency: + enabled: false + +replicaCount: 1 +vLLM_ENDPOINT: "" +HF_HUB_DISABLE_PROGRESS_BARS: 1 +HF_HUB_ENABLE_HF_TRANSFER: 0 + + +image: + repository: opea/llm-vllm + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: + readOnlyRootFilesystem: false + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + + +service: + type: ClusterIP + port: 9000 + targetPort: 9000 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: v1/health_check + port: llm-vllm + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 24 +readinessProbe: + httpGet: + path: v1/health_check + port: llm-vllm + initialDelaySeconds: 5 + periodSeconds: 5 +startupProbe: + httpGet: + path: v1/health_check + port: llm-vllm + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 + + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +global: + http_proxy: "" + https_proxy: "" + no_proxy: "" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + LANGCHAIN_TRACING_V2: false + LANGCHAIN_API_KEY: "insert-your-langchain-key-here" + # set modelUseHostPath to host directory if you want to use hostPath volume for model storage + # comment out modeluseHostPath if you want to download the model from huggingface + # modelUseHostPath: "" + modelUseHostPath: "" + LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" From c8a420cda68f9897330e3817ccdf2c64cfbca858 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Fri, 6 Sep 2024 18:16:01 +0530 Subject: [PATCH 03/37] =?UTF-8?q?=E2=9E=95=20Updated=20chatqna=20to=20have?= =?UTF-8?q?=20conditional=20dependency=20on=20tgi=20and=20vllm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/Chart.yaml | 18 ++++++++++++++++ helm-charts/chatqna/templates/deployment.yaml | 21 +++++-------------- helm-charts/chatqna/values.yaml | 21 +++++++++++++++++-- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml index aba726a5c..2bc875094 100644 --- a/helm-charts/chatqna/Chart.yaml +++ b/helm-charts/chatqna/Chart.yaml @@ -9,9 +9,27 @@ dependencies: - name: tgi version: 0.9.0 repository: "file://../common/tgi" + condition: tgi.enabled + tags: + - tgi + - name: vllm-openvino + version: 0.9.0 + repository: "file://../common/vllm-openvino" + condition: vllm-openvino.enabled + tags: + - vllm - name: llm-uservice version: 0.9.0 repository: "file://../common/llm-uservice" + condition: tgi.enabled + tags: + - tgi + - name: llm-vllm-uservice + version: 0.9.0 + repository: "file://../common/llm-vllm-uservice" + condition: vllm-openvino.enabled + tags: + - vllm - name: tei version: 0.9.0 repository: "file://../common/tei" diff --git a/helm-charts/chatqna/templates/deployment.yaml b/helm-charts/chatqna/templates/deployment.yaml index 4667666dc..619d12110 100644 --- a/helm-charts/chatqna/templates/deployment.yaml +++ b/helm-charts/chatqna/templates/deployment.yaml @@ -30,8 +30,13 @@ spec: containers: - name: {{ .Release.Name }} env: + {{- if (index .Values "vllm-openvino" "enabled") }} + - name: LLM_SERVICE_HOST_IP + value: {{ .Release.Name }}-llm-vllm-uservice + {{- else }} - name: LLM_SERVICE_HOST_IP value: {{ .Release.Name }}-llm-uservice + {{- end }} - name: RERANK_SERVICE_HOST_IP value: {{ .Release.Name }}-reranking-usvc - name: RETRIEVER_SERVICE_HOST_IP @@ -49,22 +54,6 @@ spec: - name: chatqna containerPort: {{ .Values.port }} protocol: TCP - # startupProbe: - # httpGet: - # host: {{ .Release.Name }}-llm-uservice - # port: {{ index .Values "llm-uservice" "service" "port" }} - # path: / - # initialDelaySeconds: 5 - # periodSeconds: 5 - # failureThreshold: 120 - # livenessProbe: - # httpGet: - # path: / - # port: {{ .Values.port }} - # readinessProbe: - # httpGet: - # path: / - # port: {{ .Values.port }} resources: {{- 
toYaml .Values.resources | nindent 12 }} volumes: diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index a7a115f9b..092f57bd2 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -18,6 +18,14 @@ service: type: ClusterIP port: 8888 +imagePullSecrets: [] + +podAnnotations: {} + +podSecurityContext: {} + +resources: {} + securityContext: readOnlyRootFilesystem: true allowPrivilegeEscalation: false @@ -35,18 +43,27 @@ tolerations: [] affinity: {} -# To override values in subchart tgi +# To override values in subchart tgi and vllm-ov tgi: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 +vllm-openvino: + enabled: false + +tags: + tgi: true + vllm: false + + global: http_proxy: "" https_proxy: "" no_proxy: "" HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 # set modelUseHostPath or modelUsePVC to use model cache. modelUseHostPath: "" - # modelUseHostPath: /mnt/opea-models # modelUsePVC: model-volume # Enabling HorizontalPodAutoscaler (HPA) will: From 21be6c99254e945002a9f2b6f858c37511568f77 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Fri, 6 Sep 2024 18:16:29 +0530 Subject: [PATCH 04/37] =?UTF-8?q?=F0=9F=A7=AA=20Added=20tests=20for=20veri?= =?UTF-8?q?fying=20pod=20sanity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- .../templates/tests/test-pod.yaml | 29 +++++++++++++++++++ .../templates/tests/test-pod.yaml | 29 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 helm-charts/common/llm-vllm-uservice/templates/tests/test-pod.yaml create mode 100644 helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml diff --git a/helm-charts/common/llm-vllm-uservice/templates/tests/test-pod.yaml b/helm-charts/common/llm-vllm-uservice/templates/tests/test-pod.yaml new file mode 100644 index 000000000..a2803fc95 --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/templates/tests/test-pod.yaml @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "llm-vllm-uservice.fullname" . }}-testpod" + labels: + {{- include "llm-vllm-uservice.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: curl + image: python:3.10.14 + command: ['bash', '-c'] + args: + - | + max_retry=20; + for ((i=1; i<=max_retry; i++)); do + curl http://{{ include "llm-vllm-uservice.fullname" . }}:{{ .Values.service.port }}/v1/chat/completions -sS --fail-with-body \ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -H 'Content-Type: application/json' && break; + curlcode=$? + if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi; + done; + if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi + restartPolicy: Never diff --git a/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml b/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml new file mode 100644 index 000000000..138379b73 --- /dev/null +++ b/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "vllm-openvino.fullname" . 
}}-testpod" + labels: + {{- include "vllm-openvino.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: curl + image: python:3.10.14 + command: ['bash', '-c'] + args: + - | + max_retry=20; + for ((i=1; i<=max_retry; i++)); do \ + curl http://{{ include "vllm-openvino.fullname" . }}/v1/completions -sS --fail-with-body \ + -X POST \ + -d '{"prompt":"What is Deep Learning?", "model": {{ .Values.global.LLM_MODEL_ID | quote }}, "max_tokens":17, "temperature": 0.5}' \ + -H 'Content-Type: application/json' && break; + curlcode=$? + if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi; + done; + if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi + restartPolicy: Never From 25528c9eb66c01ea8009f1c02612428932a35b92 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Fri, 6 Sep 2024 18:17:02 +0530 Subject: [PATCH 05/37] =?UTF-8?q?=F0=9F=93=9D=20Added=20docs=20for=20instr?= =?UTF-8?q?uction=20to=20setup=20chatqna=20with=20vllm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/README.md | 39 +++++++-- .../common/llm-vllm-uservice/README.md | 87 +++++++++++++++++++ helm-charts/common/vllm-openvino/README.md | 68 +++++++++++++++ 3 files changed, 185 insertions(+), 9 deletions(-) create mode 100644 helm-charts/common/llm-vllm-uservice/README.md create mode 100644 helm-charts/common/vllm-openvino/README.md diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 64f001eb8..8b13bd36c 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -9,14 +9,24 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following servi - [redis-vector-db](../common/redis-vector-db) - [reranking-usvc](../common/reranking-usvc) - [teirerank](../common/teirerank) -- [llm-uservice](../common/llm-uservice) -- [tgi](../common/tgi) + +Apart from above mentioned services, there are following conditional dependencies (out of which, one are required): + +1. If we want to use TGI as our inference service, following 2 services will be required: + + - [llm-uservice](../common/llm-uservice) + - [tgi](../common/tgi) + +2. 
If we want to use OpenVINO vLLM inference service, following 2 services would be required: + - [llm-vllm-uservice](../common/llm-vllm-uservice) + - [vllm-openvino](../common/vllm-openvino) + ## Installing the Chart To install the chart, run the following: -```console +```bash cd GenAIInfra/helm-charts/ ./update_dependency.sh helm dependency update chatqna @@ -24,15 +34,25 @@ export HFTOKEN="insert-your-huggingface-token-here" export MODELDIR="/mnt/opea-models" export MODELNAME="Intel/neural-chat-7b-v3-3" helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} + # To use Gaudi device -#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml + # To use Nvidia GPU -#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml + + +# To use OpenVINO vLLM inference engine on Xeon device + +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set global.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm-openvino.enabled=true ``` + ### IMPORTANT NOTE -1. Make sure your `MODELDIR` exists on the node where your workload is schedueled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model. +1. Make sure your `MODELDIR` exists on the node where your workload is scheduled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model. + +2. Please set `http_proxy`, `https_proxy` and `no_proxy` values while installing chart, if you are behind a proxy. ## Verify @@ -46,8 +66,9 @@ Run the command `kubectl port-forward svc/chatqna 8888:8888` to expose the servi Open another terminal and run the following command to verify the service if working: -```console +```bash curl http://localhost:8888/v1/chatqna \ + -X POST \ -H "Content-Type: application/json" \ -d '{"messages": "What is the revenue of Nike in 2023?"}' ``` @@ -71,7 +92,6 @@ docker save -o ui.tar opea/chatqna-conversation-ui:latest sudo ctr -n k8s.io image import ui.tar # install UI using helm chart. Replace image tag if required -cd cd GenAIInfra/helm-charts/ helm install ui common/chatqna-ui --set BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/chatqna",DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep",image.tag="latest" @@ -88,4 +108,5 @@ Access `http://localhost:5174` to play with the ChatQnA workload through UI. | image.repository | string | `"opea/chatqna"` | | | service.port | string | `"8888"` | | | tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.horizontalPodAutoscaler.enabled | bop; | false | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. 
See HPA section in ../README.md before enabling! | +| vllm-openvino.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.horizontalPodAutoscaler.enabled | bool | false | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. See HPA section in ../README.md before enabling! | diff --git a/helm-charts/common/llm-vllm-uservice/README.md b/helm-charts/common/llm-vllm-uservice/README.md new file mode 100644 index 000000000..60fd04ad9 --- /dev/null +++ b/helm-charts/common/llm-vllm-uservice/README.md @@ -0,0 +1,87 @@ +# llm-vllm Microservice + +Helm chart for deploying a microservice which facilitates connections and handles responses from OpenVINO vLLM microservice. + +`llm-vllm-uservice` depends on OpenVINO vLLM. You should properly set `vLLM_ENDPOINT` as the HOST URI of vLLM microservice. If not set, it will consider the default value : `http://-vllm-openvino:80` + +As this service depends on vLLM microservice, we can proceed in either of 2 ways: + +- Install both microservices separately one after another. +- Install the vLLM microservice as dependency for the our main `llm-vllm-uservice` microservice. + +## (Option 1): Installing the chart separately: + +First, you need to install the `vllm-openvino` chart, please refer to the [vllm-openvino](../vllm-openvino) chart for more information. + +After you've deployed the `vllm-openvino` chart successfully, please run `kubectl get svc` to get the vLLM service name with port. We need to provide this to `llm-vllm-uservice` as a value for vLLM_ENDPOINT for letting it discover and connect to the vLLM microservice. + + +> **_NOTE:_** While installing charts separately, if you don't provide any vLLM endpoint explicitly, it will take the default endpoint as `http://-vllm-openvino:80`. So, if you are not providing the vLLM endpoint explicitly, please make sure to provide same helm release name to both the charts while installing. + +To install the chart, run the following: + +```bash +cd GenAIInfra/helm-charts/common/llm-vllm-uservice +export HFTOKEN="insert-your-huggingface-token-here" +export vLLM_ENDPOINT="http://vllm-openvino" +export MODELNAME="bigscience/bloom-560m" + +# If proxy is required, please export the appropriate proxy values. +export http_proxy= +export https_proxy= + +helm dependency update +helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vLLM_ENDPOINT=${vLLM_ENDPOINT} --set global.LLM_MODEL_ID=${MODELNAME} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +``` + +## (Option 2): Installing the chart with automatic installation of dependency: + +```bash +cd GenAIInfra/helm-charts/common/llm-vllm-uservice +export HFTOKEN="insert-your-huggingface-token-here" +export MODELNAME="bigscience/bloom-560m" + +# If proxy is required, please export the appropriate proxy values. +export http_proxy= +export https_proxy= + +helm dependency update +helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.LLM_MODEL_ID=${MODELNAME} --set autodependency.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +``` + +`--wait` flag in above installation command will make sure that all the dependencies are resolved and all services are deployed. + +## Verify + +To verify the installation, run the following command to make sure all pods are running. 
+ +```bash +kubectl get pod +``` + +Once you see `llm-vllm-uservice` pod and `llm-vllm-uservice-vllm-openvino` pod in ready and running state, run the following command: + +```bash +kubectl port-forward svc/llm-vllm-uservice 9000:9000 +``` + +This exposes the port 9000, on which `llm-vllm-uservice` is running inside the pod, at port 9000 on the host. + +Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `llm-vllm-uservice` is working: + +```bash +curl http://localhost:9000/v1/chat/completions \ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -H 'Content-Type: application/json' +``` + +## Values + +| Key | Type | Default | Description | +| ------------------------------- | ------ | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, vLLM will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory | +| image.repository | string | `"opea/llm-vllm"` | | +| service.port | string | `"9000"` | | +| vLLM_ENDPOINT | string | `""` | OpenVINO vLLM service endpoint | diff --git a/helm-charts/common/vllm-openvino/README.md b/helm-charts/common/vllm-openvino/README.md new file mode 100644 index 000000000..b88e5c719 --- /dev/null +++ b/helm-charts/common/vllm-openvino/README.md @@ -0,0 +1,68 @@ +# OpenVINO vLLM + +Helm chart for deploying OpenVINO optimized vLLM Inference service. + +## Installing the Chart + +To install the chart, run the following: + +```bash +cd GenAIInfra/helm-charts/common +export MODELDIR=/mnt/opea-models +export MODELNAME="bigscience/bloom-560m" +export HFTOKEN="insert-your-huggingface-token-here" + +# If proxy is required, please export the appropriate proxy values. +export http_proxy= +export https_proxy= + +helm install vllm-openvino vllm-openvino --set global.modelUseHostPath=${MODELDIR} --set global.LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +``` + +`--wait` flag in the above helm installation command lets the shell wait till `vllm-openvino` is completely up and ready. + +>**_NOTE:_** Make sure your `MODELDIR` exists on the node where your workload is scheduled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model. + +If you already cached the model locally, you can pass it to container like this example: + +MODELDIR=/mnt/opea-models + +MODELNAME="/data/models--bigscience--bloom-560m" + +>**_NOTE:_** By default, the vLLM service will be downloading **Intel/neural-chat-7b-v3-3** model from Huggingface, which is around 4GB in size. To use a smaller model, please set the LLM_MODEL_ID value to your desired model, as shown above, while installing the chart. + +## Verify + +To verify the installation, run the following command to make sure all pods are running. Please note that it may take a while to come the vLLM pod in ready state. 
+ +```bash +kubectl get pod +``` + +Once you see `vllm-openvino` pod in ready and running state, run the following command: + +```bash + kubectl port-forward svc/vllm-openvino 2080:80 +``` + +This exposes the port 80, on which `vllm-openvino` service is running inside the pod, at port 2080 on the host. + +Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `vllm-openvino` service is working: + +```bash +curl http://localhost:2080/v1/completions -sS --fail-with-body \ + -X POST \ + -d '{"prompt":"What is Deep Learning?", "model": "bigscience/bloom-560m", "max_tokens":17, "temperature": 0.5}' \ + -H 'Content-Type: application/json' +``` + +## Values + +| Key | Type | Default | Description | +| ------------------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| LLM_MODEL_ID | string | `"bigscience/bloom-560m"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.HUGGINGFACEHUB_API_TOKEN | string | `insert-your-huggingface-token-here` | Hugging Face API token | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, vLLM will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Setting this to null/empty will force it to download model. | +| image.repository | string | `"vllm"` | | +| image.tag | string | `"openvino"` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployment based on metrics it provides. See HPA section in ../../README.md before enabling! 
| From 140d1b54c638c441d6f27775dac2221c7d1ebdaf Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Fri, 6 Sep 2024 18:34:19 +0530 Subject: [PATCH 06/37] =?UTF-8?q?=F0=9F=94=A5=20removed=20unsupported=20en?= =?UTF-8?q?v=20vars?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/common/vllm-openvino/templates/configmap.yaml | 6 ------ helm-charts/common/vllm-openvino/values.yaml | 2 -- 2 files changed, 8 deletions(-) diff --git a/helm-charts/common/vllm-openvino/templates/configmap.yaml b/helm-charts/common/vllm-openvino/templates/configmap.yaml index ec9a6b268..59a59c38c 100644 --- a/helm-charts/common/vllm-openvino/templates/configmap.yaml +++ b/helm-charts/common/vllm-openvino/templates/configmap.yaml @@ -24,12 +24,6 @@ data: NUMBA_CACHE_DIR: "/tmp" TRANSFORMERS_CACHE: "/tmp/transformers_cache" HF_HOME: "/tmp/.cache/huggingface" - {{- if .Values.MAX_INPUT_LENGTH }} - MAX_INPUT_LENGTH: {{ .Values.MAX_INPUT_LENGTH | quote }} - {{- end }} - {{- if .Values.MAX_TOTAL_TOKENS }} - MAX_TOTAL_TOKENS: {{ .Values.MAX_TOTAL_TOKENS | quote }} - {{- end }} {{- if .Values.CUDA_GRAPHS }} CUDA_GRAPHS: {{ .Values.CUDA_GRAPHS | quote }} {{- end }} diff --git a/helm-charts/common/vllm-openvino/values.yaml b/helm-charts/common/vllm-openvino/values.yaml index ec81ff14f..3e74be940 100644 --- a/helm-charts/common/vllm-openvino/values.yaml +++ b/helm-charts/common/vllm-openvino/values.yaml @@ -96,8 +96,6 @@ tolerations: [] affinity: {} -MAX_INPUT_LENGTH: "" -MAX_TOTAL_TOKENS: "" CUDA_GRAPHS: "0" VLLM_CPU_KVCACHE_SPACE: 50 HABANA_VISIBLE_DEVICES: all From 815c51bc0ab810b1eb81b50cd837f8e56f9938f0 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Sat, 7 Sep 2024 00:53:53 +0530 Subject: [PATCH 07/37] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Removed=20global=20M?= =?UTF-8?q?odel=20ID=20var=20|=20resolved=20readme=20conflicts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/README.md | 37 +++++++++++++++++-- helm-charts/chatqna/values.yaml | 7 +++- .../common/llm-vllm-uservice/README.md | 5 ++- .../templates/configmap.yaml | 2 +- .../common/llm-vllm-uservice/values.yaml | 8 +++- helm-charts/common/vllm-openvino/README.md | 2 +- .../vllm-openvino/templates/configmap.yaml | 3 +- .../vllm-openvino/templates/deployment.yaml | 2 +- .../templates/tests/test-pod.yaml | 2 +- helm-charts/common/vllm-openvino/values.yaml | 4 +- 10 files changed, 55 insertions(+), 17 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 8b13bd36c..759ac5c96 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -24,7 +24,15 @@ Apart from above mentioned services, there are following conditional dependencie ## Installing the Chart -To install the chart, run the following: +Please follow the following steps to install the ChatQnA Chart: + +1. Clone the GenAIInfra repository: + +```bash +git clone https://github.com/opea-project/GenAIInfra.git +``` + +2. Setup the dependencies and required environment variables: ```bash cd GenAIInfra/helm-charts/ @@ -33,20 +41,41 @@ helm dependency update chatqna export HFTOKEN="insert-your-huggingface-token-here" export MODELDIR="/mnt/opea-models" export MODELNAME="Intel/neural-chat-7b-v3-3" +``` + +3. 
Depending on the device which we are targeting for running ChatQnA, please use one the following installation commands: + +```bash +# Install the chart on a Xeon machine helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} +``` +```bash # To use Gaudi device helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml +``` +```bash # To use Nvidia GPU helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml +``` +```bash +# To include guardrail component in chatqna on Xeon +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml +``` -# To use OpenVINO vLLM inference engine on Xeon device - -helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set global.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm-openvino.enabled=true +```bash +# To include guardrail component in chatqna on Gaudi +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml ``` +>**_NOTE:_** Default installation will use [TGI (Text Generation Inference)](https://github.com/huggingface/text-generation-inference) as inference engine. To use vLLM as inference engine, please see below. + +```bash +# To use OpenVINO vLLM inference engine on Xeon device +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-vllm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm-openvino.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm-openvino.enabled=true +``` ### IMPORTANT NOTE diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index 092f57bd2..179cc1711 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -43,13 +43,17 @@ tolerations: [] affinity: {} -# To override values in subchart tgi and vllm-ov +# To override values in subchart tgi, vllm-openvino and llm-vllm-uservice tgi: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 vllm-openvino: + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 enabled: false +llm-vllm-uservice: + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + tags: tgi: true vllm: false @@ -61,7 +65,6 @@ global: no_proxy: "" HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 # set modelUseHostPath or modelUsePVC to use model cache. modelUseHostPath: "" # modelUsePVC: model-volume diff --git a/helm-charts/common/llm-vllm-uservice/README.md b/helm-charts/common/llm-vllm-uservice/README.md index 60fd04ad9..12d11688f 100644 --- a/helm-charts/common/llm-vllm-uservice/README.md +++ b/helm-charts/common/llm-vllm-uservice/README.md @@ -31,7 +31,7 @@ export http_proxy= export https_proxy= helm dependency update -helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vLLM_ENDPOINT=${vLLM_ENDPOINT} --set global.LLM_MODEL_ID=${MODELNAME} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +helm install llm-vllm-uservice . 
--set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vLLM_ENDPOINT=${vLLM_ENDPOINT} --set LLM_MODEL_ID=${MODELNAME} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait ``` ## (Option 2): Installing the chart with automatic installation of dependency: @@ -39,6 +39,7 @@ helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN ```bash cd GenAIInfra/helm-charts/common/llm-vllm-uservice export HFTOKEN="insert-your-huggingface-token-here" +export MODELDIR="/mnt/opea-models" export MODELNAME="bigscience/bloom-560m" # If proxy is required, please export the appropriate proxy values. @@ -46,7 +47,7 @@ export http_proxy= export https_proxy= helm dependency update -helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.LLM_MODEL_ID=${MODELNAME} --set autodependency.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set vllm-openvino.LLM_MODEL_ID=${MODELNAME} --set autodependency.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait ``` `--wait` flag in above installation command will make sure that all the dependencies are resolved and all services are deployed. diff --git a/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml b/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml index 98b8615b0..247cd614e 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml +++ b/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml @@ -13,7 +13,7 @@ data: {{- else }} vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm-openvino" {{- end }} - LLM_MODEL: {{ .Values.global.LLM_MODEL_ID | quote }} + LLM_MODEL: {{ .Values.LLM_MODEL_ID | quote }} HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} HF_HOME: "/tmp/.cache/huggingface" {{- if .Values.global.HF_ENDPOINT }} diff --git a/helm-charts/common/llm-vllm-uservice/values.yaml b/helm-charts/common/llm-vllm-uservice/values.yaml index 529c0d689..0748addc6 100644 --- a/helm-charts/common/llm-vllm-uservice/values.yaml +++ b/helm-charts/common/llm-vllm-uservice/values.yaml @@ -87,6 +87,13 @@ tolerations: [] affinity: {} +# Model ID to be used by llm-vllm microservice +LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" + +# Overriding the Model ID being used by vllm-openvino service.(As llm-vllm microservice depends on vllm-openvino, these 2 values should be same.) 
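+# A minimal sketch of keeping the two values in sync at install time (the model name below is only an example, matching the README in this chart):
+#   helm install llm-vllm-uservice . --set LLM_MODEL_ID=bigscience/bloom-560m --set vllm-openvino.LLM_MODEL_ID=bigscience/bloom-560m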
+vllm-openvino: + LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" + global: http_proxy: "" https_proxy: "" @@ -98,4 +105,3 @@ global: # comment out modeluseHostPath if you want to download the model from huggingface # modelUseHostPath: "" modelUseHostPath: "" - LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" diff --git a/helm-charts/common/vllm-openvino/README.md b/helm-charts/common/vllm-openvino/README.md index b88e5c719..b7eba4361 100644 --- a/helm-charts/common/vllm-openvino/README.md +++ b/helm-charts/common/vllm-openvino/README.md @@ -16,7 +16,7 @@ export HFTOKEN="insert-your-huggingface-token-here" export http_proxy= export https_proxy= -helm install vllm-openvino vllm-openvino --set global.modelUseHostPath=${MODELDIR} --set global.LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +helm install vllm-openvino vllm-openvino --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait ``` `--wait` flag in the above helm installation command lets the shell wait till `vllm-openvino` is completely up and ready. diff --git a/helm-charts/common/vllm-openvino/templates/configmap.yaml b/helm-charts/common/vllm-openvino/templates/configmap.yaml index 59a59c38c..b61de3f8c 100644 --- a/helm-charts/common/vllm-openvino/templates/configmap.yaml +++ b/helm-charts/common/vllm-openvino/templates/configmap.yaml @@ -8,7 +8,7 @@ metadata: labels: {{- include "vllm-openvino.labels" . | nindent 4 }} data: - MODEL_ID: {{ .Values.global.LLM_MODEL_ID | quote }} + MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }} PORT: {{ .Values.service.port | quote }} HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote }} @@ -22,7 +22,6 @@ data: no_proxy: {{ .Values.global.no_proxy | quote }} HABANA_LOGS: "/tmp/habana_logs" NUMBA_CACHE_DIR: "/tmp" - TRANSFORMERS_CACHE: "/tmp/transformers_cache" HF_HOME: "/tmp/.cache/huggingface" {{- if .Values.CUDA_GRAPHS }} CUDA_GRAPHS: {{ .Values.CUDA_GRAPHS | quote }} diff --git a/helm-charts/common/vllm-openvino/templates/deployment.yaml b/helm-charts/common/vllm-openvino/templates/deployment.yaml index d8bde1a4c..6e72f9e2b 100644 --- a/helm-charts/common/vllm-openvino/templates/deployment.yaml +++ b/helm-charts/common/vllm-openvino/templates/deployment.yaml @@ -78,7 +78,7 @@ spec: - | cd / && \ python3 -m vllm.entrypoints.openai.api_server \ - --model "{{ .Values.global.LLM_MODEL_ID }}" \ + --model "{{ .Values.LLM_MODEL_ID }}" \ --host 0.0.0.0 \ --port 80 volumes: diff --git a/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml b/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml index 138379b73..1fcb75793 100644 --- a/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml +++ b/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml @@ -20,7 +20,7 @@ spec: for ((i=1; i<=max_retry; i++)); do \ curl http://{{ include "vllm-openvino.fullname" . }}/v1/completions -sS --fail-with-body \ -X POST \ - -d '{"prompt":"What is Deep Learning?", "model": {{ .Values.global.LLM_MODEL_ID | quote }}, "max_tokens":17, "temperature": 0.5}' \ + -d '{"prompt":"What is Deep Learning?", "model": {{ .Values.LLM_MODEL_ID | quote }}, "max_tokens":17, "temperature": 0.5}' \ -H 'Content-Type: application/json' && break; curlcode=$? 
if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi; diff --git a/helm-charts/common/vllm-openvino/values.yaml b/helm-charts/common/vllm-openvino/values.yaml index 3e74be940..fc1b5eeb2 100644 --- a/helm-charts/common/vllm-openvino/values.yaml +++ b/helm-charts/common/vllm-openvino/values.yaml @@ -96,6 +96,8 @@ tolerations: [] affinity: {} +LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + CUDA_GRAPHS: "0" VLLM_CPU_KVCACHE_SPACE: 50 HABANA_VISIBLE_DEVICES: all @@ -107,8 +109,6 @@ global: no_proxy: "" HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 - # Choose where to save your downloaded models # Set modelUseHostPath for local directory, this is good for one node test. Example: # modelUseHostPath: /mnt/opea-models From 4ac8fb07171c19bfd517ba361de57b831afe2a6a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Sep 2024 19:57:08 +0000 Subject: [PATCH 08/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/chatqna/README.md | 15 ++++++------ .../common/llm-vllm-uservice/Chart.yaml | 7 ++++-- .../common/llm-vllm-uservice/README.md | 17 +++++++------ .../templates/deployment.yaml | 3 +++ .../llm-vllm-uservice/templates/service.yaml | 3 +++ .../common/llm-vllm-uservice/values.yaml | 2 +- helm-charts/common/vllm-openvino/README.md | 24 +++++++++---------- 7 files changed, 39 insertions(+), 32 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 759ac5c96..3d8070900 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -14,13 +14,12 @@ Apart from above mentioned services, there are following conditional dependencie 1. If we want to use TGI as our inference service, following 2 services will be required: - - [llm-uservice](../common/llm-uservice) - - [tgi](../common/tgi) + - [llm-uservice](../common/llm-uservice) + - [tgi](../common/tgi) 2. If we want to use OpenVINO vLLM inference service, following 2 services would be required: - - [llm-vllm-uservice](../common/llm-vllm-uservice) - - [vllm-openvino](../common/vllm-openvino) - + - [llm-vllm-uservice](../common/llm-vllm-uservice) + - [vllm-openvino](../common/vllm-openvino) ## Installing the Chart @@ -70,7 +69,7 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml ``` ->**_NOTE:_** Default installation will use [TGI (Text Generation Inference)](https://github.com/huggingface/text-generation-inference) as inference engine. To use vLLM as inference engine, please see below. +> **_NOTE:_** Default installation will use [TGI (Text Generation Inference)](https://github.com/huggingface/text-generation-inference) as inference engine. To use vLLM as inference engine, please see below. ```bash # To use OpenVINO vLLM inference engine on Xeon device @@ -137,5 +136,5 @@ Access `http://localhost:5174` to play with the ChatQnA workload through UI. 
| image.repository | string | `"opea/chatqna"` | | | service.port | string | `"8888"` | | | tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | -| vllm-openvino.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.horizontalPodAutoscaler.enabled | bool | false | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. See HPA section in ../README.md before enabling! | +| vllm-openvino.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.horizontalPodAutoscaler.enabled | bool | false | HPA autoscaling for the TGI and TEI service deployments based on metrics they provide. See HPA section in ../README.md before enabling! | diff --git a/helm-charts/common/llm-vllm-uservice/Chart.yaml b/helm-charts/common/llm-vllm-uservice/Chart.yaml index 67da3e01a..91fe4abd5 100644 --- a/helm-charts/common/llm-vllm-uservice/Chart.yaml +++ b/helm-charts/common/llm-vllm-uservice/Chart.yaml @@ -1,6 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + apiVersion: v2 name: llm-vllm-uservice -description: A Helm chart for LLM microservice for which connects with vLLM microservice to recieve inferences. +description: A Helm chart for LLM microservice for which connects with vLLM microservice to receive inferences. type: application version: 0.9.0 appVersion: "v0.9" @@ -8,4 +11,4 @@ dependencies: - name: vllm-openvino version: 0.9.0 repository: file://../vllm-openvino - condition: autodependency.enabled \ No newline at end of file + condition: autodependency.enabled diff --git a/helm-charts/common/llm-vllm-uservice/README.md b/helm-charts/common/llm-vllm-uservice/README.md index 12d11688f..d2fd159b3 100644 --- a/helm-charts/common/llm-vllm-uservice/README.md +++ b/helm-charts/common/llm-vllm-uservice/README.md @@ -4,7 +4,7 @@ Helm chart for deploying a microservice which facilitates connections and handle `llm-vllm-uservice` depends on OpenVINO vLLM. You should properly set `vLLM_ENDPOINT` as the HOST URI of vLLM microservice. If not set, it will consider the default value : `http://-vllm-openvino:80` -As this service depends on vLLM microservice, we can proceed in either of 2 ways: +As this service depends on vLLM microservice, we can proceed in either of 2 ways: - Install both microservices separately one after another. - Install the vLLM microservice as dependency for the our main `llm-vllm-uservice` microservice. @@ -15,7 +15,6 @@ First, you need to install the `vllm-openvino` chart, please refer to the [vllm- After you've deployed the `vllm-openvino` chart successfully, please run `kubectl get svc` to get the vLLM service name with port. We need to provide this to `llm-vllm-uservice` as a value for vLLM_ENDPOINT for letting it discover and connect to the vLLM microservice. - > **_NOTE:_** While installing charts separately, if you don't provide any vLLM endpoint explicitly, it will take the default endpoint as `http://-vllm-openvino:80`. So, if you are not providing the vLLM endpoint explicitly, please make sure to provide same helm release name to both the charts while installing. 
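As a concrete illustration of the note above, here is a minimal sketch of wiring the two charts together when they were installed under different release names; the release name `myvllm`, the derived service name, and port 80 are assumptions for this example, not values fixed by the chart:

```bash
# Hypothetical example: an earlier standalone install such as
#   helm install myvllm vllm-openvino ...
# is assumed to create a Service named "myvllm-vllm-openvino" on port 80.
# Point llm-vllm-uservice at it explicitly instead of relying on the
# release-name-derived default endpoint.
kubectl get svc | grep vllm-openvino
export vLLM_ENDPOINT="http://myvllm-vllm-openvino:80"
```

The exported `vLLM_ENDPOINT` is then passed through `--set vLLM_ENDPOINT=${vLLM_ENDPOINT}`, exactly as in the install command that follows.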
To install the chart, run the following: @@ -66,7 +65,7 @@ Once you see `llm-vllm-uservice` pod and `llm-vllm-uservice-vllm-openvino` pod i kubectl port-forward svc/llm-vllm-uservice 9000:9000 ``` -This exposes the port 9000, on which `llm-vllm-uservice` is running inside the pod, at port 9000 on the host. +This exposes the port 9000, on which `llm-vllm-uservice` is running inside the pod, at port 9000 on the host. Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `llm-vllm-uservice` is working: @@ -79,10 +78,10 @@ curl http://localhost:9000/v1/chat/completions \ ## Values -| Key | Type | Default | Description | -| ------------------------------- | ------ | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token | +| Key | Type | Default | Description | +| ------------------------------- | ------ | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token | | global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, vLLM will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory | -| image.repository | string | `"opea/llm-vllm"` | | -| service.port | string | `"9000"` | | -| vLLM_ENDPOINT | string | `""` | OpenVINO vLLM service endpoint | +| image.repository | string | `"opea/llm-vllm"` | | +| service.port | string | `"9000"` | | +| vLLM_ENDPOINT | string | `""` | OpenVINO vLLM service endpoint | diff --git a/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml b/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml index d4b823c60..96c0658e4 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml +++ b/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + apiVersion: apps/v1 kind: Deployment metadata: diff --git a/helm-charts/common/llm-vllm-uservice/templates/service.yaml b/helm-charts/common/llm-vllm-uservice/templates/service.yaml index ec4758e59..2c58d8f24 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/service.yaml +++ b/helm-charts/common/llm-vllm-uservice/templates/service.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + apiVersion: v1 kind: Service metadata: diff --git a/helm-charts/common/llm-vllm-uservice/values.yaml b/helm-charts/common/llm-vllm-uservice/values.yaml index 0748addc6..50240146a 100644 --- a/helm-charts/common/llm-vllm-uservice/values.yaml +++ b/helm-charts/common/llm-vllm-uservice/values.yaml @@ -12,7 +12,7 @@ replicaCount: 1 vLLM_ENDPOINT: "" HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 - + image: repository: opea/llm-vllm diff --git a/helm-charts/common/vllm-openvino/README.md b/helm-charts/common/vllm-openvino/README.md index b7eba4361..770fcaea5 100644 --- a/helm-charts/common/vllm-openvino/README.md +++ b/helm-charts/common/vllm-openvino/README.md @@ -1,4 +1,4 @@ -# OpenVINO vLLM +# OpenVINO vLLM Helm chart for deploying OpenVINO optimized vLLM Inference 
service. @@ -21,7 +21,7 @@ helm install vllm-openvino vllm-openvino --set global.modelUseHostPath=${MODELDI `--wait` flag in the above helm installation command lets the shell wait till `vllm-openvino` is completely up and ready. ->**_NOTE:_** Make sure your `MODELDIR` exists on the node where your workload is scheduled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model. +> **_NOTE:_** Make sure your `MODELDIR` exists on the node where your workload is scheduled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model. If you already cached the model locally, you can pass it to container like this example: @@ -29,7 +29,7 @@ MODELDIR=/mnt/opea-models MODELNAME="/data/models--bigscience--bloom-560m" ->**_NOTE:_** By default, the vLLM service will be downloading **Intel/neural-chat-7b-v3-3** model from Huggingface, which is around 4GB in size. To use a smaller model, please set the LLM_MODEL_ID value to your desired model, as shown above, while installing the chart. +> **_NOTE:_** By default, the vLLM service will be downloading **Intel/neural-chat-7b-v3-3** model from Huggingface, which is around 4GB in size. To use a smaller model, please set the LLM_MODEL_ID value to your desired model, as shown above, while installing the chart. ## Verify @@ -45,7 +45,7 @@ Once you see `vllm-openvino` pod in ready and running state, run the following c kubectl port-forward svc/vllm-openvino 2080:80 ``` -This exposes the port 80, on which `vllm-openvino` service is running inside the pod, at port 2080 on the host. +This exposes the port 80, on which `vllm-openvino` service is running inside the pod, at port 2080 on the host. Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `vllm-openvino` service is working: @@ -58,11 +58,11 @@ curl http://localhost:2080/v1/completions -sS --fail-with-body \ ## Values -| Key | Type | Default | Description | -| ------------------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| LLM_MODEL_ID | string | `"bigscience/bloom-560m"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.HUGGINGFACEHUB_API_TOKEN | string | `insert-your-huggingface-token-here` | Hugging Face API token | -| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, vLLM will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Setting this to null/empty will force it to download model. | -| image.repository | string | `"vllm"` | | -| image.tag | string | `"openvino"` | | -| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployment based on metrics it provides. See HPA section in ../../README.md before enabling! 
| +| Key | Type | Default | Description | +| ------------------------------- | ------ | ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| LLM_MODEL_ID | string | `"bigscience/bloom-560m"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.HUGGINGFACEHUB_API_TOKEN | string | `insert-your-huggingface-token-here` | Hugging Face API token | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, vLLM will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Setting this to null/empty will force it to download model. | +| image.repository | string | `"vllm"` | | +| image.tag | string | `"openvino"` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployment based on metrics it provides. See HPA section in ../../README.md before enabling! | From 5fffdd04f6ef2a50a48a407b34a99c6c168bc2f1 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Sat, 7 Sep 2024 01:41:57 +0530 Subject: [PATCH 09/37] =?UTF-8?q?=F0=9F=93=8C=20Bumped=20up=20the=20chart?= =?UTF-8?q?=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/Chart.yaml | 4 ++-- helm-charts/common/llm-vllm-uservice/Chart.yaml | 6 +++--- helm-charts/common/vllm-openvino/Chart.yaml | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml index efd0b8609..676df3e12 100644 --- a/helm-charts/chatqna/Chart.yaml +++ b/helm-charts/chatqna/Chart.yaml @@ -22,7 +22,7 @@ dependencies: tags: - tgi - name: vllm-openvino - version: 0.9.0 + version: 1.0.0 repository: "file://../common/vllm-openvino" condition: vllm-openvino.enabled tags: @@ -34,7 +34,7 @@ dependencies: tags: - tgi - name: llm-vllm-uservice - version: 0.9.0 + version: 1.0.0 repository: "file://../common/llm-vllm-uservice" condition: vllm-openvino.enabled tags: diff --git a/helm-charts/common/llm-vllm-uservice/Chart.yaml b/helm-charts/common/llm-vllm-uservice/Chart.yaml index 91fe4abd5..f939ca271 100644 --- a/helm-charts/common/llm-vllm-uservice/Chart.yaml +++ b/helm-charts/common/llm-vllm-uservice/Chart.yaml @@ -5,10 +5,10 @@ apiVersion: v2 name: llm-vllm-uservice description: A Helm chart for LLM microservice for which connects with vLLM microservice to receive inferences. 
type: application -version: 0.9.0 -appVersion: "v0.9" +version: 1.0.0 +appVersion: "v1.0" dependencies: - name: vllm-openvino - version: 0.9.0 + version: 1.0.0 repository: file://../vllm-openvino condition: autodependency.enabled diff --git a/helm-charts/common/vllm-openvino/Chart.yaml b/helm-charts/common/vllm-openvino/Chart.yaml index 1c6ce08df..da51598b2 100644 --- a/helm-charts/common/vllm-openvino/Chart.yaml +++ b/helm-charts/common/vllm-openvino/Chart.yaml @@ -5,5 +5,5 @@ apiVersion: v2 name: vllm-openvino description: A Helm chart for OpenVINO optimized vLLM Service type: application -version: 0.9.0 -appVersion: "v0.9" +version: 1.0.0 +appVersion: "v1.0" From 7497322defe6f9c81fb7b631b76b3a334a8dc9c1 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Tue, 10 Sep 2024 14:42:32 +0530 Subject: [PATCH 10/37] =?UTF-8?q?=F0=9F=94=A5=20Removed=20unused=20vars=20?= =?UTF-8?q?and=20resources?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- .../vllm-openvino/templates/configmap.yaml | 1 - .../common/vllm-openvino/templates/hpa.yaml | 49 ------------------- .../templates/servicemonitor.yaml | 18 ------- helm-charts/common/vllm-openvino/values.yaml | 1 - 4 files changed, 69 deletions(-) delete mode 100644 helm-charts/common/vllm-openvino/templates/hpa.yaml delete mode 100644 helm-charts/common/vllm-openvino/templates/servicemonitor.yaml diff --git a/helm-charts/common/vllm-openvino/templates/configmap.yaml b/helm-charts/common/vllm-openvino/templates/configmap.yaml index b61de3f8c..a9b1f7b33 100644 --- a/helm-charts/common/vllm-openvino/templates/configmap.yaml +++ b/helm-charts/common/vllm-openvino/templates/configmap.yaml @@ -12,7 +12,6 @@ data: PORT: {{ .Values.service.port | quote }} HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote }} - HABANA_VISIBLE_DEVICES : {{ .Values.HABANA_VISIBLE_DEVICES | quote }} OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote }} {{- if .Values.global.HF_ENDPOINT }} HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} diff --git a/helm-charts/common/vllm-openvino/templates/hpa.yaml b/helm-charts/common/vllm-openvino/templates/hpa.yaml deleted file mode 100644 index 62a2de8bf..000000000 --- a/helm-charts/common/vllm-openvino/templates/hpa.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -{{- if .Values.global.horizontalPodAutoscaler.enabled }} -apiVersion: autoscaling/v2 -kind: HorizontalPodAutoscaler -metadata: - name: {{ include "vllm-openvino.fullname" . }} - labels: - {{- include "vllm-openvino.labels" . | nindent 4 }} -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: {{ include "vllm-openvino.fullname" . }} - minReplicas: {{ .Values.horizontalPodAutoscaler.minReplicas }} - maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} - metrics: - - type: Object - object: - metric: - # VLLM time metrics are in seconds - name: vllm_ov_request_latency - describedObject: - apiVersion: v1 - # get metric for named object of given type (in same namespace) - kind: Service - name: {{ include "vllm-openvino.fullname" . 
}} - target: - type: Value - value: 4 - behavior: - scaleDown: - stabilizationWindowSeconds: 180 - policies: - - type: Percent - value: 25 - periodSeconds: 15 - scaleUp: - selectPolicy: Max - stabilizationWindowSeconds: 0 - policies: - - type: Percent - value: 50 - periodSeconds: 15 - - type: Pods - value: 2 - periodSeconds: 15 -{{- end }} diff --git a/helm-charts/common/vllm-openvino/templates/servicemonitor.yaml b/helm-charts/common/vllm-openvino/templates/servicemonitor.yaml deleted file mode 100644 index fefa864b7..000000000 --- a/helm-charts/common/vllm-openvino/templates/servicemonitor.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# - -{{- if .Values.global.horizontalPodAutoscaler.enabled }} -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: {{ include "vllm-openvino.fullname" . }} -spec: - selector: - matchLabels: - {{- include "vllm-openvino.selectorLabels" . | nindent 6 }} - endpoints: - - interval: 4s - port: tgi - scheme: http -{{- end }} diff --git a/helm-charts/common/vllm-openvino/values.yaml b/helm-charts/common/vllm-openvino/values.yaml index fc1b5eeb2..579630044 100644 --- a/helm-charts/common/vllm-openvino/values.yaml +++ b/helm-charts/common/vllm-openvino/values.yaml @@ -100,7 +100,6 @@ LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 CUDA_GRAPHS: "0" VLLM_CPU_KVCACHE_SPACE: 50 -HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none global: From 027923c2e659134673c7fca03efd3995069bc7d3 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 18 Sep 2024 15:12:05 +0530 Subject: [PATCH 11/37] =?UTF-8?q?=F0=9F=94=A7=20added=20openvino=20values?= =?UTF-8?q?=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/Chart.yaml | 6 ++-- helm-charts/chatqna/README.md | 2 +- helm-charts/chatqna/templates/deployment.yaml | 2 +- helm-charts/chatqna/values.yaml | 2 +- .../common/llm-vllm-uservice/Chart.yaml | 4 +-- .../common/llm-vllm-uservice/README.md | 28 +++++++++++-------- .../templates/configmap.yaml | 6 ++-- .../common/llm-vllm-uservice/values.yaml | 2 +- helm-charts/common/vllm/README.md | 24 ++++++++++++++-- helm-charts/common/vllm/openvino-values.yaml | 24 ++++++++++++++++ .../common/vllm/templates/configmap.yaml | 13 +++++++++ .../common/vllm/templates/deployment.yaml | 14 ++++++++++ helm-charts/common/vllm/values.yaml | 2 ++ 13 files changed, 102 insertions(+), 27 deletions(-) create mode 100644 helm-charts/common/vllm/openvino-values.yaml diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml index 676df3e12..d79bc1385 100644 --- a/helm-charts/chatqna/Chart.yaml +++ b/helm-charts/chatqna/Chart.yaml @@ -21,10 +21,10 @@ dependencies: condition: tgi.enabled tags: - tgi - - name: vllm-openvino + - name: vllm version: 1.0.0 repository: "file://../common/vllm-openvino" - condition: vllm-openvino.enabled + condition: vllm.enabled tags: - vllm - name: llm-uservice @@ -36,7 +36,7 @@ dependencies: - name: llm-vllm-uservice version: 1.0.0 repository: "file://../common/llm-vllm-uservice" - condition: vllm-openvino.enabled + condition: vllm.enabled tags: - vllm - name: tei diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 3d8070900..5e4048fbe 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -73,7 +73,7 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- ```bash # To 
use OpenVINO vLLM inference engine on Xeon device -helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-vllm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm-openvino.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm-openvino.enabled=true +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-vllm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true ``` ### IMPORTANT NOTE diff --git a/helm-charts/chatqna/templates/deployment.yaml b/helm-charts/chatqna/templates/deployment.yaml index 95a8c0f0b..56d1cea7d 100644 --- a/helm-charts/chatqna/templates/deployment.yaml +++ b/helm-charts/chatqna/templates/deployment.yaml @@ -30,7 +30,7 @@ spec: containers: - name: {{ .Release.Name }} env: - {{- if (index .Values "vllm-openvino" "enabled") }} + {{- if .Values.vllm.enabled }} - name: LLM_SERVICE_HOST_IP value: {{ .Release.Name }}-llm-vllm-uservice {{- else }} diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index 56b382a35..b712d56c5 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -47,7 +47,7 @@ affinity: {} tgi: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 -vllm-openvino: +vllm: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 enabled: false diff --git a/helm-charts/common/llm-vllm-uservice/Chart.yaml b/helm-charts/common/llm-vllm-uservice/Chart.yaml index f939ca271..ded29e3b5 100644 --- a/helm-charts/common/llm-vllm-uservice/Chart.yaml +++ b/helm-charts/common/llm-vllm-uservice/Chart.yaml @@ -8,7 +8,7 @@ type: application version: 1.0.0 appVersion: "v1.0" dependencies: - - name: vllm-openvino + - name: vllm version: 1.0.0 - repository: file://../vllm-openvino + repository: file://../vllm condition: autodependency.enabled diff --git a/helm-charts/common/llm-vllm-uservice/README.md b/helm-charts/common/llm-vllm-uservice/README.md index d2fd159b3..261018d78 100644 --- a/helm-charts/common/llm-vllm-uservice/README.md +++ b/helm-charts/common/llm-vllm-uservice/README.md @@ -11,26 +11,30 @@ As this service depends on vLLM microservice, we can proceed in either of 2 ways ## (Option 1): Installing the chart separately: -First, you need to install the `vllm-openvino` chart, please refer to the [vllm-openvino](../vllm-openvino) chart for more information. +First, you need to install the `vllm-openvino` chart, please refer to the [vllm](../vllm) chart for more information. -After you've deployed the `vllm-openvino` chart successfully, please run `kubectl get svc` to get the vLLM service name with port. We need to provide this to `llm-vllm-uservice` as a value for vLLM_ENDPOINT for letting it discover and connect to the vLLM microservice. +After you've deployed the `vllm` chart successfully, please run `kubectl get svc` to get the vLLM service name with port. We need to provide this to `llm-vllm-uservice` as a value for vLLM_ENDPOINT for letting it discover and connect to the vLLM microservice. -> **_NOTE:_** While installing charts separately, if you don't provide any vLLM endpoint explicitly, it will take the default endpoint as `http://-vllm-openvino:80`. So, if you are not providing the vLLM endpoint explicitly, please make sure to provide same helm release name to both the charts while installing. 
+> **_NOTE:_** While installing charts separately, if you don't provide any vLLM endpoint explicitly, it will take the default endpoint as `http://-vllm:80`. So, if you are not providing the vLLM endpoint explicitly, please make sure to provide same helm release name to both the charts while installing. + +Get the service name for vLLM deployment by running: `kubectl get svc`. In the current case, service name would be `myvllm`. + +> **_NOTE:_** Please add the service name for vLLM to the value of no_proxy env var, if you are behind a proxy. To install the chart, run the following: ```bash cd GenAIInfra/helm-charts/common/llm-vllm-uservice export HFTOKEN="insert-your-huggingface-token-here" -export vLLM_ENDPOINT="http://vllm-openvino" -export MODELNAME="bigscience/bloom-560m" +export vLLM_ENDPOINT="http://myvllm" +export MODELNAME="Intel/neural-chat-7b-v3-3" # If proxy is required, please export the appropriate proxy values. export http_proxy= export https_proxy= helm dependency update -helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vLLM_ENDPOINT=${vLLM_ENDPOINT} --set LLM_MODEL_ID=${MODELNAME} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +helm install llmcontrol . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vLLM_ENDPOINT=${vLLM_ENDPOINT} --set LLM_MODEL_ID=${MODELNAME} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait ``` ## (Option 2): Installing the chart with automatic installation of dependency: @@ -39,14 +43,14 @@ helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN cd GenAIInfra/helm-charts/common/llm-vllm-uservice export HFTOKEN="insert-your-huggingface-token-here" export MODELDIR="/mnt/opea-models" -export MODELNAME="bigscience/bloom-560m" +export MODELNAME="Intel/neural-chat-7b-v3-3" # If proxy is required, please export the appropriate proxy values. export http_proxy= export https_proxy= helm dependency update -helm install llm-vllm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set vllm-openvino.LLM_MODEL_ID=${MODELNAME} --set autodependency.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +helm install llmcontrol . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set autodependency.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait ``` `--wait` flag in above installation command will make sure that all the dependencies are resolved and all services are deployed. @@ -59,15 +63,15 @@ To verify the installation, run the following command to make sure all pods are kubectl get pod ``` -Once you see `llm-vllm-uservice` pod and `llm-vllm-uservice-vllm-openvino` pod in ready and running state, run the following command: +Once you see `llmcontrolr-llm-vllm-uservice` pod and `llmcontrol-vllm` pod in ready and running state, run the following command: ```bash -kubectl port-forward svc/llm-vllm-uservice 9000:9000 +kubectl port-forward svc/llmcontrol-llm-vllm-uservice 9000:9000 ``` -This exposes the port 9000, on which `llm-vllm-uservice` is running inside the pod, at port 9000 on the host. +This exposes the port 9000, on which `llmcontrol-llm-vllm-uservice` is running inside the pod, at port 9000 on the host. 
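For reference, a complete verification request against the forwarded port might look like the following sketch; the payload fields follow the `query`-style convention of the GenAIComps LLM microservices and are an assumption rather than something defined in this chart:

```bash
# Hypothetical smoke test against the LLM microservice exposed on
# localhost:9000 by the port-forward above. The request schema ("query"
# plus generation parameters) is assumed, not taken from this patch.
curl http://localhost:9000/v1/chat/completions \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"query": "What is Deep Learning?", "max_new_tokens": 17, "temperature": 0.5, "streaming": false}'
```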
-Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `llm-vllm-uservice` is working: +Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `llmcontrol-llm-vllm-uservice` is working: ```bash curl http://localhost:9000/v1/chat/completions \ diff --git a/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml b/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml index 247cd614e..22e21e0f2 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml +++ b/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml @@ -11,7 +11,7 @@ data: {{- if .Values.vLLM_ENDPOINT }} vLLM_ENDPOINT: {{ .Values.vLLM_ENDPOINT | quote}} {{- else }} - vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm-openvino" + vLLM_ENDPOINT: "http://{{ .Release.Name }}-vllm" {{- end }} LLM_MODEL: {{ .Values.LLM_MODEL_ID | quote }} HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} @@ -22,9 +22,9 @@ data: http_proxy: {{ .Values.global.http_proxy | quote }} https_proxy: {{ .Values.global.https_proxy | quote }} {{- if and (not .Values.vLLM_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }} - no_proxy: "{{ .Release.Name }}-vllm-openvino,{{ .Values.global.no_proxy }}" + no_proxy: "{{ .Release.Name }}-vllm,{{ .Values.global.no_proxy }}" {{- else }} - no_proxy: "{{ .Values.global.no_proxy }}, vllm-openvino" + no_proxy: "{{ .Values.global.no_proxy }},myvllm,vllm" {{- end }} LANGCHAIN_TRACING_V2: {{ .Values.global.LANGCHAIN_TRACING_V2 | quote }} LANGCHAIN_API_KEY: {{ .Values.global.LANGCHAIN_API_KEY }} diff --git a/helm-charts/common/llm-vllm-uservice/values.yaml b/helm-charts/common/llm-vllm-uservice/values.yaml index 50240146a..7febb660b 100644 --- a/helm-charts/common/llm-vllm-uservice/values.yaml +++ b/helm-charts/common/llm-vllm-uservice/values.yaml @@ -91,7 +91,7 @@ affinity: {} LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" # Overriding the Model ID being used by vllm-openvino service.(As llm-vllm microservice depends on vllm-openvino, these 2 values should be same.) -vllm-openvino: +vllm: LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" global: diff --git a/helm-charts/common/vllm/README.md b/helm-charts/common/vllm/README.md index 28bff9700..2dd04df2e 100644 --- a/helm-charts/common/vllm/README.md +++ b/helm-charts/common/vllm/README.md @@ -10,16 +10,34 @@ To install the chart, run the following: Note that you cannot use vllm as the service release name due to [environment variables conflict](https://docs.vllm.ai/en/stable/serving/env_vars.html#environment-variables). -```console +```bash cd GenAIInfra/helm-charts/common export MODELDIR=/mnt/opea-models export MODELNAME="Intel/neural-chat-7b-v3-3" export HFTOKEN="insert-your-huggingface-token-here" + +# If you are behind a proxy, please export the appropriate proxy values. 
+export http_proxy= +export https_proxy= + +``` + +- Deploy on XEON device: + +```bash helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -# To deploy on Gaudi enabled kubernetes cluster -# helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values gaudi-values.yaml ``` +- To deploy on Gaudi enabled Kubernetes cluster: +```bash +helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values vllm/gaudi-values.yaml +``` + +- To deploy OpenVINO optimized vLLM on XEON device: +```bash +helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --values vllm/openvino-values.yaml +`` + By default, the vllm service will downloading the "Intel/neural-chat-7b-v3-3". If you already cached the model locally, you can pass it to container like this example: diff --git a/helm-charts/common/vllm/openvino-values.yaml b/helm-charts/common/vllm/openvino-values.yaml new file mode 100644 index 000000000..e12dc6505 --- /dev/null +++ b/helm-charts/common/vllm/openvino-values.yaml @@ -0,0 +1,24 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Values for OpenVINO optimized vLLM. + +openvino_enabled: true + +image: + repository: vllm + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "openvino" + +extraCmdArgs: [] + +LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + +CUDA_GRAPHS: "0" +VLLM_CPU_KVCACHE_SPACE: 50 +VLLM_OPENVINO_KVCACHE_SPACE: 32 +OMPI_MCA_btl_vader_single_copy_mechanism: none + +ov_command: ["/bin/bash"] + diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml index 80b9a97da..c794b6cb0 100644 --- a/helm-charts/common/vllm/templates/configmap.yaml +++ b/helm-charts/common/vllm/templates/configmap.yaml @@ -8,10 +8,17 @@ metadata: labels: {{- include "vllm.labels" . 
| nindent 4 }} data: + {{- if .Values.openvino_enabled }} + MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }} + PORT: {{ .Values.port | quote }} + {{- end }} HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} {{- if .Values.global.HF_ENDPOINT }} HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} {{- end }} + {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }} + OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote }} + {{- end}} http_proxy: {{ .Values.global.http_proxy | quote }} https_proxy: {{ .Values.global.https_proxy | quote }} no_proxy: {{ .Values.global.no_proxy | quote }} @@ -23,3 +30,9 @@ data: {{- if .Values.VLLM_CPU_KVCACHE_SPACE }} VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}} {{- end }} + {{- if .Values.VLLM_OPENVINO_KVCACHE_SPACE }} + VLLM_OPENVINO_KVCACHE_SPACE: {{ .Values.VLLM_OPENVINO_KVCACHE_SPACE | quote }} + {{- end }} + {{- if .Values.CUDA_GRAPHS }} + CUDA_GRAPHS: {{ .Values.CUDA_GRAPHS | quote }} + {{- end }} diff --git a/helm-charts/common/vllm/templates/deployment.yaml b/helm-charts/common/vllm/templates/deployment.yaml index 133cc0dfb..b117f9aaf 100644 --- a/helm-charts/common/vllm/templates/deployment.yaml +++ b/helm-charts/common/vllm/templates/deployment.yaml @@ -45,7 +45,20 @@ spec: {{- end }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- if .Values.openvino_enabled }} + command: {{ .Values.ov_command }} + {{- end }} args: + {{- if .Values.openvino_enabled }} + - "-c" + - | + cd / && \ + python3 -m vllm.entrypoints.openai.api_server \ + --model {{ .Values.LLM_MODEL_ID | quote }} \ + --host 0.0.0.0 \ + --port {{ .Values.port | quote }} \ + --download-dir /data + {{- else }} {{- if .Values.extraCmdArgs }} {{- range .Values.extraCmdArgs }} - {{ . | quote }} @@ -59,6 +72,7 @@ spec: - {{ .Values.port | quote }} - "--download-dir" - "/data" + {{- end }} volumeMounts: - mountPath: /data name: model-volume diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml index 3e98a21be..783e11f88 100644 --- a/helm-charts/common/vllm/values.yaml +++ b/helm-charts/common/vllm/values.yaml @@ -6,6 +6,8 @@ # Declare variables to be passed into your templates. 
replicaCount: 1 +fullnameOverride: "myvllm" +nameOverride: "myvllm" port: 2080 shmSize: 1Gi From 4154f022e9c5494225920d2eda5a3da77d85ae73 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 09:39:11 +0000 Subject: [PATCH 12/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/common/vllm/README.md | 8 +++++--- helm-charts/common/vllm/openvino-values.yaml | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/helm-charts/common/vllm/README.md b/helm-charts/common/vllm/README.md index 2dd04df2e..031e76194 100644 --- a/helm-charts/common/vllm/README.md +++ b/helm-charts/common/vllm/README.md @@ -29,14 +29,16 @@ helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MOD ``` - To deploy on Gaudi enabled Kubernetes cluster: + ```bash helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values vllm/gaudi-values.yaml ``` - To deploy OpenVINO optimized vLLM on XEON device: -```bash + +````bash helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --values vllm/openvino-values.yaml -`` +`` By default, the vllm service will downloading the "Intel/neural-chat-7b-v3-3". @@ -58,7 +60,7 @@ Open another terminal and run the following command to verify the service if wor curl http://localhost:2080/v1/completions \ -H "Content-Type: application/json" \ -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' -``` +```` ## Values diff --git a/helm-charts/common/vllm/openvino-values.yaml b/helm-charts/common/vllm/openvino-values.yaml index e12dc6505..dfd157699 100644 --- a/helm-charts/common/vllm/openvino-values.yaml +++ b/helm-charts/common/vllm/openvino-values.yaml @@ -21,4 +21,3 @@ VLLM_OPENVINO_KVCACHE_SPACE: 32 OMPI_MCA_btl_vader_single_copy_mechanism: none ov_command: ["/bin/bash"] - From 8b911f5bfdc9cfe08568a1e3bf73bab2301398b1 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 18 Sep 2024 15:33:58 +0530 Subject: [PATCH 13/37] =?UTF-8?q?=F0=9F=A9=B9=20minor=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/Chart.yaml | 2 +- helm-charts/common/vllm/values.yaml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml index fa4fd4bff..284db3d6d 100644 --- a/helm-charts/chatqna/Chart.yaml +++ b/helm-charts/chatqna/Chart.yaml @@ -23,7 +23,7 @@ dependencies: - tgi - name: vllm version: 1.0.0 - repository: "file://../common/vllm-openvino" + repository: "file://../common/vllm" condition: vllm.enabled tags: - vllm diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml index 783e11f88..3e98a21be 100644 --- a/helm-charts/common/vllm/values.yaml +++ b/helm-charts/common/vllm/values.yaml @@ -6,8 +6,6 @@ # Declare variables to be passed into your templates. 
replicaCount: 1 -fullnameOverride: "myvllm" -nameOverride: "myvllm" port: 2080 shmSize: 1Gi From 2ba4c8f9164087b145e0d5ba25382a7c1dd129b1 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 18 Sep 2024 16:23:53 +0530 Subject: [PATCH 14/37] =?UTF-8?q?=F0=9F=A9=B9=20renamed=20chart=20llm-vllm?= =?UTF-8?q?-uservice=20to=20avoid=20conflict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/Chart.yaml | 4 +- helm-charts/chatqna/README.md | 13 +- helm-charts/chatqna/templates/deployment.yaml | 2 +- helm-charts/chatqna/values.yaml | 2 +- .../.helmignore | 0 .../Chart.yaml | 4 +- .../README.md | 24 ++-- .../templates/_helpers.tpl | 20 +-- .../templates/configmap.yaml | 4 +- .../templates/deployment.yaml | 10 +- .../templates/service.yaml | 6 +- .../templates/tests/test-pod.yaml | 6 +- .../values.yaml | 2 +- helm-charts/common/vllm-openvino/.helmignore | 23 ---- helm-charts/common/vllm-openvino/Chart.yaml | 9 -- helm-charts/common/vllm-openvino/README.md | 68 ---------- .../vllm-openvino/templates/_helpers.tpl | 62 --------- .../vllm-openvino/templates/configmap.yaml | 27 ---- .../vllm-openvino/templates/deployment.yaml | 113 ---------------- .../vllm-openvino/templates/service.yaml | 18 --- .../templates/tests/test-pod.yaml | 29 ---- helm-charts/common/vllm-openvino/values.yaml | 125 ------------------ 22 files changed, 50 insertions(+), 521 deletions(-) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/.helmignore (100%) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/Chart.yaml (64%) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/README.md (79%) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/templates/_helpers.tpl (73%) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/templates/configmap.yaml (91%) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/templates/deployment.yaml (88%) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/templates/service.yaml (64%) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/templates/tests/test-pod.yaml (83%) rename helm-charts/common/{llm-vllm-uservice => llm-ctrl-uservice}/values.yaml (98%) delete mode 100644 helm-charts/common/vllm-openvino/.helmignore delete mode 100644 helm-charts/common/vllm-openvino/Chart.yaml delete mode 100644 helm-charts/common/vllm-openvino/README.md delete mode 100644 helm-charts/common/vllm-openvino/templates/_helpers.tpl delete mode 100644 helm-charts/common/vllm-openvino/templates/configmap.yaml delete mode 100644 helm-charts/common/vllm-openvino/templates/deployment.yaml delete mode 100644 helm-charts/common/vllm-openvino/templates/service.yaml delete mode 100644 helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml delete mode 100644 helm-charts/common/vllm-openvino/values.yaml diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml index 284db3d6d..ba5e5967f 100644 --- a/helm-charts/chatqna/Chart.yaml +++ b/helm-charts/chatqna/Chart.yaml @@ -33,9 +33,9 @@ dependencies: condition: tgi.enabled tags: - tgi - - name: llm-vllm-uservice + - name: llm-ctrl-uservice version: 1.0.0 - repository: "file://../common/llm-vllm-uservice" + repository: "file://../common/llm-ctrl-uservice" condition: vllm.enabled tags: - vllm diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 4b61f2135..93fd157f8 100644 --- a/helm-charts/chatqna/README.md +++ 
b/helm-charts/chatqna/README.md @@ -17,9 +17,9 @@ Apart from above mentioned services, there are following conditional dependencie - [llm-uservice](../common/llm-uservice) - [tgi](../common/tgi) -2. If we want to use OpenVINO vLLM inference service, following 2 services would be required: - - [llm-vllm-uservice](../common/llm-vllm-uservice) - - [vllm-openvino](../common/vllm-openvino) +2. If we want to use vLLM inference service, following 2 services would be required: + - [llm-ctrl-uservice](../common/llm-ctrl-uservice) + - [vllm](../common/vllm) ## Installing the Chart @@ -76,8 +76,11 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- > **_NOTE:_** Default installation will use [TGI (Text Generation Inference)](https://github.com/huggingface/text-generation-inference) as inference engine. To use vLLM as inference engine, please see below. ```bash -# To use OpenVINO vLLM inference engine on Xeon device -helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-vllm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true +# To use vLLM inference engine on XEON device +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true + +# To use OpenVINO optimized vLLM inference engine on XEON device +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true --values common/vllm/openvino-values.yaml ``` ### IMPORTANT NOTE diff --git a/helm-charts/chatqna/templates/deployment.yaml b/helm-charts/chatqna/templates/deployment.yaml index 08f7e41e7..f30030172 100644 --- a/helm-charts/chatqna/templates/deployment.yaml +++ b/helm-charts/chatqna/templates/deployment.yaml @@ -35,7 +35,7 @@ spec: env: {{- if .Values.vllm.enabled }} - name: LLM_SERVICE_HOST_IP - value: {{ .Release.Name }}-llm-vllm-uservice + value: {{ .Release.Name }}-llm-ctrl-uservice {{- else }} - name: LLM_SERVICE_HOST_IP value: {{ .Release.Name }}-llm-uservice diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index 6913f58e6..567b0025e 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -56,7 +56,7 @@ vllm: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 enabled: false -llm-vllm-uservice: +llm-ctrl-uservice: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 tags: diff --git a/helm-charts/common/llm-vllm-uservice/.helmignore b/helm-charts/common/llm-ctrl-uservice/.helmignore similarity index 100% rename from helm-charts/common/llm-vllm-uservice/.helmignore rename to helm-charts/common/llm-ctrl-uservice/.helmignore diff --git a/helm-charts/common/llm-vllm-uservice/Chart.yaml b/helm-charts/common/llm-ctrl-uservice/Chart.yaml similarity index 64% rename from helm-charts/common/llm-vllm-uservice/Chart.yaml rename to helm-charts/common/llm-ctrl-uservice/Chart.yaml index ded29e3b5..95d7b24d9 100644 --- a/helm-charts/common/llm-vllm-uservice/Chart.yaml +++ b/helm-charts/common/llm-ctrl-uservice/Chart.yaml @@ -2,8 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 apiVersion: v2 -name: llm-vllm-uservice -description: A Helm chart for LLM microservice for which 
connects with vLLM microservice to receive inferences. +name: llm-ctrl-uservice +description: A Helm chart for LLM controller microservice which connects with vLLM microservice to provide inferences. type: application version: 1.0.0 appVersion: "v1.0" diff --git a/helm-charts/common/llm-vllm-uservice/README.md b/helm-charts/common/llm-ctrl-uservice/README.md similarity index 79% rename from helm-charts/common/llm-vllm-uservice/README.md rename to helm-charts/common/llm-ctrl-uservice/README.md index 261018d78..f7c5edbd8 100644 --- a/helm-charts/common/llm-vllm-uservice/README.md +++ b/helm-charts/common/llm-ctrl-uservice/README.md @@ -1,19 +1,19 @@ -# llm-vllm Microservice +# llm-ctrl Microservice Helm chart for deploying a microservice which facilitates connections and handles responses from OpenVINO vLLM microservice. -`llm-vllm-uservice` depends on OpenVINO vLLM. You should properly set `vLLM_ENDPOINT` as the HOST URI of vLLM microservice. If not set, it will consider the default value : `http://-vllm-openvino:80` +`llm-ctrl-uservice` depends on OpenVINO vLLM. You should properly set `vLLM_ENDPOINT` as the HOST URI of vLLM microservice. If not set, it will consider the default value : `http://-vllm-openvino:80` As this service depends on vLLM microservice, we can proceed in either of 2 ways: - Install both microservices separately one after another. -- Install the vLLM microservice as dependency for the our main `llm-vllm-uservice` microservice. +- Install the vLLM microservice as dependency for the our main `llm-ctrl-uservice` microservice. ## (Option 1): Installing the chart separately: First, you need to install the `vllm-openvino` chart, please refer to the [vllm](../vllm) chart for more information. -After you've deployed the `vllm` chart successfully, please run `kubectl get svc` to get the vLLM service name with port. We need to provide this to `llm-vllm-uservice` as a value for vLLM_ENDPOINT for letting it discover and connect to the vLLM microservice. +After you've deployed the `vllm` chart successfully, please run `kubectl get svc` to get the vLLM service name with port. We need to provide this to `llm-ctrl-uservice` as a value for vLLM_ENDPOINT for letting it discover and connect to the vLLM microservice. > **_NOTE:_** While installing charts separately, if you don't provide any vLLM endpoint explicitly, it will take the default endpoint as `http://-vllm:80`. So, if you are not providing the vLLM endpoint explicitly, please make sure to provide same helm release name to both the charts while installing. @@ -24,7 +24,7 @@ Get the service name for vLLM deployment by running: `kubectl get svc`. In the c To install the chart, run the following: ```bash -cd GenAIInfra/helm-charts/common/llm-vllm-uservice +cd GenAIInfra/helm-charts/common/llm-ctrl-uservice export HFTOKEN="insert-your-huggingface-token-here" export vLLM_ENDPOINT="http://myvllm" export MODELNAME="Intel/neural-chat-7b-v3-3" @@ -34,13 +34,13 @@ export http_proxy= export https_proxy= helm dependency update -helm install llmcontrol . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vLLM_ENDPOINT=${vLLM_ENDPOINT} --set LLM_MODEL_ID=${MODELNAME} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +helm install llm-ctrl-uservice . 
--set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set vLLM_ENDPOINT=${vLLM_ENDPOINT} --set LLM_MODEL_ID=${MODELNAME} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait ``` ## (Option 2): Installing the chart with automatic installation of dependency: ```bash -cd GenAIInfra/helm-charts/common/llm-vllm-uservice +cd GenAIInfra/helm-charts/common/llm-ctrl-uservice export HFTOKEN="insert-your-huggingface-token-here" export MODELDIR="/mnt/opea-models" export MODELNAME="Intel/neural-chat-7b-v3-3" @@ -50,7 +50,7 @@ export http_proxy= export https_proxy= helm dependency update -helm install llmcontrol . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set autodependency.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +helm install llm-ctrl-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set autodependency.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait ``` `--wait` flag in above installation command will make sure that all the dependencies are resolved and all services are deployed. @@ -63,15 +63,15 @@ To verify the installation, run the following command to make sure all pods are kubectl get pod ``` -Once you see `llmcontrolr-llm-vllm-uservice` pod and `llmcontrol-vllm` pod in ready and running state, run the following command: +Once you see `llm-ctrl-uservice` pod and `llm-ctrl-uservice-vllm` pod in ready and running state, run the following command: ```bash -kubectl port-forward svc/llmcontrol-llm-vllm-uservice 9000:9000 +kubectl port-forward svc/llm-ctrl-uservice 9000:9000 ``` -This exposes the port 9000, on which `llmcontrol-llm-vllm-uservice` is running inside the pod, at port 9000 on the host. +This exposes the port 9000, on which `llm-ctrl-uservice` is running inside the pod, at port 9000 on the host. -Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `llmcontrol-llm-vllm-uservice` is working: +Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `llm-ctrl-uservice` is working: ```bash curl http://localhost:9000/v1/chat/completions \ diff --git a/helm-charts/common/llm-vllm-uservice/templates/_helpers.tpl b/helm-charts/common/llm-ctrl-uservice/templates/_helpers.tpl similarity index 73% rename from helm-charts/common/llm-vllm-uservice/templates/_helpers.tpl rename to helm-charts/common/llm-ctrl-uservice/templates/_helpers.tpl index 211968599..3cf82f83a 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/_helpers.tpl +++ b/helm-charts/common/llm-ctrl-uservice/templates/_helpers.tpl @@ -1,7 +1,7 @@ {{/* Expand the name of the chart. */}} -{{- define "llm-vllm-uservice.name" -}} +{{- define "llm-ctrl-uservice.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} @@ -10,7 +10,7 @@ Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). If release name contains chart name it will be used as a full name. 
*/}} -{{- define "llm-vllm-uservice.fullname" -}} +{{- define "llm-ctrl-uservice.fullname" -}} {{- if .Values.fullnameOverride }} {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- else }} @@ -26,16 +26,16 @@ If release name contains chart name it will be used as a full name. {{/* Create chart name and version as used by the chart label. */}} -{{- define "llm-vllm-uservice.chart" -}} +{{- define "llm-ctrl-uservice.chart" -}} {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Common labels */}} -{{- define "llm-vllm-uservice.labels" -}} -helm.sh/chart: {{ include "llm-vllm-uservice.chart" . }} -{{ include "llm-vllm-uservice.selectorLabels" . }} +{{- define "llm-ctrl-uservice.labels" -}} +helm.sh/chart: {{ include "llm-ctrl-uservice.chart" . }} +{{ include "llm-ctrl-uservice.selectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} @@ -45,17 +45,17 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{/* Selector labels */}} -{{- define "llm-vllm-uservice.selectorLabels" -}} -app.kubernetes.io/name: {{ include "llm-vllm-uservice.name" . }} +{{- define "llm-ctrl-uservice.selectorLabels" -}} +app.kubernetes.io/name: {{ include "llm-ctrl-uservice.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* Create the name of the service account to use */}} -{{- define "llm-vllm-uservice.serviceAccountName" -}} +{{- define "llm-ctrl-uservice.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} -{{- default (include "llm-vllm-uservice.fullname" .) .Values.serviceAccount.name }} +{{- default (include "llm-ctrl-uservice.fullname" .) .Values.serviceAccount.name }} {{- else }} {{- default "default" .Values.serviceAccount.name }} {{- end }} diff --git a/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml b/helm-charts/common/llm-ctrl-uservice/templates/configmap.yaml similarity index 91% rename from helm-charts/common/llm-vllm-uservice/templates/configmap.yaml rename to helm-charts/common/llm-ctrl-uservice/templates/configmap.yaml index 22e21e0f2..4bc0fcea4 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/configmap.yaml +++ b/helm-charts/common/llm-ctrl-uservice/templates/configmap.yaml @@ -4,9 +4,9 @@ apiVersion: v1 kind: ConfigMap metadata: - name: {{ include "llm-vllm-uservice.fullname" . }}-config + name: {{ include "llm-ctrl-uservice.fullname" . }}-config labels: - {{- include "llm-vllm-uservice.labels" . | nindent 4 }} + {{- include "llm-ctrl-uservice.labels" . | nindent 4 }} data: {{- if .Values.vLLM_ENDPOINT }} vLLM_ENDPOINT: {{ .Values.vLLM_ENDPOINT | quote}} diff --git a/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml b/helm-charts/common/llm-ctrl-uservice/templates/deployment.yaml similarity index 88% rename from helm-charts/common/llm-vllm-uservice/templates/deployment.yaml rename to helm-charts/common/llm-ctrl-uservice/templates/deployment.yaml index 96c0658e4..9020a59de 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/deployment.yaml +++ b/helm-charts/common/llm-ctrl-uservice/templates/deployment.yaml @@ -4,14 +4,14 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ include "llm-vllm-uservice.fullname" . }} + name: {{ include "llm-ctrl-uservice.fullname" . }} labels: - {{- include "llm-vllm-uservice.labels" . | nindent 4 }} + {{- include "llm-ctrl-uservice.labels" . 
| nindent 4 }} spec: replicas: {{ .Values.replicaCount }} selector: matchLabels: - {{- include "llm-vllm-uservice.selectorLabels" . | nindent 6 }} + {{- include "llm-ctrl-uservice.selectorLabels" . | nindent 6 }} template: metadata: {{- with .Values.podAnnotations }} @@ -19,7 +19,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} labels: - {{- include "llm-vllm-uservice.labels" . | nindent 8 }} + {{- include "llm-ctrl-uservice.labels" . | nindent 8 }} {{- with .Values.podLabels }} {{- toYaml . | nindent 8 }} {{- end }} @@ -34,7 +34,7 @@ spec: - name: {{ .Chart.Name }} envFrom: - configMapRef: - name: {{ include "llm-vllm-uservice.fullname" . }}-config + name: {{ include "llm-ctrl-uservice.fullname" . }}-config {{- if .Values.global.extraEnvConfig }} - configMapRef: name: {{ .Values.global.extraEnvConfig }} diff --git a/helm-charts/common/llm-vllm-uservice/templates/service.yaml b/helm-charts/common/llm-ctrl-uservice/templates/service.yaml similarity index 64% rename from helm-charts/common/llm-vllm-uservice/templates/service.yaml rename to helm-charts/common/llm-ctrl-uservice/templates/service.yaml index 2c58d8f24..ef5ae38fb 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/service.yaml +++ b/helm-charts/common/llm-ctrl-uservice/templates/service.yaml @@ -4,9 +4,9 @@ apiVersion: v1 kind: Service metadata: - name: {{ include "llm-vllm-uservice.fullname" . }} + name: {{ include "llm-ctrl-uservice.fullname" . }} labels: - {{- include "llm-vllm-uservice.labels" . | nindent 4 }} + {{- include "llm-ctrl-uservice.labels" . | nindent 4 }} spec: type: {{ .Values.service.type }} ports: @@ -15,4 +15,4 @@ spec: protocol: TCP name: llm-vllm selector: - {{- include "llm-vllm-uservice.selectorLabels" . | nindent 4 }} + {{- include "llm-ctrl-uservice.selectorLabels" . | nindent 4 }} diff --git a/helm-charts/common/llm-vllm-uservice/templates/tests/test-pod.yaml b/helm-charts/common/llm-ctrl-uservice/templates/tests/test-pod.yaml similarity index 83% rename from helm-charts/common/llm-vllm-uservice/templates/tests/test-pod.yaml rename to helm-charts/common/llm-ctrl-uservice/templates/tests/test-pod.yaml index a2803fc95..a0d86f3da 100644 --- a/helm-charts/common/llm-vllm-uservice/templates/tests/test-pod.yaml +++ b/helm-charts/common/llm-ctrl-uservice/templates/tests/test-pod.yaml @@ -4,9 +4,9 @@ apiVersion: v1 kind: Pod metadata: - name: "{{ include "llm-vllm-uservice.fullname" . }}-testpod" + name: "{{ include "llm-ctrl-uservice.fullname" . }}-testpod" labels: - {{- include "llm-vllm-uservice.labels" . | nindent 4 }} + {{- include "llm-ctrl-uservice.labels" . | nindent 4 }} annotations: "helm.sh/hook": test spec: @@ -18,7 +18,7 @@ spec: - | max_retry=20; for ((i=1; i<=max_retry; i++)); do - curl http://{{ include "llm-vllm-uservice.fullname" . }}:{{ .Values.service.port }}/v1/chat/completions -sS --fail-with-body \ + curl http://{{ include "llm-ctrl-uservice.fullname" . 
}}:{{ .Values.service.port }}/v1/chat/completions -sS --fail-with-body \ -X POST \ -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ -H 'Content-Type: application/json' && break; diff --git a/helm-charts/common/llm-vllm-uservice/values.yaml b/helm-charts/common/llm-ctrl-uservice/values.yaml similarity index 98% rename from helm-charts/common/llm-vllm-uservice/values.yaml rename to helm-charts/common/llm-ctrl-uservice/values.yaml index 7febb660b..54d504792 100644 --- a/helm-charts/common/llm-vllm-uservice/values.yaml +++ b/helm-charts/common/llm-ctrl-uservice/values.yaml @@ -1,7 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -# Default values for llm-vllm-uservice. +# Default values for llm-ctrl-uservice. # This is a YAML-formatted file. # Declare variables to be passed into your templates. diff --git a/helm-charts/common/vllm-openvino/.helmignore b/helm-charts/common/vllm-openvino/.helmignore deleted file mode 100644 index 0e8a0eb36..000000000 --- a/helm-charts/common/vllm-openvino/.helmignore +++ /dev/null @@ -1,23 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*.orig -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/helm-charts/common/vllm-openvino/Chart.yaml b/helm-charts/common/vllm-openvino/Chart.yaml deleted file mode 100644 index da51598b2..000000000 --- a/helm-charts/common/vllm-openvino/Chart.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v2 -name: vllm-openvino -description: A Helm chart for OpenVINO optimized vLLM Service -type: application -version: 1.0.0 -appVersion: "v1.0" diff --git a/helm-charts/common/vllm-openvino/README.md b/helm-charts/common/vllm-openvino/README.md deleted file mode 100644 index 770fcaea5..000000000 --- a/helm-charts/common/vllm-openvino/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# OpenVINO vLLM - -Helm chart for deploying OpenVINO optimized vLLM Inference service. - -## Installing the Chart - -To install the chart, run the following: - -```bash -cd GenAIInfra/helm-charts/common -export MODELDIR=/mnt/opea-models -export MODELNAME="bigscience/bloom-560m" -export HFTOKEN="insert-your-huggingface-token-here" - -# If proxy is required, please export the appropriate proxy values. -export http_proxy= -export https_proxy= - -helm install vllm-openvino vllm-openvino --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait -``` - -`--wait` flag in the above helm installation command lets the shell wait till `vllm-openvino` is completely up and ready. - -> **_NOTE:_** Make sure your `MODELDIR` exists on the node where your workload is scheduled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model. 
- -If you already cached the model locally, you can pass it to container like this example: - -MODELDIR=/mnt/opea-models - -MODELNAME="/data/models--bigscience--bloom-560m" - -> **_NOTE:_** By default, the vLLM service will be downloading **Intel/neural-chat-7b-v3-3** model from Huggingface, which is around 4GB in size. To use a smaller model, please set the LLM_MODEL_ID value to your desired model, as shown above, while installing the chart. - -## Verify - -To verify the installation, run the following command to make sure all pods are running. Please note that it may take a while to come the vLLM pod in ready state. - -```bash -kubectl get pod -``` - -Once you see `vllm-openvino` pod in ready and running state, run the following command: - -```bash - kubectl port-forward svc/vllm-openvino 2080:80 -``` - -This exposes the port 80, on which `vllm-openvino` service is running inside the pod, at port 2080 on the host. - -Now, we can access the service from the host machine. Open another terminal and run the following command to verify whether `vllm-openvino` service is working: - -```bash -curl http://localhost:2080/v1/completions -sS --fail-with-body \ - -X POST \ - -d '{"prompt":"What is Deep Learning?", "model": "bigscience/bloom-560m", "max_tokens":17, "temperature": 0.5}' \ - -H 'Content-Type: application/json' -``` - -## Values - -| Key | Type | Default | Description | -| ------------------------------- | ------ | ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| LLM_MODEL_ID | string | `"bigscience/bloom-560m"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.HUGGINGFACEHUB_API_TOKEN | string | `insert-your-huggingface-token-here` | Hugging Face API token | -| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, vLLM will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Setting this to null/empty will force it to download model. | -| image.repository | string | `"vllm"` | | -| image.tag | string | `"openvino"` | | -| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployment based on metrics it provides. See HPA section in ../../README.md before enabling! | diff --git a/helm-charts/common/vllm-openvino/templates/_helpers.tpl b/helm-charts/common/vllm-openvino/templates/_helpers.tpl deleted file mode 100644 index 7c16c1206..000000000 --- a/helm-charts/common/vllm-openvino/templates/_helpers.tpl +++ /dev/null @@ -1,62 +0,0 @@ -{{/* -Expand the name of the chart. -*/}} -{{- define "vllm-openvino.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. 
-*/}} -{{- define "vllm-openvino.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "vllm-openvino.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "vllm-openvino.labels" -}} -helm.sh/chart: {{ include "vllm-openvino.chart" . }} -{{ include "vllm-openvino.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "vllm-openvino.selectorLabels" -}} -app.kubernetes.io/name: {{ include "vllm-openvino.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} - -{{/* -Create the name of the service account to use -*/}} -{{- define "vllm-openvino.serviceAccountName" -}} -{{- if .Values.serviceAccount.create }} -{{- default (include "vllm-openvino.fullname" .) .Values.serviceAccount.name }} -{{- else }} -{{- default "default" .Values.serviceAccount.name }} -{{- end }} -{{- end }} diff --git a/helm-charts/common/vllm-openvino/templates/configmap.yaml b/helm-charts/common/vllm-openvino/templates/configmap.yaml deleted file mode 100644 index a9b1f7b33..000000000 --- a/helm-charts/common/vllm-openvino/templates/configmap.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "vllm-openvino.fullname" . }}-config - labels: - {{- include "vllm-openvino.labels" . | nindent 4 }} -data: - MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }} - PORT: {{ .Values.service.port | quote }} - HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} - VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote }} - OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote }} - {{- if .Values.global.HF_ENDPOINT }} - HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} - {{- end }} - http_proxy: {{ .Values.global.http_proxy | quote }} - https_proxy: {{ .Values.global.https_proxy | quote }} - no_proxy: {{ .Values.global.no_proxy | quote }} - HABANA_LOGS: "/tmp/habana_logs" - NUMBA_CACHE_DIR: "/tmp" - HF_HOME: "/tmp/.cache/huggingface" - {{- if .Values.CUDA_GRAPHS }} - CUDA_GRAPHS: {{ .Values.CUDA_GRAPHS | quote }} - {{- end }} diff --git a/helm-charts/common/vllm-openvino/templates/deployment.yaml b/helm-charts/common/vllm-openvino/templates/deployment.yaml deleted file mode 100644 index 6e72f9e2b..000000000 --- a/helm-charts/common/vllm-openvino/templates/deployment.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "vllm-openvino.fullname" . }} - labels: - {{- include "vllm-openvino.labels" . | nindent 4 }} -spec: - {{- if not .Values.global.horizontalPodAutoscaler.enabled }} - replicas: {{ .Values.replicaCount }} - {{- end }} - selector: - matchLabels: - {{- include "vllm-openvino.selectorLabels" . 
| nindent 6 }} - template: - metadata: - {{- with .Values.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "vllm-openvino.labels" . | nindent 8 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - containers: - - name: {{ .Chart.Name }} - envFrom: - - configMapRef: - name: {{ include "vllm-openvino.fullname" . }}-config - {{- if .Values.global.extraEnvConfig }} - - configMapRef: - name: {{ .Values.global.extraEnvConfig }} - optional: true - {{- end }} - securityContext: - {{- if .Values.global.modelUseHostPath }} - {} - {{- else }} - {{- toYaml .Values.securityContext | nindent 12 }} - {{- end }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - ports: - - name: http - containerPort: {{ .Values.service.targetPort }} - protocol: TCP - {{- if .Values.livenessProbe}} - livenessProbe: - {{- toYaml .Values.livenessProbe | nindent 12 }} - {{ end }} - {{- if .Values.readinessProbe}} - readinessProbe: - {{- toYaml .Values.readinessProbe | nindent 12 }} - {{ end }} - {{- if .Values.startupProbe }} - startupProbe: - {{- toYaml .Values.startupProbe | nindent 12 }} - {{- end }} - resources: - {{- toYaml .Values.resources | nindent 12 }} - {{- with .Values.volumeMounts }} - volumeMounts: - {{- toYaml . | nindent 12 }} - {{- end }} - command: ["/bin/bash"] - args: - - "-c" - - | - cd / && \ - python3 -m vllm.entrypoints.openai.api_server \ - --model "{{ .Values.LLM_MODEL_ID }}" \ - --host 0.0.0.0 \ - --port 80 - volumes: - - name: model-volume - {{- if .Values.global.modelUsePVC }} - persistentVolumeClaim: - claimName: {{ .Values.global.modelUsePVC }} - {{- else if .Values.global.modelUseHostPath }} - hostPath: - path: {{ .Values.global.modelUseHostPath }} - type: Directory - {{- else }} - emptyDir: {} - {{- end }} - - name: tmp - emptyDir: {} - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- if .Values.global.horizontalPodAutoscaler.enabled }} - # extra time to finish processing buffered requests before HPA forcibly terminates pod - terminationGracePeriodSeconds: 120 - {{- end }} diff --git a/helm-charts/common/vllm-openvino/templates/service.yaml b/helm-charts/common/vllm-openvino/templates/service.yaml deleted file mode 100644 index 89fe92e01..000000000 --- a/helm-charts/common/vllm-openvino/templates/service.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: {{ include "vllm-openvino.fullname" . }} - labels: - {{- include "vllm-openvino.labels" . | nindent 4 }} -spec: - type: {{ .Values.service.type }} - ports: - - port: {{ .Values.service.port }} - targetPort: {{ .Values.service.targetPort }} - protocol: TCP - name: http - selector: - {{- include "vllm-openvino.selectorLabels" . 
| nindent 4 }} diff --git a/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml b/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml deleted file mode 100644 index 1fcb75793..000000000 --- a/helm-charts/common/vllm-openvino/templates/tests/test-pod.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Pod -metadata: - name: "{{ include "vllm-openvino.fullname" . }}-testpod" - labels: - {{- include "vllm-openvino.labels" . | nindent 4 }} - annotations: - "helm.sh/hook": test -spec: - containers: - - name: curl - image: python:3.10.14 - command: ['bash', '-c'] - args: - - | - max_retry=20; - for ((i=1; i<=max_retry; i++)); do \ - curl http://{{ include "vllm-openvino.fullname" . }}/v1/completions -sS --fail-with-body \ - -X POST \ - -d '{"prompt":"What is Deep Learning?", "model": {{ .Values.LLM_MODEL_ID | quote }}, "max_tokens":17, "temperature": 0.5}' \ - -H 'Content-Type: application/json' && break; - curlcode=$? - if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi; - done; - if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi - restartPolicy: Never diff --git a/helm-charts/common/vllm-openvino/values.yaml b/helm-charts/common/vllm-openvino/values.yaml deleted file mode 100644 index 579630044..000000000 --- a/helm-charts/common/vllm-openvino/values.yaml +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Default values for vllm-openvino. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -replicaCount: 1 - -image: - repository: vllm - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - tag: "openvino" - -imagePullSecrets: [] -nameOverride: "" -fullnameOverride: "" - -podAnnotations: {} -podLabels: {} - -podSecurityContext: {} - # fsGroup: 2000 - - -securityContext: - readOnlyRootFilesystem: true - allowPrivilegeEscalation: false - runAsNonRoot: true - runAsUser: 1000 - capabilities: - drop: - - ALL - seccompProfile: - type: RuntimeDefault - -service: - type: ClusterIP - port: 80 - targetPort: 80 - -resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - -livenessProbe: - tcpSocket: - port: http - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 24 -readinessProbe: - tcpSocket: - port: http - initialDelaySeconds: 5 - periodSeconds: 5 -startupProbe: - tcpSocket: - port: http - initialDelaySeconds: 5 - periodSeconds: 5 - failureThreshold: 120 - -horizontalPodAutoscaler: - minReplicas: 1 - maxReplicas: 6 - # targetCPUUtilizationPercentage: 80 - # targetMemoryUtilizationPercentage: 80 - -# Additional volumes on the output Deployment definition. -volumes: [] -# - name: foo -# secret: -# secretName: mysecret -# optional: false - -# Additional volumeMounts on the output Deployment definition. 
-volumeMounts: - - mountPath: /data - name: model-volume - - mountPath: /tmp - name: tmp - -nodeSelector: {} - -tolerations: [] - -affinity: {} - -LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 - -CUDA_GRAPHS: "0" -VLLM_CPU_KVCACHE_SPACE: 50 -OMPI_MCA_btl_vader_single_copy_mechanism: none - -global: - http_proxy: "" - https_proxy: "" - no_proxy: "" - HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - - # Choose where to save your downloaded models - # Set modelUseHostPath for local directory, this is good for one node test. Example: - # modelUseHostPath: /mnt/opea-models - # Set modelUsePVC for PersistentVolumeClaim(PVC), which is suitable for multinode deployment. Example: - # modelUsePVC: model-volume - # You can only set one of the following var, the behavior is not defined is both are set. - # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume. - modelUseHostPath: "" - modelUsePVC: "" - # Enabling HPA will: - # - Ignore above replica count, as it will be controlled by HPA - # - Add example HPA scaling rules with thresholds suitable for Xeon deployments - # - Require custom metrics ConfigMap available in the main application chart - horizontalPodAutoscaler: - enabled: false From 207d2bd93f33403342409809ae8dfb768f86bcb1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 10:44:57 +0000 Subject: [PATCH 15/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/chatqna/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 93fd157f8..ab737c77e 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -80,7 +80,7 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true # To use OpenVINO optimized vLLM inference engine on XEON device -helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true --values common/vllm/openvino-values.yaml +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true --values common/vllm/openvino-values.yaml ``` ### IMPORTANT NOTE From 01eb2b4680dc24cc2b0dc21560906b3427726bba Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Mon, 30 Sep 2024 09:59:10 +0530 Subject: [PATCH 16/37] updated vllm-openvino image Signed-off-by: Krishna Murti --- helm-charts/common/vllm/openvino-values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm-charts/common/vllm/openvino-values.yaml b/helm-charts/common/vllm/openvino-values.yaml index dfd157699..5e72d5b00 100644 --- a/helm-charts/common/vllm/openvino-values.yaml +++ b/helm-charts/common/vllm/openvino-values.yaml @@ -6,10 +6,10 @@ openvino_enabled: true image: - repository: vllm + repository: opea/vllm-openvino pullPolicy: IfNotPresent # Overrides the 
image tag whose default is the chart appVersion. - tag: "openvino" + tag: "latest" extraCmdArgs: [] From 738ff5913da6f33158ae3fda5a89c362cad282d6 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Tue, 8 Oct 2024 09:37:55 +0530 Subject: [PATCH 17/37] =?UTF-8?q?=F0=9F=94=96=20updated=20tags=20for=20llm?= =?UTF-8?q?-vllm=20and=20ctrl-uservice?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/common/llm-ctrl-uservice/values.yaml | 2 +- helm-charts/common/vllm/openvino-values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/helm-charts/common/llm-ctrl-uservice/values.yaml b/helm-charts/common/llm-ctrl-uservice/values.yaml index 54d504792..fb402a7b1 100644 --- a/helm-charts/common/llm-ctrl-uservice/values.yaml +++ b/helm-charts/common/llm-ctrl-uservice/values.yaml @@ -18,7 +18,7 @@ image: repository: opea/llm-vllm pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. - tag: "latest" + tag: "1.0" imagePullSecrets: [] nameOverride: "" diff --git a/helm-charts/common/vllm/openvino-values.yaml b/helm-charts/common/vllm/openvino-values.yaml index 5e72d5b00..a722e196f 100644 --- a/helm-charts/common/vllm/openvino-values.yaml +++ b/helm-charts/common/vllm/openvino-values.yaml @@ -9,7 +9,7 @@ image: repository: opea/vllm-openvino pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. - tag: "latest" + tag: "1.0" extraCmdArgs: [] From e7de84c12e2e1632c49364a655b39d8d67364234 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Tue, 8 Oct 2024 15:23:06 +0530 Subject: [PATCH 18/37] =?UTF-8?q?=F0=9F=94=96=20added=20latest=20tag=20for?= =?UTF-8?q?=20llm-vllm=20and=20ctrl-uservice?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/common/llm-ctrl-uservice/values.yaml | 2 +- helm-charts/common/vllm/openvino-values.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/helm-charts/common/llm-ctrl-uservice/values.yaml b/helm-charts/common/llm-ctrl-uservice/values.yaml index fb402a7b1..54d504792 100644 --- a/helm-charts/common/llm-ctrl-uservice/values.yaml +++ b/helm-charts/common/llm-ctrl-uservice/values.yaml @@ -18,7 +18,7 @@ image: repository: opea/llm-vllm pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. - tag: "1.0" + tag: "latest" imagePullSecrets: [] nameOverride: "" diff --git a/helm-charts/common/vllm/openvino-values.yaml b/helm-charts/common/vllm/openvino-values.yaml index a722e196f..5e72d5b00 100644 --- a/helm-charts/common/vllm/openvino-values.yaml +++ b/helm-charts/common/vllm/openvino-values.yaml @@ -9,7 +9,7 @@ image: repository: opea/vllm-openvino pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. 
- tag: "1.0" + tag: "latest" extraCmdArgs: [] From 86b8064a225117b91d7b5baaadbeff7637f2f0d4 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 9 Oct 2024 01:43:03 +0530 Subject: [PATCH 19/37] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20openvino=20values?= =?UTF-8?q?=20issue=20for=20chatqna?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 1cbdf657a..292eb933d 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -78,10 +78,14 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- ```bash # To use vLLM inference engine on XEON device + helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true # To use OpenVINO optimized vLLM inference engine on XEON device -helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true --values common/vllm/openvino-values.yaml + +helm -f ./chatqna/vllm-openvino-values.yaml install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true + +# If you are behind a proxy, please add `--set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy}` in above installation commands. Please make sure `http_proxy` and `https_proxy` environment variables are set in your current shell environment. ``` ### IMPORTANT NOTE From 34f71b6341b6db4f07cacc381b639319248f2d80 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 20:19:58 +0000 Subject: [PATCH 20/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/chatqna/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 292eb933d..2e1de44c0 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -85,7 +85,7 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- helm -f ./chatqna/vllm-openvino-values.yaml install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true -# If you are behind a proxy, please add `--set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy}` in above installation commands. Please make sure `http_proxy` and `https_proxy` environment variables are set in your current shell environment. +# If you are behind a proxy, please add `--set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy}` in above installation commands. Please make sure `http_proxy` and `https_proxy` environment variables are set in your current shell environment. 
``` ### IMPORTANT NOTE From e382dea5f3ec48f2d8d6ed5fe533b1f39a08a5c3 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 9 Oct 2024 12:38:20 +0530 Subject: [PATCH 21/37] =?UTF-8?q?=F0=9F=93=84=20added=20missing=20openvino?= =?UTF-8?q?=20values=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/vllm-openvino-values.yaml | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 helm-charts/chatqna/vllm-openvino-values.yaml diff --git a/helm-charts/chatqna/vllm-openvino-values.yaml b/helm-charts/chatqna/vllm-openvino-values.yaml new file mode 100644 index 000000000..197abf036 --- /dev/null +++ b/helm-charts/chatqna/vllm-openvino-values.yaml @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: false + +vllm: + enabled: true + openvino_enabled: true + image: + repository: opea/vllm-openvino + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "latest" + + extraCmdArgs: [] + + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + + CUDA_GRAPHS: "0" + VLLM_CPU_KVCACHE_SPACE: 50 + VLLM_OPENVINO_KVCACHE_SPACE: 32 + OMPI_MCA_btl_vader_single_copy_mechanism: none + + ov_command: ["/bin/bash"] From 4065c9e37cfe986f46a958d12368993ec5061570 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 9 Oct 2024 12:39:00 +0530 Subject: [PATCH 22/37] =?UTF-8?q?=F0=9F=94=A5=20removed=20tags=20for=20con?= =?UTF-8?q?ditional=20chart=20selection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/Chart.yaml | 8 -------- helm-charts/chatqna/README.md | 4 ++-- helm-charts/chatqna/values.yaml | 5 +---- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml index ba5e5967f..199669bf7 100644 --- a/helm-charts/chatqna/Chart.yaml +++ b/helm-charts/chatqna/Chart.yaml @@ -19,26 +19,18 @@ dependencies: version: 1.0.0 repository: "file://../common/tgi" condition: tgi.enabled - tags: - - tgi - name: vllm version: 1.0.0 repository: "file://../common/vllm" condition: vllm.enabled - tags: - - vllm - name: llm-uservice version: 1.0.0 repository: "file://../common/llm-uservice" condition: tgi.enabled - tags: - - tgi - name: llm-ctrl-uservice version: 1.0.0 repository: "file://../common/llm-ctrl-uservice" condition: vllm.enabled - tags: - - vllm - name: tei version: 1.0.0 repository: "file://../common/tei" diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 2e1de44c0..66d8adfaf 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -79,11 +79,11 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- ```bash # To use vLLM inference engine on XEON device -helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true +helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tgi.enabled=false --set vllm.enabled=true # To use OpenVINO optimized vLLM inference engine on XEON device -helm -f ./chatqna/vllm-openvino-values.yaml install 
chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tags.tgi=false --set vllm.enabled=true +helm -f ./chatqna/vllm-openvino-values.yaml install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} # If you are behind a proxy, please add `--set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy}` in above installation commands. Please make sure `http_proxy` and `https_proxy` environment variables are set in your current shell environment. ``` diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index 567b0025e..b251706ec 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -51,6 +51,7 @@ horizontalPodAutoscaler: # Override values in specific subcharts tgi: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + enabled: true vllm: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 @@ -59,10 +60,6 @@ vllm: llm-ctrl-uservice: LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 -tags: - tgi: true - vllm: false - # disable guardrails-usvc by default # See guardrails-values.yaml for guardrail related options guardrails-usvc: From 81d269c3721f4c31437cf5371c82afbdf96de114 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Oct 2024 07:15:44 +0000 Subject: [PATCH 23/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/chatqna/README.md | 2 +- helm-charts/chatqna/vllm-openvino-values.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 66d8adfaf..1ad7ea234 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -83,7 +83,7 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- # To use OpenVINO optimized vLLM inference engine on XEON device -helm -f ./chatqna/vllm-openvino-values.yaml install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} +helm -f ./chatqna/vllm-openvino-values.yaml install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} # If you are behind a proxy, please add `--set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy}` in above installation commands. Please make sure `http_proxy` and `https_proxy` environment variables are set in your current shell environment. 
``` diff --git a/helm-charts/chatqna/vllm-openvino-values.yaml b/helm-charts/chatqna/vllm-openvino-values.yaml index 197abf036..653953d3d 100644 --- a/helm-charts/chatqna/vllm-openvino-values.yaml +++ b/helm-charts/chatqna/vllm-openvino-values.yaml @@ -3,7 +3,7 @@ tgi: enabled: false - + vllm: enabled: true openvino_enabled: true @@ -22,4 +22,4 @@ vllm: VLLM_OPENVINO_KVCACHE_SPACE: 32 OMPI_MCA_btl_vader_single_copy_mechanism: none - ov_command: ["/bin/bash"] + ov_command: ["/bin/bash"] From 05a2be2ac05bae8cd55dc52081f62e6344ed4d5d Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 9 Oct 2024 17:15:37 +0530 Subject: [PATCH 24/37] =?UTF-8?q?=F0=9F=93=9D=20formatting=20fixes=20in=20?= =?UTF-8?q?readme=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/README.md | 10 +++++----- helm-charts/common/vllm/README.md | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 1ad7ea234..7e5d90c95 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -10,18 +10,20 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following servi - [reranking-usvc](../common/reranking-usvc/README.md) - [teirerank](../common/teirerank/README.md) -Apart from above mentioned services, some conditional dependencies are mentioned below (out of which, only one needs to be setup): +For LLM inference, two more microservices will be required. We can either use [TGI](https://github.com/huggingface/text-generation-inference) or [vLLM](https://github.com/vllm-project/vllm) as our LLM backend. Depending on that, we will have following microservices as part of dependencies for ChatQnA application. -1. If we want to use TGI as our inference service, following 2 services will be required: +1. For using **TGI** as an inference service, following 2 microservices will be required: - [llm-uservice](../common/llm-uservice/README.md) - [tgi](../common/tgi/README.md) -2. As an alternative to TGI, if we want to use vLLM inference service, following 2 services would be required instead: +2. For using **vLLM** as an inference service, following 2 microservices would be required: - [llm-ctrl-uservice](../common/llm-ctrl-uservice/README.md) - [vllm](../common/vllm/README.md) +>__**Note**__: We shouldn't have both inference engine in our setup. We have to setup either of them. For this, conditional flags are added in the chart dependency. We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. + ## Installing the Chart Please follow the following steps to install the ChatQnA Chart: @@ -84,8 +86,6 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- # To use OpenVINO optimized vLLM inference engine on XEON device helm -f ./chatqna/vllm-openvino-values.yaml install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} - -# If you are behind a proxy, please add `--set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy}` in above installation commands. Please make sure `http_proxy` and `https_proxy` environment variables are set in your current shell environment. 
``` ### IMPORTANT NOTE diff --git a/helm-charts/common/vllm/README.md b/helm-charts/common/vllm/README.md index 637428460..d366667be 100644 --- a/helm-charts/common/vllm/README.md +++ b/helm-charts/common/vllm/README.md @@ -36,17 +36,17 @@ helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MOD - To deploy OpenVINO optimized vLLM on XEON device: -````bash -helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --values vllm/openvino-values.yaml -`` - -By default, the vllm service will downloading the "Intel/neural-chat-7b-v3-3". +```bash +helm -f vllm/openvino-values.yaml install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} +``` -If you already cached the model locally, you can pass it to container like this example: +By default, the vLLM service will download "Intel/neural-chat-7b-v3-3" model. If you already cached the model locally, you can pass it to container like this example: +```bash MODELDIR=/mnt/opea-models MODELNAME="facebook/opt-125m" +``` ## Verify @@ -60,7 +60,7 @@ Open another terminal and run the following command to verify the service if wor curl http://localhost:2080/v1/completions \ -H "Content-Type: application/json" \ -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' -```` +``` ## Values From f0dae3383109e5d6d3cd3c731782f07beae4fedf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Oct 2024 11:52:20 +0000 Subject: [PATCH 25/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/chatqna/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 7e5d90c95..d6c5e21a5 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -10,7 +10,7 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following servi - [reranking-usvc](../common/reranking-usvc/README.md) - [teirerank](../common/teirerank/README.md) -For LLM inference, two more microservices will be required. We can either use [TGI](https://github.com/huggingface/text-generation-inference) or [vLLM](https://github.com/vllm-project/vllm) as our LLM backend. Depending on that, we will have following microservices as part of dependencies for ChatQnA application. +For LLM inference, two more microservices will be required. We can either use [TGI](https://github.com/huggingface/text-generation-inference) or [vLLM](https://github.com/vllm-project/vllm) as our LLM backend. Depending on that, we will have following microservices as part of dependencies for ChatQnA application. 1. For using **TGI** as an inference service, following 2 microservices will be required: @@ -22,7 +22,7 @@ For LLM inference, two more microservices will be required. We can either use [T - [llm-ctrl-uservice](../common/llm-ctrl-uservice/README.md) - [vllm](../common/vllm/README.md) ->__**Note**__: We shouldn't have both inference engine in our setup. We have to setup either of them. For this, conditional flags are added in the chart dependency. 
We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. +> ****Note****: We shouldn't have both inference engine in our setup. We have to setup either of them. For this, conditional flags are added in the chart dependency. We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. ## Installing the Chart From a8b85d799dba631a9782c8bdf66aa965b380585e Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 9 Oct 2024 23:52:43 +0530 Subject: [PATCH 26/37] =?UTF-8?q?=F0=9F=8E=A8=20prettier=20formatting=20fi?= =?UTF-8?q?xes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/README.md | 22 +++++++++++----------- helm-charts/chatqna/README.md | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/helm-charts/README.md b/helm-charts/README.md index ba3181d2e..94e7820ae 100644 --- a/helm-charts/README.md +++ b/helm-charts/README.md @@ -32,16 +32,16 @@ AI application examples you can run directly on Xeon and Gaudi. You can also ref ### Components -Components which are building blocks for AI application. -All components helm charts are put in the ./common directory, and the support list is growing. +Components which are building blocks for AI application. +All components helm charts are put in the ./common directory, and the support list is growing. Refer to [GenAIComps](https://github.com/opea-project/GenAIComps) for details of each component. ## Deploy with helm charts ### From Source Code -These helm charts are designed to be easy to start, which means you can deploy a workload easily without further options. -However, `HUGGINGFACEHUB_API_TOKEN` should be set in most cases for a workload to start up correctly. +These helm charts are designed to be easy to start, which means you can deploy a workload easily without further options. +However, `HUGGINGFACEHUB_API_TOKEN` should be set in most cases for a workload to start up correctly. Examples of deploy a workload: ``` @@ -91,7 +91,7 @@ There are global options(which should be shared across all components of a workl ## Using Persistent Volume -It's common to use Persistent Volume(PV) for model caches(huggingface hub cache) in a production k8s cluster. We support to pass the PersistentVolumeClaim(PVC) to containers, but it's the user's responsibility to create the PVC depending on your k8s cluster's capability. +It's common to use Persistent Volume(PV) for model caches(huggingface hub cache) in a production k8s cluster. We support to pass the PersistentVolumeClaim(PVC) to containers, but it's the user's responsibility to create the PVC depending on your k8s cluster's capability. Here is an setup example using NFS on Ubuntu 22.04. - Export NFS directory from NFS server @@ -154,10 +154,10 @@ helm install tgi common/tgi --set global.modelUsePVC=model-volume ## Using Private Docker Hub -By default, we're using docker images from [official docker hub](https://hub.docker.com/u/opea), with docker image version aligned with OPEA releases. +By default, we're using docker images from [official docker hub](https://hub.docker.com/u/opea), with docker image version aligned with OPEA releases. If you have private hub or would like to use different docker image versions, see the following examples. -To use the latest tag for all images: +To use the latest tag for all images: `find . 
-name '*values.yaml' -type f -exec sed -i 's#tag: ""#tag: latest#g' {} \;` To use local docker registry: @@ -169,8 +169,8 @@ find . -name '*values.yaml' -type f -exec sed -i "s#repository: opea/*#repositor ## Generate manifests from Helm Charts -Some users may want to use kubernetes manifests(yaml files) for workload deployment, we do not maintain manifests itself, and will generate them using `helm template`. -See update_genaiexamples.sh for how the manifests are generated for supported GenAIExamples. -See update_manifests.sh for how the manifests are generated for supported GenAIComps. -Please note that the above scripts have hardcoded settings to reduce user configuration effort. +Some users may want to use kubernetes manifests(yaml files) for workload deployment, we do not maintain manifests itself, and will generate them using `helm template`. +See update_genaiexamples.sh for how the manifests are generated for supported GenAIExamples. +See update_manifests.sh for how the manifests are generated for supported GenAIComps. +Please note that the above scripts have hardcoded settings to reduce user configuration effort. They are not supposed to be directly used by users. diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index d6c5e21a5..df9e2e08a 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -22,7 +22,7 @@ For LLM inference, two more microservices will be required. We can either use [T - [llm-ctrl-uservice](../common/llm-ctrl-uservice/README.md) - [vllm](../common/vllm/README.md) -> ****Note****: We shouldn't have both inference engine in our setup. We have to setup either of them. For this, conditional flags are added in the chart dependency. We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. +> **__Note :__** We shouldn't have both inference engine in our setup. We have to setup either of them. For this, conditional flags are added in the chart dependency. We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. ## Installing the Chart From 294f1a0c3698ba6fc45ffa3373b3a3977f8610fb Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Thu, 10 Oct 2024 00:08:56 +0530 Subject: [PATCH 27/37] =?UTF-8?q?=F0=9F=8E=A8=20prettier=20formatting=20fi?= =?UTF-8?q?xes=20for=20chatqna=20readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index df9e2e08a..887e73142 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -22,7 +22,7 @@ For LLM inference, two more microservices will be required. We can either use [T - [llm-ctrl-uservice](../common/llm-ctrl-uservice/README.md) - [vllm](../common/vllm/README.md) -> **__Note :__** We shouldn't have both inference engine in our setup. We have to setup either of them. For this, conditional flags are added in the chart dependency. We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. +> **Note:** We shouldn't have both inference engine in our setup. We have to setup either of them. For this, conditional flags are added in the chart dependency. 
We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. ## Installing the Chart From 9b12618c22679c233976027263f7a8572b29de01 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Thu, 10 Oct 2024 01:08:10 +0530 Subject: [PATCH 28/37] retrigger CI checks Signed-off-by: Krishna Murti From e8904481cd71ba282d6984d86097bc7db7cb588a Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Thu, 10 Oct 2024 08:31:06 +0530 Subject: [PATCH 29/37] =?UTF-8?q?=F0=9F=93=9D=20minor=20updates=20in=20rea?= =?UTF-8?q?dme=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/README.md | 4 ++-- helm-charts/common/llm-ctrl-uservice/README.md | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index 887e73142..f8b1041bd 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -22,7 +22,7 @@ For LLM inference, two more microservices will be required. We can either use [T - [llm-ctrl-uservice](../common/llm-ctrl-uservice/README.md) - [vllm](../common/vllm/README.md) -> **Note:** We shouldn't have both inference engine in our setup. We have to setup either of them. For this, conditional flags are added in the chart dependency. We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. +> **_NOTE :_** We shouldn't have both inference engine deployed. It is required to only setup either of them. To achieve this, conditional flags are added in the chart dependency. We will be switching off flag corresponding to one service and switching on the other, in order to have a proper setup of all ChatQnA dependencies. ## Installing the Chart @@ -76,7 +76,7 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml ``` -> **_NOTE:_** Default installation will use [TGI (Text Generation Inference)](https://github.com/huggingface/text-generation-inference) as inference engine. To use vLLM as inference engine, please see below. +> **_NOTE :_** Default installation will use [TGI (Text Generation Inference)](https://github.com/huggingface/text-generation-inference) as inference engine. To use vLLM as inference engine, please see below. ```bash # To use vLLM inference engine on XEON device diff --git a/helm-charts/common/llm-ctrl-uservice/README.md b/helm-charts/common/llm-ctrl-uservice/README.md index f7c5edbd8..99ca7a18a 100644 --- a/helm-charts/common/llm-ctrl-uservice/README.md +++ b/helm-charts/common/llm-ctrl-uservice/README.md @@ -1,17 +1,17 @@ # llm-ctrl Microservice -Helm chart for deploying a microservice which facilitates connections and handles responses from OpenVINO vLLM microservice. +Helm chart for deploying LLM controller microservice which facilitates connections and handles responses from OpenVINO vLLM microservice. -`llm-ctrl-uservice` depends on OpenVINO vLLM. You should properly set `vLLM_ENDPOINT` as the HOST URI of vLLM microservice. If not set, it will consider the default value : `http://-vllm-openvino:80` +`llm-ctrl-uservice` depends on vLLM microservice. You should properly set `vLLM_ENDPOINT` as the HOST URI of vLLM microservice. 
If not set, it will consider the default value : `http://-vllm:80` As this service depends on vLLM microservice, we can proceed in either of 2 ways: -- Install both microservices separately one after another. -- Install the vLLM microservice as dependency for the our main `llm-ctrl-uservice` microservice. +- Install both microservices individually. +- Install the vLLM microservice as dependency for `llm-ctrl-uservice` microservice. -## (Option 1): Installing the chart separately: +## (Option 1): Installing the charts individually: -First, you need to install the `vllm-openvino` chart, please refer to the [vllm](../vllm) chart for more information. +First, you need to install the `vllm` chart, please refer to the [vllm](../vllm) chart for more information. After you've deployed the `vllm` chart successfully, please run `kubectl get svc` to get the vLLM service name with port. We need to provide this to `llm-ctrl-uservice` as a value for vLLM_ENDPOINT for letting it discover and connect to the vLLM microservice. From ef59964730d70195d3a0b26143009d281a7b4d34 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Thu, 10 Oct 2024 10:33:17 +0530 Subject: [PATCH 30/37] retrigger CI checks Signed-off-by: Krishna Murti From acd9a4734f18b5b31abc57821e8febab119c54ae Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Tue, 29 Oct 2024 08:59:27 +0530 Subject: [PATCH 31/37] =?UTF-8?q?=F0=9F=92=9A=20enabled=20ci=20checks=20fo?= =?UTF-8?q?r=20new=20values=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/ci-vllm-openvino-values.yaml | 1 + helm-charts/common/llm-ctrl-uservice/Chart.yaml | 2 +- helm-charts/common/llm-ctrl-uservice/README.md | 2 +- helm-charts/common/llm-ctrl-uservice/ci-values.yaml | 1 + helm-charts/common/llm-ctrl-uservice/values.yaml | 2 +- helm-charts/common/llm-uservice/Chart.yaml | 2 +- helm-charts/common/llm-uservice/README.md | 2 +- helm-charts/common/llm-uservice/values.yaml | 2 +- helm-charts/common/vllm/ci-openvino-values.yaml | 1 + 9 files changed, 9 insertions(+), 6 deletions(-) create mode 120000 helm-charts/chatqna/ci-vllm-openvino-values.yaml create mode 120000 helm-charts/common/llm-ctrl-uservice/ci-values.yaml create mode 120000 helm-charts/common/vllm/ci-openvino-values.yaml diff --git a/helm-charts/chatqna/ci-vllm-openvino-values.yaml b/helm-charts/chatqna/ci-vllm-openvino-values.yaml new file mode 120000 index 000000000..66f710b24 --- /dev/null +++ b/helm-charts/chatqna/ci-vllm-openvino-values.yaml @@ -0,0 +1 @@ +vllm-openvino-values.yaml \ No newline at end of file diff --git a/helm-charts/common/llm-ctrl-uservice/Chart.yaml b/helm-charts/common/llm-ctrl-uservice/Chart.yaml index 95d7b24d9..bb10012ea 100644 --- a/helm-charts/common/llm-ctrl-uservice/Chart.yaml +++ b/helm-charts/common/llm-ctrl-uservice/Chart.yaml @@ -11,4 +11,4 @@ dependencies: - name: vllm version: 1.0.0 repository: file://../vllm - condition: autodependency.enabled + condition: vllm.enabled diff --git a/helm-charts/common/llm-ctrl-uservice/README.md b/helm-charts/common/llm-ctrl-uservice/README.md index 99ca7a18a..b74190ca3 100644 --- a/helm-charts/common/llm-ctrl-uservice/README.md +++ b/helm-charts/common/llm-ctrl-uservice/README.md @@ -50,7 +50,7 @@ export http_proxy= export https_proxy= helm dependency update -helm install llm-ctrl-uservice . 
--set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set autodependency.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait +helm install llm-ctrl-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set vllm.enabled=true --set global.http_proxy=${http_proxy} --set global.https_proxy=${https_proxy} --wait ``` `--wait` flag in above installation command will make sure that all the dependencies are resolved and all services are deployed. diff --git a/helm-charts/common/llm-ctrl-uservice/ci-values.yaml b/helm-charts/common/llm-ctrl-uservice/ci-values.yaml new file mode 120000 index 000000000..7d1010096 --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/ci-values.yaml @@ -0,0 +1 @@ +values.yaml \ No newline at end of file diff --git a/helm-charts/common/llm-ctrl-uservice/values.yaml b/helm-charts/common/llm-ctrl-uservice/values.yaml index 54d504792..a4cf4e9cf 100644 --- a/helm-charts/common/llm-ctrl-uservice/values.yaml +++ b/helm-charts/common/llm-ctrl-uservice/values.yaml @@ -5,7 +5,7 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. -autodependency: +vllm: enabled: false replicaCount: 1 diff --git a/helm-charts/common/llm-uservice/Chart.yaml b/helm-charts/common/llm-uservice/Chart.yaml index 77a780614..8039c98de 100644 --- a/helm-charts/common/llm-uservice/Chart.yaml +++ b/helm-charts/common/llm-uservice/Chart.yaml @@ -12,4 +12,4 @@ dependencies: - name: tgi version: 1.0.0 repository: file://../tgi - condition: autodependency.enabled + condition: tgi.enabled diff --git a/helm-charts/common/llm-uservice/README.md b/helm-charts/common/llm-uservice/README.md index d9069d34e..0f2337852 100644 --- a/helm-charts/common/llm-uservice/README.md +++ b/helm-charts/common/llm-uservice/README.md @@ -26,7 +26,7 @@ helm install llm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --s cd GenAIInfra/helm-charts/common/llm-uservice export HFTOKEN="insert-your-huggingface-token-here" helm dependency update -helm install llm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set autodependency.enabled=true --wait +helm install llm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set tgi.enabled=true --wait ``` ## Verify diff --git a/helm-charts/common/llm-uservice/values.yaml b/helm-charts/common/llm-uservice/values.yaml index 39b10894d..6a2af0bbb 100644 --- a/helm-charts/common/llm-uservice/values.yaml +++ b/helm-charts/common/llm-uservice/values.yaml @@ -5,7 +5,7 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. 
-autodependency: +tgi: enabled: false replicaCount: 1 diff --git a/helm-charts/common/vllm/ci-openvino-values.yaml b/helm-charts/common/vllm/ci-openvino-values.yaml new file mode 120000 index 000000000..81b2b0484 --- /dev/null +++ b/helm-charts/common/vllm/ci-openvino-values.yaml @@ -0,0 +1 @@ +openvino-values.yaml \ No newline at end of file From cbb8d656704589a669af05dbc096121f947e107b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 29 Oct 2024 04:19:56 +0000 Subject: [PATCH 32/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/chatqna/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index b690f339b..a0636507d 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -126,13 +126,13 @@ Open a browser to access `http://:${port}` to play with the ## Values -| Key | Type | Default | Description | -| ----------------- | ------ | ----------------------------- | -------------------------------------------------------------------------------------- | -| image.repository | string | `"opea/chatqna"` | | -| service.port | string | `"8888"` | | -| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | -| vllm-openvino.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | -| global.monitoring | bop; | false | Enable usage metrics for the service components. See ../monitoring.md before enabling! | +| Key | Type | Default | Description | +| -------------------------- | ------ | ----------------------------- | -------------------------------------------------------------------------------------- | +| image.repository | string | `"opea/chatqna"` | | +| service.port | string | `"8888"` | | +| tgi.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| vllm-openvino.LLM_MODEL_ID | string | `"Intel/neural-chat-7b-v3-3"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.monitoring | bop; | false | Enable usage metrics for the service components. See ../monitoring.md before enabling! 
| ## Troubleshooting From d57086186b1685d827f843315abd2260f226d66b Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 30 Oct 2024 01:33:36 +0530 Subject: [PATCH 33/37] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20vllm=20charts=20mu?= =?UTF-8?q?ltiple=20installation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- helm-charts/chatqna/README.md | 2 +- helm-charts/chatqna/vllm-openvino-values.yaml | 4 ---- helm-charts/common/llm-ctrl-uservice/values.yaml | 5 +---- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md index a0636507d..e329c105f 100644 --- a/helm-charts/chatqna/README.md +++ b/helm-charts/chatqna/README.md @@ -85,7 +85,7 @@ helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} -- # To use OpenVINO optimized vLLM inference engine on XEON device -helm -f ./chatqna/vllm-openvino-values.yaml install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} +helm install -f ./chatqna/vllm-openvino-values.yaml chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-ctrl-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} --set tgi.enabled=false --set vllm.enabled=true ``` ### IMPORTANT NOTE diff --git a/helm-charts/chatqna/vllm-openvino-values.yaml b/helm-charts/chatqna/vllm-openvino-values.yaml index 653953d3d..4097b0ee4 100644 --- a/helm-charts/chatqna/vllm-openvino-values.yaml +++ b/helm-charts/chatqna/vllm-openvino-values.yaml @@ -1,11 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -tgi: - enabled: false - vllm: - enabled: true openvino_enabled: true image: repository: opea/vllm-openvino diff --git a/helm-charts/common/llm-ctrl-uservice/values.yaml b/helm-charts/common/llm-ctrl-uservice/values.yaml index a4cf4e9cf..b4f2521e8 100644 --- a/helm-charts/common/llm-ctrl-uservice/values.yaml +++ b/helm-charts/common/llm-ctrl-uservice/values.yaml @@ -5,15 +5,11 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. -vllm: - enabled: false - replicaCount: 1 vLLM_ENDPOINT: "" HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 - image: repository: opea/llm-vllm pullPolicy: IfNotPresent @@ -92,6 +88,7 @@ LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" # Overriding the Model ID being used by vllm-openvino service.(As llm-vllm microservice depends on vllm-openvino, these 2 values should be same.) 
vllm: + enabled: false LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3" global: From 03b7d266d7c884e6631c0da3ec4fc32eb2455a5c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Oct 2024 05:29:58 +0000 Subject: [PATCH 34/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- helm-charts/chatqna/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml index 4f02b5934..182cfc656 100644 --- a/helm-charts/chatqna/values.yaml +++ b/helm-charts/chatqna/values.yaml @@ -17,7 +17,7 @@ port: 8888 service: type: ClusterIP port: 8888 - + nginx: service: type: NodePort From 9e1cdf1935505cf285421f7cf5d1805b33b7c64b Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 30 Oct 2024 12:50:45 +0530 Subject: [PATCH 35/37] increased helm rollout timeout in ci Signed-off-by: Krishna Murti --- .github/workflows/_helm-e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_helm-e2e.yaml b/.github/workflows/_helm-e2e.yaml index 0062127b0..9deff4af6 100644 --- a/.github/workflows/_helm-e2e.yaml +++ b/.github/workflows/_helm-e2e.yaml @@ -65,7 +65,7 @@ jobs: echo "CHART_NAME=$CHART_NAME" >> $GITHUB_ENV echo "RELEASE_NAME=${CHART_NAME}$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV echo "NAMESPACE=${CHART_NAME}-$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV - echo "ROLLOUT_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV + echo "ROLLOUT_TIMEOUT_SECONDS=1200s" >> $GITHUB_ENV echo "TEST_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV echo "KUBECTL_TIMEOUT_SECONDS=60s" >> $GITHUB_ENV echo "should_cleanup=false" >> $GITHUB_ENV From df8261e0af75f1f31752635b7a780a2a11de510e Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Tue, 5 Nov 2024 02:30:25 +0530 Subject: [PATCH 36/37] =?UTF-8?q?=F0=9F=92=9A=20fixes=20to=20enable=20ci?= =?UTF-8?q?=20for=20openvino-vllm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Krishna Murti --- .../chatqna/ci-vllm-openvino-values.yaml | 26 ++++++++++++++++++- helm-charts/chatqna/ci-vllm-values.yaml | 8 ++++++ .../common/llm-ctrl-uservice/ci-values.yaml | 6 ++++- 3 files changed, 38 insertions(+), 2 deletions(-) mode change 120000 => 100644 helm-charts/chatqna/ci-vllm-openvino-values.yaml create mode 100644 helm-charts/chatqna/ci-vllm-values.yaml mode change 120000 => 100644 helm-charts/common/llm-ctrl-uservice/ci-values.yaml diff --git a/helm-charts/chatqna/ci-vllm-openvino-values.yaml b/helm-charts/chatqna/ci-vllm-openvino-values.yaml deleted file mode 120000 index 66f710b24..000000000 --- a/helm-charts/chatqna/ci-vllm-openvino-values.yaml +++ /dev/null @@ -1 +0,0 @@ -vllm-openvino-values.yaml \ No newline at end of file diff --git a/helm-charts/chatqna/ci-vllm-openvino-values.yaml b/helm-charts/chatqna/ci-vllm-openvino-values.yaml new file mode 100644 index 000000000..653953d3d --- /dev/null +++ b/helm-charts/chatqna/ci-vllm-openvino-values.yaml @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: false + +vllm: + enabled: true + openvino_enabled: true + image: + repository: opea/vllm-openvino + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. 
+ tag: "latest" + + extraCmdArgs: [] + + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + + CUDA_GRAPHS: "0" + VLLM_CPU_KVCACHE_SPACE: 50 + VLLM_OPENVINO_KVCACHE_SPACE: 32 + OMPI_MCA_btl_vader_single_copy_mechanism: none + + ov_command: ["/bin/bash"] diff --git a/helm-charts/chatqna/ci-vllm-values.yaml b/helm-charts/chatqna/ci-vllm-values.yaml new file mode 100644 index 000000000..d16040d28 --- /dev/null +++ b/helm-charts/chatqna/ci-vllm-values.yaml @@ -0,0 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +tgi: + enabled: false + +vllm: + enabled: true diff --git a/helm-charts/common/llm-ctrl-uservice/ci-values.yaml b/helm-charts/common/llm-ctrl-uservice/ci-values.yaml deleted file mode 120000 index 7d1010096..000000000 --- a/helm-charts/common/llm-ctrl-uservice/ci-values.yaml +++ /dev/null @@ -1 +0,0 @@ -values.yaml \ No newline at end of file diff --git a/helm-charts/common/llm-ctrl-uservice/ci-values.yaml b/helm-charts/common/llm-ctrl-uservice/ci-values.yaml new file mode 100644 index 000000000..763f5c3f2 --- /dev/null +++ b/helm-charts/common/llm-ctrl-uservice/ci-values.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +vllm: + enabled: true From ac341acd3a04e0e18d20ae21ab7c50875d91e051 Mon Sep 17 00:00:00 2001 From: Krishna Murti Date: Wed, 6 Nov 2024 13:27:34 +0530 Subject: [PATCH 37/37] triggering CI checks Signed-off-by: Krishna Murti