From b1affab4c75ad108974217a2ce97b534f89e51c7 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Wed, 22 Jan 2025 08:54:42 -0800
Subject: [PATCH 1/3] add model config for DeepSeek R1

---
 charts/kubeai/values.yaml |  3 ++-
 charts/models/values.yaml | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml
index 93f08249..9736a686 100644
--- a/charts/kubeai/values.yaml
+++ b/charts/kubeai/values.yaml
@@ -51,7 +51,8 @@ modelServers:
       # upstream vLLM seems to have broken ROCm support, so we are using a fork from AMD.
       # Source: https://hub.docker.com/r/rocm/vllm-dev
       # Source: https://github.com/ROCm/vllm
-      amd-gpu: substratusai/vllm-rocm:nightly_main_20250117
+      # amd-gpu: substratusai/vllm-rocm:nightly_main_20250117
+      amd-gpu: substratusai/vllm-rocm:nightly_main_20250120
   OLlama:
     images:
       default: "ollama/ollama:latest"
diff --git a/charts/models/values.yaml b/charts/models/values.yaml
index 80630c8b..9e693c84 100644
--- a/charts/models/values.yaml
+++ b/charts/models/values.yaml
@@ -307,6 +307,33 @@ catalog:
       - --disable-log-requests
     resourceProfile: nvidia-gpu-gh200:1
     targetRequests: 200
+  deepseek-r1-mi300x:
+    enabled: false
+    features: [TextGeneration]
+    url: hf://deepseek-ai/DeepSeek-R1
+    engine: VLLM
+    env:
+      HIP_FORCE_DEV_KERNARG: "1"
+      NCCL_MIN_NCHANNELS: "112"
+      TORCH_BLAS_PREFER_HIPBLASLT: "1"
+      VLLM_USE_TRITON_FLASH_ATTN: "0"
+      VLLM_FP8_PADDING: "0"
+    args:
+      - --trust-remote-code
+      # Currently only context lengths <= 32k are supported.
+      # See: https://github.com/ROCm/vllm/issues/375
+      - --max-model-len=32768
+      - --max-num-batched-tokens=32768
+      - --max-num-seqs=1024
+      - --num-scheduler-steps=10
+      - --tensor-parallel-size=8
+      - --gpu-memory-utilization=0.90
+      - --disable-log-requests
+      - --enable-chunked-prefill=false
+      - --max-seq-len-to-capture=16384
+      - --kv-cache-dtype=fp8
+    resourceProfile: amd-gpu-mi300x:8
+    targetRequests: 1024
   nomic-embed-text-cpu:
     enabled: false
     features: ["TextEmbedding"]
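
Note on usage: the new catalog entry ships with enabled: false. A minimal sketch of
turning it on at install time, assuming the published kubeai/models Helm chart and a
release named kubeai-models (both illustrative, not part of this patch):

    # Enable the DeepSeek R1 MI300X catalog entry via an inline values override.
    # The release name "kubeai-models" and stdin-values style are assumptions.
    helm upgrade --install kubeai-models kubeai/models -f - <<EOF
    catalog:
      deepseek-r1-mi300x:
        enabled: true
    EOF

Once enabled, the chart should render a Model resource equivalent to the manifest
added in patch 3/3, with each replica claiming the amd-gpu-mi300x:8 profile (eight
MI300X GPUs, matching --tensor-parallel-size=8).
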
From 7d3d4bbd9ebf8215227ca9984dda2793c8a011d3 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Wed, 22 Jan 2025 08:55:50 -0800
Subject: [PATCH 2/3] remove comment

---
 charts/kubeai/values.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml
index 9736a686..2d1c63c9 100644
--- a/charts/kubeai/values.yaml
+++ b/charts/kubeai/values.yaml
@@ -51,7 +51,6 @@ modelServers:
       # upstream vLLM seems to have broken ROCm support, so we are using a fork from AMD.
       # Source: https://hub.docker.com/r/rocm/vllm-dev
       # Source: https://github.com/ROCm/vllm
-      # amd-gpu: substratusai/vllm-rocm:nightly_main_20250117
       amd-gpu: substratusai/vllm-rocm:nightly_main_20250120
   OLlama:
     images:

From 7550a00f87bbf5a7cc96ce6ad7b225f41b8ee18f Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Wed, 22 Jan 2025 09:28:06 -0800
Subject: [PATCH 3/3] add manifest

---
 manifests/models/deepseek-r1-mi300x.yaml | 29 ++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 manifests/models/deepseek-r1-mi300x.yaml

diff --git a/manifests/models/deepseek-r1-mi300x.yaml b/manifests/models/deepseek-r1-mi300x.yaml
new file mode 100644
index 00000000..5318d723
--- /dev/null
+++ b/manifests/models/deepseek-r1-mi300x.yaml
@@ -0,0 +1,29 @@
+# Source: models/templates/models.yaml
+apiVersion: kubeai.org/v1
+kind: Model
+metadata:
+  name: deepseek-r1-mi300x
+spec:
+  features: [TextGeneration]
+  url: hf://deepseek-ai/DeepSeek-R1
+  engine: VLLM
+  args:
+    - --trust-remote-code
+    - --max-model-len=32768
+    - --max-num-batched-tokens=32768
+    - --max-num-seqs=1024
+    - --num-scheduler-steps=10
+    - --tensor-parallel-size=8
+    - --gpu-memory-utilization=0.90
+    - --disable-log-requests
+    - --enable-chunked-prefill=false
+    - --max-seq-len-to-capture=16384
+    - --kv-cache-dtype=fp8
+  env:
+    HIP_FORCE_DEV_KERNARG: "1"
+    NCCL_MIN_NCHANNELS: "112"
+    TORCH_BLAS_PREFER_HIPBLASLT: "1"
+    VLLM_FP8_PADDING: "0"
+    VLLM_USE_TRITON_FLASH_ATTN: "0"
+  targetRequests: 1024
+  resourceProfile: amd-gpu-mi300x:8
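
Note on verification: a quick smoke test of the applied manifest, assuming KubeAI's
default Service named "kubeai" on port 80 in the current namespace and its
OpenAI-compatible API path; the prompt and max_tokens values are placeholders:

    # Apply the Model and forward the KubeAI service to localhost.
    kubectl apply -f manifests/models/deepseek-r1-mi300x.yaml
    kubectl port-forward svc/kubeai 8000:80 &

    # Send a single completion request to the new model.
    curl http://localhost:8000/openai/v1/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "deepseek-r1-mi300x", "prompt": "Hello", "max_tokens": 32}'

With targetRequests: 1024 (sized to --max-num-seqs=1024), the autoscaler should hold
a single 8-GPU replica until in-flight requests exceed roughly that count.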