add hpa configuration for triton server helm chart

aws-ia · Jul 14, 2024 · 1191e03 · 1191e03
1 parent a8f575b
commit 1191e03
Show file tree

Hide file tree

Showing 2 changed files with 69 additions and 63 deletions.
diff --git a/helm-charts/nvidia-triton-server/templates/hpa.yaml b/helm-charts/nvidia-triton-server/templates/hpa.yaml
@@ -14,20 +14,12 @@ spec:
   minReplicas: {{ .Values.hpa.minReplicas }}
   maxReplicas: {{ .Values.hpa.maxReplicas }}
   metrics:
-    {{- if .Values.hpa.targetCPUUtilizationPercentage }}
-    - type: Resource
-      resource:
-        name: cpu
-        target:
-          type: Utilization
-          averageUtilization: {{ .Values.hpa.targetCPUUtilizationPercentage }}
-    {{- end }}
-    {{- if .Values.hpa.targetMemoryUtilizationPercentage }}
-    - type: Resource
-      resource:
-        name: memory
-        target:
-          type: Utilization
-          averageUtilization: {{ .Values.hpa.targetMemoryUtilizationPercentage }}
+    {{- with .Values.hpa.metrics }}
+    {{- toYaml . | nindent 4 }}
     {{- end }}
+  {{- /* Render hpa.behavior from values.yaml verbatim (scaleUp and scaleDown); omitted when unset. */}}
+  {{- with .Values.hpa.behavior }}
+  behavior:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
 {{- end }}
diff --git a/helm-charts/nvidia-triton-server/values.yaml b/helm-charts/nvidia-triton-server/values.yaml
@@ -31,35 +31,35 @@ ingress:
   enabled: true
   className: nginx
   annotations: {}
-    # kubernetes.io/ingress.class: nginx
-    # nginx.ingress.kubernetes.io/use-regex: "true"
-    # nginx.ingress.kubernetes.io/rewrite-target: "/$1"
-    # OR
-    # kubernetes.io/ingress.class: alb
-    # alb.ingress.kubernetes.io/scheme: internet-facing
-    # alb.ingress.kubernetes.io/target-type: ip
-    # alb.ingress.kubernetes.io/success-codes: "200-299"
-    # alb.ingress.kubernetes.io/healthcheck-path: "/v1/health/ready"
-    # alb.ingress.kubernetes.io/healthcheck-port: "8080"
+  # kubernetes.io/ingress.class: nginx
+  # nginx.ingress.kubernetes.io/use-regex: "true"
+  # nginx.ingress.kubernetes.io/rewrite-target: "/$1"
+  # OR
+  # kubernetes.io/ingress.class: alb
+  # alb.ingress.kubernetes.io/scheme: internet-facing
+  # alb.ingress.kubernetes.io/target-type: ip
+  # alb.ingress.kubernetes.io/success-codes: "200-299"
+  # alb.ingress.kubernetes.io/healthcheck-path: "/v1/health/ready"
+  # alb.ingress.kubernetes.io/healthcheck-port: "8080"
   hosts:
-    - host: "example.com"
-      paths:
-        - path: /
-          pathType: Prefix
-          service:
-            name:
-            port:
-              number: 8000
-        # - path: /serve/(.*)
-        #   pathType: ImplementationSpecific
-        #   service:
-        #     name:
-        #     port:
-        #       number: 8265
+  - host: "example.com"
+    paths:
+    - path: /
+      pathType: Prefix
+      service:
+        name:
+        port:
+          number: 8000
+    # - path: /serve/(.*)
+    #   pathType: ImplementationSpecific
+    #   service:
+    #     name:
+    #     port:
+    #       number: 8265
   tls: []
-    # - hosts:
-    #     - "example.com"
-    #   secretName: "example-tls"
+  # - hosts:
+  #     - "example.com"
+  #   secretName: "example-tls"
 
 selectorLabels:
   app: triton-inference-server
@@ -70,38 +70,38 @@ podSecurityContext:
   fsGroup: 1000
 
 securityContext: {}
-  # capabilities:
-  #   drop:
-  #   - ALL
-  # readOnlyRootFilesystem: true
-  # runAsNonRoot: true
-  # runAsUser: 1000
+# capabilities:
+#   drop:
+#   - ALL
+# readOnlyRootFilesystem: true
+# runAsNonRoot: true
+# runAsUser: 1000
 
 # Environment variables for Triton containers
 environment:
-  - name: "LD_PRELOAD"
-    value: ""
-  - name: "TRANSFORMERS_CACHE"
-    value: "/home/triton-server/.cache"
-  - name: "shm-size"
-    value: "5g"
-  - name: "NCCL_IGNORE_DISABLED_P2P"
-    value: "1"
+- name: "LD_PRELOAD"
+  value: ""
+- name: "TRANSFORMERS_CACHE"
+  value: "/home/triton-server/.cache"
+- name: "shm-size"
+  value: "5g"
+- name: "NCCL_IGNORE_DISABLED_P2P"
+  value: "1"
   # - name: "model_name"
   #   value: "meta-llama/Llama-2-7b-chat-hf"
 
 # Secret environment variables to authenticate with Hugging Face to load models
 secretEnvironment:
-  - name: "HUGGING_FACE_TOKEN"
-    secretName: "huggingface" # Name of the secret
-    key: "HF_TOKEN"           # Key within the secret
+- name: "HUGGING_FACE_TOKEN"
+  secretName: "huggingface" # Name of the secret
+  key: "HF_TOKEN" # Key within the secret
 
 resources:
-  requests:  # Minimum resource requests for each Triton pod
+  requests: # Minimum resource requests for each Triton pod
     cpu: "100m"
     memory: "512Mi"
     nvidia.com/gpu: 1
-  limits:  # Maximum resource limits
+  limits: # Maximum resource limits
     cpu: "500m"
     memory: "2Gi"
     nvidia.com/gpu: 1
@@ -111,8 +111,22 @@ hpa:
   enabled: true
   minReplicas: 1
   maxReplicas: 5
+  metrics: []
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180 # 3 minutes stabilization window
+      policies:
+      - type: Percent
+        value: 50 # Scale down by 50% at a time
+        periodSeconds: 60 # Check every 60 seconds
+    scaleUp:
+      stabilizationWindowSeconds: 60 # 1 minute stabilization window
+      policies:
+      - type: Percent
+        value: 100 # Scale up by 100% at a time
+        periodSeconds: 15
 
 # Advanced Configuration (If needed)
 nodeSelector: {} # Schedule pods on specific nodes
-tolerations: []  # Allow pods to be scheduled on nodes with 'taints'
-affinity: {}     # Influence pod scheduling based on node or pod labels
+tolerations: [] # Allow pods to be scheduled on nodes with 'taints'
+affinity: {} # Influence pod scheduling based on node or pod labels