Skip to content

Commit

Permalink
add hpa configuration for triton server helm chart
Browse files Browse the repository at this point in the history
  • Loading branch information
ratnopamc committed Jul 14, 2024
1 parent a8f575b commit 1191e03
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 63 deletions.
22 changes: 7 additions & 15 deletions helm-charts/nvidia-triton-server/templates/hpa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,12 @@ spec:
minReplicas: {{ .Values.hpa.minReplicas }}
maxReplicas: {{ .Values.hpa.maxReplicas }}
metrics:
{{- if .Values.hpa.targetCPUUtilizationPercentage }}
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.hpa.targetCPUUtilizationPercentage }}
{{- end }}
{{- if .Values.hpa.targetMemoryUtilizationPercentage }}
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: {{ .Values.hpa.targetMemoryUtilizationPercentage }}
{{- /*
Emit one list item per entry of .Values.hpa.metrics. Each entry is serialized
with toYaml and placed on the lines following the "-" marker (nindent starts
with a newline and indents every line by 10 spaces). An empty list emits no
items.
*/ -}}
{{- range .Values.hpa.metrics }}
- {{ . | toYaml | nindent 10 }}
{{- end }}
{{- /*
Scaling behavior.

Preferred input: the `hpa.behavior` map, rendered verbatim. It is expected to
have the same shape as the autoscaling/v2 `spec.behavior` field
(scaleUp / scaleDown, each with stabilizationWindowSeconds and policies).

Fallback: when no `hpa.behavior` map is supplied (unset or empty), the single
value `hpa.scaleDownStabilizationSecs` is still honoured so that existing
overrides using that key keep working.

When neither value is set, no `behavior` key is emitted.
*/ -}}
{{- if .Values.hpa.behavior }}
  behavior:
    {{- toYaml .Values.hpa.behavior | nindent 4 }}
{{- else if .Values.hpa.scaleDownStabilizationSecs }}
  behavior:
    scaleDown:
      stabilizationWindowSeconds: {{ .Values.hpa.scaleDownStabilizationSecs }}
{{- end }}
{{- end }}
110 changes: 62 additions & 48 deletions helm-charts/nvidia-triton-server/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,35 +31,35 @@ ingress:
enabled: true
className: nginx
annotations: {}
# kubernetes.io/ingress.class: nginx
# nginx.ingress.kubernetes.io/use-regex: "true"
# nginx.ingress.kubernetes.io/rewrite-target: "/$1"
# OR
# kubernetes.io/ingress.class: alb
# alb.ingress.kubernetes.io/scheme: internet-facing
# alb.ingress.kubernetes.io/target-type: ip
# alb.ingress.kubernetes.io/success-codes: "200-299"
# alb.ingress.kubernetes.io/healthcheck-path: "/v1/health/ready"
# alb.ingress.kubernetes.io/healthcheck-port: "8080"
# kubernetes.io/ingress.class: nginx
# nginx.ingress.kubernetes.io/use-regex: "true"
# nginx.ingress.kubernetes.io/rewrite-target: "/$1"
# OR
# kubernetes.io/ingress.class: alb
# alb.ingress.kubernetes.io/scheme: internet-facing
# alb.ingress.kubernetes.io/target-type: ip
# alb.ingress.kubernetes.io/success-codes: "200-299"
# alb.ingress.kubernetes.io/healthcheck-path: "/v1/health/ready"
# alb.ingress.kubernetes.io/healthcheck-port: "8080"
hosts:
- host: "example.com"
paths:
- path: /
pathType: Prefix
service:
name:
port:
number: 8000
# - path: /serve/(.*)
# pathType: ImplementationSpecific
# service:
# name:
# port:
# number: 8265
- host: "example.com"
paths:
- path: /
pathType: Prefix
service:
name:
port:
number: 8000
# - path: /serve/(.*)
# pathType: ImplementationSpecific
# service:
# name:
# port:
# number: 8265
tls: []
# - hosts:
# - "example.com"
# secretName: "example-tls"
# - hosts:
# - "example.com"
# secretName: "example-tls"

selectorLabels:
app: triton-inference-server
Expand All @@ -70,38 +70,38 @@ podSecurityContext:
fsGroup: 1000

securityContext: {}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsUser: 1000

# Environment variables for Triton containers
environment:
- name: "LD_PRELOAD"
value: ""
- name: "TRANSFORMERS_CACHE"
value: "/home/triton-server/.cache"
- name: "shm-size"
value: "5g"
- name: "NCCL_IGNORE_DISABLED_P2P"
value: "1"
- name: "LD_PRELOAD"
value: ""
- name: "TRANSFORMERS_CACHE"
value: "/home/triton-server/.cache"
- name: "shm-size"
value: "5g"
- name: "NCCL_IGNORE_DISABLED_P2P"
value: "1"
# - name: "model_name"
# value: "meta-llama/Llama-2-7b-chat-hf"

# Secret environment variables to authenticate with Hugging Face to load models
secretEnvironment:
- name: "HUGGING_FACE_TOKEN"
secretName: "huggingface" # Name of the secret
key: "HF_TOKEN" # Key within the secret
- name: "HUGGING_FACE_TOKEN"
secretName: "huggingface" # Name of the secret
key: "HF_TOKEN" # Key within the secret

resources:
requests: # Minimum resource requests for each Triton pod
requests: # Minimum resource requests for each Triton pod
cpu: "100m"
memory: "512Mi"
nvidia.com/gpu: 1
limits: # Maximum resource limits
limits: # Maximum resource limits
cpu: "500m"
memory: "2Gi"
nvidia.com/gpu: 1
Expand All @@ -111,8 +111,22 @@ hpa:
# Horizontal Pod Autoscaler settings.
enabled: true
minReplicas: 1
maxReplicas: 5
# Metric specs for the autoscaler. Each list entry is copied as-is into the
# HPA's `metrics` list, so entries use the Kubernetes metric-spec format.
# Empty by default. Example entry:
#   - type: Resource
#     resource:
#       name: cpu
#       target:
#         type: Utilization
#         averageUtilization: 80
metrics: []
# Scale-up / scale-down tuning. Key names follow the autoscaling/v2
# `spec.behavior` schema.
# NOTE(review): confirm templates/hpa.yaml renders `hpa.behavior`; values that
# no template references are silently ignored by Helm.
behavior:
scaleDown:
stabilizationWindowSeconds: 180 # Look back over 3 minutes of recommendations before scaling down
policies:
- type: Percent
value: 50 # Remove at most 50% of the current replicas per period
periodSeconds: 60 # Length of the period the 50% limit applies to (not a polling interval)
scaleUp:
stabilizationWindowSeconds: 60 # Look back over 1 minute of recommendations before scaling up
policies:
- type: Percent
value: 100 # Add at most 100% of the current replicas (i.e. double) per period
periodSeconds: 15 # Length of the period the 100% limit applies to

# Advanced Configuration (If needed)
nodeSelector: {} # Schedule pods on specific nodes
tolerations: [] # Allow pods to be scheduled on nodes with 'taints'
affinity: {} # Influence pod scheduling based on node or pod labels
tolerations: [] # Allow pods to be scheduled on nodes with 'taints'
affinity: {} # Influence pod scheduling based on node or pod labels

0 comments on commit 1191e03

Please sign in to comment.