Make nodeSelector and affinity configurable per component.

The hard-coded scheduling rules of the agent, DCGM exporter, Fluent Bit,
Neuron monitor and operator manager move out of the templates into
per-component defaults in values.yaml. The top-level nodeLabelKey,
fargateLabelKey, gpuInstances and neuronInstances values are removed.

NOTE(review): overrides of the removed top-level keys stop having any effect
in the templates touched here - confirm no other template still reads them
and call the change out in the release notes.

NOTE(review): this patch was reconstructed from a whitespace-collapsed copy.
The indentation of context and removed lines is a best guess, and the
post-image blob ids on the "index" lines are carried over unchanged.
Regenerate with `git diff` (or apply with --ignore-whitespace) before use.

diff --git a/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-custom-resource.yaml b/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-custom-resource.yaml
index 44ed29c..2c43246 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-custom-resource.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/cloudwatch-agent-custom-resource.yaml
@@ -61,19 +61,14 @@ spec:
   image: {{ template "cloudwatch-agent.image" (merge $agent.image (dict "region" $.Values.region)) }}
   mode: {{ $agent.mode }}
   replicas: {{ $agent.replicas }}
-  nodeSelector:
-    kubernetes.io/os: linux
+  {{- with $agent.nodeSelector }}
+  nodeSelector: {{- toYaml . | nindent 4 }}
+  {{- end }}
   serviceAccount: {{ $agent.serviceAccount.name | default (include "cloudwatch-agent.serviceAccountName" $) }}
   priorityClassName: {{ $agent.priorityClassName | default $.Values.agent.priorityClassName }}
-  affinity:
-    nodeAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-        nodeSelectorTerms:
-          - matchExpressions:
-              - key: {{ $.Values.fargateLabelKey }}
-                operator: NotIn
-                values:
-                  - fargate
+  {{- with $agent.affinity }}
+  affinity: {{- toYaml . | nindent 4 }}
+  {{- end }}
   hostNetwork: true
   {{- if $agent.config }}
   config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" $agent.config) $ ) }}
diff --git a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
index 0eefeeb..2f86a75 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml
@@ -8,21 +8,13 @@ metadata:
     version: v1
 spec:
   image: {{ template "dcgm-exporter.image" . }}
-  nodeSelector:
-    kubernetes.io/os: linux
+  {{- with .Values.dcgmExporter.nodeSelector }}
+  nodeSelector: {{- toYaml . | nindent 4 }}
+  {{- end }}
   serviceAccount: {{ template "dcgm-exporter.serviceAccountName" . }}
-  affinity:
-    nodeAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-        nodeSelectorTerms:
-          - matchExpressions:
-              - key: {{ .Values.nodeLabelKey }}
-                operator: In
-                values: {{ .Values.gpuInstances | toYaml | nindent 16 }}
-              - key: {{ .Values.fargateLabelKey }}
-                operator: NotIn
-                values:
-                  - fargate
+  {{- with .Values.dcgmExporter.affinity }}
+  affinity: {{- toYaml . | nindent 4 }}
+  {{- end }}
   {{- with .Values.dcgmExporter.resources }}
   resources: {{- toYaml . | nindent 4}}
   {{- end }}
diff --git a/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml
index 73f6866..65ffefb 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/fluent-bit-daemonset.yaml
@@ -112,17 +112,12 @@ spec:
               - key: ca.crt
                 path: tls-ca.crt
       serviceAccountName: {{ template "cloudwatch-agent.serviceAccountName" . }}
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-              - matchExpressions:
-                  - key: {{ .Values.fargateLabelKey }}
-                    operator: NotIn
-                    values:
-                      - fargate
-      nodeSelector:
-        kubernetes.io/os: linux
+      {{- with .Values.containerLogs.fluentBit.affinity }}
+      affinity: {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.containerLogs.fluentBit.nodeSelector }}
+      nodeSelector: {{- toYaml . | nindent 8 }}
+      {{- end }}
       {{- with .Values.tolerations }}
       tolerations: {{- toYaml . | nindent 6}}
       {{- end }}
diff --git a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml
index e62fe8c..7b77ddf 100644
--- a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml
@@ -9,22 +9,12 @@ metadata:
 spec:
   image: {{ template "neuron-monitor.image" . }}
   serviceAccount: {{ template "neuron-monitor.serviceAccountName" . }}
-  affinity:
-    nodeAffinity:
-      requiredDuringSchedulingIgnoredDuringExecution:
-        nodeSelectorTerms:
-          - matchExpressions:
-              - key: kubernetes.io/os
-                operator: In
-                values:
-                  - linux
-              - key: {{ .Values.nodeLabelKey }}
-                operator: In
-                values: {{ .Values.neuronInstances | toYaml | nindent 20 }}
-              - key: {{ .Values.fargateLabelKey }}
-                operator: NotIn
-                values:
-                  - fargate
+  {{- with .Values.neuronMonitor.nodeSelector }}
+  nodeSelector: {{- toYaml . | nindent 4 }}
+  {{- end }}
+  {{- with .Values.neuronMonitor.affinity }}
+  affinity: {{- toYaml . | nindent 4 }}
+  {{- end }}
   {{- with .Values.neuronMonitor.resources }}
   resources: {{- toYaml . | nindent 4}}
   {{- end }}
diff --git a/charts/amazon-cloudwatch-observability/templates/operator-deployment.yaml b/charts/amazon-cloudwatch-observability/templates/operator-deployment.yaml
index cef3caf..90fbd36 100644
--- a/charts/amazon-cloudwatch-observability/templates/operator-deployment.yaml
+++ b/charts/amazon-cloudwatch-observability/templates/operator-deployment.yaml
@@ -46,6 +46,9 @@ spec:
         - mountPath: /tmp/k8s-webhook-server/serving-certs
           name: cert
           readOnly: true
+      {{- with .Values.manager.affinity }}
+      affinity: {{- toYaml . | nindent 8 }}
+      {{- end }}
       serviceAccountName: {{ template "amazon-cloudwatch-observability.managerServiceAccountName" . }}
       terminationGracePeriodSeconds: 10
       volumes:
@@ -53,5 +56,6 @@ spec:
         secret:
           defaultMode: 420
           secretName: {{ template "amazon-cloudwatch-observability.certificateSecretName" . }}
-      nodeSelector:
-        kubernetes.io/os: linux
+      {{- with .Values.manager.nodeSelector }}
+      nodeSelector: {{- toYaml . | nindent 8 }}
+      {{- end }}
diff --git a/charts/amazon-cloudwatch-observability/values.yaml b/charts/amazon-cloudwatch-observability/values.yaml
index cd26e1d..957c906 100644
--- a/charts/amazon-cloudwatch-observability/values.yaml
+++ b/charts/amazon-cloudwatch-observability/values.yaml
@@ -10,159 +10,6 @@ nameOverride: ""
 clusterName:
 ## Provide the Region (this is a required parameter)
 region:
-nodeLabelKey: node.kubernetes.io/instance-type
-fargateLabelKey: eks.amazonaws.com/compute-type
-## NVIDIA GPU instance types
-gpuInstances:
-  - g3.4xlarge
-  - g3.8xlarge
-  - g3.16xlarge
-  - g3s.xlarge
-  - g4ad.2xlarge
-  - g4ad.4xlarge
-  - g4ad.8xlarge
-  - g4ad.16xlarge
-  - g4ad.xlarge
-  - g4dn.2xlarge
-  - g4dn.4xlarge
-  - g4dn.8xlarge
-  - g4dn.12xlarge
-  - g4dn.16xlarge
-  - g4dn.metal
-  - g4dn.xlarge
-  - g5.2xlarge
-  - g5.4xlarge
-  - g5.8xlarge
-  - g5.12xlarge
-  - g5.16xlarge
-  - g5.24xlarge
-  - g5.48xlarge
-  - g5.xlarge
-  - g5g.2xlarge
-  - g5g.4xlarge
-  - g5g.8xlarge
-  - g5g.16xlarge
-  - g5g.metal
-  - g5g.xlarge
-  - g6.2xlarge
-  - g6.4xlarge
-  - g6.8xlarge
-  - g6.12xlarge
-  - g6.16xlarge
-  - g6.24xlarge
-  - g6.48xlarge
-  - g6.xlarge
-  - g6e.2xlarge
-  - g6e.4xlarge
-  - g6e.8xlarge
-  - g6e.12xlarge
-  - g6e.16xlarge
-  - g6e.24xlarge
-  - g6e.48xlarge
-  - g6e.xlarge
-  - gr6.4xlarge
-  - gr6.8xlarge
-  - p2.8xlarge
-  - p2.16xlarge
-  - p2.xlarge
-  - p3.2xlarge
-  - p3.8xlarge
-  - p3.16xlarge
-  - p3dn.24xlarge
-  - p4d.24xlarge
-  - p4de.24xlarge
-  - p5.48xlarge
-  - p5e.48xlarge
-  - p5en.48xlarge
-  - ml.g3.4xlarge
-  - ml.g3.8xlarge
-  - ml.g3.16xlarge
-  - ml.g3s.xlarge
-  - ml.g4ad.2xlarge
-  - ml.g4ad.4xlarge
-  - ml.g4ad.8xlarge
-  - ml.g4ad.16xlarge
-  - ml.g4ad.xlarge
-  - ml.g4dn.2xlarge
-  - ml.g4dn.4xlarge
-  - ml.g4dn.8xlarge
-  - ml.g4dn.12xlarge
-  - ml.g4dn.16xlarge
-  - ml.g4dn.metal
-  - ml.g4dn.xlarge
-  - ml.g5.2xlarge
-  - ml.g5.4xlarge
-  - ml.g5.8xlarge
-  - ml.g5.12xlarge
-  - ml.g5.16xlarge
-  - ml.g5.24xlarge
-  - ml.g5.48xlarge
-  - ml.g5.xlarge
-  - ml.g5g.2xlarge
-  - ml.g5g.4xlarge
-  - ml.g5g.8xlarge
-  - ml.g5g.16xlarge
-  - ml.g5g.metal
-  - ml.g5g.xlarge
-  - ml.g6.2xlarge
-  - ml.g6.4xlarge
-  - ml.g6.8xlarge
-  - ml.g6.12xlarge
-  - ml.g6.16xlarge
-  - ml.g6.24xlarge
-  - ml.g6.48xlarge
-  - ml.g6.xlarge
-  - ml.g6e.2xlarge
-  - ml.g6e.4xlarge
-  - ml.g6e.8xlarge
-  - ml.g6e.12xlarge
-  - ml.g6e.16xlarge
-  - ml.g6e.24xlarge
-  - ml.g6e.48xlarge
-  - ml.g6e.xlarge
-  - ml.gr6.4xlarge
-  - ml.gr6.8xlarge
-  - ml.p2.8xlarge
-  - ml.p2.16xlarge
-  - ml.p2.xlarge
-  - ml.p3.2xlarge
-  - ml.p3.8xlarge
-  - ml.p3.16xlarge
-  - ml.p3dn.24xlarge
-  - ml.p4d.24xlarge
-  - ml.p4de.24xlarge
-  - ml.p5.48xlarge
-  - ml.p5e.48xlarge
-  - ml.p5en.48xlarge
-## Tranium/Infrentia instance types
-neuronInstances:
-  - trn1.2xlarge
-  - trn1.32xlarge
-  - trn1n.32xlarge
-  - trn2.3xlarge
-  - trn2.48xlarge
-  - trn2a.48xlarge
-  - trn2n.48xlarge
-  - trn2u.48xlarg
-  - inf1.xlarge
-  - inf1.2xlarge
-  - inf1.6xlarge
-  - inf1.24xlarge
-  - inf2.xlarge
-  - inf2.8xlarge
-  - inf2.24xlarge
-  - inf2.48xlarge
-  - ml.trn1.2xlarge
-  - ml.trn1.32xlarge
-  - ml.trn1n.32xlarge
-  - ml.inf1.xlarge
-  - ml.inf1.2xlarge
-  - ml.inf1.6xlarge
-  - ml.inf1.24xlarge
-  - ml.inf2.xlarge
-  - ml.inf2.8xlarge
-  - ml.inf2.24xlarge
-  - ml.inf2.48xlarge
 ## Provide default tolerations
 tolerations:
   - operator: Exists
@@ -1164,6 +1011,19 @@ containerLogs:
               log_stream_name ${HOST_NAME}.windows.system.events
               auto_create_group true
               extra_user_agent container-insights
+    ## Affinity for the Linux Fluent Bit pods; the default excludes Fargate nodes.
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: eks.amazonaws.com/compute-type
+                  operator: NotIn
+                  values:
+                    - fargate
+    ## Node selector for the Linux Fluent Bit pods; the default selects Linux nodes.
+    nodeSelector:
+      kubernetes.io/os: linux
 ## Provide CloudWatchAgent Operator manager container image and resources.
 ##
 manager:
@@ -1275,6 +1135,19 @@ manager:
   podLabels: {}
   service:
     name:
+  ## Affinity for the operator manager pod; the default excludes Fargate nodes.
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: eks.amazonaws.com/compute-type
+                operator: NotIn
+                values:
+                  - fargate
+  ## Node selector for the operator manager pod; the default selects Linux nodes.
+  nodeSelector:
+    kubernetes.io/os: linux
 ## Admission webhooks make sure only requests with correctly formatted rules will get into the Operator.
 admissionWebhooks:
   create: true
@@ -1333,6 +1206,19 @@ agent:
       us-gov-west-1: 743662458514.dkr.ecr.us-gov-west-1.amazonaws.com
   enabled: true
   priorityClassName: system-node-critical
+  ## Affinity for the Linux agent pods; the default excludes Fargate nodes.
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: eks.amazonaws.com/compute-type
+                operator: NotIn
+                values:
+                  - fargate
+  ## Node selector for the Linux agent pods; the default selects Linux nodes.
+  nodeSelector:
+    kubernetes.io/os: linux
   resources:
     requests:
       memory: 128Mi
@@ -1422,6 +1308,143 @@ dcgmExporter:
   kubeletPath: /var/lib/kubelet/pod-resources
   serviceAccount:
     name: # override exporter service account name
+  ## Affinity for the DCGM exporter pods: listed instance types only, never Fargate nodes.
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: node.kubernetes.io/instance-type
+                operator: In
+                values:
+                  ## NVIDIA GPU instance types
+                  - g3.4xlarge
+                  - g3.8xlarge
+                  - g3.16xlarge
+                  - g3s.xlarge
+                  - g4ad.2xlarge
+                  - g4ad.4xlarge
+                  - g4ad.8xlarge
+                  - g4ad.16xlarge
+                  - g4ad.xlarge
+                  - g4dn.2xlarge
+                  - g4dn.4xlarge
+                  - g4dn.8xlarge
+                  - g4dn.12xlarge
+                  - g4dn.16xlarge
+                  - g4dn.metal
+                  - g4dn.xlarge
+                  - g5.2xlarge
+                  - g5.4xlarge
+                  - g5.8xlarge
+                  - g5.12xlarge
+                  - g5.16xlarge
+                  - g5.24xlarge
+                  - g5.48xlarge
+                  - g5.xlarge
+                  - g5g.2xlarge
+                  - g5g.4xlarge
+                  - g5g.8xlarge
+                  - g5g.16xlarge
+                  - g5g.metal
+                  - g5g.xlarge
+                  - g6.2xlarge
+                  - g6.4xlarge
+                  - g6.8xlarge
+                  - g6.12xlarge
+                  - g6.16xlarge
+                  - g6.24xlarge
+                  - g6.48xlarge
+                  - g6.xlarge
+                  - g6e.2xlarge
+                  - g6e.4xlarge
+                  - g6e.8xlarge
+                  - g6e.12xlarge
+                  - g6e.16xlarge
+                  - g6e.24xlarge
+                  - g6e.48xlarge
+                  - g6e.xlarge
+                  - gr6.4xlarge
+                  - gr6.8xlarge
+                  - p2.8xlarge
+                  - p2.16xlarge
+                  - p2.xlarge
+                  - p3.2xlarge
+                  - p3.8xlarge
+                  - p3.16xlarge
+                  - p3dn.24xlarge
+                  - p4d.24xlarge
+                  - p4de.24xlarge
+                  - p5.48xlarge
+                  - p5e.48xlarge
+                  - p5en.48xlarge
+                  - ml.g3.4xlarge
+                  - ml.g3.8xlarge
+                  - ml.g3.16xlarge
+                  - ml.g3s.xlarge
+                  - ml.g4ad.2xlarge
+                  - ml.g4ad.4xlarge
+                  - ml.g4ad.8xlarge
+                  - ml.g4ad.16xlarge
+                  - ml.g4ad.xlarge
+                  - ml.g4dn.2xlarge
+                  - ml.g4dn.4xlarge
+                  - ml.g4dn.8xlarge
+                  - ml.g4dn.12xlarge
+                  - ml.g4dn.16xlarge
+                  - ml.g4dn.metal
+                  - ml.g4dn.xlarge
+                  - ml.g5.2xlarge
+                  - ml.g5.4xlarge
+                  - ml.g5.8xlarge
+                  - ml.g5.12xlarge
+                  - ml.g5.16xlarge
+                  - ml.g5.24xlarge
+                  - ml.g5.48xlarge
+                  - ml.g5.xlarge
+                  - ml.g5g.2xlarge
+                  - ml.g5g.4xlarge
+                  - ml.g5g.8xlarge
+                  - ml.g5g.16xlarge
+                  - ml.g5g.metal
+                  - ml.g5g.xlarge
+                  - ml.g6.2xlarge
+                  - ml.g6.4xlarge
+                  - ml.g6.8xlarge
+                  - ml.g6.12xlarge
+                  - ml.g6.16xlarge
+                  - ml.g6.24xlarge
+                  - ml.g6.48xlarge
+                  - ml.g6.xlarge
+                  - ml.g6e.2xlarge
+                  - ml.g6e.4xlarge
+                  - ml.g6e.8xlarge
+                  - ml.g6e.12xlarge
+                  - ml.g6e.16xlarge
+                  - ml.g6e.24xlarge
+                  - ml.g6e.48xlarge
+                  - ml.g6e.xlarge
+                  - ml.gr6.4xlarge
+                  - ml.gr6.8xlarge
+                  - ml.p2.8xlarge
+                  - ml.p2.16xlarge
+                  - ml.p2.xlarge
+                  - ml.p3.2xlarge
+                  - ml.p3.8xlarge
+                  - ml.p3.16xlarge
+                  - ml.p3dn.24xlarge
+                  - ml.p4d.24xlarge
+                  - ml.p4de.24xlarge
+                  - ml.p5.48xlarge
+                  - ml.p5e.48xlarge
+                  - ml.p5en.48xlarge
+              - key: eks.amazonaws.com/compute-type
+                operator: NotIn
+                values:
+                  - fargate
+  ## Node selector for the DCGM exporter pods; the default selects Linux nodes.
+  nodeSelector:
+    kubernetes.io/os: linux
 neuronMonitor:
   name:
   image:
@@ -1450,3 +1473,47 @@ neuronMonitor:
         - SYS_ADMIN
   serviceAccount:
     name: # override exporter service account name
+  ## Affinity for the Neuron monitor pods: listed instance types only, never Fargate nodes.
+  affinity:
+    nodeAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        nodeSelectorTerms:
+          - matchExpressions:
+              - key: node.kubernetes.io/instance-type
+                operator: In
+                values:
+                  ## Trainium/Inferentia instance types
+                  - trn1.2xlarge
+                  - trn1.32xlarge
+                  - trn1n.32xlarge
+                  - trn2.3xlarge
+                  - trn2.48xlarge
+                  - trn2a.48xlarge
+                  - trn2n.48xlarge
+                  - trn2u.48xlarge
+                  - inf1.xlarge
+                  - inf1.2xlarge
+                  - inf1.6xlarge
+                  - inf1.24xlarge
+                  - inf2.xlarge
+                  - inf2.8xlarge
+                  - inf2.24xlarge
+                  - inf2.48xlarge
+                  - ml.trn1.2xlarge
+                  - ml.trn1.32xlarge
+                  - ml.trn1n.32xlarge
+                  - ml.inf1.xlarge
+                  - ml.inf1.2xlarge
+                  - ml.inf1.6xlarge
+                  - ml.inf1.24xlarge
+                  - ml.inf2.xlarge
+                  - ml.inf2.8xlarge
+                  - ml.inf2.24xlarge
+                  - ml.inf2.48xlarge
+              - key: eks.amazonaws.com/compute-type
+                operator: NotIn
+                values:
+                  - fargate
+  ## Node selector for the Neuron monitor pods; the default selects Linux nodes.
+  nodeSelector:
+    kubernetes.io/os: linux