-
Notifications
You must be signed in to change notification settings - Fork 25
157 lines (157 loc) · 5.09 KB
/
daemonset.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Please edit the object below. Lines beginning with a '#' will be ignored,
# and an empty file will abort the edit. If an error occurs while saving this file will be
# reopened with the relevant failures.
#
# DaemonSet that runs one dcgm-exporter pod on every Linux node whose
# instance-type label matches the list below, exposing metrics on port 9400.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  # NOTE(review): this annotation, creationTimestamp, generation,
  # resourceVersion and uid look like values written by the API server (this
  # file is a live-object edit buffer) - confirm before reusing it elsewhere.
  annotations:
    deprecated.daemonset.template.generation: "1"
  creationTimestamp: "2024-05-01T21:01:08Z"
  generation: 1
  labels:
    k8s-app: dcgm-exporter
    version: v1
  name: dcgm-exporter
  namespace: amazon-cloudwatch
  resourceVersion: "1704573"
  uid: ea691d02-d5a7-47bf-a58f-3bd1fd5e4de6
spec:
  revisionHistoryLimit: 10
  # Must match the pod template labels below.
  selector:
    matchLabels:
      k8s-app: dcgm-exporter
  template:
    metadata:
      creationTimestamp: null
      labels:
        k8s-app: dcgm-exporter
        version: v1
    spec:
      affinity:
        nodeAffinity:
          # Hard scheduling requirement: only nodes whose instance-type label
          # is one of these values are eligible. Each value is listed once;
          # order has no effect on an `In` match.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: node.kubernetes.io/instance-type
                operator: In
                values:
                - p2.xlarge
                - p2.8xlarge
                - p2.16xlarge
                - p3.2xlarge
                - p3.8xlarge
                - p3.16xlarge
                - p3dn.24xlarge
                - p4d.24xlarge
                - p4de.24xlarge
                - p5.48xlarge
                - g3s.xlarge
                - g3.4xlarge
                - g3.8xlarge
                - g3.16xlarge
                - g4dn.xlarge
                - g4dn.2xlarge
                - g4dn.4xlarge
                - g4dn.8xlarge
                - g4dn.16xlarge
                - g4dn.12xlarge
                - g4dn.metal
                - g4ad.xlarge
                - g4ad.2xlarge
                - g4ad.4xlarge
                - g4ad.8xlarge
                - g4ad.16xlarge
                - g5.xlarge
                - g5.2xlarge
                - g5.4xlarge
                - g5.8xlarge
                - g5.16xlarge
                - g5.12xlarge
                - g5.24xlarge
                - g5.48xlarge
                - g5g.xlarge
                - g5g.2xlarge
                - g5g.4xlarge
                - g5g.8xlarge
                - g5g.16xlarge
                - g5g.metal
      containers:
      - args:
        # This path sits under the dcgm-config volume mount (/etc/dcgm-exporter/).
        - --web-config-file=/etc/dcgm-exporter/web-config.yaml
        env:
        - name: DCGM_EXPORTER_KUBERNETES
          value: "true"
        # Quoted because the value begins with ':'; same port as containerPort.
        - name: DCGM_EXPORTER_LISTEN
          value: ":9400"
        # Also located under the dcgm-config volume mount.
        - name: DCGM_EXPORTER_COLLECTORS
          value: /etc/dcgm-exporter/dcp-metrics-included.csv
        # Downward API: name of the node this pod is scheduled on.
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        image: 602401143452.dkr.ecr.us-east-1.amazonaws.com/eks/observability/dcgm-exporter:3.3.3-3.3.1-ubuntu22.04
        imagePullPolicy: IfNotPresent
        name: dcgm-exporter
        ports:
        - containerPort: 9400
          name: metrics
          protocol: TCP
        resources:
          limits:
            cpu: 500m
            memory: 250Mi
          requests:
            cpu: 250m
            memory: 128Mi
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        # Host's kubelet pod-resources directory, mounted read-only.
        - mountPath: /var/lib/kubelet/pod-resources
          name: pod-gpu-resources
          readOnly: true
        # Contents of the dcgm-exporter-config-map ConfigMap.
        - mountPath: /etc/dcgm-exporter/
          name: dcgm-config
        # server.crt / server.key projected from the dcgmtls secret volume.
        - mountPath: /etc/amazon-cloudwatch-observability-dcgm-cert
          name: dcgmtls
          readOnly: true
      dnsPolicy: ClusterFirst
      nodeSelector:
        kubernetes.io/os: linux
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      # serviceAccount carries the same value as serviceAccountName.
      serviceAccount: dcgm-exporter-service-acct
      serviceAccountName: dcgm-exporter-service-acct
      terminationGracePeriodSeconds: 30
      volumes:
      - name: dcgmtls
        secret:
          # 420 decimal == 0644 octal.
          defaultMode: 420
          # Rename the secret's keys to the file names exposed in the mount.
          items:
          - key: tls.crt
            path: server.crt
          - key: tls.key
            path: server.key
          secretName: amazon-cloudwatch-observability-agent-cert
      - hostPath:
          path: /var/lib/kubelet/pod-resources
          # Empty type: no check is performed on the host path before mounting.
          type: ""
        name: pod-gpu-resources
      - configMap:
          # 420 decimal == 0644 octal.
          defaultMode: 420
          name: dcgm-exporter-config-map
        name: dcgm-config
  # Roll out with no surge pods and at most one unavailable pod at a time.
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
# NOTE(review): status is reported by the cluster, not desired state - confirm
# whether it should be kept in a stored copy of this manifest.
status:
  currentNumberScheduled: 1