RUN-16744 Support KWOK (#70)
gshaibi authored Apr 2, 2024
1 parent 6f72a74 commit 50bcff3
Showing 22 changed files with 778 additions and 479 deletions.
2 changes: 1 addition & 1 deletion cmd/device-plugin/main.go
@@ -41,7 +41,7 @@ func main() {
initNvidiaSmi()
initPreloaders()

devicePlugin := deviceplugin.NewDevicePlugin(topology)
devicePlugin := deviceplugin.NewDevicePlugin(topology, kubeClient)
if err = devicePlugin.Serve(); err != nil {
log.Printf("Failed to serve device plugin: %s\n", err)
os.Exit(1)
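
The changed call above now hands a Kubernetes client to the device plugin. Purely as an illustration (the parameter types, struct fields, and the Topology stand-in below are assumptions, not taken from the repository), the updated constructor could look roughly like this:

```go
// Hypothetical sketch of a constructor that accepts both the node topology and
// a Kubernetes client; the real deviceplugin package may differ.
package deviceplugin

import "k8s.io/client-go/kubernetes"

// Topology is a stand-in for the operator's node topology type.
type Topology struct{}

type DevicePlugin struct {
	topology   *Topology
	kubeClient kubernetes.Interface
}

// NewDevicePlugin wires the injected client into the plugin so it can talk to
// the API server, which matters when nodes are faked (e.g. with KWOK).
func NewDevicePlugin(topology *Topology, kubeClient kubernetes.Interface) *DevicePlugin {
	return &DevicePlugin{topology: topology, kubeClient: kubeClient}
}
```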
4 changes: 4 additions & 0 deletions cmd/status-updater/main.go
@@ -2,10 +2,14 @@ package main

import (
"github.com/run-ai/fake-gpu-operator/internal/common/app"
"github.com/run-ai/fake-gpu-operator/internal/common/config"
status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater"
)

func main() {
requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "FAKE_GPU_OPERATOR_NAMESPACE"}
config.ValidateConfig(requiredEnvVars)

appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{})
appRunner.Run()
}
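
The new ValidateConfig call above fails fast when required settings are absent. A minimal sketch of such a helper, assuming it simply checks environment variables and aborts on missing ones (the actual internal/common/config implementation may behave differently):

```go
// Minimal sketch of an env-var validator; the abort-on-missing strategy is an assumption.
package config

import (
	"fmt"
	"os"
)

// ValidateConfig aborts startup if any required environment variable is unset,
// so misconfiguration surfaces immediately instead of failing later at runtime.
func ValidateConfig(requiredEnvVars []string) {
	var missing []string
	for _, name := range requiredEnvVars {
		if os.Getenv(name) == "" {
			missing = append(missing, name)
		}
	}
	if len(missing) > 0 {
		panic(fmt.Sprintf("missing required environment variables: %v", missing))
	}
}
```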
77 changes: 77 additions & 0 deletions deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl
@@ -0,0 +1,77 @@
{{- define "fake-gpu-operator.device-plugin.common.metadata.labels" -}}
app: device-plugin
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.common.metadata.annotations" -}}
openshift.io/scc: hostmount-anyuid
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.common.metadata.name" -}}
device-plugin
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.common.podSelector" }}
matchLabels:
app: device-plugin
component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.common.podTemplate.metadata" }}
annotations:
checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: device-plugin
component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.common.podTemplate.spec" }}
containers:
- image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
resources:
{{- toYaml .Values.devicePlugin.resources | nindent 12 }}
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
name: nvidia-device-plugin-ctr
securityContext:
privileged: true
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /runai/bin
name: runai-bin-directory
- mountPath: /runai/shared
name: runai-shared-directory
- mountPath: /var/lib/kubelet/device-plugins
name: device-plugin
dnsPolicy: ClusterFirst
restartPolicy: Always
serviceAccountName: nvidia-device-plugin
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
- hostPath:
path: /var/lib/kubelet/device-plugins
type: ""
name: device-plugin
- hostPath:
path: /var/lib/runai/bin
type: DirectoryOrCreate
name: runai-bin-directory
- hostPath:
path: /var/lib/runai/shared
type: DirectoryOrCreate
name: runai-shared-directory
{{- end }}
69 changes: 5 additions & 64 deletions deploy/fake-gpu-operator/templates/device-plugin/daemonset.yml
@@ -1,75 +1,16 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
{{- if .Values.environment.openshift }}
annotations:
openshift.io/scc: hostmount-anyuid
{{- end }}
name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }}
labels:
app: device-plugin
name: device-plugin
{{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }}
spec:
selector:
matchLabels:
app: device-plugin
component: device-plugin
{{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }}
template:
metadata:
annotations:
checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: device-plugin
component: device-plugin
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }}
spec:
containers:
- image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
resources:
{{- toYaml .Values.devicePlugin.resources | nindent 12 }}
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
imagePullPolicy: Always
name: nvidia-device-plugin-ctr
securityContext:
privileged: true
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /runai/bin
name: runai-bin-directory
- mountPath: /runai/shared
name: runai-shared-directory
- mountPath: /var/lib/kubelet/device-plugins
name: device-plugin
dnsPolicy: ClusterFirst
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }}
nodeSelector:
nvidia.com/gpu.deploy.device-plugin: "true"
restartPolicy: Always
serviceAccountName: nvidia-device-plugin
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
- hostPath:
path: /var/lib/kubelet/device-plugins
type: ""
name: device-plugin
- hostPath:
path: /var/lib/runai/bin
type: DirectoryOrCreate
name: runai-bin-directory
- hostPath:
path: /var/lib/runai/shared
type: DirectoryOrCreate
name: runai-shared-directory
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }}
labels:
{{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }}
run.ai/fake-node-deployment-template: "true"
spec:
replicas: 0
selector:
{{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }}
template:
metadata:
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }}
61 changes: 61 additions & 0 deletions deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
@@ -0,0 +1,61 @@
{{- define "fake-gpu-operator.status-exporter.common.metadata.labels" -}}
app: nvidia-dcgm-exporter
component: status-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.metadata.name" -}}
nvidia-dcgm-exporter
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.podSelector" -}}
matchLabels:
app: nvidia-dcgm-exporter
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.podTemplate.metadata" -}}
labels:
app: nvidia-dcgm-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.podTemplate.spec" -}}
containers:
- image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
resources:
{{- toYaml .Values.statusExporter.resources | nindent 8 }}
name: nvidia-dcgm-exporter
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: TOPOLOGY_MAX_EXPORT_INTERVAL
value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
ports:
- containerPort: 9400
name: http
volumeMounts:
- mountPath: /runai/proc
name: runai-proc-directory
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
serviceAccountName: status-exporter
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
- name: runai-proc-directory
hostPath:
path: /var/lib/runai/proc
type: DirectoryOrCreate
{{- end -}}
62 changes: 6 additions & 56 deletions deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml
@@ -1,66 +1,16 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ include "fake-gpu-operator.status-exporter.common.metadata.name" . }}
labels:
app: nvidia-dcgm-exporter
component: status-exporter
# this label would make the deployment pod to mimic the container-toolkit, on top of mimicking the dcgm-exporter.
app.kubernetes.io/name: nvidia-container-toolkit
name: nvidia-dcgm-exporter

{{- include "fake-gpu-operator.status-exporter.common.metadata.labels" . | nindent 4 }}
spec:
selector:
matchLabels:
app: nvidia-dcgm-exporter
{{- include "fake-gpu-operator.status-exporter.common.podSelector" . | nindent 4 }}
template:
metadata:
creationTimestamp: null
labels:
app: nvidia-dcgm-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
spec:
containers:
- image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
resources:
{{- toYaml .Values.statusExporter.resources | nindent 12 }}
name: nvidia-dcgm-exporter
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: TOPOLOGY_MAX_EXPORT_INTERVAL
value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
ports:
- containerPort: 9400
name: http
volumeMounts:
- mountPath: /runai/proc
name: runai-proc-directory
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
nodeSelector:
nvidia.com/gpu.deploy.dcgm-exporter: "true"
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
serviceAccountName: status-exporter
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
- name: runai-proc-directory
hostPath:
path: /var/lib/runai/proc
type: DirectoryOrCreate
updateStrategy:
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
type: RollingUpdate
nvidia.com/gpu.deploy.dcgm-exporter: "true"
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "fake-gpu-operator.status-exporter.common.metadata.name" . }}
labels:
{{- include "fake-gpu-operator.status-exporter.common.metadata.labels" . | nindent 4 }}
run.ai/fake-node-deployment-template: "true"
spec:
replicas: 0
selector:
{{- include "fake-gpu-operator.status-exporter.common.podSelector" . | nindent 4 }}
template:
metadata:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
@@ -12,6 +12,7 @@ rules:
- get
- list
- watch
- patch
- apiGroups:
- ""
resources:
@@ -29,6 +29,8 @@ spec:
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: FAKE_GPU_OPERATOR_NAMESPACE
value: "{{ .Release.Namespace }}"
restartPolicy: Always
serviceAccountName: status-updater
imagePullSecrets:
16 changes: 16 additions & 0 deletions deploy/fake-gpu-operator/templates/status-updater/role.yaml
@@ -0,0 +1,16 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: fake-status-updater
rules:
- apiGroups:
- apps
resources:
- deployments
verbs:
- update
- list
- get
- watch
- create
- delete
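
The new Role grants the status-updater read/write access to Deployments, which lines up with the replicas-0 templates labeled run.ai/fake-node-deployment-template: "true" added earlier in this commit. As an illustration only (the package name, function name, and the per-node cloning behavior described in the comments are assumptions, not confirmed by the repository), a controller with these permissions could discover the templates like this:

```go
// Hypothetical example (not from the repository): list the fake-node deployment
// templates so a controller could stamp out one copy per fake (KWOK) node.
package example

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

func listFakeNodeDeploymentTemplates(ctx context.Context, client kubernetes.Interface, namespace string) error {
	deployments, err := client.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: "run.ai/fake-node-deployment-template=true",
	})
	if err != nil {
		return err
	}
	for _, d := range deployments.Items {
		// A per-node copy could reuse d.Spec.Template, raise replicas to 1, and
		// pin the pod to the fake node via a nodeSelector or node affinity.
		fmt.Println("found template:", d.Name)
	}
	return nil
}
```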