RUN-16744 Support KWOK (#70)
gshaibi authored Apr 2, 2024
1 parent 6f72a74 commit 50bcff3
Showing 22 changed files with 778 additions and 479 deletions.
2 changes: 1 addition & 1 deletion cmd/device-plugin/main.go
@@ -41,7 +41,7 @@ func main() {
initNvidiaSmi()
initPreloaders()

devicePlugin := deviceplugin.NewDevicePlugin(topology)
devicePlugin := deviceplugin.NewDevicePlugin(topology, kubeClient)
if err = devicePlugin.Serve(); err != nil {
log.Printf("Failed to serve device plugin: %s\n", err)
os.Exit(1)
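
The changed call above now hands a Kubernetes client to the device plugin. Purely as an illustration (the parameter types, struct fields, and the Topology stand-in below are assumptions, not taken from the repository), the updated constructor could look roughly like this:

```go
// Hypothetical sketch of a constructor that accepts both the node topology and
// a Kubernetes client; the real deviceplugin package may differ.
package deviceplugin

import "k8s.io/client-go/kubernetes"

// Topology is a stand-in for the operator's node topology type.
type Topology struct{}

type DevicePlugin struct {
	topology   *Topology
	kubeClient kubernetes.Interface
}

// NewDevicePlugin wires the injected client into the plugin so it can talk to
// the API server, which matters when nodes are faked (e.g. with KWOK).
func NewDevicePlugin(topology *Topology, kubeClient kubernetes.Interface) *DevicePlugin {
	return &DevicePlugin{topology: topology, kubeClient: kubeClient}
}
```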
4 changes: 4 additions & 0 deletions cmd/status-updater/main.go
@@ -2,10 +2,14 @@ package main

import (
"github.com/run-ai/fake-gpu-operator/internal/common/app"
"github.com/run-ai/fake-gpu-operator/internal/common/config"
status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater"
)

func main() {
requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "FAKE_GPU_OPERATOR_NAMESPACE"}
config.ValidateConfig(requiredEnvVars)

appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{})
appRunner.Run()
}
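
The new ValidateConfig call above fails fast when required settings are absent. A minimal sketch of such a helper, assuming it simply checks environment variables and aborts on missing ones (the actual internal/common/config implementation may behave differently):

```go
// Minimal sketch of an env-var validator; the abort-on-missing strategy is an assumption.
package config

import (
	"fmt"
	"os"
)

// ValidateConfig aborts startup if any required environment variable is unset,
// so misconfiguration surfaces immediately instead of failing later at runtime.
func ValidateConfig(requiredEnvVars []string) {
	var missing []string
	for _, name := range requiredEnvVars {
		if os.Getenv(name) == "" {
			missing = append(missing, name)
		}
	}
	if len(missing) > 0 {
		panic(fmt.Sprintf("missing required environment variables: %v", missing))
	}
}
```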
77 changes: 77 additions & 0 deletions deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl
@@ -0,0 +1,77 @@
{{- define "fake-gpu-operator.device-plugin.common.metadata.labels" -}}
app: device-plugin
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.common.metadata.annotations" -}}
openshift.io/scc: hostmount-anyuid
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.common.metadata.name" -}}
device-plugin
{{- end -}}

{{- define "fake-gpu-operator.device-plugin.common.podSelector" }}
matchLabels:
app: device-plugin
component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.common.podTemplate.metadata" }}
annotations:
checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: device-plugin
component: device-plugin
{{- end }}

{{- define "fake-gpu-operator.device-plugin.common.podTemplate.spec" }}
containers:
- image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
resources:
{{- toYaml .Values.devicePlugin.resources | nindent 12 }}
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
name: nvidia-device-plugin-ctr
securityContext:
privileged: true
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /runai/bin
name: runai-bin-directory
- mountPath: /runai/shared
name: runai-shared-directory
- mountPath: /var/lib/kubelet/device-plugins
name: device-plugin
dnsPolicy: ClusterFirst
restartPolicy: Always
serviceAccountName: nvidia-device-plugin
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
- hostPath:
path: /var/lib/kubelet/device-plugins
type: ""
name: device-plugin
- hostPath:
path: /var/lib/runai/bin
type: DirectoryOrCreate
name: runai-bin-directory
- hostPath:
path: /var/lib/runai/shared
type: DirectoryOrCreate
name: runai-shared-directory
{{- end }}
69 changes: 5 additions & 64 deletions deploy/fake-gpu-operator/templates/device-plugin/daemonset.yml
@@ -1,75 +1,16 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
{{- if .Values.environment.openshift }}
annotations:
openshift.io/scc: hostmount-anyuid
{{- end }}
name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }}
labels:
app: device-plugin
name: device-plugin
{{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }}
spec:
selector:
matchLabels:
app: device-plugin
component: device-plugin
{{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }}
template:
metadata:
annotations:
checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
labels:
app: device-plugin
component: device-plugin
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }}
spec:
containers:
- image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
resources:
{{- toYaml .Values.devicePlugin.resources | nindent 12 }}
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
imagePullPolicy: Always
name: nvidia-device-plugin-ctr
securityContext:
privileged: true
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /runai/bin
name: runai-bin-directory
- mountPath: /runai/shared
name: runai-shared-directory
- mountPath: /var/lib/kubelet/device-plugins
name: device-plugin
dnsPolicy: ClusterFirst
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }}
nodeSelector:
nvidia.com/gpu.deploy.device-plugin: "true"
restartPolicy: Always
serviceAccountName: nvidia-device-plugin
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
- hostPath:
path: /var/lib/kubelet/device-plugins
type: ""
name: device-plugin
- hostPath:
path: /var/lib/runai/bin
type: DirectoryOrCreate
name: runai-bin-directory
- hostPath:
path: /var/lib/runai/shared
type: DirectoryOrCreate
name: runai-shared-directory
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }}
labels:
{{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }}
run.ai/fake-node-deployment-template: "true"
spec:
replicas: 0
selector:
{{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }}
template:
metadata:
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }}
61 changes: 61 additions & 0 deletions deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
@@ -0,0 +1,61 @@
{{- define "fake-gpu-operator.status-exporter.common.metadata.labels" -}}
app: nvidia-dcgm-exporter
component: status-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.metadata.name" -}}
nvidia-dcgm-exporter
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.podSelector" -}}
matchLabels:
app: nvidia-dcgm-exporter
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.podTemplate.metadata" -}}
labels:
app: nvidia-dcgm-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- end -}}

{{- define "fake-gpu-operator.status-exporter.common.podTemplate.spec" -}}
containers:
- image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
resources:
{{- toYaml .Values.statusExporter.resources | nindent 8 }}
name: nvidia-dcgm-exporter
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: TOPOLOGY_MAX_EXPORT_INTERVAL
value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
ports:
- containerPort: 9400
name: http
volumeMounts:
- mountPath: /runai/proc
name: runai-proc-directory
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
serviceAccountName: status-exporter
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
- name: runai-proc-directory
hostPath:
path: /var/lib/runai/proc
type: DirectoryOrCreate
{{- end -}}
62 changes: 6 additions & 56 deletions deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml
@@ -1,66 +1,16 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ include "fake-gpu-operator.status-exporter.common.metadata.name" . }}
labels:
app: nvidia-dcgm-exporter
component: status-exporter
# this label would make the deployment pod to mimic the container-toolkit, on top of mimicking the dcgm-exporter.
app.kubernetes.io/name: nvidia-container-toolkit
name: nvidia-dcgm-exporter

{{- include "fake-gpu-operator.status-exporter.common.metadata.labels" . | nindent 4 }}
spec:
selector:
matchLabels:
app: nvidia-dcgm-exporter
{{- include "fake-gpu-operator.status-exporter.common.podSelector" . | nindent 4 }}
template:
metadata:
creationTimestamp: null
labels:
app: nvidia-dcgm-exporter
app.kubernetes.io/name: nvidia-container-toolkit
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
spec:
containers:
- image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
resources:
{{- toYaml .Values.statusExporter.resources | nindent 12 }}
name: nvidia-dcgm-exporter
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: TOPOLOGY_CM_NAME
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: TOPOLOGY_MAX_EXPORT_INTERVAL
value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
ports:
- containerPort: 9400
name: http
volumeMounts:
- mountPath: /runai/proc
name: runai-proc-directory
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
nodeSelector:
nvidia.com/gpu.deploy.dcgm-exporter: "true"
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: status-exporter
serviceAccountName: status-exporter
tolerations:
- effect: NoSchedule
key: nvidia.com/gpu
operator: Exists
imagePullSecrets:
- name: gcr-secret
volumes:
- name: runai-proc-directory
hostPath:
path: /var/lib/runai/proc
type: DirectoryOrCreate
updateStrategy:
rollingUpdate:
maxSurge: 0
maxUnavailable: 1
type: RollingUpdate
nvidia.com/gpu.deploy.dcgm-exporter: "true"
@@ -0,0 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "fake-gpu-operator.status-exporter.common.metadata.name" . }}
labels:
{{- include "fake-gpu-operator.status-exporter.common.metadata.labels" . | nindent 4 }}
run.ai/fake-node-deployment-template: "true"
spec:
replicas: 0
selector:
{{- include "fake-gpu-operator.status-exporter.common.podSelector" . | nindent 4 }}
template:
metadata:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
spec:
{{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
@@ -12,6 +12,7 @@ rules:
- get
- list
- watch
- patch
- apiGroups:
- ""
resources:
@@ -29,6 +29,8 @@ spec:
value: topology
- name: TOPOLOGY_CM_NAMESPACE
value: "{{ .Release.Namespace }}"
- name: FAKE_GPU_OPERATOR_NAMESPACE
value: "{{ .Release.Namespace }}"
restartPolicy: Always
serviceAccountName: status-updater
imagePullSecrets:
16 changes: 16 additions & 0 deletions deploy/fake-gpu-operator/templates/status-updater/role.yaml
@@ -0,0 +1,16 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: fake-status-updater
rules:
- apiGroups:
- apps
resources:
- deployments
verbs:
- update
- list
- get
- watch
- create
- delete
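
The new Role grants the status-updater read/write access to Deployments, which lines up with the replicas-0 templates labeled run.ai/fake-node-deployment-template: "true" added earlier in this commit. As an illustration only (the package name, function name, and the per-node cloning behavior described in the comments are assumptions, not confirmed by the repository), a controller with these permissions could discover the templates like this:

```go
// Hypothetical example (not from the repository): list the fake-node deployment
// templates so a controller could stamp out one copy per fake (KWOK) node.
package example

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

func listFakeNodeDeploymentTemplates(ctx context.Context, client kubernetes.Interface, namespace string) error {
	deployments, err := client.AppsV1().Deployments(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: "run.ai/fake-node-deployment-template=true",
	})
	if err != nil {
		return err
	}
	for _, d := range deployments.Items {
		// A per-node copy could reuse d.Spec.Template, raise replicas to 1, and
		// pin the pod to the fake node via a nodeSelector or node affinity.
		fmt.Println("found template:", d.Name)
	}
	return nil
}
```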