RUN-18413 nvidia-smi on shared GPU pod bugfix (#80)

run-ai · May 9, 2024 · 21b1c50
1 parent 9a17acc
commit 21b1c50
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 13 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -38,6 +38,9 @@ COPY ./cmd/mig-faker/ ./cmd/mig-faker/
 COPY ./internal/ ./internal/
 RUN --mount=type=cache,target=/root/.cache/go-build make build COMPONENT=mig-faker
 
+FROM jupyter/minimal-notebook as jupyter-notebook
+COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/
+
 FROM ubuntu as device-plugin
 COPY --from=device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/device-plugin /bin/
 COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/

diff --git a/Makefile b/Makefile
@@ -30,6 +30,7 @@ images:
 	make image COMPONENT=status-exporter
 	make image COMPONENT=topology-server
 	make image COMPONENT=mig-faker
+	make image COMPONENT=jupyter-notebook
 .PHONY: images
 
 push:
@@ -42,6 +43,7 @@ push-all:
 	make push COMPONENT=status-exporter
 	make push COMPONENT=topology-server
 	make push COMPONENT=mig-faker
+	make push COMPONENT=jupyter-notebook
 .PHONY: push-all
 
 restart: 
@@ -57,6 +59,7 @@ deploy-all:
 	make image push COMPONENT=status-exporter
 	make image push COMPONENT=topology-server
 	make image push COMPONENT=mig-faker
+	make image push COMPONENT=jupyter-notebook
 .PHONY: deploy-all
 
 image-test:

diff --git a/cmd/nvidia-smi/main.go b/cmd/nvidia-smi/main.go
@@ -64,23 +64,20 @@ func getNvidiaSmiArgs() (args nvidiaSmiArgs) {
 	args.GpuTotalMem = int(float64(nodeTopology.GpuMemory) * gpuPortion)
 
 	var gpuIdx int
-	if os.Getenv("NVIDIA_VISIBLE_DEVICES") == "" {
-		// Whole GPU is used
-		podName := os.Getenv("HOSTNAME")
-		// Search gpu for the podName
-		for idx, gpu := range nodeTopology.Gpus {
-			if gpu.Status.AllocatedBy.Pod == podName {
+	currentPodName := os.Getenv("HOSTNAME")
+	currentPodUuid := os.Getenv("POD_UUID")
+	for idx, gpu := range nodeTopology.Gpus {
+		if gpu.Status.AllocatedBy.Pod == currentPodName {
+			gpuIdx = idx
+			break
+		}
+
+		for podUuid := range gpu.Status.PodGpuUsageStatus {
+			if string(podUuid) == currentPodUuid {
 				gpuIdx = idx
 				break
 			}
 		}
-	} else {
-		// Shared GPU is used
-		gpuIdxStr := os.Getenv("NVIDIA_VISIBLE_DEVICES")
-		gpuIdx, err = strconv.Atoi(gpuIdxStr)
-		if err != nil {
-			panic(err)
-		}
 	}
 
 	args.GpuIdx = gpuIdx