Skip to content

Commit

Permalink
RUN-18413 nvidia-smi on shared GPU pod bugfix (#80)
Browse files: browse the repository at this point in the history
  • Loading branch information
gshaibi authored May 9, 2024
1 parent 9a17acc, commit 21b1c50
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 13 deletions.
3 changes: 3 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ COPY ./cmd/mig-faker/ ./cmd/mig-faker/
COPY ./internal/ ./internal/
RUN --mount=type=cache,target=/root/.cache/go-build make build COMPONENT=mig-faker

FROM jupyter/minimal-notebook as jupyter-notebook
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/

FROM ubuntu as device-plugin
COPY --from=device-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/device-plugin /bin/
COPY --from=nvidia-smi-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/nvidia-smi /bin/
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ images:
make image COMPONENT=status-exporter
make image COMPONENT=topology-server
make image COMPONENT=mig-faker
make image COMPONENT=jupyter-notebook
.PHONY: images

push:
Expand All @@ -42,6 +43,7 @@ push-all:
make push COMPONENT=status-exporter
make push COMPONENT=topology-server
make push COMPONENT=mig-faker
make push COMPONENT=jupyter-notebook
.PHONY: push-all

restart:
Expand All @@ -57,6 +59,7 @@ deploy-all:
make image push COMPONENT=status-exporter
make image push COMPONENT=topology-server
make image push COMPONENT=mig-faker
make image push COMPONENT=jupyter-notebook
.PHONY: deploy-all

image-test:
Expand Down
23 changes: 10 additions & 13 deletions cmd/nvidia-smi/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,23 +64,20 @@ func getNvidiaSmiArgs() (args nvidiaSmiArgs) {
args.GpuTotalMem = int(float64(nodeTopology.GpuMemory) * gpuPortion)

var gpuIdx int
if os.Getenv("NVIDIA_VISIBLE_DEVICES") == "" {
// Whole GPU is used
podName := os.Getenv("HOSTNAME")
// Search gpu for the podName
for idx, gpu := range nodeTopology.Gpus {
if gpu.Status.AllocatedBy.Pod == podName {
currentPodName := os.Getenv("HOSTNAME")
currentPodUuid := os.Getenv("POD_UUID")
for idx, gpu := range nodeTopology.Gpus {
if gpu.Status.AllocatedBy.Pod == currentPodName {
gpuIdx = idx
break
}

for podUuid := range gpu.Status.PodGpuUsageStatus {
if string(podUuid) == currentPodUuid {
gpuIdx = idx
break
}
}
} else {
// Shared GPU is used
gpuIdxStr := os.Getenv("NVIDIA_VISIBLE_DEVICES")
gpuIdx, err = strconv.Atoi(gpuIdxStr)
if err != nil {
panic(err)
}
}

args.GpuIdx = gpuIdx
Expand Down

0 comments on commit 21b1c50

Please sign in to comment.