From bf6abc2b9b1c8b23caee03e3d1791876b48c22cc Mon Sep 17 00:00:00 2001 From: gshaibi <39763067+gshaibi@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:05:32 +0300 Subject: [PATCH] RUN-20245 Support NodePool GPU Topology (#86) --- README.md | 70 ++++++------------- cmd/topology-server/main.go | 8 +-- .../templates/device-plugin/_helpers.tpl | 2 +- .../templates/mig-faker/daemonset.yml | 2 +- .../templates/status-updater/deployment.yaml | 2 +- .../templates/topology-cm.yml | 2 +- deploy/fake-gpu-operator/values.yaml | 21 +++--- internal/common/topology/const.go | 2 +- internal/common/topology/kubernetes.go | 23 +++--- internal/common/topology/types.go | 42 ++++++----- internal/status-updater/app_test.go | 53 ++++++-------- .../controllers/node/controller.go | 35 +++++++--- .../controllers/pod/controller.go | 1 + .../status-updater/handlers/node/handler.go | 17 ++++- .../status-updater/handlers/node/labels.go | 61 ++++++++++++++++ .../handlers/node/topology_cm.go | 21 +++--- 16 files changed, 211 insertions(+), 151 deletions(-) create mode 100644 internal/status-updater/handlers/node/labels.go diff --git a/README.md b/README.md index f29f34b..c5b9561 100644 --- a/README.md +++ b/README.md @@ -2,32 +2,36 @@ The purpose of the _fake GPU Operator_ or GPU Operator Simulator is to simulate the NVIDIA GPU Operator without a GPU. The software has been created by Run:ai in order to save money on actual machines in situations that do not require the GPU itself. This simulator: -* Allows you to take a CPU-only node and externalize it as if it has 1 or more GPUs. -* Simulates all aspects of the NVIDIA GPU Operator including feature discovery, NVIDIA MIG and more. -* Emits metrics to Prometheus simulating actual GPUs +* Allows a CPU-only node to be represented as if it has one or more GPUs. +* Simulates all features of the NVIDIA GPU Operator, including feature discovery and NVIDIA MIG. +* Emits metrics to Prometheus, simulating actual GPU behavior. 
-You can configure the simulator to have any NVIDIA GPU topology, including type and amount of GPU memory. +You can configure the simulator to have any NVIDIA GPU topology, including the type and amount of GPU memory. ## Prerequisites -The real Nvidia GPU Operator should not exist in the Kubernetes cluster +Ensure that the real Nvidia GPU Operator is not present in the Kubernetes cluster. ## Installation -Label the nodes you wish to have fake GPUs on, with the following labels: +Assign the nodes you want to simulate GPUs on to a node pool by labeling them with the `run.ai/simulated-gpu-node-pool` label. For example: -``` -kubectl label node nvidia.com/gpu.deploy.device-plugin=true nvidia.com/gpu.deploy.dcgm-exporter=true --overwrite +```sh +kubectl label node <node-name> run.ai/simulated-gpu-node-pool=default ``` -By default, the operator creates a GPU topology of 2 Tesla K80 GPUs for each node in the cluster. To create a different GPU topology, see the __customization__ section below. +NodePools are used to group nodes that should have the same GPU topology. +These are defined in the `topology.nodePools` section of the Helm `values.yaml` file. +By default, a node pool with 2 Tesla K80 GPUs will be created for all nodes labeled with `run.ai/simulated-gpu-node-pool=default`. +To create a different GPU topology, refer to the __customization__ section below. 
-Install the operator: +To install the operator: -``` + +```sh helm repo add fake-gpu-operator https://fake-gpu-operator.storage.googleapis.com helm repo update helm upgrade -i gpu-operator fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace @@ -35,7 +39,7 @@ helm upgrade -i gpu-operator fake-gpu-operator/fake-gpu-operator --namespace gpu ## Usage -Submit any workload with a request for NVIDIA GPU: +Submit any workload with a request for an NVIDIA GPU: ``` resources: @@ -57,43 +61,11 @@ kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged ## Customization -The base GPU topology is defined using a Kubernetes configmap named `topology`. - -To customize the GPU topology, edit the Kubernetes configmap by running: - -``` -kubectl edit cm topology -n gpu-operator -``` - -The configmap should look like this: - -``` -apiVersion: v1 -data: - topology.yml: | - config: - node-autofill: - gpu-count: 16 - gpu-memory: 11441 - gpu-product: Tesla-K80 - mig-strategy: mixed -``` - -The configmap defines the GPU topology for all nodes. - -* __gpu-count__ - number of GPUs per node. -* __gpu-memory__ - amount of GPU memory per GPU. -* __gpu-product__ - GPU type. For example: `Tesla-K80`, `Tesla-V100`, etc. -* __mig-strategy__ - MIG strategy. Can be `none`, `mixed` or `single`. - -### Node specific customization - -Each node can have a different GPU topology. To customize a specific node, edit the configmap named `-topology` in the `gpu-operator` namespace. - +The GPU topology can be customized by editing the `values.yaml` file on the `topology` section before installing/upgrading the helm chart. -### GPU metrics +## GPU metrics -By default, dcgm exporter will export maximum GPU utilization for every pod that requests GPUs. +By default, the DCGM exporter will report maximum GPU utilization for every pod requesting GPUs. 
-If you want to customize the GPU utilization, add a `run.ai/simulated-gpu-utilization` annotation to the pod with a value that represents the range of the GPU utilization that should be simulated. -For example, add `run.ai/simulated-gpu-utilization: 10-30` annotation to simulate a pod that utilizes the GPU between 10% to 30%. +To customize GPU utilization, add a `run.ai/simulated-gpu-utilization` annotation to the pod with a value representing the desired range of GPU utilization. +For example, add `run.ai/simulated-gpu-utilization: 10-30` to simulate a pod that utilizes between 10% and 30% of the GPU. diff --git a/cmd/topology-server/main.go b/cmd/topology-server/main.go index fc38c37..9aee9f9 100644 --- a/cmd/topology-server/main.go +++ b/cmd/topology-server/main.go @@ -18,23 +18,23 @@ func main() { http.HandleFunc("/topology", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - baseTopology, err := topology.GetBaseTopologyFromCM(kubeclient.ClientSet) + clusterTopology, err := topology.GetClusterTopologyFromCM(kubeclient.ClientSet) if err != nil { w.WriteHeader(http.StatusInternalServerError) _, _ = w.Write([]byte(err.Error())) return } - baseTopologyJSON, err := json.Marshal(baseTopology) + clusterTopologyJSON, err := json.Marshal(clusterTopology) if err != nil { w.WriteHeader(http.StatusInternalServerError) _, _ = w.Write([]byte(err.Error())) return } - log.Printf("Returning cluster topology: %s", baseTopologyJSON) + log.Printf("Returning cluster topology: %s", clusterTopologyJSON) - _, err = w.Write(baseTopologyJSON) + _, err = w.Write(clusterTopologyJSON) if err != nil { panic(err) } diff --git a/deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl b/deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl index b4308aa..8813c7c 100644 --- a/deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl +++ b/deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl @@ -18,7 +18,7 @@ 
matchLabels: {{- define "fake-gpu-operator.device-plugin.common.podTemplate.metadata" }} annotations: - checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} + checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} labels: app: device-plugin component: device-plugin diff --git a/deploy/fake-gpu-operator/templates/mig-faker/daemonset.yml b/deploy/fake-gpu-operator/templates/mig-faker/daemonset.yml index 12525b7..00b15b0 100644 --- a/deploy/fake-gpu-operator/templates/mig-faker/daemonset.yml +++ b/deploy/fake-gpu-operator/templates/mig-faker/daemonset.yml @@ -12,7 +12,7 @@ spec: template: metadata: annotations: - checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} + checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} labels: app: mig-faker component: mig-faker diff --git a/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml b/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml index 1d57429..f45b34b 100644 --- a/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml +++ b/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: name: status-updater annotations: - checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} + checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . 
| sha256sum }} labels: app: status-updater spec: diff --git a/deploy/fake-gpu-operator/templates/topology-cm.yml b/deploy/fake-gpu-operator/templates/topology-cm.yml index 0ab32b9..2d83eba 100644 --- a/deploy/fake-gpu-operator/templates/topology-cm.yml +++ b/deploy/fake-gpu-operator/templates/topology-cm.yml @@ -1,7 +1,7 @@ apiVersion: v1 data: topology.yml: |- -{{ toYaml .Values.initialTopology | indent 4 }} +{{ toYaml .Values.topology | indent 4 }} kind: ConfigMap metadata: name: topology diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml index 019fe96..a740a22 100644 --- a/deploy/fake-gpu-operator/values.yaml +++ b/deploy/fake-gpu-operator/values.yaml @@ -60,11 +60,16 @@ migFaker: repository: gcr.io/run-ai-staging/fake-gpu-operator/mig-faker tag: 0.0.1 -initialTopology: - config: - node-autofill: - gpu-count: 2 - gpu-product: Tesla-K80 - gpu-memory: 11441 - mig-strategy: mixed - nodes: {} \ No newline at end of file +topology: + # nodePools is a map of node pool name to node pool configuration. + # Nodes are assigned to node pools based on the node pool label's value (key is configurable via nodePoolLabelKey). + # + # For example, nodes that have the label "run.ai/simulated-gpu-node-pool: default" + # will be assigned to the "default" node pool. 
+ nodePools: + default: + gpuProduct: Tesla-K80 + gpuCount: 2 + gpuMemory: 11441 + nodePoolLabelKey: run.ai/simulated-gpu-node-pool + migStrategy: mixed \ No newline at end of file diff --git a/internal/common/topology/const.go b/internal/common/topology/const.go index cdc645f..dd7120d 100644 --- a/internal/common/topology/const.go +++ b/internal/common/topology/const.go @@ -1,5 +1,5 @@ package topology const ( - CmTopologyKey = "topology.yml" + cmTopologyKey = "topology.yml" ) diff --git a/internal/common/topology/kubernetes.go b/internal/common/topology/kubernetes.go index 7bd4c8d..de612fa 100644 --- a/internal/common/topology/kubernetes.go +++ b/internal/common/topology/kubernetes.go @@ -48,13 +48,12 @@ func UpdateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTop } func DeleteNodeTopologyCM(kubeclient kubernetes.Interface, nodeName string) error { - err := kubeclient.CoreV1().ConfigMaps( viper.GetString(constants.EnvTopologyCmNamespace)).Delete(context.TODO(), GetNodeTopologyCMName(nodeName), metav1.DeleteOptions{}) return err } -func GetBaseTopologyFromCM(kubeclient kubernetes.Interface) (*BaseTopology, error) { +func GetClusterTopologyFromCM(kubeclient kubernetes.Interface) (*ClusterTopology, error) { topologyCm, err := kubeclient.CoreV1().ConfigMaps( viper.GetString(constants.EnvTopologyCmNamespace)).Get( context.TODO(), viper.GetString(constants.EnvTopologyCmName), metav1.GetOptions{}) @@ -62,7 +61,7 @@ func GetBaseTopologyFromCM(kubeclient kubernetes.Interface) (*BaseTopology, erro return nil, fmt.Errorf("failed to get topology configmap: %v", err) } - cluster, err := FromBaseTopologyCM(topologyCm) + cluster, err := FromClusterTopologyCM(topologyCm) if err != nil { return nil, fmt.Errorf("failed to parse topology configmap: %v", err) } @@ -70,19 +69,19 @@ func GetBaseTopologyFromCM(kubeclient kubernetes.Interface) (*BaseTopology, erro return cluster, nil } -func FromBaseTopologyCM(cm *corev1.ConfigMap) (*BaseTopology, error) { - var 
baseTopology BaseTopology - err := yaml.Unmarshal([]byte(cm.Data[CmTopologyKey]), &baseTopology) +func FromClusterTopologyCM(cm *corev1.ConfigMap) (*ClusterTopology, error) { + var clusterTopology ClusterTopology + err := yaml.Unmarshal([]byte(cm.Data[cmTopologyKey]), &clusterTopology) if err != nil { return nil, err } - return &baseTopology, nil + return &clusterTopology, nil } func FromNodeTopologyCM(cm *corev1.ConfigMap) (*NodeTopology, error) { var nodeTopology NodeTopology - err := yaml.Unmarshal([]byte(cm.Data[CmTopologyKey]), &nodeTopology) + err := yaml.Unmarshal([]byte(cm.Data[cmTopologyKey]), &nodeTopology) if err != nil { return nil, err } @@ -90,7 +89,7 @@ func FromNodeTopologyCM(cm *corev1.ConfigMap) (*NodeTopology, error) { return &nodeTopology, nil } -func ToBaseTopologyCM(baseTopology *BaseTopology) (*corev1.ConfigMap, error) { +func ToClusterTopologyCM(clusterTopology *ClusterTopology) (*corev1.ConfigMap, error) { cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: viper.GetString(constants.EnvTopologyCmName), @@ -99,12 +98,12 @@ func ToBaseTopologyCM(baseTopology *BaseTopology) (*corev1.ConfigMap, error) { Data: make(map[string]string), } - topologyData, err := yaml.Marshal(baseTopology) + topologyData, err := yaml.Marshal(clusterTopology) if err != nil { return nil, err } - cm.Data[CmTopologyKey] = string(topologyData) + cm.Data[cmTopologyKey] = string(topologyData) return cm, nil } @@ -127,7 +126,7 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf return nil, err } - cm.Data[CmTopologyKey] = string(topologyData) + cm.Data[cmTopologyKey] = string(topologyData) return cm, nil } diff --git a/internal/common/topology/types.go b/internal/common/topology/types.go index e70801c..c965172 100644 --- a/internal/common/topology/types.go +++ b/internal/common/topology/types.go @@ -6,28 +6,37 @@ import ( "k8s.io/apimachinery/pkg/types" ) -type BaseTopology struct { - Config Config `json:"config"` +type 
ClusterTopology struct { + NodePools map[string]NodePoolTopology `yaml:"nodePools"` + NodePoolLabelKey string `yaml:"nodePoolLabelKey"` + + MigStrategy string `yaml:"migStrategy"` +} + +type NodePoolTopology struct { + GpuCount int `yaml:"gpuCount"` + GpuMemory int `yaml:"gpuMemory"` + GpuProduct string `yaml:"gpuProduct"` } type NodeTopology struct { - GpuMemory int `yaml:"gpu-memory"` - GpuProduct string `yaml:"gpu-product"` + GpuMemory int `yaml:"gpuMemory"` + GpuProduct string `yaml:"gpuProduct"` Gpus []GpuDetails `yaml:"gpus"` - MigStrategy string `yaml:"mig-strategy"` + MigStrategy string `yaml:"migStrategy"` } type GpuDetails struct { - ID string `json:"id"` - Status GpuStatus `json:"status"` + ID string `yaml:"id"` + Status GpuStatus `yaml:"status"` } type PodGpuUsageStatusMap map[types.UID]GpuUsageStatus type GpuStatus struct { - AllocatedBy ContainerDetails `yaml:"allocated-by"` + AllocatedBy ContainerDetails `yaml:"allocatedBy"` // Maps PodUID to its GPU usage status - PodGpuUsageStatus PodGpuUsageStatusMap `yaml:"pod-gpu-usage-status"` + PodGpuUsageStatus PodGpuUsageStatusMap `yaml:"podGpuUsageStatus"` } type ContainerDetails struct { @@ -38,8 +47,8 @@ type ContainerDetails struct { type GpuUsageStatus struct { Utilization Range `yaml:"utilization"` - FbUsed int `yaml:"fb-used"` - UseKnativeUtilization bool `yaml:"use-knative-utilization"` + FbUsed int `yaml:"fbUsed"` + UseKnativeUtilization bool `yaml:"useKnativeUtilization"` } type Range struct { @@ -47,17 +56,6 @@ type Range struct { Max int `yaml:"max"` } -type Config struct { - NodeAutofill NodeAutofillSettings `yaml:"node-autofill"` -} - -type NodeAutofillSettings struct { - GpuCount int `yaml:"gpu-count"` - GpuMemory int `yaml:"gpu-memory"` - GpuProduct string `yaml:"gpu-product"` - MigStrategy string `yaml:"mig-strategy"` -} - // Errors var ErrNoNodes = fmt.Errorf("no nodes found") var ErrNoNode = fmt.Errorf("node not found") diff --git a/internal/status-updater/app_test.go 
b/internal/status-updater/app_test.go index 8796a9f..3accf0a 100644 --- a/internal/status-updater/app_test.go +++ b/internal/status-updater/app_test.go @@ -48,17 +48,6 @@ const ( nodeGpuCount = 2 ) -var ( - defaultTopologyConfig = topology.Config{ - NodeAutofill: topology.NodeAutofillSettings{ - GpuCount: nodeGpuCount, - GpuMemory: 11441, - GpuProduct: "Tesla-K80", - MigStrategy: "mixed", - }, - } -) - func TestStatusUpdater(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "StatusUpdater Suite") @@ -73,11 +62,19 @@ var _ = Describe("StatusUpdater", func() { ) BeforeEach(func() { - baseTopology := &topology.BaseTopology{ - Config: defaultTopologyConfig, + clusterTopology := &topology.ClusterTopology{ + NodePools: map[string]topology.NodePoolTopology{ + "default": { + GpuMemory: 11441, + GpuProduct: "Tesla-K80", + GpuCount: nodeGpuCount, + }, + }, + NodePoolLabelKey: "run.ai/simulated-gpu-node-pool", + MigStrategy: "mixed", } - topologyStr, err := yaml.Marshal(baseTopology) + topologyStr, err := yaml.Marshal(clusterTopology) Expect(err).ToNot(HaveOccurred()) topologyConfigMap := &v1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -102,8 +99,7 @@ var _ = Describe("StatusUpdater", func() { ObjectMeta: metav1.ObjectMeta{ Name: node, Labels: map[string]string{ - "nvidia.com/gpu.deploy.dcgm-exporter": "true", - "nvidia.com/gpu.deploy.device-plugin": "true", + "run.ai/simulated-gpu-node-pool": "default", }, }, } @@ -389,8 +385,7 @@ var _ = Describe("StatusUpdater", func() { ObjectMeta: metav1.ObjectMeta{ Name: "node1", Labels: map[string]string{ - "nvidia.com/gpu.deploy.device-plugin": "true", - "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "run.ai/simulated-gpu-node-pool": "default", }, }, } @@ -400,18 +395,18 @@ var _ = Describe("StatusUpdater", func() { Eventually(getTopologyNodeFromKube(kubeclient, node.Name)).Should(Not(BeNil())) - baseTopology, err := getTopologyFromKube(kubeclient)() + clusterTopology, err := getTopologyFromKube(kubeclient)() 
Expect(err).ToNot(HaveOccurred()) - Expect(baseTopology).ToNot(BeNil()) + Expect(clusterTopology).ToNot(BeNil()) nodeTopology, err := getTopologyNodeFromKube(kubeclient, node.Name)() Expect(err).ToNot(HaveOccurred()) Expect(nodeTopology).ToNot(BeNil()) - Expect(nodeTopology.GpuMemory).To(Equal(baseTopology.Config.NodeAutofill.GpuMemory)) - Expect(nodeTopology.GpuProduct).To(Equal(baseTopology.Config.NodeAutofill.GpuProduct)) - Expect(nodeTopology.Gpus).To(HaveLen(baseTopology.Config.NodeAutofill.GpuCount)) - Expect(nodeTopology.MigStrategy).To(Equal(baseTopology.Config.NodeAutofill.MigStrategy)) + Expect(nodeTopology.GpuMemory).To(Equal(clusterTopology.NodePools["default"].GpuMemory)) + Expect(nodeTopology.GpuProduct).To(Equal(clusterTopology.NodePools["default"].GpuProduct)) + Expect(nodeTopology.Gpus).To(HaveLen(clusterTopology.NodePools["default"].GpuCount)) + Expect(nodeTopology.MigStrategy).To(Equal(clusterTopology.MigStrategy)) }) }) @@ -429,15 +424,13 @@ var _ = Describe("StatusUpdater", func() { }) }) - // When informed of a node deletion, it should remove the node from the cluster topology When("informed of a node deletion", func() { It("should remove the node from the cluster topology", func() { node := &v1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node1", Labels: map[string]string{ - "nvidia.com/gpu.deploy.device-plugin": "true", - "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "run.ai/simulated-gpu-node-pool": "default", }, }, } @@ -455,9 +448,9 @@ var _ = Describe("StatusUpdater", func() { }) }) -func getTopologyFromKube(kubeclient kubernetes.Interface) func() (*topology.BaseTopology, error) { - return func() (*topology.BaseTopology, error) { - ret, err := topology.GetBaseTopologyFromCM(kubeclient) +func getTopologyFromKube(kubeclient kubernetes.Interface) func() (*topology.ClusterTopology, error) { + return func() (*topology.ClusterTopology, error) { + ret, err := topology.GetClusterTopologyFromCM(kubeclient) return ret, err } } diff --git 
a/internal/status-updater/controllers/node/controller.go b/internal/status-updater/controllers/node/controller.go index 30339c3..fb01470 100644 --- a/internal/status-updater/controllers/node/controller.go +++ b/internal/status-updater/controllers/node/controller.go @@ -18,6 +18,8 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" @@ -28,22 +30,30 @@ type NodeController struct { kubeClient kubernetes.Interface informer cache.SharedIndexInformer handler nodehandler.Interface + + clusterTopology *topology.ClusterTopology } var _ controllers.Interface = &NodeController{} func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *NodeController { + clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient) + if err != nil { + log.Fatalf("Failed to get cluster topology: %v", err) + } + c := &NodeController{ - kubeClient: kubeClient, - informer: informers.NewSharedInformerFactory(kubeClient, 0).Core().V1().Nodes().Informer(), - handler: nodehandler.NewNodeHandler(kubeClient), + kubeClient: kubeClient, + informer: informers.NewSharedInformerFactory(kubeClient, 0).Core().V1().Nodes().Informer(), + handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology), + clusterTopology: clusterTopology, } - _, err := c.informer.AddEventHandler(cache.FilteringResourceEventHandler{ + _, err = c.informer.AddEventHandler(cache.FilteringResourceEventHandler{ FilterFunc: func(obj interface{}) bool { switch node := obj.(type) { case *v1.Node: - return isFakeGpuNode(node) + return c.isFakeGpuNode(node) default: return false } @@ -76,6 +86,7 @@ func (c *NodeController) Run(stopCh <-chan struct{}) { log.Fatalf("Failed to prune topology nodes: %v", err) } + log.Println("Starting node controller") c.informer.Run(stopCh) } @@ -83,8 +94,13 @@ func (c *NodeController) 
Run(stopCh <-chan struct{}) { func (c *NodeController) pruneTopologyConfigMaps() error { log.Print("Pruning topology ConfigMaps...") + gpuNodesLabelReq, err := labels.NewRequirement(c.clusterTopology.NodePoolLabelKey, selection.Exists, nil) + if err != nil { + return fmt.Errorf("failed creating label requirement: %v", err) + } + gpuNodes, err := c.kubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ - LabelSelector: "nvidia.com/gpu.deploy.dcgm-exporter=true,nvidia.com/gpu.deploy.device-plugin=true", + LabelSelector: labels.NewSelector().Add(*gpuNodesLabelReq).String(), }) if err != nil { return fmt.Errorf("failed listing fake gpu nodes: %v", err) @@ -148,10 +164,9 @@ func (c *NodeController) pruneTopologyConfigMap(cm *v1.ConfigMap, isValidNodeTop return nil } -func isFakeGpuNode(node *v1.Node) bool { - return node != nil && - node.Labels["nvidia.com/gpu.deploy.dcgm-exporter"] == "true" && - node.Labels["nvidia.com/gpu.deploy.device-plugin"] == "true" +func (c *NodeController) isFakeGpuNode(node *v1.Node) bool { + _, isNodeAssignedToNodePool := node.Labels[c.clusterTopology.NodePoolLabelKey] + return isNodeAssignedToNodePool } func isPodExist(kubeClient kubernetes.Interface, podName string, namespace string) (bool, error) { diff --git a/internal/status-updater/controllers/pod/controller.go b/internal/status-updater/controllers/pod/controller.go index 47e632b..217443c 100644 --- a/internal/status-updater/controllers/pod/controller.go +++ b/internal/status-updater/controllers/pod/controller.go @@ -68,5 +68,6 @@ func NewPodController(kubeClient kubernetes.Interface, dynamicClient dynamic.Int } func (p *PodController) Run(stopCh <-chan struct{}) { + log.Println("Starting pod controller") p.informer.Run(stopCh) } diff --git a/internal/status-updater/handlers/node/handler.go b/internal/status-updater/handlers/node/handler.go index 4c66501..f404278 100644 --- a/internal/status-updater/handlers/node/handler.go +++ 
b/internal/status-updater/handlers/node/handler.go @@ -17,13 +17,16 @@ type Interface interface { type NodeHandler struct { kubeClient kubernetes.Interface + + clusterTopology *topology.ClusterTopology } var _ Interface = &NodeHandler{} -func NewNodeHandler(kubeClient kubernetes.Interface) *NodeHandler { +func NewNodeHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology) *NodeHandler { return &NodeHandler{ - kubeClient: kubeClient, + kubeClient: kubeClient, + clusterTopology: clusterTopology, } } @@ -40,6 +43,11 @@ func (p *NodeHandler) HandleAdd(node *v1.Node) error { return fmt.Errorf("failed to apply fake node deployments: %w", err) } + err = p.labelNode(node) + if err != nil { + return fmt.Errorf("failed to label node: %w", err) + } + return nil } @@ -56,5 +64,10 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error { return fmt.Errorf("failed to delete fake node deployments: %w", err) } + err = p.unlabelNode(node) + if err != nil { + return fmt.Errorf("failed to unlabel node: %w", err) + } + return nil } diff --git a/internal/status-updater/handlers/node/labels.go b/internal/status-updater/handlers/node/labels.go new file mode 100644 index 0000000..f05315a --- /dev/null +++ b/internal/status-updater/handlers/node/labels.go @@ -0,0 +1,61 @@ +package node + +import ( + "context" + "encoding/json" + "fmt" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +const ( + dcgmExporterLabelKey = "nvidia.com/gpu.deploy.dcgm-exporter" + devicePluginLabelKey = "nvidia.com/gpu.deploy.device-plugin" +) + +// labelNode labels the node with required labels for the fake-gpu-operator to function. 
+func (p *NodeHandler) labelNode(node *v1.Node) error { + err := p.patchNodeLabels(node, map[string]interface{}{ + dcgmExporterLabelKey: "true", + devicePluginLabelKey: "true", + }) + if err != nil { + return fmt.Errorf("failed to label node %s: %w", node.Name, err) + } + + return nil +} + +// unlabelNode removes the labels from the node that were added by the fake-gpu-operator. +func (p *NodeHandler) unlabelNode(node *v1.Node) error { + err := p.patchNodeLabels(node, map[string]interface{}{ + dcgmExporterLabelKey: nil, + devicePluginLabelKey: nil, + }) + if err != nil { + return fmt.Errorf("failed to unlabel node %s: %w", node.Name, err) + } + + return nil +} + +func (p *NodeHandler) patchNodeLabels(node *v1.Node, labels map[string]interface{}) error { + patch := map[string]interface{}{ + "metadata": map[string]interface{}{ + "labels": labels, + }, + } + patchBytes, err := json.Marshal(patch) + if err != nil { + return fmt.Errorf("failed to marshal patch: %w", err) + } + + _, err = p.kubeClient.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("failed to patch node %s labels: %w", node.Name, err) + } + + return nil +} diff --git a/internal/status-updater/handlers/node/topology_cm.go b/internal/status-updater/handlers/node/topology_cm.go index cff8b26..e6a4067 100644 --- a/internal/status-updater/handlers/node/topology_cm.go +++ b/internal/status-updater/handlers/node/topology_cm.go @@ -14,21 +14,24 @@ func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error { return nil } - baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient) - if err != nil { - return fmt.Errorf("failed to get base topology: %w", err) + nodePoolName, ok := node.Labels[p.clusterTopology.NodePoolLabelKey] + if !ok { + return fmt.Errorf("node %s does not have a nodepool label", node.Name) } - nodeAutofillSettings := baseTopology.Config.NodeAutofill + nodePoolTopology, ok := 
p.clusterTopology.NodePools[nodePoolName] + if !ok { + return fmt.Errorf("nodepool %s not found in cluster topology", nodePoolName) + } nodeTopology = &topology.NodeTopology{ - GpuMemory: nodeAutofillSettings.GpuMemory, - GpuProduct: nodeAutofillSettings.GpuProduct, - Gpus: generateGpuDetails(nodeAutofillSettings.GpuCount, node.Name), - MigStrategy: nodeAutofillSettings.MigStrategy, + GpuMemory: nodePoolTopology.GpuMemory, + GpuProduct: nodePoolTopology.GpuProduct, + Gpus: generateGpuDetails(nodePoolTopology.GpuCount, node.Name), + MigStrategy: p.clusterTopology.MigStrategy, } - err = topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name) + err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name) if err != nil { return fmt.Errorf("failed to create node topology: %w", err) }