From bf6abc2b9b1c8b23caee03e3d1791876b48c22cc Mon Sep 17 00:00:00 2001 From: gshaibi <39763067+gshaibi@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:05:32 +0300 Subject: [PATCH] RUN-20245 Support NodePool GPU Topology (#86) --- README.md | 70 ++++++------------- cmd/topology-server/main.go | 8 +-- .../templates/device-plugin/_helpers.tpl | 2 +- .../templates/mig-faker/daemonset.yml | 2 +- .../templates/status-updater/deployment.yaml | 2 +- .../templates/topology-cm.yml | 2 +- deploy/fake-gpu-operator/values.yaml | 21 +++--- internal/common/topology/const.go | 2 +- internal/common/topology/kubernetes.go | 23 +++--- internal/common/topology/types.go | 42 ++++++----- internal/status-updater/app_test.go | 53 ++++++-------- .../controllers/node/controller.go | 35 +++++++--- .../controllers/pod/controller.go | 1 + .../status-updater/handlers/node/handler.go | 17 ++++- .../status-updater/handlers/node/labels.go | 61 ++++++++++++++++ .../handlers/node/topology_cm.go | 21 +++--- 16 files changed, 211 insertions(+), 151 deletions(-) create mode 100644 internal/status-updater/handlers/node/labels.go diff --git a/README.md b/README.md index f29f34b..c5b9561 100644 --- a/README.md +++ b/README.md @@ -2,32 +2,36 @@ The purpose of the _fake GPU Operator_ or GPU Operator Simulator is to simulate the NVIDIA GPU Operator without a GPU. The software has been created by Run:ai in order to save money on actual machines in situations that do not require the GPU itself. This simulator: -* Allows you to take a CPU-only node and externalize it as if it has 1 or more GPUs. -* Simulates all aspects of the NVIDIA GPU Operator including feature discovery, NVIDIA MIG and more. -* Emits metrics to Prometheus simulating actual GPUs +* Allows a CPU-only node to be represented as if it has one or more GPUs. +* Simulates all features of the NVIDIA GPU Operator, including feature discovery and NVIDIA MIG. +* Emits metrics to Prometheus, simulating actual GPU behavior. 
-You can configure the simulator to have any NVIDIA GPU topology, including type and amount of GPU memory. +You can configure the simulator to have any NVIDIA GPU topology, including the type and amount of GPU memory. ## Prerequisites -The real Nvidia GPU Operator should not exist in the Kubernetes cluster +Ensure that the real Nvidia GPU Operator is not present in the Kubernetes cluster. ## Installation -Label the nodes you wish to have fake GPUs on, with the following labels: +Assign the nodes you want to simulate GPUs on to a node pool by labeling them with the `run.ai/simulated-gpu-node-pool` label. For example: -``` -kubectl label node nvidia.com/gpu.deploy.device-plugin=true nvidia.com/gpu.deploy.dcgm-exporter=true --overwrite +```sh +kubectl label node <node-name> run.ai/simulated-gpu-node-pool=default ``` -By default, the operator creates a GPU topology of 2 Tesla K80 GPUs for each node in the cluster. To create a different GPU topology, see the __customization__ section below. +NodePools are used to group nodes that should have the same GPU topology. +These are defined in the `topology.nodePools` section of the Helm `values.yaml` file. +By default, a node pool with 2 Tesla K80 GPUs will be created for all nodes labeled with `run.ai/simulated-gpu-node-pool=default`. +To create a different GPU topology, refer to the __customization__ section below. 
-Install the operator: +To install the operator: -``` + +```sh helm repo add fake-gpu-operator https://fake-gpu-operator.storage.googleapis.com helm repo update helm upgrade -i gpu-operator fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace @@ -35,7 +39,7 @@ helm upgrade -i gpu-operator fake-gpu-operator/fake-gpu-operator --namespace gpu ## Usage -Submit any workload with a request for NVIDIA GPU: +Submit any workload with a request for an NVIDIA GPU: ``` resources: @@ -57,43 +61,11 @@ kubectl label ns gpu-operator pod-security.kubernetes.io/enforce=privileged ## Customization -The base GPU topology is defined using a Kubernetes configmap named `topology`. - -To customize the GPU topology, edit the Kubernetes configmap by running: - -``` -kubectl edit cm topology -n gpu-operator -``` - -The configmap should look like this: - -``` -apiVersion: v1 -data: - topology.yml: | - config: - node-autofill: - gpu-count: 16 - gpu-memory: 11441 - gpu-product: Tesla-K80 - mig-strategy: mixed -``` - -The configmap defines the GPU topology for all nodes. - -* __gpu-count__ - number of GPUs per node. -* __gpu-memory__ - amount of GPU memory per GPU. -* __gpu-product__ - GPU type. For example: `Tesla-K80`, `Tesla-V100`, etc. -* __mig-strategy__ - MIG strategy. Can be `none`, `mixed` or `single`. - -### Node specific customization - -Each node can have a different GPU topology. To customize a specific node, edit the configmap named `-topology` in the `gpu-operator` namespace. - +The GPU topology can be customized by editing the `values.yaml` file on the `topology` section before installing/upgrading the helm chart. -### GPU metrics +## GPU metrics -By default, dcgm exporter will export maximum GPU utilization for every pod that requests GPUs. +By default, the DCGM exporter will report maximum GPU utilization for every pod requesting GPUs. 
-If you want to customize the GPU utilization, add a `run.ai/simulated-gpu-utilization` annotation to the pod with a value that represents the range of the GPU utilization that should be simulated. -For example, add `run.ai/simulated-gpu-utilization: 10-30` annotation to simulate a pod that utilizes the GPU between 10% to 30%. +To customize GPU utilization, add a `run.ai/simulated-gpu-utilization` annotation to the pod with a value representing the desired range of GPU utilization. +For example, add `run.ai/simulated-gpu-utilization: 10-30` to simulate a pod that utilizes between 10% and 30% of the GPU. diff --git a/cmd/topology-server/main.go b/cmd/topology-server/main.go index fc38c37..9aee9f9 100644 --- a/cmd/topology-server/main.go +++ b/cmd/topology-server/main.go @@ -18,23 +18,23 @@ func main() { http.HandleFunc("/topology", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - baseTopology, err := topology.GetBaseTopologyFromCM(kubeclient.ClientSet) + clusterTopology, err := topology.GetClusterTopologyFromCM(kubeclient.ClientSet) if err != nil { w.WriteHeader(http.StatusInternalServerError) _, _ = w.Write([]byte(err.Error())) return } - baseTopologyJSON, err := json.Marshal(baseTopology) + clusterTopologyJSON, err := json.Marshal(clusterTopology) if err != nil { w.WriteHeader(http.StatusInternalServerError) _, _ = w.Write([]byte(err.Error())) return } - log.Printf("Returning cluster topology: %s", baseTopologyJSON) + log.Printf("Returning cluster topology: %s", clusterTopologyJSON) - _, err = w.Write(baseTopologyJSON) + _, err = w.Write(clusterTopologyJSON) if err != nil { panic(err) } diff --git a/deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl b/deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl index b4308aa..8813c7c 100644 --- a/deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl +++ b/deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl @@ -18,7 +18,7 @@ 
matchLabels: {{- define "fake-gpu-operator.device-plugin.common.podTemplate.metadata" }} annotations: - checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} + checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} labels: app: device-plugin component: device-plugin diff --git a/deploy/fake-gpu-operator/templates/mig-faker/daemonset.yml b/deploy/fake-gpu-operator/templates/mig-faker/daemonset.yml index 12525b7..00b15b0 100644 --- a/deploy/fake-gpu-operator/templates/mig-faker/daemonset.yml +++ b/deploy/fake-gpu-operator/templates/mig-faker/daemonset.yml @@ -12,7 +12,7 @@ spec: template: metadata: annotations: - checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} + checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} labels: app: mig-faker component: mig-faker diff --git a/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml b/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml index 1d57429..f45b34b 100644 --- a/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml +++ b/deploy/fake-gpu-operator/templates/status-updater/deployment.yaml @@ -3,7 +3,7 @@ kind: Deployment metadata: name: status-updater annotations: - checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} + checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . 
| sha256sum }} labels: app: status-updater spec: diff --git a/deploy/fake-gpu-operator/templates/topology-cm.yml b/deploy/fake-gpu-operator/templates/topology-cm.yml index 0ab32b9..2d83eba 100644 --- a/deploy/fake-gpu-operator/templates/topology-cm.yml +++ b/deploy/fake-gpu-operator/templates/topology-cm.yml @@ -1,7 +1,7 @@ apiVersion: v1 data: topology.yml: |- -{{ toYaml .Values.initialTopology | indent 4 }} +{{ toYaml .Values.topology | indent 4 }} kind: ConfigMap metadata: name: topology diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml index 019fe96..a740a22 100644 --- a/deploy/fake-gpu-operator/values.yaml +++ b/deploy/fake-gpu-operator/values.yaml @@ -60,11 +60,16 @@ migFaker: repository: gcr.io/run-ai-staging/fake-gpu-operator/mig-faker tag: 0.0.1 -initialTopology: - config: - node-autofill: - gpu-count: 2 - gpu-product: Tesla-K80 - gpu-memory: 11441 - mig-strategy: mixed - nodes: {} \ No newline at end of file +topology: + # nodePools is a map of node pool name to node pool configuration. + # Nodes are assigned to node pools based on the node pool label's value (key is configurable via nodePoolLabelKey). + # + # For example, nodes that have the label "run.ai/simulated-gpu-node-pool: default" + # will be assigned to the "default" node pool. 
+ nodePools: + default: + gpuProduct: Tesla-K80 + gpuCount: 2 + gpuMemory: 11441 + nodePoolLabelKey: run.ai/simulated-gpu-node-pool + migStrategy: mixed \ No newline at end of file diff --git a/internal/common/topology/const.go b/internal/common/topology/const.go index cdc645f..dd7120d 100644 --- a/internal/common/topology/const.go +++ b/internal/common/topology/const.go @@ -1,5 +1,5 @@ package topology const ( - CmTopologyKey = "topology.yml" + cmTopologyKey = "topology.yml" ) diff --git a/internal/common/topology/kubernetes.go b/internal/common/topology/kubernetes.go index 7bd4c8d..de612fa 100644 --- a/internal/common/topology/kubernetes.go +++ b/internal/common/topology/kubernetes.go @@ -48,13 +48,12 @@ func UpdateNodeTopologyCM(kubeclient kubernetes.Interface, nodeTopology *NodeTop } func DeleteNodeTopologyCM(kubeclient kubernetes.Interface, nodeName string) error { - err := kubeclient.CoreV1().ConfigMaps( viper.GetString(constants.EnvTopologyCmNamespace)).Delete(context.TODO(), GetNodeTopologyCMName(nodeName), metav1.DeleteOptions{}) return err } -func GetBaseTopologyFromCM(kubeclient kubernetes.Interface) (*BaseTopology, error) { +func GetClusterTopologyFromCM(kubeclient kubernetes.Interface) (*ClusterTopology, error) { topologyCm, err := kubeclient.CoreV1().ConfigMaps( viper.GetString(constants.EnvTopologyCmNamespace)).Get( context.TODO(), viper.GetString(constants.EnvTopologyCmName), metav1.GetOptions{}) @@ -62,7 +61,7 @@ func GetBaseTopologyFromCM(kubeclient kubernetes.Interface) (*BaseTopology, erro return nil, fmt.Errorf("failed to get topology configmap: %v", err) } - cluster, err := FromBaseTopologyCM(topologyCm) + cluster, err := FromClusterTopologyCM(topologyCm) if err != nil { return nil, fmt.Errorf("failed to parse topology configmap: %v", err) } @@ -70,19 +69,19 @@ func GetBaseTopologyFromCM(kubeclient kubernetes.Interface) (*BaseTopology, erro return cluster, nil } -func FromBaseTopologyCM(cm *corev1.ConfigMap) (*BaseTopology, error) { - var 
baseTopology BaseTopology - err := yaml.Unmarshal([]byte(cm.Data[CmTopologyKey]), &baseTopology) +func FromClusterTopologyCM(cm *corev1.ConfigMap) (*ClusterTopology, error) { + var clusterTopology ClusterTopology + err := yaml.Unmarshal([]byte(cm.Data[cmTopologyKey]), &clusterTopology) if err != nil { return nil, err } - return &baseTopology, nil + return &clusterTopology, nil } func FromNodeTopologyCM(cm *corev1.ConfigMap) (*NodeTopology, error) { var nodeTopology NodeTopology - err := yaml.Unmarshal([]byte(cm.Data[CmTopologyKey]), &nodeTopology) + err := yaml.Unmarshal([]byte(cm.Data[cmTopologyKey]), &nodeTopology) if err != nil { return nil, err } @@ -90,7 +89,7 @@ func FromNodeTopologyCM(cm *corev1.ConfigMap) (*NodeTopology, error) { return &nodeTopology, nil } -func ToBaseTopologyCM(baseTopology *BaseTopology) (*corev1.ConfigMap, error) { +func ToClusterTopologyCM(clusterTopology *ClusterTopology) (*corev1.ConfigMap, error) { cm := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: viper.GetString(constants.EnvTopologyCmName), @@ -99,12 +98,12 @@ func ToBaseTopologyCM(baseTopology *BaseTopology) (*corev1.ConfigMap, error) { Data: make(map[string]string), } - topologyData, err := yaml.Marshal(baseTopology) + topologyData, err := yaml.Marshal(clusterTopology) if err != nil { return nil, err } - cm.Data[CmTopologyKey] = string(topologyData) + cm.Data[cmTopologyKey] = string(topologyData) return cm, nil } @@ -127,7 +126,7 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf return nil, err } - cm.Data[CmTopologyKey] = string(topologyData) + cm.Data[cmTopologyKey] = string(topologyData) return cm, nil } diff --git a/internal/common/topology/types.go b/internal/common/topology/types.go index e70801c..c965172 100644 --- a/internal/common/topology/types.go +++ b/internal/common/topology/types.go @@ -6,28 +6,37 @@ import ( "k8s.io/apimachinery/pkg/types" ) -type BaseTopology struct { - Config Config `json:"config"` +type 
ClusterTopology struct { + NodePools map[string]NodePoolTopology `yaml:"nodePools"` + NodePoolLabelKey string `yaml:"nodePoolLabelKey"` + + MigStrategy string `yaml:"migStrategy"` +} + +type NodePoolTopology struct { + GpuCount int `yaml:"gpuCount"` + GpuMemory int `yaml:"gpuMemory"` + GpuProduct string `yaml:"gpuProduct"` } type NodeTopology struct { - GpuMemory int `yaml:"gpu-memory"` - GpuProduct string `yaml:"gpu-product"` + GpuMemory int `yaml:"gpuMemory"` + GpuProduct string `yaml:"gpuProduct"` Gpus []GpuDetails `yaml:"gpus"` - MigStrategy string `yaml:"mig-strategy"` + MigStrategy string `yaml:"migStrategy"` } type GpuDetails struct { - ID string `json:"id"` - Status GpuStatus `json:"status"` + ID string `yaml:"id"` + Status GpuStatus `yaml:"status"` } type PodGpuUsageStatusMap map[types.UID]GpuUsageStatus type GpuStatus struct { - AllocatedBy ContainerDetails `yaml:"allocated-by"` + AllocatedBy ContainerDetails `yaml:"allocatedBy"` // Maps PodUID to its GPU usage status - PodGpuUsageStatus PodGpuUsageStatusMap `yaml:"pod-gpu-usage-status"` + PodGpuUsageStatus PodGpuUsageStatusMap `yaml:"podGpuUsageStatus"` } type ContainerDetails struct { @@ -38,8 +47,8 @@ type ContainerDetails struct { type GpuUsageStatus struct { Utilization Range `yaml:"utilization"` - FbUsed int `yaml:"fb-used"` - UseKnativeUtilization bool `yaml:"use-knative-utilization"` + FbUsed int `yaml:"fbUsed"` + UseKnativeUtilization bool `yaml:"useKnativeUtilization"` } type Range struct { @@ -47,17 +56,6 @@ type Range struct { Max int `yaml:"max"` } -type Config struct { - NodeAutofill NodeAutofillSettings `yaml:"node-autofill"` -} - -type NodeAutofillSettings struct { - GpuCount int `yaml:"gpu-count"` - GpuMemory int `yaml:"gpu-memory"` - GpuProduct string `yaml:"gpu-product"` - MigStrategy string `yaml:"mig-strategy"` -} - // Errors var ErrNoNodes = fmt.Errorf("no nodes found") var ErrNoNode = fmt.Errorf("node not found") diff --git a/internal/status-updater/app_test.go 
b/internal/status-updater/app_test.go index 8796a9f..3accf0a 100644 --- a/internal/status-updater/app_test.go +++ b/internal/status-updater/app_test.go @@ -48,17 +48,6 @@ const ( nodeGpuCount = 2 ) -var ( - defaultTopologyConfig = topology.Config{ - NodeAutofill: topology.NodeAutofillSettings{ - GpuCount: nodeGpuCount, - GpuMemory: 11441, - GpuProduct: "Tesla-K80", - MigStrategy: "mixed", - }, - } -) - func TestStatusUpdater(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "StatusUpdater Suite") @@ -73,11 +62,19 @@ var _ = Describe("StatusUpdater", func() { ) BeforeEach(func() { - baseTopology := &topology.BaseTopology{ - Config: defaultTopologyConfig, + clusterTopology := &topology.ClusterTopology{ + NodePools: map[string]topology.NodePoolTopology{ + "default": { + GpuMemory: 11441, + GpuProduct: "Tesla-K80", + GpuCount: nodeGpuCount, + }, + }, + NodePoolLabelKey: "run.ai/simulated-gpu-node-pool", + MigStrategy: "mixed", } - topologyStr, err := yaml.Marshal(baseTopology) + topologyStr, err := yaml.Marshal(clusterTopology) Expect(err).ToNot(HaveOccurred()) topologyConfigMap := &v1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -102,8 +99,7 @@ var _ = Describe("StatusUpdater", func() { ObjectMeta: metav1.ObjectMeta{ Name: node, Labels: map[string]string{ - "nvidia.com/gpu.deploy.dcgm-exporter": "true", - "nvidia.com/gpu.deploy.device-plugin": "true", + "run.ai/simulated-gpu-node-pool": "default", }, }, } @@ -389,8 +385,7 @@ var _ = Describe("StatusUpdater", func() { ObjectMeta: metav1.ObjectMeta{ Name: "node1", Labels: map[string]string{ - "nvidia.com/gpu.deploy.device-plugin": "true", - "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "run.ai/simulated-gpu-node-pool": "default", }, }, } @@ -400,18 +395,18 @@ var _ = Describe("StatusUpdater", func() { Eventually(getTopologyNodeFromKube(kubeclient, node.Name)).Should(Not(BeNil())) - baseTopology, err := getTopologyFromKube(kubeclient)() + clusterTopology, err := getTopologyFromKube(kubeclient)() 
Expect(err).ToNot(HaveOccurred()) - Expect(baseTopology).ToNot(BeNil()) + Expect(clusterTopology).ToNot(BeNil()) nodeTopology, err := getTopologyNodeFromKube(kubeclient, node.Name)() Expect(err).ToNot(HaveOccurred()) Expect(nodeTopology).ToNot(BeNil()) - Expect(nodeTopology.GpuMemory).To(Equal(baseTopology.Config.NodeAutofill.GpuMemory)) - Expect(nodeTopology.GpuProduct).To(Equal(baseTopology.Config.NodeAutofill.GpuProduct)) - Expect(nodeTopology.Gpus).To(HaveLen(baseTopology.Config.NodeAutofill.GpuCount)) - Expect(nodeTopology.MigStrategy).To(Equal(baseTopology.Config.NodeAutofill.MigStrategy)) + Expect(nodeTopology.GpuMemory).To(Equal(clusterTopology.NodePools["default"].GpuMemory)) + Expect(nodeTopology.GpuProduct).To(Equal(clusterTopology.NodePools["default"].GpuProduct)) + Expect(nodeTopology.Gpus).To(HaveLen(clusterTopology.NodePools["default"].GpuCount)) + Expect(nodeTopology.MigStrategy).To(Equal(clusterTopology.MigStrategy)) }) }) @@ -429,15 +424,13 @@ var _ = Describe("StatusUpdater", func() { }) }) - // When informed of a node deletion, it should remove the node from the cluster topology When("informed of a node deletion", func() { It("should remove the node from the cluster topology", func() { node := &v1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: "node1", Labels: map[string]string{ - "nvidia.com/gpu.deploy.device-plugin": "true", - "nvidia.com/gpu.deploy.dcgm-exporter": "true", + "run.ai/simulated-gpu-node-pool": "default", }, }, } @@ -455,9 +448,9 @@ var _ = Describe("StatusUpdater", func() { }) }) -func getTopologyFromKube(kubeclient kubernetes.Interface) func() (*topology.BaseTopology, error) { - return func() (*topology.BaseTopology, error) { - ret, err := topology.GetBaseTopologyFromCM(kubeclient) +func getTopologyFromKube(kubeclient kubernetes.Interface) func() (*topology.ClusterTopology, error) { + return func() (*topology.ClusterTopology, error) { + ret, err := topology.GetClusterTopologyFromCM(kubeclient) return ret, err } } diff --git 
a/internal/status-updater/controllers/node/controller.go b/internal/status-updater/controllers/node/controller.go index 30339c3..fb01470 100644 --- a/internal/status-updater/controllers/node/controller.go +++ b/internal/status-updater/controllers/node/controller.go @@ -18,6 +18,8 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" @@ -28,22 +30,30 @@ type NodeController struct { kubeClient kubernetes.Interface informer cache.SharedIndexInformer handler nodehandler.Interface + + clusterTopology *topology.ClusterTopology } var _ controllers.Interface = &NodeController{} func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *NodeController { + clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient) + if err != nil { + log.Fatalf("Failed to get cluster topology: %v", err) + } + c := &NodeController{ - kubeClient: kubeClient, - informer: informers.NewSharedInformerFactory(kubeClient, 0).Core().V1().Nodes().Informer(), - handler: nodehandler.NewNodeHandler(kubeClient), + kubeClient: kubeClient, + informer: informers.NewSharedInformerFactory(kubeClient, 0).Core().V1().Nodes().Informer(), + handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology), + clusterTopology: clusterTopology, } - _, err := c.informer.AddEventHandler(cache.FilteringResourceEventHandler{ + _, err = c.informer.AddEventHandler(cache.FilteringResourceEventHandler{ FilterFunc: func(obj interface{}) bool { switch node := obj.(type) { case *v1.Node: - return isFakeGpuNode(node) + return c.isFakeGpuNode(node) default: return false } @@ -76,6 +86,7 @@ func (c *NodeController) Run(stopCh <-chan struct{}) { log.Fatalf("Failed to prune topology nodes: %v", err) } + log.Println("Starting node controller") c.informer.Run(stopCh) } @@ -83,8 +94,13 @@ func (c *NodeController) 
Run(stopCh <-chan struct{}) { func (c *NodeController) pruneTopologyConfigMaps() error { log.Print("Pruning topology ConfigMaps...") + gpuNodesLabelReq, err := labels.NewRequirement(c.clusterTopology.NodePoolLabelKey, selection.Exists, nil) + if err != nil { + return fmt.Errorf("failed creating label requirement: %v", err) + } + gpuNodes, err := c.kubeClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ - LabelSelector: "nvidia.com/gpu.deploy.dcgm-exporter=true,nvidia.com/gpu.deploy.device-plugin=true", + LabelSelector: labels.NewSelector().Add(*gpuNodesLabelReq).String(), }) if err != nil { return fmt.Errorf("failed listing fake gpu nodes: %v", err) @@ -148,10 +164,9 @@ func (c *NodeController) pruneTopologyConfigMap(cm *v1.ConfigMap, isValidNodeTop return nil } -func isFakeGpuNode(node *v1.Node) bool { - return node != nil && - node.Labels["nvidia.com/gpu.deploy.dcgm-exporter"] == "true" && - node.Labels["nvidia.com/gpu.deploy.device-plugin"] == "true" +func (c *NodeController) isFakeGpuNode(node *v1.Node) bool { + _, isNodeAssignedToNodePool := node.Labels[c.clusterTopology.NodePoolLabelKey] + return isNodeAssignedToNodePool } func isPodExist(kubeClient kubernetes.Interface, podName string, namespace string) (bool, error) { diff --git a/internal/status-updater/controllers/pod/controller.go b/internal/status-updater/controllers/pod/controller.go index 47e632b..217443c 100644 --- a/internal/status-updater/controllers/pod/controller.go +++ b/internal/status-updater/controllers/pod/controller.go @@ -68,5 +68,6 @@ func NewPodController(kubeClient kubernetes.Interface, dynamicClient dynamic.Int } func (p *PodController) Run(stopCh <-chan struct{}) { + log.Println("Starting pod controller") p.informer.Run(stopCh) } diff --git a/internal/status-updater/handlers/node/handler.go b/internal/status-updater/handlers/node/handler.go index 4c66501..f404278 100644 --- a/internal/status-updater/handlers/node/handler.go +++ 
b/internal/status-updater/handlers/node/handler.go @@ -17,13 +17,16 @@ type Interface interface { type NodeHandler struct { kubeClient kubernetes.Interface + + clusterTopology *topology.ClusterTopology } var _ Interface = &NodeHandler{} -func NewNodeHandler(kubeClient kubernetes.Interface) *NodeHandler { +func NewNodeHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology) *NodeHandler { return &NodeHandler{ - kubeClient: kubeClient, + kubeClient: kubeClient, + clusterTopology: clusterTopology, } } @@ -40,6 +43,11 @@ func (p *NodeHandler) HandleAdd(node *v1.Node) error { return fmt.Errorf("failed to apply fake node deployments: %w", err) } + err = p.labelNode(node) + if err != nil { + return fmt.Errorf("failed to label node: %w", err) + } + return nil } @@ -56,5 +64,10 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error { return fmt.Errorf("failed to delete fake node deployments: %w", err) } + err = p.unlabelNode(node) + if err != nil { + return fmt.Errorf("failed to unlabel node: %w", err) + } + return nil } diff --git a/internal/status-updater/handlers/node/labels.go b/internal/status-updater/handlers/node/labels.go new file mode 100644 index 0000000..f05315a --- /dev/null +++ b/internal/status-updater/handlers/node/labels.go @@ -0,0 +1,61 @@ +package node + +import ( + "context" + "encoding/json" + "fmt" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +const ( + dcgmExporterLabelKey = "nvidia.com/gpu.deploy.dcgm-exporter" + devicePluginLabelKey = "nvidia.com/gpu.deploy.device-plugin" +) + +// labelNode labels the node with required labels for the fake-gpu-operator to function. 
+func (p *NodeHandler) labelNode(node *v1.Node) error { + err := p.patchNodeLabels(node, map[string]interface{}{ + dcgmExporterLabelKey: "true", + devicePluginLabelKey: "true", + }) + if err != nil { + return fmt.Errorf("failed to label node %s: %w", node.Name, err) + } + + return nil +} + +// unlabelNode removes the labels from the node that were added by the fake-gpu-operator. +func (p *NodeHandler) unlabelNode(node *v1.Node) error { + err := p.patchNodeLabels(node, map[string]interface{}{ + dcgmExporterLabelKey: nil, + devicePluginLabelKey: nil, + }) + if err != nil { + return fmt.Errorf("failed to unlabel node %s: %w", node.Name, err) + } + + return nil +} + +func (p *NodeHandler) patchNodeLabels(node *v1.Node, labels map[string]interface{}) error { + patch := map[string]interface{}{ + "metadata": map[string]interface{}{ + "labels": labels, + }, + } + patchBytes, err := json.Marshal(patch) + if err != nil { + return fmt.Errorf("failed to marshal patch: %w", err) + } + + _, err = p.kubeClient.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{}) + if err != nil { + return fmt.Errorf("failed to patch node %s labels: %w", node.Name, err) + } + + return nil +} diff --git a/internal/status-updater/handlers/node/topology_cm.go b/internal/status-updater/handlers/node/topology_cm.go index cff8b26..e6a4067 100644 --- a/internal/status-updater/handlers/node/topology_cm.go +++ b/internal/status-updater/handlers/node/topology_cm.go @@ -14,21 +14,24 @@ func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error { return nil } - baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient) - if err != nil { - return fmt.Errorf("failed to get base topology: %w", err) + nodePoolName, ok := node.Labels[p.clusterTopology.NodePoolLabelKey] + if !ok { + return fmt.Errorf("node %s does not have a nodepool label", node.Name) } - nodeAutofillSettings := baseTopology.Config.NodeAutofill + nodePoolTopology, ok := 
p.clusterTopology.NodePools[nodePoolName] + if !ok { + return fmt.Errorf("nodepool %s not found in cluster topology", nodePoolName) + } nodeTopology = &topology.NodeTopology{ - GpuMemory: nodeAutofillSettings.GpuMemory, - GpuProduct: nodeAutofillSettings.GpuProduct, - Gpus: generateGpuDetails(nodeAutofillSettings.GpuCount, node.Name), - MigStrategy: nodeAutofillSettings.MigStrategy, + GpuMemory: nodePoolTopology.GpuMemory, + GpuProduct: nodePoolTopology.GpuProduct, + Gpus: generateGpuDetails(nodePoolTopology.GpuCount, node.Name), + MigStrategy: p.clusterTopology.MigStrategy, } - err = topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name) + err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name) if err != nil { return fmt.Errorf("failed to create node topology: %w", err) }