From ea3e47ba3eabfb2e0a880fd682e70abf1bd3ab1a Mon Sep 17 00:00:00 2001
From: Erez Freiberger
Date: Wed, 3 Jul 2024 01:07:45 +0300
Subject: [PATCH] add flag to skip deployments for fake nodes

---
Review notes (this section is not part of the commit message):
 * Line breaks, blank lines and tab indentation were reconstructed from a
   whitespace-collapsed copy so that every hunk matches its @@ line counts.
   Context-line indentation is a best guess; if a hunk does not match, apply
   with `git apply --ignore-whitespace`.
 * Review edits touched only added lines in controller.go,
   fake_node_deployments.go and handler.go, so the post-image blob hashes on
   their "index" lines are stale.
 * The Makefile hunk (dropping the fgo-multi-platform builder and --platform)
   looks unrelated to the subject - confirm it is intentional before merging.

 Makefile                                      | 10 ++---
 internal/common/topology/types.go             |  3 +-
 .../controllers/node/controller.go            |  4 ++
 .../handlers/node/fake_node_deployments.go    | 30 ++++++++++++++
 .../status-updater/handlers/node/handler.go   | 41 +++++++++++++++++--
 .../handlers/node/topology_cm.go              | 11 ++----
 6 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/Makefile b/Makefile
index 151ca6c..8ab8ca8 100644
--- a/Makefile
+++ b/Makefile
@@ -28,12 +28,12 @@ clean:
 	rm -rf ${BUILD_DIR}
 .PHONY: clean
 
-init-buildx:
-	docker buildx inspect fgo-multi-platform > /dev/null || docker buildx create --name=fgo-multi-platform
-.PHONY: init-buildx
+# init-buildx:
+# docker buildx inspect fgo-multi-platform > /dev/null || docker buildx create --name=fgo-multi-platform
+# .PHONY: init-buildx
 
-image: init-buildx
-	docker buildx --builder=fgo-multi-platform build -t ${DOCKER_IMAGE_NAME} --target ${COMPONENT} --platform ${DOCKER_BUILDX_PLATFORMS} ${DOCKER_BUILDX_PUSH_FLAG} .
+image:
+	docker buildx build -t ${DOCKER_IMAGE_NAME} --target ${COMPONENT} ${DOCKER_BUILDX_PUSH_FLAG} .
 .PHONY: image
 
 images:
diff --git a/internal/common/topology/types.go b/internal/common/topology/types.go
index e70801c..06f5e06 100644
--- a/internal/common/topology/types.go
+++ b/internal/common/topology/types.go
@@ -48,7 +48,8 @@ type Range struct {
 }
 
 type Config struct {
-	NodeAutofill NodeAutofillSettings `yaml:"node-autofill"`
+	NodeAutofill     NodeAutofillSettings `yaml:"node-autofill"`
+	FakeNodeHandling bool                 `yaml:"fake-node-handling"`
 }
 
 type NodeAutofillSettings struct {
diff --git a/internal/status-updater/controllers/node/controller.go b/internal/status-updater/controllers/node/controller.go
index bbe0cf1..b41bba9 100644
--- a/internal/status-updater/controllers/node/controller.go
+++ b/internal/status-updater/controllers/node/controller.go
@@ -53,6 +53,10 @@ func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *Nod
 				node := obj.(*v1.Node)
 				util.LogErrorIfExist(c.handler.HandleAdd(node), "Failed to handle node addition")
 			},
+			UpdateFunc: func(oldObj, newObj interface{}) {
+				newNode := newObj.(*v1.Node)
+				util.LogErrorIfExist(c.handler.HandleUpdate(newNode), "Failed to handle node update")
+			},
 			DeleteFunc: func(obj interface{}) {
 				node := obj.(*v1.Node)
 				util.LogErrorIfExist(c.handler.HandleDelete(node), "Failed to handle node deletion")
diff --git a/internal/status-updater/handlers/node/fake_node_deployments.go b/internal/status-updater/handlers/node/fake_node_deployments.go
index f75ab9e..848143e 100644
--- a/internal/status-updater/handlers/node/fake_node_deployments.go
+++ b/internal/status-updater/handlers/node/fake_node_deployments.go
@@ -11,9 +11,39 @@ import (
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/utils/ptr"
 )
 
+// applyFakeDevicePlugin advertises gpuCount GPUs on a fake node by merge-patching
+// the node's status capacity and allocatable. Nodes that are not fake, or that
+// already report gpuCount for both fields, are left untouched.
+func (p *NodeHandler) applyFakeDevicePlugin(gpuCount int, node *v1.Node) error {
+	if !isFakeNode(node) {
+		return nil
+	}
+
+	// HandleUpdate calls this on every node update event; skip the API call when nothing would change.
+	gpuResource := v1.ResourceName(constants.GpuResourceName)
+	capacity, hasCapacity := node.Status.Capacity[gpuResource]
+	allocatable, hasAllocatable := node.Status.Allocatable[gpuResource]
+	if hasCapacity && hasAllocatable &&
+		capacity.Value() == int64(gpuCount) && allocatable.Value() == int64(gpuCount) {
+		return nil
+	}
+
+	patch := fmt.Sprintf(
+		`{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`,
+		constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount,
+	)
+	_, err := p.kubeClient.CoreV1().Nodes().Patch(context.TODO(), node.Name, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
+	if err != nil {
+		return fmt.Errorf("failed to update node capacity and allocatable: %w", err)
+	}
+
+	return nil
+}
+
 func (p *NodeHandler) applyFakeNodeDeployments(node *v1.Node) error {
 	if !isFakeNode(node) {
 		return nil
diff --git a/internal/status-updater/handlers/node/handler.go b/internal/status-updater/handlers/node/handler.go
index 4c66501..8f3fd4e 100644
--- a/internal/status-updater/handlers/node/handler.go
+++ b/internal/status-updater/handlers/node/handler.go
@@ -13,6 +13,7 @@ import (
 type Interface interface {
 	HandleAdd(node *v1.Node) error
 	HandleDelete(node *v1.Node) error
+	HandleUpdate(node *v1.Node) error
 }
 
 type NodeHandler struct {
@@ -30,14 +31,26 @@ func NewNodeHandler(kubeClient kubernetes.Interface) *NodeHandler {
 func (p *NodeHandler) HandleAdd(node *v1.Node) error {
 	log.Printf("Handling node addition: %s\n", node.Name)
 
-	err := p.createNodeTopologyCM(node)
+	baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
 	if err != nil {
-		return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
+		return fmt.Errorf("failed to get base topology: %w", err)
 	}
 
-	err = p.applyFakeNodeDeployments(node)
+	err = p.createNodeTopologyCM(node, baseTopology)
 	if err != nil {
-		return fmt.Errorf("failed to apply fake node deployments: %w", err)
+		return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
+	}
+
+	if baseTopology.Config.FakeNodeHandling {
+		err = p.applyFakeDevicePlugin(baseTopology.Config.NodeAutofill.GpuCount, node)
+		if err != nil {
+			return fmt.Errorf("failed to apply fake device plugin: %w", err)
+		}
+	} else {
+		err = p.applyFakeNodeDeployments(node)
+		if err != nil {
+			return fmt.Errorf("failed to apply fake node deployments: %w", err)
+		}
 	}
 
 	return nil
@@ -58,3 +71,23 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error {
 
 	return nil
 }
+
+// HandleUpdate re-applies the fake device plugin status to the node when fake
+// node handling is enabled in the base topology config; otherwise it does nothing.
+func (p *NodeHandler) HandleUpdate(node *v1.Node) error {
+	baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
+	if err != nil {
+		return fmt.Errorf("failed to get base topology: %w", err)
+	}
+
+	if !baseTopology.Config.FakeNodeHandling {
+		return nil
+	}
+
+	gpuCount := baseTopology.Config.NodeAutofill.GpuCount
+	err = p.applyFakeDevicePlugin(gpuCount, node)
+	if err != nil {
+		return fmt.Errorf("failed to apply fake device plugin: %w", err)
+	}
+	return nil
+}
diff --git a/internal/status-updater/handlers/node/topology_cm.go b/internal/status-updater/handlers/node/topology_cm.go
index cff8b26..cab793b 100644
--- a/internal/status-updater/handlers/node/topology_cm.go
+++ b/internal/status-updater/handlers/node/topology_cm.go
@@ -8,17 +8,14 @@ import (
 	v1 "k8s.io/api/core/v1"
 )
 
-func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error {
+func (p *NodeHandler) createNodeTopologyCM(
+	node *v1.Node, baseTopology *topology.BaseTopology,
+) error {
 	nodeTopology, _ := topology.GetNodeTopologyFromCM(p.kubeClient, node.Name)
 	if nodeTopology != nil {
 		return nil
 	}
 
-	baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
-	if err != nil {
-		return fmt.Errorf("failed to get base topology: %w", err)
-	}
-
 	nodeAutofillSettings := baseTopology.Config.NodeAutofill
 
 	nodeTopology = &topology.NodeTopology{
@@ -28,7 +25,7 @@ func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error {
 		MigStrategy: nodeAutofillSettings.MigStrategy,
 	}
 
-	err = topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name)
+	err := topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name)
 	if err != nil {
 		return fmt.Errorf("failed to create node topology: %w", err)
 	}