fix: do not rely on MachineStatus updates when checking maintenance

This check in the `ClusterMachineConfigStatus` controller is incorrect: there's no relation of the `MachineStatus` update and the machine connectivity. We should have another way to re-trigger reconciliation and one of the options is the requeue after. Make it re-queue the reconciliation limited amount of times. It's not 100% fail-proof, but no worse than before. Signed-off-by: Artem Chernyshev <[email protected]>
DmitriyMV · Jul 11, 2024 · 76263e1 · 76263e1
1 parent d271a8a
commit 76263e1
Showing 1 changed file with 28 additions and 5 deletions.
diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go
@@ -44,6 +44,7 @@ import (
 const (
 	gracefulResetAttemptCount = 4
 	etcdLeaveAttemptsLimit    = 2
+	maintenanceCheckAttempts  = 5
 )
 
 // ClusterMachineConfigStatusController manages ClusterMachineStatus resource lifecycle.
@@ -235,8 +236,9 @@ func NewClusterMachineConfigStatusController() *ClusterMachineConfigStatusContro
 }
 
 type resetStatus struct {
-	resetAttempts     uint
-	etcdLeaveAttempts uint
+	resetAttempts            uint
+	etcdLeaveAttempts        uint
+	maintenanceCheckAttempts uint
 }
 
 type ongoingResets struct {
@@ -284,6 +286,19 @@ func (r *ongoingResets) handleReset(id resource.ID) uint {
 	return r.statuses[id].resetAttempts
 }
 
+func (r *ongoingResets) handleMaintenanceCheck(id resource.ID) uint {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if _, ok := r.statuses[id]; !ok {
+		r.statuses[id] = &resetStatus{}
+	}
+
+	r.statuses[id].maintenanceCheckAttempts++
+
+	return r.statuses[id].maintenanceCheckAttempts
+}
+
 func (r *ongoingResets) handleEtcdLeave(id resource.ID) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
@@ -558,7 +573,7 @@ func (h *clusterMachineConfigStatusControllerHandler) reset(
 
 		defer logClose(c, logger, "reset maintenance")
 
-		_, err = c.Disks(ctx)
+		_, err = c.Version(ctx)
 
 		logger.Debug("maintenance mode check", zap.Error(err))
 
@@ -567,8 +582,16 @@ func (h *clusterMachineConfigStatusControllerHandler) reset(
 			return nil
 		}
 
-		// retry next time when MachineStatus updates
-		return xerrors.NewTagged[qtransform.SkipReconcileTag](fmt.Errorf("failed to get disks in maintenance mode for machine '%s': %w", machineConfig.Metadata().ID(), err))
+		wrappedErr := fmt.Errorf("failed to get version in maintenance mode for machine '%s': %w", machineConfig.Metadata().ID(), err)
+
+		attempt := h.ongoingResets.handleMaintenanceCheck(machineStatus.Metadata().ID())
+
+		if attempt <= maintenanceCheckAttempts {
+			// retry in N seconds
+			return controller.NewRequeueError(wrappedErr, time.Second*time.Duration(attempt))
+		}
+
+		return xerrors.NewTagged[qtransform.SkipReconcileTag](wrappedErr)
 	}
 
 	machineSetName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelMachineSet)