Skip to content

Commit

Permalink
fix: do not rely on MachineStatus updates when checking maintenance
Browse files Browse the repository at this point in the history
This check in the `ClusterMachineConfigStatus` controller is incorrect:
there's no relation of the `MachineStatus` update and the machine
connectivity.
We should have another way to re-trigger reconciliation and one of the
options is the requeue after.

Make it re-queue the reconciliation limited amount of times.
It's not 100% fail-proof, but no worse than before.

Signed-off-by: Artem Chernyshev <[email protected]>
  • Loading branch information
Unix4ever committed Jul 11, 2024
1 parent d271a8a commit 76263e1
Showing 1 changed file with 28 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import (
const (
gracefulResetAttemptCount = 4
etcdLeaveAttemptsLimit = 2
maintenanceCheckAttempts = 5
)

// ClusterMachineConfigStatusController manages ClusterMachineStatus resource lifecycle.
Expand Down Expand Up @@ -235,8 +236,9 @@ func NewClusterMachineConfigStatusController() *ClusterMachineConfigStatusContro
}

type resetStatus struct {
resetAttempts uint
etcdLeaveAttempts uint
resetAttempts uint
etcdLeaveAttempts uint
maintenanceCheckAttempts uint
}

type ongoingResets struct {
Expand Down Expand Up @@ -284,6 +286,19 @@ func (r *ongoingResets) handleReset(id resource.ID) uint {
return r.statuses[id].resetAttempts
}

func (r *ongoingResets) handleMaintenanceCheck(id resource.ID) uint {
r.mu.Lock()
defer r.mu.Unlock()

if _, ok := r.statuses[id]; !ok {
r.statuses[id] = &resetStatus{}
}

r.statuses[id].maintenanceCheckAttempts++

return r.statuses[id].maintenanceCheckAttempts
}

func (r *ongoingResets) handleEtcdLeave(id resource.ID) {
r.mu.Lock()
defer r.mu.Unlock()
Expand Down Expand Up @@ -558,7 +573,7 @@ func (h *clusterMachineConfigStatusControllerHandler) reset(

defer logClose(c, logger, "reset maintenance")

_, err = c.Disks(ctx)
_, err = c.Version(ctx)

logger.Debug("maintenance mode check", zap.Error(err))

Expand All @@ -567,8 +582,16 @@ func (h *clusterMachineConfigStatusControllerHandler) reset(
return nil
}

// retry next time when MachineStatus updates
return xerrors.NewTagged[qtransform.SkipReconcileTag](fmt.Errorf("failed to get disks in maintenance mode for machine '%s': %w", machineConfig.Metadata().ID(), err))
wrappedErr := fmt.Errorf("failed to get version in maintenance mode for machine '%s': %w", machineConfig.Metadata().ID(), err)

attempt := h.ongoingResets.handleMaintenanceCheck(machineStatus.Metadata().ID())

if attempt <= maintenanceCheckAttempts {
// retry in N seconds
return controller.NewRequeueError(wrappedErr, time.Second*time.Duration(attempt))
}

return xerrors.NewTagged[qtransform.SkipReconcileTag](wrappedErr)
}

machineSetName, ok := clusterMachine.Metadata().Labels().Get(omni.LabelMachineSet)
Expand Down

0 comments on commit 76263e1

Please sign in to comment.