From c129c3a9ee2b225b7ff49d50b4eeb5adfd076318 Mon Sep 17 00:00:00 2001 From: Benamar Mekhissi Date: Fri, 4 Oct 2024 13:32:15 -0400 Subject: [PATCH] Disallow relocation execution when a cluster is unreachable Added a check to stop relocation reconciliation if one of the clusters is unreachable. This prevents potential misclassification after hub recovery, which could lead to undesired results and inconsistencies. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2304182 Signed-off-by: Benamar Mekhissi (cherry picked from commit 791fef05dd35f32ee07f0e0f4a126f9554710328) --- internal/controller/drplacementcontrol.go | 8 ++++ .../drplacementcontrol_controller.go | 37 ++++++++++--------- .../drplacementcontrol_controller_test.go | 4 +- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/internal/controller/drplacementcontrol.go b/internal/controller/drplacementcontrol.go index cc3ac787c..4ce206d5a 100644 --- a/internal/controller/drplacementcontrol.go +++ b/internal/controller/drplacementcontrol.go @@ -825,6 +825,14 @@ func (d *DRPCInstance) RunRelocate() (bool, error) { const done = true + if d.reconciler.numClustersQueriedSuccessfully != len(d.drPolicy.Spec.DRClusters) { + d.log.Info("Can't progress with relocation -- Not all clusters are reachable", + "numClustersQueriedSuccessfully", d.reconciler.numClustersQueriedSuccessfully, + "NumOfClusters", len(d.drPolicy.Spec.DRClusters)) + + return !done, nil + } + preferredCluster := d.instance.Spec.PreferredCluster preferredClusterNamespace := d.instance.Spec.PreferredCluster diff --git a/internal/controller/drplacementcontrol_controller.go b/internal/controller/drplacementcontrol_controller.go index e81943a8a..346cd81c9 100644 --- a/internal/controller/drplacementcontrol_controller.go +++ b/internal/controller/drplacementcontrol_controller.go @@ -75,15 +75,16 @@ type ProgressCallback func(string, string) // DRPlacementControlReconciler reconciles a DRPlacementControl object type DRPlacementControlReconciler struct { client.Client - APIReader client.Reader - Log logr.Logger - MCVGetter rmnutil.ManagedClusterViewGetter - Scheme *runtime.Scheme - Callback ProgressCallback - eventRecorder *rmnutil.EventReporter - savedInstanceStatus rmn.DRPlacementControlStatus - ObjStoreGetter ObjectStoreGetter - RateLimiter *workqueue.TypedRateLimiter[reconcile.Request] + APIReader client.Reader + Log logr.Logger + MCVGetter rmnutil.ManagedClusterViewGetter + Scheme *runtime.Scheme + Callback ProgressCallback + eventRecorder *rmnutil.EventReporter + savedInstanceStatus rmn.DRPlacementControlStatus + ObjStoreGetter ObjectStoreGetter + RateLimiter *workqueue.TypedRateLimiter[reconcile.Request] + numClustersQueriedSuccessfully int } func ManifestWorkPredicateFunc() predicate.Funcs { @@ -939,11 +940,13 @@ func (r *DRPlacementControlReconciler) createDRPCInstance( return nil, err } - vrgs, _, _, err := getVRGsFromManagedClusters(r.MCVGetter, drpc, drClusters, vrgNamespace, log) + vrgs, cqs, _, err := getVRGsFromManagedClusters(r.MCVGetter, drpc, drClusters, vrgNamespace, log) if err != nil { return nil, err } + r.numClustersQueriedSuccessfully = cqs + d := &DRPCInstance{ reconciler: r, ctx: ctx, @@ -1663,7 +1666,7 @@ func getVRGsFromManagedClusters( annotations[DRPCNameAnnotation] = drpc.Name annotations[DRPCNamespaceAnnotation] = drpc.Namespace - var clustersQueriedSuccessfully int + var numClustersQueriedSuccessfully int var failedCluster string @@ -1675,7 +1678,7 @@ func getVRGsFromManagedClusters( // Only NotFound error is accepted if errors.IsNotFound(err) { log.Info(fmt.Sprintf("VRG not found on %q", drCluster.Name)) - clustersQueriedSuccessfully++ + numClustersQueriedSuccessfully++ continue } @@ -1687,7 +1690,7 @@ func getVRGsFromManagedClusters( continue } - clustersQueriedSuccessfully++ + numClustersQueriedSuccessfully++ if rmnutil.ResourceIsDeleted(drCluster) { log.Info("Skipping VRG on deleted drcluster", "drcluster", drCluster.Name, "vrg", vrg.Name) @@ -1701,15 +1704,15 @@ func getVRGsFromManagedClusters( } // We are done if we successfully queried all drClusters - if clustersQueriedSuccessfully == len(drClusters) { - return vrgs, clustersQueriedSuccessfully, "", nil + if numClustersQueriedSuccessfully == len(drClusters) { + return vrgs, numClustersQueriedSuccessfully, "", nil } - if clustersQueriedSuccessfully == 0 { + if numClustersQueriedSuccessfully == 0 { return vrgs, 0, "", fmt.Errorf("failed to retrieve VRGs from clusters") } - return vrgs, clustersQueriedSuccessfully, failedCluster, nil + return vrgs, numClustersQueriedSuccessfully, failedCluster, nil } func (r *DRPlacementControlReconciler) deleteClonedPlacementRule(ctx context.Context, diff --git a/internal/controller/drplacementcontrol_controller_test.go b/internal/controller/drplacementcontrol_controller_test.go index af1eca6bf..d65596972 100644 --- a/internal/controller/drplacementcontrol_controller_test.go +++ b/internal/controller/drplacementcontrol_controller_test.go @@ -2479,8 +2479,8 @@ var _ = Describe("DRPlacementControl Reconciler", func() { clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() expectedAction := rmn.ActionRelocate - expectedPhase := rmn.Relocated - exptectedPorgression := rmn.ProgressionCleaningUp + expectedPhase := rmn.DRState("") + exptectedPorgression := rmn.ProgressionStatus("") verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) // User intervention is required (simulate user intervention)