From 9d379672fa38e2231a9409d66d8448e52ebe886f Mon Sep 17 00:00:00 2001 From: Raghavendra Talur Date: Tue, 23 Jan 2024 11:22:43 -0500 Subject: [PATCH 1/5] go.mod: Use a replace directive for the api module The creation of the api module is for other users of Ramen api and not for the controllers of Ramen. We should always use the latest available api in the repository. Signed-off-by: Raghavendra Talur --- go.mod | 3 +++ go.sum | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 0f270270d..8eeb52cba 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,9 @@ module github.com/ramendr/ramen go 1.21.6 +// This replace should always be here for ease of development. +replace github.com/ramendr/ramen/api => ./api + require ( github.com/aws/aws-sdk-go v1.44.289 github.com/backube/volsync v0.7.1 diff --git a/go.sum b/go.sum index 7b1e17ead..49e17eb02 100644 --- a/go.sum +++ b/go.sum @@ -277,8 +277,6 @@ github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdO github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= github.com/prometheus/procfs v0.10.1 h1:kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+PymziUAg= github.com/prometheus/procfs v0.10.1/go.mod h1:nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM= -github.com/ramendr/ramen/api v0.0.0-20240117171503-e11c56eac24d h1:cusjxTd7EUt/VP51YA2hBH/+H7svwCAmiFybbssEKx4= -github.com/ramendr/ramen/api v0.0.0-20240117171503-e11c56eac24d/go.mod h1:KXZjrQDRobLq1FvIpxHVZCG054qQo+2t9uGjb/wc9k4= github.com/ramendr/recipe v0.0.0-20230817160432-729dc7fd8932 h1:n89W9K2gDa0XwdIVuWyg53hPgaR97DfGVi9o2V0WcWA= github.com/ramendr/recipe v0.0.0-20230817160432-729dc7fd8932/go.mod h1:QHVQXKgNId8EfvNd+Y6JcTrsXwTImtSFkV4IsiOkwCw= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= From f0694b16abb8eb79507e66cbab419d4b45f30d7b Mon Sep 17 00:00:00 2001 From: Raghavendra Talur Date: Tue, 23 Jan 2024 12:09:30 -0500 Subject: [PATCH 2/5] dockerfile: copy the api dir before go mod download The go mod download command fails on the github runner as it is not able to find the go.mod file under api dir. Signed-off-by: Raghavendra Talur --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f3a62020a..57d9d9ab7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,13 +8,13 @@ WORKDIR /workspace # Copy the Go Modules manifests COPY go.mod go.mod COPY go.sum go.sum +COPY api/ api/ # cache deps before building and copying source so that we don't need to re-download as much # and so that source changes don't invalidate our downloaded layer RUN go mod download # Copy the go source COPY main.go main.go -COPY api/ api/ COPY controllers/ controllers/ # Build From 8c3c5eeb4a80c69aa7b6d9bdfdaff31d74a8a43d Mon Sep 17 00:00:00 2001 From: Benamar Mekhissi Date: Sat, 13 Jan 2024 08:02:08 -0500 Subject: [PATCH 3/5] Fix Failover Confusion in DRPC Action Post Hub Recovery If the DRPC Action is set to Failover, for instance, transitioning from C1 to C2, a subsequent failover request from C2 to C1 creates confusion in Ramen post hub recovery because the action didn't change and the only thing that changed is the destination cluster. The solution is to permit failover from C2 to C1 if C1 is accessible. 
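A rough, standalone Go sketch of the intended decision (drpcView and
failoverAllowed are illustrative names only, not the controller's real
types): after hub recovery the DRPC phase is recorded as WaitForUser, and a
failover request is honored only when the cluster that could not be queried
is not the requested failover target.

    package main

    import "fmt"

    type DRState string

    type DRAction string

    const (
        WaitForUser    DRState  = "WaitForUser"
        ActionFailover DRAction = "Failover"
    )

    // drpcView is a trimmed-down stand-in for the DRPC fields this decision
    // depends on; the real controller operates on rmn.DRPlacementControl.
    type drpcView struct {
        Phase           DRState
        Action          DRAction
        FailoverCluster string
    }

    // failoverAllowed mirrors the check added by this patch: continue with
    // the failover when we are waiting for the user, the action is Failover,
    // and the unreachable cluster is not the failover target.
    func failoverAllowed(d drpcView, failedCluster string) bool {
        return d.Phase == WaitForUser &&
            d.Action == ActionFailover &&
            d.FailoverCluster != failedCluster
    }

    func main() {
        d := drpcView{Phase: WaitForUser, Action: ActionFailover, FailoverCluster: "C1"}
        fmt.Println(failoverAllowed(d, "C2")) // true: C1 is reachable, allow failing over back to it
        fmt.Println(failoverAllowed(d, "C1")) // false: the failover target itself is down
    }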
Signed-off-by: Benamar Mekhissi --- api/v1alpha1/drplacementcontrol_types.go | 4 + controllers/drplacementcontrol.go | 1 + controllers/drplacementcontrol_controller.go | 103 +++++++++++------- .../drplacementcontrol_controller_test.go | 55 ++++++---- controllers/util/mcv_util.go | 5 +- 5 files changed, 100 insertions(+), 68 deletions(-) diff --git a/api/v1alpha1/drplacementcontrol_types.go b/api/v1alpha1/drplacementcontrol_types.go index 369290858..15447f552 100644 --- a/api/v1alpha1/drplacementcontrol_types.go +++ b/api/v1alpha1/drplacementcontrol_types.go @@ -28,6 +28,10 @@ type DRState string // These are the valid values for DRState const ( + // WaitForUser, state recorded in DRPC status to indicate that we are + // waiting for the user to take an action after hub recover. + WaitForUser = DRState("WaitForUser") + // Initiating, state recorded in the DRPC status to indicate that this // action (Deploy/Failover/Relocate) is preparing for execution. There // is NO follow up state called 'Initiated' diff --git a/controllers/drplacementcontrol.go b/controllers/drplacementcontrol.go index e0eac5807..7c4f63a80 100644 --- a/controllers/drplacementcontrol.go +++ b/controllers/drplacementcontrol.go @@ -2335,6 +2335,7 @@ func (d *DRPCInstance) setConditionOnInitialDeploymentCompletion() { func (d *DRPCInstance) setStatusInitiating() { if !(d.instance.Status.Phase == "" || + d.instance.Status.Phase == rmn.WaitForUser || d.instance.Status.Phase == rmn.Deployed || d.instance.Status.Phase == rmn.FailedOver || d.instance.Status.Phase == rmn.Relocated) { diff --git a/controllers/drplacementcontrol_controller.go b/controllers/drplacementcontrol_controller.go index ede531b83..3ee5eec22 100644 --- a/controllers/drplacementcontrol_controller.go +++ b/controllers/drplacementcontrol_controller.go @@ -856,6 +856,8 @@ func (r *DRPlacementControlReconciler) createDRPCInstance( placementObj client.Object, log logr.Logger, ) (*DRPCInstance, error) { + log.Info("Creating DRPC instance") + drClusters, err := getDRClusters(ctx, r.Client, drPolicy) if err != nil { return nil, err @@ -2168,8 +2170,10 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( ) (bool, error) { requeue := true + log.Info("Ensure DRPC Status Consistency") + // This will always be false the first time the DRPC resource is first created OR after hub recovery - if drpc.Status.Phase != "" { + if drpc.Status.Phase != "" && drpc.Status.Phase != rmn.WaitForUser { return !requeue, nil } @@ -2178,7 +2182,10 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( dstCluster = drpc.Spec.FailoverCluster } - progress, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log) + progress, msg, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log) + + log.Info(msg) + if err != nil { return requeue, err } @@ -2187,22 +2194,23 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( case Continue: return !requeue, nil case AllowFailover: + drpc.Status.Phase = rmn.WaitForUser updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, - drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed") + drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, msg) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed") return requeue, nil default: - msg := "Operation Paused - User Intervention 
Required." + msg := fmt.Sprintf("Operation Paused - User Intervention Required. %s", msg) - log.Info(fmt.Sprintf("err:%v - msg:%s", err, msg)) + log.Info(msg) updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, drpc.Generation, metav1.ConditionFalse, rmn.ReasonPaused, msg) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, - metav1.ConditionFalse, rmn.ReasonPaused, msg) + metav1.ConditionFalse, rmn.ReasonPaused, "User Intervention Required") return requeue, nil } @@ -2249,19 +2257,19 @@ func (r *DRPlacementControlReconciler) determineDRPCState( placementObj client.Object, dstCluster string, log logr.Logger, -) (Progress, error) { +) (Progress, string, error) { log.Info("Rebuild DRPC state") vrgNamespace, err := selectVRGNamespace(r.Client, log, drpc, placementObj) if err != nil { log.Info("Failed to select VRG namespace") - return Stop, err + return Stop, "", err } drClusters, err := getDRClusters(ctx, r.Client, drPolicy) if err != nil { - return Stop, err + return Stop, "", err } vrgs, successfullyQueriedClusterCount, failedCluster, err := getVRGsFromManagedClusters( @@ -2269,21 +2277,29 @@ func (r *DRPlacementControlReconciler) determineDRPCState( if err != nil { log.Info("Failed to get a list of VRGs") - return Stop, err + return Stop, "", err } // IF 2 clusters queried, and both queries failed, then STOP if successfullyQueriedClusterCount == 0 { - log.Info("Number of clusters queried is 0. Stop...") + msg := "Stop - Number of clusters queried is 0" - return Stop, nil + return Stop, msg, nil } // IF 2 clusters queried successfully and no VRGs, then continue with initial deployment if successfullyQueriedClusterCount == 2 && len(vrgs) == 0 { log.Info("Queried 2 clusters successfully") - return Continue, nil + return Continue, "", nil + } + + if drpc.Status.Phase == rmn.WaitForUser && + drpc.Spec.Action == rmn.ActionFailover && + drpc.Spec.FailoverCluster != failedCluster { + log.Info("Continue. The action is failover and the failoverCluster is accessible") + + return Continue, "", nil } // IF queried 2 clusters queried, 1 failed and 0 VRG found, then check s3 store. @@ -2297,21 +2313,22 @@ func (r *DRPlacementControlReconciler) determineDRPCState( if vrg == nil { // IF the failed cluster is not the dest cluster, then this could be an initial deploy if failedCluster != dstCluster { - return Continue, nil + return Continue, "", nil } - log.Info("Unable to query all clusters and failed to get VRG from s3 store") + msg := fmt.Sprintf("Unable to query all clusters and failed to get VRG from s3 store. Failed to query %s", + failedCluster) - return Stop, nil + return Stop, msg, nil } - log.Info("VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) + log.Info("Got VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { - log.Info(fmt.Sprintf("Two different actions - drpc action is '%s'/vrg action from s3 is '%s'", - drpc.Spec.Action, vrg.Spec.Action)) + msg := fmt.Sprintf("Failover is allowed - Two different actions - drpcAction is '%s' and vrgAction from s3 is '%s'", + drpc.Spec.Action, vrg.Spec.Action) - return AllowFailover, nil + return AllowFailover, msg, nil } if dstCluster == vrg.GetAnnotations()[DestinationClusterAnnotationKey] && @@ -2319,13 +2336,13 @@ func (r *DRPlacementControlReconciler) determineDRPCState( log.Info(fmt.Sprintf("VRG from s3. Same dstCluster %s/%s. 
Proceeding...", dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) - return Continue, nil + return Continue, "", nil } - log.Info(fmt.Sprintf("VRG from s3. DRPCAction/vrgAction/DRPCDstClstr/vrgDstClstr %s/%s/%s/%s. Allow Failover...", - drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) + msg := fmt.Sprintf("Failover is allowed - drpcAction:'%s'. vrgAction:'%s'. DRPCDstClstr:'%s'. vrgDstClstr:'%s'.", + drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey]) - return AllowFailover, nil + return AllowFailover, msg, nil } // IF 2 clusters queried, 1 failed and 1 VRG found on the failover cluster, then check the action, if they don't @@ -2341,25 +2358,26 @@ func (r *DRPlacementControlReconciler) determineDRPCState( break } - if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { - log.Info(fmt.Sprintf("Stop! Two different actions - drpc action is '%s'/vrg action is '%s'", - drpc.Spec.Action, vrg.Spec.Action)) + if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) && + dstCluster == clusterName { + msg := fmt.Sprintf("Stop - Two different actions for the same cluster - drpcAction:'%s'. vrgAction:'%s'", + drpc.Spec.Action, vrg.Spec.Action) - return Stop, nil + return Stop, msg, nil } if dstCluster != clusterName && vrg.Spec.ReplicationState == rmn.Secondary { - log.Info(fmt.Sprintf("Same Action and dstCluster and ReplicationState %s/%s/%s", + log.Info(fmt.Sprintf("Failover is allowed. Action/dstCluster/ReplicationState %s/%s/%s", drpc.Spec.Action, dstCluster, vrg.Spec.ReplicationState)) - log.Info("Failover is allowed - Primary is assumed in the failed cluster") + msg := "Failover is allowed - Primary is assumed to be on the failed cluster" - return AllowFailover, nil + return AllowFailover, msg, nil } - log.Info("Allow to continue") + log.Info("Same action, dstCluster, and ReplicationState is primary. Continuing") - return Continue, nil + return Continue, "", nil } // Finally, IF 2 clusters queried successfully and 1 or more VRGs found, and if one of the VRGs is on the dstCluster, @@ -2379,25 +2397,26 @@ func (r *DRPlacementControlReconciler) determineDRPCState( // This can happen if a hub is recovered in the middle of a Relocate if vrg.Spec.ReplicationState == rmn.Secondary && len(vrgs) == 2 { - log.Info("Both VRGs are in secondary state") + msg := "Stop - Both VRGs have the same secondary state" - return Stop, nil + return Stop, msg, nil } if drpc.Spec.Action == rmn.DRAction(vrg.Spec.Action) && dstCluster == clusterName { - log.Info(fmt.Sprintf("Same Action %s", drpc.Spec.Action)) + log.Info(fmt.Sprintf("Same Action and dest cluster %s/%s", drpc.Spec.Action, dstCluster)) - return Continue, nil + return Continue, "", nil } - log.Info("Failover is allowed", "vrgs count", len(vrgs), "drpc action", - drpc.Spec.Action, "vrg action", vrg.Spec.Action, "dstCluster/clusterName", dstCluster+"/"+clusterName) + msg := fmt.Sprintf("Failover is allowed - VRGs count:'%d'. drpcAction:'%s'."+ + " vrgAction:'%s'. DstCluster:'%s'. 
vrgOnCluste '%s'", + len(vrgs), drpc.Spec.Action, vrg.Spec.Action, dstCluster, clusterName) - return AllowFailover, nil + return AllowFailover, msg, nil } // IF none of the above, then allow failover (set PeerReady), but stop until someone makes the change - log.Info("Failover is allowed, but user intervention is required") + msg := "Failover is allowed - User intervention is required" - return AllowFailover, nil + return AllowFailover, msg, nil } diff --git a/controllers/drplacementcontrol_controller_test.go b/controllers/drplacementcontrol_controller_test.go index 4f680f6db..605bf6ffd 100644 --- a/controllers/drplacementcontrol_controller_test.go +++ b/controllers/drplacementcontrol_controller_test.go @@ -1519,6 +1519,7 @@ func runFailoverAction(placementObj client.Object, fromCluster, toCluster string Expect(len(drpc.Status.Conditions)).To(Equal(2)) _, condition := getDRPCCondition(&drpc.Status, rmn.ConditionAvailable) Expect(condition.Reason).To(Equal(string(rmn.FailedOver))) + Expect(drpc.Status.ActionStartTime).ShouldNot(BeNil()) decision := getLatestUserPlacementDecision(placementObj.GetName(), placementObj.GetNamespace()) Expect(decision.ClusterName).To(Equal(toCluster)) @@ -2283,8 +2284,8 @@ var _ = Describe("DRPlacementControl Reconciler", func() { Specify("DRClusters", func() { populateDRClusters() }) - When("6 Applications deployed for the first time", func() { - It("Should deploy 6 drpcs", func() { + When("Application deployed for the first time", func() { + It("Should deploy drpc", func() { createNamespacesAsync(getNamespaceObj(DefaultDRPCNamespace)) createManagedClusters(asyncClusters) createDRClustersAsync() @@ -2309,7 +2310,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Secondary is back online --------- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-1 busybox-drpc 12h East1ManagedClus Deployed Completed True - When("DRAction is Initial deploy -- during hub recovery -> Secondary Down", func() { + When("HubRecovery: DRAction is Initial deploy -> Secondary Down", func() { It("Should reconstructs the DRPC state to completion. Primary is East1ManagedCluster", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) @@ -2333,32 +2334,31 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online --------- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-5 busybox-drpc 11h East1ManagedClus Deployed Completed 2023-12-20T12:52:20Z 5m32.467527356s True - When("DRAction is Initial deploy -- during hub recovery -> Primary Down", func() { - It("Should be able to reconstructs the DRPC state to completion. Primary East1ManagedCluster", func() { - setClusterDown(West1ManagedCluster) + When("HubRecovery: DRAction is Initial deploy -> Primary Down", func() { + It("Should pause and wait for user to trigger a failover. 
Primary East1ManagedCluster", func() { + setClusterDown(East1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() expectedAction := rmn.DRAction("") - expectedPhase := rmn.Deployed - exptectedPorgression := rmn.ProgressionCompleted + expectedPhase := rmn.WaitForUser + exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) - resetClusterDown() - exptectedCompleted := rmn.ProgressionCompleted - verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedCompleted) }) }) // Failover - When("DRAction is set to Failover - Hub Recovery", func() { + When("HubRecovery: DRAction is set to Failover -> primary cluster down", func() { It("Should failover to West1ManagedCluster", func() { from := East1ManagedCluster to := West1ManagedCluster + resetClusterDown() runFailoverAction(userPlacementRule1, from, to, false, false) - uploadVRGtoS3Store(DRPCCommonName, DefaultDRPCNamespace, West1ManagedCluster, - rmn.VRGAction(rmn.ActionFailover)) waitForDRPCPhaseAndProgression(DefaultDRPCNamespace, rmn.FailedOver) + uploadVRGtoS3Store(DRPCCommonName, DefaultDRPCNamespace, West1ManagedCluster, rmn.VRGActionFailover) + resetClusterDown() }) }) + //nolint:lll // -------- Before Hub Recovery Action FailedOver --- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY @@ -2369,14 +2369,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-2 busybox-drpc 11h East1ManagedClus West1ManagedClu Failover FailedOver Completed True - When("DRAction is Failover -- during hub recovery -> Primary Down", func() { + When("HubRecovery: DRAction is Failover -> Primary Down", func() { It("Should Pause, but allows failover. 
Primary West1ManagedCluster", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() setDRPCSpecExpectationTo(DefaultDRPCNamespace, East1ManagedCluster, West1ManagedCluster, "") expectedAction := rmn.DRAction("") - expectedPhase := rmn.DRState("") + expectedPhase := rmn.WaitForUser exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) checkConditionAllowFailover(DefaultDRPCNamespace) @@ -2393,7 +2393,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { }) // Relocate - When("DRAction is set to Relocate - Hub Recovery", func() { + When("HubRecovery: DRAction is set to Relocate", func() { It("Should relocate to Primary (East1ManagedCluster)", func() { // ----------------------------- RELOCATION TO PRIMARY -------------------------------------- from := West1ManagedCluster @@ -2412,7 +2412,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-sample busybox-drpc 16h East1ManagedClus West1ManagedClu Relocate Relocated Completed True - When("DRAction is Relocate -- during hub recovery -> Secondary Down", func() { + When("HubRecovery: DRAction is Relocate -> Secondary Down", func() { It("Should Continue given the primary East1ManagedCluster is up", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) @@ -2442,14 +2442,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-3 busybox-drpc 11h East1ManagedClus Relocate Relocated Completed True - When("DRAction is supposed to be Relocate -- during hub recovery -> Primary Down -> Action Cleared", func() { + When("HubRecovery: DRAction is supposed to be Relocate -> Primary Down -> Action Cleared", func() { It("Should Pause given the primary East1ManagedCluster is down, but allow failover", func() { setClusterDown(East1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() setDRPCSpecExpectationTo(DefaultDRPCNamespace, East1ManagedCluster, West1ManagedCluster, "") expectedAction := rmn.DRAction("") - expectedPhase := rmn.DRState("") + expectedPhase := rmn.WaitForUser exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) checkConditionAllowFailover(DefaultDRPCNamespace) @@ -2495,11 +2495,19 @@ var _ = Describe("DRPlacementControl Reconciler", func() { func verifyDRPCStateAndProgression(expectedAction rmn.DRAction, expectedPhase rmn.DRState, exptectedPorgression rmn.ProgressionStatus, ) { + var phase rmn.DRState + + var progression rmn.ProgressionStatus + Eventually(func() bool { drpc := getLatestDRPC(DefaultDRPCNamespace) + phase = drpc.Status.Phase + progression = drpc.Status.Progression - return drpc.Status.Phase == expectedPhase && drpc.Status.Progression == exptectedPorgression - }, timeout, time.Millisecond*1000).Should(BeTrue(), "Phase has not been updated yet!") + return phase == expectedPhase && progression == exptectedPorgression + }, timeout, 
time.Millisecond*1000).Should(BeTrue(), + fmt.Sprintf("Phase has not been updated yet! Phase:%s Expected:%s - progression:%s exptected:%s", + phase, expectedPhase, progression, exptectedPorgression)) drpc := getLatestDRPC(DefaultDRPCNamespace) Expect(drpc.Spec.Action).Should(Equal(expectedAction)) @@ -2525,8 +2533,7 @@ func checkConditionAllowFailover(namespace string) { return false }, timeout, interval).Should(BeTrue(), fmt.Sprintf("Condition '%+v'", availableCondition)) - Expect(drpc.Status.Phase).To(Equal(rmn.DRState(""))) - Expect(availableCondition.Message).Should(Equal("Failover allowed")) + Expect(drpc.Status.Phase).To(Equal(rmn.WaitForUser)) } func uploadVRGtoS3Store(name, namespace, dstCluster string, action rmn.VRGAction) { diff --git a/controllers/util/mcv_util.go b/controllers/util/mcv_util.go index f48ff5407..ece7a2a4d 100644 --- a/controllers/util/mcv_util.go +++ b/controllers/util/mcv_util.go @@ -62,7 +62,7 @@ type ManagedClusterViewGetterImpl struct { func (m ManagedClusterViewGetterImpl) GetVRGFromManagedCluster(resourceName, resourceNamespace, managedCluster string, annotations map[string]string, ) (*rmn.VolumeReplicationGroup, error) { - logger := ctrl.Log.WithName("MCV").WithValues("resourceName", resourceName) + logger := ctrl.Log.WithName("MCV").WithValues("resourceName", resourceName, "cluster", managedCluster) // get VRG and verify status through ManagedClusterView mcvMeta := metav1.ObjectMeta{ Name: BuildManagedClusterViewName(resourceName, resourceNamespace, "vrg"), @@ -228,7 +228,8 @@ func (m ManagedClusterViewGetterImpl) getManagedClusterResource( return errorswrapper.Wrap(err, "getManagedClusterResource failed") } - logger.Info(fmt.Sprintf("MCV Conditions: %v", mcv.Status.Conditions)) + logger.Info(fmt.Sprintf("Get managedClusterResource Returned the following MCV Conditions: %v", + mcv.Status.Conditions)) return m.GetResource(mcv, resource) } From 303b5ff7864fbe68920a4d43261157745dc7f355 Mon Sep 17 00:00:00 2001 From: rakeshgm Date: Thu, 24 Aug 2023 23:25:04 +0530 Subject: [PATCH 4/5] report better s3Errors use aws/awserr package to report better errors with correct format Signed-off-by: rakeshgm --- controllers/s3utils.go | 61 +++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/controllers/s3utils.go b/controllers/s3utils.go index 389c15086..a192fa245 100644 --- a/controllers/s3utils.go +++ b/controllers/s3utils.go @@ -8,6 +8,7 @@ import ( "compress/gzip" "context" "encoding/json" + "errors" "fmt" "io" "reflect" @@ -223,8 +224,9 @@ func (s *s3ObjectStore) CreateBucket(bucket string) (err error) { cbInput := &s3.CreateBucketInput{Bucket: &bucket} if err = cbInput.Validate(); err != nil { - return fmt.Errorf("create bucket input validation failed for %s, err %w", - bucket, err) + errMsgPrefix := fmt.Errorf("create bucket input validation failed for %s", bucket) + + return processAwsError(errMsgPrefix, err) } _, err = s.client.CreateBucket(cbInput) @@ -235,8 +237,8 @@ func (s *s3ObjectStore) CreateBucket(bucket string) (err error) { case s3.ErrCodeBucketAlreadyExists: case s3.ErrCodeBucketAlreadyOwnedByYou: default: - return fmt.Errorf("failed to create bucket %s, %w", - bucket, err) + return fmt.Errorf("failed to create bucket %s, %s: %s", + bucket, aerr.Code(), aerr.Message()) } } } @@ -264,14 +266,16 @@ func (s *s3ObjectStore) DeleteBucket(bucket string) ( dbInput := &s3.DeleteBucketInput{Bucket: &bucket} if err = dbInput.Validate(); err != nil { - return fmt.Errorf("delete bucket input validation failed 
for %s, err %w", - bucket, err) + errMsgPrefix := fmt.Errorf("delete bucket input validation failed for %s", bucket) + + return processAwsError(errMsgPrefix, err) } _, err = s.client.DeleteBucket(dbInput) if err != nil && !isAwsErrCodeNoSuchBucket(err) { - return fmt.Errorf("failed to delete bucket %s, %w", - bucket, err) + errMsgPrefix := fmt.Errorf("failed to delete bucket %s", bucket) + + return processAwsError(errMsgPrefix, err) } return nil @@ -300,9 +304,11 @@ func (s *s3ObjectStore) PurgeBucket(bucket string) ( return nil // Not an error } - return fmt.Errorf("unable to ListKeys "+ - "from endpoint %s bucket %s, %w", - s.s3Endpoint, bucket, err) + errMsgPrefix := fmt.Errorf("unable to ListKeys "+ + "from endpoint %s bucket %s", + s.s3Endpoint, bucket) + + return processAwsError(errMsgPrefix, err) } for _, key := range keys { @@ -381,6 +387,15 @@ func DeleteTypedObject(s ObjectStorer, keyPrefix, keySuffix string, object inter return s.DeleteObject(typedKey(keyPrefix, keySuffix, reflect.TypeOf(object))) } +func processAwsError(errMsgPrefix, err error) error { + var awsErr awserr.Error + if errors.As(err, &awsErr) { + return fmt.Errorf("%w: code: %s, message: %s", errMsgPrefix, awsErr.Code(), awsErr.Message()) + } + + return errMsgPrefix +} + // UploadObject uploads the given object to the bucket with the given key. // - OK to call UploadObject() concurrently from multiple goroutines safely. // - Upload may fail due to many reasons: RequestError (connection error), @@ -414,8 +429,9 @@ func (s *s3ObjectStore) UploadObject(key string, Key: &key, Body: encodedUploadContent, }); err != nil { - return fmt.Errorf("failed to upload data of %s:%s, %w", - bucket, key, err) + errMsgPrefix := fmt.Errorf("failed to upload data of %s:%s", bucket, key) + + return processAwsError(errMsgPrefix, err) } return nil @@ -506,9 +522,9 @@ func (s *s3ObjectStore) ListKeys(keyPrefix string) ( ContinuationToken: nextContinuationToken, }) if err != nil { - return nil, - fmt.Errorf("failed to list objects in bucket %s:%s, %w", - bucket, keyPrefix, err) + errMsgPrefix := fmt.Errorf("failed to list objects in bucket") + + return nil, processAwsError(errMsgPrefix, err) } for _, entry := range result.Contents { @@ -552,8 +568,9 @@ func (s *s3ObjectStore) DownloadObject(key string, Bucket: &bucket, Key: &key, }); err != nil { - return fmt.Errorf("failed to download data of %s:%s, %w", - bucket, key, err) + errMsgPrefix := fmt.Errorf("failed to download data of %s:%s", bucket, key) + + return processAwsError(errMsgPrefix, err) } gzReader, err := gzip.NewReader(bytes.NewReader(writerAt.Bytes())) @@ -594,9 +611,11 @@ func (s *s3ObjectStore) DeleteObjectsWithKeyPrefix(keyPrefix string) ( keys, err := s.ListKeys(keyPrefix) if err != nil { - return fmt.Errorf("unable to ListKeys in DeleteObjects "+ - "from endpoint %s bucket %s keyPrefix %s, %w", - s.s3Endpoint, bucket, keyPrefix, err) + errMsgPrefix := fmt.Errorf("unable to ListKeys in DeleteObjects "+ + "from endpoint %s bucket %s keyPrefix %s", + s.s3Endpoint, bucket, keyPrefix) + + return processAwsError(errMsgPrefix, err) } if err = s.DeleteObjects(keys...); err != nil { From 040137e88af0300e65c9d05719a84472e0b3ff4f Mon Sep 17 00:00:00 2001 From: rakeshgm Date: Mon, 22 Jan 2024 18:04:51 +0530 Subject: [PATCH 5/5] addressing comments Signed-off-by: rakeshgm --- controllers/s3utils.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/controllers/s3utils.go b/controllers/s3utils.go index a192fa245..1e3fa4bc2 100644 --- 
a/controllers/s3utils.go +++ b/controllers/s3utils.go @@ -597,8 +597,13 @@ func (s *s3ObjectStore) DeleteObject(key string) error { Bucket: aws.String(s.s3Bucket), Key: aws.String(key), }) + if err != nil { + errMsgPrefix := fmt.Errorf("failed to delete object %s", *aws.String(key)) + + return processAwsError(errMsgPrefix, err) + } - return err + return nil } // DeleteObjectsWithKeyPrefix deletes from the bucket any objects that @@ -643,9 +648,16 @@ func (s *s3ObjectStore) DeleteObjects(keys ...string) error { ctx, cancel := context.WithDeadline(context.TODO(), time.Now().Add(s3Timeout)) defer cancel() - return s.batchDeleter.Delete(ctx, &s3manager.DeleteObjectsIterator{ + err := s.batchDeleter.Delete(ctx, &s3manager.DeleteObjectsIterator{ Objects: delObjects, }) + if err != nil { + errMsgPrefix := fmt.Errorf("unable to process batch delete") + + return processAwsError(errMsgPrefix, err) + } + + return nil } // isAwsErrCodeNoSuchBucket returns true if the given input `err` has wrapped