From 9d379672fa38e2231a9409d66d8448e52ebe886f Mon Sep 17 00:00:00 2001 From: Raghavendra Talur Date: Tue, 23 Jan 2024 11:22:43 -0500 Subject: [PATCH 1/5] go.mod: Use a replace directive for the api module The creation of the api module is for other users of Ramen api and not for the controllers of Ramen. We should always use the latest available api in the repository. Signed-off-by: Raghavendra Talur --- go.mod | 3 +++ go.sum | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 0f270270d..8eeb52cba 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,9 @@ module github.com/ramendr/ramen go 1.21.6 +// This replace should always be here for ease of development. +replace github.com/ramendr/ramen/api => ./api + require ( github.com/aws/aws-sdk-go v1.44.289 github.com/backube/volsync v0.7.1 diff --git a/go.sum b/go.sum index 7b1e17ead..49e17eb02 100644 --- a/go.sum +++ b/go.sum @@ -277,8 +277,6 @@ github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdO github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= github.com/prometheus/procfs v0.10.1 h1:kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+PymziUAg= github.com/prometheus/procfs v0.10.1/go.mod h1:nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM= -github.com/ramendr/ramen/api v0.0.0-20240117171503-e11c56eac24d h1:cusjxTd7EUt/VP51YA2hBH/+H7svwCAmiFybbssEKx4= -github.com/ramendr/ramen/api v0.0.0-20240117171503-e11c56eac24d/go.mod h1:KXZjrQDRobLq1FvIpxHVZCG054qQo+2t9uGjb/wc9k4= github.com/ramendr/recipe v0.0.0-20230817160432-729dc7fd8932 h1:n89W9K2gDa0XwdIVuWyg53hPgaR97DfGVi9o2V0WcWA= github.com/ramendr/recipe v0.0.0-20230817160432-729dc7fd8932/go.mod h1:QHVQXKgNId8EfvNd+Y6JcTrsXwTImtSFkV4IsiOkwCw= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= From f0694b16abb8eb79507e66cbab419d4b45f30d7b Mon Sep 17 00:00:00 2001 From: Raghavendra Talur Date: Tue, 23 Jan 2024 12:09:30 -0500 Subject: [PATCH 2/5] dockerfile: copy the api dir before go mod download The go mod download command fails on the github runner as it is not able to find the go.mod file under api dir. Signed-off-by: Raghavendra Talur --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f3a62020a..57d9d9ab7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,13 +8,13 @@ WORKDIR /workspace # Copy the Go Modules manifests COPY go.mod go.mod COPY go.sum go.sum +COPY api/ api/ # cache deps before building and copying source so that we don't need to re-download as much # and so that source changes don't invalidate our downloaded layer RUN go mod download # Copy the go source COPY main.go main.go -COPY api/ api/ COPY controllers/ controllers/ # Build From 8c3c5eeb4a80c69aa7b6d9bdfdaff31d74a8a43d Mon Sep 17 00:00:00 2001 From: Benamar Mekhissi Date: Sat, 13 Jan 2024 08:02:08 -0500 Subject: [PATCH 3/5] Fix Failover Confusion in DRPC Action Post Hub Recovery If the DRPC Action is set to Failover, for instance, transitioning from C1 to C2, a subsequent failover request from C2 to C1 creates confusion in Ramen post hub recovery because the action didn't change and the only thing that changed is the destination cluster. The solution is to permit failover from C2 to C1 if C1 is accessible. 
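A rough, standalone Go sketch of the intended decision (drpcView and
failoverAllowed are illustrative names only, not the controller's real
types): after hub recovery the DRPC phase is recorded as WaitForUser, and a
failover request is honored only when the cluster that could not be queried
is not the requested failover target.

    package main

    import "fmt"

    type DRState string

    type DRAction string

    const (
        WaitForUser    DRState  = "WaitForUser"
        ActionFailover DRAction = "Failover"
    )

    // drpcView is a trimmed-down stand-in for the DRPC fields this decision
    // depends on; the real controller operates on rmn.DRPlacementControl.
    type drpcView struct {
        Phase           DRState
        Action          DRAction
        FailoverCluster string
    }

    // failoverAllowed mirrors the check added by this patch: continue with
    // the failover when we are waiting for the user, the action is Failover,
    // and the unreachable cluster is not the failover target.
    func failoverAllowed(d drpcView, failedCluster string) bool {
        return d.Phase == WaitForUser &&
            d.Action == ActionFailover &&
            d.FailoverCluster != failedCluster
    }

    func main() {
        d := drpcView{Phase: WaitForUser, Action: ActionFailover, FailoverCluster: "C1"}
        fmt.Println(failoverAllowed(d, "C2")) // true: C1 is reachable, allow failing over back to it
        fmt.Println(failoverAllowed(d, "C1")) // false: the failover target itself is down
    }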
Signed-off-by: Benamar Mekhissi --- api/v1alpha1/drplacementcontrol_types.go | 4 + controllers/drplacementcontrol.go | 1 + controllers/drplacementcontrol_controller.go | 103 +++++++++++------- .../drplacementcontrol_controller_test.go | 55 ++++++---- controllers/util/mcv_util.go | 5 +- 5 files changed, 100 insertions(+), 68 deletions(-) diff --git a/api/v1alpha1/drplacementcontrol_types.go b/api/v1alpha1/drplacementcontrol_types.go index 369290858..15447f552 100644 --- a/api/v1alpha1/drplacementcontrol_types.go +++ b/api/v1alpha1/drplacementcontrol_types.go @@ -28,6 +28,10 @@ type DRState string // These are the valid values for DRState const ( + // WaitForUser, state recorded in DRPC status to indicate that we are + // waiting for the user to take an action after hub recover. + WaitForUser = DRState("WaitForUser") + // Initiating, state recorded in the DRPC status to indicate that this // action (Deploy/Failover/Relocate) is preparing for execution. There // is NO follow up state called 'Initiated' diff --git a/controllers/drplacementcontrol.go b/controllers/drplacementcontrol.go index e0eac5807..7c4f63a80 100644 --- a/controllers/drplacementcontrol.go +++ b/controllers/drplacementcontrol.go @@ -2335,6 +2335,7 @@ func (d *DRPCInstance) setConditionOnInitialDeploymentCompletion() { func (d *DRPCInstance) setStatusInitiating() { if !(d.instance.Status.Phase == "" || + d.instance.Status.Phase == rmn.WaitForUser || d.instance.Status.Phase == rmn.Deployed || d.instance.Status.Phase == rmn.FailedOver || d.instance.Status.Phase == rmn.Relocated) { diff --git a/controllers/drplacementcontrol_controller.go b/controllers/drplacementcontrol_controller.go index ede531b83..3ee5eec22 100644 --- a/controllers/drplacementcontrol_controller.go +++ b/controllers/drplacementcontrol_controller.go @@ -856,6 +856,8 @@ func (r *DRPlacementControlReconciler) createDRPCInstance( placementObj client.Object, log logr.Logger, ) (*DRPCInstance, error) { + log.Info("Creating DRPC instance") + drClusters, err := getDRClusters(ctx, r.Client, drPolicy) if err != nil { return nil, err @@ -2168,8 +2170,10 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( ) (bool, error) { requeue := true + log.Info("Ensure DRPC Status Consistency") + // This will always be false the first time the DRPC resource is first created OR after hub recovery - if drpc.Status.Phase != "" { + if drpc.Status.Phase != "" && drpc.Status.Phase != rmn.WaitForUser { return !requeue, nil } @@ -2178,7 +2182,10 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( dstCluster = drpc.Spec.FailoverCluster } - progress, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log) + progress, msg, err := r.determineDRPCState(ctx, drpc, drPolicy, placementObj, dstCluster, log) + + log.Info(msg) + if err != nil { return requeue, err } @@ -2187,22 +2194,23 @@ func (r *DRPlacementControlReconciler) ensureDRPCStatusConsistency( case Continue: return !requeue, nil case AllowFailover: + drpc.Status.Phase = rmn.WaitForUser updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, - drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed") + drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, msg) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, metav1.ConditionTrue, rmn.ReasonSuccess, "Failover allowed") return requeue, nil default: - msg := "Operation Paused - User Intervention 
Required." + msg := fmt.Sprintf("Operation Paused - User Intervention Required. %s", msg) - log.Info(fmt.Sprintf("err:%v - msg:%s", err, msg)) + log.Info(msg) updateDRPCProgression(drpc, rmn.ProgressionActionPaused, log) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionAvailable, drpc.Generation, metav1.ConditionFalse, rmn.ReasonPaused, msg) addOrUpdateCondition(&drpc.Status.Conditions, rmn.ConditionPeerReady, drpc.Generation, - metav1.ConditionFalse, rmn.ReasonPaused, msg) + metav1.ConditionFalse, rmn.ReasonPaused, "User Intervention Required") return requeue, nil } @@ -2249,19 +2257,19 @@ func (r *DRPlacementControlReconciler) determineDRPCState( placementObj client.Object, dstCluster string, log logr.Logger, -) (Progress, error) { +) (Progress, string, error) { log.Info("Rebuild DRPC state") vrgNamespace, err := selectVRGNamespace(r.Client, log, drpc, placementObj) if err != nil { log.Info("Failed to select VRG namespace") - return Stop, err + return Stop, "", err } drClusters, err := getDRClusters(ctx, r.Client, drPolicy) if err != nil { - return Stop, err + return Stop, "", err } vrgs, successfullyQueriedClusterCount, failedCluster, err := getVRGsFromManagedClusters( @@ -2269,21 +2277,29 @@ func (r *DRPlacementControlReconciler) determineDRPCState( if err != nil { log.Info("Failed to get a list of VRGs") - return Stop, err + return Stop, "", err } // IF 2 clusters queried, and both queries failed, then STOP if successfullyQueriedClusterCount == 0 { - log.Info("Number of clusters queried is 0. Stop...") + msg := "Stop - Number of clusters queried is 0" - return Stop, nil + return Stop, msg, nil } // IF 2 clusters queried successfully and no VRGs, then continue with initial deployment if successfullyQueriedClusterCount == 2 && len(vrgs) == 0 { log.Info("Queried 2 clusters successfully") - return Continue, nil + return Continue, "", nil + } + + if drpc.Status.Phase == rmn.WaitForUser && + drpc.Spec.Action == rmn.ActionFailover && + drpc.Spec.FailoverCluster != failedCluster { + log.Info("Continue. The action is failover and the failoverCluster is accessible") + + return Continue, "", nil } // IF queried 2 clusters queried, 1 failed and 0 VRG found, then check s3 store. @@ -2297,21 +2313,22 @@ func (r *DRPlacementControlReconciler) determineDRPCState( if vrg == nil { // IF the failed cluster is not the dest cluster, then this could be an initial deploy if failedCluster != dstCluster { - return Continue, nil + return Continue, "", nil } - log.Info("Unable to query all clusters and failed to get VRG from s3 store") + msg := fmt.Sprintf("Unable to query all clusters and failed to get VRG from s3 store. Failed to query %s", + failedCluster) - return Stop, nil + return Stop, msg, nil } - log.Info("VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) + log.Info("Got VRG From s3", "VRG Spec", vrg.Spec, "VRG Annotations", vrg.GetAnnotations()) if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { - log.Info(fmt.Sprintf("Two different actions - drpc action is '%s'/vrg action from s3 is '%s'", - drpc.Spec.Action, vrg.Spec.Action)) + msg := fmt.Sprintf("Failover is allowed - Two different actions - drpcAction is '%s' and vrgAction from s3 is '%s'", + drpc.Spec.Action, vrg.Spec.Action) - return AllowFailover, nil + return AllowFailover, msg, nil } if dstCluster == vrg.GetAnnotations()[DestinationClusterAnnotationKey] && @@ -2319,13 +2336,13 @@ func (r *DRPlacementControlReconciler) determineDRPCState( log.Info(fmt.Sprintf("VRG from s3. Same dstCluster %s/%s. 
Proceeding...", dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) - return Continue, nil + return Continue, "", nil } - log.Info(fmt.Sprintf("VRG from s3. DRPCAction/vrgAction/DRPCDstClstr/vrgDstClstr %s/%s/%s/%s. Allow Failover...", - drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey])) + msg := fmt.Sprintf("Failover is allowed - drpcAction:'%s'. vrgAction:'%s'. DRPCDstClstr:'%s'. vrgDstClstr:'%s'.", + drpc.Spec.Action, vrg.Spec.Action, dstCluster, vrg.GetAnnotations()[DestinationClusterAnnotationKey]) - return AllowFailover, nil + return AllowFailover, msg, nil } // IF 2 clusters queried, 1 failed and 1 VRG found on the failover cluster, then check the action, if they don't @@ -2341,25 +2358,26 @@ func (r *DRPlacementControlReconciler) determineDRPCState( break } - if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) { - log.Info(fmt.Sprintf("Stop! Two different actions - drpc action is '%s'/vrg action is '%s'", - drpc.Spec.Action, vrg.Spec.Action)) + if drpc.Spec.Action != rmn.DRAction(vrg.Spec.Action) && + dstCluster == clusterName { + msg := fmt.Sprintf("Stop - Two different actions for the same cluster - drpcAction:'%s'. vrgAction:'%s'", + drpc.Spec.Action, vrg.Spec.Action) - return Stop, nil + return Stop, msg, nil } if dstCluster != clusterName && vrg.Spec.ReplicationState == rmn.Secondary { - log.Info(fmt.Sprintf("Same Action and dstCluster and ReplicationState %s/%s/%s", + log.Info(fmt.Sprintf("Failover is allowed. Action/dstCluster/ReplicationState %s/%s/%s", drpc.Spec.Action, dstCluster, vrg.Spec.ReplicationState)) - log.Info("Failover is allowed - Primary is assumed in the failed cluster") + msg := "Failover is allowed - Primary is assumed to be on the failed cluster" - return AllowFailover, nil + return AllowFailover, msg, nil } - log.Info("Allow to continue") + log.Info("Same action, dstCluster, and ReplicationState is primary. Continuing") - return Continue, nil + return Continue, "", nil } // Finally, IF 2 clusters queried successfully and 1 or more VRGs found, and if one of the VRGs is on the dstCluster, @@ -2379,25 +2397,26 @@ func (r *DRPlacementControlReconciler) determineDRPCState( // This can happen if a hub is recovered in the middle of a Relocate if vrg.Spec.ReplicationState == rmn.Secondary && len(vrgs) == 2 { - log.Info("Both VRGs are in secondary state") + msg := "Stop - Both VRGs have the same secondary state" - return Stop, nil + return Stop, msg, nil } if drpc.Spec.Action == rmn.DRAction(vrg.Spec.Action) && dstCluster == clusterName { - log.Info(fmt.Sprintf("Same Action %s", drpc.Spec.Action)) + log.Info(fmt.Sprintf("Same Action and dest cluster %s/%s", drpc.Spec.Action, dstCluster)) - return Continue, nil + return Continue, "", nil } - log.Info("Failover is allowed", "vrgs count", len(vrgs), "drpc action", - drpc.Spec.Action, "vrg action", vrg.Spec.Action, "dstCluster/clusterName", dstCluster+"/"+clusterName) + msg := fmt.Sprintf("Failover is allowed - VRGs count:'%d'. drpcAction:'%s'."+ + " vrgAction:'%s'. DstCluster:'%s'. 
vrgOnCluste '%s'", + len(vrgs), drpc.Spec.Action, vrg.Spec.Action, dstCluster, clusterName) - return AllowFailover, nil + return AllowFailover, msg, nil } // IF none of the above, then allow failover (set PeerReady), but stop until someone makes the change - log.Info("Failover is allowed, but user intervention is required") + msg := "Failover is allowed - User intervention is required" - return AllowFailover, nil + return AllowFailover, msg, nil } diff --git a/controllers/drplacementcontrol_controller_test.go b/controllers/drplacementcontrol_controller_test.go index 4f680f6db..605bf6ffd 100644 --- a/controllers/drplacementcontrol_controller_test.go +++ b/controllers/drplacementcontrol_controller_test.go @@ -1519,6 +1519,7 @@ func runFailoverAction(placementObj client.Object, fromCluster, toCluster string Expect(len(drpc.Status.Conditions)).To(Equal(2)) _, condition := getDRPCCondition(&drpc.Status, rmn.ConditionAvailable) Expect(condition.Reason).To(Equal(string(rmn.FailedOver))) + Expect(drpc.Status.ActionStartTime).ShouldNot(BeNil()) decision := getLatestUserPlacementDecision(placementObj.GetName(), placementObj.GetNamespace()) Expect(decision.ClusterName).To(Equal(toCluster)) @@ -2283,8 +2284,8 @@ var _ = Describe("DRPlacementControl Reconciler", func() { Specify("DRClusters", func() { populateDRClusters() }) - When("6 Applications deployed for the first time", func() { - It("Should deploy 6 drpcs", func() { + When("Application deployed for the first time", func() { + It("Should deploy drpc", func() { createNamespacesAsync(getNamespaceObj(DefaultDRPCNamespace)) createManagedClusters(asyncClusters) createDRClustersAsync() @@ -2309,7 +2310,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Secondary is back online --------- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-1 busybox-drpc 12h East1ManagedClus Deployed Completed True - When("DRAction is Initial deploy -- during hub recovery -> Secondary Down", func() { + When("HubRecovery: DRAction is Initial deploy -> Secondary Down", func() { It("Should reconstructs the DRPC state to completion. Primary is East1ManagedCluster", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) @@ -2333,32 +2334,31 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online --------- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-5 busybox-drpc 11h East1ManagedClus Deployed Completed 2023-12-20T12:52:20Z 5m32.467527356s True - When("DRAction is Initial deploy -- during hub recovery -> Primary Down", func() { - It("Should be able to reconstructs the DRPC state to completion. Primary East1ManagedCluster", func() { - setClusterDown(West1ManagedCluster) + When("HubRecovery: DRAction is Initial deploy -> Primary Down", func() { + It("Should pause and wait for user to trigger a failover. 
Primary East1ManagedCluster", func() { + setClusterDown(East1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() expectedAction := rmn.DRAction("") - expectedPhase := rmn.Deployed - exptectedPorgression := rmn.ProgressionCompleted + expectedPhase := rmn.WaitForUser + exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) - resetClusterDown() - exptectedCompleted := rmn.ProgressionCompleted - verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedCompleted) }) }) // Failover - When("DRAction is set to Failover - Hub Recovery", func() { + When("HubRecovery: DRAction is set to Failover -> primary cluster down", func() { It("Should failover to West1ManagedCluster", func() { from := East1ManagedCluster to := West1ManagedCluster + resetClusterDown() runFailoverAction(userPlacementRule1, from, to, false, false) - uploadVRGtoS3Store(DRPCCommonName, DefaultDRPCNamespace, West1ManagedCluster, - rmn.VRGAction(rmn.ActionFailover)) waitForDRPCPhaseAndProgression(DefaultDRPCNamespace, rmn.FailedOver) + uploadVRGtoS3Store(DRPCCommonName, DefaultDRPCNamespace, West1ManagedCluster, rmn.VRGActionFailover) + resetClusterDown() }) }) + //nolint:lll // -------- Before Hub Recovery Action FailedOver --- // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY @@ -2369,14 +2369,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-2 busybox-drpc 11h East1ManagedClus West1ManagedClu Failover FailedOver Completed True - When("DRAction is Failover -- during hub recovery -> Primary Down", func() { + When("HubRecovery: DRAction is Failover -> Primary Down", func() { It("Should Pause, but allows failover. 
Primary West1ManagedCluster", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() setDRPCSpecExpectationTo(DefaultDRPCNamespace, East1ManagedCluster, West1ManagedCluster, "") expectedAction := rmn.DRAction("") - expectedPhase := rmn.DRState("") + expectedPhase := rmn.WaitForUser exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) checkConditionAllowFailover(DefaultDRPCNamespace) @@ -2393,7 +2393,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { }) // Relocate - When("DRAction is set to Relocate - Hub Recovery", func() { + When("HubRecovery: DRAction is set to Relocate", func() { It("Should relocate to Primary (East1ManagedCluster)", func() { // ----------------------------- RELOCATION TO PRIMARY -------------------------------------- from := West1ManagedCluster @@ -2412,7 +2412,7 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-sample busybox-drpc 16h East1ManagedClus West1ManagedClu Relocate Relocated Completed True - When("DRAction is Relocate -- during hub recovery -> Secondary Down", func() { + When("HubRecovery: DRAction is Relocate -> Secondary Down", func() { It("Should Continue given the primary East1ManagedCluster is up", func() { setClusterDown(West1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) @@ -2442,14 +2442,14 @@ var _ = Describe("DRPlacementControl Reconciler", func() { // -------- After Primary is back online ------------ // NAMESPACE NAME AGE PREFERREDCLUSTER FAILOVERCLUSTER DESIREDSTATE CURRENTSTATE PROGRESSION START TIME DURATION PEER READY // busybox-samples-3 busybox-drpc 11h East1ManagedClus Relocate Relocated Completed True - When("DRAction is supposed to be Relocate -- during hub recovery -> Primary Down -> Action Cleared", func() { + When("HubRecovery: DRAction is supposed to be Relocate -> Primary Down -> Action Cleared", func() { It("Should Pause given the primary East1ManagedCluster is down, but allow failover", func() { setClusterDown(East1ManagedCluster) clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace) clearDRPCStatus() setDRPCSpecExpectationTo(DefaultDRPCNamespace, East1ManagedCluster, West1ManagedCluster, "") expectedAction := rmn.DRAction("") - expectedPhase := rmn.DRState("") + expectedPhase := rmn.WaitForUser exptectedPorgression := rmn.ProgressionActionPaused verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression) checkConditionAllowFailover(DefaultDRPCNamespace) @@ -2495,11 +2495,19 @@ var _ = Describe("DRPlacementControl Reconciler", func() { func verifyDRPCStateAndProgression(expectedAction rmn.DRAction, expectedPhase rmn.DRState, exptectedPorgression rmn.ProgressionStatus, ) { + var phase rmn.DRState + + var progression rmn.ProgressionStatus + Eventually(func() bool { drpc := getLatestDRPC(DefaultDRPCNamespace) + phase = drpc.Status.Phase + progression = drpc.Status.Progression - return drpc.Status.Phase == expectedPhase && drpc.Status.Progression == exptectedPorgression - }, timeout, time.Millisecond*1000).Should(BeTrue(), "Phase has not been updated yet!") + return phase == expectedPhase && progression == exptectedPorgression + }, timeout, 
time.Millisecond*1000).Should(BeTrue(), + fmt.Sprintf("Phase has not been updated yet! Phase:%s Expected:%s - progression:%s exptected:%s", + phase, expectedPhase, progression, exptectedPorgression)) drpc := getLatestDRPC(DefaultDRPCNamespace) Expect(drpc.Spec.Action).Should(Equal(expectedAction)) @@ -2525,8 +2533,7 @@ func checkConditionAllowFailover(namespace string) { return false }, timeout, interval).Should(BeTrue(), fmt.Sprintf("Condition '%+v'", availableCondition)) - Expect(drpc.Status.Phase).To(Equal(rmn.DRState(""))) - Expect(availableCondition.Message).Should(Equal("Failover allowed")) + Expect(drpc.Status.Phase).To(Equal(rmn.WaitForUser)) } func uploadVRGtoS3Store(name, namespace, dstCluster string, action rmn.VRGAction) { diff --git a/controllers/util/mcv_util.go b/controllers/util/mcv_util.go index f48ff5407..ece7a2a4d 100644 --- a/controllers/util/mcv_util.go +++ b/controllers/util/mcv_util.go @@ -62,7 +62,7 @@ type ManagedClusterViewGetterImpl struct { func (m ManagedClusterViewGetterImpl) GetVRGFromManagedCluster(resourceName, resourceNamespace, managedCluster string, annotations map[string]string, ) (*rmn.VolumeReplicationGroup, error) { - logger := ctrl.Log.WithName("MCV").WithValues("resourceName", resourceName) + logger := ctrl.Log.WithName("MCV").WithValues("resourceName", resourceName, "cluster", managedCluster) // get VRG and verify status through ManagedClusterView mcvMeta := metav1.ObjectMeta{ Name: BuildManagedClusterViewName(resourceName, resourceNamespace, "vrg"), @@ -228,7 +228,8 @@ func (m ManagedClusterViewGetterImpl) getManagedClusterResource( return errorswrapper.Wrap(err, "getManagedClusterResource failed") } - logger.Info(fmt.Sprintf("MCV Conditions: %v", mcv.Status.Conditions)) + logger.Info(fmt.Sprintf("Get managedClusterResource Returned the following MCV Conditions: %v", + mcv.Status.Conditions)) return m.GetResource(mcv, resource) } From 303b5ff7864fbe68920a4d43261157745dc7f355 Mon Sep 17 00:00:00 2001 From: rakeshgm Date: Thu, 24 Aug 2023 23:25:04 +0530 Subject: [PATCH 4/5] report better s3Errors use aws/awserr package to report better errors with correct format Signed-off-by: rakeshgm --- controllers/s3utils.go | 61 +++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/controllers/s3utils.go b/controllers/s3utils.go index 389c15086..a192fa245 100644 --- a/controllers/s3utils.go +++ b/controllers/s3utils.go @@ -8,6 +8,7 @@ import ( "compress/gzip" "context" "encoding/json" + "errors" "fmt" "io" "reflect" @@ -223,8 +224,9 @@ func (s *s3ObjectStore) CreateBucket(bucket string) (err error) { cbInput := &s3.CreateBucketInput{Bucket: &bucket} if err = cbInput.Validate(); err != nil { - return fmt.Errorf("create bucket input validation failed for %s, err %w", - bucket, err) + errMsgPrefix := fmt.Errorf("create bucket input validation failed for %s", bucket) + + return processAwsError(errMsgPrefix, err) } _, err = s.client.CreateBucket(cbInput) @@ -235,8 +237,8 @@ func (s *s3ObjectStore) CreateBucket(bucket string) (err error) { case s3.ErrCodeBucketAlreadyExists: case s3.ErrCodeBucketAlreadyOwnedByYou: default: - return fmt.Errorf("failed to create bucket %s, %w", - bucket, err) + return fmt.Errorf("failed to create bucket %s, %s: %s", + bucket, aerr.Code(), aerr.Message()) } } } @@ -264,14 +266,16 @@ func (s *s3ObjectStore) DeleteBucket(bucket string) ( dbInput := &s3.DeleteBucketInput{Bucket: &bucket} if err = dbInput.Validate(); err != nil { - return fmt.Errorf("delete bucket input validation failed 
for %s, err %w", - bucket, err) + errMsgPrefix := fmt.Errorf("delete bucket input validation failed for %s", bucket) + + return processAwsError(errMsgPrefix, err) } _, err = s.client.DeleteBucket(dbInput) if err != nil && !isAwsErrCodeNoSuchBucket(err) { - return fmt.Errorf("failed to delete bucket %s, %w", - bucket, err) + errMsgPrefix := fmt.Errorf("failed to delete bucket %s", bucket) + + return processAwsError(errMsgPrefix, err) } return nil @@ -300,9 +304,11 @@ func (s *s3ObjectStore) PurgeBucket(bucket string) ( return nil // Not an error } - return fmt.Errorf("unable to ListKeys "+ - "from endpoint %s bucket %s, %w", - s.s3Endpoint, bucket, err) + errMsgPrefix := fmt.Errorf("unable to ListKeys "+ + "from endpoint %s bucket %s", + s.s3Endpoint, bucket) + + return processAwsError(errMsgPrefix, err) } for _, key := range keys { @@ -381,6 +387,15 @@ func DeleteTypedObject(s ObjectStorer, keyPrefix, keySuffix string, object inter return s.DeleteObject(typedKey(keyPrefix, keySuffix, reflect.TypeOf(object))) } +func processAwsError(errMsgPrefix, err error) error { + var awsErr awserr.Error + if errors.As(err, &awsErr) { + return fmt.Errorf("%w: code: %s, message: %s", errMsgPrefix, awsErr.Code(), awsErr.Message()) + } + + return errMsgPrefix +} + // UploadObject uploads the given object to the bucket with the given key. // - OK to call UploadObject() concurrently from multiple goroutines safely. // - Upload may fail due to many reasons: RequestError (connection error), @@ -414,8 +429,9 @@ func (s *s3ObjectStore) UploadObject(key string, Key: &key, Body: encodedUploadContent, }); err != nil { - return fmt.Errorf("failed to upload data of %s:%s, %w", - bucket, key, err) + errMsgPrefix := fmt.Errorf("failed to upload data of %s:%s", bucket, key) + + return processAwsError(errMsgPrefix, err) } return nil @@ -506,9 +522,9 @@ func (s *s3ObjectStore) ListKeys(keyPrefix string) ( ContinuationToken: nextContinuationToken, }) if err != nil { - return nil, - fmt.Errorf("failed to list objects in bucket %s:%s, %w", - bucket, keyPrefix, err) + errMsgPrefix := fmt.Errorf("failed to list objects in bucket") + + return nil, processAwsError(errMsgPrefix, err) } for _, entry := range result.Contents { @@ -552,8 +568,9 @@ func (s *s3ObjectStore) DownloadObject(key string, Bucket: &bucket, Key: &key, }); err != nil { - return fmt.Errorf("failed to download data of %s:%s, %w", - bucket, key, err) + errMsgPrefix := fmt.Errorf("failed to download data of %s:%s", bucket, key) + + return processAwsError(errMsgPrefix, err) } gzReader, err := gzip.NewReader(bytes.NewReader(writerAt.Bytes())) @@ -594,9 +611,11 @@ func (s *s3ObjectStore) DeleteObjectsWithKeyPrefix(keyPrefix string) ( keys, err := s.ListKeys(keyPrefix) if err != nil { - return fmt.Errorf("unable to ListKeys in DeleteObjects "+ - "from endpoint %s bucket %s keyPrefix %s, %w", - s.s3Endpoint, bucket, keyPrefix, err) + errMsgPrefix := fmt.Errorf("unable to ListKeys in DeleteObjects "+ + "from endpoint %s bucket %s keyPrefix %s", + s.s3Endpoint, bucket, keyPrefix) + + return processAwsError(errMsgPrefix, err) } if err = s.DeleteObjects(keys...); err != nil { From 040137e88af0300e65c9d05719a84472e0b3ff4f Mon Sep 17 00:00:00 2001 From: rakeshgm Date: Mon, 22 Jan 2024 18:04:51 +0530 Subject: [PATCH 5/5] addressing comments Signed-off-by: rakeshgm --- controllers/s3utils.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/controllers/s3utils.go b/controllers/s3utils.go index a192fa245..1e3fa4bc2 100644 --- 
a/controllers/s3utils.go +++ b/controllers/s3utils.go @@ -597,8 +597,13 @@ func (s *s3ObjectStore) DeleteObject(key string) error { Bucket: aws.String(s.s3Bucket), Key: aws.String(key), }) + if err != nil { + errMsgPrefix := fmt.Errorf("failed to delete object %s", *aws.String(key)) + + return processAwsError(errMsgPrefix, err) + } - return err + return nil } // DeleteObjectsWithKeyPrefix deletes from the bucket any objects that @@ -643,9 +648,16 @@ func (s *s3ObjectStore) DeleteObjects(keys ...string) error { ctx, cancel := context.WithDeadline(context.TODO(), time.Now().Add(s3Timeout)) defer cancel() - return s.batchDeleter.Delete(ctx, &s3manager.DeleteObjectsIterator{ + err := s.batchDeleter.Delete(ctx, &s3manager.DeleteObjectsIterator{ Objects: delObjects, }) + if err != nil { + errMsgPrefix := fmt.Errorf("unable to process batch delete") + + return processAwsError(errMsgPrefix, err) + } + + return nil } // isAwsErrCodeNoSuchBucket returns true if the given input `err` has wrapped