From ee386c92ffa9724771e4980064fa279655e46f90 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Wed, 20 Nov 2024 00:17:20 +0800
Subject: [PATCH] Manual failover vote is not limited by two times the node
 timeout (#1305)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This limit should not restrict manual failover, otherwise in some
scenarios, manual failover will time out.

For example, if some FAILOVER_AUTH_REQUESTs or some FAILOVER_AUTH_ACKs
are lost during a manual failover, it cannot vote in the second manual
failover. Or in a mixed scenario of plain failover and manual failover,
it cannot vote for the subsequent manual failover.

The problem with the manual failover retry is that the mf will pause
the client 5s in the primary side. So every retry every manual failover
timed out is a bad move.

---------

Signed-off-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
---
 src/cluster_legacy.c                   | 15 +++--
 src/cluster_legacy.h                   |  3 +-
 tests/support/cluster_util.tcl         |  1 +
 tests/unit/cluster/manual-failover.tcl | 88 ++++++++++++++++++++++++++
 4 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 69af65f1e8..7b3384ee9f 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4363,12 +4363,17 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
 
     /* We did not voted for a replica about this primary for two
      * times the node timeout. This is not strictly needed for correctness
-     * of the algorithm but makes the base case more linear. */
-    if (mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) {
+     * of the algorithm but makes the base case more linear.
+     *
+     * This limitation does not restrict manual failover. If a user initiates
+     * a manual failover, we need to allow it to vote, otherwise the manual
+     * failover may time out. */
+    if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) {
         serverLog(LL_WARNING,
-                  "Failover auth denied to %.40s %s: "
-                  "can't vote about this primary before %lld milliseconds",
+                  "Failover auth denied to %.40s (%s): "
+                  "can't vote for any replica of %.40s (%s) within %lld milliseconds",
                   node->name, node->human_nodename,
+                  node->replicaof->name, node->replicaof->human_nodename,
                   (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time)));
         return;
     }
@@ -4394,7 +4399,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
 
     /* We can vote for this replica. */
     server.cluster->lastVoteEpoch = server.cluster->currentEpoch;
-    node->replicaof->voted_time = mstime();
+    if (!force_ack) node->replicaof->voted_time = mstime();
     clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
     clusterSendFailoverAuth(node);
     serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename,
diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
index 5280644e6e..2c3e1d83c8 100644
--- a/src/cluster_legacy.h
+++ b/src/cluster_legacy.h
@@ -338,7 +338,8 @@ struct _clusterNode {
     mstime_t pong_received;                 /* Unix time we received the pong */
     mstime_t data_received;                 /* Unix time we received any data */
     mstime_t fail_time;                     /* Unix time when FAIL flag was set */
-    mstime_t voted_time;                    /* Last time we voted for a replica of this primary */
+    mstime_t voted_time;                    /* Last time we voted for a replica of this primary in non manual
+                                             * failover scenarios. */
     mstime_t repl_offset_time;              /* Unix time we received offset for this node */
     mstime_t orphaned_time;                 /* Starting time of orphaned primary condition */
     long long repl_offset;                  /* Last known repl offset for this node. */
diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl
index 4b399214b9..686f00071b 100644
--- a/tests/support/cluster_util.tcl
+++ b/tests/support/cluster_util.tcl
@@ -145,6 +145,7 @@ proc wait_for_cluster_size {cluster_size} {
 # Check that cluster nodes agree about "state", or raise an error.
 proc wait_for_cluster_state {state} {
     for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv -$j pid]]} continue
         wait_for_condition 1000 50 {
             [CI $j cluster_state] eq $state
         } else {
diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl
index 2a9dff934b..78842068fa 100644
--- a/tests/unit/cluster/manual-failover.tcl
+++ b/tests/unit/cluster/manual-failover.tcl
@@ -183,3 +183,91 @@ test "Wait for instance #0 to return back alive" {
 }
 
 } ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Manual failover vote is not limited by two times the node timeout - drop the auth ack" {
+        set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6
+        set CLUSTER_PACKET_TYPE_NONE -1
+
+        # Setting a large timeout to make sure we hit the voted_time limit.
+        R 0 config set cluster-node-timeout 150000
+        R 1 config set cluster-node-timeout 150000
+        R 2 config set cluster-node-timeout 150000
+
+        # Let replica drop FAILOVER_AUTH_ACK so that the election won't
+        # get the enough votes and the election will time out.
+        R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK
+
+        # The first manual failover will time out.
+        R 3 cluster failover
+        wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50
+        wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50
+
+        # Undo packet drop, so that replica can win the next election.
+        R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE
+
+        # Make sure the second manual failover will work.
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+    }
+} ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Manual failover vote is not limited by two times the node timeout - mixed failover" {
+        # Make sure the failover is triggered by us.
+        R 1 config set cluster-replica-validity-factor 0
+        R 3 config set cluster-replica-no-failover yes
+        R 3 config set cluster-replica-validity-factor 0
+
+        # Pause the primary.
+        pause_process [srv 0 pid]
+        wait_for_cluster_state fail
+
+        # Setting a large timeout to make sure we hit the voted_time limit.
+        R 1 config set cluster-node-timeout 150000
+        R 2 config set cluster-node-timeout 150000
+
+        # R 3 performs an automatic failover and it will work.
+        R 3 config set cluster-replica-no-failover no
+        wait_for_condition 1000 50 {
+            [s -3 role] eq {master}
+        } else {
+            fail "The first failover does not happen"
+        }
+
+        # Resume the primary and wait for it to become a replica.
+        resume_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave}
+        } else {
+            fail "Old primary not converted into replica"
+        }
+        wait_for_cluster_propagation
+
+        # The old primary doing a manual failover and wait for it.
+        R 0 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {master} &&
+            [s -3 role] eq {slave}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        # R 3 performs a manual failover and it will work.
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The third falover does not happen"
+        }
+        wait_for_cluster_propagation
+    }
+} ;# start_cluster