Manual failover vote is not limited by two times the node timeout (#1305

) This limit should not restrict manual failover, otherwise in some scenarios, manual failover will time out. For example, if some FAILOVER_AUTH_REQUESTs or some FAILOVER_AUTH_ACKs are lost during a manual failover, it cannot vote in the second manual failover. Or in a mixed scenario of plain failover and manual failover, it cannot vote for the subsequent manual failover. The problem with the manual failover retry is that the mf will pause the client 5s in the primary side. So every retry every manual failover timed out is a bad move. --------- Signed-off-by: Binbin <[email protected]> Co-authored-by: Viktor Söderqvist <[email protected]>
valkey-io · Nov 19, 2024 · ee386c9 · ee386c9
1 parent 132798b
commit ee386c9
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 6 deletions.
diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
@@ -4363,12 +4363,17 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
 
     /* We did not voted for a replica about this primary for two
      * times the node timeout. This is not strictly needed for correctness
-     * of the algorithm but makes the base case more linear. */
-    if (mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) {
+     * of the algorithm but makes the base case more linear.
+     *
+     * This limitation does not restrict manual failover. If a user initiates
+     * a manual failover, we need to allow it to vote, otherwise the manual
+     * failover may time out. */
+    if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) {
         serverLog(LL_WARNING,
-                  "Failover auth denied to %.40s %s: "
-                  "can't vote about this primary before %lld milliseconds",
+                  "Failover auth denied to %.40s (%s): "
+                  "can't vote for any replica of %.40s (%s) within %lld milliseconds",
                   node->name, node->human_nodename,
+                  node->replicaof->name, node->replicaof->human_nodename,
                   (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time)));
         return;
     }
@@ -4394,7 +4399,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
 
     /* We can vote for this replica. */
     server.cluster->lastVoteEpoch = server.cluster->currentEpoch;
-    node->replicaof->voted_time = mstime();
+    if (!force_ack) node->replicaof->voted_time = mstime();
     clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
     clusterSendFailoverAuth(node);
     serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename,

diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
@@ -338,7 +338,8 @@ struct _clusterNode {
     mstime_t pong_received;                 /* Unix time we received the pong */
     mstime_t data_received;                 /* Unix time we received any data */
     mstime_t fail_time;                     /* Unix time when FAIL flag was set */
-    mstime_t voted_time;                    /* Last time we voted for a replica of this primary */
+    mstime_t voted_time;                    /* Last time we voted for a replica of this primary in non manual
+                                             * failover scenarios. */
     mstime_t repl_offset_time;              /* Unix time we received offset for this node */
     mstime_t orphaned_time;                 /* Starting time of orphaned primary condition */
     long long repl_offset;                  /* Last known repl offset for this node. */

diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl
@@ -145,6 +145,7 @@ proc wait_for_cluster_size {cluster_size} {
 # Check that cluster nodes agree about "state", or raise an error.
 proc wait_for_cluster_state {state} {
     for {set j 0} {$j < [llength $::servers]} {incr j} {
+        if {[process_is_paused [srv -$j pid]]} continue
         wait_for_condition 1000 50 {
             [CI $j cluster_state] eq $state
         } else {

diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl
@@ -183,3 +183,91 @@ test "Wait for instance #0 to return back alive" {
 }
 
 } ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Manual failover vote is not limited by two times the node timeout - drop the auth ack" {
+        set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6
+        set CLUSTER_PACKET_TYPE_NONE -1
+
+        # Setting a large timeout to make sure we hit the voted_time limit.
+        R 0 config set cluster-node-timeout 150000
+        R 1 config set cluster-node-timeout 150000
+        R 2 config set cluster-node-timeout 150000
+
+        # Let replica drop FAILOVER_AUTH_ACK so that the election won't
+        # get the enough votes and the election will time out.
+        R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK
+
+        # The first manual failover will time out.
+        R 3 cluster failover
+        wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50
+        wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50
+
+        # Undo packet drop, so that replica can win the next election.
+        R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE
+
+        # Make sure the second manual failover will work.
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+    }
+} ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Manual failover vote is not limited by two times the node timeout - mixed failover" {
+        # Make sure the failover is triggered by us.
+        R 1 config set cluster-replica-validity-factor 0
+        R 3 config set cluster-replica-no-failover yes
+        R 3 config set cluster-replica-validity-factor 0
+
+        # Pause the primary.
+        pause_process [srv 0 pid]
+        wait_for_cluster_state fail
+
+        # Setting a large timeout to make sure we hit the voted_time limit.
+        R 1 config set cluster-node-timeout 150000
+        R 2 config set cluster-node-timeout 150000
+
+        # R 3 performs an automatic failover and it will work.
+        R 3 config set cluster-replica-no-failover no
+        wait_for_condition 1000 50 {
+            [s -3 role] eq {master}
+        } else {
+            fail "The first failover does not happen"
+        }
+
+        # Resume the primary and wait for it to become a replica.
+        resume_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave}
+        } else {
+            fail "Old primary not converted into replica"
+        }
+        wait_for_cluster_propagation
+
+        # The old primary doing a manual failover and wait for it.
+        R 0 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {master} &&
+            [s -3 role] eq {slave}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        # R 3 performs a manual failover and it will work.
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The third falover does not happen"
+        }
+        wait_for_cluster_propagation
+    }
+} ;# start_cluster