diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 69af65f1e8..7b3384ee9f 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4363,12 +4363,17 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We did not voted for a replica about this primary for two * times the node timeout. This is not strictly needed for correctness - * of the algorithm but makes the base case more linear. */ - if (mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { + * of the algorithm but makes the base case more linear. + * + * This limitation does not restrict manual failover. If a user initiates + * a manual failover, we need to allow it to vote, otherwise the manual + * failover may time out. */ + if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { serverLog(LL_WARNING, - "Failover auth denied to %.40s %s: " - "can't vote about this primary before %lld milliseconds", + "Failover auth denied to %.40s (%s): " + "can't vote for any replica of %.40s (%s) within %lld milliseconds", node->name, node->human_nodename, + node->replicaof->name, node->replicaof->human_nodename, (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time))); return; } @@ -4394,7 +4399,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this replica. */ server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - node->replicaof->voted_time = mstime(); + if (!force_ack) node->replicaof->voted_time = mstime(); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); clusterSendFailoverAuth(node); serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename, diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5280644e6e..2c3e1d83c8 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -338,7 +338,8 @@ struct _clusterNode { mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a replica of this primary */ + mstime_t voted_time; /* Last time we voted for a replica of this primary in non manual + * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ long long repl_offset; /* Last known repl offset for this node. */ diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 4b399214b9..686f00071b 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -145,6 +145,7 @@ proc wait_for_cluster_size {cluster_size} { # Check that cluster nodes agree about "state", or raise an error. proc wait_for_cluster_state {state} { for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv -$j pid]]} continue wait_for_condition 1000 50 { [CI $j cluster_state] eq $state } else { diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 2a9dff934b..78842068fa 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -183,3 +183,91 @@ test "Wait for instance #0 to return back alive" { } } ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} { + test "Manual failover vote is not limited by two times the node timeout - drop the auth ack" { + set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6 + set CLUSTER_PACKET_TYPE_NONE -1 + + # Setting a large timeout to make sure we hit the voted_time limit. + R 0 config set cluster-node-timeout 150000 + R 1 config set cluster-node-timeout 150000 + R 2 config set cluster-node-timeout 150000 + + # Let replica drop FAILOVER_AUTH_ACK so that the election won't + # get the enough votes and the election will time out. + R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK + + # The first manual failover will time out. + R 3 cluster failover + wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50 + wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50 + + # Undo packet drop, so that replica can win the next election. + R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + + # Make sure the second manual failover will work. + R 3 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && + [s -3 role] eq {master} + } else { + fail "The second failover does not happen" + } + wait_for_cluster_propagation + } +} ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} { + test "Manual failover vote is not limited by two times the node timeout - mixed failover" { + # Make sure the failover is triggered by us. + R 1 config set cluster-replica-validity-factor 0 + R 3 config set cluster-replica-no-failover yes + R 3 config set cluster-replica-validity-factor 0 + + # Pause the primary. + pause_process [srv 0 pid] + wait_for_cluster_state fail + + # Setting a large timeout to make sure we hit the voted_time limit. + R 1 config set cluster-node-timeout 150000 + R 2 config set cluster-node-timeout 150000 + + # R 3 performs an automatic failover and it will work. + R 3 config set cluster-replica-no-failover no + wait_for_condition 1000 50 { + [s -3 role] eq {master} + } else { + fail "The first failover does not happen" + } + + # Resume the primary and wait for it to become a replica. + resume_process [srv 0 pid] + wait_for_condition 1000 50 { + [s 0 role] eq {slave} + } else { + fail "Old primary not converted into replica" + } + wait_for_cluster_propagation + + # The old primary doing a manual failover and wait for it. + R 0 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {master} && + [s -3 role] eq {slave} + } else { + fail "The second failover does not happen" + } + wait_for_cluster_propagation + + # R 3 performs a manual failover and it will work. + R 3 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && + [s -3 role] eq {master} + } else { + fail "The third falover does not happen" + } + wait_for_cluster_propagation + } +} ;# start_cluster