Skip to content

Commit

Permalink
Manual failover vote is not limited by two times the node timeout (#1305
Browse files Browse the repository at this point in the history
)

This limit should not restrict manual failover, otherwise in some
scenarios, manual failover will time out.

For example, if some FAILOVER_AUTH_REQUESTs or some FAILOVER_AUTH_ACKs
are lost during a manual failover, it cannot vote in the second manual
failover. Or in a mixed scenario of plain failover and manual failover,
it cannot vote for the subsequent manual failover.

The problem with the manual failover retry is that the mf will pause
the client 5s in the primary side. So every retry every manual failover
timed out is a bad move.

---------

Signed-off-by: Binbin <[email protected]>
Co-authored-by: Viktor Söderqvist <[email protected]>
  • Loading branch information
enjoy-binbin and zuiderkwast authored Nov 19, 2024
1 parent 132798b commit ee386c9
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 6 deletions.
15 changes: 10 additions & 5 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -4363,12 +4363,17 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {

/* We did not voted for a replica about this primary for two
* times the node timeout. This is not strictly needed for correctness
* of the algorithm but makes the base case more linear. */
if (mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) {
* of the algorithm but makes the base case more linear.
*
* This limitation does not restrict manual failover. If a user initiates
* a manual failover, we need to allow it to vote, otherwise the manual
* failover may time out. */
if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) {
serverLog(LL_WARNING,
"Failover auth denied to %.40s %s: "
"can't vote about this primary before %lld milliseconds",
"Failover auth denied to %.40s (%s): "
"can't vote for any replica of %.40s (%s) within %lld milliseconds",
node->name, node->human_nodename,
node->replicaof->name, node->replicaof->human_nodename,
(long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time)));
return;
}
Expand All @@ -4394,7 +4399,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {

/* We can vote for this replica. */
server.cluster->lastVoteEpoch = server.cluster->currentEpoch;
node->replicaof->voted_time = mstime();
if (!force_ack) node->replicaof->voted_time = mstime();
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
clusterSendFailoverAuth(node);
serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename,
Expand Down
3 changes: 2 additions & 1 deletion src/cluster_legacy.h
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,8 @@ struct _clusterNode {
mstime_t pong_received; /* Unix time we received the pong */
mstime_t data_received; /* Unix time we received any data */
mstime_t fail_time; /* Unix time when FAIL flag was set */
mstime_t voted_time; /* Last time we voted for a replica of this primary */
mstime_t voted_time; /* Last time we voted for a replica of this primary in non manual
* failover scenarios. */
mstime_t repl_offset_time; /* Unix time we received offset for this node */
mstime_t orphaned_time; /* Starting time of orphaned primary condition */
long long repl_offset; /* Last known repl offset for this node. */
Expand Down
1 change: 1 addition & 0 deletions tests/support/cluster_util.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ proc wait_for_cluster_size {cluster_size} {
# Check that cluster nodes agree about "state", or raise an error.
proc wait_for_cluster_state {state} {
for {set j 0} {$j < [llength $::servers]} {incr j} {
if {[process_is_paused [srv -$j pid]]} continue
wait_for_condition 1000 50 {
[CI $j cluster_state] eq $state
} else {
Expand Down
88 changes: 88 additions & 0 deletions tests/unit/cluster/manual-failover.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,91 @@ test "Wait for instance #0 to return back alive" {
}

} ;# start_cluster

start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
test "Manual failover vote is not limited by two times the node timeout - drop the auth ack" {
set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6
set CLUSTER_PACKET_TYPE_NONE -1

# Setting a large timeout to make sure we hit the voted_time limit.
R 0 config set cluster-node-timeout 150000
R 1 config set cluster-node-timeout 150000
R 2 config set cluster-node-timeout 150000

# Let replica drop FAILOVER_AUTH_ACK so that the election won't
# get the enough votes and the election will time out.
R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK

# The first manual failover will time out.
R 3 cluster failover
wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50
wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50

# Undo packet drop, so that replica can win the next election.
R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE

# Make sure the second manual failover will work.
R 3 cluster failover
wait_for_condition 1000 50 {
[s 0 role] eq {slave} &&
[s -3 role] eq {master}
} else {
fail "The second failover does not happen"
}
wait_for_cluster_propagation
}
} ;# start_cluster

start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
test "Manual failover vote is not limited by two times the node timeout - mixed failover" {
# Make sure the failover is triggered by us.
R 1 config set cluster-replica-validity-factor 0
R 3 config set cluster-replica-no-failover yes
R 3 config set cluster-replica-validity-factor 0

# Pause the primary.
pause_process [srv 0 pid]
wait_for_cluster_state fail

# Setting a large timeout to make sure we hit the voted_time limit.
R 1 config set cluster-node-timeout 150000
R 2 config set cluster-node-timeout 150000

# R 3 performs an automatic failover and it will work.
R 3 config set cluster-replica-no-failover no
wait_for_condition 1000 50 {
[s -3 role] eq {master}
} else {
fail "The first failover does not happen"
}

# Resume the primary and wait for it to become a replica.
resume_process [srv 0 pid]
wait_for_condition 1000 50 {
[s 0 role] eq {slave}
} else {
fail "Old primary not converted into replica"
}
wait_for_cluster_propagation

# The old primary doing a manual failover and wait for it.
R 0 cluster failover
wait_for_condition 1000 50 {
[s 0 role] eq {master} &&
[s -3 role] eq {slave}
} else {
fail "The second failover does not happen"
}
wait_for_cluster_propagation

# R 3 performs a manual failover and it will work.
R 3 cluster failover
wait_for_condition 1000 50 {
[s 0 role] eq {slave} &&
[s -3 role] eq {master}
} else {
fail "The third falover does not happen"
}
wait_for_cluster_propagation
}
} ;# start_cluster

0 comments on commit ee386c9

Please sign in to comment.