Skip to content

Commit

Permalink
raft: added strict fencing
Browse files Browse the repository at this point in the history
The current leader fencing implementation didn't really guarantee that the old
leader would resign its leadership before a new leader could be elected.
That made it possible for several "leaders" to coexist in the cluster for some
time.

This commit changes replication_disconnect_timeout so that it is twice
as short for the current raft leader (2*replication_timeout) if strict
fencing is enabled. Assuming that replication_timeout is the same for every
replica in the replicaset, this guarantees that the leader will resign its
leadership before anyone can start elections.

Old fencing behaviour can be enabled by setting fencing to soft mode.
This is useful when connection death timeouts shouldn't be affected
(e.g. different replication_timeouts are set to prioritize some replicas
as leader over the others).

Closes #7110

@TarantoolBot document

Removed `box.cfg` option `election_fencing_enabled` in favor of
`election_fencing_mode`. `election_fencing_mode` can be set to one of
the following values:
'off' - fencing turned off (same as `election_fencing_enabled` set to false
before).
Connection death timeout is 4*replication_timeout for all nodes.

'soft' (default) - fencing turned on, but the connection death timeout is the
same for the leader and followers, so it doesn't guarantee that there is only
one leader in the cluster. This is enough to solve the cluster being read-only
and not being able to elect a new leader in some situations, because of pre-vote.
Connection death timeout is 4*replication_timeout for all nodes.

'strict' - fencing turned on. This mode guarantees that there is only one
leader at any moment, if replication_timeout is the same on all nodes.
Connection death timeout is 4*replication_timeout for followers and
2*replication_timeout for the current leader.
  • Loading branch information
grafin committed May 17, 2022
1 parent 1e13757 commit dbac4aa
Show file tree
Hide file tree
Showing 21 changed files with 239 additions and 71 deletions.
6 changes: 6 additions & 0 deletions changelogs/unreleased/strict_fencing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
## feature/raft

* Introduced strict fencing, which guarantees that there will be at most one
leader in the cluster at any moment in time. This is achieved by setting the
connection death timeout on the current leader to half the time compared to
followers (assuming replication_timeout is the same on every replica).
41 changes: 25 additions & 16 deletions src/box/box.cc
Original file line number Diff line number Diff line change
Expand Up @@ -835,18 +835,27 @@ box_check_election_timeout(void)
}

/**
* Raises error if election_fencing_enabled configuration is incorrect.
* Raises error if election_fencing_mode configuration is incorrect.
*/
static int
box_check_election_fencing_enabled(void)
static election_fencing_mode
box_check_election_fencing_mode(void)
{
int i = cfg_getb("election_fencing_enabled");
if (i < 0) {
diag_set(ClientError, ER_CFG, "election_fencing_enabled",
"the value must be a boolean");
return -1;
}
return i;
const char *mode = cfg_gets("election_fencing_mode");
if (mode == NULL)
goto error;

if (strcmp(mode, "off") == 0)
return ELECTION_FENCING_MODE_OFF;
else if (strcmp(mode, "soft") == 0)
return ELECTION_FENCING_MODE_SOFT;
else if (strcmp(mode, "strict") == 0)
return ELECTION_FENCING_MODE_STRICT;

error:
diag_set(ClientError, ER_CFG, "election_fencing_mode",
"the value must be one of the following strings: "
"'off', 'soft', 'strict'");
return ELECTION_FENCING_MODE_INVALID;
}

static int
Expand Down Expand Up @@ -1378,7 +1387,7 @@ box_check_config(void)
diag_raise();
if (box_check_election_timeout() < 0)
diag_raise();
if (box_check_election_fencing_enabled() < 0)
if (box_check_election_fencing_mode() < 0)
diag_raise();
if (box_check_replication() != 0)
diag_raise();
Expand Down Expand Up @@ -1439,12 +1448,12 @@ box_set_election_timeout(void)
}

int
box_set_election_fencing_enabled(void)
box_set_election_fencing_mode(void)
{
int enabled = box_check_election_fencing_enabled();
if (enabled < 0)
enum election_fencing_mode mode = box_check_election_fencing_mode();
if (mode == ELECTION_FENCING_MODE_INVALID)
return -1;
box_raft_set_election_fencing_enabled((bool)enabled);
box_raft_set_election_fencing_mode(mode);
return 0;
}

Expand Down Expand Up @@ -3978,7 +3987,7 @@ box_cfg_xc(void)

if (box_set_election_timeout() != 0)
diag_raise();
if (box_set_election_fencing_enabled() != 0)
if (box_set_election_fencing_mode() != 0)
diag_raise();
/*
* Election is enabled last. So as all the parameters are installed by
Expand Down
2 changes: 1 addition & 1 deletion src/box/box.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ void box_set_vinyl_cache(void);
void box_set_vinyl_timeout(void);
int box_set_election_mode(void);
int box_set_election_timeout(void);
int box_set_election_fencing_enabled(void);
int box_set_election_fencing_mode(void);
void box_set_replication_timeout(void);
void box_set_replication_connect_timeout(void);
void box_set_replication_connect_quorum(void);
Expand Down
6 changes: 3 additions & 3 deletions src/box/lua/cfg.cc
Original file line number Diff line number Diff line change
Expand Up @@ -299,9 +299,9 @@ lbox_cfg_set_election_timeout(struct lua_State *L)
}

static int
lbox_cfg_set_election_fencing_enabled(struct lua_State *L)
lbox_cfg_set_election_fencing_mode(struct lua_State *L)
{
if (box_set_election_fencing_enabled() != 0)
if (box_set_election_fencing_mode() != 0)
luaT_error(L);
return 0;
}
Expand Down Expand Up @@ -447,7 +447,7 @@ box_lua_cfg_init(struct lua_State *L)
{"cfg_set_vinyl_timeout", lbox_cfg_set_vinyl_timeout},
{"cfg_set_election_mode", lbox_cfg_set_election_mode},
{"cfg_set_election_timeout", lbox_cfg_set_election_timeout},
{"cfg_set_election_fencing_enabled", lbox_cfg_set_election_fencing_enabled},
{"cfg_set_election_fencing_mode", lbox_cfg_set_election_fencing_mode},
{"cfg_set_replication_timeout", lbox_cfg_set_replication_timeout},
{"cfg_set_replication_connect_quorum", lbox_cfg_set_replication_connect_quorum},
{"cfg_set_replication_connect_timeout", lbox_cfg_set_replication_connect_timeout},
Expand Down
10 changes: 5 additions & 5 deletions src/box/lua/load_cfg.lua
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ local default_cfg = {
worker_pool_threads = 4,
election_mode = 'off',
election_timeout = 5,
election_fencing_enabled = true,
election_fencing_mode = 'soft',
replication_timeout = 1,
replication_sync_lag = 10,
replication_sync_timeout = 300,
Expand Down Expand Up @@ -237,7 +237,7 @@ local template_cfg = {
worker_pool_threads = 'number',
election_mode = 'string',
election_timeout = 'number',
election_fencing_enabled = 'boolean',
election_fencing_mode = 'string',
replication_timeout = 'number',
replication_sync_lag = 'number',
replication_sync_timeout = 'number',
Expand Down Expand Up @@ -350,7 +350,7 @@ local dynamic_cfg = {
force_recovery = function() end,
election_mode = private.cfg_set_election_mode,
election_timeout = private.cfg_set_election_timeout,
election_fencing_enabled = private.cfg_set_election_fencing_enabled,
election_fencing_mode = private.cfg_set_election_fencing_mode,
replication_timeout = private.cfg_set_replication_timeout,
replication_connect_timeout = private.cfg_set_replication_connect_timeout,
replication_connect_quorum = private.cfg_set_replication_connect_quorum,
Expand Down Expand Up @@ -427,7 +427,7 @@ local dynamic_cfg_order = {
wal_cleanup_delay = 260,
election_mode = 300,
election_timeout = 320,
election_fencing_enabled = 320,
election_fencing_mode = 320,
}

local function sort_cfg_cb(l, r)
Expand All @@ -447,7 +447,7 @@ local dynamic_cfg_skip_at_load = {
too_long_threshold = true,
election_mode = true,
election_timeout = true,
election_fencing_enabled = true,
election_fencing_mode = true,
replication = true,
replication_timeout = true,
replication_connect_timeout = true,
Expand Down
34 changes: 24 additions & 10 deletions src/box/raft.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,8 @@ struct raft box_raft_global = {

enum election_mode box_election_mode = ELECTION_MODE_INVALID;

/**
* Flag whether Raft leader fencing is enabled. If enabled leader will
* resign when it looses quorum for any reason.
*/
static bool election_fencing_enabled = true;
enum election_fencing_mode box_election_fencing_mode =
ELECTION_FENCING_MODE_SOFT;

/**
* A trigger executed each time the Raft state machine updates any
Expand Down Expand Up @@ -284,7 +281,8 @@ box_raft_fence(void)
{
struct raft *raft = box_raft();
if (!raft->is_enabled || raft->state != RAFT_STATE_LEADER ||
!election_fencing_enabled || box_raft_election_fencing_paused)
box_election_fencing_mode == ELECTION_FENCING_MODE_OFF ||
box_raft_election_fencing_paused)
return;

txn_limbo_freeze(&txn_limbo);
Expand Down Expand Up @@ -558,11 +556,27 @@ box_raft_remove_quorum_triggers(void)
}

void
box_raft_set_election_fencing_enabled(bool enabled)
box_raft_set_election_fencing_mode(enum election_fencing_mode mode)
{
election_fencing_enabled = enabled;
say_info("RAFT: fencing %s", enabled ? "enabled" : "disabled");
if (!enabled)
if (box_election_fencing_mode == mode)
return;

box_election_fencing_mode = mode;
switch (box_election_fencing_mode) {
case ELECTION_FENCING_MODE_OFF:
say_info("RAFT: disabled fencing");
break;
case ELECTION_FENCING_MODE_SOFT:
say_info("RAFT: enabled soft fencing");
break;
case ELECTION_FENCING_MODE_STRICT:
say_info("RAFT: enabled strict fencing");
break;
default:
unreachable();
}

if (box_election_fencing_mode == ELECTION_FENCING_MODE_OFF)
txn_limbo_unfreeze(&txn_limbo);
replicaset_on_health_change();
}
Expand Down
26 changes: 21 additions & 5 deletions src/box/raft.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ enum election_mode {
ELECTION_MODE_CANDIDATE = 3,
};

/**
 * Leader fencing mode, configured through the election_fencing_mode
 * option ('off', 'soft' or 'strict').
 */
enum election_fencing_mode {
/** Not a real mode: returned by the config check for an unrecognized value. */
ELECTION_FENCING_MODE_INVALID = -1,
/** Fencing is off: the leader does not resign when quorum is lost. */
ELECTION_FENCING_MODE_OFF = 0,
/**
 * The leader resigns when quorum is lost, but not necessarily before
 * another leader is elected elsewhere in the cluster.
 */
ELECTION_FENCING_MODE_SOFT = 1,
/**
 * The leader resigns when quorum is lost and uses a shorter connection
 * death timeout (2 * replication_timeout instead of 4 *), so that it
 * resigns before automatic elections can start, assuming
 * replication_timeout is the same on every replica.
 */
ELECTION_FENCING_MODE_STRICT = 2,
};

struct raft_request;

/**
Expand All @@ -61,6 +68,18 @@ struct raft_request;
*/
extern enum election_mode box_election_mode;

/**
* box_election_fencing_mode - current leader fencing mode:
* OFF - leader won't resign leadership when quorum is lost;
* SOFT (default) - leader will resign leadership when quorum is lost,
* doesn't guarantee that leader will resign before another leader is
* elected in cluster;
* STRICT - leader will resign leadership when quorum is lost, it will resign
* before automatic elections can start in any part of cluster (assuming
* replication_timeout is same on every replica).
*/
extern enum election_fencing_mode box_election_fencing_mode;

/** Raft state of this instance. */
static inline struct raft *
box_raft(void)
Expand Down Expand Up @@ -115,12 +134,9 @@ box_raft_wait_term_outcome(void);
int
box_raft_wait_term_persisted(void);

/**
* Enable/disable fencing. If enabled: instance will resign its leader role,
* when it looses quorum.
*/
/** Set the node's election_fencing_mode to @a mode. */
void
box_raft_set_election_fencing_enabled(bool enabled);
box_raft_set_election_fencing_mode(enum election_fencing_mode mode);

/**
* Pause fencing. Instance will not resign its leader role when it looses
Expand Down
10 changes: 10 additions & 0 deletions src/box/replication.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ rb_gen(MAYBE_UNUSED static, replica_hash_, replica_hash_t,
item != NULL && ((next = replica_hash_next(hash, item)) || 1); \
item = next)

double
replication_disconnect_timeout(void)
{
struct raft *raft = box_raft();
if (raft && raft->state == RAFT_STATE_LEADER &&
box_election_fencing_mode == ELECTION_FENCING_MODE_STRICT)
return replication_timeout * 2;
return replication_timeout * 4;
}

/**
* Return the number of replicas that have to be synchronized
* in order to form a quorum in the replica set.
Expand Down
9 changes: 4 additions & 5 deletions src/box/replication.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,11 @@ replication_reconnect_interval(void)
/**
* Disconnect a replica if no heartbeat message has been
* received from it within the given period.
* This timeout is different for the replicaset leader. This is needed for
* strict fencing, where leader uniqueness is guaranteed.
*/
static inline double
replication_disconnect_timeout(void)
{
return replication_timeout * 4;
}
double
replication_disconnect_timeout(void);

void
replication_init(int num_threads);
Expand Down
2 changes: 1 addition & 1 deletion test/app-tap/init_script.result
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ checkpoint_count:2
checkpoint_interval:3600
checkpoint_wal_threshold:1e+18
coredump:false
election_fencing_enabled:true
election_fencing_mode:soft
election_mode:off
election_timeout:5
feedback_crashinfo:true
Expand Down
4 changes: 2 additions & 2 deletions test/box/admin.result
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ cfg_filter(box.cfg)
- 1000000000000000000
- - coredump
- false
- - election_fencing_enabled
- true
- - election_fencing_mode
- soft
- - election_mode
- off
- - election_timeout
Expand Down
8 changes: 4 additions & 4 deletions test/box/cfg.result
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ cfg_filter(box.cfg)
| - 1000000000000000000
| - - coredump
| - false
| - - election_fencing_enabled
| - true
| - - election_fencing_mode
| - soft
| - - election_mode
| - off
| - - election_timeout
Expand Down Expand Up @@ -190,8 +190,8 @@ cfg_filter(box.cfg)
| - 1000000000000000000
| - - coredump
| - false
| - - election_fencing_enabled
| - true
| - - election_fencing_mode
| - soft
| - - election_mode
| - off
| - - election_timeout
Expand Down
6 changes: 6 additions & 0 deletions test/luatest_helpers/server.lua
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ function Server:wait_for_readiness()
end)
end

--- Wait until the server reports the given election state.
-- The check is executed on the server through self.exec and compares
-- box.info.election.state with the requested value.
-- @param state expected value of box.info.election.state
function Server:wait_election_state(state)
    -- Must not capture upvalues: the predicate is shipped to the server,
    -- so the expected state is passed as an explicit argument instead.
    local function state_matches(expected)
        return box.info.election.state == expected
    end
    return wait_cond('election state', self, self.exec, self,
                     state_matches, {state})
end

function Server:wait_election_leader()
-- Include read-only property too because if an instance is a leader, it
-- does not mean it finished the synchro queue ownership transition. It is
Expand Down
Loading

0 comments on commit dbac4aa

Please sign in to comment.