From 95610ac59a550c246731eee4f5de6c62493e66cf Mon Sep 17 00:00:00 2001 From: Michal Kuratczyk Date: Wed, 19 Feb 2025 13:15:12 +0100 Subject: [PATCH 1/2] Switch from `slave` to `peer` module --- test/coordination_SUITE.erl | 213 ++++++++++++++++++++--------------- test/erlang_node_helpers.erl | 32 ++---- test/partitions_SUITE.erl | 12 +- test/ra_SUITE.erl | 18 +-- test/ra_system_SUITE.erl | 41 ++++--- 5 files changed, 169 insertions(+), 147 deletions(-) diff --git a/test/coordination_SUITE.erl b/test/coordination_SUITE.erl index 66ab47c7..15f681e0 100644 --- a/test/coordination_SUITE.erl +++ b/test/coordination_SUITE.erl @@ -98,7 +98,7 @@ conf({Name, _Node} = NodeId, Nodes) -> start_stop_restart_delete_on_remote(Config) -> PrivDir = ?config(data_dir, Config), - S1 = start_follower(s1, PrivDir), + {S1, P1} = start_peer(s1, PrivDir), % ensure application is started ServerId = {c1, S1}, Conf = conf(ServerId, [ServerId]), @@ -116,14 +116,14 @@ start_stop_restart_delete_on_remote(Config) -> ok = ra:force_delete_server(?SYS, ServerId), % idempotency ok = ra:force_delete_server(?SYS, ServerId), - stop_nodes([ServerId]), - slave:stop(S1), + peer:stop(P1), ok. start_cluster(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3,s4,s5]], + Peers = start_peers([s1,s2,s3,s4,s5], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds), % assert all were said to be started @@ -133,13 +133,14 @@ start_cluster(Config) -> PingResults = [{pong, _} = ra_server_proc:ping(N, 500) || N <- ServerIds], % assert one node is leader ?assert(lists:any(fun ({pong, S}) -> S =:= leader end, PingResults)), - stop_nodes(ServerIds), + stop_peers(Peers), ok. start_or_restart_cluster(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, %% this should start {ok, Started, []} = ra:start_or_restart_cluster(?SYS, ClusterName, Machine, @@ -150,8 +151,8 @@ start_or_restart_cluster(Config) -> PingResults = [{pong, _} = ra_server_proc:ping(N, 500) || N <- ServerIds], % assert one node is leader ?assert(lists:any(fun ({pong, S}) -> S =:= leader end, PingResults)), - [ok = slave:stop(S) || {_, S} <- ServerIds], - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + stop_peers(Peers), + Peers2 = start_peers([s1,s2,s3], PrivDir), %% this should restart {ok, Started2, []} = ra:start_or_restart_cluster(?SYS, ClusterName, Machine, ServerIds), @@ -160,13 +161,14 @@ start_or_restart_cluster(Config) -> PingResults2 = [{pong, _} = ra_server_proc:ping(N, 500) || N <- ServerIds], % assert one node is leader ?assert(lists:any(fun ({pong, S}) -> S =:= leader end, PingResults2)), - stop_nodes(ServerIds), + stop_peers(Peers2), ok. delete_one_server_cluster(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1]], + Peers1 = start_peers([s1], PrivDir), + ServerIds = server_ids(ClusterName, Peers1), Machine = {module, ?MODULE, #{}}, {ok, _, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds), [{_, Node}] = ServerIds, @@ -179,9 +181,9 @@ delete_one_server_cluster(Config) -> [] = [F || F <- filelib:wildcard(Wc), filelib:is_dir(F)], {error, _} = ra_server_proc:ping(hd(ServerIds), 50), % assert all nodes are actually started - [ok = slave:stop(S) || {_, S} <- ServerIds], + stop_peers(Peers1), % restart node - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1]], + Peers2 = start_peers([s1], PrivDir), receive Anything -> ct:pal("got weird message ~p", [Anything]), @@ -195,13 +197,15 @@ delete_one_server_cluster(Config) -> undefined = rpc:call(Node, ra_log_meta, fetch, [ra_log_meta, UId, current_term]), ct:pal("Files ~p", [Files]), [] = Files, - stop_nodes(ServerIds), + stop_peers(Peers1), + stop_peers(Peers2), ok. delete_two_server_cluster(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2]], + Peers = start_peers([s1,s2], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, _, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds), {ok, _} = ra:delete_cluster(ServerIds), @@ -212,7 +216,7 @@ delete_two_server_cluster(Config) -> undefined == erpc:call(Node, erlang, whereis, [Name]) end, ServerIds) end, 100), - stop_nodes(ServerIds), + stop_peers(Peers), receive Anything -> ct:pal("got weird message ~p", [Anything]), @@ -225,7 +229,8 @@ delete_two_server_cluster(Config) -> delete_three_server_cluster(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, _, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds), {ok, _} = ra:delete_cluster(ServerIds), @@ -236,13 +241,14 @@ delete_three_server_cluster(Config) -> undefined == erpc:call(Node, erlang, whereis, [Name]) end, ServerIds) end, 100), - stop_nodes(ServerIds), + stop_peers(Peers), ok. delete_three_server_cluster_parallel(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, _, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds), %% spawn a delete command to try cause it to commit more than @@ -261,7 +267,7 @@ delete_three_server_cluster_parallel(Config) -> true = rpc:call(S, ?MODULE, check_sup, []) end || {_, S} <- ServerIds], % assert all nodes are actually started - stop_nodes(ServerIds), + stop_peers(Peers), ok. check_sup() -> @@ -270,7 +276,8 @@ check_sup() -> start_cluster_majority(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds0 = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2]], + Peers = start_peers([s1,s2], PrivDir), + ServerIds0 = server_ids(ClusterName, Peers), % s3 isn't available S3 = make_node_name(s3), NodeIds = ServerIds0 ++ [{ClusterName, S3}], @@ -284,13 +291,14 @@ start_cluster_majority(Config) -> PingResults = [{pong, _} = ra_server_proc:ping(N, 500) || N <- Started], % assert one node is leader ?assert(lists:any(fun ({pong, S}) -> S =:= leader end, PingResults)), - stop_nodes(ServerIds0), + stop_peers(Peers), ok. start_cluster_minority(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds0 = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1]], + Peers = start_peers([s1], PrivDir), + ServerIds0 = server_ids(ClusterName, Peers), % s3 isn't available S2 = make_node_name(s2), S3 = make_node_name(s3), @@ -300,16 +308,16 @@ start_cluster_minority(Config) -> ra:start_cluster(?SYS, ClusterName, Machine, NodeIds), % assert none is started [{error, _} = ra_server_proc:ping(N, 50) || N <- NodeIds], - stop_nodes(ServerIds0), + stop_peers(Peers), ok. grow_cluster(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - [{_, ANode} = A, - {_, BNode} = B, - {_, CNode} = C] = - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), + [{_, ANode} = A, {_, BNode} = B, {_, CNode} = C] = ServerIds, + Machine = {module, ?MODULE, #{}}, {ok, [A], []} = ra:start_cluster(?SYS, ClusterName, Machine, [A]), @@ -325,7 +333,10 @@ grow_cluster(Config) -> {ok, _, _} = ra:add_member(A, C), {ok, _, _} = ra:process_command(A, banana), {ok, _, L1} = ra:members(A), - [A, B, C] = rpc:call(ANode, ra_leaderboard, lookup_members, [ClusterName]), + X = rpc:call(ANode, ra_leaderboard, lookup_members, [ClusterName]), + ct:pal("X: ~p", [X]), + ct:pal("[A, B, C]: [~p, ~p, ~p]", [A, B, C]), + [A, B, C] = X, L1 = rpc:call(ANode, ra_leaderboard, lookup_leader, [ClusterName]), await_condition( @@ -368,14 +379,15 @@ grow_cluster(Config) -> undefined = rpc:call(ANode, ra_leaderboard, lookup_leader, [ClusterName]), undefined = rpc:call(BNode, ra_leaderboard, lookup_leader, [ClusterName]), - stop_nodes(ServerIds), + stop_peers(Peers), ok. send_local_msg(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [A, B, NonVoter] = [{ClusterName, start_follower(N, PrivDir)} - || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + [A, B, NonVoter] = server_ids(ClusterName, Peers), + NodeIds = [A, B], Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, NodeIds), @@ -407,14 +419,14 @@ send_local_msg(Config) -> test_local_msg(Leader, NonVoterNode, LeaderNode, send_local_msg, [local, ra_event]), test_local_msg(Leader, NonVoterNode, LeaderNode, send_local_msg, [local, cast]), test_local_msg(Leader, NonVoterNode, LeaderNode, send_local_msg, [local, cast, ra_event]), - stop_nodes(ServerIds), + stop_peers(Peers), ok. local_log_effect(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [A, B, NonVoter] = [{ClusterName, start_follower(N, PrivDir)} - || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + [A, B, NonVoter] = server_ids(ClusterName, Peers), NodeIds = [A, B], Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, NodeIds), @@ -446,13 +458,15 @@ local_log_effect(Config) -> test_local_msg(Leader, NonVoterNode, LeaderNode, do_local_log, [local, ra_event]), test_local_msg(Leader, NonVoterNode, LeaderNode, do_local_log, [local, cast]), test_local_msg(Leader, NonVoterNode, LeaderNode, do_local_log, [local, cast, ra_event]), - stop_nodes(ServerIds), + stop_peers(Peers), ok. disconnected_node_catches_up(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = [start_peer(N, PrivDir) || N <- [s1,s2,s3]], + ServerIds = server_ids(ClusterName, Peers), + Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds), {ok, _, Leader} = ra:members(hd(Started)), @@ -461,7 +475,7 @@ disconnected_node_catches_up(Config) -> %% the ra_directory DETS table has a 500ms autosave configuration timer:sleep(1000), - ok = slave:stop(DownServerNode), + ok = stop_peer(lists:keyfind(DownServerNode, 1, Peers)), ct:pal("Nodes ~p", [nodes()]), [ @@ -483,7 +497,7 @@ disconnected_node_catches_up(Config) -> end, - DownNode = start_follower(DownServerNodeName, PrivDir), + {DownNode, _} = DownPeer = start_peer(DownServerNodeName, PrivDir), Self = self(), SPid = erlang:spawn(DownNode, @@ -521,13 +535,15 @@ disconnected_node_catches_up(Config) -> ok end, - stop_nodes(ServerIds), + stop_peers(Peers), + stop_peer(DownPeer), ok. nonvoter_catches_up(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - [A, B, C = {Group, NodeC}] = ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + [A, B, C = {Group, NodeC}] = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, [A, B]), {ok, _, Leader} = ra:members(hd(Started)), @@ -556,13 +572,14 @@ nonvoter_catches_up(Config) -> ?assertMatch(#{membership := voter}, ra:key_metrics(C)), - stop_nodes(ServerIds), + stop_peers(Peers), ok. nonvoter_catches_up_after_restart(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - [A, B, C = {Group, NodeC}] = ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + [A, B, C = {Group, NodeC}] = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, [A, B]), {ok, _, Leader} = ra:members(hd(Started)), @@ -593,13 +610,14 @@ nonvoter_catches_up_after_restart(Config) -> ?assertMatch(#{membership := voter}, ra:key_metrics(C)), - stop_nodes(ServerIds), + stop_peers(Peers), ok. nonvoter_catches_up_after_leader_restart(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - [A, B, C = {Group, NodeC}] = ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + [A, B, C = {Group, NodeC}] = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, [A, B]), {ok, _, Leader} = ra:members(hd(Started)), @@ -630,13 +648,14 @@ nonvoter_catches_up_after_leader_restart(Config) -> ?assertMatch(#{membership := voter}, ra:key_metrics(C)), - stop_nodes(ServerIds), + stop_peers(Peers), ok. key_metrics(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds), {ok, _, Leader} = ra:members(hd(Started)), @@ -681,14 +700,15 @@ key_metrics(Config) -> end || S <- Started], - stop_nodes(ServerIds), + stop_peers(Peers), ok. leaderboard(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- [s1,s2,s3]], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Machine = {module, ?MODULE, #{}}, {ok, Started, []} = ra:start_cluster(?SYS, ClusterName, Machine, ServerIds), % assert all were said to be started @@ -721,13 +741,15 @@ leaderboard(Config) -> end || {_, N} <- ServerIds]) end, 100), - stop_nodes(ServerIds), + stop_peers(Peers), ok. bench(Config) -> %% exercises the large message handling code PrivDir = ?config(data_dir, Config), - Nodes = [start_follower(N, PrivDir) || N <- [s1,s2,s3]], + ClusterName = ?FUNCTION_NAME, + Peers = start_peers([s1,s2,s3], PrivDir), + Nodes = [ N || {_, N} <- server_ids(ClusterName, Peers)], ok = ra_bench:run(#{name => ?FUNCTION_NAME, seconds => 10, target => 500, @@ -735,7 +757,7 @@ bench(Config) -> data_size => 256 * 1000, nodes => Nodes}), - stop_nodes(Nodes), + stop_peers(Peers), %% clean up ra_lib:recursive_delete(PrivDir), ok. @@ -743,8 +765,8 @@ bench(Config) -> recover_from_checkpoint(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerNames = [s1, s2, s3], - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- ServerNames], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Configs = [begin UId = atom_to_binary(Name, utf8), #{cluster_name => ClusterName, @@ -857,7 +879,7 @@ recover_from_checkpoint(Config) -> undefined) end) || ServerId <- ServerIds], - stop_nodes(ServerIds), + stop_peers(Peers), ok. segment_writer_or_wal_crash_follower(Config) -> @@ -869,8 +891,8 @@ segment_writer_or_wal_crash_follower(Config) -> %% correactly and that the log data contains no missing entries PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerNames = [s1, s2, s3], - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- ServerNames], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Configs = [begin UId = atom_to_binary(Name, utf8), #{cluster_name => ClusterName, @@ -954,7 +976,7 @@ segment_writer_or_wal_crash_follower(Config) -> await_condition(AwaitReplicated, 100), - stop_nodes(ServerIds), + stop_peers(Peers), ok. segment_writer_or_wal_crash_leader(Config) -> @@ -966,8 +988,8 @@ segment_writer_or_wal_crash_leader(Config) -> %% correctly and that the log data does not miss any entries PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerNames = [s1, s2, s3], - ServerIds = [{ClusterName, start_follower(N, PrivDir)} || N <- ServerNames], + Peers = start_peers([s1,s2,s3], PrivDir), + ServerIds = server_ids(ClusterName, Peers), Configs = [begin UId = atom_to_binary(Name, utf8), #{cluster_name => ClusterName, @@ -1064,16 +1086,16 @@ segment_writer_or_wal_crash_leader(Config) -> await_condition(AwaitReplicated, 100), - stop_nodes(ServerIds), + stop_peers(Peers), ok. server_recovery_strategy(Config) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerNames = [s1, s2, s3], SysCfg = #{server_recovery_strategy => registered}, - ServerIds = [{ClusterName, start_follower(N, PrivDir, SysCfg)} - || N <- ServerNames], + Peers = start_peers([s1,s2,s3], PrivDir, SysCfg), + ServerIds = server_ids(ClusterName, Peers), + Configs = [begin UId = atom_to_binary(Name, utf8), #{cluster_name => ClusterName, @@ -1101,7 +1123,7 @@ server_recovery_strategy(Config) -> timer:sleep(100), - stop_nodes(ServerIds), + stop_peers(Peers), ok. stopped_wal_causes_leader_change_registered(Config) -> @@ -1113,10 +1135,9 @@ stopped_wal_causes_leader_change_mfa(Config) -> stopped_wal_causes_leader_change(Config, RecoverStrat) -> PrivDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerNames = [s1, s2, s3], SysCfg = #{server_recovery_strategy => RecoverStrat}, - ServerIds = [{ClusterName, start_follower(N, PrivDir, SysCfg)} - || N <- ServerNames], + Peers = start_peers([s1,s2,s3], PrivDir, SysCfg), + ServerIds = server_ids(ClusterName, Peers), Configs = [begin UId = atom_to_binary(Name, utf8), #{cluster_name => ClusterName, @@ -1187,7 +1208,7 @@ stopped_wal_causes_leader_change(Config, RecoverStrat) -> end) end, 200), await_condition(AwaitReplicated, 100), - stop_nodes(ServerIds), + stop_peers(Peers), ok. %% Utility @@ -1254,29 +1275,50 @@ make_node_name(N) -> H = get_current_host(), list_to_atom(lists:flatten(io_lib:format("~s@~s", [N, H]))). -search_paths() -> - Ld = code:lib_dir(), - lists:filter(fun (P) -> string:prefix(P, Ld) =:= nomatch end, - code:get_path()). +start_peers(Ns, PrivDir) -> + start_peers(Ns, PrivDir, #{}). -start_follower(N, PrivDir) -> - start_follower(N, PrivDir, #{}). +start_peers(Ns, PrivDir, SysCfg) -> + Peers = [start_peer(N, PrivDir, SysCfg) || N <- Ns], + ct:pal("Peers: ~p", [Peers]), + Peers. -start_follower(N, PrivDir, SysCfg) -> +start_peer(N, PrivDir) -> + start_peer(N, PrivDir, #{}). + +start_peer(N, PrivDir, SysCfg) -> Dir0 = filename:join(PrivDir, N), - Dir = "'\"" ++ Dir0 ++ "\"'", - Host = get_current_host(), - Pa = string:join(["-pa" | search_paths()] ++ ["-ra data_dir", Dir], " "), - ct:pal("starting child node with ~ts on host ~ts for node ~ts", - [Pa, Host, node()]), - {ok, S} = slave:start_link(Host, N, Pa), + Dir = "'" ++ Dir0 ++ "'", + %Host = get_current_host(), + Args = ["-pa", filename:dirname(code:which(ra)), "-ra", "data_dir", Dir], + ct:pal("starting child node ~ts with args ~ts", [N, Args]), + {ok, P, S} = ?CT_PEER(#{name => N, args => Args}), ct:pal("started child node ~s", [S]), ok = ct_rpc:call(S, ?MODULE, node_setup, [Dir0]), % ok = ct_rpc:call(S, logger, set_primary_config, % [level, all]), _ = ct_cover:add_nodes([S]), {ok, _} = ct_rpc:call(S, ?MODULE, ra_start, [[], SysCfg]), - S. + {S, P}. + +server_ids(ClusterName, Peers) -> + [{ClusterName, S} || {S, _} <- Peers]. + +peer(NodeName, Peers) -> + {_S, P} = lists:keyfind(NodeName, 1, Peers), + P. + +stop_peers(Peers) when is_list(Peers) -> + [ ok = stop_peer(Peer) || Peer <- Peers], + ok. +stop_peer({S, P} = _Peer) -> + _ = ct_cover:remove_nodes(S), + case is_process_alive(P) of + true -> + peer:stop(P); + false -> + ok + end. ra_start(Params, SysCfg) when is_map(SysCfg) -> _ = application:stop(ra), @@ -1393,10 +1435,3 @@ server_recover_function(System) -> end end || {N, _Uid} <- Regd], ok. - -stop_nodes([{_, _} | _ ] = ServerIds) -> - stop_nodes([S || {_, S} <- ServerIds]); -stop_nodes(Nodes) -> - _ = ct_cover:remove_nodes(Nodes), - [ok = slave:stop(S) || S <- Nodes], - ok. diff --git a/test/erlang_node_helpers.erl b/test/erlang_node_helpers.erl index 348c59c8..cb7dc7e6 100644 --- a/test/erlang_node_helpers.erl +++ b/test/erlang_node_helpers.erl @@ -6,18 +6,17 @@ %% -module(erlang_node_helpers). --export([start_erlang_nodes/2, start_erlang_node/2, stop_erlang_nodes/1, stop_erlang_node/1, wait_for_stop/2]). +-export([start_erlang_nodes/2, start_erlang_node/2, stop_erlang_nodes/1, stop_erlang_node/1]). -include_lib("common_test/include/ct.hrl"). start_erlang_nodes(Nodes, Config) -> - [start_erlang_node(Node, Config) || Node <- Nodes], - Nodes. + [start_erlang_node(Node, Config) || Node <- Nodes]. start_erlang_node(Node, Config) -> DistMod = ?config(erlang_dist_module, Config), StartArgs = case DistMod of undefined -> - ""; + []; _ -> DistModS = atom_to_list(DistMod), DistModPath = filename:absname( @@ -25,13 +24,14 @@ start_erlang_node(Node, Config) -> code:where_is_file(DistModS ++ ".beam"))), DistArg = re:replace(DistModS, "_dist$", "", [{return, list}]), - "-pa \"" ++ DistModPath ++ "\" -proto_dist " ++ DistArg ++ - " -kernel prevent_overlapping_partitions false" + ["-pa", DistModPath, "-proto_dist", DistArg, + "-kernel", "prevent_overlapping_partitions", "false"] end, - _ = ct_slave:start(Node, [{erl_flags, StartArgs}]), + ct:log("Starting node ~p, with ~p", [Node, StartArgs]), + {ok, Peer, _} = peer:start(#{name => Node, args => StartArgs, connection => standard_io}), wait_for_distribution(Node, 50), add_lib_dir(Node), - Node. + Peer. add_lib_dir(Node) -> ct_rpc:call(Node, code, add_paths, [code:get_path()]). @@ -50,17 +50,5 @@ wait_for_distribution(Node, Attempts) -> stop_erlang_nodes(Nodes) -> [stop_erlang_node(Node) || Node <- Nodes]. -stop_erlang_node(Node) -> - ct:pal("Stopping node ~p", [Node]), - ct_slave:stop(Node), - wait_for_stop(Node, 100). - -wait_for_stop(Node, 0) -> - error({stop_failed_for, Node}); -wait_for_stop(Node, Attempts) -> - case ct_rpc:call(Node, erlang, node, []) of - {badrpc, nodedown} -> ok; - _ -> - timer:sleep(100), - wait_for_stop(Node, Attempts - 1) - end. +stop_erlang_node(Peer) -> + peer:stop(Peer). diff --git a/test/partitions_SUITE.erl b/test/partitions_SUITE.erl index 629156ef..448bd150 100644 --- a/test/partitions_SUITE.erl +++ b/test/partitions_SUITE.erl @@ -59,10 +59,10 @@ init_per_testcase(TestCase, Config0) -> end_per_testcase(print, Config) -> Config; end_per_testcase(_, Config) -> - Nodes = ?config(nodes, Config), - ct:pal("end_per_testcase: Stopping nodes ~p", [Nodes]), - erlang_node_helpers:stop_erlang_nodes(Nodes), - ct:pal("end_per_testcase: Stopped nodes ~p", [Nodes]), + Peers = proplists:get_value(peers, Config, []), + ct:pal("end_per_testcase: Stopping peer nodes ~p", [Peers]), + erlang_node_helpers:stop_erlang_nodes(proplists:get_value(peers, Config, [])), + ct:pal("end_per_testcase: Stopped peer nodes ~p", [Peers]), ok. -type nodes5() :: foo1@localhost | @@ -228,8 +228,8 @@ erlang_nodes(5) -> prepare_erlang_cluster(Config, Nodes) -> Config0 = tcp_inet_proxy_helpers:configure_dist_proxy(Config), - erlang_node_helpers:start_erlang_nodes(Nodes, Config0), - Config0. + Peers = erlang_node_helpers:start_erlang_nodes(Nodes, Config0), + [{peers, Peers} | Config0]. setup_ra_cluster(Config, Machine) -> Nodes = ?config(nodes, Config), diff --git a/test/ra_SUITE.erl b/test/ra_SUITE.erl index 30c3fabf..146a237d 100644 --- a/test/ra_SUITE.erl +++ b/test/ra_SUITE.erl @@ -1521,14 +1521,11 @@ start_remote_cluster(Num, PrivDir, ClusterName, Machine) -> Nodes. start_peer(Name, PrivDir) -> - Dir0 = filename:join(PrivDir, Name), - Dir = "'\"" ++ Dir0 ++ "\"'", - Host = get_current_host(), - Pa = string:join(["-pa" | search_paths()] ++ ["-s ra -ra data_dir", Dir], - " "), - ct:pal("starting peer node ~ts on host ~s for node ~ts with ~ts", - [Name, Host, node(), Pa]), - {ok, S} = slave:start_link(Host, Name, Pa), + Dir = "'" ++ filename:join(PrivDir, Name) ++ "'", + Pa = filename:dirname(code:which(ra)), + ct:pal("starting peer node ~ts for node ~ts with -pa ~ts and data_dir ~ts", + [Name, node(), Pa, Dir]), + {ok, _P, S} = ?CT_PEER(#{name => Name, args => ["-pa", Pa, "-ra", "data_dir", Dir]}), _ = rpc:call(S, ra, start, []), ok = ct_rpc:call(S, logger, set_primary_config, [level, all]), @@ -1539,11 +1536,6 @@ get_current_host() -> Host = re:replace(NodeStr, "^[^@]+@", "", [{return, list}]), list_to_atom(Host). -search_paths() -> - Ld = code:lib_dir(), - lists:filter(fun (P) -> string:prefix(P, Ld) =:= nomatch end, - code:get_path()). - await_condition(_Fun, 0) -> exit(condition_did_not_materialise); await_condition(Fun, Attempts) -> diff --git a/test/ra_system_SUITE.erl b/test/ra_system_SUITE.erl index 77df147f..960a57b6 100644 --- a/test/ra_system_SUITE.erl +++ b/test/ra_system_SUITE.erl @@ -79,8 +79,8 @@ start_cluster(Config) -> Sys = ?FUNCTION_NAME, DataDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_child_node(N, DataDir)} - || N <- [s1, s2, s3]], + Peers = [start_peer(N, DataDir) || N <- [s1, s2, s3]], + ServerIds = [{ClusterName, S} || {S, _P} <- Peers], Nodes = lists:map(fun ({_, N}) -> N end, ServerIds), Machine = {module, ?MODULE, #{}}, %% the system hasn't been started yet @@ -96,7 +96,7 @@ start_cluster(Config) -> PingResults = [{pong, _} = ra_server_proc:ping(S, 500) || S <- ServerIds], % assert one node is leader ?assert(lists:any(fun ({pong, S}) -> S =:= leader end, PingResults)), - [ok = slave:stop(N) || N <- Nodes], + stop_peers(Peers), ok. start_clusters_in_systems(Config) -> @@ -105,29 +105,30 @@ start_clusters_in_systems(Config) -> DataDir = ?config(data_dir, Config), ClusterName1 = start_clusters_in_systems_1, ClusterName2 = start_clusters_in_systems_2, - Nodes = [start_child_node(N, DataDir) || N <- [s1, s2, s3]], - Servers1 = [{ClusterName1, N} || N <- Nodes], - Servers2 = [{ClusterName2, N} || N <- Nodes], + Peers = [start_peer(N, DataDir) || N <- [s1, s2, s3]], + Servers1 = [{ClusterName1, S} || {S, _P} <- Peers], + Servers2 = [{ClusterName2, S} || {S, _P} <- Peers], Machine = {module, ?MODULE, #{}}, %% the system hasn't been started yet {error, cluster_not_formed} = ra:start_cluster(Sys1, ClusterName1, Machine, Servers1), %% start system on all nodes - [ok = start_system_on(Sys1, N, DataDir) || N <- Nodes], - [ok = start_system_on(Sys2, N, DataDir) || N <- Nodes], + [ok = start_system_on(Sys1, S, DataDir) || {S, _P} <- Peers], + [ok = start_system_on(Sys2, S, DataDir) || {S, _P} <- Peers], {ok, Started1, []} = ra:start_cluster(Sys1, ClusterName1, Machine, Servers1), [] = Started1 -- Servers1, {ok, Started2, []} = ra:start_cluster(Sys2, ClusterName2, Machine, Servers2), [] = Started2 -- Servers2, + stop_peers(Peers), ok. restart_system(Config) -> Sys = ?FUNCTION_NAME, DataDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - ServerIds = [{ClusterName, start_child_node(N, DataDir)} - || N <- [s1, s2, s3]], + Peers = [start_peer(N, DataDir) || N <- [s1, s2, s3]], + ServerIds = [{ClusterName, S} || {S, _P} <- Peers], Nodes = lists:map(fun ({_, N}) -> N end, ServerIds), Machine = {module, ?MODULE, #{}}, %% the system hasn't been started yet @@ -145,6 +146,7 @@ restart_system(Config) -> [ok = start_system_on(Sys, N, DataDir) || N <- Nodes], {ok, Started2, []} = ra:start_cluster(Sys, ClusterName, Machine, ServerIds), [] = Started2 -- ServerIds, + stop_peers(Peers), ok. ra_overview(Config) -> @@ -216,13 +218,18 @@ search_paths() -> lists:filter(fun (P) -> string:prefix(P, Ld) =:= nomatch end, code:get_path()). -start_child_node(N, _PrivDir) -> - Host = get_current_host(), - Pa = string:join(["-pa" | search_paths()], " "), - ct:pal("starting child node with ~ts on host ~ts for node ~ts~n", [Pa, Host, node()]), - {ok, S} = slave:start_link(Host, N, Pa), - _ = rpc:call(S, application, ensure_all_started, [ra]), - S. +start_peer(N, PrivDir) -> + Dir0 = filename:join(PrivDir, N), + Dir = "'" ++ Dir0 ++ "'", + Pa = filename:dirname(code:which(ra)), + Args = ["-pa", Pa, "-ra", "data_dir", Dir], + ct:pal("starting child node ~ts for node ~ts~n", [N, Args]), + {ok, P, S} = ?CT_PEER(#{name => N, args => Args}), + {ok, _} = rpc:call(S, application, ensure_all_started, [ra]), + {S, P}. + +stop_peers(Peers) -> + [peer:stop(P) || {_S, P} <- Peers]. flush() -> receive From f78c76260b8e02732b5d44eab403aa940be275fb Mon Sep 17 00:00:00 2001 From: Michal Kuratczyk Date: Fri, 21 Feb 2025 15:58:12 +0100 Subject: [PATCH 2/2] Fix test flakiness --- test/ra_system_SUITE.erl | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/test/ra_system_SUITE.erl b/test/ra_system_SUITE.erl index 960a57b6..c94aff61 100644 --- a/test/ra_system_SUITE.erl +++ b/test/ra_system_SUITE.erl @@ -79,7 +79,7 @@ start_cluster(Config) -> Sys = ?FUNCTION_NAME, DataDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - Peers = [start_peer(N, DataDir) || N <- [s1, s2, s3]], + Peers = [start_peer(DataDir) || _ <- lists:seq(1, 3)], ServerIds = [{ClusterName, S} || {S, _P} <- Peers], Nodes = lists:map(fun ({_, N}) -> N end, ServerIds), Machine = {module, ?MODULE, #{}}, @@ -105,7 +105,7 @@ start_clusters_in_systems(Config) -> DataDir = ?config(data_dir, Config), ClusterName1 = start_clusters_in_systems_1, ClusterName2 = start_clusters_in_systems_2, - Peers = [start_peer(N, DataDir) || N <- [s1, s2, s3]], + Peers = [start_peer(DataDir) || _ <- lists:seq(1, 3)], Servers1 = [{ClusterName1, S} || {S, _P} <- Peers], Servers2 = [{ClusterName2, S} || {S, _P} <- Peers], Machine = {module, ?MODULE, #{}}, @@ -127,7 +127,7 @@ restart_system(Config) -> Sys = ?FUNCTION_NAME, DataDir = ?config(data_dir, Config), ClusterName = ?config(cluster_name, Config), - Peers = [start_peer(N, DataDir) || N <- [s1, s2, s3]], + Peers = [start_peer(DataDir) || _ <- lists:seq(1, 3)], ServerIds = [{ClusterName, S} || {S, _P} <- Peers], Nodes = lists:map(fun ({_, N}) -> N end, ServerIds), Machine = {module, ?MODULE, #{}}, @@ -218,13 +218,14 @@ search_paths() -> lists:filter(fun (P) -> string:prefix(P, Ld) =:= nomatch end, code:get_path()). -start_peer(N, PrivDir) -> - Dir0 = filename:join(PrivDir, N), +start_peer(PrivDir) -> + Name = ?CT_PEER_NAME(), + Dir0 = filename:join(PrivDir, Name), Dir = "'" ++ Dir0 ++ "'", Pa = filename:dirname(code:which(ra)), Args = ["-pa", Pa, "-ra", "data_dir", Dir], - ct:pal("starting child node ~ts for node ~ts~n", [N, Args]), - {ok, P, S} = ?CT_PEER(#{name => N, args => Args}), + ct:pal("starting child node ~ts for node ~ts~n", [Name, Args]), + {ok, P, S} = ?CT_PEER(#{name => Name, args => Args}), {ok, _} = rpc:call(S, application, ensure_all_started, [ra]), {S, P}.