diff --git a/include/locks_trace.hrl b/include/locks_trace.hrl new file mode 100644 index 0000000..cb46eb0 --- /dev/null +++ b/include/locks_trace.hrl @@ -0,0 +1,10 @@ +%% This is to support stack traces on pre-OTP21 versions +%% while avoiding compiler warnings on later versions. +-ifdef(OTP_RELEASE). +-define(_catch_(ErrorType, Error, ErrorStackTrace), + catch ErrorType:Error:ErrorStackTrace ->). +-else. +-define(_catch_(ErrorType, Error, ErrorStackTrace), + catch ErrorType:Error -> + ErrorStackTrace = erlang:get_stacktrace(),). +-endif. diff --git a/rebar.config b/rebar.config index 5f66135..bf259fc 100644 --- a/rebar.config +++ b/rebar.config @@ -27,5 +27,10 @@ {top_level_readme, {"./README.md","https://github.com/uwiger/locks"}}]} ]}, - {test, [{project_app_dirs, [".", "examples/gdict"]}]} + {test, [ {project_app_dirs, [".", "examples/gdict"]} + , {deps, [{leader, ".*", + {git, "https://github.com/uwiger/locks-test.git", + {branch, "uw-error-inspection"}}} + ]} + ]} ]}. diff --git a/rebar.lock b/rebar.lock index cbc55d9..db306b7 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,6 +1,8 @@ -{"1.1.0", +{"1.2.0", [{<<"plain_fsm">>,{pkg,<<"plain_fsm">>,<<"1.4.1">>},0}]}. [ {pkg_hash,[ - {<<"plain_fsm">>, <<"47E9BF6AC9322FC7586FB6DF8DE7198391E93764571C75165F2C45B27ACDE1D0">>}]} + {<<"plain_fsm">>, <<"47E9BF6AC9322FC7586FB6DF8DE7198391E93764571C75165F2C45B27ACDE1D0">>}]}, +{pkg_hash_ext,[ + {<<"plain_fsm">>, <<"6CE2E7EFDCDC516EEBCED22A6ED2B1AB760F8275EE230F7C80B48DD32AE0DA39">>}]} ]. diff --git a/src/locks_agent.erl b/src/locks_agent.erl index 0f1fc5b..ffff7e6 100755 --- a/src/locks_agent.erl +++ b/src/locks_agent.erl @@ -58,6 +58,7 @@ -import(lists,[foreach/2,any/2,map/2,member/2]). +-include("locks_trace.hrl"). -include("locks_agent.hrl"). -ifdef(DEBUG). @@ -173,12 +174,11 @@ agent_init(Wait, Client, Options) -> {ok, St} -> ack(Wait, Client, {ok, self()}), try loop(St) - catch - error:Error -> + ?_catch_(error, Error, ST) error_logger:error_report( [{?MODULE, aborted}, {reason, Error}, - {trace, erlang:get_stacktrace()}]), + {trace, ST}]), error(Error) end %% Other -> diff --git a/src/locks_leader.erl b/src/locks_leader.erl index 333e786..da38372 100644 --- a/src/locks_leader.erl +++ b/src/locks_leader.erl @@ -135,13 +135,14 @@ mod_state, buffered = []}). +-include("locks_trace.hrl"). -include("locks.hrl"). -include("locks_debug.hrl"). -ifdef(LOCKS_DEBUG). -define(log(X, S), dbg_log(X, S)). -else. --define(log(X, S), ok). +-define(log(X, S), ?event(X, S)). -endif. -define(event(E), event(?LINE, E, none)). @@ -460,9 +461,8 @@ init_(Module, ModSt0, Options, Parent, Reg) -> {ok, MSt} -> MSt; {error, Reason} -> abort_init(Reason, Parent) - catch - error:Error -> - abort_init({Error, erlang:get_stacktrace()}, Parent) + ?_catch_(error, Error, ST) + abort_init({Error, ST}, Parent) end, AllNodes = ordsets:from_list([node()|nodes()]), Agent = @@ -565,10 +565,10 @@ safe_loop(#st{agent = A} = S) -> ?log(_Msg, S), ?event({in_safe_loop, _Msg}, S), noreply(leader_affirmed(L, Ref, S)); - {?MODULE, ensure_sync, _, _} = _Msg -> + {?MODULE, ensure_sync, L, Type, ERef} = _Msg -> ?log(_Msg, S), ?event({in_safe_loop, _Msg}, S), - noreply(S); + noreply(sync_requested(L, Type, ERef, S)); {'$gen_call', From, '$locks_leader_debug'} = _Msg -> ?log(_Msg, S), handle_call('$locks_leader_debug', From, S); @@ -641,11 +641,11 @@ handle_info_({?MODULE, leader_uncertain, L, Synced, SyncedWs}, S) -> handle_info_({?MODULE, affirm_leader, L, ERef} = _Msg, #st{} = S) -> ?event(_Msg, S), noreply(leader_affirmed(L, ERef, S)); -handle_info_({?MODULE, ensure_sync, Pid, Type} = _Msg, #st{} = S) -> +handle_info_({?MODULE, ensure_sync, Pid, Type, _ERef} = _Msg, #st{} = S) -> ?event(_Msg, S), S1 = case S#st.leader of Me when Me == self() -> - maybe_announce_leader(Pid, Type, remove_synced(Pid, Type, S)); + do_ensure_sync(Pid, Type, S); _ -> S end, @@ -832,6 +832,7 @@ add_cand(Client, S) when Client == self() -> add_cand(Client, #st{candidates = Cands, role = Role} = S) -> case lists:member(Client, Cands) of false -> + ?event({add_cand, Client}), monitor_cand(Client), S1 = S#st{candidates = [Client | Cands]}, if Role == worker -> @@ -1089,9 +1090,28 @@ leader_affirmed(L, ERef, #st{} = S) -> request_sync(L, ERef, S). request_sync(L, ERef, S) -> - snd(L, {?MODULE, ensure_sync, self(), S#st.role}), + snd(L, {?MODULE, ensure_sync, self(), S#st.role, ERef}), S#st{leader = undefined, election_ref = ERef}. +sync_requested(Pid, Type, ERef, #st{ leader = undefined + , election_ref = ERef + , vector = #{leader := Me} + , agent = A } = S) + when Me == self() -> + %% We were uncertain about whether we're leader, but this is + %% affirmation from at least one other candidate. + case locks_agent:transaction_status(A) of + {have_all_locks, _} -> + do_ensure_sync(Pid, Type, S#st{leader = self()}); + _ -> + S + end; +sync_requested(_, _, _, S) -> + S. + +do_ensure_sync(Pid, Type, S) -> + maybe_announce_leader(Pid, Type, remove_synced(Pid, Type, S)). + set_leader_uncertain(#st{agent = A} = S) -> send_all(S, {?MODULE, leader_uncertain, self(), S#st.synced, S#st.synced_workers}), @@ -1151,5 +1171,6 @@ lock_holder(#lock{queue = [#w{entries = [#entry{agent = A}]}|_]}) -> -ifdef(LOCKS_DEBUG). dbg_log(X, #st{leader = L, vector = V}) -> + ?event(X, S), ?log(#{x => X, l => L, v => V}). -endif. diff --git a/src/locks_ttb.erl b/src/locks_ttb.erl index 61dc305..2c04b46 100644 --- a/src/locks_ttb.erl +++ b/src/locks_ttb.erl @@ -3,6 +3,11 @@ -compile([export_all, nowarn_export_all]). -dialyzer({nowarn_function, pp_term/1}). +-record(h1, { tab = ets:new(h1, [ordered_set]) + , n = 1 + , max = 1000 + , first_ts }). + %% This function is also traced. Can be used to insert markers in the trace %% log. event(E) -> @@ -37,7 +42,15 @@ format(Dir) -> format(Dir, standard_io). format(Dir, OutFile) -> - ttb:format(Dir, format_opts(OutFile)). + %% ttb:format(Dir, format_opts(OutFile)). + #h1{tab = T} = H1 = #h1{}, + ok = ttb:format(Dir, [{handler, {fun handler1/4, H1}}]), + TS = fetch_ts(T), + PMap = fetch_pmap(T), + try to_file(OutFile, TS, PMap, T) + after + ets:delete(T) + end. format_opts() -> format_opts(standard_io). @@ -45,7 +58,72 @@ format_opts() -> format_opts(OutFile) -> [{out, OutFile}, {handler, {fun handler/4, {0,0}}}]. -handler(Fd, Trace, _, {Tp,Diff} = Acc) -> +to_file(OutFile, TS0, PMap, Tab) -> + {ok, Fd} = file:open(OutFile, [write]), + if TS0 > 0 -> + io:fwrite(Fd, "%% -*- erlang -*-~n", []); + true -> + ok + end, + try + ets:foldl( + fun({_, {Trace, TraceInfo}}, Acc) -> + handler(Fd, Trace, TraceInfo, Acc) + end, {TS0,0,PMap}, Tab) + after + file:close(Fd) + end. + +handler1(_Fd, Trace, TraceInfo, #h1{ first_ts = undefined } = S) + when element(1, Trace) == trace_ts -> + log_ts(Trace, S), + log_pids(Trace, S), + log({Trace, TraceInfo}, S#h1{ first_ts = Trace }); +handler1(_Fd, Trace, TraceInfo, S) -> + log_pids(Trace, S), + log({Trace, TraceInfo}, S). + +log_ts(TraceTs, #h1{ tab = T }) -> + TS = element(tuple_size(TraceTs), TraceTs), + ets:insert(T, {ts, TS}). + +log_pids(Trace, #h1{ tab = T }) -> + PMap = get_pmap(T), + PMap1 = get_pids(Trace, #{}, ttb), + ets:insert(T, {pmap, maps:merge(PMap, PMap1)}). + +get_pmap(T) -> + case ets:lookup(T, pmap) of + [] -> + #{}; + [{_, M}] -> + M + end. + +fetch_pmap(T) -> + M = get_pmap(T), + ets:delete(T, pmap), + M. + +fetch_ts(T) -> + case ets:lookup(T, ts) of + [] -> + 0; + [{_, TS}] -> + ets:delete(T, ts), + TS + end. + +log(X, #h1{ tab = T, n = N, max = Max } = S) -> + ets:insert(T, {N, X}), + if N > Max -> + ets:delete(T, ets:first(T)); + true -> + ok + end, + S#h1{ n = N+1 }. + +handler(Fd, Trace, _, {Tp,Diff,PMap} = Acc) -> if Acc == {0,0} -> io:fwrite(Fd, "%% -*- erlang -*-~n", []); true -> ok @@ -57,15 +135,18 @@ handler(Fd, Trace, _, {Tp,Diff} = Acc) -> Tdiff = tdiff(TS, Tp), Diff1 = Diff + Tdiff, print(Fd, Node, Mod, Line, Evt, State, Diff1), - case get_pids({Evt, State}) of - [] -> ok; + case get_pids({Evt, State}, PMap) of + M when map_size(M) == 0 -> ok; Pids -> - io:fwrite(Fd, " Nodes = ~p~n", [Pids]) + Nodes = [{node_prefix(P), N} + || {P, N} <- lists:ukeysort( + 2, maps:to_list(Pids))], + io:fwrite(Fd, " Nodes = ~p~n", [Nodes]) end, - {TS, Diff1}; + {TS, Diff1,PMap}; _ -> io:fwrite(Fd, "~p~n", [Trace]), - {Tp, Diff} + {Tp, Diff,PMap} end. -define(CHAR_MAX, 60). @@ -127,29 +208,42 @@ record_print_fun(Mod) -> end end. -get_pids(Term) -> - Pids = dict:to_list(get_pids(Term, dict:new())), - [{node_prefix(P), N} || {N, P} <- Pids]. +get_pids(Term, Ref) -> + get_pids(Term, #{}, Ref). -get_pids(T, Acc) when is_tuple(T) -> - get_pids(tuple_to_list(T), Acc); -get_pids(L, Acc) when is_list(L) -> - get_pids_(L, Acc); -get_pids(P, Acc) when is_pid(P) -> - try ets:lookup(ttb, P) of - [{_, _, Node}] -> - dict:store(Node, P, Acc); +%% get_pids(Term, M) -> +%% get_pids(Term, Dict)), +%% [{node_prefix(P), N} || {N, P} <- Pids]. + +get_pids(T, Acc, Ref) when is_tuple(T) -> + get_pids(tuple_to_list(T), Acc, Ref); +get_pids(L, Acc, Ref) when is_list(L) -> + get_pids_(L, Acc, Ref); +get_pids(P, Acc, Ref) when is_pid(P) -> + case check_ref(P, Ref) of + {ok, N} -> + Acc#{P => N}; _ -> Acc - catch - error:_ -> Acc end; -get_pids(_, Acc) -> +get_pids(_, Acc, _) -> Acc. -get_pids_([H|T], Acc) -> - get_pids_(T, get_pids(H, Acc)); -get_pids_(_, Acc) -> +check_ref(P, ttb) -> + try ets:lookup(ttb, P) of + [{_, _, Node}] -> + {ok, Node}; + _ -> + error + catch + error:_ -> error + end; +check_ref(P, Map) when is_map(Map) -> + maps:find(P, Map). + +get_pids_([H|T], Acc, Ref) -> + get_pids_(T, get_pids(H, Acc, Ref), Ref); +get_pids_(_, Acc, _) -> Acc. diff --git a/test/locks_leader_SUITE.erl b/test/locks_leader_SUITE.erl index 9332927..db1c8ad 100644 --- a/test/locks_leader_SUITE.erl +++ b/test/locks_leader_SUITE.erl @@ -16,7 +16,8 @@ gdict_simple_netsplit/1, gdict_all_nodes/1, gdict_netsplit/1, - start_incremental/1 + start_incremental/1, + random_netsplits/1 ]). -export([patch_net_kernel/0, @@ -24,7 +25,8 @@ connect_nodes/1, disconnect_nodes/1, unbar_nodes/0, - leader_nodes/1]). + leader_nodes/1, + same_leaders/1]). -include_lib("common_test/include/ct.hrl"). -define(retry_not(Res, Expr), retry(fun() -> @@ -49,7 +51,8 @@ all() -> {group, g_2i}, {group, g_3i}, {group, g_4i}, - {group, g_5i} + {group, g_5i}, + {group, random_netsplits} ]. groups() -> @@ -66,7 +69,8 @@ groups() -> {g_2i, [], [start_incremental]}, {g_3i, [], [start_incremental]}, {g_4i, [], [start_incremental]}, - {g_5i, [], [start_incremental]} + {g_5i, [], [start_incremental]}, + {random_netsplits, [], [random_netsplits]} ]. suite() -> @@ -114,11 +118,16 @@ init_per_group(g_4i, Config) -> init_per_group(g_5i, Config) -> application:start(locks), Ns = start_slaves(node_list(5)), + [{slaves, Ns}|Config]; +init_per_group(random_netsplits, Config) -> + application:start(locks), + Ns = start_slaves(node_list(10)), [{slaves, Ns}|Config]. end_per_group(g_local, _Config) -> application:stop(locks); end_per_group(_Group, Config) -> + application:stop(locks), stop_slaves(?config(slaves, Config)), ok. @@ -290,6 +299,125 @@ start_incremental(N, Alive, Rest, Name) -> ct:log("Leaders = ~p~n", [Leaders]), start_incremental(Rest, NewAlive, Name). +random_netsplits(Config) -> + with_trace(fun random_netsplits_/1, Config, "random_netsplits"). + +random_netsplits_(Config) -> + DName = [?MODULE, ?LINE], + Slaves = get_slave_nodes(Config), + ct:log("Slaves = ~p", [Slaves]), + St0 = #{ islands => [] + , idle => Slaves + , dict => DName }, + do_random_splits(St0, Config, 1000), + ok. + +do_random_splits(St, Config, N) when N > 0 -> + case next_cmd(St) of + stop -> + ok; + {Cmd, Args} -> + St1 = perform(Cmd, Args, St), + do_random_splits(St1, Config, N-1) + end; +do_random_splits(_, _, _) -> + ok. + +perform(split, {I, A, B} = Arg, #{ islands := Isls } = St) -> + locks_ttb:event({?LINE, split, Arg}), + ANodes = [N || {N,_} <- A], + BNodes = [N || {N,_} <- B], + proxy_multicall(ANodes, ?MODULE, disconnect_nodes, [BNodes]), + NewIslands = [A, B | Isls -- [I]], + ct:log("split ~p -> ~p", [Arg, NewIslands]), + St#{ islands => NewIslands }; +perform(rejoin, {A, B} = Arg, #{ islands := Isls } = St) -> + locks_ttb:event({?LINE, rejoin, Arg}), + ANodes = [N || {N,_} <- A], + BNodes = [N || {N,_} <- B], + proxy_multicall(ANodes, ?MODULE, allow, [BNodes]), + proxy_multicall(BNodes, ?MODULE, allow, [ANodes]), + proxy_multicall(ANodes, ?MODULE, connect_nodes, [BNodes]), + NewIslands = [ A ++ B | (Isls -- [A, B]) ], + ct:log("rejoined ~p -> ~p", [Arg, NewIslands]), + St#{ islands => NewIslands }; +perform(add, {Node, Island} = Arg, #{ islands := Isls + , idle := Idle + , dict := D } = St) -> + locks_ttb:event({?LINE, add, Arg}), + INodes = [N || {N,_} <- Island], + ok = call_proxy(Node, ?MODULE, connect_nodes, [INodes]), + ok = call_proxy(Node, application, start, [locks]), + {ok, Dx} = call_proxy(Node, gdict, new_opt, [[{resource, D}]]), + Island1 = [{Node, Dx}|Island], + ct:log("add ~p to ~p -> ~p", [Node, Island, Island1]), + St#{ islands => [Island1 | (Isls -- [Island])] + , idle => Idle -- [Node] }; +perform(update, Arg, St) -> + locks_ttb:event({?LINE, update, Arg}), + ct:log("update ~p - ignored", [Arg]), + St; +perform(check, [{N,_}|_] = I, St) -> + ct:log("check: I = ~p", [I]), + Dicts = [D || {_,D} <- I], + true = ?retry(true, call_proxy(N, ?MODULE, same_leaders, [Dicts])), + St. + +next_cmd(St) -> + case cmds(St) of + [] -> + ct:log("No possible cmd. St = ~p", [St]), + stop; + [_|_] = Cmds -> + Cmd = oneof(Cmds), + {Cmd, cmd_args(Cmd, St)} + end. + +cmds(#{ islands := Isls, idle := Idle }) -> + [ split || [I || I <- Isls, + length(I) > 1] =/= [] ] + ++ [ rejoin || length(Isls) > 1 ] + ++ [ update || Isls =/= [] ] + ++ [ add || Idle =/= [] ] + ++ [ check || Isls =/= [] ]. + +cmd_args(split, #{ islands := Isls }) -> + I = oneof([I || I <- Isls, + length(I) > 1]), + {A, B} = divide(I), + {I, A, B}; +cmd_args(rejoin, #{ islands := Isls }) -> + I1 = oneof(Isls), + I2 = oneof(Isls -- [I1]), + {I1, I2}; +cmd_args(update, #{ islands := Isls }) -> + oneof(Isls); +cmd_args(add, #{ islands := Isls, idle := Idle }) -> + Island = case Isls of + [] -> []; + [_|_] -> oneof(Isls) + end, + {oneof(Idle), Island}; +cmd_args(check, #{ islands := Isls }) -> + oneof(Isls). + +oneof(L) -> + lists:nth(rand:uniform(length(L)), L). + +divide(L) -> + N = rand:uniform(length(L) - 1), + pick_n(N, L). + +pick_n(N, L) -> + pick_n(N, L, []). + +pick_n(N, L, Acc) when N > 0 -> + X = oneof(L), + pick_n(N-1, L -- [X], [X|Acc]); +pick_n(_, Rest, Acc) -> + {lists:reverse(Acc), Rest}. + + %% ============================================================ %% Support code %% ============================================================ @@ -298,8 +426,13 @@ with_trace(F, Config, Name) -> Ns = get_slave_nodes(Config), Pats = [{test_cb, event, 3, []}|locks_ttb:default_patterns()], Flags = locks_ttb:default_flags(), - locks_ttb:trace_nodes([node()|Ns], Pats, Flags, [{file, Name}]), - try F(Config) + Nodes = [node() | Ns], + Opts = [{file, Name}], + locks_ttb:trace_nodes(Nodes, Pats, Flags, Opts), + try F([{locks_ttb, #{ pats => Pats + , flags => Flags + , opts => Opts + , nodes => Nodes }} | Config]) catch error:R -> Stack = erlang:get_stacktrace(), @@ -334,10 +467,11 @@ insert_initial(D, []) -> insert_initial(_, _) -> ok. -node_list(N) when is_integer(N), N > 0, N < 5 -> - lists:sublist(node_list(5), 1, N); -node_list(5) -> - [locks_1, locks_2, locks_3, locks_4, locks_5]. +node_list(N) when is_integer(N), N > 0, N < 10 -> + lists:sublist(node_list(10), 1, N); +node_list(10) -> + [ locks_1, locks_2, locks_3, locks_4, locks_5 + , locks_6, locks_7, locks_8, locks_9, locks_10 ]. retry(F, N) -> retry(F, N, undefined). @@ -367,6 +501,13 @@ leader_nodes(Ds) -> wait_for_dicts(Ds), [node(locks_leader:info(D, leader)) || D <- Ds]. +same_leaders(Ds) -> + Nodes = leader_nodes(Ds), + case lists:usort(Nodes) of + [_] -> true; + _ -> false + end. + -define(PROXY, locks_leader_test_proxy). proxy() -> @@ -422,7 +563,7 @@ start_slave(Name) -> {ok, Node} = ct_slave:start(host(), Name, [{erl_flags, Paths ++ Arg}]), {module,net_kernel} = rpc:call(Node, ?MODULE, patch_net_kernel, []), disconnect_node(Node), - true = net_kernel:hidden_connect(Node), + true = net_kernel:hidden_connect_node(Node), spawn(Node, ?MODULE, proxy, []), {Node, rpc:call(Node, os, getpid, [])}.