Skip to content

Commit

Permalink
Merge pull request #578 from salemove/fix_heartbeat_timeout
Browse files Browse the repository at this point in the history
Fix coordinator process exiting due to heartbeat race
  • Loading branch information
zmstone authored May 16, 2024
2 parents 6dd0150 + bf92860 commit 255c8a9
Showing 1 changed file with 14 additions and 1 deletion.
15 changes: 14 additions & 1 deletion src/brod_group_coordinator.erl
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,11 @@ stabilize(#state{ rejoin_delay_seconds = RejoinDelaySeconds
State3 = State2#state{is_in_group = false},

%% 4. Clean up state based on the last failure reason
State = maybe_reset_member_id(State3, Reason),
State4 = maybe_reset_member_id(State3, Reason),

%% 5. Clean up ongoing heartbeat request ref if connection
%% was closed
State = maybe_reset_hb_ref(State4, Reason),

%% 5. ensure we have a connection to the (maybe new) group coordinator
F1 = fun discover_coordinator/1,
Expand Down Expand Up @@ -591,6 +595,15 @@ should_reset_member_id({connection_down, _Reason}) ->
should_reset_member_id(_) ->
false.

%% Clear the pending heartbeat request reference after a connection loss.
%% If the connection goes down while a heartbeat response is still awaited,
%% that response will never arrive; resetting `hb_ref' lets a new heartbeat
%% request be sent over the new connection.
%% For any other failure reason the state is returned unchanged.
maybe_reset_hb_ref(State, Reason) ->
  case Reason of
    {connection_down, _} ->
      State#state{hb_ref = ?undef};
    _ ->
      State
  end.

-spec join_group(state()) -> {ok, state()}.
join_group(#state{ groupId = GroupId
, memberId = MemberId0
Expand Down

0 comments on commit 255c8a9

Please sign in to comment.