mirrored_supervisor: Rework error handling after a failed update
[Why]
The retry logic I added in 4621fe7730
was completely wrong. If Khepri reached its own timeout of 30 seconds (as
of this writing), the mirrored supervisor would retry 50 times because
it would not check the time spent. This means it would retry for 25
minutes. Nice.
That retry would be terminated forcefully by the parent supervisor after
5 minutes if it was part of a shutdown.
[How]
This time, the code simply pass the error (timeout or something else)
down to the following `case`. It will shut the mirrored supervisor down.
This fixes very long RabbitMQ node termination (at least 5 minutes,
sometimes more) in testsuites. An example to reproduce:
gmake -C deps/rabbitmq_mqtt \
RABBITMQ_METADATA_STORE=khepri \
ct-v5 t=cluster_size_3:session_takeover_v3_v5
In this one, the third node of the cluster will take 5+ minutes to stop.
This commit is contained in:
parent
f882a28c71
commit
376dd2ca60
|
@ -345,10 +345,16 @@ handle_info({'DOWN', _Ref, process, Pid, _Reason},
|
|||
child_order = ChildOrder}) ->
|
||||
%% No guarantee pg will have received the DOWN before us.
|
||||
R = case lists:sort(pg:get_members(Group)) -- [Pid] of
|
||||
[O | _] -> ChildSpecs = retry_update_all(O, Pid),
|
||||
[start(Delegate, ChildSpec)
|
||||
|| ChildSpec <- restore_child_order(ChildSpecs,
|
||||
ChildOrder)];
|
||||
[O | _] -> ChildSpecs = update_all(O, Pid),
|
||||
case ChildSpecs of
|
||||
_ when is_list(ChildSpecs) ->
|
||||
[start(Delegate, ChildSpec)
|
||||
|| ChildSpec <- restore_child_order(
|
||||
ChildSpecs,
|
||||
ChildOrder)];
|
||||
{error, _} ->
|
||||
[ChildSpecs]
|
||||
end;
|
||||
_ -> []
|
||||
end,
|
||||
case errors(R) of
|
||||
|
@ -428,22 +434,6 @@ check_stop(Group, Delegate, Id) ->
|
|||
|
||||
id({Id, _, _, _, _, _}) -> Id.
|
||||
|
||||
retry_update_all(O, Pid) ->
|
||||
retry_update_all(O, Pid, 10000).
|
||||
|
||||
retry_update_all(O, Pid, TimeLeft) when TimeLeft > 0 ->
|
||||
case update_all(O, Pid) of
|
||||
List when is_list(List) ->
|
||||
List;
|
||||
{error, timeout} ->
|
||||
Sleep = 200,
|
||||
TimeLeft1 = TimeLeft - Sleep,
|
||||
timer:sleep(Sleep),
|
||||
retry_update_all(O, Pid, TimeLeft1)
|
||||
end;
|
||||
retry_update_all(O, Pid, _TimeLeft) ->
|
||||
update_all(O, Pid).
|
||||
|
||||
update_all(Overall, OldOverall) ->
|
||||
rabbit_db_msup:update_all(Overall, OldOverall).
|
||||
|
||||
|
|
Loading…
Reference in New Issue