mirrored_supervisor: Rework error handling after a failed update

[Why]
The retry logic I added in 4621fe7730
was completely wrong. If Khepri reached its own timeout of 30 seconds (as
of this writing), the mirrored supervisor would retry 50 times because
it only deducted the 200 ms sleep between attempts from its 10-second
budget, not the time spent in the update itself. This means it would
retry for 25 minutes. Nice.

That retry would be terminated forcefully by the parent supervisor after
5 minutes if it was part of a shutdown.

[How]
This time, the code simply passes the error (timeout or something else)
down to the following `case`. It will shut the mirrored supervisor down.

This fixes very slow RabbitMQ node termination (at least 5 minutes,
sometimes more) in test suites. An example to reproduce:

    gmake -C deps/rabbitmq_mqtt \
      RABBITMQ_METADATA_STORE=khepri \
      ct-v5 t=cluster_size_3:session_takeover_v3_v5

In this one, the third node of the cluster will take 5+ minutes to stop.
This commit is contained in:
Jean-Sébastien Pédron 2025-06-03 12:23:18 +02:00
parent f882a28c71
commit 376dd2ca60
No known key found for this signature in database
GPG Key ID: 39E99761A5FD94CC
1 changed files with 10 additions and 20 deletions

View File

@ -345,10 +345,16 @@ handle_info({'DOWN', _Ref, process, Pid, _Reason},
child_order = ChildOrder}) ->
%% No guarantee pg will have received the DOWN before us.
R = case lists:sort(pg:get_members(Group)) -- [Pid] of
[O | _] -> ChildSpecs = retry_update_all(O, Pid),
[start(Delegate, ChildSpec)
|| ChildSpec <- restore_child_order(ChildSpecs,
ChildOrder)];
[O | _] -> ChildSpecs = update_all(O, Pid),
case ChildSpecs of
_ when is_list(ChildSpecs) ->
[start(Delegate, ChildSpec)
|| ChildSpec <- restore_child_order(
ChildSpecs,
ChildOrder)];
{error, _} ->
[ChildSpecs]
end;
_ -> []
end,
case errors(R) of
@ -428,22 +434,6 @@ check_stop(Group, Delegate, Id) ->
%% Returns the first element of a six-element tuple (presumably an
%% old-style supervisor child spec; confirm against callers).
id({Id, _, _, _, _, _}) -> Id.
%% Calls update_all/2, retrying on `{error, timeout}` with an initial
%% budget of 10000 ms.
retry_update_all(O, Pid) ->
retry_update_all(O, Pid, 10000).
%% While the budget is positive: a list result is returned as is; a
%% timeout sleeps for 200 ms and retries with the budget reduced by
%% that sleep only.
%% NOTE(review): time spent inside update_all/2 is never deducted, so
%% the budget caps the number of attempts (10000 / 200 = 50), not the
%% elapsed time. Any result other than a list or `{error, timeout}`
%% raises `case_clause` here.
retry_update_all(O, Pid, TimeLeft) when TimeLeft > 0 ->
case update_all(O, Pid) of
List when is_list(List) ->
List;
{error, timeout} ->
%% Pause between attempts, in milliseconds.
Sleep = 200,
TimeLeft1 = TimeLeft - Sleep,
timer:sleep(Sleep),
retry_update_all(O, Pid, TimeLeft1)
end;
%% Budget exhausted: one final attempt whose result is returned
%% unfiltered, so an `{error, timeout}` can still reach the caller.
retry_update_all(O, Pid, _TimeLeft) ->
update_all(O, Pid).
%% Thin wrapper delegating to rabbit_db_msup:update_all/2 with the same
%% two arguments. Callers in this file treat the result as either a list
%% or an `{error, _}` tuple.
update_all(Overall, OldOverall) ->
rabbit_db_msup:update_all(Overall, OldOverall).