mirrored_supervisor: Rework error handling after a failed update

[Why]
The retry logic I added in 4621fe7730 was completely wrong. If Khepri reached its own timeout of 30 seconds (as of this writing), the mirrored supervisor would retry 50 times because it would not check the time spent. This means it would retry for 25 minutes. Nice. That retry would be terminated forcefully by the parent supervisor after 5 minutes if it was part of a shutdown.

[How]
This time, the code simply passes the error (timeout or something else) down to the following `case`. It will shut the mirrored supervisor down. This fixes very long RabbitMQ node termination (at least 5 minutes, sometimes more) in test suites.

An example to reproduce:

    gmake -C deps/rabbitmq_mqtt \
      RABBITMQ_METADATA_STORE=khepri \
      ct-v5 t=cluster_size_3:session_takeover_v3_v5

In this one, the third node of the cluster will take 5+ minutes to stop.
2025-06-03 12:23:18 +02:00 · 2025-06-03 12:23:18 +02:00 · 376dd2ca60
parent f882a28c71
commit 376dd2ca60
1 changed files with 10 additions and 20 deletions
--- a/deps/rabbit/src/mirrored_supervisor.erl
+++ b/deps/rabbit/src/mirrored_supervisor.erl
@ -345,10 +345,16 @@ handle_info({'DOWN', _Ref, process, Pid, _Reason},
                           child_order = ChildOrder}) ->
    %% No guarantee pg will have received the DOWN before us.
    R = case lists:sort(pg:get_members(Group)) -- [Pid] of
-            [O | _] -> ChildSpecs = retry_update_all(O, Pid),
-                       [start(Delegate, ChildSpec)
-                        || ChildSpec <- restore_child_order(ChildSpecs,
-                                                            ChildOrder)];
+            [O | _] -> ChildSpecs = update_all(O, Pid),
+                       case ChildSpecs of
+                           _ when is_list(ChildSpecs) ->
+                               [start(Delegate, ChildSpec)
+                                || ChildSpec <- restore_child_order(
+                                                  ChildSpecs,
+                                                  ChildOrder)];
+                           {error, _} ->
+                               [ChildSpecs]
+                       end;
            _       -> []
        end,
    case errors(R) of
@ -428,22 +434,6 @@ check_stop(Group, Delegate, Id) ->

 id({Id, _, _, _, _, _}) -> Id.

-retry_update_all(O, Pid) ->
-    retry_update_all(O, Pid, 10000).
-
-retry_update_all(O, Pid, TimeLeft) when TimeLeft > 0 ->
-    case update_all(O, Pid) of
-        List when is_list(List) ->
-            List;
-        {error, timeout} ->
-            Sleep = 200,
-            TimeLeft1 = TimeLeft - Sleep,
-            timer:sleep(Sleep),
-            retry_update_all(O, Pid, TimeLeft1)
-    end;
-retry_update_all(O, Pid, _TimeLeft) ->
-    update_all(O, Pid).
-
 update_all(Overall, OldOverall) ->
    rabbit_db_msup:update_all(Overall, OldOverall).