rabbit_db: Restart Ra systems after reset during join

[Why] When the local node joins a remote node, it resets its own data first. This includes the files of the Ra systems (`quorum` and `coordination`). When the CLI is used, that's fine because the `rabbit` app is stopped, and therefore so are the Ra systems. However, when the join is performed as part of peer discovery, the node is still booting: the Ra systems were started earlier because they are required to run Khepri. Therefore, the reset deletes files that are still in use, which breaks the Ra systems. [How] The Ra systems are stopped just before the reset (if the join is performed as part of peer discovery) and restarted afterwards.
2023-12-28 11:39:47 +01:00 · 2023-12-28 11:39:47 +01:00 · cee181b7a8
parent dfa2117719
commit cee181b7a8
1 changed file with 35 additions and 1 deletion
--- a/deps/rabbit/src/rabbit_db_cluster.erl
+++ b/deps/rabbit/src/rabbit_db_cluster.erl
@ -101,11 +101,22 @@ join(RemoteNode, NodeType)
            %% database because we might change it during the join.
            RestartMnesia = rabbit_mnesia:is_running(),
            RestartFFCtl = rabbit_ff_controller:is_running(),
+            RestartRaSystems = rabbit_ra_systems:are_running(),
            RestartRabbit = rabbit:is_running(),
            case RestartRabbit of
                true ->
                    rabbit:stop();
                false ->
+                    %% The Ra systems were started before we initialize the
+                    %% database (because Khepri depends on one of them).
+                    %% Therefore, there are files in the data directory. They
+                    %% will go away with the reset and we will need to restart
+                    %% Ra systems afterwards.
+                    case RestartRaSystems of
+                        true  -> ok = rabbit_ra_systems:ensure_stopped();
+                        false -> ok
+                    end,
+
                    case RestartFFCtl of
                        true ->
                            ok = rabbit_ff_controller:wait_for_task_and_stop();
@ -136,6 +147,30 @@ join(RemoteNode, NodeType)
                rabbit_ff_registry_factory:release_state_change_lock()
            end,

+            %% After the regular reset, we also reset Mnesia specifically if
+            %% it is meant to be used. That's because we may switch back from
+            %% Khepri to Mnesia. To be safe, remove possibly stale files from
+            %% a previous instance where Mnesia was used.
+            case rabbit_khepri:is_enabled(RemoteNode) of
+                true  -> ok;
+                false -> ok = rabbit_mnesia:reset_gracefully()
+            end,
+
+            %% Now that the files are all gone after the reset above, restart
+            %% the Ra systems. They will recreate their folder in the process.
+            case RestartRabbit of
+                true ->
+                    ok;
+                false ->
+                    case RestartRaSystems of
+                        true ->
+                            ok = rabbit_ra_systems:ensure_started(),
+                            ok = rabbit_khepri:setup();
+                        false ->
+                            ok
+                    end
+            end,
+
            ?LOG_INFO(
               "DB: joining cluster using remote nodes:~n~tp", [ClusterNodes],
               #{domain => ?RMQLOG_DOMAIN_DB}),
@ -182,7 +217,6 @@ join(RemoteNode, NodeType)
    end.

 join_using_mnesia(ClusterNodes, NodeType) when is_list(ClusterNodes) ->
-    ok = rabbit_mnesia:reset_gracefully(),
    rabbit_mnesia:join_cluster(ClusterNodes, NodeType).

 join_using_khepri(ClusterNodes, disc) ->