rabbit_db: Restart Ra systems after reset during join

[Why]
When the local node joins a remote node, it resets its own data first.
This includes the files of the Ra systems (`quorum` and `coordination`).

When the CLI is used, that's fine because the `rabbit` app is stopped,
and thus so are the Ra systems.

However, when this is done as part of peer discovery, the node is
booting: the Ra systems were started earlier because they are required
to run Khepri. Therefore, the reset deletes files that are still in
use, which breaks the Ra systems.

[How]
The Ra systems are stopped just before the reset (if the join is
performed as part of peer discovery) and they are restarted afterwards.
This commit is contained in:
Jean-Sébastien Pédron 2023-12-28 11:39:47 +01:00
parent dfa2117719
commit cee181b7a8
No known key found for this signature in database
GPG Key ID: 39E99761A5FD94CC
1 changed files with 35 additions and 1 deletions

View File

@ -101,11 +101,22 @@ join(RemoteNode, NodeType)
%% database because we might change it during the join.
RestartMnesia = rabbit_mnesia:is_running(),
RestartFFCtl = rabbit_ff_controller:is_running(),
RestartRaSystems = rabbit_ra_systems:are_running(),
RestartRabbit = rabbit:is_running(),
case RestartRabbit of
true ->
rabbit:stop();
false ->
%% The Ra systems were started before we initialize the
%% database (because Khepri depends on one of them).
%% Therefore, there are files in the data directory. They
%% will go away with the reset and we will need to restart
%% Ra systems afterwards.
case RestartRaSystems of
true -> ok = rabbit_ra_systems:ensure_stopped();
false -> ok
end,
case RestartFFCtl of
true ->
ok = rabbit_ff_controller:wait_for_task_and_stop();
@ -136,6 +147,30 @@ join(RemoteNode, NodeType)
rabbit_ff_registry_factory:release_state_change_lock()
end,
%% After the regular reset, we also reset Mnesia specifically if
%% it is meant to be used. That's because we may switch back from
%% Khepri to Mnesia. To be safe, remove possibly stale files from
%% a previous instance where Mnesia was used.
case rabbit_khepri:is_enabled(RemoteNode) of
true -> ok;
false -> ok = rabbit_mnesia:reset_gracefully()
end,
%% Now that the files are all gone after the reset above, restart
%% the Ra systems. They will recreate their folder in the process.
case RestartRabbit of
true ->
ok;
false ->
case RestartRaSystems of
true ->
ok = rabbit_ra_systems:ensure_started(),
ok = rabbit_khepri:setup();
false ->
ok
end
end,
?LOG_INFO(
"DB: joining cluster using remote nodes:~n~tp", [ClusterNodes],
#{domain => ?RMQLOG_DOMAIN_DB}),
@ -182,7 +217,6 @@ join(RemoteNode, NodeType)
end.
%% Joins the cluster made of `ClusterNodes' through `rabbit_mnesia', as a
%% node of type `NodeType'. The guard only accepts a list for `ClusterNodes'.
%% Returns whatever `rabbit_mnesia:join_cluster/2' returns.
join_using_mnesia(ClusterNodes, NodeType) when is_list(ClusterNodes) ->
%% Asserts that the reset returned `ok'; any other value crashes with a
%% `badmatch' before the join is attempted.
%% NOTE(review): join/2 appears to already call
%% `rabbit_mnesia:reset_gracefully/0' when Khepri is not enabled on the
%% remote node -- confirm whether this second reset is still wanted here.
ok = rabbit_mnesia:reset_gracefully(),
rabbit_mnesia:join_cluster(ClusterNodes, NodeType).
join_using_khepri(ClusterNodes, disc) ->