rabbit_db: Restart Ra systems after reset during join
[Why] When the local node joins a remote node, it resets its own data first. This includes the files of the Ra systems (`quorum` and `coordination`). When the join is triggered from the CLI, that's fine because the `rabbit` app is stopped, and so are the Ra systems. However, when the join is done as part of peer discovery, the node is still booting: the Ra systems were started earlier because they are required to run Khepri. Therefore, the reset deletes files that are still in use, which breaks the Ra systems. [How] The Ra systems are stopped just before the reset (if the join is performed as part of peer discovery) and are restarted afterwards.
This commit is contained in:
parent
dfa2117719
commit
cee181b7a8
|
|
@ -101,11 +101,22 @@ join(RemoteNode, NodeType)
|
|||
%% database because we might change it during the join.
|
||||
RestartMnesia = rabbit_mnesia:is_running(),
|
||||
RestartFFCtl = rabbit_ff_controller:is_running(),
|
||||
RestartRaSystems = rabbit_ra_systems:are_running(),
|
||||
RestartRabbit = rabbit:is_running(),
|
||||
case RestartRabbit of
|
||||
true ->
|
||||
rabbit:stop();
|
||||
false ->
|
||||
%% The Ra systems were started before we initialize the
|
||||
%% database (because Khepri depends on one of them).
|
||||
%% Therefore, there are files in the data directory. They
|
||||
%% will go away with the reset and we will need to restart
|
||||
%% Ra systems afterwards.
|
||||
case RestartRaSystems of
|
||||
true -> ok = rabbit_ra_systems:ensure_stopped();
|
||||
false -> ok
|
||||
end,
|
||||
|
||||
case RestartFFCtl of
|
||||
true ->
|
||||
ok = rabbit_ff_controller:wait_for_task_and_stop();
|
||||
|
|
@ -136,6 +147,30 @@ join(RemoteNode, NodeType)
|
|||
rabbit_ff_registry_factory:release_state_change_lock()
|
||||
end,
|
||||
|
||||
%% After the regular reset, we also reset Mnesia specifically if
|
||||
%% it is meant to be used. That's because we may switch back from
|
||||
%% Khepri to Mnesia. To be safe, remove possibly stale files from
|
||||
%% a previous instance where Mnesia was used.
|
||||
case rabbit_khepri:is_enabled(RemoteNode) of
|
||||
true -> ok;
|
||||
false -> ok = rabbit_mnesia:reset_gracefully()
|
||||
end,
|
||||
|
||||
%% Now that the files are all gone after the reset above, restart
|
||||
%% the Ra systems. They will recreate their folder in the process.
|
||||
case RestartRabbit of
|
||||
true ->
|
||||
ok;
|
||||
false ->
|
||||
case RestartRaSystems of
|
||||
true ->
|
||||
ok = rabbit_ra_systems:ensure_started(),
|
||||
ok = rabbit_khepri:setup();
|
||||
false ->
|
||||
ok
|
||||
end
|
||||
end,
|
||||
|
||||
?LOG_INFO(
|
||||
"DB: joining cluster using remote nodes:~n~tp", [ClusterNodes],
|
||||
#{domain => ?RMQLOG_DOMAIN_DB}),
|
||||
|
|
@ -182,7 +217,6 @@ join(RemoteNode, NodeType)
|
|||
end.
|
||||
|
||||
join_using_mnesia(ClusterNodes, NodeType) when is_list(ClusterNodes) ->
|
||||
ok = rabbit_mnesia:reset_gracefully(),
|
||||
rabbit_mnesia:join_cluster(ClusterNodes, NodeType).
|
||||
|
||||
join_using_khepri(ClusterNodes, disc) ->
|
||||
|
|
|
|||
Loading…
Reference in New Issue