Merge pull request #13643 from rabbitmq/su_aws/try_to_leave_cluster_before_joining

Allow a previously reset node to rejoin its original cluster
This commit is contained in:
Michael Klishin 2025-04-01 13:20:26 -04:00 committed by GitHub
commit e83c286367
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 32 additions and 13 deletions

View File

@ -50,7 +50,7 @@ ensure_feature_flags_are_in_sync(Nodes, NodeIsVirgin) ->
RemoteNode :: node(),
Ret :: Ok | Error,
Ok :: {ok, [node()]} | {ok, already_member},
Error :: {error, {inconsistent_cluster, string()}}.
Error :: {error, {inconsistent_cluster, string()} | {error, {erpc, noconnection}}}.
can_join(RemoteNode) ->
?LOG_INFO(
@ -82,7 +82,7 @@ can_join_using_khepri(RemoteNode) ->
NodeType :: node_type(),
Ret :: Ok | Error,
Ok :: ok | {ok, already_member},
Error :: {error, {inconsistent_cluster, string()}}.
Error :: {error, {inconsistent_cluster, string()} | {error, {erpc, noconnection}}}.
%% @doc Adds this node to a cluster using `RemoteNode' to reach it.
join(ThisNode, _NodeType) when ThisNode =:= node() ->
@ -214,6 +214,22 @@ join(RemoteNode, NodeType)
end;
{ok, already_member} ->
{ok, already_member};
{error, {inconsistent_cluster, _Msg}} = Error ->
case rabbit_khepri:is_enabled() of
true ->
Error;
false ->
%% rabbit_mnesia:can_join_cluster/1 reported inconsistent_cluster
%% because RemoteNode thinks this node is already in its cluster.
%% Attempt to leave RemoteNode's cluster (the discovery cluster)
%% and then simply retry the join operation.
rabbit_log:info("Mnesia: node ~tp thinks it's clustered "
"with node ~tp, but ~tp disagrees. ~tp will ask "
"to leave the cluster and try again.",
[RemoteNode, node(), node(), node()]),
ok = rabbit_mnesia:leave_then_rediscover_cluster(RemoteNode),
join(RemoteNode, NodeType)
end;
{error, _} = Error ->
Error
end.

View File

@ -73,7 +73,7 @@
-export([node_info/0, remove_node_if_mnesia_running/1]).
%% Used internally in `rabbit_db_cluster'.
-export([members/0]).
-export([members/0, leave_then_rediscover_cluster/1]).
%% Used internally in `rabbit_khepri'.
-export([mnesia_and_msg_store_files/0]).
@ -155,7 +155,7 @@ init() ->
%% we cluster to its cluster.
-spec can_join_cluster(node())
-> {ok, [node()]} | {ok, already_member} | {error, {inconsistent_cluster, string()}}.
-> {ok, [node()]} | {ok, already_member} | {error, {inconsistent_cluster, string()} | {error, {erpc, noconnection}}}.
can_join_cluster(DiscoveryNode) ->
ensure_mnesia_dir(),
@ -179,7 +179,6 @@ can_join_cluster(DiscoveryNode) ->
{ok, already_member};
false ->
Msg = format_inconsistent_cluster_message(DiscoveryNode, node()),
rabbit_log:error(Msg),
{error, {inconsistent_cluster, Msg}}
end
end.
@ -923,15 +922,19 @@ remove_node_if_mnesia_running(Node) ->
end
end.
%% Asks the other cluster members to drop this node from the cluster.
%%
%% The candidate list is `cluster_nodes(all)' filtered through
%% `rabbit_nodes:nodes_excl_me/1'. An empty list returns `ok' immediately.
%% Otherwise `leave_cluster/1' is tried on each node in turn; `lists:any/2'
%% stops at the first node for which it returns `true'. If no node returns
%% `true', the result is `e(no_running_cluster_nodes)'.
%% NOTE(review): `e/1' is defined elsewhere; presumably it raises -- confirm.
leave_cluster() ->
case rabbit_nodes:nodes_excl_me(cluster_nodes(all)) of
[] -> ok;
AllNodes -> case lists:any(fun leave_cluster/1, AllNodes) of
true -> ok;
false -> e(no_running_cluster_nodes)
end
end.
%% Leaves the cluster that `DiscoveryNode' belongs to, using a membership list
%% obtained from `discover_cluster([DiscoveryNode])' instead of this node's
%% own `cluster_nodes(all)' view.
%%
%% Only the first element of the 3-tuple returned by `discover_cluster/1' is
%% used. It is filtered through `rabbit_nodes:nodes_excl_me/1' and handed to
%% `leave_cluster/1'. The result is whatever `leave_cluster/1' returns
%% (`ok' for an empty list or a successful leave).
%% NOTE(review): any failure inside `discover_cluster/1' is not handled
%% here -- confirm that callers expect it to propagate.
leave_then_rediscover_cluster(DiscoveryNode) ->
{ClusterNodes, _, _} = discover_cluster([DiscoveryNode]),
leave_cluster(rabbit_nodes:nodes_excl_me(ClusterNodes)).
%% Leaves the cluster as known to this node: takes `cluster_nodes(all)',
%% filters it through `rabbit_nodes:nodes_excl_me/1', and delegates to
%% `leave_cluster/1' with the resulting node list.
leave_cluster() ->
leave_cluster(rabbit_nodes:nodes_excl_me(cluster_nodes(all))).
leave_cluster([]) ->
ok;
leave_cluster(Nodes) when is_list(Nodes) ->
case lists:any(fun leave_cluster/1, Nodes) of
true -> ok;
false -> e(no_running_cluster_nodes)
end;
leave_cluster(Node) ->
case rpc:call(Node,
rabbit_mnesia, remove_node_if_mnesia_running, [node()]) of