rabbit_node_monitor: Notify `rabbit` is live when handling a `nodeup` message

[Why]
So far, when there was a network partition with Mnesia, the most popular
partition handling strategies restarted RabbitMQ nodes. Therefore,
`rabbit` would execute the boot steps and one of them would notify other
members of the cluster that "this RabbitMQ node is live".

With Khepri, nodes are not restarted anymore and thus, boot steps are
not executed at the end of a network partition. As a consequence, other
members are not notified that a member is back online.

[How]
When the node monitor receives the `nodeup` message (managed by Erlang,
meaning that "a remote Erlang node just connected to this node through
Erlang distribution"), a `node_up` message is sent to all cluster
members (meaning "RabbitMQ is now running on the originating node").
Yeah, very poor naming...

This lets the RabbitMQ node monitor know when other nodes running
RabbitMQ are back online and react accordingly.

If a node is restarted, it means that another node could receive the
`node_up` message twice. The actions behind it must be idempotent.
This commit is contained in:
Jean-Sébastien Pédron 2025-09-29 15:33:34 +02:00
parent 1158aca30e
commit 2c1b75276e
No known key found for this signature in database
GPG Key ID: 39E99761A5FD94CC
1 changed file with 22 additions and 10 deletions

View File

@ -430,16 +430,8 @@ handle_call(status, _From, State = #state{partitions = Partitions}) ->
handle_call(_Request, _From, State) -> handle_call(_Request, _From, State) ->
{noreply, State}. {noreply, State}.
handle_cast(notify_node_up, State = #state{guid = GUID}) -> handle_cast(notify_node_up, State) ->
Nodes = rabbit_nodes:list_reachable() -- [node()], do_notify_node_up(State),
gen_server:abcast(Nodes, ?SERVER,
{node_up, node(), rabbit_db_cluster:node_type(), GUID}),
%% register other active rabbits with this rabbit
DiskNodes = rabbit_db_cluster:disc_members(),
[gen_server:cast(?SERVER, {node_up, N, case lists:member(N, DiskNodes) of
true -> disc;
false -> ram
end}) || N <- Nodes],
{noreply, State}; {noreply, State};
%%---------------------------------------------------------------------------- %%----------------------------------------------------------------------------
@ -665,6 +657,12 @@ handle_info({nodedown, Node, Info}, State) ->
handle_info({nodeup, Node, _Info}, State) -> handle_info({nodeup, Node, _Info}, State) ->
?LOG_INFO("node ~tp up", [Node]), ?LOG_INFO("node ~tp up", [Node]),
%% We notify that `rabbit' is up here too (in addition to the message sent
%% explicitly by a boot step). That's because nodes may go down then up
%% during a network partition, and with Khepri, nodes are not restarted
%% (unlike with some partition handling strategies used with Mnesia), and
%% thus the boot steps are not executed.
do_notify_node_up(State),
{noreply, State}; {noreply, State};
handle_info({mnesia_system_event, handle_info({mnesia_system_event,
@ -854,6 +852,20 @@ wait_for_cluster_recovery(Condition) ->
wait_for_cluster_recovery(Condition) wait_for_cluster_recovery(Condition)
end. end.
%% @doc Tell every other reachable cluster member that RabbitMQ is
%% running on this node, and locally register the other members in turn.
%%
%% Sends `{node_up, node(), NodeType, GUID}' to the node monitor on all
%% reachable nodes except this one, then casts a `{node_up, Node, Type}'
%% message to the local monitor for each of those nodes so this node
%% learns about the other live rabbits. Receivers must treat these
%% messages as idempotent, since a node may be announced more than once.
do_notify_node_up(#state{guid = GUID}) ->
    OtherNodes = rabbit_nodes:list_reachable() -- [node()],
    NodeType = rabbit_db_cluster:node_type(),
    gen_server:abcast(OtherNodes, ?SERVER,
                      {node_up, node(), NodeType, GUID}),
    %% register other active rabbits with this rabbit
    DiscNodes = rabbit_db_cluster:disc_members(),
    lists:foreach(
      fun(Remote) ->
              Type = case lists:member(Remote, DiscNodes) of
                         true  -> disc;
                         false -> ram
                     end,
              gen_server:cast(?SERVER, {node_up, Remote, Type})
      end, OtherNodes),
    ok.
handle_dead_rabbit(Node, State) -> handle_dead_rabbit(Node, State) ->
%% TODO: This may turn out to be a performance hog when there are %% TODO: This may turn out to be a performance hog when there are
%% lots of nodes. We really only need to execute some of these %% lots of nodes. We really only need to execute some of these