Fix stats DB failover in the graceful shutdown case.

Simon MacMullen 2010-11-05 18:04:42 +00:00
parent a7f7201b43
commit 80327d0a81
1 changed file with 20 additions and 10 deletions


@@ -36,9 +36,18 @@ acquire_monitor() ->
     case global:whereis_name(rabbit_mgmt_db) of
         undefined ->
             timer:sleep(1000),
-            acquire_monitor();
+            gen_server:cast(?MODULE, acquire_monitor);
         Pid ->
-            erlang:monitor(process, Pid)
+            %% Don't monitor on the same node - otherwise we can restart
+            %% it just as we're going down. Then everyone else will
+            %% lose the race at the first restart but not be able to
+            %% acquire the monitor since the pid never really comes
+            %% back up.
+            SelfNode = node(self()),
+            case node(Pid) of
+                SelfNode -> ok;
+                _        -> erlang:monitor(process, Pid)
+            end
     end.
 %%----------------------------------------------------------------------------
@@ -49,22 +58,23 @@ init([]) ->
 handle_call(_Request, _From, State) ->
     {reply, not_understood, State}.
 
+handle_cast(acquire_monitor, State) ->
+    acquire_monitor(),
+    {noreply, State};
 handle_cast(_Request, State) ->
     {noreply, State}.
 
-%% We're mainly interested in the noconnection reason, since otherwise
-%% the supervisor on the remote node will restart the process
-handle_info({'DOWN', _MonitorRef, _Type, _Object, noconnection}, _State) ->
+%% In theory we're only interested in the noconnection case here, since
+%% the global sup will restart it normally; but during graceful shutdown
+%% the process goes down normally first (and we never get another
+%% message), so we have to try to restart it anyway.
+handle_info({'DOWN', _MonitorRef, _Type, _Object, _Info}, _State) ->
     rabbit_log:info("Statistics database node down.~n", []),
     ok = rabbit_sup:stop_child(rabbit_mgmt_global_sup),
     ok = rabbit_sup:start_child(rabbit_mgmt_global_sup),
     {noreply, acquire_monitor()};
-handle_info({'DOWN', _MonitorRef, _Type, _Object, _Info}, _State) ->
-    rabbit_log:info(
-      "Statistics database down, reacquiring monitor.~n", []),
-    {noreply, acquire_monitor()};
 handle_info(_Info, State) ->
     {noreply, State}.
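
For reference, a minimal self-contained sketch of how the watcher reads after this commit, assembled from the two hunks above. The module name rabbit_mgmt_db_watcher, start_link/0 and init/1 are hypothetical scaffolding added for illustration; only the acquire_monitor/0, handle_cast/2 and handle_info/2 bodies are taken from the diff.

-module(rabbit_mgmt_db_watcher).  %% hypothetical name, for illustration only

-behaviour(gen_server).

-export([start_link/0]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

start_link() ->
    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

init([]) ->
    %% Assumption: kick off the first monitor acquisition at startup.
    {ok, acquire_monitor()}.

acquire_monitor() ->
    case global:whereis_name(rabbit_mgmt_db) of
        undefined ->
            %% DB not globally registered yet: wait, then retry via a
            %% cast so the retry goes back through the gen_server loop.
            timer:sleep(1000),
            gen_server:cast(?MODULE, acquire_monitor);
        Pid ->
            %% Never monitor a DB running on this node, so we cannot
            %% restart it while this node is shutting down (see above).
            SelfNode = node(self()),
            case node(Pid) of
                SelfNode -> ok;
                _        -> erlang:monitor(process, Pid)
            end
    end.

handle_call(_Request, _From, State) ->
    {reply, not_understood, State}.

handle_cast(acquire_monitor, State) ->
    acquire_monitor(),
    {noreply, State};
handle_cast(_Request, State) ->
    {noreply, State}.

handle_info({'DOWN', _MonitorRef, _Type, _Object, _Info}, _State) ->
    %% Fires on noconnection and also on a normal 'DOWN' during graceful
    %% shutdown of the DB node; in both cases restart the global sup here
    %% and then try to reacquire (or take over) the stats DB.
    rabbit_log:info("Statistics database node down.~n", []),
    ok = rabbit_sup:stop_child(rabbit_mgmt_global_sup),
    ok = rabbit_sup:start_child(rabbit_mgmt_global_sup),
    {noreply, acquire_monitor()};
handle_info(_Info, State) ->
    {noreply, State}.

terminate(_Reason, _State) ->
    ok.

code_change(_OldVsn, State, _Extra) ->
    {ok, State}.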