%% rabbitmq-server/deps/rabbit/test/crashing_queues_SUITE.erl
%%
%% 269 lines
%% 9.1 KiB
%% Erlang

%% This Source Code Form is subject to the terms of the Mozilla Public
%% License, v. 2.0. If a copy of the MPL was not distributed with this
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
%%
%% Copyright (c) 2007-2023 VMware, Inc. or its affiliates. All rights reserved.
%%
-module(crashing_queues_SUITE).
-include_lib("common_test/include/ct.hrl").
-include_lib("amqp_client/include/amqp_client.hrl").
-compile(export_all).
%% Common Test entry point: the suite consists of a single testcase group,
%% `cluster_size_2'.
all() ->
    [{group, cluster_size_2}].
%% Group definitions: the `cluster_size_2' group carries no group properties
%% and lists the three testcases of this suite.
groups() ->
    Testcases = [crashing_unmirrored,
                 crashing_mirrored,
                 give_up_after_repeated_crashes],
    [{cluster_size_2, [], Testcases}].
%% -------------------------------------------------------------------
%% Testsuite setup/teardown.
%% -------------------------------------------------------------------
%% Suite-level setup: logs the environment, then hands Config to the helper
%% setup steps and returns their result.
init_per_suite(Config) ->
    rabbit_ct_helpers:log_environment(),
    SuiteConfig = rabbit_ct_helpers:run_setup_steps(Config),
    SuiteConfig.
%% Suite-level teardown: returns whatever the helper teardown steps return
%% for Config.
end_per_suite(Config) ->
    Result = rabbit_ct_helpers:run_teardown_steps(Config),
    Result.
%% Group setup for `cluster_size_2': stores `rmq_nodes_count = 2' in Config.
%% init_per_testcase/2 reads this value back when computing the port base.
init_per_group(cluster_size_2, Config) ->
    NodeCount = {rmq_nodes_count, 2},
    rabbit_ct_helpers:set_config(Config, [NodeCount]).
%% Group teardown: nothing to clean up, Config is returned unchanged for
%% any group.
end_per_group(_Group, Config) ->
    Config.
%% Per-testcase setup.
%%
%% Marks the testcase as started, then extends Config with:
%%   * `rmq_nodename_suffix' - the testcase name;
%%   * `tcp_ports_base'      - `{skip_n_nodes, N}' where N is the testcase
%%                             number multiplied by the configured node count.
%% Finally runs the broker setup steps followed by the client setup steps and
%% returns the resulting configuration.
init_per_testcase(Testcase, Config) ->
    rabbit_ct_helpers:testcase_started(Config, Testcase),
    NodeCount = ?config(rmq_nodes_count, Config),
    Index = rabbit_ct_helpers:testcase_number(Config, ?MODULE, Testcase),
    Settings = [{rmq_nodename_suffix, Testcase},
                {tcp_ports_base, {skip_n_nodes, Index * NodeCount}}],
    TestcaseConfig = rabbit_ct_helpers:set_config(Config, Settings),
    Steps = rabbit_ct_broker_helpers:setup_steps() ++
        rabbit_ct_client_helpers:setup_steps(),
    rabbit_ct_helpers:run_steps(TestcaseConfig, Steps).
%% Per-testcase teardown: runs the client teardown steps followed by the
%% broker teardown steps, then marks the testcase as finished.
end_per_testcase(Testcase, Config) ->
    Steps = rabbit_ct_client_helpers:teardown_steps() ++
        rabbit_ct_broker_helpers:teardown_steps(),
    TornDown = rabbit_ct_helpers:run_steps(Config, Steps),
    rabbit_ct_helpers:testcase_finished(TornDown, Testcase).
%% -------------------------------------------------------------------
%% Testcases.
%% -------------------------------------------------------------------
%% Testcase: crash a queue declared through node A while a racer keeps
%% re-declaring it over a connection to node B.
%%
%% Runs test_queue_failure/6 twice on the same queue name, both times
%% expecting zero followers:
%%   * declared durable     - 1 message is expected after the crash;
%%   * declared non-durable - 0 messages are expected after the crash.
crashing_unmirrored(Config) ->
    [NodeA, NodeB] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
    Ch = rabbit_ct_client_helpers:open_channel(Config, NodeA),
    RaceConn = rabbit_ct_client_helpers:open_connection(Config, NodeB),
    Declare = #'queue.declare'{queue = <<"crashing_unmirrored-q">>},
    amqp_channel:call(Ch, #'confirm.select'{}),
    lists:foreach(
      fun({ExpectedMsgs, Durable}) ->
              test_queue_failure(NodeA, Ch, RaceConn, ExpectedMsgs, 0,
                                 Declare#'queue.declare'{durable = Durable})
      end,
      [{1, true}, {0, false}]),
    ok.
%% Testcase: same scenario as the unmirrored case, but with an HA policy
%% (pattern <<".*">>, <<"all">>) set through node A first.
%%
%% For the durable queue, 2 messages and 1 follower are expected after the
%% crash.
crashing_mirrored(Config) ->
    [NodeA, NodeB] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
    rabbit_ct_broker_helpers:set_ha_policy(Config, NodeA, <<".*">>, <<"all">>),
    Ch = rabbit_ct_client_helpers:open_channel(Config, NodeA),
    RaceConn = rabbit_ct_client_helpers:open_connection(Config, NodeB),
    amqp_channel:call(Ch, #'confirm.select'{}),
    Declare = #'queue.declare'{queue = <<"crashing_mirrored-q">>,
                               durable = true},
    test_queue_failure(NodeA, Ch, RaceConn, 2, 1, Declare),
    ok.
%% Shared scenario for the crashing_* testcases.
%%
%% Declares the queue described by Decl on Ch, publishes one transient and
%% one durable message, starts a racer that keeps issuing the same
%% declaration over RaceConn, and kills the queue process once. It then
%% asserts the expected message count (MsgCount) and follower count
%% (FollowerCount) and stops the racer.
%%
%% The queue is deleted in the `after' section, so it is removed whether or
%% not the assertions pass.
test_queue_failure(Node, Ch, RaceConn, MsgCount, FollowerCount, Decl) ->
    #'queue.declare_ok'{queue = Queue} = amqp_channel:call(Ch, Decl),
    try
        publish(Ch, Queue, transient),
        publish(Ch, Queue, durable),
        RacerPid = spawn_declare_racer(RaceConn, Decl),
        kill_queue(Node, Queue),
        assert_message_count(MsgCount, Ch, Queue),
        assert_follower_count(FollowerCount, Node, Queue),
        stop_declare_racer(RacerPid)
    after
        Delete = #'queue.delete'{queue = Queue},
        amqp_channel:call(Ch, Delete)
    end.
%% Testcase: kill a durable queue until it is reported as `crashed', then
%% check that it can be deleted and re-declared.
%%
%% Steps:
%%   1. declare the queue via node A, wait for state `running', publish;
%%   2. kill the queue process repeatedly until kill_queue_hard/2 reports
%%      that it has crashed for good;
%%   3. a further declare on the same channel must fail, and the queue
%%      state must become `crashed';
%%   4. delete and re-declare via node B; the state must become `running';
%%   5. stop node B and expect the state `down'.
give_up_after_repeated_crashes(Config) ->
    [NodeA, NodeB] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
    ChanA = rabbit_ct_client_helpers:open_channel(Config, NodeA),
    ChanB = rabbit_ct_client_helpers:open_channel(Config, NodeB),
    Queue = <<"give_up_after_repeated_crashes-q">>,
    DeclareDurable = #'queue.declare'{queue = Queue, durable = true},
    amqp_channel:call(ChanA, #'confirm.select'{}),
    amqp_channel:call(ChanA, DeclareDurable),
    await_state(NodeA, Queue, running),
    publish(ChanA, Queue, durable),
    kill_queue_hard(NodeA, Queue),
    %% Declaring against the crashed queue is expected to blow up.
    {'EXIT', _} = (catch amqp_channel:call(ChanA, DeclareDurable)),
    await_state(NodeA, Queue, crashed),
    amqp_channel:call(ChanB, #'queue.delete'{queue = Queue}),
    amqp_channel:call(ChanB, DeclareDurable),
    await_state(NodeA, Queue, running),
    %% Since it's convenient, also test absent queue status here.
    rabbit_ct_broker_helpers:stop_node(Config, NodeB),
    await_state(NodeA, Queue, down),
    ok.
%% Publishes one message to the default exchange with QName as the routing
%% key, then waits for publisher confirms on Ch.
%%
%% DelMode is `transient' or `durable' and is translated to the message's
%% delivery_mode by del_mode/1. Returns the result of
%% amqp_channel:wait_for_confirms/1.
publish(Ch, QName, DelMode) ->
    Props = #'P_basic'{delivery_mode = del_mode(DelMode)},
    Method = #'basic.publish'{exchange = <<>>, routing_key = QName},
    amqp_channel:cast(Ch, Method, #amqp_msg{props = Props}),
    amqp_channel:wait_for_confirms(Ch).
%% Maps the symbolic delivery mode used by publish/3 to the integer placed
%% in the message's delivery_mode property: `durable' -> 2, `transient' -> 1.
del_mode(durable)   -> 2;
del_mode(transient) -> 1.
%% Starts a linked process running declare_racer_loop/3, which repeatedly
%% issues Decl on new channels of Conn. The calling process is passed as the
%% parent so the loop can unlink from it when stopped. Returns the racer pid.
spawn_declare_racer(Conn, Decl) ->
    Parent = self(),
    Loop = fun() -> declare_racer_loop(Parent, Conn, Decl) end,
    spawn_link(Loop).
%% Asks the racer process to stop and blocks until it has terminated.
%%
%% The `stop' message is sent first and the monitor is set up afterwards;
%% the function returns `ok' once the monitor's 'DOWN' message arrives.
stop_declare_racer(Pid) ->
    Pid ! stop,
    Ref = erlang:monitor(process, Pid),
    receive
        {'DOWN', Ref, process, Pid, _Reason} ->
            ok
    end.
%% Body of the racer process: keeps trying to declare the queue until a
%% `stop' message is received, at which point it unlinks from Parent and
%% returns (ending the process).
%%
%% `after 0' makes the receive non-blocking, so a declare attempt is made
%% whenever no `stop' message is waiting in the mailbox.
declare_racer_loop(Parent, Conn, Decl) ->
    receive
        stop ->
            unlink(Parent)
    after 0 ->
            racer_declare_once(Conn, Decl),
            declare_racer_loop(Parent, Conn, Decl)
    end.

%% One declare attempt on a freshly opened channel of Conn.
%%
%% Exits are caught here because we might happen to hit the queue while it
%% is in the middle of recovering and thus explode with NOT_FOUND because it
%% is crashed. That doesn't matter: the racer only exists to try to fool the
%% recovery code anyway.
racer_declare_once(Conn, Decl) ->
    try
        case amqp_connection:open_channel(Conn) of
            {ok, Ch}   -> amqp_channel:call(Ch, Decl);
            closing    -> ok;
            {error, _} -> ok
        end
    catch
        exit:_ ->
            ok
    end.
%% Waits for the queue to reach State, using a 30000 ms polling budget.
%% See await_state/4.
await_state(Node, QName, State) ->
    Budget = 30000,
    await_state(Node, QName, State, Budget).
%% Polls the queue state on Node every 100 ms until it equals State.
%%
%% Time is the remaining budget in milliseconds; each unsuccessful poll
%% sleeps 100 ms and deducts 100 from it. Returns `ok' as soon as the state
%% matches. Once the budget is used up, exits with
%% `{timeout_awaiting_state, Expected, LastSeen}'.
%%
%% The budget is treated as exhausted when it is zero *or negative*, so a
%% Time that is not a multiple of 100 still terminates instead of stepping
%% past zero and polling forever.
await_state(Node, QName, State, Time) ->
    case state(Node, QName) of
        State ->
            ok;
        Other when Time =< 0 ->
            exit({timeout_awaiting_state, State, Other});
        _Other ->
            timer:sleep(100),
            await_state(Node, QName, State, Time - 100)
    end.
%% Returns the `state' info item of queue QName in vhost <<"/">>, as
%% reported by rabbit_amqqueue:info_all/2 on Node.
%%
%% Returns `undefined' when the vhost has no queues. The only other accepted
%% shape is a single entry whose name is the expected queue resource;
%% anything else fails with a case_clause error.
state(Node, QName) ->
    VHost = <<"/">>,
    Resource = rabbit_misc:r(VHost, queue, QName),
    case rpc:call(Node, rabbit_amqqueue, info_all, [VHost, [name, state]]) of
        [[{name, Resource}, {state, QState}]] ->
            QState;
        [] ->
            undefined
    end.
%% Kills the queue process over and over, pausing 100 ms between rounds,
%% until kill_queue/2 reports `crashed'. Returns `ok'.
kill_queue_hard(Node, QName) ->
    Outcome = kill_queue(Node, QName),
    case Outcome of
        crashed ->
            ok;
        _StillRestarting ->
            timer:sleep(100),
            kill_queue_hard(Node, QName)
    end.
%% Sends exit signal `boom' to the current queue process and waits until
%% queue_pid/2 reports something other than that pid. Returns the new
%% value, which is whatever queue_pid/2 returned (e.g. a pid or `crashed').
kill_queue(Node, QName) ->
    Victim = queue_pid(Node, QName),
    exit(Victim, boom),
    await_new_pid(Node, QName, Victim).
%% Resolves the current pid of queue QName as seen from Node.
%%
%% Returns:
%%   * the pid stored in the queue record when the record's state is
%%     anything other than `crashed';
%%   * for a `crashed' record, the result of crashed_queue_pid/3: the stored
%%     pid, the atom `crashed', or `{error, no_sup}'.
queue_pid(Node, QName) ->
    Q = lookup(Node, QName),
    QPid = amqqueue:get_pid(Q),
    #resource{virtual_host = VHost} = amqqueue:get_name(Q),
    case amqqueue:get_state(Q) of
        crashed -> crashed_queue_pid(Node, VHost, QPid);
        _       -> QPid
    end.

%% Decides what to report for a queue whose record is in state `crashed',
%% based on the vhost's queue supervisor.
%%
%% `{error, no_sup}' is returned both when the supervisor cannot be found
%% and when it has disappeared by the time its children are listed.
%% sup_child/2 can return that value, so it is handled here instead of
%% failing with a case_clause error.
crashed_queue_pid(Node, VHost, QPid) ->
    case rabbit_amqqueue_sup_sup:find_for_vhost(VHost, Node) of
        {error, {queue_supervisor_not_found, _}} ->
            {error, no_sup};
        {ok, SPid} ->
            case sup_child(Node, SPid) of
                {ok, _}           -> QPid;            %% restarting
                {error, no_child} -> crashed;         %% given up
                {error, no_sup}   -> {error, no_sup}
            end
    end.
%% Lists the children of supervisor Sup on Node via rpc.
%%
%% Returns `{ok, Child}' when there is exactly one child, `{error, no_child}'
%% when there are none, and `{error, no_sup}' when the call fails with
%% `noproc'. Any other result fails with a case_clause error.
sup_child(Node, Sup) ->
    Children = rpc:call(Node, supervisor, which_children, [Sup]),
    case Children of
        [] ->
            {error, no_child};
        [{_Id, Child, _Type, _Modules}] ->
            {ok, Child};
        {badrpc, {'EXIT', {noproc, _}}} ->
            {error, no_sup}
    end.
%% Fetches the queue record for QName in vhost <<"/">> from Node.
%% Fails with a badmatch error unless the remote lookup returns `{ok, Q}'.
lookup(Node, QName) ->
    Resource = rabbit_misc:r(<<"/">>, queue, QName),
    {ok, Q} = rpc:call(Node, rabbit_amqqueue, lookup, [Resource]),
    Q.
%% Polls queue_pid/2 every 10 ms until it returns something other than
%% OldPid, and returns that value. There is no timeout of its own.
await_new_pid(Node, QName, OldPid) ->
    case queue_pid(Node, QName) of
        Current when Current =:= OldPid ->
            timer:sleep(10),
            await_new_pid(Node, QName, OldPid);
        Current ->
            Current
    end.
%% Asserts that queue QName holds exactly Count messages, using a passive
%% declare on Ch. Fails with a badmatch error if the reported message_count
%% differs.
assert_message_count(Count, Ch, QName) ->
    Passive = #'queue.declare'{queue = QName, passive = true},
    #'queue.declare_ok'{message_count = Count} = amqp_channel:call(Ch, Passive).
%% Waits until queue QName reports exactly Count followers on Node.
%%
%% The follower list is read from the `slave_pids' info item; the atom ''
%% is counted as zero followers. While fewer than Count are reported, the
%% check is retried every 10 ms (there is no timeout of its own). If more
%% than Count are reported, exits with
%% `{too_many_replicas, Expected, Actual}'.
assert_follower_count(Count, Node, QName) ->
    Q = lookup(Node, QName),
    [{_, Pids}] = rpc:call(Node, rabbit_amqqueue, info, [Q, [slave_pids]]),
    Actual = if
                 Pids =:= '' -> 0;
                 true        -> length(Pids)
             end,
    if
        Actual =:= Count ->
            ok;
        Actual < Count ->
            timer:sleep(10),
            assert_follower_count(Count, Node, QName);
        true ->
            exit({too_many_replicas, Count, Actual})
    end.