Don't perform CMQ leadership transfer when entering maintenance mode

The time this operation can take in clusters with a lot of classic
mirrored queues (say, 10s or 100s of thousands) can be prohibitive for
upgrades.

Upgrades can use a health check to ensure that there are in-sync
replicas before entering maintenance mode, in which case
the transfer is not really necessary.

All of the above is more obvious with the recent changes in #2749.
This commit is contained in:
Michael Klishin 2021-01-27 18:52:17 +03:00
parent 50761cbe03
commit c7b9c39352
No known key found for this signature in database
GPG Key ID: E80EDCFA0CDB21EE
2 changed files with 9 additions and 31 deletions

View File

@ -88,7 +88,8 @@ do_drain() ->
ReadableCandidates = readable_candidate_list(TransferCandidates), ReadableCandidates = readable_candidate_list(TransferCandidates),
rabbit_log:info("Node will transfer primary replicas of its queues to ~b peers: ~s", rabbit_log:info("Node will transfer primary replicas of its queues to ~b peers: ~s",
[length(TransferCandidates), ReadableCandidates]), [length(TransferCandidates), ReadableCandidates]),
transfer_leadership_of_classic_mirrored_queues(TransferCandidates), %% Note: only QQ leadership is transferred because it is a reasonably quick thing to do for a lot of queues
%% in the cluster, unlike with CMQs.
transfer_leadership_of_quorum_queues(TransferCandidates), transfer_leadership_of_quorum_queues(TransferCandidates),
stop_local_quorum_queue_followers(), stop_local_quorum_queue_followers(),
@ -248,7 +249,12 @@ transfer_leadership_of_quorum_queues(_TransferCandidates) ->
rabbit_log:info("Leadership transfer for quorum queues hosted on this node has been initiated"). rabbit_log:info("Leadership transfer for quorum queues hosted on this node has been initiated").
-spec transfer_leadership_of_classic_mirrored_queues([node()]) -> ok. -spec transfer_leadership_of_classic_mirrored_queues([node()]) -> ok.
transfer_leadership_of_classic_mirrored_queues([]) -> %% This function is no longer used by maintenance mode. We retain it in case
%% classic mirrored queue leadership transfer is reconsidered.
%%
%% With a lot of CMQs in a cluster, the transfer procedure can take prohibitively long
%% for a pre-upgrade task.
transfer_leadership_of_classic_mirrored_queues([]) ->
rabbit_log:warning("Skipping leadership transfer of classic mirrored queues: no candidate " rabbit_log:warning("Skipping leadership transfer of classic mirrored queues: no candidate "
"(online, not under maintenance) nodes to transfer to!"); "(online, not under maintenance) nodes to transfer to!");
transfer_leadership_of_classic_mirrored_queues(TransferCandidates) -> transfer_leadership_of_classic_mirrored_queues(TransferCandidates) ->

View File

@ -24,8 +24,7 @@ groups() ->
{cluster_size_3, [], [ {cluster_size_3, [], [
maintenance_mode_status, maintenance_mode_status,
listener_suspension_status, listener_suspension_status,
client_connection_closure, client_connection_closure
classic_mirrored_queue_leadership_transfer
]}, ]},
{quorum_queues, [], [ {quorum_queues, [], [
quorum_queue_leadership_transfer quorum_queue_leadership_transfer
@ -211,33 +210,6 @@ client_connection_closure(Config) ->
rabbit_ct_broker_helpers:revive_node(Config, A). rabbit_ct_broker_helpers:revive_node(Config, A).
classic_mirrored_queue_leadership_transfer(Config) ->
[A | _] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
ct:pal("Picked node ~s for maintenance tests...", [A]),
rabbit_ct_helpers:await_condition(
fun () -> not rabbit_ct_broker_helpers:is_being_drained_local_read(Config, A) end, 10000),
PolicyPattern = <<"^cq.mirrored">>,
rabbit_ct_broker_helpers:set_ha_policy(Config, A, PolicyPattern, <<"all">>),
Conn = rabbit_ct_client_helpers:open_connection(Config, A),
{ok, Ch} = amqp_connection:open_channel(Conn),
QName = <<"cq.mirrored.1">>,
amqp_channel:call(Ch, #'queue.declare'{queue = QName, durable = true}),
?assertEqual(1, length(rabbit_ct_broker_helpers:rpc(Config, A, rabbit_amqqueue, list_local, [<<"/">>]))),
rabbit_ct_broker_helpers:drain_node(Config, A),
rabbit_ct_helpers:await_condition(
fun () -> rabbit_ct_broker_helpers:is_being_drained_local_read(Config, A) end, 10000),
?assertEqual(0, length(rabbit_ct_broker_helpers:rpc(Config, A, rabbit_amqqueue, list_local, [<<"/">>]))),
rabbit_ct_broker_helpers:revive_node(Config, A),
%% rabbit_ct_broker_helpers:set_ha_policy/4 uses pattern for policy name
rabbit_ct_broker_helpers:clear_policy(Config, A, PolicyPattern).
quorum_queue_leadership_transfer(Config) -> quorum_queue_leadership_transfer(Config) ->
[A | _] = Nodenames = rabbit_ct_broker_helpers:get_node_configs( [A | _] = Nodenames = rabbit_ct_broker_helpers:get_node_configs(
Config, nodename), Config, nodename),