%% This Source Code Form is subject to the terms of the Mozilla Public
%% License, v. 2.0. If a copy of the MPL was not distributed with this
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
%%
%% Copyright (c) 2007-2025 Broadcom. All Rights Reserved. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
%%

-module(cluster_SUITE).

-compile([export_all, nowarn_export_all]).

-include_lib("eunit/include/eunit.hrl").

-import(util, [expect_publishes/3,
               connect/3,
               connect/4,
               await_exit/1]).

-import(rabbit_ct_broker_helpers,
        [setup_steps/0,
         teardown_steps/0,
         get_node_config/3,
         rabbitmqctl/3,
         rpc/4, rpc/5,
         stop_node/2
        ]).

-import(rabbit_ct_helpers,
        [eventually/3]).

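%% Options passed to the connect/3,4 helpers: deliberately short connect and
%% ack timeouts so that connection failures are detected quickly in the tests.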
-define(OPTS, [{connect_timeout, 1},
               {ack_timeout, 1}]).

all() ->
    [
     {group, v4},
     {group, v5}
    ].

groups() ->
    [
     {v4, [], cluster_size_5()},
     {v5, [], cluster_size_5()}
    ].

cluster_size_5() ->
    [
     connection_id_tracking,
     connection_id_tracking_on_nodedown
    ].

%% -------------------------------------------------------------------
%% Testsuite setup/teardown.
%% -------------------------------------------------------------------

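%% Broker configuration applied to the nodes: collect basic statistics at a
%% 100 ms interval.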
merge_app_env(Config) ->
    rabbit_ct_helpers:merge_app_env(
      Config,
      {rabbit, [
                {collect_statistics, basic},
                {collect_statistics_interval, 100}
               ]}).

init_per_suite(Config) ->
    rabbit_ct_helpers:log_environment(),
    rabbit_ct_helpers:run_setup_steps(Config).

end_per_suite(Config) ->
    rabbit_ct_helpers:run_teardown_steps(Config).

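%% Each group runs against a 5-node cluster; the group name (v4 | v5) is
%% stored as mqtt_version in the test config for the connect helpers.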
init_per_group(Group, Config) ->
    rabbit_ct_helpers:set_config(
      Config, [{rmq_nodes_count, 5},
               {mqtt_version, Group}]).

end_per_group(_, Config) ->
    Config.

init_per_testcase(Testcase, Config) ->
    rabbit_ct_helpers:testcase_started(Config, Testcase),
    rabbit_ct_helpers:log_environment(),
    Config1 = rabbit_ct_helpers:set_config(Config, [
        {rmq_nodename_suffix, Testcase},
        {rmq_nodes_clustered, true}
      ]),
    rabbit_ct_helpers:run_setup_steps(
      Config1,
      [fun merge_app_env/1] ++
      setup_steps() ++
      rabbit_ct_client_helpers:setup_steps()).

end_per_testcase(Testcase, Config) ->
    rabbit_ct_helpers:run_steps(Config,
                                rabbit_ct_client_helpers:teardown_steps() ++
                                teardown_steps()),
    rabbit_ct_helpers:testcase_finished(Config, Testcase).

%% -------------------------------------------------------------------
%% Test cases
%% -------------------------------------------------------------------

%% Note about running this testsuite in a mixed-versions cluster:
%% All even-numbered nodes will use the same code base when using a
%% secondary Umbrella. Odd-numbered nodes might use an incompatible code
%% base. When cluster-wide client ID tracking was introduced, it was not
%% put behind a feature flag because there was no need for one. Here, we
%% don't have a way to ensure that all nodes participate in client ID
%% tracking. However, those using the same code should. That's why we
%% limit our RPC calls to those nodes.
%%
%% That's also the reason why we use a 5-node cluster: with nodes 2 and
%% 4 possibly not participating, we are left with nodes 1, 3 and 5, i.e.
%% the 3 nodes Ra needs as a minimum to operate in proper conditions.

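%% [MQTT-3.1.4-2]: a client connecting with a client ID that is already in
%% use must cause the existing client to be disconnected. Verify this both
%% when reconnecting to the same node and to a different cluster node.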
connection_id_tracking(Config) ->
    Id = <<"duplicate-id">>,
    C1 = connect(Id, Config, 0, ?OPTS),
    {ok, _, _} = emqtt:subscribe(C1, <<"TopicA">>, qos0),
    ok = emqtt:publish(C1, <<"TopicA">>, <<"Payload">>),
    ok = expect_publishes(C1, <<"TopicA">>, [<<"Payload">>]),

    %% there's one connection
    assert_connection_count(Config, 4, 1),

    %% connect to the same node (A or 0)
    process_flag(trap_exit, true),
    C2 = connect(Id, Config, 0, ?OPTS),
    await_exit(C1),
    assert_connection_count(Config, 4, 1),

    %% connect to a different node (C or 2)
    C3 = connect(Id, Config, 2, ?OPTS),
    await_exit(C2),
    assert_connection_count(Config, 4, 1),
    ok = emqtt:disconnect(C3).

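%% When the node hosting an MQTT connection is stopped, the connection (and
%% its tracking state) must disappear from the whole cluster.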
connection_id_tracking_on_nodedown(Config) ->
    C = connect(<<"simpleClient">>, Config, ?OPTS),
    {ok, _, _} = emqtt:subscribe(C, <<"TopicA">>, qos0),
    ok = emqtt:publish(C, <<"TopicA">>, <<"Payload">>),
    ok = expect_publishes(C, <<"TopicA">>, [<<"Payload">>]),
    assert_connection_count(Config, 4, 1),
    process_flag(trap_exit, true),
    ok = stop_node(Config, 0),
    await_exit(C),
    ok = eventually(?_assertEqual([], util:all_connection_pids(Config)), 500, 4).

%%
%% Helpers
%%

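%% Poll util:all_connection_pids/1 until exactly NumElements connection
%% processes are reported, retrying up to Retries times with a 500 ms pause,
%% and fail the test case once the retries are exhausted.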
assert_connection_count(_Config, 0, NumElements) ->
    ct:fail("failed to match connection count ~b", [NumElements]);
assert_connection_count(Config, Retries, NumElements) ->
    case util:all_connection_pids(Config) of
        Pids when length(Pids) =:= NumElements ->
            ok;
        Pids ->
            ct:pal("Waiting for ~b connections, got following connections: ~p",
                   [NumElements, Pids]),
            timer:sleep(500),
            assert_connection_count(Config, Retries-1, NumElements)
    end.