Fix delete_replica bug
The bug caused a lot of flakiness in rabbit_stream_queue_SUITE, in both the `delete_replica` and `delete_last_replica` test cases.
This commit is contained in:
parent
6052ecdc9c
commit
e65ba8347c
|
|
@ -1018,16 +1018,14 @@ update_stream0(#{system_time := _Ts},
|
|||
_ ->
|
||||
false
|
||||
end,
|
||||
|
||||
case maps:get(Node, Members0) of
|
||||
#member{role = {replica, Epoch},
|
||||
current = {stopping, Idx},
|
||||
state = _} = Member0
|
||||
when IsLeaderInCurrent ->
|
||||
%% A leader has already been selected so skip straight to ready state
|
||||
Member = Member0#member{state = {ready, Epoch},
|
||||
target = Target,
|
||||
current = undefined},
|
||||
Member = update_target(Member0#member{state = {ready, Epoch},
|
||||
current = undefined}, Target),
|
||||
Members1 = Members0#{Node => Member},
|
||||
Stream0#stream{members = Members1};
|
||||
#member{role = {_, Epoch},
|
||||
|
|
@ -1037,9 +1035,8 @@ update_stream0(#{system_time := _Ts},
|
|||
%% epoch
|
||||
Member = case StoppedEpoch of
|
||||
Epoch ->
|
||||
Member0#member{state = {stopped, StoppedEpoch, Tail},
|
||||
target = Target,
|
||||
current = undefined};
|
||||
update_target(Member0#member{state = {stopped, StoppedEpoch, Tail},
|
||||
current = undefined}, Target);
|
||||
_ ->
|
||||
%% if stopped epoch is from another epoch
|
||||
%% leave target as is to retry stop in current term
|
||||
|
|
@ -1518,3 +1515,8 @@ set_running_to_stopped(Members) ->
|
|||
M
|
||||
end, Members).
|
||||
|
||||
%% Sets the target of `Member' to `Target', unless the member is already
%% targeted for deletion. `deleted' is terminal: such a member is returned
%% unchanged so that it can never transition to another target.
update_target(Member, Target) ->
    case Member#member.target of
        deleted ->
            Member;
        _Other ->
            Member#member{target = Target}
    end.
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ all_tests() ->
|
|||
delete_stream,
|
||||
delete_replica_leader,
|
||||
delete_replica,
|
||||
delete_two_replicas,
|
||||
delete_replica_2,
|
||||
leader_start_failed
|
||||
].
|
||||
|
|
@ -907,6 +908,79 @@ delete_replica(_) ->
|
|||
{S4, []} = evaluate_stream(meta(?LINE), S4, []),
|
||||
ok.
|
||||
|
||||
delete_two_replicas(_) ->
|
||||
%% Regression test for a race condition seen in the rabbit_stream_queue_SUITE test cases
|
||||
%% delete_replica and delete_last_replica: a replica could sometimes restart after being
|
||||
%% deleted, because it transitioned back to the running state. The transition happened
|
||||
%% while processing the `member_stopped` command in `rabbit_stream_coordinator.erl`
|
||||
%% (around line 1039 when this test was written). The `update_target` function now
|
||||
%% ensures this transition never happens.
|
||||
%% This test replays the sequence of commands that led to that error.
|
||||
E = 1,
|
||||
StreamId = atom_to_list(?FUNCTION_NAME),
|
||||
LeaderPid = fake_pid(n1),
|
||||
[Replica1, Replica2] = [fake_pid(n2), fake_pid(n3)],
|
||||
N1 = node(LeaderPid),
|
||||
N2 = node(Replica1),
|
||||
%% N3 hosts the replica that is deleted first; N2 is deleted second
|
||||
N3 = node(Replica2),
|
||||
|
||||
S0 = started_stream(StreamId, LeaderPid, [Replica1, Replica2]),
|
||||
From = {self(), make_ref()},
|
||||
Idx1 = ?LINE,
|
||||
Meta1 = (meta(Idx1))#{from => From},
|
||||
%% Step 1: delete the replica on N3. N3 is dropped from the stream's nodes and targeted
%% for deletion, while N1 and N2 are targeted to stop.
S1 = update_stream(Meta1, {delete_replica, StreamId, #{node => N3}}, S0),
|
||||
?assertMatch(#stream{target = running,
|
||||
nodes = [N1, N2],
|
||||
members = #{N1 := #member{target = stopped,
|
||||
current = undefined,
|
||||
state = {running, _, _}},
|
||||
N2 := #member{target = stopped,
|
||||
current = undefined,
|
||||
state = {running, _, _}},
|
||||
N3 := #member{target = deleted,
|
||||
current = undefined,
|
||||
state = {running, _, _}}
|
||||
}},
|
||||
S1),
|
||||
%% Evaluating the stream emits a delete action for N3 and stop actions for N1 and N2.
{S2, Actions1} = evaluate_stream(Meta1, S1, []),
|
||||
?assertMatch([{aux, {delete_member, StreamId, #{node := N3}, _}},
|
||||
{aux, {stop, StreamId, #{node := N1, epoch := E}, _}},
|
||||
{aux, {stop, StreamId, #{node := N2, epoch := E}, _}}],
|
||||
lists:sort(Actions1)),
|
||||
|
||||
Idx2 = ?LINE,
|
||||
Meta2 = (meta(Idx2))#{from => From},
|
||||
%% Step 2: delete the replica on N2 while the actions from step 1 are still in flight
%% (N1 and N2 are stopping, N3 is deleting). N2's target must become deleted.
S3 = update_stream(Meta2, {delete_replica, StreamId, #{node => N2}}, S2),
|
||||
?assertMatch(#stream{target = running,
|
||||
nodes = [N1],
|
||||
members = #{N1 := #member{target = stopped,
|
||||
current = {stopping, _},
|
||||
state = {running, _, _}},
|
||||
N2 := #member{target = deleted,
|
||||
current = {stopping, _},
|
||||
state = {running, _, _}},
|
||||
N3 := #member{target = deleted,
|
||||
current = {deleting, _},
|
||||
state = {running, _, _}}
|
||||
}},
|
||||
S3),
|
||||
%% No further actions are emitted here; presumably because every member already has an
%% operation in flight (TODO confirm against evaluate_stream).
{S4, []} = evaluate_stream(Meta2, S3, []),
|
||||
|
||||
|
||||
Idx3 = ?LINE,
|
||||
%% Step 3: N2 reports that it stopped in epoch E; the command carries index Idx1, the
%% index under which the stop actions were emitted in step 1.
S5 = update_stream(meta(Idx3),
|
||||
{member_stopped, StreamId, #{node => N2,
|
||||
index => Idx1,
|
||||
epoch => E,
|
||||
tail => {E, 101}}},
|
||||
S4),
|
||||
%% A deleted member can never transition to another target: N2 records the stopped
%% state but must keep target = deleted.
|
||||
?assertMatch(#stream{members = #{N2 := #member{target = deleted,
|
||||
current = undefined,
|
||||
state = {stopped, _, _}}}},
|
||||
S5),
|
||||
ok.
|
||||
|
||||
delete_replica_2(_) ->
|
||||
%% replica is deleted before it has been fully started
|
||||
E = 1,
|
||||
|
|
|
|||
|
|
@ -186,7 +186,6 @@ merge_app_env(Config) ->
|
|||
{rabbit, [{core_metrics_gc_interval, 100}]}).
|
||||
|
||||
end_per_testcase(Testcase, Config) ->
|
||||
Q = ?config(queue_name, Config),
|
||||
Config1 = rabbit_ct_helpers:run_steps(
|
||||
Config,
|
||||
rabbit_ct_client_helpers:teardown_steps()),
|
||||
|
|
|
|||
Loading…
Reference in New Issue