Store hash ring state in a single table

This implementation is significantly simpler and doesn't
perform nearly as many Mnesia operations.

Pair: @dcorbacho.

References #37, #38.

[#159822323]
This commit is contained in:
Michael Klishin 2018-08-28 16:40:38 +03:00
parent e5e9316b9e
commit a2d4f0bd87
1 changed file with 101 additions and 142 deletions

View File

@ -27,25 +27,12 @@
-export([init/0]). -export([init/0]).
-export([info/1, info/2]). -export([info/1, info/2]).
-record(bucket, { -record(chx_hash_ring, {
%% a {resource, bucket} pair %% a resource
%% where bucket is a non-negative integer exchange,
id, %% a map of bucket => queue | exchange
%% a resource bucket_map,
queue next_bucket_number
}).
-record(bucket_count, {
exchange,
count
}).
-record(binding_buckets, {
%% an {exchange, queue} pair because we
%% assume that there's only one binding between
%% a consistent hash exchange and a queue
id,
bucket_numbers = []
}). }).
-rabbit_boot_step( -rabbit_boot_step(
@ -66,17 +53,9 @@
{enables, external_infrastructure}]}). {enables, external_infrastructure}]}).
%% This data model allows for efficient routing and exchange deletion %% This data model allows for efficient routing and exchange deletion
%% but not efficient binding management. This is a future area of improvement. %% but less efficient (linear) binding management.
%% A couple of alternatives were considered, e.g. storing the entire ring state
%% in a single map. Without an additional structure such as a balanced tree
%% ring updates would be even less efficient (but easier to follow).
%% maps buckets to queues -define(HASH_RING_STATE_TABLE, rabbit_exchange_type_consistent_hash_ring_state).
-define(BUCKET_TABLE, rabbit_exchange_type_consistent_hash_bucket_queue).
%% maps exchange to the total number of buckets
-define(BUCKET_COUNT_TABLE, rabbit_exchange_type_consistent_hash_bucket_count).
%% maps {exchange, queue} pairs to a list of buckets
-define(BINDING_BUCKET_TABLE, rabbit_exchange_type_consistent_hash_binding_bucket).
-define(PROPERTIES, [<<"correlation_id">>, <<"message_id">>, <<"timestamp">>]). -define(PROPERTIES, [<<"correlation_id">>, <<"message_id">>, <<"timestamp">>]).
@ -91,21 +70,26 @@ description() ->
serialise_events() -> false. serialise_events() -> false.
route(#exchange { name = Name, route(#exchange {name = Name,
arguments = Args }, arguments = Args},
#delivery { message = Msg }) -> #delivery {message = Msg}) ->
case ets:lookup(?BUCKET_COUNT_TABLE, Name) of case ets:lookup(?HASH_RING_STATE_TABLE, Name) of
[] -> [] ->
[]; [];
[#bucket_count{count = N}] -> [#chx_hash_ring{bucket_map = BM}] ->
K = value_to_hash(hash_on(Args), Msg), case maps:size(BM) of
SelectedBucket = jump_consistent_hash(K, N), 0 -> [];
case mnesia:dirty_read({?BUCKET_TABLE, {Name, SelectedBucket}}) of N ->
[Bucket] -> [Bucket#bucket.queue]; K = value_to_hash(hash_on(Args), Msg),
[] -> rabbit_log:warning("Bucket ~p not found", [SelectedBucket]), SelectedBucket = jump_consistent_hash(K, N),
[]
end
case maps:get(SelectedBucket, BM, undefined) of
undefined ->
rabbit_log:warning("Bucket ~p not found", [SelectedBucket]),
[];
Queue -> [Queue]
end
end
end. end.
validate(#exchange { arguments = Args }) -> validate(#exchange { arguments = Args }) ->
@ -139,63 +123,64 @@ validate_binding(_X, #binding { key = K }) ->
{error, {binding_invalid, "The binding key must be an integer: ~p", [K]}} {error, {binding_invalid, "The binding key must be an integer: ~p", [K]}}
end. end.
create(_Tx, _X) -> ok. maybe_initialise_hash_ring_state(transaction, X) ->
case mnesia:read(?HASH_RING_STATE_TABLE, X) of
[_] -> ok;
[] ->
mnesia:write_lock_table(?HASH_RING_STATE_TABLE),
ok = mnesia:write(?HASH_RING_STATE_TABLE, #chx_hash_ring{
exchange = X,
next_bucket_number = 0,
bucket_map = #{}}, write)
end;
maybe_initialise_hash_ring_state(_, X) ->
rabbit_misc:execute_mnesia_transaction(
fun() -> maybe_initialise_hash_ring_state(transaction, X) end).
create(transaction, X) ->
maybe_initialise_hash_ring_state(transaction, X);
create(Tx, X) ->
maybe_initialise_hash_ring_state(Tx, X).
delete(transaction, #exchange{name = Name}, _Bs) -> delete(transaction, #exchange{name = Name}, _Bs) ->
ok = mnesia:write_lock_table(?BUCKET_TABLE), mnesia:write_lock_table(?HASH_RING_STATE_TABLE),
ok = mnesia:write_lock_table(?BUCKET_COUNT_TABLE),
Numbers = mnesia:select(?BUCKET_TABLE, [{ ok = mnesia:delete({?HASH_RING_STATE_TABLE, Name});
#bucket{id = {Name, '$1'}, _ = '_'},
[],
['$1']
}]),
[mnesia:delete({?BUCKET_TABLE, {Name, N}})
|| N <- Numbers],
Queues = mnesia:select(?BINDING_BUCKET_TABLE,
[{
#binding_buckets{id = {Name, '$1'}, _ = '_'},
[],
['$1']
}]),
[mnesia:delete({?BINDING_BUCKET_TABLE, {Name, Q}})
|| Q <- Queues],
mnesia:delete({?BUCKET_COUNT_TABLE, Name}),
ok;
delete(_Tx, _X, _Bs) -> delete(_Tx, _X, _Bs) ->
ok. ok.
policy_changed(_X1, _X2) -> ok. policy_changed(_X1, _X2) -> ok.
add_binding(transaction, _X, add_binding(transaction, X,
#binding{source = S, destination = D, key = K}) -> B = #binding{source = S, destination = D, key = K}) ->
Weight = rabbit_data_coercion:to_integer(K), Weight = rabbit_data_coercion:to_integer(K),
mnesia:write_lock_table(?BUCKET_TABLE), mnesia:write_lock_table(?HASH_RING_STATE_TABLE),
mnesia:write_lock_table(?BUCKET_COUNT_TABLE),
LastBucketNum = bucket_count_of(S), case mnesia:read(?HASH_RING_STATE_TABLE, S) of
NewBucketCount = LastBucketNum + Weight, [State0 = #chx_hash_ring{bucket_map = BM0,
next_bucket_number = NexN0}] ->
NextN = NexN0 + Weight,
%% hi/lo bucket counters are 0-based but weight is 1-based
Range = lists:seq(NexN0, (NextN - 1)),
BM = lists:foldl(fun(Key, Acc) ->
maps:put(Key, D, Acc)
end, BM0, Range),
State = State0#chx_hash_ring{bucket_map = BM,
next_bucket_number = NextN},
Numbers = lists:seq(LastBucketNum, (NewBucketCount - 1)), ok = mnesia:write(?HASH_RING_STATE_TABLE, State, write),
Buckets = [#bucket{id = {S, I}, queue = D} || I <- Numbers], ok;
[] ->
[ok = mnesia:write(?BUCKET_TABLE, B, write) || B <- Buckets], maybe_initialise_hash_ring_state(transaction, S),
add_binding(transaction, X, B)
mnesia:write(?BINDING_BUCKET_TABLE, #binding_buckets{id = {S, D}, end;
bucket_numbers = Numbers}, write),
mnesia:write(?BUCKET_COUNT_TABLE, #bucket_count{exchange = S,
count = NewBucketCount}, write),
ok;
add_binding(none, _X, _B) -> add_binding(none, _X, _B) ->
ok. ok.
remove_bindings(transaction, _X, Bindings) -> remove_bindings(transaction, _X, Bindings) ->
mnesia:write_lock_table(?BUCKET_TABLE), mnesia:write_lock_table(?HASH_RING_STATE_TABLE),
mnesia:write_lock_table(?BUCKET_COUNT_TABLE),
[remove_binding(B) || B <- Bindings], [remove_binding(B) || B <- Bindings],
@ -203,78 +188,52 @@ remove_bindings(transaction, _X, Bindings) ->
remove_bindings(none, _X, _Bs) -> remove_bindings(none, _X, _Bs) ->
ok. ok.
remove_binding(#binding{source = S, destination = D, key = K}) -> remove_binding(#binding{source = S, destination = D, key = RK}) ->
Weight = rabbit_data_coercion:to_integer(K), Weight = rabbit_data_coercion:to_integer(RK),
[#binding_buckets{bucket_numbers = Numbers}] = mnesia:read(?BINDING_BUCKET_TABLE, {S, D}), mnesia:write_lock_table(?HASH_RING_STATE_TABLE),
LastNum = lists:last(Numbers),
%% Delete all buckets for this {exchange, queue} pair case mnesia:read(?HASH_RING_STATE_TABLE, S) of
[ok = mnesia:delete(?BUCKET_TABLE, {S, N}, write) || N <- Numbers], [State0 = #chx_hash_ring{bucket_map = BM0,
next_bucket_number = NexN0}] ->
%% Buckets with lower numbers stay as is; buckets that
%% belong to this binding are removed; buckets with
%% greater numbers are updated (their numbers are adjusted downwards by weight)
BucketsOfThisBinding = maps:filter(fun (_K, V) -> V =:= D end, BM0),
LastBucket = lists:last(maps:keys(BucketsOfThisBinding)),
BucketsDownTheRing = maps:filter(fun (K, _) -> K > LastBucket end, BM0),
%% Buckets with lower numbers stay as is; buckets that %% hash ring state without the buckets of this binding
%% belong to this binding are removed; buckets with BM1 = maps:fold(fun(K, _, Acc) -> maps:remove(K, Acc) end, BM0, BucketsOfThisBinding),
%% greater numbers are updated (their numbers are adjusted downwards by weight) %% final state with "down the ring" buckets updated
BucketsToUpdate = mnesia:select(?BUCKET_TABLE, [{ BM2 = maps:fold(fun(K0, V, Acc) ->
#bucket{id = {S, '$1'}, _ = '_'}, M = maps:remove(K0, Acc),
[ maps:put(K0 - Weight, V, M)
{'>', '$1', LastNum} end, BM1, BucketsDownTheRing),
],
['$_']
}]),
QueuesWithUpdatedBuckets = lists:usort([Q || #bucket{queue = Q} <- BucketsToUpdate]),
[ok = mnesia:delete(?BUCKET_TABLE, Id, write) || #bucket{id = Id} <- BucketsToUpdate],
UpdatedBuckets = [B#bucket{id = {X, N - Weight}} || B = #bucket{id = {X, N}} <- BucketsToUpdate], NextN = NexN0 - Weight,
[ok = mnesia:write(?BUCKET_TABLE, B, write) || B <- UpdatedBuckets], State = State0#chx_hash_ring{bucket_map = BM2,
next_bucket_number = NextN},
%% There will be no buckets for this {exchange, queue} pair to track ok = mnesia:write(?HASH_RING_STATE_TABLE, State, write),
ok = mnesia:delete(?BINDING_BUCKET_TABLE, {S, D}, write),
%% Update the counter ok;
TotalBucketsForX = bucket_count_of(S), [] ->
mnesia:write(?BUCKET_COUNT_TABLE, #bucket_count{exchange = S, rabbit_log:warning("Can't remove binding: hash ring state for exchange ~s wasn't found",
count = TotalBucketsForX - Weight}, write), [rabbit_misc:rs(S)]),
ok
%% Update bucket numbers end.
[begin
case mnesia:read(?BINDING_BUCKET_TABLE, {S, Q}) of
[] -> ok;
[Val = #binding_buckets{bucket_numbers = BNs}] ->
NewBNs = [N - Weight || N <- BNs],
ok = mnesia:write(?BINDING_BUCKET_TABLE, Val#binding_buckets{bucket_numbers = NewBNs}, write)
end
end || Q <- QueuesWithUpdatedBuckets],
ok = mnesia:delete(?BINDING_BUCKET_TABLE, {S, D}, write),
ok.
assert_args_equivalence(X, Args) -> assert_args_equivalence(X, Args) ->
rabbit_exchange:assert_args_equivalence(X, Args). rabbit_exchange:assert_args_equivalence(X, Args).
bucket_count_of(X) ->
case ets:lookup(?BUCKET_COUNT_TABLE, X) of
[] -> 0;
[#bucket_count{count = N}] -> N
end.
init() -> init() ->
mnesia:create_table(?BUCKET_TABLE, [{record_name, bucket}, mnesia:create_table(?HASH_RING_STATE_TABLE, [{record_name, chx_hash_ring},
{attributes, record_info(fields, bucket)}, {attributes, record_info(fields, chx_hash_ring)},
{type, ordered_set}]), {type, ordered_set}]),
mnesia:create_table(?BUCKET_COUNT_TABLE, [{record_name, bucket_count}, mnesia:add_table_copy(?HASH_RING_STATE_TABLE, node(), ram_copies),
{attributes, record_info(fields, bucket_count)}, mnesia:wait_for_tables([?HASH_RING_STATE_TABLE], 30000),
{type, ordered_set}]),
mnesia:create_table(?BINDING_BUCKET_TABLE, [{record_name, binding_buckets},
{attributes, record_info(fields, binding_buckets)},
{type, ordered_set}]),
mnesia:add_table_copy(?BUCKET_TABLE, node(), ram_copies),
mnesia:add_table_copy(?BUCKET_COUNT_TABLE, node(), ram_copies),
mnesia:add_table_copy(?BINDING_BUCKET_TABLE, node(), ram_copies),
mnesia:wait_for_tables([?BUCKET_TABLE], 30000),
ok. ok.
%% %%