mirrored_supervisor: Catch timeout from Khepri in `handle_info/2`
[Why] The code assumed that the transaction would always succeed. It was kind of the case with Mnesia because it would throw an exception if it failed. Khepri returns an error instead. The code has to handle it. In particular, we see timeouts in CI and before this patch, they caused a crash because the list comprehension was asked to work on a tuple. [How] We now retry a few times for 10 seconds.
This commit is contained in:
parent
913bd9fa42
commit
4621fe7730
|
@ -345,7 +345,7 @@ handle_info({'DOWN', _Ref, process, Pid, _Reason},
|
|||
child_order = ChildOrder}) ->
|
||||
%% No guarantee pg will have received the DOWN before us.
|
||||
R = case lists:sort(pg:get_members(Group)) -- [Pid] of
|
||||
[O | _] -> ChildSpecs = update_all(O, Pid),
|
||||
[O | _] -> ChildSpecs = retry_update_all(O, Pid),
|
||||
[start(Delegate, ChildSpec)
|
||||
|| ChildSpec <- restore_child_order(ChildSpecs,
|
||||
ChildOrder)];
|
||||
|
@ -428,6 +428,22 @@ check_stop(Group, Delegate, Id) ->
|
|||
|
||||
%% Returns the first element of a 6-tuple; any other term fails with
%% `function_clause'.
%% NOTE(review): the 6-tuple is presumably an old-style supervisor child
%% spec -- confirm against callers.
id({ChildId, _, _, _, _, _}) ->
    ChildId.
|
||||
|
||||
%% Entry point for the retrying variant of update_all/2: delegates to
%% retry_update_all/3 with an initial retry budget of 10000 milliseconds.
retry_update_all(Overall, OldOverall) ->
    TimeBudgetMs = 10000,
    retry_update_all(Overall, OldOverall, TimeBudgetMs).
|
||||
|
||||
%% Calls update_all/2 and retries for as long as it returns
%% `{error, timeout}' and the retry budget is not exhausted.
%%
%% `TimeLeft' is the remaining budget in milliseconds. Each failed attempt
%% is charged for the time it actually took (the blocked call plus the
%% back-off sleep), measured with the monotonic clock. Charging only the
%% sleep would let the total wait grow to many times the budget when the
%% underlying call itself blocks until its own timeout.
%%
%% Returns the list produced by update_all/2 on success. Once the budget
%% is used up, one last attempt is made and its result is returned as is,
%% so the caller may still receive `{error, timeout}'. Any result other
%% than a list or `{error, timeout}' during the retry phase fails with
%% `case_clause'.
retry_update_all(O, Pid, TimeLeft) when TimeLeft > 0 ->
    Start = erlang:monotonic_time(millisecond),
    case update_all(O, Pid) of
        List when is_list(List) ->
            List;
        {error, timeout} ->
            %% Back off briefly before the next attempt.
            Sleep = 200,
            timer:sleep(Sleep),
            Elapsed = erlang:monotonic_time(millisecond) - Start,
            %% Never charge less than the sleep itself, so the number of
            %% attempts stays bounded exactly as before.
            TimeLeft1 = TimeLeft - max(Sleep, Elapsed),
            retry_update_all(O, Pid, TimeLeft1)
    end;
retry_update_all(O, Pid, _TimeLeft) ->
    %% Budget exhausted: final attempt, result handed back unchanged.
    update_all(O, Pid).
|
||||
|
||||
%% Thin wrapper: forwards both arguments unchanged to
%% rabbit_db_msup:update_all/2 and returns its result as is.
update_all(NewOverall, PrevOverall) ->
    Result = rabbit_db_msup:update_all(NewOverall, PrevOverall),
    Result.
|
||||
|
||||
|
|
Loading…
Reference in New Issue