rabbitmq-server/deps/rabbit/src/rabbit_variable_queue.erl

%% This Source Code Form is subject to the terms of the Mozilla Public
%% License, v. 2.0. If a copy of the MPL was not distributed with this
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
%%
%% Copyright (c) 2007-2024 Broadcom. All Rights Reserved. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
%%

-module(rabbit_variable_queue).

-export([init/3, terminate/2, delete_and_terminate/2, delete_crashed/1,
         purge/1, purge_acks/1,
         publish/5, publish_delivered/4,
         discard/3, drain_confirmed/1,
         dropwhile/2, fetchwhile/4, fetch/2, drop/2, ack/2, requeue/2,
         ackfold/4, fold/3, len/1, is_empty/1, depth/1,
         update_rates/1, needs_timeout/1, timeout/1,
         handle_pre_hibernate/1, resume/1, msg_rates/1,
         info/2, invoke/3, is_duplicate/2, set_queue_mode/2,
         set_queue_version/2, zip_msgs_and_acks/4]).

-export([start/2, stop/1]).

%% This function is used by rabbit_classic_queue_index_v2
%% to convert v1 queues to v2 after an upgrade to 4.0.
-export([convert_from_v1_to_v2_loop/8]).

%% exported for testing only
-export([start_msg_store/3, stop_msg_store/1, init/5]).

-include("mc.hrl").
-include_lib("stdlib/include/qlc.hrl").

-define(QUEUE_MIGRATION_BATCH_SIZE, 100).
-define(EMPTY_START_FUN_STATE, {fun (ok) -> finished end, ok}).

%%----------------------------------------------------------------------------
%% Messages, their metadata and their position in the queue (SeqId),
%% can be in memory or on disk, or both. Persistent messages in
%% durable queues are always written to disk when they arrive.
%% Transient messages as well as persistent messages in non-durable
%% queues may be kept only in memory.
%%
%% The number of messages kept in memory is dependent on the consume
%% rate of the queue. At a minimum 1 message is kept (necessary because
%% we often need to check the expiration at the head of the queue) and
%% at a maximum the semi-arbitrary number 2048.
%%
%% Messages are never written back to disk after they have been read
%% into memory. Instead the queue is designed to avoid keeping too
%% much to begin with.
%%
%% Messages are persisted using a queue index and a message store.
%% A few different scenarios may play out depending on the message
%% size:
%%
%% - size < qi_msgs_embed_below: the metadata
%%   is stored in rabbit_classic_queue_index_v2, while the content
%%   is stored in the per-queue rabbit_classic_queue_store_v2
%%
%% - size >= qi_msgs_embed_below: the metadata
%%   is stored in rabbit_classic_queue_index_v2, while the content
%%   is stored in the per-vhost shared rabbit_msg_store
%%
%% When messages must be read from disk, message bodies will
%% also be read from disk except if the message is stored
%% in the per-vhost shared rabbit_msg_store. In that case
%% the message gets read before it needs to be sent to the
%% consumer. Messages are read from rabbit_msg_store one
%% at a time currently.
%%
%% The queue also keeps track of messages that were delivered
%% but for which the ack has not been received. Pending acks
%% are currently kept in memory although the message may be
%% on disk.
%%
%% Messages being requeued are returned to their position in
%% the queue using their SeqId value.
%%
%% In order to try to achieve as fast a start-up as possible, if a
%% clean shutdown occurs, we try to save out state to disk to reduce
%% work on startup. In rabbit_msg_store this takes the form of the
%% index_module's state, plus the file_summary ets table, and client
%% refs. In the VQ, this takes the form of the count of persistent
%% messages in the queue and references into the msg_stores. The
%% queue index adds to these terms the details of its segments and
%% stores the terms in the queue directory.
%%
%% Two rabbit_msg_store(s) are used. One is created for persistent messages
%% to durable queues that must survive restarts, and the other is used
%% for all other messages that just happen to need to be written to
%% disk. On start up we can therefore nuke the transient message
%% store, and be sure that the messages in the persistent store are
%% all that we need.
%%
%% The references to the msg_stores are there so that the msg_store
%% knows to only trust its saved state if all of the queues it was
%% previously talking to come up cleanly. Likewise, the queues
%% themselves (especially indexes) skip work in init if all the queues
%% and msg_store were shutdown cleanly. This gives both good speed
%% improvements and also robustness so that if anything possibly went
%% wrong in shutdown (or there was subsequent manual tampering), all
%% messages and queues that can be recovered are recovered, safely.
%%
%% To delete transient messages lazily, the variable_queue, on
%% startup, stores the next_seq_id reported by the queue_index as the
%% transient_threshold. From that point on, whenever it's reading a
%% message off disk via the queue_index, if the seq_id is below this
%% threshold and the message is transient then it drops the message
%% (the message itself won't exist on disk because it would have been
%% stored in the transient msg_store which would have had its saved
%% state nuked on startup). This avoids the expensive operation of
%% scanning the entire queue on startup in order to delete transient
%% messages that were written to disk.
%%
%% The queue is keeping track of delivers via the
%% next_deliver_seq_id variable. This variable gets increased
%% with every (first-time) delivery. When delivering messages
%% the seq_id of the message is checked against this variable
%% to determine whether the message is a redelivery. The variable
%% is stored in the queue terms on graceful shutdown. On dirty
%% recovery the variable becomes the seq_id of the most recent
%% message in the queue (effectively marking all messages as
%% delivered, like the v1 index was doing).
%%
%% Previous versions of classic queues had a much more complex
%% way of working. Messages were categorized into four groups,
%% and remnants of these terms remain in the code at the time
%% of writing:
%%
%% alpha: this is a message where both the message itself, and its
%%        position within the queue are held in RAM
%%
%% beta:  this is a message where the message itself is only held on
%%        disk (if persisted to the message store) but its position
%%        within the queue is held in RAM.
%%
%% gamma: this is a message where the message itself is only held on
%%        disk, but its position is both in RAM and on disk.
%%
%% delta: this is a collection of messages, represented by a single
%%        term, where the messages and their position are only held on
%%        disk.
%%
%% Messages may have been stored in q1, q2, delta, q3 or q4 depending
%% on their location in the queue. The current version of classic
%% queues only use delta (on-disk, for the tail of the queue) or
%% q3 (in-memory, head of the queue). Messages used to move from
%% q1 -> q2 -> delta -> q3 -> q4 (and sometimes q3 -> delta or
%% q4 -> delta to reduce memory use). Now messages only move
%% from delta to q3. Full details on the old mechanisms can be
%% found in previous versions of this file (such as the 3.11 version).
%%
%% In the current version of classic queues, there is no distinction
%% between default and lazy queues. The current behavior is close to
%% lazy queues, except we avoid some write to disks when queues are
%% empty.
%%----------------------------------------------------------------------------

-behaviour(rabbit_backing_queue).

-record(vqstate,
        { q1, %% Unused.
          q2, %% Unused.
          delta,
          q3,
          q4, %% Unused.
          next_seq_id,
          %% seq_id() of first undelivered message
          %% everything before this seq_id() was delivered at least once
          next_deliver_seq_id,
          ram_pending_ack,    %% msgs still in RAM
          disk_pending_ack,   %% msgs in store, paged out
          qi_pending_ack, %% Unused.
          index_mod, %% Unused.
          index_state,
          store_state,
          msg_store_clients,
          durable,
          transient_threshold,
          qi_embed_msgs_below,

          len,                %% w/o unacked @todo No longer needed, is delta+q3.
          bytes,              %% w/o unacked
          unacked_bytes,
          persistent_count,   %% w   unacked
          persistent_bytes,   %% w   unacked
          delta_transient_bytes,        %%

          target_ram_count,
          ram_msg_count,      %% w/o unacked
          ram_msg_count_prev,
          ram_ack_count_prev,
          ram_bytes,          %% w   unacked
          out_counter,
          in_counter,
          rates,
          %% There are two confirms paths: either store/index produce confirms
          %% separately (v1 and v2 with per-vhost message store) or the confirms
          %% are produced all at once while syncing/flushing (v2 with per-queue
          %% message store). The latter is more efficient as it avoids many
          %% sets operations.
          msgs_on_disk,
          msg_indices_on_disk,
          unconfirmed,
          confirmed,
          ack_out_counter,
          ack_in_counter,
          %% Unlike the other counters these two do not feed into
          %% #rates{} and get reset
          disk_read_count,
          disk_write_count,

          io_batch_size,

          %% default queue or lazy queue
          mode, %% Unused.
          version = 2, %% Unused.
          %% Fast path for confirms handling. Instead of having
          %% index/store keep track of confirms separately and
          %% doing intersect/subtract/union we just put the messages
          %% here and on sync move them to 'confirmed'.
          %%
          %% Note: This field used to be 'memory_reduction_run_count'.
          unconfirmed_simple,
          %% Queue data is grouped by VHost. We need to store it
          %% to work with queue index.
          virtual_host,
          waiting_bump = false
        }).

-record(rates, { in, out, ack_in, ack_out, timestamp }).

-type msg_location() :: memory
                      | rabbit_msg_store
                      | rabbit_queue_index
                      | rabbit_classic_queue_store_v2:msg_location().
-export_type([msg_location/0]).

-record(msg_status,
        { seq_id,
          msg_id,
          msg,
          is_persistent,
          is_delivered,
          msg_location, %% ?IN_SHARED_STORE | ?IN_QUEUE_STORE | ?IN_QUEUE_INDEX | ?IN_MEMORY
          index_on_disk,
          persist_to,
          msg_props
        }).

-record(delta,
        { start_seq_id, %% start_seq_id is inclusive
          count,
          transient,
          end_seq_id    %% end_seq_id is exclusive
        }).

-define(HEADER_GUESS_SIZE, 100). %% see determine_persist_to/2
-define(PERSISTENT_MSG_STORE, msg_store_persistent).
-define(TRANSIENT_MSG_STORE,  msg_store_transient).

-define(QUEUE, lqueue).

-define(IN_SHARED_STORE, rabbit_msg_store).
-define(IN_QUEUE_STORE, {rabbit_classic_queue_store_v2, _, _}).
-define(IN_QUEUE_INDEX, rabbit_queue_index).
-define(IN_MEMORY, memory).

-include_lib("rabbit_common/include/rabbit.hrl").
-include("amqqueue.hrl").

%%----------------------------------------------------------------------------

-type seq_id()  :: non_neg_integer().
-export_type([seq_id/0]).

-type rates() :: #rates { in        :: float(),
                          out       :: float(),
                          ack_in    :: float(),
                          ack_out   :: float(),
                          timestamp :: rabbit_types:timestamp()}.

-type delta() :: #delta { start_seq_id :: non_neg_integer(),
                          count        :: non_neg_integer(),
                          end_seq_id   :: non_neg_integer() }.

%% The compiler (rightfully) complains that ack() and state() are
%% unused. For this reason we duplicate a -spec from
%% rabbit_backing_queue with the only intent being to remove
%% warnings. The problem here is that we can't parameterise the BQ
%% behaviour by these two types as we would like to. We still leave
%% these here for documentation purposes.
-type ack() :: seq_id().
-type state() :: #vqstate {
             q1                    :: ?QUEUE:?QUEUE(),
             q2                    :: ?QUEUE:?QUEUE(),
             delta                 :: delta(),
             q3                    :: ?QUEUE:?QUEUE(),
             q4                    :: ?QUEUE:?QUEUE(),
             next_seq_id           :: seq_id(),
             next_deliver_seq_id   :: seq_id(),
             ram_pending_ack       :: map(),
             disk_pending_ack      :: map(),
             qi_pending_ack        :: undefined,
             index_state           :: any(),
             store_state           :: any(),
             msg_store_clients     :: 'undefined' | {{any(), binary()},
                                                    {any(), binary()}},
             durable               :: boolean(),
             transient_threshold   :: non_neg_integer(),
             qi_embed_msgs_below   :: non_neg_integer(),

             len                   :: non_neg_integer(),
             bytes                 :: non_neg_integer(),
             unacked_bytes         :: non_neg_integer(),

             persistent_count      :: non_neg_integer(),
             persistent_bytes      :: non_neg_integer(),

             target_ram_count      :: non_neg_integer() | 'infinity',
             ram_msg_count         :: non_neg_integer(),
             ram_msg_count_prev    :: non_neg_integer(),
             ram_ack_count_prev    :: non_neg_integer(),
             ram_bytes             :: non_neg_integer(),
             out_counter           :: non_neg_integer(),
             in_counter            :: non_neg_integer(),
             rates                 :: rates(),
             msgs_on_disk          :: sets:set(),
             msg_indices_on_disk   :: sets:set(),
             unconfirmed           :: sets:set(),
             confirmed             :: sets:set(),
             ack_out_counter       :: non_neg_integer(),
             ack_in_counter        :: non_neg_integer(),
             disk_read_count       :: non_neg_integer(),
             disk_write_count      :: non_neg_integer(),

             io_batch_size         :: pos_integer(),
             mode                  :: 'default' | 'lazy',
             version               :: 2,
             unconfirmed_simple    :: sets:set()}.

-define(BLANK_DELTA, #delta { start_seq_id = undefined,
                              count        = 0,
                              transient    = 0,
                              end_seq_id   = undefined }).
-define(BLANK_DELTA_PATTERN(Z), #delta { start_seq_id = Z,
                                         count        = 0,
                                         transient    = 0,
                                         end_seq_id   = Z }).

-define(MICROS_PER_SECOND, 1000000.0).

%% We're updating rates every 5s at most; a half life that is of
%% the same order of magnitude is probably about right.
-define(RATE_AVG_HALF_LIFE, 5.0).

%% We will recalculate the #rates{} every 5 seconds,
%% or every N messages published, whichever is
%% sooner. We do this since the priority calculations in
%% rabbit_amqqueue_process need fairly fresh rates.
-define(MSGS_PER_RATE_CALC, 100).

%%----------------------------------------------------------------------------
%% Public API
%%----------------------------------------------------------------------------

start(VHost, DurableQueues) ->
    %% The v2 index walker function covers both v1 and v2 index files.
    {AllTerms, StartFunState} = rabbit_classic_queue_index_v2:start(VHost, DurableQueues),
    %% Group recovery terms by vhost.
    ClientRefs = [Ref || Terms <- AllTerms,
                      Terms /= non_clean_shutdown,
                      begin
                          Ref = proplists:get_value(persistent_ref, Terms),
                          Ref =/= undefined
                      end],
    start_msg_store(VHost, ClientRefs, StartFunState),
    {ok, AllTerms}.

stop(VHost) ->
    ok = stop_msg_store(VHost),
    ok = rabbit_classic_queue_index_v2:stop(VHost).

start_msg_store(VHost, Refs, StartFunState) when is_list(Refs); Refs == undefined ->
    rabbit_log:info("Starting message stores for vhost '~ts'", [VHost]),
    do_start_msg_store(VHost, ?TRANSIENT_MSG_STORE, undefined, ?EMPTY_START_FUN_STATE),
    do_start_msg_store(VHost, ?PERSISTENT_MSG_STORE, Refs, StartFunState),
    ok.

do_start_msg_store(VHost, Type, Refs, StartFunState) ->
    case rabbit_vhost_msg_store:start(VHost, Type, Refs, StartFunState) of
        {ok, _} ->
            rabbit_log:info("Started message store of type ~ts for vhost '~ts'", [abbreviated_type(Type), VHost]);
        {error, {no_such_vhost, VHost}} = Err ->
            rabbit_log:error("Failed to start message store of type ~ts for vhost '~ts': the vhost no longer exists!",
                             [Type, VHost]),
            exit(Err);
        {error, Error} ->
            rabbit_log:error("Failed to start message store of type ~ts for vhost '~ts': ~tp",
                             [Type, VHost, Error]),
            exit({error, Error})
    end.

abbreviated_type(?TRANSIENT_MSG_STORE)  -> transient;
abbreviated_type(?PERSISTENT_MSG_STORE) -> persistent.

stop_msg_store(VHost) ->
    rabbit_vhost_msg_store:stop(VHost, ?TRANSIENT_MSG_STORE),
    rabbit_vhost_msg_store:stop(VHost, ?PERSISTENT_MSG_STORE),
    ok.

init(Queue, Recover, Callback) ->
    init(
      Queue, Recover,
      fun (MsgIds, ActionTaken) ->
              msgs_written_to_disk(Callback, MsgIds, ActionTaken)
      end,
      fun (MsgIds) -> msg_indices_written_to_disk(Callback, MsgIds) end,
      fun (MsgIds) -> msgs_and_indices_written_to_disk(Callback, MsgIds) end).

init(Q, new, MsgOnDiskFun, MsgIdxOnDiskFun, MsgAndIdxOnDiskFun) when ?is_amqqueue(Q) ->
    QueueName = amqqueue:get_name(Q),
    IsDurable = amqqueue:is_durable(Q),
    IndexState = rabbit_classic_queue_index_v2:init(QueueName,
        MsgIdxOnDiskFun, MsgAndIdxOnDiskFun),
    StoreState = rabbit_classic_queue_store_v2:init(QueueName),
    VHost = QueueName#resource.virtual_host,
    init(IsDurable, IndexState, StoreState, 0, 0, [],
         case IsDurable of
             true  -> msg_store_client_init(?PERSISTENT_MSG_STORE,
                                            MsgOnDiskFun, VHost);
             false -> undefined
         end,
         msg_store_client_init(?TRANSIENT_MSG_STORE, undefined,
                               VHost), VHost);

%% We can be recovering a transient queue if it crashed
init(Q, Terms, MsgOnDiskFun, MsgIdxOnDiskFun, MsgAndIdxOnDiskFun) when ?is_amqqueue(Q) ->
    QueueName = amqqueue:get_name(Q),
    IsDurable = amqqueue:is_durable(Q),
    {PRef, RecoveryTerms} = process_recovery_terms(Terms),
    VHost = QueueName#resource.virtual_host,
    {PersistentClient, ContainsCheckFun} =
        case IsDurable of
            true  -> C = msg_store_client_init(?PERSISTENT_MSG_STORE, PRef,
                                               MsgOnDiskFun, VHost),
                     {C, fun (MsgId) when is_binary(MsgId) ->
                                 rabbit_msg_store:contains(MsgId, C);
                             (Msg) ->
                                 mc:is_persistent(Msg)
                         end};
            false -> {undefined, fun(_MsgId) -> false end}
        end,
    TransientClient  = msg_store_client_init(?TRANSIENT_MSG_STORE,
                                             undefined, VHost),
    {DeltaCount, DeltaBytes, IndexState} =
        rabbit_classic_queue_index_v2:recover(
          QueueName, RecoveryTerms,
          rabbit_vhost_msg_store:successfully_recovered_state(
              VHost,
              ?PERSISTENT_MSG_STORE),
          ContainsCheckFun, MsgIdxOnDiskFun, MsgAndIdxOnDiskFun,
          main),
    StoreState = rabbit_classic_queue_store_v2:init(QueueName),
    init(IsDurable, IndexState, StoreState,
         DeltaCount, DeltaBytes, RecoveryTerms,
         PersistentClient, TransientClient, VHost).

process_recovery_terms(Terms=non_clean_shutdown) ->
    {rabbit_guid:gen(), Terms};
process_recovery_terms(Terms) ->
    case proplists:get_value(persistent_ref, Terms) of
        undefined -> {rabbit_guid:gen(), []};
        PRef      -> {PRef, Terms}
    end.

terminate(_Reason, State) ->
    State1 = #vqstate { virtual_host        = VHost,
                        next_seq_id         = NextSeqId,
                        next_deliver_seq_id = NextDeliverSeqId,
                        persistent_count    = PCount,
                        persistent_bytes    = PBytes,
                        index_state         = IndexState,
                        store_state         = StoreState,
                        msg_store_clients   = {MSCStateP, MSCStateT} } =
        purge_pending_ack(true, State),
    PRef = case MSCStateP of
               undefined -> undefined;
               _         -> ok = maybe_client_terminate(MSCStateP),
                            rabbit_msg_store:client_ref(MSCStateP)
           end,
    ok = rabbit_msg_store:client_delete_and_terminate(MSCStateT),
    Terms = [{next_seq_id,         NextSeqId},
             {next_deliver_seq_id, NextDeliverSeqId},
             {persistent_ref,      PRef},
             {persistent_count,    PCount},
             {persistent_bytes,    PBytes}],
    a(State1#vqstate {
        index_state = rabbit_classic_queue_index_v2:terminate(VHost, Terms, IndexState),
        store_state = rabbit_classic_queue_store_v2:terminate(StoreState),
        msg_store_clients = undefined }).

%% the only difference between purge and delete is that delete also
%% needs to delete everything that's been delivered and not ack'd.
delete_and_terminate(_Reason, State) ->
    %% Normally when we purge messages we interact with the qi by
    %% issues delivers and acks for every purged message. In this case
    %% we don't need to do that, so we just delete the qi.
    State1 = purge_and_index_reset(State),
    State2 = #vqstate { msg_store_clients = {MSCStateP, MSCStateT} } =
        purge_pending_ack_delete_and_terminate(State1),
    case MSCStateP of
        undefined -> ok;
        _         -> rabbit_msg_store:client_delete_and_terminate(MSCStateP)
    end,
    rabbit_msg_store:client_delete_and_terminate(MSCStateT),
    a(State2 #vqstate { msg_store_clients = undefined }).

delete_crashed(Q) when ?is_amqqueue(Q) ->
    QName = amqqueue:get_name(Q),
    ok = rabbit_classic_queue_index_v2:erase(QName).

purge(State = #vqstate { len = Len }) ->
    case is_pending_ack_empty(State) and is_unconfirmed_empty(State) of
        true ->
            {Len, purge_and_index_reset(State)};
        false ->
            {Len, purge_when_pending_acks(State)}
    end.

purge_acks(State) -> a(purge_pending_ack(false, State)).

publish(Msg, MsgProps, IsDelivered, ChPid, State) ->
    State1 =
        publish1(Msg, MsgProps, IsDelivered, ChPid,
                 fun maybe_write_to_disk/4,
                 State),
    a(maybe_update_rates(State1)).

publish_delivered(Msg, MsgProps, ChPid, State) ->
    {SeqId, State1} =
        publish_delivered1(Msg, MsgProps, ChPid,
                           fun maybe_write_to_disk/4,
                           State),
    {SeqId, a(maybe_update_rates(State1))}.

discard(_MsgId, _ChPid, State) -> State.

drain_confirmed(State = #vqstate { confirmed = C }) ->
    case sets:is_empty(C) of
        true  -> {[], State}; %% common case
        false -> {sets:to_list(C), State #vqstate {
                                        confirmed = sets:new([{version, 2}]) }}
    end.

dropwhile(Pred, State) ->
    {MsgProps, State1} =
        remove_by_predicate(Pred, State),
    {MsgProps, a(State1)}.

fetchwhile(Pred, Fun, Acc, State) ->
    {MsgProps, Acc1, State1} =
         fetch_by_predicate(Pred, Fun, Acc, State),
    {MsgProps, Acc1, a(State1)}.

fetch(AckRequired, State) ->
    case queue_out(State) of
        {empty, State1} ->
            {empty, a(State1)};
        {{value, MsgStatus}, State1} ->
            %% it is possible that the message wasn't read from disk
            %% at this point, so read it in.
            {Msg, State2} = read_msg(MsgStatus, State1),
            {AckTag, State3} = remove(AckRequired, MsgStatus, State2),
            {{Msg, MsgStatus#msg_status.is_delivered, AckTag}, a(State3)}
    end.

%% @todo It may seem like we would benefit from avoiding reading the
%%       message content from disk. But benchmarks tell a different
%%       story. So we don't, until a better understanding is gained.
drop(AckRequired, State) ->
    case queue_out(State) of
        {empty, State1} ->
            {empty, a(State1)};
        {{value, MsgStatus}, State1} ->
            {AckTag, State2} = remove(AckRequired, MsgStatus, State1),
            {{MsgStatus#msg_status.msg_id, AckTag}, a(State2)}
    end.

%% Duplicated from rabbit_backing_queue
-spec ack([ack()], state()) -> {[rabbit_guid:guid()], state()}.

ack([], State) ->
    {[], State};
%% optimisation: this head is essentially a partial evaluation of the
%% general case below, for the single-ack case.
ack([SeqId], State) ->
    case remove_pending_ack(true, SeqId, State) of
        {none, _} ->
            {[], State};
        {MsgStatus = #msg_status{ msg_id = MsgId },
         State1 = #vqstate{ ack_out_counter = AckOutCount }} ->
            State2 = remove_from_disk(MsgStatus, State1),
            {[MsgId],
             a(State2 #vqstate { ack_out_counter  = AckOutCount + 1 })}
    end;
ack(AckTags, State) ->
    {{IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, AllMsgIds},
     State1 = #vqstate { index_state       = IndexState,
                         store_state       = StoreState0,
                         ack_out_counter   = AckOutCount }} =
        lists:foldl(
          fun (SeqId, {Acc, State2}) ->
                  %% @todo When acking many messages we should update stats once not for every.
                  %%       Also remove the pending acks all at once instead of every.
                  case remove_pending_ack(true, SeqId, State2) of
                      {none, _} ->
                          {Acc, State2};
                      {MsgStatus, State3} ->
                          {accumulate_ack(MsgStatus, Acc), State3}
                  end
          end, {accumulate_ack_init(), State}, AckTags),
    {DeletedSegments, IndexState1} = rabbit_classic_queue_index_v2:ack(IndexOnDiskSeqIds, IndexState),
    StoreState1 = rabbit_classic_queue_store_v2:delete_segments(DeletedSegments, StoreState0),
    StoreState = lists:foldl(fun rabbit_classic_queue_store_v2:remove/2, StoreState1, SeqIdsInStore),
    State2 = remove_vhost_msgs_by_id(MsgIdsByStore, State1),
    {lists:reverse(AllMsgIds),
     a(State2 #vqstate { index_state      = IndexState1,
                         store_state      = StoreState,
                         ack_out_counter  = AckOutCount + length(AckTags) })}.

requeue(AckTags, #vqstate { delta      = Delta,
                            q3         = Q3,
                            in_counter = InCounter,
                            len        = Len } = State) ->
    %% @todo This can be heavily simplified: if the message falls into delta,
    %%       add it there. Otherwise just add it to q3 in the correct position.
    {SeqIds, Q3a, MsgIds, State1} = requeue_merge(lists:sort(AckTags), Q3, [],
                                                  delta_limit(Delta), State),
    {Delta1, MsgIds1, State2}     = delta_merge(SeqIds, Delta, MsgIds,
                                                State1),
    MsgCount = length(MsgIds1),
    {MsgIds1, a(
                  maybe_update_rates(ui(
                    State2 #vqstate { delta      = Delta1,
                                      q3         = Q3a,
                                      in_counter = InCounter + MsgCount,
                                      len        = Len + MsgCount })))}.

ackfold(MsgFun, Acc, State, AckTags) ->
    {AccN, StateN} =
        lists:foldl(fun(SeqId, {Acc0, State0}) ->
                            MsgStatus = lookup_pending_ack(SeqId, State0),
                            {Msg, State1} = read_msg(MsgStatus, State0),
                            {MsgFun(Msg, SeqId, Acc0), State1}
                    end, {Acc, State}, AckTags),
    {AccN, a(StateN)}.

fold(Fun, Acc, State = #vqstate{index_state = IndexState}) ->
    {Its, IndexState1} = lists:foldl(fun inext/2, {[], IndexState},
                                     [msg_iterator(State),
                                      disk_ack_iterator(State),
                                      ram_ack_iterator(State)]),
    ifold(Fun, Acc, Its, State#vqstate{index_state = IndexState1}).

len(#vqstate { len = Len }) -> Len.

is_empty(State) -> 0 == len(State).

depth(State) ->
    len(State) + count_pending_acks(State).

maybe_update_rates(State = #vqstate{ in_counter  = InCount,
                                     out_counter = OutCount })
  when InCount + OutCount > ?MSGS_PER_RATE_CALC ->
    update_rates(State);
maybe_update_rates(State) ->
    State.

update_rates(State = #vqstate{ in_counter      =     InCount,
                               out_counter     =    OutCount,
                               ack_in_counter  =  AckInCount,
                               ack_out_counter = AckOutCount,
                               rates = #rates{ in        =     InRate,
                                               out       =    OutRate,
                                               ack_in    =  AckInRate,
                                               ack_out   = AckOutRate,
                                               timestamp = TS }}) ->
    Now = erlang:monotonic_time(),

    Rates = #rates { in        = update_rate(Now, TS,     InCount,     InRate),
                     out       = update_rate(Now, TS,    OutCount,    OutRate),
                     ack_in    = update_rate(Now, TS,  AckInCount,  AckInRate),
                     ack_out   = update_rate(Now, TS, AckOutCount, AckOutRate),
                     timestamp = Now },

    State#vqstate{ in_counter      = 0,
                   out_counter     = 0,
                   ack_in_counter  = 0,
                   ack_out_counter = 0,
                   rates           = Rates }.

update_rate(Now, TS, Count, Rate) ->
    Time = erlang:convert_time_unit(Now - TS, native, micro_seconds) /
        ?MICROS_PER_SECOND,
    if
        Time == 0 -> Rate;
        true      -> rabbit_misc:moving_average(Time, ?RATE_AVG_HALF_LIFE,
                                                Count / Time, Rate)
    end.

needs_timeout(#vqstate { index_state = IndexState,
                         unconfirmed_simple = UCS }) ->
    case {rabbit_classic_queue_index_v2:needs_sync(IndexState), sets:is_empty(UCS)} of
        {false, false} -> timed;
        {confirms, _}  -> timed;
        {false, true}  -> false
    end.

timeout(State = #vqstate { index_state = IndexState0,
                           store_state = StoreState0,
                           unconfirmed_simple = UCS,
                           confirmed   = C }) ->
    IndexState = rabbit_classic_queue_index_v2:sync(IndexState0),
    StoreState = rabbit_classic_queue_store_v2:sync(StoreState0),
    State #vqstate { index_state = IndexState,
                     store_state = StoreState,
                     unconfirmed_simple = sets:new([{version,2}]),
                     confirmed   = sets:union(C, UCS) }.

handle_pre_hibernate(State = #vqstate { index_state = IndexState0,
                                        store_state = StoreState0,
                                        msg_store_clients = MSCState0,
                                        unconfirmed_simple = UCS,
                                        confirmed   = C }) ->
    MSCState = msg_store_pre_hibernate(MSCState0),
    IndexState = rabbit_classic_queue_index_v2:flush(IndexState0),
    StoreState = rabbit_classic_queue_store_v2:sync(StoreState0),
    State #vqstate { index_state = IndexState,
                     store_state = StoreState,
                     msg_store_clients = MSCState,
                     unconfirmed_simple = sets:new([{version,2}]),
                     confirmed   = sets:union(C, UCS) }.

resume(State) -> a(timeout(State)).

msg_rates(#vqstate { rates = #rates { in  = AvgIngressRate,
                                      out = AvgEgressRate } }) ->
    {AvgIngressRate, AvgEgressRate}.

info(messages_ready_ram, #vqstate{ram_msg_count = RamMsgCount}) ->
    RamMsgCount;
info(messages_unacknowledged_ram, #vqstate{ram_pending_ack = RPA}) ->
    map_size(RPA);
info(messages_ram, State) ->
    info(messages_ready_ram, State) + info(messages_unacknowledged_ram, State);
info(messages_persistent, #vqstate{persistent_count = PersistentCount}) ->
    PersistentCount;
info(messages_paged_out, #vqstate{delta = #delta{transient = Count}}) ->
    Count;
info(message_bytes, #vqstate{bytes         = Bytes,
                             unacked_bytes = UBytes}) ->
    Bytes + UBytes;
info(message_bytes_ready, #vqstate{bytes = Bytes}) ->
    Bytes;
info(message_bytes_unacknowledged, #vqstate{unacked_bytes = UBytes}) ->
    UBytes;
info(message_bytes_ram, #vqstate{ram_bytes = RamBytes}) ->
    RamBytes;
info(message_bytes_persistent, #vqstate{persistent_bytes = PersistentBytes}) ->
    PersistentBytes;
info(message_bytes_paged_out, #vqstate{delta_transient_bytes = PagedOutBytes}) ->
    PagedOutBytes;
info(head_message_timestamp, #vqstate{
          q3               = Q3,
          ram_pending_ack  = RPA}) ->
    head_message_timestamp(Q3, RPA);
info(oldest_message_received_timestamp, #vqstate{
                                           q3               = Q3,
                                           ram_pending_ack  = RPA}) ->
    oldest_message_received_timestamp(Q3, RPA);
info(disk_reads, #vqstate{disk_read_count = Count}) ->
    Count;
info(disk_writes, #vqstate{disk_write_count = Count}) ->
    Count;
info(backing_queue_status, #vqstate {
          delta = Delta, q3 = Q3,
          mode             = Mode,
          len              = Len,
          target_ram_count = TargetRamCount,
          next_seq_id      = NextSeqId,
          next_deliver_seq_id = NextDeliverSeqId,
          ram_pending_ack  = RPA,
          disk_pending_ack = DPA,
          unconfirmed      = UC,
          unconfirmed_simple = UCS,
          index_state      = IndexState,
          store_state      = StoreState,
          rates            = #rates { in      = AvgIngressRate,
                                      out     = AvgEgressRate,
                                      ack_in  = AvgAckIngressRate,
                                      ack_out = AvgAckEgressRate }}) ->
    [ {mode                , Mode},
      {version             , 2},
      {q1                  , 0},
      {q2                  , 0},
      {delta               , Delta},
      {q3                  , ?QUEUE:len(Q3)},
      {q4                  , 0},
      {len                 , Len},
      {target_ram_count    , TargetRamCount},
      {next_seq_id         , NextSeqId},
      {next_deliver_seq_id , NextDeliverSeqId},
      {num_pending_acks    , map_size(RPA) + map_size(DPA)},
      {num_unconfirmed     , sets:size(UC) + sets:size(UCS)},
      {avg_ingress_rate    , AvgIngressRate},
      {avg_egress_rate     , AvgEgressRate},
      {avg_ack_ingress_rate, AvgAckIngressRate},
      {avg_ack_egress_rate , AvgAckEgressRate} ]
    ++ rabbit_classic_queue_index_v2:info(IndexState)
    ++ rabbit_classic_queue_store_v2:info(StoreState);
info(_, _) ->
    ''.

invoke(?MODULE, Fun, State) -> Fun(?MODULE, State);
invoke(      _,   _, State) -> State.

is_duplicate(_Msg, State) -> {false, State}.

%% Queue mode has been unified.
set_queue_mode(_, State) ->
    State.

zip_msgs_and_acks(Msgs, AckTags, Accumulator, _State) ->
    lists:foldl(fun ({{Msg, _Props}, AckTag}, Acc) ->
                        Id = mc:get_annotation(id, Msg),
                        [{Id, AckTag} | Acc]
                end, Accumulator, lists:zip(Msgs, AckTags)).

%% Queue version now ignored; only v2 is available.
set_queue_version(_, State) ->
    State.

%% This function is used by rabbit_classic_queue_index_v2
%% to convert v1 queues to v2 after an upgrade to 4.0.
convert_from_v1_to_v2_loop(_, _, V2Index, V2Store, _, HiSeqId, HiSeqId, _) ->
    {V2Index, V2Store};
convert_from_v1_to_v2_loop(QueueName, V1Index0, V2Index0, V2Store0,
                           Counters = {CountersRef, CountIx, BytesIx},
                           LoSeqId, HiSeqId, SkipFun) ->
    UpSeqId = lists:min([rabbit_queue_index:next_segment_boundary(LoSeqId),
                         HiSeqId]),
    {Messages, V1Index} = rabbit_queue_index:read(LoSeqId, UpSeqId, V1Index0),
    %% We do a garbage collect immediately after the old index read
    %% because that may have created a lot of garbage.
    garbage_collect(),
    {V2Index3, V2Store3} = lists:foldl(fun
        %% Move embedded messages to the per-queue store.
        ({Msg, SeqId, rabbit_queue_index, Props, IsPersistent},
         {V2Index1, V2Store1}) ->
            MsgId = mc:get_annotation(id, Msg),
            {MsgLocation, V2Store2} = rabbit_classic_queue_store_v2:write(SeqId, Msg, Props, V2Store1),
            V2Index2 = case SkipFun(SeqId, V2Index1) of
                {skip, V2Index1a} ->
                    V2Index1a;
                {write, V2Index1a} ->
                    counters:add(CountersRef, CountIx, 1),
                    counters:add(CountersRef, BytesIx, Props#message_properties.size),
                    rabbit_classic_queue_index_v2:publish(MsgId, SeqId, MsgLocation, Props, IsPersistent, infinity, V2Index1a)
            end,
            {V2Index2, V2Store2};
        %% Keep messages in the per-vhost store where they are.
        ({MsgId, SeqId, rabbit_msg_store, Props, IsPersistent},
         {V2Index1, V2Store1}) ->
            V2Index2 = case SkipFun(SeqId, V2Index1) of
                {skip, V2Index1a} ->
                    V2Index1a;
                {write, V2Index1a} ->
                    counters:add(CountersRef, CountIx, 1),
                    counters:add(CountersRef, BytesIx, Props#message_properties.size),
                    rabbit_classic_queue_index_v2:publish(MsgId, SeqId, rabbit_msg_store, Props, IsPersistent, infinity, V2Index1a)
            end,
            {V2Index2, V2Store1}
    end, {V2Index0, V2Store0}, Messages),
    %% Flush to disk to avoid keeping too much in memory between segments.
    V2Index = rabbit_classic_queue_index_v2:flush(V2Index3),
    V2Store = rabbit_classic_queue_store_v2:sync(V2Store3),
    %% We have written everything to disk. We can delete the old segment file
    %% to free up much needed space, to avoid doubling disk usage during the upgrade.
    rabbit_queue_index:delete_segment_file_for_seq_id(LoSeqId, V1Index),
    %% Log some progress to keep the user aware of what's going on, as moving
    %% embedded messages can take quite some time.
    #resource{virtual_host = VHost, name = Name} = QueueName,
    rabbit_log:info("Queue ~ts in vhost ~ts converted ~b messages from v1 to v2",
                    [Name, VHost, length(Messages)]),
    convert_from_v1_to_v2_loop(QueueName, V1Index, V2Index, V2Store, Counters, UpSeqId, HiSeqId, SkipFun).

%% Get the Timestamp property of the first msg, if present. This is
%% the one with the oldest timestamp among the heads of the pending
%% acks and unread queues.  We can't check disk_pending_acks as these
%% are paged out - we assume some will soon be paged in rather than
%% forcing it to happen.  Pending ack msgs are included as they are
%% regarded as unprocessed until acked, this also prevents the result
%% apparently oscillating during repeated rejects.
%%
head_message_timestamp(Q3, RPA) ->
    HeadMsgs = [ HeadMsgStatus#msg_status.msg ||
                   HeadMsgStatus <-
                       [ get_q_head(Q3),
                         get_pa_head(RPA) ],
                   HeadMsgStatus /= undefined,
                   HeadMsgStatus#msg_status.msg /= undefined ],

    Timestamps =
        [Timestamp div 1000
         || HeadMsg <- HeadMsgs,
            Timestamp <- [mc:timestamp(HeadMsg)],
            Timestamp /= undefined
        ],

    case Timestamps == [] of
        true -> '';
        false -> lists:min(Timestamps)
    end.

oldest_message_received_timestamp(Q3, RPA) ->
    HeadMsgs = [ HeadMsgStatus#msg_status.msg ||
                   HeadMsgStatus <-
                       [ get_q_head(Q3),
                         get_pa_head(RPA) ],
                   HeadMsgStatus /= undefined,
                   HeadMsgStatus#msg_status.msg /= undefined ],

    Timestamps =
        [Timestamp
         || HeadMsg <- HeadMsgs,
            Timestamp <- [mc:get_annotation(?ANN_RECEIVED_AT_TIMESTAMP, HeadMsg)],
            Timestamp /= undefined
        ],

    case Timestamps == [] of
        true -> '';
        false -> lists:min(Timestamps)
    end.

get_q_head(Q) ->
    ?QUEUE:get(Q, undefined).

get_pa_head(PA) ->
    case maps:keys(PA) of
        [] -> undefined;
        Keys ->
            Smallest = lists:min(Keys),
            map_get(Smallest, PA)
    end.

a(State = #vqstate { delta = Delta, q3 = Q3,
                     len              = Len,
                     bytes            = Bytes,
                     unacked_bytes    = UnackedBytes,
                     persistent_count = PersistentCount,
                     persistent_bytes = PersistentBytes,
                     ram_msg_count    = RamMsgCount,
                     ram_bytes        = RamBytes}) ->
    ED = Delta#delta.count == 0,
    E3 = ?QUEUE:is_empty(Q3),
    LZ = Len == 0,
    L3 = ?QUEUE:len(Q3),

    %% if the queue is empty, then delta is empty and q3 is empty.
    true = LZ == (ED and E3),

    %% There should be no messages in q1, q2, and q4
    true = Delta#delta.count + L3 == Len,

    true = Len             >= 0,
    true = Bytes           >= 0,
    true = UnackedBytes    >= 0,
    true = PersistentCount >= 0,
    true = PersistentBytes >= 0,
    true = RamMsgCount     >= 0,
    true = RamMsgCount     =< Len,
    true = RamBytes        >= 0,
    true = RamBytes        =< Bytes + UnackedBytes,

    State.

d(Delta = #delta { start_seq_id = Start, count = Count, end_seq_id = End })
  when Start + Count =< End ->
    Delta.

m(MsgStatus = #msg_status { is_persistent = IsPersistent,
                            msg_location  = MsgLocation,
                            index_on_disk = IndexOnDisk }) ->
    true = (not IsPersistent) or IndexOnDisk,
    true = msg_in_ram(MsgStatus) or (MsgLocation =/= memory),
    MsgStatus.

one_if(true ) -> 1;
one_if(false) -> 0.

cons_if(true,   E, L) -> [E | L];
cons_if(false, _E, L) -> L.

msg_status(IsPersistent, IsDelivered, SeqId,
           Msg, MsgProps, IndexMaxSize) ->
    MsgId = mc:get_annotation(id, Msg),
    #msg_status{seq_id        = SeqId,
                msg_id        = MsgId,
                msg           = Msg,
                is_persistent = IsPersistent,
                %% This value will only be correct when the message is going out.
                %% See the set_deliver_flag/2 function.
                is_delivered  = IsDelivered,
                msg_location  = memory,
                index_on_disk = false,
                persist_to    = determine_persist_to(Msg, MsgProps, IndexMaxSize),
                msg_props     = MsgProps}.

beta_msg_status({MsgId, SeqId, MsgLocation, MsgProps, IsPersistent})
  when is_binary(MsgId) orelse
       MsgId =:= undefined ->
    MS0 = beta_msg_status0(SeqId, MsgProps, IsPersistent),
    MS0#msg_status{msg_id       = MsgId,
                   msg          = undefined,
                   persist_to   = case is_tuple(MsgLocation) of
                                      true  -> queue_store; %% @todo I'm not sure this clause is triggered anymore.
                                      false -> msg_store
                                  end,
                   msg_location = MsgLocation};
beta_msg_status({Msg, SeqId, MsgLocation, MsgProps, IsPersistent}) ->
    MsgId = mc:get_annotation(id, Msg),
    MS0 = beta_msg_status0(SeqId, MsgProps, IsPersistent),
    MS0#msg_status{msg_id       = MsgId,
                   msg          = Msg,
                   persist_to   = case MsgLocation of
                                      rabbit_queue_index -> queue_index;
                                      {rabbit_classic_queue_store_v2, _, _} -> queue_store;
                                      rabbit_msg_store -> msg_store
                                  end,
                   msg_location = case MsgLocation of
                                      rabbit_queue_index -> memory;
                                      _ -> MsgLocation
                                  end}.

beta_msg_status0(SeqId, MsgProps, IsPersistent) ->
  #msg_status{seq_id        = SeqId,
              msg           = undefined,
              is_persistent = IsPersistent,
              index_on_disk = true,
              msg_props     = MsgProps}.

with_msg_store_state({MSCStateP, MSCStateT},  true, Fun) ->
    {Result, MSCStateP1} = Fun(MSCStateP),
    {Result, {MSCStateP1, MSCStateT}};
with_msg_store_state({MSCStateP, MSCStateT}, false, Fun) ->
    {Result, MSCStateT1} = Fun(MSCStateT),
    {Result, {MSCStateP, MSCStateT1}}.

with_immutable_msg_store_state(MSCState, IsPersistent, Fun) ->
    {Res, MSCState} = with_msg_store_state(MSCState, IsPersistent,
                                           fun (MSCState1) ->
                                                   {Fun(MSCState1), MSCState1}
                                           end),
    Res.

msg_store_client_init(MsgStore, MsgOnDiskFun, VHost) ->
    msg_store_client_init(MsgStore, rabbit_guid:gen(), MsgOnDiskFun,
                          VHost).

msg_store_client_init(MsgStore, Ref, MsgOnDiskFun, VHost) ->
    rabbit_vhost_msg_store:client_init(VHost, MsgStore,
                                       Ref, MsgOnDiskFun).

msg_store_pre_hibernate({undefined, MSCStateT}) ->
    {undefined,
     rabbit_msg_store:client_pre_hibernate(MSCStateT)};
msg_store_pre_hibernate({MSCStateP, MSCStateT}) ->
    {rabbit_msg_store:client_pre_hibernate(MSCStateP),
     rabbit_msg_store:client_pre_hibernate(MSCStateT)}.

msg_store_write(MSCState, IsPersistent, SeqId, MsgId, Msg) ->
    with_immutable_msg_store_state(
      MSCState, IsPersistent,
      fun (MSCState1) ->
              rabbit_msg_store:write_flow(SeqId, MsgId, Msg, MSCState1)
      end).

msg_store_read(MSCState, IsPersistent, MsgId) ->
    with_msg_store_state(
      MSCState, IsPersistent,
      fun (MSCState1) ->
              rabbit_msg_store:read(MsgId, MSCState1)
      end).

msg_store_remove(MSCState, IsPersistent, MsgIds) ->
    with_immutable_msg_store_state(
      MSCState, IsPersistent,
      fun (MCSState1) ->
              rabbit_msg_store:remove(MsgIds, MCSState1)
      end).

betas_from_index_entries(List, TransientThreshold, DelsAndAcksFun, State = #vqstate{ next_deliver_seq_id = NextDeliverSeqId0 }) ->
    {Filtered, NextDeliverSeqId, Acks, RamReadyCount, RamBytes, TransientCount, TransientBytes} =
        lists:foldr(
          fun ({_MsgOrId, SeqId, _MsgLocation, _MsgProps, IsPersistent} = M,
               {Filtered1, NextDeliverSeqId1, Acks1, RRC, RB, TC, TB} = Acc) ->
                  case SeqId < TransientThreshold andalso not IsPersistent of
                      true  -> {Filtered1,
                                next_deliver_seq_id(SeqId, NextDeliverSeqId1),
                                [SeqId | Acks1], RRC, RB, TC, TB};
                      false -> MsgStatus = m(beta_msg_status(M)),
                               HaveMsg = msg_in_ram(MsgStatus),
                               Size = msg_size(MsgStatus),
                               case is_msg_in_pending_acks(SeqId, State) of
                                   false -> {?QUEUE:in_r(MsgStatus, Filtered1),
                                             NextDeliverSeqId1, Acks1,
                                             RRC + one_if(HaveMsg),
                                             RB + one_if(HaveMsg) * Size,
                                             TC + one_if(not IsPersistent),
                                             TB + one_if(not IsPersistent) * Size};
                                   true  -> Acc %% [0]
                               end
                  end
          end, {?QUEUE:new(), NextDeliverSeqId0, [], 0, 0, 0, 0}, List),
    {Filtered, RamReadyCount, RamBytes, DelsAndAcksFun(NextDeliverSeqId, Acks, State),
     TransientCount, TransientBytes}.
%% [0] We don't increase RamBytes here, even though it pertains to
%% unacked messages too, since if HaveMsg then the message must have
%% been stored in the QI, thus the message must have been in
%% qi_pending_ack, thus it must already have been in RAM.

%% We increase the next_deliver_seq_id only when the next
%% message (next seq_id) was delivered.
next_deliver_seq_id(SeqId, NextDeliverSeqId)
        when SeqId =:= NextDeliverSeqId ->
    NextDeliverSeqId + 1;
next_deliver_seq_id(_, NextDeliverSeqId) ->
    NextDeliverSeqId.

is_msg_in_pending_acks(SeqId, #vqstate { ram_pending_ack  = RPA,
                                         disk_pending_ack = DPA }) ->
    maps:is_key(SeqId, RPA) orelse
    maps:is_key(SeqId, DPA).

expand_delta(SeqId, ?BLANK_DELTA_PATTERN(X), IsPersistent) ->
    d(#delta { start_seq_id = SeqId, count = 1, end_seq_id = SeqId + 1,
               transient = one_if(not IsPersistent)});
expand_delta(SeqId, #delta { start_seq_id = StartSeqId,
                             count        = Count,
                             transient    = Transient } = Delta,
             IsPersistent )
  when SeqId < StartSeqId ->
    d(Delta #delta { start_seq_id = SeqId, count = Count + 1,
                     transient = Transient + one_if(not IsPersistent)});
expand_delta(SeqId, #delta { count        = Count,
                             end_seq_id   = EndSeqId,
                             transient    = Transient } = Delta,
             IsPersistent)
  when SeqId >= EndSeqId ->
    d(Delta #delta { count = Count + 1, end_seq_id = SeqId + 1,
                     transient = Transient + one_if(not IsPersistent)});
expand_delta(_SeqId, #delta { count       = Count,
                              transient   = Transient } = Delta,
             IsPersistent ) ->
    d(Delta #delta { count = Count + 1,
                     transient = Transient + one_if(not IsPersistent) }).

%%----------------------------------------------------------------------------
%% Internal major helpers for Public API
%%----------------------------------------------------------------------------

init(IsDurable, IndexState, StoreState, DeltaCount, DeltaBytes, Terms,
     PersistentClient, TransientClient, VHost) ->
    {LowSeqId, HiSeqId, IndexState1} = rabbit_classic_queue_index_v2:bounds(IndexState),

    {NextSeqId, NextDeliverSeqId, DeltaCount1, DeltaBytes1} =
        case Terms of
            non_clean_shutdown -> {HiSeqId, HiSeqId, DeltaCount, DeltaBytes};
            _                  -> NextSeqId0 = proplists:get_value(next_seq_id,
                                                                   Terms, HiSeqId),
                                  {NextSeqId0,
                                   proplists:get_value(next_deliver_seq_id,
                                                       Terms, NextSeqId0),
                                   proplists:get_value(persistent_count,
                                                       Terms, DeltaCount),
                                   proplists:get_value(persistent_bytes,
                                                       Terms, DeltaBytes)}
        end,
    Delta = case DeltaCount1 == 0 andalso DeltaCount /= undefined of
                true  -> ?BLANK_DELTA;
                false -> d(#delta { start_seq_id = LowSeqId,
                                    count        = DeltaCount1,
                                    transient    = 0,
                                    end_seq_id   = NextSeqId })
            end,
    Now = erlang:monotonic_time(),
    IoBatchSize = rabbit_misc:get_env(rabbit, msg_store_io_batch_size,
                                      ?IO_BATCH_SIZE),

    {ok, IndexMaxSize} = application:get_env(
                           rabbit, queue_index_embed_msgs_below),
    State = #vqstate {
      q1                  = ?QUEUE:new(),
      q2                  = ?QUEUE:new(),
      delta               = Delta,
      q3                  = ?QUEUE:new(),
      q4                  = ?QUEUE:new(),
      next_seq_id         = NextSeqId,
      next_deliver_seq_id = NextDeliverSeqId,
      ram_pending_ack     = #{},
      disk_pending_ack    = #{},
      index_state         = IndexState1,
      store_state         = StoreState,
      msg_store_clients   = {PersistentClient, TransientClient},
      durable             = IsDurable,
      transient_threshold = NextSeqId,
      qi_embed_msgs_below = IndexMaxSize,

      len                 = DeltaCount1,
      persistent_count    = DeltaCount1,
      bytes               = DeltaBytes1,
      persistent_bytes    = DeltaBytes1,
      delta_transient_bytes = 0,

      target_ram_count    = infinity,
      ram_msg_count       = 0,
      ram_msg_count_prev  = 0,
      ram_ack_count_prev  = 0,
      ram_bytes           = 0,
      unacked_bytes       = 0,
      out_counter         = 0,
      in_counter          = 0,
      rates               = blank_rates(Now),
      msgs_on_disk        = sets:new([{version,2}]),
      msg_indices_on_disk = sets:new([{version,2}]),
      unconfirmed         = sets:new([{version,2}]),
      unconfirmed_simple  = sets:new([{version,2}]),
      confirmed           = sets:new([{version,2}]),
      ack_out_counter     = 0,
      ack_in_counter      = 0,
      disk_read_count     = 0,
      disk_write_count    = 0,

      io_batch_size       = IoBatchSize,

      mode                = default,
      virtual_host        = VHost},
    a(maybe_deltas_to_betas(State)).

blank_rates(Now) ->
    #rates { in        = 0.0,
             out       = 0.0,
             ack_in    = 0.0,
             ack_out   = 0.0,
             timestamp = Now}.

in_r(MsgStatus = #msg_status {}, State = #vqstate { q3 = Q3 }) ->
    State #vqstate { q3 = ?QUEUE:in_r(MsgStatus, Q3) }.

queue_out(State) ->
    case fetch_from_q3(State) of
        {empty, _State1} = Result     -> Result;
        {loaded, {MsgStatus, State1}} -> {{value, set_deliver_flag(State, MsgStatus)}, State1}
    end.

set_deliver_flag(#vqstate{ next_deliver_seq_id = NextDeliverSeqId },
                 MsgStatus = #msg_status{ seq_id = SeqId }) ->
    MsgStatus#msg_status{ is_delivered = SeqId < NextDeliverSeqId }.

read_msg(#msg_status{seq_id        = SeqId,
                     msg           = undefined,
                     msg_id        = MsgId,
                     is_persistent = IsPersistent,
                     msg_location  = MsgLocation}, State) ->
    read_msg(SeqId, MsgId, IsPersistent, MsgLocation, State);
read_msg(#msg_status{msg = Msg}, State) ->
    {Msg, State}.

read_msg(SeqId, _, _, MsgLocation, State = #vqstate{ store_state = StoreState0 })
        when is_tuple(MsgLocation) ->
    {Msg, StoreState} = rabbit_classic_queue_store_v2:read(SeqId, MsgLocation, StoreState0),
    {Msg, State#vqstate{ store_state = StoreState }};
read_msg(_, MsgId, IsPersistent, rabbit_msg_store, State = #vqstate{msg_store_clients = MSCState,
                                                                    disk_read_count   = Count}) ->
    {{ok, Msg}, MSCState1} =
        msg_store_read(MSCState, IsPersistent, MsgId),
    {Msg, State #vqstate {msg_store_clients = MSCState1,
                          disk_read_count   = Count + 1}}.

%% Helper macros to make the code as obvious as possible.
%% It's OK to call msg_size/1 for Inc because it gets inlined.
-define(UP(A, Inc),
    A = St#vqstate.A + Inc).
-define(UP(A, B, Inc),
    A = St#vqstate.A + Inc,
    B = St#vqstate.B + Inc).
-define(UP(A, B, C, Inc),
    A = St#vqstate.A + Inc,
    B = St#vqstate.B + Inc,
    C = St#vqstate.C + Inc).

%% When publishing to memory, transient messages do not get written to disk.
%% On the other hand, persistent messages are kept in memory as well as disk.
stats_published_memory(MS = #msg_status{is_persistent = true}, St) ->
    St#vqstate{?UP(len, ram_msg_count, persistent_count, +1),
               ?UP(bytes, ram_bytes, persistent_bytes, +msg_size(MS))};
stats_published_memory(MS = #msg_status{is_persistent = false}, St) ->
    St#vqstate{?UP(len, ram_msg_count, +1),
               ?UP(bytes, ram_bytes, +msg_size(MS))}.

%% Messages published directly to disk are not kept in memory.
stats_published_disk(MS = #msg_status{is_persistent = true}, St) ->
    St#vqstate{?UP(len, persistent_count, +1),
               ?UP(bytes, persistent_bytes, +msg_size(MS))};
stats_published_disk(MS = #msg_status{is_persistent = false}, St) ->
    St#vqstate{?UP(len, +1),
               ?UP(bytes, delta_transient_bytes, +msg_size(MS))}.

%% Pending acks do not add to len. Messages are kept in memory.
stats_published_pending_acks(MS = #msg_status{is_persistent = true}, St) ->
    St#vqstate{?UP(persistent_count, +1),
               ?UP(persistent_bytes, unacked_bytes, ram_bytes, +msg_size(MS))};
stats_published_pending_acks(MS = #msg_status{is_persistent = false}, St) ->
    St#vqstate{?UP(unacked_bytes, ram_bytes, +msg_size(MS))}.

%% Messages are moved from memory to pending acks. They may have
%% the message body either in memory or on disk depending on how
%% the message got to memory in the first place (if the message
%% was fully on disk the content will not be read immediately).
%% The contents stay where they are during this operation.
stats_pending_acks(MS = #msg_status{msg = undefined}, St) ->
    St#vqstate{?UP(len, -1),
               ?UP(bytes, -msg_size(MS)), ?UP(unacked_bytes, +msg_size(MS))};
stats_pending_acks(MS, St) ->
    St#vqstate{?UP(len, ram_msg_count, -1),
               ?UP(bytes, -msg_size(MS)), ?UP(unacked_bytes, +msg_size(MS))}.

%% Message may or may not be persistent and the contents
%% may or may not be in memory.
%%
%% Removal from delta_transient_bytes is done by maybe_deltas_to_betas.
stats_removed(MS = #msg_status{is_persistent = true, msg = undefined}, St) ->
    St#vqstate{?UP(len, persistent_count, -1),
               ?UP(bytes, persistent_bytes, -msg_size(MS))};
stats_removed(MS = #msg_status{is_persistent = true}, St) ->
    St#vqstate{?UP(len, ram_msg_count, persistent_count, -1),
               ?UP(bytes, ram_bytes, persistent_bytes, -msg_size(MS))};
stats_removed(MS = #msg_status{is_persistent = false, msg = undefined}, St) ->
    St#vqstate{?UP(len, -1), ?UP(bytes, -msg_size(MS))};
stats_removed(MS = #msg_status{is_persistent = false}, St) ->
    St#vqstate{?UP(len, ram_msg_count, -1),
               ?UP(bytes, ram_bytes, -msg_size(MS))}.

%% @todo Very confusing that ram_msg_count is without unacked but ram_bytes is with.
%%       Rename the fields to make these things obvious. Fields are internal
%%       so that should be OK.

stats_acked_pending(MS = #msg_status{is_persistent = true, msg = undefined}, St) ->
    St#vqstate{?UP(persistent_count, -1),
               ?UP(persistent_bytes, unacked_bytes, -msg_size(MS))};
stats_acked_pending(MS = #msg_status{is_persistent = true}, St) ->
    St#vqstate{?UP(persistent_count, -1),
               ?UP(persistent_bytes, unacked_bytes, ram_bytes, -msg_size(MS))};
stats_acked_pending(MS = #msg_status{is_persistent = false,
                                     msg           = undefined}, St) ->
    St#vqstate{?UP(unacked_bytes, -msg_size(MS))};
stats_acked_pending(MS = #msg_status{is_persistent = false}, St) ->
    St#vqstate{?UP(unacked_bytes, ram_bytes, -msg_size(MS))}.

%% Notice that this is the reverse of stats_pending_acks.
stats_requeued_memory(MS = #msg_status{msg = undefined}, St) ->
    St#vqstate{?UP(len, +1),
               ?UP(bytes, +msg_size(MS)), ?UP(unacked_bytes, -msg_size(MS))};
stats_requeued_memory(MS, St) ->
    St#vqstate{?UP(len, ram_msg_count, +1),
               ?UP(bytes, +msg_size(MS)), ?UP(unacked_bytes, -msg_size(MS))}.

%% @todo For v2 since we don't remove from disk until we ack, we don't need
%%       to write to disk again on requeue. If the message falls within delta
%%       we can just drop the MsgStatus. Otherwise we just put it in q3 and
%%       we don't do any disk writes.
%%
%%       For v1 I'm not sure? I don't think we need to write to the index
%%       at least, but maybe we need to write the message if not embedded?
%%       I don't think we need to...
%%
%%       So we don't need to change anything except how we count stats as
%%       well as delta stats if the message falls within delta.
stats_requeued_disk(MS = #msg_status{is_persistent = true}, St) ->
    St#vqstate{?UP(len, +1),
               ?UP(bytes, +msg_size(MS)), ?UP(unacked_bytes, -msg_size(MS))};
stats_requeued_disk(MS = #msg_status{is_persistent = false}, St) ->
    St#vqstate{?UP(len, +1),
               ?UP(bytes, delta_transient_bytes, +msg_size(MS)),
               ?UP(unacked_bytes, -msg_size(MS))}.

msg_size(#msg_status{msg_props = #message_properties{size = Size}}) -> Size.

msg_in_ram(#msg_status{msg = Msg}) -> Msg =/= undefined.

%% first param: AckRequired
remove(true, MsgStatus = #msg_status {
               seq_id        = SeqId },
       State = #vqstate {next_deliver_seq_id = NextDeliverSeqId,
                         out_counter         = OutCount,
                         index_state         = IndexState1 }) ->

    State1 = record_pending_ack(
               MsgStatus #msg_status {
                 is_delivered = true }, State),

    State2 = stats_pending_acks(MsgStatus, State1),

    {SeqId, maybe_update_rates(
              State2 #vqstate {next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId),
                               out_counter         = OutCount + 1,
                               index_state         = IndexState1})};

%% This function body has the same behaviour as remove_queue_entries/3
%% but instead of removing messages based on a ?QUEUE, this removes
%% just one message, the one referenced by the MsgStatus provided.
remove(false, MsgStatus = #msg_status{ seq_id = SeqId },
              State = #vqstate{ next_deliver_seq_id = NextDeliverSeqId,
                                out_counter = OutCount }) ->
    State1 = remove_from_disk(MsgStatus, State),

    State2 = stats_removed(MsgStatus, State1),

    {undefined, maybe_update_rates(
                  State2 #vqstate {next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId),
                                   out_counter         = OutCount + 1 })}.

remove_from_disk(#msg_status {
                seq_id        = SeqId,
                msg_id        = MsgId,
                is_persistent = IsPersistent,
                msg_location  = MsgLocation,
                index_on_disk = IndexOnDisk },
       State = #vqstate {index_state         = IndexState1,
                         store_state         = StoreState0,
                         msg_store_clients   = MSCState}) ->
    {DeletedSegments, IndexState2} =
        case IndexOnDisk of
            true  -> rabbit_classic_queue_index_v2:ack([SeqId], IndexState1);
            false -> {[], IndexState1}
        end,
    {StoreState1, State1} = case MsgLocation of
        ?IN_SHARED_STORE  ->
            case msg_store_remove(MSCState, IsPersistent, [{SeqId, MsgId}]) of
                {ok, []} ->
                    {StoreState0, State};
                {ok, [_]} ->
                    {StoreState0, record_confirms(sets:add_element(MsgId, sets:new([{version,2}])), State)}
            end;
        ?IN_QUEUE_STORE   -> {rabbit_classic_queue_store_v2:remove(SeqId, StoreState0), State};
        ?IN_QUEUE_INDEX   -> {StoreState0, State};
        ?IN_MEMORY        -> {StoreState0, State}
    end,
    StoreState = rabbit_classic_queue_store_v2:delete_segments(DeletedSegments, StoreState1),
    State1#vqstate{
        index_state = IndexState2,
        store_state = StoreState
    }.

%% This function exists as a way to improve dropwhile/2
%% performance. The idea of having this function is to optimise calls
%% to rabbit_queue_index by batching delivers and acks, instead of
%% sending them one by one.
%%
%% Instead of removing every message as their are popped from the
%% queue, it first accumulates them and then removes them by calling
%% remove_queue_entries/3, since the behaviour of
%% remove_queue_entries/3 when used with
%% process_delivers_and_acks_fun(deliver_and_ack) is the same as
%% calling remove(false, MsgStatus, State).
%%
%% remove/3 also updates the out_counter in every call, but here we do
%% it just once at the end.
%%
%% @todo This function is really bad. If there are 1 million messages
%%       expired, it will first collect the 1 million messages and then
%%       process them. It should probably limit the number of messages
%%       it removes at once and loop until satisfied instead. It could
%%       also let the index first figure out until what seq_id() to read
%%       (since the index has expiration encoded, it could use binary
%%       search to find where it should stop reading) and then in a second
%%       step do the reading with a limit for each read and drop only that.
%%
%% @todo We cannot just read the metadata to drop the messages because
%%       there are messages we are not going to remove. Those messages
%%       will later on be consumed and their content read; only with
%%       a less efficient operation. This results in a drop in performance
%%       for long queues.
remove_by_predicate(Pred, State = #vqstate {out_counter = OutCount}) ->
    {MsgProps, QAcc, State1} =
        collect_by_predicate(Pred, ?QUEUE:new(), State),
    State2 =
        remove_queue_entries(
          QAcc, process_delivers_and_acks_fun(deliver_and_ack), State1),
    %% maybe_update_rates/1 is called in remove/2 for every
    %% message. Since we update out_counter only once, we call it just
    %% there.
    {MsgProps, maybe_update_rates(
                 State2 #vqstate {
                   out_counter = OutCount + ?QUEUE:len(QAcc)})}.

%% This function exists as a way to improve fetchwhile/4
%% performance. The idea of having this function is to optimise calls
%% to rabbit_queue_index by batching delivers, instead of sending them
%% one by one.
%%
%% Fun is the function passed to fetchwhile/4 that's
%% applied to every fetched message and used to build the fetchwhile/4
%% result accumulator FetchAcc.
%%
%% @todo See todo in remove_by_predicate/2 function.
fetch_by_predicate(Pred, Fun, FetchAcc,
                   State = #vqstate { out_counter = OutCount}) ->
    {MsgProps, QAcc, State1} =
        collect_by_predicate(Pred, ?QUEUE:new(), State),

    {NextDeliverSeqId, FetchAcc1, State2} =
        process_queue_entries(QAcc, Fun, FetchAcc, State1),

    {MsgProps, FetchAcc1, maybe_update_rates(
                            State2 #vqstate {
                              next_deliver_seq_id = NextDeliverSeqId,
                              out_counter         = OutCount + ?QUEUE:len(QAcc)})}.

%% We try to do here the same as what remove(true, State) does but
%% processing several messages at the same time. The idea is to
%% optimize rabbit_queue_index:deliver/2 calls by sending a list of
%% SeqIds instead of one by one, thus process_queue_entries1 will
%% accumulate the required deliveries, will record_pending_ack for
%% each message, and will update stats, like remove/2 does.
%%
%% For the meaning of Fun and FetchAcc arguments see
%% fetch_by_predicate/4 above.
process_queue_entries(Q, Fun, FetchAcc, State = #vqstate{ next_deliver_seq_id = NextDeliverSeqId }) ->
    ?QUEUE:fold(fun (MsgStatus, Acc) ->
                        process_queue_entries1(MsgStatus, Fun, Acc)
                end,
                {NextDeliverSeqId, FetchAcc, State}, Q).

process_queue_entries1(
  #msg_status { seq_id = SeqId } = MsgStatus,
  Fun,
  {NextDeliverSeqId, FetchAcc, State}) ->
    {Msg, State1} = read_msg(MsgStatus, State),
    State2 = record_pending_ack(
               MsgStatus #msg_status {
                 is_delivered = true }, State1),
    {next_deliver_seq_id(SeqId, NextDeliverSeqId),
     Fun(Msg, SeqId, FetchAcc),
     stats_pending_acks(MsgStatus, State2)}.

collect_by_predicate(Pred, QAcc, State) ->
    case queue_out(State) of
        {empty, State1} ->
            {undefined, QAcc, State1};
        {{value, MsgStatus = #msg_status { msg_props = MsgProps }}, State1} ->
            case Pred(MsgProps) of
                true  -> collect_by_predicate(Pred, ?QUEUE:in(MsgStatus, QAcc),
                                              State1);
                false -> {MsgProps, QAcc, in_r(MsgStatus, State1)}
            end
    end.

%%----------------------------------------------------------------------------
%% Helpers for Public API purge/1 function
%%----------------------------------------------------------------------------

%% The difference between purge_when_pending_acks/1
%% vs. purge_and_index_reset/1 is that the first one issues a deliver
%% and an ack to the queue index for every message that's being
%% removed, while the later just resets the queue index state.
purge_when_pending_acks(State) ->
    State1 = purge1(process_delivers_and_acks_fun(deliver_and_ack), State),
    a(State1).

purge_and_index_reset(State) ->
    State1 = purge1(process_delivers_and_acks_fun(none), State),
    a(reset_qi_state(State1)).

%% This function removes messages from each of delta and q3.
%%
%% purge_betas_and_deltas/2 loads messages from the queue index,
%% filling up q3. The messages loaded into q3 are removed by calling
%% remove_queue_entries/3 until there are no more messages to be read
%% from the queue index. Messages are read in batches from the queue
%% index.
purge1(AfterFun, State) ->
    a(purge_betas_and_deltas(AfterFun, State)).

reset_qi_state(State = #vqstate{ index_state = IndexState0,
                                 store_state = StoreState0 }) ->
    StoreState = rabbit_classic_queue_store_v2:terminate(StoreState0),
    IndexState = rabbit_classic_queue_index_v2:reset_state(IndexState0),
    State#vqstate{ index_state = IndexState,
                   store_state = StoreState }.

is_pending_ack_empty(State) ->
    count_pending_acks(State) =:= 0.

is_unconfirmed_empty(#vqstate { unconfirmed = UC, unconfirmed_simple = UCS }) ->
    sets:is_empty(UC) andalso sets:is_empty(UCS).

count_pending_acks(#vqstate { ram_pending_ack   = RPA,
                              disk_pending_ack  = DPA }) ->
    map_size(RPA) + map_size(DPA).

%% @todo When doing maybe_deltas_to_betas stats are updated. Then stats
%%       are updated again in remove_queue_entries1. All unnecessary since
%%       we are purging anyway?
purge_betas_and_deltas(DelsAndAcksFun, State) ->
    %% We use the maximum memory limit when purging to get greater performance.
    MemoryLimit = 2048,
    State0 = #vqstate { q3 = Q3 } = maybe_deltas_to_betas(DelsAndAcksFun, State, MemoryLimit, metadata_only),

    case ?QUEUE:is_empty(Q3) of
        true  -> State0;
        false -> State1 = remove_queue_entries(Q3, DelsAndAcksFun, State0),
                 purge_betas_and_deltas(DelsAndAcksFun, State1#vqstate{q3 = ?QUEUE:new()})
    end.

remove_queue_entries(Q, DelsAndAcksFun,
                     State = #vqstate{next_deliver_seq_id = NextDeliverSeqId0}) ->
    {MsgIdsByStore, NextDeliverSeqId, Acks, State1} =
        ?QUEUE:fold(fun remove_queue_entries1/2,
                    {maps:new(), NextDeliverSeqId0, [], State}, Q),
    State2 = remove_vhost_msgs_by_id(MsgIdsByStore, State1),
    DelsAndAcksFun(NextDeliverSeqId, Acks, State2).

remove_queue_entries1(
  #msg_status { msg_id = MsgId, seq_id = SeqId,
                msg_location = MsgLocation, index_on_disk = IndexOnDisk,
                is_persistent = IsPersistent} = MsgStatus,
  {MsgIdsByStore, NextDeliverSeqId, Acks, State}) ->
    {case MsgLocation of
         ?IN_SHARED_STORE -> rabbit_misc:maps_cons(IsPersistent, {SeqId, MsgId}, MsgIdsByStore);
         _ -> MsgIdsByStore
     end,
     next_deliver_seq_id(SeqId, NextDeliverSeqId),
     cons_if(IndexOnDisk, SeqId, Acks),
     %% @todo Probably don't do this on a per-message basis...
     stats_removed(MsgStatus, State)}.

process_delivers_and_acks_fun(deliver_and_ack) ->
    %% @todo Make a clause for empty Acks list?
    fun (NextDeliverSeqId, Acks, State = #vqstate { index_state = IndexState,
                                                    store_state = StoreState0}) ->
            {DeletedSegments, IndexState1} = rabbit_classic_queue_index_v2:ack(Acks, IndexState),

            StoreState = rabbit_classic_queue_store_v2:delete_segments(DeletedSegments, StoreState0),

            State #vqstate { index_state         = IndexState1,
                             store_state         = StoreState,
                             %% We indiscriminately update because we already took care
                             %% of calling next_deliver_seq_id/2 in the functions that
                             %% end up calling this fun.
                             next_deliver_seq_id = NextDeliverSeqId }
    end;
process_delivers_and_acks_fun(_) ->
    fun (NextDeliverSeqId, _, State) ->
            State #vqstate { next_deliver_seq_id = NextDeliverSeqId }
    end.

%%----------------------------------------------------------------------------
%% Internal gubbins for publishing
%%----------------------------------------------------------------------------

publish1(Msg,
         MsgProps = #message_properties { needs_confirming = NeedsConfirming },
         IsDelivered, _ChPid, PersistFun,
         State = #vqstate { q3 = Q3, delta = Delta = #delta { count = DeltaCount },
                            len                 = Len,
                            qi_embed_msgs_below = IndexMaxSize,
                            next_seq_id         = SeqId,
                            next_deliver_seq_id = NextDeliverSeqId,
                            in_counter          = InCount,
                            durable             = IsDurable,
                            unconfirmed         = UC,
                            unconfirmed_simple  = UCS,
                            rates               = #rates{ out = OutRate }}) ->
    MsgId = mc:get_annotation(id, Msg),
    IsPersistent = mc:is_persistent(Msg),
    IsPersistent1 = IsDurable andalso IsPersistent,
    MsgStatus = msg_status(IsPersistent1, IsDelivered, SeqId, Msg, MsgProps, IndexMaxSize),
    %% We allow from 1 to 2048 messages in memory depending on the consume rate. The lower
    %% limit is at 1 because the queue process will need to access this message to know
    %% expiration information.
    MemoryLimit = min(1 + floor(2 * OutRate), 2048),
    State3 = case DeltaCount of
                 %% Len is the same as Q3Len when DeltaCount =:= 0.
                 0 when Len < MemoryLimit ->
                     {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State),
                     State2 = State1 #vqstate { q3 = ?QUEUE:in(m(MsgStatus1), Q3) },
                     stats_published_memory(MsgStatus1, State2);
                 _ ->
                     {MsgStatus1, State1} = PersistFun(true, true, MsgStatus, State),
                     Delta1 = expand_delta(SeqId, Delta, IsPersistent),
                     State2 = State1 #vqstate { delta = Delta1 },
                     stats_published_disk(MsgStatus1, State2)
             end,
    {UC1, UCS1} = maybe_needs_confirming(NeedsConfirming, persist_to(MsgStatus),
                                         MsgId, UC, UCS),
    State3#vqstate{ next_seq_id         = SeqId + 1,
                    next_deliver_seq_id = maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, IsDelivered),
                    in_counter          = InCount + 1,
                    unconfirmed         = UC1,
                    unconfirmed_simple  = UCS1 }.

%% Only attempt to increase the next_deliver_seq_id for delivered messages.
maybe_next_deliver_seq_id(SeqId, NextDeliverSeqId, true) ->
    next_deliver_seq_id(SeqId, NextDeliverSeqId);
maybe_next_deliver_seq_id(_, NextDeliverSeqId, false) ->
    NextDeliverSeqId.

publish_delivered1(Msg,
                   MsgProps = #message_properties {
                                 needs_confirming = NeedsConfirming },
                   _ChPid, PersistFun,
                   State = #vqstate { qi_embed_msgs_below = IndexMaxSize,
                                      next_seq_id         = SeqId,
                                      next_deliver_seq_id = NextDeliverSeqId,
                                      in_counter          = InCount,
                                      out_counter         = OutCount,
                                      durable             = IsDurable,
                                      unconfirmed         = UC,
                                      unconfirmed_simple  = UCS }) ->
    MsgId = mc:get_annotation(id, Msg),
    IsPersistent = mc:is_persistent(Msg),
    IsPersistent1 = IsDurable andalso IsPersistent,
    MsgStatus = msg_status(IsPersistent1, true, SeqId, Msg, MsgProps, IndexMaxSize),
    {MsgStatus1, State1} = PersistFun(false, false, MsgStatus, State),
    State2 = record_pending_ack(m(MsgStatus1), State1),
    {UC1, UCS1} = maybe_needs_confirming(NeedsConfirming, persist_to(MsgStatus),
                                         MsgId, UC, UCS),
    {SeqId,
     stats_published_pending_acks(MsgStatus1,
           State2#vqstate{ next_seq_id         = SeqId + 1,
                           next_deliver_seq_id = next_deliver_seq_id(SeqId, NextDeliverSeqId),
                           out_counter         = OutCount + 1,
                           in_counter          = InCount + 1,
                           unconfirmed         = UC1,
                           unconfirmed_simple  = UCS1 })}.

maybe_needs_confirming(false, _, _, UC, UCS) ->
    {UC, UCS};
%% When storing to the v2 queue store we take the simple confirms
%% path because we don't need to track index and store separately.
maybe_needs_confirming(true, queue_store, MsgId, UC, UCS) ->
    {UC, sets:add_element(MsgId, UCS)};
%% Otherwise we keep tracking as it used to be.
maybe_needs_confirming(true, _, MsgId, UC, UCS) ->
    {sets:add_element(MsgId, UC), UCS}.

maybe_write_msg_to_disk(Force, MsgStatus = #msg_status {
                                 seq_id = SeqId,
                                 msg = Msg, msg_id = MsgId,
                                 is_persistent = IsPersistent,
                                 msg_location = ?IN_MEMORY,
                                 msg_props = Props },
                        State = #vqstate{ store_state = StoreState0,
                                          msg_store_clients = MSCState,
                                          disk_write_count  = Count})
  when Force orelse IsPersistent ->
    case persist_to(MsgStatus) of
        msg_store   -> ok = msg_store_write(MSCState, IsPersistent, SeqId, MsgId,
                                            prepare_to_store(Msg)),
                       {MsgStatus#msg_status{msg_location = ?IN_SHARED_STORE},
                        State#vqstate{disk_write_count = Count + 1}};
        queue_store -> {MsgLocation, StoreState} = rabbit_classic_queue_store_v2:write(SeqId, prepare_to_store(Msg), Props, StoreState0),
                       {MsgStatus#msg_status{ msg_location = MsgLocation },
                        State#vqstate{ store_state = StoreState,
                                       disk_write_count = Count + 1}};
        queue_index -> {MsgStatus, State}
    end;
maybe_write_msg_to_disk(_Force, MsgStatus, State) ->
    {MsgStatus, State}.

%% Due to certain optimisations made inside
%% rabbit_queue_index:pre_publish/7 we need to have two separate
%% functions for index persistence. This one is only used when paging
%% during memory pressure. We didn't want to modify
%% maybe_write_index_to_disk/3 because that function is used in other
%% places.
maybe_batch_write_index_to_disk(_Force,
                                MsgStatus = #msg_status {
                                  index_on_disk = true }, State) ->
    {MsgStatus, State};
maybe_batch_write_index_to_disk(Force,
                                MsgStatus = #msg_status {
                                  msg           = Msg,
                                  msg_id        = MsgId,
                                  seq_id        = SeqId,
                                  is_persistent = IsPersistent,
                                  msg_location  = MsgLocation,
                                  msg_props     = MsgProps},
                                State = #vqstate {
                                           target_ram_count = TargetRamCount,
                                           disk_write_count = DiskWriteCount,
                                           index_state      = IndexState})
  when Force orelse IsPersistent ->
    {MsgOrId, DiskWriteCount1} =
        case persist_to(MsgStatus) of
            msg_store   -> {MsgId, DiskWriteCount};
            queue_store -> {MsgId, DiskWriteCount};
            queue_index -> {prepare_to_store(Msg), DiskWriteCount + 1}
        end,
    IndexState1 = rabbit_classic_queue_index_v2:pre_publish(
        MsgOrId, SeqId, MsgLocation, MsgProps,
        IsPersistent, TargetRamCount, IndexState),
    {MsgStatus#msg_status{index_on_disk = true},
     State#vqstate{index_state      = IndexState1,
                   disk_write_count = DiskWriteCount1}};
maybe_batch_write_index_to_disk(_Force, MsgStatus, State) ->
    {MsgStatus, State}.

maybe_write_index_to_disk(_Force, MsgStatus = #msg_status {
                                    index_on_disk = true }, State) ->
    {MsgStatus, State};
maybe_write_index_to_disk(Force, MsgStatus = #msg_status {
                                   msg           = Msg,
                                   msg_id        = MsgId,
                                   seq_id        = SeqId,
                                   is_persistent = IsPersistent,
                                   msg_location  = MsgLocation,
                                   msg_props     = MsgProps},
                          State = #vqstate{target_ram_count = TargetRamCount,
                                           disk_write_count = DiskWriteCount,
                                           index_state      = IndexState})
  when Force orelse IsPersistent ->
    {MsgOrId, DiskWriteCount1} =
        case persist_to(MsgStatus) of
            msg_store   -> {MsgId, DiskWriteCount};
            queue_store -> {MsgId, DiskWriteCount};
            queue_index -> {prepare_to_store(Msg), DiskWriteCount + 1}
        end,
    IndexState2 = rabbit_classic_queue_index_v2:publish(
                    MsgOrId, SeqId, MsgLocation, MsgProps, IsPersistent,
                    persist_to(MsgStatus) =:= msg_store, TargetRamCount,
                    IndexState),
    {MsgStatus#msg_status{index_on_disk = true},
     State#vqstate{index_state      = IndexState2,
                   disk_write_count = DiskWriteCount1}};

maybe_write_index_to_disk(_Force, MsgStatus, State) ->
    {MsgStatus, State}.

maybe_write_to_disk(ForceMsg, ForceIndex, MsgStatus, State) ->
    {MsgStatus1, State1} = maybe_write_msg_to_disk(ForceMsg, MsgStatus, State),
    maybe_write_index_to_disk(ForceIndex, MsgStatus1, State1).

maybe_prepare_write_to_disk(ForceMsg, ForceIndex0, MsgStatus, State) ->
    {MsgStatus1, State1} = maybe_write_msg_to_disk(ForceMsg, MsgStatus, State),
    %% We want messages written to the v2 per-queue store to also
    %% be written to the index for proper accounting. The situation
    %% where a message can be in the store but not in the index can
    %% only occur when going through this function (not via maybe_write_to_disk).
    ForceIndex = case persist_to(MsgStatus) of
        queue_store -> true;
        _ -> ForceIndex0
    end,
    maybe_batch_write_index_to_disk(ForceIndex, MsgStatus1, State1).

determine_persist_to(Msg,
                     #message_properties{size = BodySize},
                     IndexMaxSize) ->
    %% The >= is so that you can set the env to 0 and never persist
    %% to the index.
    %%
    %% We want this to be fast, so we avoid size(term_to_binary())
    %% here, or using the term size estimation from truncate.erl, both
    %% of which are too slow. So instead, if the message body size
    %% goes over the limit then we avoid any other checks.
    %%
    %% If it doesn't we need to decide if the properties will push
    %% it past the limit. If we have the encoded properties (usual
    %% case) we can just check their size. If we don't (message came
    %% via the direct client), we make a guess based on the number of
    %% headers.

    {MetaSize, _BodySize} = mc:size(Msg),
     case BodySize >= IndexMaxSize of
         true  -> msg_store;
         false ->
             Est = MetaSize + BodySize,
             case Est >= IndexMaxSize of
                 true  -> msg_store;
                 false -> queue_store
             end
     end.

persist_to(#msg_status{persist_to = To}) -> To.

prepare_to_store(Msg) ->
    mc:prepare(store, Msg).

%%----------------------------------------------------------------------------
%% Internal gubbins for acks
%%----------------------------------------------------------------------------

record_pending_ack(#msg_status { seq_id = SeqId } = MsgStatus,
                   State = #vqstate { ram_pending_ack  = RPA,
                                      disk_pending_ack = DPA,
                                      ack_in_counter   = AckInCount}) ->
    {RPA1, DPA1} =
        case msg_in_ram(MsgStatus) of
            false -> {RPA, maps:put(SeqId, MsgStatus, DPA)};
            _     -> {maps:put(SeqId, MsgStatus, RPA), DPA}
        end,
    State #vqstate { ram_pending_ack  = RPA1,
                     disk_pending_ack = DPA1,
                     ack_in_counter   = AckInCount + 1}.

lookup_pending_ack(SeqId, #vqstate { ram_pending_ack  = RPA,
                                     disk_pending_ack = DPA}) ->
    case maps:get(SeqId, RPA, none) of
        none -> maps:get(SeqId, DPA);
        V    -> V
    end.

%% First parameter = UpdateStats
%% @todo Do the stats updating outside of this function.
remove_pending_ack(true, SeqId, State) ->
    case remove_pending_ack(false, SeqId, State) of
        {none, _} ->
            {none, State};
        {MsgStatus, State1} ->
            {MsgStatus, stats_acked_pending(MsgStatus, State1)}
    end;
remove_pending_ack(false, SeqId, State = #vqstate{ram_pending_ack  = RPA,
                                                  disk_pending_ack = DPA}) ->
    case maps:get(SeqId, RPA, none) of
        none -> case maps:get(SeqId, DPA, none) of
                     none ->
                         {none, State};
                     V ->
                         DPA1 = maps:remove(SeqId, DPA),
                         {V, State#vqstate{disk_pending_ack = DPA1}}
                 end;
        V    -> RPA1 = maps:remove(SeqId, RPA),
                {V, State #vqstate { ram_pending_ack = RPA1 }}
    end.

purge_pending_ack(KeepPersistent,
                  State = #vqstate { index_state       = IndexState,
                                     store_state       = StoreState0 }) ->
    {IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, State1} = purge_pending_ack1(State),
    case KeepPersistent of
        true  -> remove_transient_msgs_by_id(MsgIdsByStore, State1);
        false -> {DeletedSegments, IndexState1} =
                     rabbit_classic_queue_index_v2:ack(IndexOnDiskSeqIds, IndexState),
                 StoreState1 = lists:foldl(fun rabbit_classic_queue_store_v2:remove/2, StoreState0, SeqIdsInStore),
                 StoreState = rabbit_classic_queue_store_v2:delete_segments(DeletedSegments, StoreState1),
                 State2 = remove_vhost_msgs_by_id(MsgIdsByStore, State1),
                 State2 #vqstate { index_state = IndexState1,
                                   store_state = StoreState }
    end.

purge_pending_ack_delete_and_terminate(
  State = #vqstate { index_state       = IndexState,
                     store_state       = StoreState }) ->
    {_, MsgIdsByStore, _SeqIdsInStore, State1} = purge_pending_ack1(State),
    StoreState1 = rabbit_classic_queue_store_v2:terminate(StoreState),
    IndexState1 = rabbit_classic_queue_index_v2:delete_and_terminate(IndexState),
    State2 = remove_vhost_msgs_by_id(MsgIdsByStore, State1),
    State2 #vqstate { index_state = IndexState1,
                      store_state = StoreState1 }.

purge_pending_ack1(State = #vqstate { ram_pending_ack   = RPA,
                                      disk_pending_ack  = DPA }) ->
    F = fun (_SeqId, MsgStatus, Acc) -> accumulate_ack(MsgStatus, Acc) end,
    {IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, _AllMsgIds} =
        maps:fold(F, maps:fold(F, accumulate_ack_init(), RPA), DPA),
    State1 = State #vqstate{ram_pending_ack  = #{},
                            disk_pending_ack = #{}},
    {IndexOnDiskSeqIds, MsgIdsByStore, SeqIdsInStore, State1}.

%% MsgIdsByStore is an map with two keys:
%%
%% true: holds a list of Persistent Message Ids.
%% false: holds a list of Transient Message Ids.
%%
%% The msg_store_remove/3 function takes this boolean flag to determine
%% from which store the messages should be removed from.
remove_vhost_msgs_by_id(MsgIdsByStore,
                        State = #vqstate{ msg_store_clients = MSCState }) ->
    maps:fold(fun(IsPersistent, MsgIds, StateF) ->
        case msg_store_remove(MSCState, IsPersistent, MsgIds) of
            {ok, []} ->
                StateF;
            {ok, ConfirmMsgIds} ->
                record_confirms(sets:from_list(ConfirmMsgIds, [{version, 2}]), StateF)
        end
    end, State, MsgIdsByStore).

remove_transient_msgs_by_id(MsgIdsByStore, State) ->
    remove_vhost_msgs_by_id(maps:with([false], MsgIdsByStore), State).

accumulate_ack_init() -> {[], maps:new(), [], []}.

accumulate_ack(#msg_status { seq_id        = SeqId,
                             msg_id        = MsgId,
                             is_persistent = IsPersistent,
                             msg_location  = MsgLocation,
                             index_on_disk = IndexOnDisk },
               {IndexOnDiskSeqIdsAcc, MsgIdsByStore, SeqIdsInStore, AllMsgIds}) ->
    {cons_if(IndexOnDisk, SeqId, IndexOnDiskSeqIdsAcc),
     case MsgLocation of
         ?IN_SHARED_STORE -> rabbit_misc:maps_cons(IsPersistent, {SeqId, MsgId}, MsgIdsByStore);
         _                -> MsgIdsByStore
     end,
     case MsgLocation of
         ?IN_QUEUE_STORE -> [SeqId|SeqIdsInStore];
         ?IN_QUEUE_INDEX -> [SeqId|SeqIdsInStore];
         _               -> SeqIdsInStore
     end,
     [MsgId | AllMsgIds]}.

%%----------------------------------------------------------------------------
%% Internal plumbing for confirms (aka publisher acks)
%%----------------------------------------------------------------------------

record_confirms(MsgIdSet, State = #vqstate { msgs_on_disk        = MOD,
                                             msg_indices_on_disk = MIOD,
                                             unconfirmed         = UC,
                                             confirmed           = C }) ->
    State #vqstate {
      msgs_on_disk        = sets_subtract(MOD,  MsgIdSet),
      msg_indices_on_disk = sets_subtract(MIOD, MsgIdSet),
      unconfirmed         = sets_subtract(UC,   MsgIdSet),
      confirmed           = sets:union(C, MsgIdSet) }.

%% Function defined in both rabbit_msg_store and rabbit_variable_queue.
sets_subtract(Set1, Set2) ->
    case sets:size(Set2) of
        1 -> sets:del_element(hd(sets:to_list(Set2)), Set1);
        _ -> sets:subtract(Set1, Set2)
    end.

msgs_written_to_disk(Callback, MsgIdSet, ignored) ->
    %% The message was already acked so it doesn't matter if it was never written
    %% to the index, we can process the confirm.
    Callback(?MODULE,
             fun (?MODULE, State) -> record_confirms(MsgIdSet, State) end);
msgs_written_to_disk(Callback, MsgIdSet, written) ->
    Callback(?MODULE,
             fun (?MODULE, State = #vqstate { msgs_on_disk        = MOD,
                                              msg_indices_on_disk = MIOD,
                                              unconfirmed         = UC }) ->
                     %% @todo Apparently the message store ALWAYS calls this function
                     %%       for all message IDs. This is a waste. We should only
                     %%       call it for messages that need confirming, and avoid
                     %%       this intersection call.
                     %%
                     %%       The same may apply to msg_indices_written_to_disk as well.
                     Confirmed = sets:intersection(UC, MsgIdSet),
                     record_confirms(sets:intersection(MsgIdSet, MIOD),
                                     State #vqstate {
                                       msgs_on_disk =
                                           sets:union(MOD, Confirmed) })
             end).

msg_indices_written_to_disk(Callback, MsgIdSet) ->
    Callback(?MODULE,
             fun (?MODULE, State = #vqstate { msgs_on_disk        = MOD,
                                              msg_indices_on_disk = MIOD,
                                              unconfirmed         = UC }) ->
                     Confirmed = sets:intersection(UC, MsgIdSet),
                     record_confirms(sets:intersection(MsgIdSet, MOD),
                                     State #vqstate {
                                       msg_indices_on_disk =
                                           sets:union(MIOD, Confirmed) })
             end).

%% @todo Having to call run_backing_queue is probably reducing performance...
msgs_and_indices_written_to_disk(Callback, MsgIdSet) ->
    Callback(?MODULE,
             fun (?MODULE, State) -> record_confirms(MsgIdSet, State) end).

%%----------------------------------------------------------------------------
%% Internal plumbing for requeue
%%----------------------------------------------------------------------------

%% Rebuild queue, inserting sequence ids to maintain ordering
requeue_merge(SeqIds, Q, MsgIds, Limit, State) ->
    requeue_merge(SeqIds, Q, ?QUEUE:new(), MsgIds,
                Limit, State).

requeue_merge([SeqId | Rest] = SeqIds, Q, Front, MsgIds,
            Limit, State)
  when Limit == undefined orelse SeqId < Limit ->
    case ?QUEUE:out(Q) of
        {{value, #msg_status { seq_id = SeqIdQ } = MsgStatus}, Q1}
          when SeqIdQ < SeqId ->
            %% enqueue from the remaining queue
            requeue_merge(SeqIds, Q1, ?QUEUE:in(MsgStatus, Front), MsgIds,
                        Limit, State);
        {_, _Q1} ->
            %% enqueue from the remaining list of sequence ids
            case msg_from_pending_ack(SeqId, State) of
                {none, _} ->
                    requeue_merge(Rest, Q, Front, MsgIds, Limit, State);
                {#msg_status { msg_id = MsgId } = MsgStatus, State1} ->
                    State2 = stats_requeued_memory(MsgStatus, State1),
                    requeue_merge(Rest, Q, ?QUEUE:in(MsgStatus, Front), [MsgId | MsgIds],
                                Limit, State2)
            end
    end;
requeue_merge(SeqIds, Q, Front, MsgIds,
            _Limit, State) ->
    {SeqIds, ?QUEUE:join(Front, Q), MsgIds, State}.

delta_merge([], Delta, MsgIds, State) ->
    {Delta, MsgIds, State};
delta_merge(SeqIds, Delta, MsgIds, State) ->
    lists:foldl(fun (SeqId, {Delta0, MsgIds0, State0} = Acc) ->
                        case msg_from_pending_ack(SeqId, State0) of
                            {none, _} ->
                                Acc;
                        {#msg_status { msg_id = MsgId,
                                       is_persistent = IsPersistent } = MsgStatus, State1} ->
                                {_MsgStatus, State2} =
                                    maybe_prepare_write_to_disk(true, true, MsgStatus, State1),
                                {expand_delta(SeqId, Delta0, IsPersistent), [MsgId | MsgIds0],
                                 stats_requeued_disk(MsgStatus, State2)}
                        end
                end, {Delta, MsgIds, State}, SeqIds).

%% Mostly opposite of record_pending_ack/2
msg_from_pending_ack(SeqId, State) ->
    case remove_pending_ack(false, SeqId, State) of
        {none, _} ->
            {none, State};
        {#msg_status { msg_props = MsgProps } = MsgStatus, State1} ->
            {MsgStatus #msg_status {
               msg_props = MsgProps #message_properties { needs_confirming = false } },
             State1}
    end.

delta_limit(?BLANK_DELTA_PATTERN(_))              -> undefined;
delta_limit(#delta { start_seq_id = StartSeqId }) -> StartSeqId.

%%----------------------------------------------------------------------------
%% Iterator
%%----------------------------------------------------------------------------

ram_ack_iterator(State) ->
    {ack, maps:iterator(State#vqstate.ram_pending_ack)}.

disk_ack_iterator(State) ->
    {ack, maps:iterator(State#vqstate.disk_pending_ack)}.

msg_iterator(State) -> istate(start, State).

istate(start, State) -> {q3,    State#vqstate.q3,    State};
istate(q3,    State) -> {delta, State#vqstate.delta, State};
istate(delta, _State) -> done.

next({ack, It}, IndexState) ->
    case maps:next(It) of
        none                     -> {empty, IndexState};
        {_SeqId, MsgStatus, It1} -> Next = {ack, It1},
                                    {value, MsgStatus, true, Next, IndexState}
    end;
next(done, IndexState) -> {empty, IndexState};
next({delta, #delta{start_seq_id = SeqId,
                    end_seq_id   = SeqId}, State}, IndexState) ->
    next(istate(delta, State), IndexState);
next({delta, #delta{start_seq_id = SeqId,
                    end_seq_id   = SeqIdEnd} = Delta, State}, IndexState) ->
    SeqIdB = rabbit_classic_queue_index_v2:next_segment_boundary(SeqId),
    %% It may make sense to limit this based on rate. But this
    %% is not called outside of CMQs so I will leave it alone
    %% for the time being.
    SeqId1 = lists:min([SeqIdB,
                        %% We must limit the number of messages read at once
                        %% otherwise the queue will attempt to read up to segment_entry_count()
                        %% messages from the index each time. The value
                        %% chosen here is arbitrary.
                        SeqId + 2048,
                        SeqIdEnd]),
    {List, IndexState1} = rabbit_classic_queue_index_v2:read(SeqId, SeqId1, IndexState),
    next({delta, Delta#delta{start_seq_id = SeqId1}, List, State}, IndexState1);
next({delta, Delta, [], State}, IndexState) ->
    next({delta, Delta, State}, IndexState);
next({delta, Delta, [{_, SeqId, _, _, _} = M | Rest], State}, IndexState) ->
    case is_msg_in_pending_acks(SeqId, State) of
        false -> Next = {delta, Delta, Rest, State},
                 {value, beta_msg_status(M), false, Next, IndexState};
        true  -> next({delta, Delta, Rest, State}, IndexState)
    end;
next({Key, Q, State}, IndexState) ->
    case ?QUEUE:out(Q) of
        {empty, _Q}              -> next(istate(Key, State), IndexState);
        {{value, MsgStatus}, QN} -> Next = {Key, QN, State},
                                    {value, MsgStatus, false, Next, IndexState}
    end.

inext(It, {Its, IndexState}) ->
    case next(It, IndexState) of
        {empty, IndexState1} ->
            {Its, IndexState1};
        {value, MsgStatus1, Unacked, It1, IndexState1} ->
            {[{MsgStatus1, Unacked, It1} | Its], IndexState1}
    end.

ifold(_Fun, Acc, [], State0) ->
    {Acc, State0};
ifold(Fun, Acc, Its0, State0) ->
    [{MsgStatus, Unacked, It} | Rest] =
        lists:sort(fun ({#msg_status{seq_id = SeqId1}, _, _},
                        {#msg_status{seq_id = SeqId2}, _, _}) ->
                           SeqId1 =< SeqId2
                   end, Its0),
    {Msg, State1} = read_msg(MsgStatus, State0),
    case Fun(Msg, MsgStatus#msg_status.msg_props, Unacked, Acc) of
        {stop, Acc1} ->
            {Acc1, State1};
        {cont, Acc1} ->
            IndexState0 = State1#vqstate.index_state,
            {Its1, IndexState1} = inext(It, {Rest, IndexState0}),
            State2 = State1#vqstate{index_state = IndexState1},
            ifold(Fun, Acc1, Its1, State2)
    end.

%%----------------------------------------------------------------------------
%% Phase changes
%%----------------------------------------------------------------------------

fetch_from_q3(State = #vqstate { delta = #delta { count = DeltaCount },
                                 q3    = Q3 }) ->
    case ?QUEUE:out(Q3) of
        {empty, _Q3} when DeltaCount =:= 0 ->
            {empty, State};
        {empty, _Q3} ->
            fetch_from_q3(maybe_deltas_to_betas(State));
        {{value, MsgStatus}, Q3a} ->
            State1 = State #vqstate { q3 = Q3a },
            {loaded, {MsgStatus, State1}}
    end.

%% Thresholds for doing multi-read against the shared message
%% stores. The values have been obtained through numerous
%% experiments. Be careful when editing these values: after a
%% certain size the performance drops and it becomes no longer
%% interesting to keep the extra data in memory.
-define(SHARED_READ_MANY_SIZE_THRESHOLD, 12000).
-define(SHARED_READ_MANY_COUNT_THRESHOLD, 10).

maybe_deltas_to_betas(State = #vqstate { rates = #rates{ out = OutRate }}) ->
    AfterFun = process_delivers_and_acks_fun(deliver_and_ack),
    %% We allow from 1 to 2048 messages in memory depending on the consume rate.
    MemoryLimit = min(1 + floor(2 * OutRate), 2048),
    maybe_deltas_to_betas(AfterFun, State, MemoryLimit, messages).

maybe_deltas_to_betas(_DelsAndAcksFun,
                      State = #vqstate {delta = ?BLANK_DELTA_PATTERN(X) },
                      _MemoryLimit, _WhatToRead) ->
    State;
maybe_deltas_to_betas(DelsAndAcksFun,
                      State = #vqstate {
                        delta                = Delta,
                        q3                   = Q3,
                        index_state          = IndexState,
                        store_state          = StoreState,
                        msg_store_clients    = {MCStateP, MCStateT},
                        ram_msg_count        = RamMsgCount,
                        ram_bytes            = RamBytes,
                        disk_read_count      = DiskReadCount,
                        delta_transient_bytes = DeltaTransientBytes,
                        transient_threshold  = TransientThreshold },
                      MemoryLimit, WhatToRead) ->
    #delta { start_seq_id = DeltaSeqId,
             count        = DeltaCount,
             transient    = Transient,
             end_seq_id   = DeltaSeqIdEnd } = Delta,
    %% For v2 we want to limit the number of messages read at once to lower
    %% the memory footprint. We use the consume rate to determine how many
    %% messages we read.
    DeltaSeqLimit = DeltaSeqId + MemoryLimit,
    DeltaSeqId1 =
        lists:min([rabbit_classic_queue_index_v2:next_segment_boundary(DeltaSeqId),
                   DeltaSeqLimit, DeltaSeqIdEnd]),
    {List0, IndexState1} = rabbit_classic_queue_index_v2:read(DeltaSeqId, DeltaSeqId1, IndexState),
    {List, StoreState3, MCStateP3, MCStateT3} = case WhatToRead of
        messages ->
            %% We try to read messages from disk all at once instead of
            %% 1 by 1 at fetch time. When v1 is used and messages are
            %% embedded, then the message content is already read from
            %% disk at this point. For v2 embedded we must do a separate
            %% call to obtain the contents and then merge the contents
            %% back into the #msg_status records.
            %%
            %% For shared message store messages we do the same but only
            %% for messages < ?SHARED_READ_MANY_SIZE_THRESHOLD bytes and
            %% when there are at least ?SHARED_READ_MANY_COUNT_THRESHOLD
            %% messages to fetch from that store. Other messages will be
            %% fetched one by one right before sending the messages.
            %%
            %% Since we have two different shared stores for persistent
            %% and transient messages they are treated separately when
            %% deciding whether to read_many from either of them.
            %%
            %% Because v2 and shared stores function differently we
            %% must keep different information for performing the reads.
            {V2Reads0, ShPersistReads, ShTransientReads} = lists:foldl(fun
                ({_, SeqId, MsgLocation, _, _}, {V2ReadsAcc, ShPReadsAcc, ShTReadsAcc}) when is_tuple(MsgLocation) ->
                    {[{SeqId, MsgLocation}|V2ReadsAcc], ShPReadsAcc, ShTReadsAcc};
                ({MsgId, _, rabbit_msg_store, #message_properties{size = Size}, true},
                 {V2ReadsAcc, ShPReadsAcc, ShTReadsAcc}) when Size =< ?SHARED_READ_MANY_SIZE_THRESHOLD ->
                    {V2ReadsAcc, [MsgId|ShPReadsAcc], ShTReadsAcc};
                ({MsgId, _, rabbit_msg_store, #message_properties{size = Size}, false},
                 {V2ReadsAcc, ShPReadsAcc, ShTReadsAcc}) when Size =< ?SHARED_READ_MANY_SIZE_THRESHOLD ->
                    {V2ReadsAcc, ShPReadsAcc, [MsgId|ShTReadsAcc]};
                (_, Acc) ->
                    Acc
            end, {[], [], []}, List0),
            %% In order to properly read and merge V2 messages we want them
            %% in the older->younger order.
            V2Reads = lists:reverse(V2Reads0),
            %% We do read_many for v2 store unconditionally.
            {V2Msgs, StoreState2} = rabbit_classic_queue_store_v2:read_many(V2Reads, StoreState),
            List1 = merge_read_msgs(List0, V2Reads, V2Msgs),
            %% We read from the shared message store only if there are multiple messages
            %% (10+ as we wouldn't get much benefits from smaller number of messages)
            %% otherwise we wait and read later.
            %%
            %% Because read_many does not use FHC we do not get an updated MCState
            %% like with normal reads.
            {List2, MCStateP2} = case length(ShPersistReads) < ?SHARED_READ_MANY_COUNT_THRESHOLD of
                true ->
                    {List1, MCStateP};
                false ->
                    {ShPersistMsgs, MCStateP1} = rabbit_msg_store:read_many(ShPersistReads, MCStateP),
                    case map_size(ShPersistMsgs) of
                        0 -> {List1, MCStateP1};
                        _ -> {merge_sh_read_msgs(List1, ShPersistMsgs), MCStateP1}
                    end
            end,
            {List3, MCStateT2} = case length(ShTransientReads) < ?SHARED_READ_MANY_COUNT_THRESHOLD of
                true ->
                    {List2, MCStateT};
                false ->
                    {ShTransientMsgs, MCStateT1} = rabbit_msg_store:read_many(ShTransientReads, MCStateT),
                    case map_size(ShTransientMsgs) of
                        0 -> {List2, MCStateT1};
                        _ -> {merge_sh_read_msgs(List2, ShTransientMsgs), MCStateT1}
                    end
            end,
            {List3, StoreState2, MCStateP2, MCStateT2};
        metadata_only ->
            {List0, StoreState, MCStateP, MCStateT}
    end,
    {Q3a, RamCountsInc, RamBytesInc, State1, TransientCount, TransientBytes} =
        betas_from_index_entries(List, TransientThreshold,
                                 DelsAndAcksFun,
                                 State #vqstate { index_state = IndexState1,
                                                  store_state = StoreState3,
                                                  msg_store_clients = {MCStateP3, MCStateT3}}),
    State2 = State1 #vqstate { ram_msg_count     = RamMsgCount   + RamCountsInc,
                               ram_bytes         = RamBytes      + RamBytesInc,
                               disk_read_count   = DiskReadCount + RamCountsInc },
    case ?QUEUE:len(Q3a) of
        0 ->
            %% we ignored every message in the segment due to it being
            %% transient and below the threshold
            maybe_deltas_to_betas(
              DelsAndAcksFun,
              State2 #vqstate {
                delta = d(Delta #delta { start_seq_id = DeltaSeqId1 })},
              MemoryLimit, WhatToRead);
        Q3aLen ->
            Q3b = ?QUEUE:join(Q3, Q3a),
            case DeltaCount - Q3aLen of
                0 ->
                    %% delta is now empty
                    State2 #vqstate { delta = ?BLANK_DELTA,
                                      q3    = Q3b,
                                      delta_transient_bytes = 0};
                N when N > 0 ->
                    Delta1 = d(#delta { start_seq_id = DeltaSeqId1,
                                        count        = N,
                                        %% @todo Probably something wrong, seen it become negative...
                                        transient    = Transient - TransientCount,
                                        end_seq_id   = DeltaSeqIdEnd }),
                    State2 #vqstate { delta = Delta1,
                                      q3    = Q3b,
                                      delta_transient_bytes = DeltaTransientBytes - TransientBytes }
            end
    end.

merge_read_msgs([M = {_, SeqId, _, _, _}|MTail], [{SeqId, _}|RTail], [Msg|MsgTail]) ->
    [setelement(1, M, Msg)|merge_read_msgs(MTail, RTail, MsgTail)];
merge_read_msgs([M|MTail], RTail, MsgTail) ->
    [M|merge_read_msgs(MTail, RTail, MsgTail)];
%% @todo We probably don't need to unwrap until the end.
merge_read_msgs([], [], []) ->
    [].

%% We may not get as many messages as we tried reading.
merge_sh_read_msgs([M = {MsgId, _, _, _, _}|MTail], Reads) ->
    case Reads of
        #{MsgId := Msg} ->
            [setelement(1, M, Msg)|merge_sh_read_msgs(MTail, Reads)];
        _ ->
            [M|merge_sh_read_msgs(MTail, Reads)]
    end;
merge_sh_read_msgs(MTail, _Reads) ->
    MTail.

%% Flushes queue index batch caches and updates queue index state.
ui(#vqstate{index_state      = IndexState,
            target_ram_count = TargetRamCount} = State) ->
    IndexState1 = rabbit_classic_queue_index_v2:flush_pre_publish_cache(
                    TargetRamCount, IndexState),
    State#vqstate{index_state = IndexState1}.

maybe_client_terminate(MSCStateP) ->
    %% Queue might have been asked to stop by the supervisor, it needs a clean
    %% shutdown in order for the supervising strategy to work - if it reaches max
    %% restarts might bring the vhost down.
    try
        rabbit_msg_store:client_terminate(MSCStateP)
    catch
        _:_ ->
            ok
    end.