Rdb channel replication (#13732)
This PR is based on:
https://github.com/redis/redis/pull/12109
https://github.com/valkey-io/valkey/pull/60
Closes: https://github.com/redis/redis/issues/11678
**Motivation**
During a full sync, when master is delivering RDB to the replica,
incoming write commands are kept in a replication buffer in order to be
sent to the replica once RDB delivery is completed. If RDB delivery
takes a long time, it might create memory pressure on master. Also, once
a replica connection accumulates replication data which is larger than
output buffer limits, master will kill replica connection. This may
cause a replication failure.
The main benefit of the rdb channel replication is streaming incoming
commands in parallel to the RDB delivery. This approach shifts
replication stream buffering to the replica and reduces load on master.
We do this by opening another connection for RDB delivery. The main
channel on replica will be receiving replication stream while rdb
channel is receiving the RDB.
This feature also helps to reduce master's main process CPU load. By
opening a dedicated connection for the RDB transfer, the bgsave process
has access to the new connection and it will stream RDB directly to the
replicas. Before this change, due to TLS connection restriction, the
bgsave process was writing RDB bytes to a pipe and the main process was
forwarding
it to the replica. This is no longer necessary, the main process can
avoid these expensive socket read/write syscalls. It also means RDB
delivery to replica will be faster as it avoids this step.
In summary, replication will be faster and master's performance during
full syncs will improve.
**Implementation steps**
1. When replica connects to the master, it sends 'rdb-channel-repl' as
part of capability exchange to let master to know replica supports rdb
channel.
2. When replica lacks sufficient data for PSYNC, master sends
+RDBCHANNELSYNC reply with replica's client id. As the next step, the
replica opens a new connection (rdb-channel) and configures it against
the master with the appropriate capabilities and requirements. It also
sends given client id back to master over rdbchannel, so that master can
associate these channels. (initial replica connection will be referred
as main-channel) Then, replica requests fullsync using the RDB channel.
3. Prior to forking, master attaches the replica's main channel to the
replication backlog to deliver replication stream starting at the
snapshot end offset.
4. The master main process sends replication stream via the main
channel, while the bgsave process sends the RDB directly to the replica
via the rdb-channel. Replica accumulates replication stream in a local
buffer, while the RDB is being loaded into the memory.
5. Once the replica completes loading the rdb, it drops the rdb channel
and streams the accumulated replication stream into the db. Sync is
completed.
**Some details**
- Currently, rdbchannel replication is supported only if
`repl-diskless-sync` is enabled on master. Otherwise, replication will
happen over a single connection as in before.
- On replica, there is a limit to replication stream buffering. Replica
uses a new config `replica-full-sync-buffer-limit` to limit number of
bytes to accumulate. If it is not set, replica inherits
`client-output-buffer-limit <replica>` hard limit config. If we reach
this limit, replica stops accumulating. This is not a failure scenario
though. Further accumulation will happen on master side. Depending on
the configured limits on master, master may kill the replica connection.
**API changes in INFO output:**
1. New replica state: `send_bulk_and_stream`. Indicates full sync is
still in progress for this replica. It is receiving replication stream
and rdb in parallel.
```
slave0:ip=127.0.0.1,port=5002,state=send_bulk_and_stream,offset=0,lag=0
```
Replica state changes in steps:
- First, replica sends psync and receives +RDBCHANNELSYNC
:`state=wait_bgsave`
- After replica connects with rdbchannel and delivery starts:
`state=send_bulk_and_stream`
- After full sync: `state=online`
2. On replica side, replication stream buffering metrics:
- replica_full_sync_buffer_size: Currently accumulated replication
stream data in bytes.
- replica_full_sync_buffer_peak: Peak number of bytes that this instance
accumulated in the lifetime of the process.
```
replica_full_sync_buffer_size:20485
replica_full_sync_buffer_peak:1048560
```
**API changes in CLIENT LIST**
In `client list` output, rdbchannel clients will have 'C' flag in
addition to 'S' replica flag:
```
id=11 addr=127.0.0.1:39108 laddr=127.0.0.1:5001 fd=14 name= age=5 idle=5 flags=SC db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=0 multi-mem=0 rbs=1024 rbp=0 obl=0 oll=0 omem=0 tot-mem=1920 events=r cmd=psync user=default redir=-1 resp=2 lib-name= lib-ver= io-thread=0
```
**Config changes:**
- `replica-full-sync-buffer-limit`: Controls how much replication data
replica can accumulate during rdbchannel replication. If it is not set,
a value of 0 means replica will inherit `client-output-buffer-limit
<replica>` hard limit config to limit accumulated data.
- `repl-rdb-channel` config is added as a hidden config. This is mostly
for testing as we need to support both rdbchannel replication and the
older single connection replication (to keep compatibility with older
versions and rdbchannel replication will not be enabled if
repl-diskless-sync is not enabled). it affects both the master (not to
respond to rdb channel requests), and the replica (not to declare
capability)
**Internal API changes:**
Changes that were introduced to Redis replication:
- New replication capability is added to replconf command: `capa
rdb-channel-repl`. Indicates replica is capable of rdb channel
replication. Replica sends it when it connects to master along with
other capabilities.
- If replica needs fullsync, master replies `+RDBCHANNELSYNC
<client-id>` to the replica's PSYNC request.
- When replica opens rdbchannel connection, as part of replconf command,
it sends `rdb-channel 1` to let master know this is rdb channel. Also,
it sends `main-ch-client-id <client-id>` as part of replconf command so
master can associate channels.
**Testing:**
As rdbchannel replication is enabled by default, we run whole test suite
with it. Though, as we need to support both rdbchannel and single
connection replication, we'll be running some tests twice with
`repl-rdb-channel yes/no` config.
**Replica state diagram**
```
* * Replica state machine *
*
* Main channel state
* ┌───────────────────┐
* │RECEIVE_PING_REPLY │
* └────────┬──────────┘
* │ +PONG
* ┌────────▼──────────┐
* │SEND_HANDSHAKE │ RDB channel state
* └────────┬──────────┘ ┌───────────────────────────────┐
* │+OK ┌───► RDB_CH_SEND_HANDSHAKE │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_AUTH_REPLY │ │ REPLCONF main-ch-client-id <clientid>
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_AUTH_REPLY │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_PORT_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_REPLCONF_REPLY│
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_IP_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_FULLRESYNC │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_CAPA_REPLY │ │ │+FULLRESYNC
* └────────┬──────────┘ │ │Rdb delivery
* │ │ ┌──────────────▼────────────────┐
* ┌────────▼──────────┐ │ │ RDB_CH_RDB_LOADING │
* │SEND_PSYNC │ │ └──────────────┬────────────────┘
* └─┬─────────────────┘ │ │ Done loading
* │PSYNC (use cached-master) │ │
* ┌─▼─────────────────┐ │ │
* │RECEIVE_PSYNC_REPLY│ │ ┌────────────►│ Replica streams replication
* └─┬─────────────────┘ │ │ │ buffer into memory
* │ │ │ │
* │+RDBCHANNELSYNC client-id │ │ │
* ├──────┬───────────────────┘ │ │
* │ │ Main channel │ │
* │ │ accumulates repl data │ │
* │ ┌──▼────────────────┐ │ ┌───────▼───────────┐
* │ │ REPL_TRANSFER ├───────┘ │ CONNECTED │
* │ └───────────────────┘ └────▲───▲──────────┘
* │ │ │
* │ │ │
* │ +FULLRESYNC ┌───────────────────┐ │ │
* ├────────────────► REPL_TRANSFER ├────┘ │
* │ └───────────────────┘ │
* │ +CONTINUE │
* └──────────────────────────────────────────────┘
*/
```
-----
This PR also contains changes and ideas from:
https://github.com/valkey-io/valkey/pull/837
https://github.com/valkey-io/valkey/pull/1173
https://github.com/valkey-io/valkey/pull/804
https://github.com/valkey-io/valkey/pull/945
https://github.com/valkey-io/valkey/pull/989
---------
Co-authored-by: Yuan Wang <wangyuancode@163.com>
Co-authored-by: debing.sun <debing.sun@redis.com>
Co-authored-by: Moti Cohen <moticless@gmail.com>
Co-authored-by: naglera <anagler123@gmail.com>
Co-authored-by: Amit Nagler <58042354+naglera@users.noreply.github.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Ping Xie <pingxie@outlook.com>
Co-authored-by: Ran Shidlansik <ranshid@amazon.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: xbasel <103044017+xbasel@users.noreply.github.com>
2025-01-13 20:09:52 +08:00
|
|
|
#
|
|
|
|
# Copyright (c) 2009-Present, Redis Ltd.
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
|
|
|
# Copyright (c) 2024-present, Valkey contributors.
|
|
|
|
# All rights reserved.
|
|
|
|
#
|
2025-05-01 21:04:22 +08:00
|
|
|
# Licensed under your choice of (a) the Redis Source Available License 2.0
|
|
|
|
# (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
|
|
|
|
# GNU Affero General Public License v3 (AGPLv3).
|
Rdb channel replication (#13732)
This PR is based on:
https://github.com/redis/redis/pull/12109
https://github.com/valkey-io/valkey/pull/60
Closes: https://github.com/redis/redis/issues/11678
**Motivation**
During a full sync, when master is delivering RDB to the replica,
incoming write commands are kept in a replication buffer in order to be
sent to the replica once RDB delivery is completed. If RDB delivery
takes a long time, it might create memory pressure on master. Also, once
a replica connection accumulates replication data which is larger than
output buffer limits, master will kill replica connection. This may
cause a replication failure.
The main benefit of the rdb channel replication is streaming incoming
commands in parallel to the RDB delivery. This approach shifts
replication stream buffering to the replica and reduces load on master.
We do this by opening another connection for RDB delivery. The main
channel on replica will be receiving replication stream while rdb
channel is receiving the RDB.
This feature also helps to reduce master's main process CPU load. By
opening a dedicated connection for the RDB transfer, the bgsave process
has access to the new connection and it will stream RDB directly to the
replicas. Before this change, due to TLS connection restriction, the
bgsave process was writing RDB bytes to a pipe and the main process was
forwarding
it to the replica. This is no longer necessary, the main process can
avoid these expensive socket read/write syscalls. It also means RDB
delivery to replica will be faster as it avoids this step.
In summary, replication will be faster and master's performance during
full syncs will improve.
**Implementation steps**
1. When replica connects to the master, it sends 'rdb-channel-repl' as
part of capability exchange to let master to know replica supports rdb
channel.
2. When replica lacks sufficient data for PSYNC, master sends
+RDBCHANNELSYNC reply with replica's client id. As the next step, the
replica opens a new connection (rdb-channel) and configures it against
the master with the appropriate capabilities and requirements. It also
sends given client id back to master over rdbchannel, so that master can
associate these channels. (initial replica connection will be referred
as main-channel) Then, replica requests fullsync using the RDB channel.
3. Prior to forking, master attaches the replica's main channel to the
replication backlog to deliver replication stream starting at the
snapshot end offset.
4. The master main process sends replication stream via the main
channel, while the bgsave process sends the RDB directly to the replica
via the rdb-channel. Replica accumulates replication stream in a local
buffer, while the RDB is being loaded into the memory.
5. Once the replica completes loading the rdb, it drops the rdb channel
and streams the accumulated replication stream into the db. Sync is
completed.
**Some details**
- Currently, rdbchannel replication is supported only if
`repl-diskless-sync` is enabled on master. Otherwise, replication will
happen over a single connection as in before.
- On replica, there is a limit to replication stream buffering. Replica
uses a new config `replica-full-sync-buffer-limit` to limit number of
bytes to accumulate. If it is not set, replica inherits
`client-output-buffer-limit <replica>` hard limit config. If we reach
this limit, replica stops accumulating. This is not a failure scenario
though. Further accumulation will happen on master side. Depending on
the configured limits on master, master may kill the replica connection.
**API changes in INFO output:**
1. New replica state: `send_bulk_and_stream`. Indicates full sync is
still in progress for this replica. It is receiving replication stream
and rdb in parallel.
```
slave0:ip=127.0.0.1,port=5002,state=send_bulk_and_stream,offset=0,lag=0
```
Replica state changes in steps:
- First, replica sends psync and receives +RDBCHANNELSYNC
:`state=wait_bgsave`
- After replica connects with rdbchannel and delivery starts:
`state=send_bulk_and_stream`
- After full sync: `state=online`
2. On replica side, replication stream buffering metrics:
- replica_full_sync_buffer_size: Currently accumulated replication
stream data in bytes.
- replica_full_sync_buffer_peak: Peak number of bytes that this instance
accumulated in the lifetime of the process.
```
replica_full_sync_buffer_size:20485
replica_full_sync_buffer_peak:1048560
```
**API changes in CLIENT LIST**
In `client list` output, rdbchannel clients will have 'C' flag in
addition to 'S' replica flag:
```
id=11 addr=127.0.0.1:39108 laddr=127.0.0.1:5001 fd=14 name= age=5 idle=5 flags=SC db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=0 multi-mem=0 rbs=1024 rbp=0 obl=0 oll=0 omem=0 tot-mem=1920 events=r cmd=psync user=default redir=-1 resp=2 lib-name= lib-ver= io-thread=0
```
**Config changes:**
- `replica-full-sync-buffer-limit`: Controls how much replication data
replica can accumulate during rdbchannel replication. If it is not set,
a value of 0 means replica will inherit `client-output-buffer-limit
<replica>` hard limit config to limit accumulated data.
- `repl-rdb-channel` config is added as a hidden config. This is mostly
for testing as we need to support both rdbchannel replication and the
older single connection replication (to keep compatibility with older
versions and rdbchannel replication will not be enabled if
repl-diskless-sync is not enabled). it affects both the master (not to
respond to rdb channel requests), and the replica (not to declare
capability)
**Internal API changes:**
Changes that were introduced to Redis replication:
- New replication capability is added to replconf command: `capa
rdb-channel-repl`. Indicates replica is capable of rdb channel
replication. Replica sends it when it connects to master along with
other capabilities.
- If replica needs fullsync, master replies `+RDBCHANNELSYNC
<client-id>` to the replica's PSYNC request.
- When replica opens rdbchannel connection, as part of replconf command,
it sends `rdb-channel 1` to let master know this is rdb channel. Also,
it sends `main-ch-client-id <client-id>` as part of replconf command so
master can associate channels.
**Testing:**
As rdbchannel replication is enabled by default, we run whole test suite
with it. Though, as we need to support both rdbchannel and single
connection replication, we'll be running some tests twice with
`repl-rdb-channel yes/no` config.
**Replica state diagram**
```
* * Replica state machine *
*
* Main channel state
* ┌───────────────────┐
* │RECEIVE_PING_REPLY │
* └────────┬──────────┘
* │ +PONG
* ┌────────▼──────────┐
* │SEND_HANDSHAKE │ RDB channel state
* └────────┬──────────┘ ┌───────────────────────────────┐
* │+OK ┌───► RDB_CH_SEND_HANDSHAKE │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_AUTH_REPLY │ │ REPLCONF main-ch-client-id <clientid>
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_AUTH_REPLY │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_PORT_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_REPLCONF_REPLY│
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_IP_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_FULLRESYNC │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_CAPA_REPLY │ │ │+FULLRESYNC
* └────────┬──────────┘ │ │Rdb delivery
* │ │ ┌──────────────▼────────────────┐
* ┌────────▼──────────┐ │ │ RDB_CH_RDB_LOADING │
* │SEND_PSYNC │ │ └──────────────┬────────────────┘
* └─┬─────────────────┘ │ │ Done loading
* │PSYNC (use cached-master) │ │
* ┌─▼─────────────────┐ │ │
* │RECEIVE_PSYNC_REPLY│ │ ┌────────────►│ Replica streams replication
* └─┬─────────────────┘ │ │ │ buffer into memory
* │ │ │ │
* │+RDBCHANNELSYNC client-id │ │ │
* ├──────┬───────────────────┘ │ │
* │ │ Main channel │ │
* │ │ accumulates repl data │ │
* │ ┌──▼────────────────┐ │ ┌───────▼───────────┐
* │ │ REPL_TRANSFER ├───────┘ │ CONNECTED │
* │ └───────────────────┘ └────▲───▲──────────┘
* │ │ │
* │ │ │
* │ +FULLRESYNC ┌───────────────────┐ │ │
* ├────────────────► REPL_TRANSFER ├────┘ │
* │ └───────────────────┘ │
* │ +CONTINUE │
* └──────────────────────────────────────────────┘
*/
```
-----
This PR also contains changes and ideas from:
https://github.com/valkey-io/valkey/pull/837
https://github.com/valkey-io/valkey/pull/1173
https://github.com/valkey-io/valkey/pull/804
https://github.com/valkey-io/valkey/pull/945
https://github.com/valkey-io/valkey/pull/989
---------
Co-authored-by: Yuan Wang <wangyuancode@163.com>
Co-authored-by: debing.sun <debing.sun@redis.com>
Co-authored-by: Moti Cohen <moticless@gmail.com>
Co-authored-by: naglera <anagler123@gmail.com>
Co-authored-by: Amit Nagler <58042354+naglera@users.noreply.github.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Ping Xie <pingxie@outlook.com>
Co-authored-by: Ran Shidlansik <ranshid@amazon.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: xbasel <103044017+xbasel@users.noreply.github.com>
2025-01-13 20:09:52 +08:00
|
|
|
#
|
|
|
|
# Portions of this file are available under BSD3 terms; see REDISCONTRIBUTIONS for more information.
|
|
|
|
#
|
|
|
|
|
2010-05-14 23:31:11 +08:00
|
|
|
proc randstring {min max {type binary}} {
|
|
|
|
set len [expr {$min+int(rand()*($max-$min+1))}]
|
|
|
|
set output {}
|
|
|
|
if {$type eq {binary}} {
|
|
|
|
set minval 0
|
|
|
|
set maxval 255
|
Client eviction (#8687)
### Description
A mechanism for disconnecting clients when the sum of all connected clients is above a
configured limit. This prevents eviction or OOM caused by accumulated used memory
between all clients. It's a complimentary mechanism to the `client-output-buffer-limit`
mechanism which takes into account not only a single client and not only output buffers
but rather all memory used by all clients.
#### Design
The general design is as following:
* We track memory usage of each client, taking into account all memory used by the
client (query buffer, output buffer, parsed arguments, etc...). This is kept up to date
after reading from the socket, after processing commands and after writing to the socket.
* Based on the used memory we sort all clients into buckets. Each bucket contains all
clients using up up to x2 memory of the clients in the bucket below it. For example up
to 1m clients, up to 2m clients, up to 4m clients, ...
* Before processing a command and before sleep we check if we're over the configured
limit. If we are we start disconnecting clients from larger buckets downwards until we're
under the limit.
#### Config
`maxmemory-clients` max memory all clients are allowed to consume, above this threshold
we disconnect clients.
This config can either be set to 0 (meaning no limit), a size in bytes (possibly with MB/GB
suffix), or as a percentage of `maxmemory` by using the `%` suffix (e.g. setting it to `10%`
would mean 10% of `maxmemory`).
#### Important code changes
* During the development I encountered yet more situations where our io-threads access
global vars. And needed to fix them. I also had to handle keeps the clients sorted into the
memory buckets (which are global) while their memory usage changes in the io-thread.
To achieve this I decided to simplify how we check if we're in an io-thread and make it
much more explicit. I removed the `CLIENT_PENDING_READ` flag used for checking
if the client is in an io-thread (it wasn't used for anything else) and just used the global
`io_threads_op` variable the same way to check during writes.
* I optimized the cleanup of the client from the `clients_pending_read` list on client freeing.
We now store a pointer in the `client` struct to this list so we don't need to search in it
(`pending_read_list_node`).
* Added `evicted_clients` stat to `INFO` command.
* Added `CLIENT NO-EVICT ON|OFF` sub command to exclude a specific client from the
client eviction mechanism. Added corrosponding 'e' flag in the client info string.
* Added `multi-mem` field in the client info string to show how much memory is used up
by buffered multi commands.
* Client `tot-mem` now accounts for buffered multi-commands, pubsub patterns and
channels (partially), tracking prefixes (partially).
* CLIENT_CLOSE_ASAP flag is now handled in a new `beforeNextClient()` function so
clients will be disconnected between processing different clients and not only before sleep.
This new function can be used in the future for work we want to do outside the command
processing loop but don't want to wait for all clients to be processed before we get to it.
Specifically I wanted to handle output-buffer-limit related closing before we process client
eviction in case the two race with each other.
* Added a `DEBUG CLIENT-EVICTION` command to print out info about the client eviction
buckets.
* Each client now holds a pointer to the client eviction memory usage bucket it belongs to
and listNode to itself in that bucket for quick removal.
* Global `io_threads_op` variable now can contain a `IO_THREADS_OP_IDLE` value
indicating no io-threading is currently being executed.
* In order to track memory used by each clients in real-time we can't rely on updating
these stats in `clientsCron()` alone anymore. So now I call `updateClientMemUsage()`
(used to be `clientsCronTrackClientsMemUsage()`) after command processing, after
writing data to pubsub clients, after writing the output buffer and after reading from the
socket (and maybe other places too). The function is written to be fast.
* Clients are evicted if needed (with appropriate log line) in `beforeSleep()` and before
processing a command (before performing oom-checks and key-eviction).
* All clients memory usage buckets are grouped as follows:
* All clients using less than 64k.
* 64K..128K
* 128K..256K
* ...
* 2G..4G
* All clients using 4g and up.
* Added client-eviction.tcl with a bunch of tests for the new mechanism.
* Extended maxmemory.tcl to test the interaction between maxmemory and
maxmemory-clients settings.
* Added an option to flag a numeric configuration variable as a "percent", this means that
if we encounter a '%' after the number in the config file (or config set command) we
consider it as valid. Such a number is store internally as a negative value. This way an
integer value can be interpreted as either a percent (negative) or absolute value (positive).
This is useful for example if some numeric configuration can optionally be set to a percentage
of something else.
Co-authored-by: Oran Agra <oran@redislabs.com>
2021-09-23 19:02:16 +08:00
|
|
|
} elseif {$type eq {alpha} || $type eq {simplealpha}} {
|
2010-05-14 23:31:11 +08:00
|
|
|
set minval 48
|
|
|
|
set maxval 122
|
|
|
|
} elseif {$type eq {compr}} {
|
|
|
|
set minval 48
|
|
|
|
set maxval 52
|
|
|
|
}
|
|
|
|
while {$len} {
|
2021-09-24 21:58:38 +08:00
|
|
|
set num [expr {$minval+int(rand()*($maxval-$minval+1))}]
|
|
|
|
set rr [format "%c" $num]
|
Client eviction (#8687)
### Description
A mechanism for disconnecting clients when the sum of all connected clients is above a
configured limit. This prevents eviction or OOM caused by accumulated used memory
between all clients. It's a complimentary mechanism to the `client-output-buffer-limit`
mechanism which takes into account not only a single client and not only output buffers
but rather all memory used by all clients.
#### Design
The general design is as following:
* We track memory usage of each client, taking into account all memory used by the
client (query buffer, output buffer, parsed arguments, etc...). This is kept up to date
after reading from the socket, after processing commands and after writing to the socket.
* Based on the used memory we sort all clients into buckets. Each bucket contains all
clients using up up to x2 memory of the clients in the bucket below it. For example up
to 1m clients, up to 2m clients, up to 4m clients, ...
* Before processing a command and before sleep we check if we're over the configured
limit. If we are we start disconnecting clients from larger buckets downwards until we're
under the limit.
#### Config
`maxmemory-clients` max memory all clients are allowed to consume, above this threshold
we disconnect clients.
This config can either be set to 0 (meaning no limit), a size in bytes (possibly with MB/GB
suffix), or as a percentage of `maxmemory` by using the `%` suffix (e.g. setting it to `10%`
would mean 10% of `maxmemory`).
#### Important code changes
* During the development I encountered yet more situations where our io-threads access
global vars. And needed to fix them. I also had to handle keeps the clients sorted into the
memory buckets (which are global) while their memory usage changes in the io-thread.
To achieve this I decided to simplify how we check if we're in an io-thread and make it
much more explicit. I removed the `CLIENT_PENDING_READ` flag used for checking
if the client is in an io-thread (it wasn't used for anything else) and just used the global
`io_threads_op` variable the same way to check during writes.
* I optimized the cleanup of the client from the `clients_pending_read` list on client freeing.
We now store a pointer in the `client` struct to this list so we don't need to search in it
(`pending_read_list_node`).
* Added `evicted_clients` stat to `INFO` command.
* Added `CLIENT NO-EVICT ON|OFF` sub command to exclude a specific client from the
client eviction mechanism. Added corrosponding 'e' flag in the client info string.
* Added `multi-mem` field in the client info string to show how much memory is used up
by buffered multi commands.
* Client `tot-mem` now accounts for buffered multi-commands, pubsub patterns and
channels (partially), tracking prefixes (partially).
* CLIENT_CLOSE_ASAP flag is now handled in a new `beforeNextClient()` function so
clients will be disconnected between processing different clients and not only before sleep.
This new function can be used in the future for work we want to do outside the command
processing loop but don't want to wait for all clients to be processed before we get to it.
Specifically I wanted to handle output-buffer-limit related closing before we process client
eviction in case the two race with each other.
* Added a `DEBUG CLIENT-EVICTION` command to print out info about the client eviction
buckets.
* Each client now holds a pointer to the client eviction memory usage bucket it belongs to
and listNode to itself in that bucket for quick removal.
* Global `io_threads_op` variable now can contain a `IO_THREADS_OP_IDLE` value
indicating no io-threading is currently being executed.
* In order to track memory used by each clients in real-time we can't rely on updating
these stats in `clientsCron()` alone anymore. So now I call `updateClientMemUsage()`
(used to be `clientsCronTrackClientsMemUsage()`) after command processing, after
writing data to pubsub clients, after writing the output buffer and after reading from the
socket (and maybe other places too). The function is written to be fast.
* Clients are evicted if needed (with appropriate log line) in `beforeSleep()` and before
processing a command (before performing oom-checks and key-eviction).
* All clients memory usage buckets are grouped as follows:
* All clients using less than 64k.
* 64K..128K
* 128K..256K
* ...
* 2G..4G
* All clients using 4g and up.
* Added client-eviction.tcl with a bunch of tests for the new mechanism.
* Extended maxmemory.tcl to test the interaction between maxmemory and
maxmemory-clients settings.
* Added an option to flag a numeric configuration variable as a "percent", this means that
if we encounter a '%' after the number in the config file (or config set command) we
consider it as valid. Such a number is store internally as a negative value. This way an
integer value can be interpreted as either a percent (negative) or absolute value (positive).
This is useful for example if some numeric configuration can optionally be set to a percentage
of something else.
Co-authored-by: Oran Agra <oran@redislabs.com>
2021-09-23 19:02:16 +08:00
|
|
|
if {$type eq {simplealpha} && ![string is alnum $rr]} {continue}
|
2021-09-24 21:58:38 +08:00
|
|
|
if {$type eq {alpha} && $num eq 92} {continue} ;# avoid putting '\' char in the string, it can mess up TCL processing
|
Client eviction (#8687)
### Description
A mechanism for disconnecting clients when the sum of all connected clients is above a
configured limit. This prevents eviction or OOM caused by accumulated used memory
between all clients. It's a complimentary mechanism to the `client-output-buffer-limit`
mechanism which takes into account not only a single client and not only output buffers
but rather all memory used by all clients.
#### Design
The general design is as following:
* We track memory usage of each client, taking into account all memory used by the
client (query buffer, output buffer, parsed arguments, etc...). This is kept up to date
after reading from the socket, after processing commands and after writing to the socket.
* Based on the used memory we sort all clients into buckets. Each bucket contains all
clients using up up to x2 memory of the clients in the bucket below it. For example up
to 1m clients, up to 2m clients, up to 4m clients, ...
* Before processing a command and before sleep we check if we're over the configured
limit. If we are we start disconnecting clients from larger buckets downwards until we're
under the limit.
#### Config
`maxmemory-clients` max memory all clients are allowed to consume, above this threshold
we disconnect clients.
This config can either be set to 0 (meaning no limit), a size in bytes (possibly with MB/GB
suffix), or as a percentage of `maxmemory` by using the `%` suffix (e.g. setting it to `10%`
would mean 10% of `maxmemory`).
#### Important code changes
* During the development I encountered yet more situations where our io-threads access
global vars. And needed to fix them. I also had to handle keeps the clients sorted into the
memory buckets (which are global) while their memory usage changes in the io-thread.
To achieve this I decided to simplify how we check if we're in an io-thread and make it
much more explicit. I removed the `CLIENT_PENDING_READ` flag used for checking
if the client is in an io-thread (it wasn't used for anything else) and just used the global
`io_threads_op` variable the same way to check during writes.
* I optimized the cleanup of the client from the `clients_pending_read` list on client freeing.
We now store a pointer in the `client` struct to this list so we don't need to search in it
(`pending_read_list_node`).
* Added `evicted_clients` stat to `INFO` command.
* Added `CLIENT NO-EVICT ON|OFF` sub command to exclude a specific client from the
client eviction mechanism. Added corrosponding 'e' flag in the client info string.
* Added `multi-mem` field in the client info string to show how much memory is used up
by buffered multi commands.
* Client `tot-mem` now accounts for buffered multi-commands, pubsub patterns and
channels (partially), tracking prefixes (partially).
* CLIENT_CLOSE_ASAP flag is now handled in a new `beforeNextClient()` function so
clients will be disconnected between processing different clients and not only before sleep.
This new function can be used in the future for work we want to do outside the command
processing loop but don't want to wait for all clients to be processed before we get to it.
Specifically I wanted to handle output-buffer-limit related closing before we process client
eviction in case the two race with each other.
* Added a `DEBUG CLIENT-EVICTION` command to print out info about the client eviction
buckets.
* Each client now holds a pointer to the client eviction memory usage bucket it belongs to
and listNode to itself in that bucket for quick removal.
* Global `io_threads_op` variable now can contain a `IO_THREADS_OP_IDLE` value
indicating no io-threading is currently being executed.
* In order to track memory used by each clients in real-time we can't rely on updating
these stats in `clientsCron()` alone anymore. So now I call `updateClientMemUsage()`
(used to be `clientsCronTrackClientsMemUsage()`) after command processing, after
writing data to pubsub clients, after writing the output buffer and after reading from the
socket (and maybe other places too). The function is written to be fast.
* Clients are evicted if needed (with appropriate log line) in `beforeSleep()` and before
processing a command (before performing oom-checks and key-eviction).
* All clients memory usage buckets are grouped as follows:
* All clients using less than 64k.
* 64K..128K
* 128K..256K
* ...
* 2G..4G
* All clients using 4g and up.
* Added client-eviction.tcl with a bunch of tests for the new mechanism.
* Extended maxmemory.tcl to test the interaction between maxmemory and
maxmemory-clients settings.
* Added an option to flag a numeric configuration variable as a "percent", this means that
if we encounter a '%' after the number in the config file (or config set command) we
consider it as valid. Such a number is store internally as a negative value. This way an
integer value can be interpreted as either a percent (negative) or absolute value (positive).
This is useful for example if some numeric configuration can optionally be set to a percentage
of something else.
Co-authored-by: Oran Agra <oran@redislabs.com>
2021-09-23 19:02:16 +08:00
|
|
|
append output $rr
|
2010-05-14 23:31:11 +08:00
|
|
|
incr len -1
|
|
|
|
}
|
|
|
|
return $output
|
|
|
|
}
|
|
|
|
|
|
|
|
# Useful for some test
|
|
|
|
proc zlistAlikeSort {a b} {
|
|
|
|
if {[lindex $a 0] > [lindex $b 0]} {return 1}
|
|
|
|
if {[lindex $a 0] < [lindex $b 0]} {return -1}
|
|
|
|
string compare [lindex $a 1] [lindex $b 1]
|
|
|
|
}
|
|
|
|
|
2010-05-16 05:48:08 +08:00
|
|
|
# Return all log lines starting with the first line that contains a warning.
|
|
|
|
# Generally, this will be an assertion error with a stack trace.
|
2021-04-18 16:55:54 +08:00
|
|
|
proc crashlog_from_file {filename} {
|
2010-05-16 05:48:08 +08:00
|
|
|
set lines [split [exec cat $filename] "\n"]
|
|
|
|
set matched 0
|
2012-05-22 19:13:24 +08:00
|
|
|
set logall 0
|
2010-05-16 05:48:08 +08:00
|
|
|
set result {}
|
|
|
|
foreach line $lines {
|
2012-05-22 19:13:24 +08:00
|
|
|
if {[string match {*REDIS BUG REPORT START*} $line]} {
|
|
|
|
set logall 1
|
|
|
|
}
|
2010-05-16 05:48:08 +08:00
|
|
|
if {[regexp {^\[\d+\]\s+\d+\s+\w+\s+\d{2}:\d{2}:\d{2} \#} $line]} {
|
|
|
|
set matched 1
|
|
|
|
}
|
2012-05-22 19:13:24 +08:00
|
|
|
if {$logall || $matched} {
|
2010-05-16 05:48:08 +08:00
|
|
|
lappend result $line
|
|
|
|
}
|
|
|
|
}
|
|
|
|
join $result "\n"
|
|
|
|
}
|
|
|
|
|
2021-11-11 19:51:33 +08:00
|
|
|
# Return sanitizer log lines
|
|
|
|
proc sanitizer_errors_from_file {filename} {
|
|
|
|
set log [exec cat $filename]
|
|
|
|
set lines [split [exec cat $filename] "\n"]
|
|
|
|
|
|
|
|
foreach line $lines {
|
|
|
|
# Ignore huge allocation warnings
|
|
|
|
if ([string match {*WARNING: AddressSanitizer failed to allocate*} $line]) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
# GCC UBSAN output does not contain 'Sanitizer' but 'runtime error'.
|
2023-07-30 13:48:29 +08:00
|
|
|
if {[string match {*runtime error*} $line] ||
|
|
|
|
[string match {*Sanitizer*} $line]} {
|
2021-11-11 19:51:33 +08:00
|
|
|
return $log
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
|
2020-12-13 15:56:01 +08:00
|
|
|
proc getInfoProperty {infostr property} {
|
Add cluster-port support to redis-cli --cluster (#10344)
In #9389, we add a new `cluster-port` config and make cluster bus port configurable,
and currently redis-cli --cluster create/add-node doesn't support with a configurable `cluster-port` instance.
Because redis-cli uses the old way (port + 10000) to send the `CLUSTER MEET` command.
Now we add this support on redis-cli `--cluster`, note we don't need to explicitly pass in the
`cluster-port` parameter, we can get the real `cluster-port` of the node in `clusterManagerNodeLoadInfo`,
so the `--cluster create` and `--cluster add-node` interfaces have not changed.
We will use the `cluster-port` when we are doing `CLUSTER MEET`, also note that `CLUSTER MEET` bus-port
parameter was added in 4.0, so if the bus_port (the one in redis-cli) is 0, or equal (port + 10000),
we just call `CLUSTER MEET` with 2 arguments, using the old form.
Co-authored-by: Madelyn Olson <34459052+madolson@users.noreply.github.com>
2022-07-11 16:23:31 +08:00
|
|
|
if {[regexp -lineanchor "^$property:(.*?)\r\n" $infostr _ value]} {
|
|
|
|
return $value
|
2010-05-15 02:48:57 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-13 15:56:01 +08:00
|
|
|
# Return value for INFO property
|
|
|
|
proc status {r property} {
|
|
|
|
set _ [getInfoProperty [{*}$r info] $property]
|
|
|
|
}
|
|
|
|
|
2010-05-14 23:31:11 +08:00
|
|
|
proc waitForBgsave r {
|
|
|
|
while 1 {
|
Implement Multi Part AOF mechanism to avoid AOFRW overheads. (#9788)
Implement Multi-Part AOF mechanism to avoid overheads during AOFRW.
Introducing a folder with multiple AOF files tracked by a manifest file.
The main issues with the the original AOFRW mechanism are:
* buffering of commands that are processed during rewrite (consuming a lot of RAM)
* freezes of the main process when the AOFRW completes to drain the remaining part of the buffer and fsync it.
* double disk IO for the data that arrives during AOFRW (had to be written to both the old and new AOF files)
The main modifications of this PR:
1. Remove the AOF rewrite buffer and related code.
2. Divide the AOF into multiple files, they are classified as two types, one is the the `BASE` type,
it represents the full amount of data (Maybe AOF or RDB format) after each AOFRW, there is only
one `BASE` file at most. The second is `INCR` type, may have more than one. They represent the
incremental commands since the last AOFRW.
3. Use a AOF manifest file to record and manage these AOF files mentioned above.
4. The original configuration of `appendfilename` will be the base part of the new file name, for example:
`appendonly.aof.1.base.rdb` and `appendonly.aof.2.incr.aof`
5. Add manifest-related TCL tests, and modified some existing tests that depend on the `appendfilename`
6. Remove the `aof_rewrite_buffer_length` field in info.
7. Add `aof-disable-auto-gc` configuration. By default we're automatically deleting HISTORY type AOFs.
It also gives users the opportunity to preserve the history AOFs. just for testing use now.
8. Add AOFRW limiting measure. When the AOFRW failures reaches the threshold (3 times now),
we will delay the execution of the next AOFRW by 1 minute. If the next AOFRW also fails, it will be
delayed by 2 minutes. The next is 4, 8, 16, the maximum delay is 60 minutes (1 hour). During the limit
period, we can still use the 'bgrewriteaof' command to execute AOFRW immediately.
9. Support upgrade (load) data from old version redis.
10. Add `appenddirname` configuration, as the directory name of the append only files. All AOF files and
manifest file will be placed in this directory.
11. Only the last AOF file (BASE or INCR) can be truncated. Otherwise redis will exit even if
`aof-load-truncated` is enabled.
Co-authored-by: Oran Agra <oran@redislabs.com>
2022-01-04 01:14:13 +08:00
|
|
|
if {[status $r rdb_bgsave_in_progress] eq 1} {
|
2010-12-10 23:13:21 +08:00
|
|
|
if {$::verbose} {
|
|
|
|
puts -nonewline "\nWaiting for background save to finish... "
|
|
|
|
flush stdout
|
|
|
|
}
|
2022-02-13 15:52:38 +08:00
|
|
|
after 50
|
2010-05-14 23:31:11 +08:00
|
|
|
} else {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
proc waitForBgrewriteaof r {
|
|
|
|
while 1 {
|
Implement Multi Part AOF mechanism to avoid AOFRW overheads. (#9788)
Implement Multi-Part AOF mechanism to avoid overheads during AOFRW.
Introducing a folder with multiple AOF files tracked by a manifest file.
The main issues with the the original AOFRW mechanism are:
* buffering of commands that are processed during rewrite (consuming a lot of RAM)
* freezes of the main process when the AOFRW completes to drain the remaining part of the buffer and fsync it.
* double disk IO for the data that arrives during AOFRW (had to be written to both the old and new AOF files)
The main modifications of this PR:
1. Remove the AOF rewrite buffer and related code.
2. Divide the AOF into multiple files, they are classified as two types, one is the the `BASE` type,
it represents the full amount of data (Maybe AOF or RDB format) after each AOFRW, there is only
one `BASE` file at most. The second is `INCR` type, may have more than one. They represent the
incremental commands since the last AOFRW.
3. Use a AOF manifest file to record and manage these AOF files mentioned above.
4. The original configuration of `appendfilename` will be the base part of the new file name, for example:
`appendonly.aof.1.base.rdb` and `appendonly.aof.2.incr.aof`
5. Add manifest-related TCL tests, and modified some existing tests that depend on the `appendfilename`
6. Remove the `aof_rewrite_buffer_length` field in info.
7. Add `aof-disable-auto-gc` configuration. By default we're automatically deleting HISTORY type AOFs.
It also gives users the opportunity to preserve the history AOFs. just for testing use now.
8. Add AOFRW limiting measure. When the AOFRW failures reaches the threshold (3 times now),
we will delay the execution of the next AOFRW by 1 minute. If the next AOFRW also fails, it will be
delayed by 2 minutes. The next is 4, 8, 16, the maximum delay is 60 minutes (1 hour). During the limit
period, we can still use the 'bgrewriteaof' command to execute AOFRW immediately.
9. Support upgrade (load) data from old version redis.
10. Add `appenddirname` configuration, as the directory name of the append only files. All AOF files and
manifest file will be placed in this directory.
11. Only the last AOF file (BASE or INCR) can be truncated. Otherwise redis will exit even if
`aof-load-truncated` is enabled.
Co-authored-by: Oran Agra <oran@redislabs.com>
2022-01-04 01:14:13 +08:00
|
|
|
if {[status $r aof_rewrite_in_progress] eq 1} {
|
2010-12-10 23:13:21 +08:00
|
|
|
if {$::verbose} {
|
|
|
|
puts -nonewline "\nWaiting for background AOF rewrite to finish... "
|
|
|
|
flush stdout
|
|
|
|
}
|
2022-02-13 15:52:38 +08:00
|
|
|
after 50
|
2010-05-14 23:31:11 +08:00
|
|
|
} else {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-15 02:50:58 +08:00
|
|
|
proc wait_for_sync r {
|
2021-01-29 05:18:05 +08:00
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[status $r master_link_status] eq "up"
|
|
|
|
} else {
|
Set repl-diskless-sync to yes by default, add repl-diskless-sync-max-replicas (#10092)
1. enable diskless replication by default
2. add a new config named repl-diskless-sync-max-replicas that enables
replication to start before the full repl-diskless-sync-delay was
reached.
3. put replica online sooner on the master (see below)
4. test suite uses repl-diskless-sync-delay of 0 to be faster
5. a few tests that use multiple replica on a pre-populated master, are
now using the new repl-diskless-sync-max-replicas
6. fix possible timing issues in a few cluster tests (see below)
put replica online sooner on the master
----------------------------------------------------
there were two tests that failed because they needed for the master to
realize that the replica is online, but the test code was actually only
waiting for the replica to realize it's online, and in diskless it could
have been before the master realized it.
changes include two things:
1. the tests wait on the right thing
2. issues in the master, putting the replica online in two steps.
the master used to put the replica as online in 2 steps. the first
step was to mark it as online, and the second step was to enable the
write event (only after getting ACK), but in fact the first step didn't
contains some of the tasks to put it online (like updating good slave
count, and sending the module event). this meant that if a test was
waiting to see that the replica is online form the point of view of the
master, and then confirm that the module got an event, or that the
master has enough good replicas, it could fail due to timing issues.
so now the full effect of putting the replica online, happens at once,
and only the part about enabling the writes is delayed till the ACK.
fix cluster tests
--------------------
I added some code to wait for the replica to sync and avoid race
conditions.
later realized the sentinel and cluster tests where using the original 5
seconds delay, so changed it to 0.
this means the other changes are probably not needed, but i suppose
they're still better (avoid race conditions)
2022-01-17 20:11:11 +08:00
|
|
|
fail "replica didn't sync in time"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Rdb channel replication (#13732)
This PR is based on:
https://github.com/redis/redis/pull/12109
https://github.com/valkey-io/valkey/pull/60
Closes: https://github.com/redis/redis/issues/11678
**Motivation**
During a full sync, when master is delivering RDB to the replica,
incoming write commands are kept in a replication buffer in order to be
sent to the replica once RDB delivery is completed. If RDB delivery
takes a long time, it might create memory pressure on master. Also, once
a replica connection accumulates replication data which is larger than
output buffer limits, master will kill replica connection. This may
cause a replication failure.
The main benefit of the rdb channel replication is streaming incoming
commands in parallel to the RDB delivery. This approach shifts
replication stream buffering to the replica and reduces load on master.
We do this by opening another connection for RDB delivery. The main
channel on replica will be receiving replication stream while rdb
channel is receiving the RDB.
This feature also helps to reduce master's main process CPU load. By
opening a dedicated connection for the RDB transfer, the bgsave process
has access to the new connection and it will stream RDB directly to the
replicas. Before this change, due to TLS connection restriction, the
bgsave process was writing RDB bytes to a pipe and the main process was
forwarding
it to the replica. This is no longer necessary, the main process can
avoid these expensive socket read/write syscalls. It also means RDB
delivery to replica will be faster as it avoids this step.
In summary, replication will be faster and master's performance during
full syncs will improve.
**Implementation steps**
1. When replica connects to the master, it sends 'rdb-channel-repl' as
part of capability exchange to let master to know replica supports rdb
channel.
2. When replica lacks sufficient data for PSYNC, master sends
+RDBCHANNELSYNC reply with replica's client id. As the next step, the
replica opens a new connection (rdb-channel) and configures it against
the master with the appropriate capabilities and requirements. It also
sends given client id back to master over rdbchannel, so that master can
associate these channels. (initial replica connection will be referred
as main-channel) Then, replica requests fullsync using the RDB channel.
3. Prior to forking, master attaches the replica's main channel to the
replication backlog to deliver replication stream starting at the
snapshot end offset.
4. The master main process sends replication stream via the main
channel, while the bgsave process sends the RDB directly to the replica
via the rdb-channel. Replica accumulates replication stream in a local
buffer, while the RDB is being loaded into the memory.
5. Once the replica completes loading the rdb, it drops the rdb channel
and streams the accumulated replication stream into the db. Sync is
completed.
**Some details**
- Currently, rdbchannel replication is supported only if
`repl-diskless-sync` is enabled on master. Otherwise, replication will
happen over a single connection as in before.
- On replica, there is a limit to replication stream buffering. Replica
uses a new config `replica-full-sync-buffer-limit` to limit number of
bytes to accumulate. If it is not set, replica inherits
`client-output-buffer-limit <replica>` hard limit config. If we reach
this limit, replica stops accumulating. This is not a failure scenario
though. Further accumulation will happen on master side. Depending on
the configured limits on master, master may kill the replica connection.
**API changes in INFO output:**
1. New replica state: `send_bulk_and_stream`. Indicates full sync is
still in progress for this replica. It is receiving replication stream
and rdb in parallel.
```
slave0:ip=127.0.0.1,port=5002,state=send_bulk_and_stream,offset=0,lag=0
```
Replica state changes in steps:
- First, replica sends psync and receives +RDBCHANNELSYNC
:`state=wait_bgsave`
- After replica connects with rdbchannel and delivery starts:
`state=send_bulk_and_stream`
- After full sync: `state=online`
2. On replica side, replication stream buffering metrics:
- replica_full_sync_buffer_size: Currently accumulated replication
stream data in bytes.
- replica_full_sync_buffer_peak: Peak number of bytes that this instance
accumulated in the lifetime of the process.
```
replica_full_sync_buffer_size:20485
replica_full_sync_buffer_peak:1048560
```
**API changes in CLIENT LIST**
In `client list` output, rdbchannel clients will have 'C' flag in
addition to 'S' replica flag:
```
id=11 addr=127.0.0.1:39108 laddr=127.0.0.1:5001 fd=14 name= age=5 idle=5 flags=SC db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=0 multi-mem=0 rbs=1024 rbp=0 obl=0 oll=0 omem=0 tot-mem=1920 events=r cmd=psync user=default redir=-1 resp=2 lib-name= lib-ver= io-thread=0
```
**Config changes:**
- `replica-full-sync-buffer-limit`: Controls how much replication data
replica can accumulate during rdbchannel replication. If it is not set,
a value of 0 means replica will inherit `client-output-buffer-limit
<replica>` hard limit config to limit accumulated data.
- `repl-rdb-channel` config is added as a hidden config. This is mostly
for testing as we need to support both rdbchannel replication and the
older single connection replication (to keep compatibility with older
versions and rdbchannel replication will not be enabled if
repl-diskless-sync is not enabled). it affects both the master (not to
respond to rdb channel requests), and the replica (not to declare
capability)
**Internal API changes:**
Changes that were introduced to Redis replication:
- New replication capability is added to replconf command: `capa
rdb-channel-repl`. Indicates replica is capable of rdb channel
replication. Replica sends it when it connects to master along with
other capabilities.
- If replica needs fullsync, master replies `+RDBCHANNELSYNC
<client-id>` to the replica's PSYNC request.
- When replica opens rdbchannel connection, as part of replconf command,
it sends `rdb-channel 1` to let master know this is rdb channel. Also,
it sends `main-ch-client-id <client-id>` as part of replconf command so
master can associate channels.
**Testing:**
As rdbchannel replication is enabled by default, we run whole test suite
with it. Though, as we need to support both rdbchannel and single
connection replication, we'll be running some tests twice with
`repl-rdb-channel yes/no` config.
**Replica state diagram**
```
* * Replica state machine *
*
* Main channel state
* ┌───────────────────┐
* │RECEIVE_PING_REPLY │
* └────────┬──────────┘
* │ +PONG
* ┌────────▼──────────┐
* │SEND_HANDSHAKE │ RDB channel state
* └────────┬──────────┘ ┌───────────────────────────────┐
* │+OK ┌───► RDB_CH_SEND_HANDSHAKE │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_AUTH_REPLY │ │ REPLCONF main-ch-client-id <clientid>
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_AUTH_REPLY │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_PORT_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_REPLCONF_REPLY│
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_IP_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_FULLRESYNC │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_CAPA_REPLY │ │ │+FULLRESYNC
* └────────┬──────────┘ │ │Rdb delivery
* │ │ ┌──────────────▼────────────────┐
* ┌────────▼──────────┐ │ │ RDB_CH_RDB_LOADING │
* │SEND_PSYNC │ │ └──────────────┬────────────────┘
* └─┬─────────────────┘ │ │ Done loading
* │PSYNC (use cached-master) │ │
* ┌─▼─────────────────┐ │ │
* │RECEIVE_PSYNC_REPLY│ │ ┌────────────►│ Replica streams replication
* └─┬─────────────────┘ │ │ │ buffer into memory
* │ │ │ │
* │+RDBCHANNELSYNC client-id │ │ │
* ├──────┬───────────────────┘ │ │
* │ │ Main channel │ │
* │ │ accumulates repl data │ │
* │ ┌──▼────────────────┐ │ ┌───────▼───────────┐
* │ │ REPL_TRANSFER ├───────┘ │ CONNECTED │
* │ └───────────────────┘ └────▲───▲──────────┘
* │ │ │
* │ │ │
* │ +FULLRESYNC ┌───────────────────┐ │ │
* ├────────────────► REPL_TRANSFER ├────┘ │
* │ └───────────────────┘ │
* │ +CONTINUE │
* └──────────────────────────────────────────────┘
*/
```
-----
This PR also contains changes and ideas from:
https://github.com/valkey-io/valkey/pull/837
https://github.com/valkey-io/valkey/pull/1173
https://github.com/valkey-io/valkey/pull/804
https://github.com/valkey-io/valkey/pull/945
https://github.com/valkey-io/valkey/pull/989
---------
Co-authored-by: Yuan Wang <wangyuancode@163.com>
Co-authored-by: debing.sun <debing.sun@redis.com>
Co-authored-by: Moti Cohen <moticless@gmail.com>
Co-authored-by: naglera <anagler123@gmail.com>
Co-authored-by: Amit Nagler <58042354+naglera@users.noreply.github.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Ping Xie <pingxie@outlook.com>
Co-authored-by: Ran Shidlansik <ranshid@amazon.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: xbasel <103044017+xbasel@users.noreply.github.com>
2025-01-13 20:09:52 +08:00
|
|
|
proc wait_replica_online {r {replica_id 0} {maxtries 50} {delay 100}} {
|
|
|
|
wait_for_condition $maxtries $delay {
|
|
|
|
[string match "*slave$replica_id:*,state=online*" [$r info replication]]
|
Set repl-diskless-sync to yes by default, add repl-diskless-sync-max-replicas (#10092)
1. enable diskless replication by default
2. add a new config named repl-diskless-sync-max-replicas that enables
replication to start before the full repl-diskless-sync-delay was
reached.
3. put replica online sooner on the master (see below)
4. test suite uses repl-diskless-sync-delay of 0 to be faster
5. a few tests that use multiple replica on a pre-populated master, are
now using the new repl-diskless-sync-max-replicas
6. fix possible timing issues in a few cluster tests (see below)
put replica online sooner on the master
----------------------------------------------------
there were two tests that failed because they needed for the master to
realize that the replica is online, but the test code was actually only
waiting for the replica to realize it's online, and in diskless it could
have been before the master realized it.
changes include two things:
1. the tests wait on the right thing
2. issues in the master, putting the replica online in two steps.
the master used to put the replica as online in 2 steps. the first
step was to mark it as online, and the second step was to enable the
write event (only after getting ACK), but in fact the first step didn't
contains some of the tasks to put it online (like updating good slave
count, and sending the module event). this meant that if a test was
waiting to see that the replica is online form the point of view of the
master, and then confirm that the module got an event, or that the
master has enough good replicas, it could fail due to timing issues.
so now the full effect of putting the replica online, happens at once,
and only the part about enabling the writes is delayed till the ACK.
fix cluster tests
--------------------
I added some code to wait for the replica to sync and avoid race
conditions.
later realized the sentinel and cluster tests where using the original 5
seconds delay, so changed it to 0.
this means the other changes are probably not needed, but i suppose
they're still better (avoid race conditions)
2022-01-17 20:11:11 +08:00
|
|
|
} else {
|
Rdb channel replication (#13732)
This PR is based on:
https://github.com/redis/redis/pull/12109
https://github.com/valkey-io/valkey/pull/60
Closes: https://github.com/redis/redis/issues/11678
**Motivation**
During a full sync, when master is delivering RDB to the replica,
incoming write commands are kept in a replication buffer in order to be
sent to the replica once RDB delivery is completed. If RDB delivery
takes a long time, it might create memory pressure on master. Also, once
a replica connection accumulates replication data which is larger than
output buffer limits, master will kill replica connection. This may
cause a replication failure.
The main benefit of the rdb channel replication is streaming incoming
commands in parallel to the RDB delivery. This approach shifts
replication stream buffering to the replica and reduces load on master.
We do this by opening another connection for RDB delivery. The main
channel on replica will be receiving replication stream while rdb
channel is receiving the RDB.
This feature also helps to reduce master's main process CPU load. By
opening a dedicated connection for the RDB transfer, the bgsave process
has access to the new connection and it will stream RDB directly to the
replicas. Before this change, due to TLS connection restriction, the
bgsave process was writing RDB bytes to a pipe and the main process was
forwarding
it to the replica. This is no longer necessary, the main process can
avoid these expensive socket read/write syscalls. It also means RDB
delivery to replica will be faster as it avoids this step.
In summary, replication will be faster and master's performance during
full syncs will improve.
**Implementation steps**
1. When replica connects to the master, it sends 'rdb-channel-repl' as
part of capability exchange to let master to know replica supports rdb
channel.
2. When replica lacks sufficient data for PSYNC, master sends
+RDBCHANNELSYNC reply with replica's client id. As the next step, the
replica opens a new connection (rdb-channel) and configures it against
the master with the appropriate capabilities and requirements. It also
sends given client id back to master over rdbchannel, so that master can
associate these channels. (initial replica connection will be referred
as main-channel) Then, replica requests fullsync using the RDB channel.
3. Prior to forking, master attaches the replica's main channel to the
replication backlog to deliver replication stream starting at the
snapshot end offset.
4. The master main process sends replication stream via the main
channel, while the bgsave process sends the RDB directly to the replica
via the rdb-channel. Replica accumulates replication stream in a local
buffer, while the RDB is being loaded into the memory.
5. Once the replica completes loading the rdb, it drops the rdb channel
and streams the accumulated replication stream into the db. Sync is
completed.
**Some details**
- Currently, rdbchannel replication is supported only if
`repl-diskless-sync` is enabled on master. Otherwise, replication will
happen over a single connection as in before.
- On replica, there is a limit to replication stream buffering. Replica
uses a new config `replica-full-sync-buffer-limit` to limit number of
bytes to accumulate. If it is not set, replica inherits
`client-output-buffer-limit <replica>` hard limit config. If we reach
this limit, replica stops accumulating. This is not a failure scenario
though. Further accumulation will happen on master side. Depending on
the configured limits on master, master may kill the replica connection.
**API changes in INFO output:**
1. New replica state: `send_bulk_and_stream`. Indicates full sync is
still in progress for this replica. It is receiving replication stream
and rdb in parallel.
```
slave0:ip=127.0.0.1,port=5002,state=send_bulk_and_stream,offset=0,lag=0
```
Replica state changes in steps:
- First, replica sends psync and receives +RDBCHANNELSYNC
:`state=wait_bgsave`
- After replica connects with rdbchannel and delivery starts:
`state=send_bulk_and_stream`
- After full sync: `state=online`
2. On replica side, replication stream buffering metrics:
- replica_full_sync_buffer_size: Currently accumulated replication
stream data in bytes.
- replica_full_sync_buffer_peak: Peak number of bytes that this instance
accumulated in the lifetime of the process.
```
replica_full_sync_buffer_size:20485
replica_full_sync_buffer_peak:1048560
```
**API changes in CLIENT LIST**
In `client list` output, rdbchannel clients will have 'C' flag in
addition to 'S' replica flag:
```
id=11 addr=127.0.0.1:39108 laddr=127.0.0.1:5001 fd=14 name= age=5 idle=5 flags=SC db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=0 multi-mem=0 rbs=1024 rbp=0 obl=0 oll=0 omem=0 tot-mem=1920 events=r cmd=psync user=default redir=-1 resp=2 lib-name= lib-ver= io-thread=0
```
**Config changes:**
- `replica-full-sync-buffer-limit`: Controls how much replication data
replica can accumulate during rdbchannel replication. If it is not set,
a value of 0 means replica will inherit `client-output-buffer-limit
<replica>` hard limit config to limit accumulated data.
- `repl-rdb-channel` config is added as a hidden config. This is mostly
for testing as we need to support both rdbchannel replication and the
older single connection replication (to keep compatibility with older
versions and rdbchannel replication will not be enabled if
repl-diskless-sync is not enabled). it affects both the master (not to
respond to rdb channel requests), and the replica (not to declare
capability)
**Internal API changes:**
Changes that were introduced to Redis replication:
- New replication capability is added to replconf command: `capa
rdb-channel-repl`. Indicates replica is capable of rdb channel
replication. Replica sends it when it connects to master along with
other capabilities.
- If replica needs fullsync, master replies `+RDBCHANNELSYNC
<client-id>` to the replica's PSYNC request.
- When replica opens rdbchannel connection, as part of replconf command,
it sends `rdb-channel 1` to let master know this is rdb channel. Also,
it sends `main-ch-client-id <client-id>` as part of replconf command so
master can associate channels.
**Testing:**
As rdbchannel replication is enabled by default, we run whole test suite
with it. Though, as we need to support both rdbchannel and single
connection replication, we'll be running some tests twice with
`repl-rdb-channel yes/no` config.
**Replica state diagram**
```
* * Replica state machine *
*
* Main channel state
* ┌───────────────────┐
* │RECEIVE_PING_REPLY │
* └────────┬──────────┘
* │ +PONG
* ┌────────▼──────────┐
* │SEND_HANDSHAKE │ RDB channel state
* └────────┬──────────┘ ┌───────────────────────────────┐
* │+OK ┌───► RDB_CH_SEND_HANDSHAKE │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_AUTH_REPLY │ │ REPLCONF main-ch-client-id <clientid>
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_AUTH_REPLY │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_PORT_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_REPLCONF_REPLY│
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_IP_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_FULLRESYNC │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_CAPA_REPLY │ │ │+FULLRESYNC
* └────────┬──────────┘ │ │Rdb delivery
* │ │ ┌──────────────▼────────────────┐
* ┌────────▼──────────┐ │ │ RDB_CH_RDB_LOADING │
* │SEND_PSYNC │ │ └──────────────┬────────────────┘
* └─┬─────────────────┘ │ │ Done loading
* │PSYNC (use cached-master) │ │
* ┌─▼─────────────────┐ │ │
* │RECEIVE_PSYNC_REPLY│ │ ┌────────────►│ Replica streams replication
* └─┬─────────────────┘ │ │ │ buffer into memory
* │ │ │ │
* │+RDBCHANNELSYNC client-id │ │ │
* ├──────┬───────────────────┘ │ │
* │ │ Main channel │ │
* │ │ accumulates repl data │ │
* │ ┌──▼────────────────┐ │ ┌───────▼───────────┐
* │ │ REPL_TRANSFER ├───────┘ │ CONNECTED │
* │ └───────────────────┘ └────▲───▲──────────┘
* │ │ │
* │ │ │
* │ +FULLRESYNC ┌───────────────────┐ │ │
* ├────────────────► REPL_TRANSFER ├────┘ │
* │ └───────────────────┘ │
* │ +CONTINUE │
* └──────────────────────────────────────────────┘
*/
```
-----
This PR also contains changes and ideas from:
https://github.com/valkey-io/valkey/pull/837
https://github.com/valkey-io/valkey/pull/1173
https://github.com/valkey-io/valkey/pull/804
https://github.com/valkey-io/valkey/pull/945
https://github.com/valkey-io/valkey/pull/989
---------
Co-authored-by: Yuan Wang <wangyuancode@163.com>
Co-authored-by: debing.sun <debing.sun@redis.com>
Co-authored-by: Moti Cohen <moticless@gmail.com>
Co-authored-by: naglera <anagler123@gmail.com>
Co-authored-by: Amit Nagler <58042354+naglera@users.noreply.github.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Ping Xie <pingxie@outlook.com>
Co-authored-by: Ran Shidlansik <ranshid@amazon.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: xbasel <103044017+xbasel@users.noreply.github.com>
2025-01-13 20:09:52 +08:00
|
|
|
fail "replica $replica_id did not become online in time"
|
2010-05-15 02:50:58 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-11 15:22:42 +08:00
|
|
|
proc wait_for_ofs_sync {r1 r2} {
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[status $r1 master_repl_offset] eq [status $r2 master_repl_offset]
|
|
|
|
} else {
|
optimize(remove) usage of client's pending_querybuf (#10413)
To remove `pending_querybuf`, the key point is reusing `querybuf`, it means master client's `querybuf` is not only used to parse command, but also proxy to sub-replicas.
1. add a new variable `repl_applied` for master client to record how many data applied (propagated via `replicationFeedStreamFromMasterStream()`) but not trimmed in `querybuf`.
2. don't sdsrange `querybuf` in `commandProcessed()`, we trim it to `repl_applied` after the whole replication pipeline processed to avoid fragmented `sdsrange`. And here are some scenarios we cannot trim to `qb_pos`:
* we don't receive complete command from master
* master client blocked because of client pause
* IO threads operate read, master client flagged with CLIENT_PENDING_COMMAND
In these scenarios, `qb_pos` points to the part of the current command or the beginning of next command, and the current command is not applied yet, so the `repl_applied` is not equal to `qb_pos`.
Some other notes:
* Do not do big arg optimization on master client, since we can only sdsrange `querybuf` after data sent to replicas.
* Set `qb_pos` and `repl_applied` to 0 when `freeClient` in `replicationCacheMaster`.
* Rewrite `processPendingCommandsAndResetClient` to `processPendingCommandAndInputBuffer`, let `processInputBuffer` to be called successively after `processCommandAndResetClient`.
2022-03-25 10:45:40 +08:00
|
|
|
fail "replica offset didn't match in time"
|
2018-11-11 15:22:42 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-31 16:20:02 +08:00
|
|
|
proc wait_done_loading r {
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[catch {$r ping} e] == 0
|
|
|
|
} else {
|
|
|
|
fail "Loading DB is taking too much time."
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-02 22:42:53 +08:00
|
|
|
proc wait_lazyfree_done r {
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[status $r lazyfree_pending_objects] == 0
|
|
|
|
} else {
|
|
|
|
fail "lazyfree isn't done"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-10 13:28:22 +08:00
|
|
|
# count current log lines in server's stdout
|
|
|
|
proc count_log_lines {srv_idx} {
|
2020-12-08 22:22:16 +08:00
|
|
|
set _ [string trim [exec wc -l < [srv $srv_idx stdout]]]
|
2020-07-10 13:28:22 +08:00
|
|
|
}
|
|
|
|
|
Sanitize dump payload: ziplist, listpack, zipmap, intset, stream
When loading an encoded payload we will at least do a shallow validation to
check that the size that's encoded in the payload matches the size of the
allocation.
This let's us later use this encoded size to make sure the various offsets
inside encoded payload don't reach outside the allocation, if they do, we'll
assert/panic, but at least we won't segfault or smear memory.
We can also do 'deep' validation which runs on all the records of the encoded
payload and validates that they don't contain invalid offsets. This lets us
detect corruptions early and reject a RESTORE command rather than accepting
it and asserting (crashing) later when accessing that payload via some command.
configuration:
- adding ACL flag skip-sanitize-payload
- adding config sanitize-dump-payload [yes/no/clients]
For now, we don't have a good way to ensure MIGRATE in cluster resharding isn't
being slowed down by these sanitation, so i'm setting the default value to `no`,
but later on it should be set to `clients` by default.
changes:
- changing rdbReportError not to `exit` in RESTORE command
- adding a new stat to be able to later check if cluster MIGRATE isn't being
slowed down by sanitation.
2020-08-13 21:41:05 +08:00
|
|
|
# returns the number of times a line with that pattern appears in a file
|
|
|
|
proc count_message_lines {file pattern} {
|
|
|
|
set res 0
|
2022-08-24 20:07:43 +08:00
|
|
|
# exec fails when grep exists with status other than 0 (when the pattern wasn't found)
|
Sanitize dump payload: ziplist, listpack, zipmap, intset, stream
When loading an encoded payload we will at least do a shallow validation to
check that the size that's encoded in the payload matches the size of the
allocation.
This let's us later use this encoded size to make sure the various offsets
inside encoded payload don't reach outside the allocation, if they do, we'll
assert/panic, but at least we won't segfault or smear memory.
We can also do 'deep' validation which runs on all the records of the encoded
payload and validates that they don't contain invalid offsets. This lets us
detect corruptions early and reject a RESTORE command rather than accepting
it and asserting (crashing) later when accessing that payload via some command.
configuration:
- adding ACL flag skip-sanitize-payload
- adding config sanitize-dump-payload [yes/no/clients]
For now, we don't have a good way to ensure MIGRATE in cluster resharding isn't
being slowed down by these sanitation, so i'm setting the default value to `no`,
but later on it should be set to `clients` by default.
changes:
- changing rdbReportError not to `exit` in RESTORE command
- adding a new stat to be able to later check if cluster MIGRATE isn't being
slowed down by sanitation.
2020-08-13 21:41:05 +08:00
|
|
|
catch {
|
2020-12-08 22:22:16 +08:00
|
|
|
set res [string trim [exec grep $pattern $file 2> /dev/null | wc -l]]
|
Sanitize dump payload: ziplist, listpack, zipmap, intset, stream
When loading an encoded payload we will at least do a shallow validation to
check that the size that's encoded in the payload matches the size of the
allocation.
This let's us later use this encoded size to make sure the various offsets
inside encoded payload don't reach outside the allocation, if they do, we'll
assert/panic, but at least we won't segfault or smear memory.
We can also do 'deep' validation which runs on all the records of the encoded
payload and validates that they don't contain invalid offsets. This lets us
detect corruptions early and reject a RESTORE command rather than accepting
it and asserting (crashing) later when accessing that payload via some command.
configuration:
- adding ACL flag skip-sanitize-payload
- adding config sanitize-dump-payload [yes/no/clients]
For now, we don't have a good way to ensure MIGRATE in cluster resharding isn't
being slowed down by these sanitation, so i'm setting the default value to `no`,
but later on it should be set to `clients` by default.
changes:
- changing rdbReportError not to `exit` in RESTORE command
- adding a new stat to be able to later check if cluster MIGRATE isn't being
slowed down by sanitation.
2020-08-13 21:41:05 +08:00
|
|
|
}
|
|
|
|
return $res
|
|
|
|
}
|
|
|
|
|
|
|
|
# returns the number of times a line with that pattern appears in the log
|
|
|
|
proc count_log_message {srv_idx pattern} {
|
|
|
|
set stdout [srv $srv_idx stdout]
|
|
|
|
return [count_message_lines $stdout $pattern]
|
|
|
|
}
|
|
|
|
|
2020-07-10 13:28:22 +08:00
|
|
|
# verify pattern exists in server's sdtout after a certain line number
|
|
|
|
proc verify_log_message {srv_idx pattern from_line} {
|
2020-07-28 16:15:29 +08:00
|
|
|
incr from_line
|
|
|
|
set result [exec tail -n +$from_line < [srv $srv_idx stdout]]
|
2020-07-10 13:28:22 +08:00
|
|
|
if {![string match $pattern $result]} {
|
|
|
|
error "assertion:expected message not found in log file: $pattern"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# wait for pattern to be found in server's stdout after certain line number
|
2020-07-28 16:15:29 +08:00
|
|
|
# return value is a list containing the line that matched the pattern and the line number
|
|
|
|
proc wait_for_log_messages {srv_idx patterns from_line maxtries delay} {
|
2019-07-16 16:00:34 +08:00
|
|
|
set retry $maxtries
|
2020-07-28 16:15:29 +08:00
|
|
|
set next_line [expr $from_line + 1] ;# searching form the line after
|
2019-07-16 16:00:34 +08:00
|
|
|
set stdout [srv $srv_idx stdout]
|
|
|
|
while {$retry} {
|
2020-10-26 17:55:24 +08:00
|
|
|
# re-read the last line (unless it's before to our first), last time we read it, it might have been incomplete
|
|
|
|
set next_line [expr $next_line - 1 > $from_line + 1 ? $next_line - 1 : $from_line + 1]
|
2020-07-28 16:15:29 +08:00
|
|
|
set result [exec tail -n +$next_line < $stdout]
|
2019-07-16 16:00:34 +08:00
|
|
|
set result [split $result "\n"]
|
|
|
|
foreach line $result {
|
2020-07-28 16:15:29 +08:00
|
|
|
foreach pattern $patterns {
|
|
|
|
if {[string match $pattern $line]} {
|
|
|
|
return [list $line $next_line]
|
|
|
|
}
|
2019-07-16 16:00:34 +08:00
|
|
|
}
|
2020-07-28 16:15:29 +08:00
|
|
|
incr next_line
|
2019-07-16 16:00:34 +08:00
|
|
|
}
|
|
|
|
incr retry -1
|
|
|
|
after $delay
|
|
|
|
}
|
|
|
|
if {$retry == 0} {
|
2020-10-22 16:34:54 +08:00
|
|
|
if {$::verbose} {
|
|
|
|
puts "content of $stdout from line: $from_line:"
|
|
|
|
puts [exec tail -n +$from_line < $stdout]
|
|
|
|
}
|
2020-07-28 16:15:29 +08:00
|
|
|
fail "log message of '$patterns' not found in $stdout after line: $from_line till line: [expr $next_line -1]"
|
2019-07-16 16:00:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-31 15:23:09 +08:00
|
|
|
# write line to server log file
|
|
|
|
proc write_log_line {srv_idx msg} {
|
|
|
|
set logfile [srv $srv_idx stdout]
|
|
|
|
set fd [open $logfile "a+"]
|
|
|
|
puts $fd "### $msg"
|
|
|
|
close $fd
|
|
|
|
}
|
|
|
|
|
2013-06-25 21:32:37 +08:00
|
|
|
# Random integer between 0 and max (excluded).
|
2010-05-14 23:31:11 +08:00
|
|
|
proc randomInt {max} {
|
|
|
|
expr {int(rand()*$max)}
|
|
|
|
}
|
|
|
|
|
2021-09-12 16:31:22 +08:00
|
|
|
# Random integer between min and max (excluded).
|
|
|
|
proc randomRange {min max} {
|
|
|
|
expr {int(rand()*[expr $max - $min]) + $min}
|
|
|
|
}
|
|
|
|
|
2013-06-25 21:32:37 +08:00
|
|
|
# Random signed integer between -max and max (both extremes excluded).
|
2012-06-11 21:19:46 +08:00
|
|
|
proc randomSignedInt {max} {
|
|
|
|
set i [randomInt $max]
|
|
|
|
if {rand() > 0.5} {
|
|
|
|
set i -$i
|
|
|
|
}
|
|
|
|
return $i
|
|
|
|
}
|
|
|
|
|
2010-05-14 23:31:11 +08:00
|
|
|
proc randpath args {
|
|
|
|
set path [expr {int(rand()*[llength $args])}]
|
|
|
|
uplevel 1 [lindex $args $path]
|
|
|
|
}
|
|
|
|
|
|
|
|
proc randomValue {} {
|
|
|
|
randpath {
|
|
|
|
# Small enough to likely collide
|
2012-06-11 21:19:46 +08:00
|
|
|
randomSignedInt 1000
|
2010-05-14 23:31:11 +08:00
|
|
|
} {
|
|
|
|
# 32 bit compressible signed/unsigned
|
2012-06-11 21:19:46 +08:00
|
|
|
randpath {randomSignedInt 2000000000} {randomSignedInt 4000000000}
|
2010-05-14 23:31:11 +08:00
|
|
|
} {
|
|
|
|
# 64 bit
|
2012-06-11 21:19:46 +08:00
|
|
|
randpath {randomSignedInt 1000000000000}
|
2010-05-14 23:31:11 +08:00
|
|
|
} {
|
|
|
|
# Random string
|
|
|
|
randpath {randstring 0 256 alpha} \
|
|
|
|
{randstring 0 256 compr} \
|
|
|
|
{randstring 0 256 binary}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
proc randomKey {} {
|
|
|
|
randpath {
|
|
|
|
# Small enough to likely collide
|
|
|
|
randomInt 1000
|
|
|
|
} {
|
|
|
|
# 32 bit compressible signed/unsigned
|
|
|
|
randpath {randomInt 2000000000} {randomInt 4000000000}
|
|
|
|
} {
|
|
|
|
# 64 bit
|
|
|
|
randpath {randomInt 1000000000000}
|
|
|
|
} {
|
|
|
|
# Random string
|
|
|
|
randpath {randstring 1 256 alpha} \
|
|
|
|
{randstring 1 256 compr}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-07-07 00:30:38 +08:00
|
|
|
proc findKeyWithType {r type} {
|
|
|
|
for {set j 0} {$j < 20} {incr j} {
|
2010-07-28 20:08:46 +08:00
|
|
|
set k [{*}$r randomkey]
|
2010-07-07 00:30:38 +08:00
|
|
|
if {$k eq {}} {
|
|
|
|
return {}
|
|
|
|
}
|
2010-07-28 20:08:46 +08:00
|
|
|
if {[{*}$r type $k] eq $type} {
|
2010-07-07 00:30:38 +08:00
|
|
|
return $k
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return {}
|
|
|
|
}
|
|
|
|
|
2010-08-03 19:38:39 +08:00
|
|
|
proc createComplexDataset {r ops {opt {}}} {
|
2021-06-09 20:13:24 +08:00
|
|
|
set useexpire [expr {[lsearch -exact $opt useexpire] != -1}]
|
2024-05-29 19:47:48 +08:00
|
|
|
set usehexpire [expr {[lsearch -exact $opt usehexpire] != -1}]
|
|
|
|
|
2021-06-09 20:13:24 +08:00
|
|
|
if {[lsearch -exact $opt usetag] != -1} {
|
|
|
|
set tag "{t}"
|
|
|
|
} else {
|
|
|
|
set tag ""
|
|
|
|
}
|
2010-05-14 23:31:11 +08:00
|
|
|
for {set j 0} {$j < $ops} {incr j} {
|
2021-06-09 20:13:24 +08:00
|
|
|
set k [randomKey]$tag
|
|
|
|
set k2 [randomKey]$tag
|
2010-05-14 23:31:11 +08:00
|
|
|
set f [randomValue]
|
|
|
|
set v [randomValue]
|
2010-08-03 19:38:39 +08:00
|
|
|
|
2021-06-09 20:13:24 +08:00
|
|
|
if {$useexpire} {
|
2010-08-03 19:38:39 +08:00
|
|
|
if {rand() < 0.1} {
|
|
|
|
{*}$r expire [randomKey] [randomInt 2]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-14 23:31:11 +08:00
|
|
|
randpath {
|
|
|
|
set d [expr {rand()}]
|
|
|
|
} {
|
|
|
|
set d [expr {rand()}]
|
|
|
|
} {
|
|
|
|
set d [expr {rand()}]
|
|
|
|
} {
|
|
|
|
set d [expr {rand()}]
|
|
|
|
} {
|
|
|
|
set d [expr {rand()}]
|
|
|
|
} {
|
|
|
|
randpath {set d +inf} {set d -inf}
|
|
|
|
}
|
2010-07-28 20:08:46 +08:00
|
|
|
set t [{*}$r type $k]
|
2010-05-14 23:31:11 +08:00
|
|
|
|
|
|
|
if {$t eq {none}} {
|
|
|
|
randpath {
|
2010-07-28 20:08:46 +08:00
|
|
|
{*}$r set $k $v
|
2010-05-14 23:31:11 +08:00
|
|
|
} {
|
2010-07-28 20:08:46 +08:00
|
|
|
{*}$r lpush $k $v
|
2010-05-14 23:31:11 +08:00
|
|
|
} {
|
2010-07-28 20:08:46 +08:00
|
|
|
{*}$r sadd $k $v
|
2010-05-14 23:31:11 +08:00
|
|
|
} {
|
2010-07-28 20:08:46 +08:00
|
|
|
{*}$r zadd $k $d $v
|
2010-05-14 23:31:11 +08:00
|
|
|
} {
|
2010-07-28 20:08:46 +08:00
|
|
|
{*}$r hset $k $f $v
|
2010-07-07 00:30:38 +08:00
|
|
|
} {
|
2010-07-28 20:08:46 +08:00
|
|
|
{*}$r del $k
|
2010-05-14 23:31:11 +08:00
|
|
|
}
|
2010-07-28 20:08:46 +08:00
|
|
|
set t [{*}$r type $k]
|
2010-05-14 23:31:11 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
switch $t {
|
|
|
|
{string} {
|
|
|
|
# Nothing to do
|
|
|
|
}
|
|
|
|
{list} {
|
2010-07-28 20:08:46 +08:00
|
|
|
randpath {{*}$r lpush $k $v} \
|
|
|
|
{{*}$r rpush $k $v} \
|
|
|
|
{{*}$r lrem $k 0 $v} \
|
|
|
|
{{*}$r rpop $k} \
|
|
|
|
{{*}$r lpop $k}
|
2010-05-14 23:31:11 +08:00
|
|
|
}
|
|
|
|
{set} {
|
2010-07-28 20:08:46 +08:00
|
|
|
randpath {{*}$r sadd $k $v} \
|
|
|
|
{{*}$r srem $k $v} \
|
2010-07-07 00:30:38 +08:00
|
|
|
{
|
2010-11-04 17:48:49 +08:00
|
|
|
set otherset [findKeyWithType {*}$r set]
|
2010-07-07 00:30:38 +08:00
|
|
|
if {$otherset ne {}} {
|
2010-07-28 20:08:46 +08:00
|
|
|
randpath {
|
|
|
|
{*}$r sunionstore $k2 $k $otherset
|
|
|
|
} {
|
|
|
|
{*}$r sinterstore $k2 $k $otherset
|
|
|
|
} {
|
|
|
|
{*}$r sdiffstore $k2 $k $otherset
|
|
|
|
}
|
2010-07-07 00:30:38 +08:00
|
|
|
}
|
|
|
|
}
|
2010-05-14 23:31:11 +08:00
|
|
|
}
|
|
|
|
{zset} {
|
2010-07-28 20:08:46 +08:00
|
|
|
randpath {{*}$r zadd $k $d $v} \
|
|
|
|
{{*}$r zrem $k $v} \
|
2010-07-07 00:30:38 +08:00
|
|
|
{
|
2010-11-04 17:48:49 +08:00
|
|
|
set otherzset [findKeyWithType {*}$r zset]
|
2010-07-07 00:30:38 +08:00
|
|
|
if {$otherzset ne {}} {
|
2010-07-28 20:08:46 +08:00
|
|
|
randpath {
|
|
|
|
{*}$r zunionstore $k2 2 $k $otherzset
|
|
|
|
} {
|
|
|
|
{*}$r zinterstore $k2 2 $k $otherzset
|
|
|
|
}
|
2010-07-07 00:30:38 +08:00
|
|
|
}
|
|
|
|
}
|
2010-05-14 23:31:11 +08:00
|
|
|
}
|
|
|
|
{hash} {
|
2010-07-28 20:08:46 +08:00
|
|
|
randpath {{*}$r hset $k $f $v} \
|
|
|
|
{{*}$r hdel $k $f}
|
2024-05-29 19:47:48 +08:00
|
|
|
|
|
|
|
if { [{*}$r hexists $k $f] && $usehexpire && rand() < 0.5} {
|
|
|
|
{*}$r hexpire $k 1000 FIELDS 1 $f
|
|
|
|
}
|
2010-05-14 23:31:11 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-05-19 20:33:39 +08:00
|
|
|
|
|
|
|
proc formatCommand {args} {
|
|
|
|
set cmd "*[llength $args]\r\n"
|
|
|
|
foreach a $args {
|
|
|
|
append cmd "$[string length $a]\r\n$a\r\n"
|
|
|
|
}
|
|
|
|
set _ $cmd
|
|
|
|
}
|
2010-07-27 20:42:11 +08:00
|
|
|
|
|
|
|
proc csvdump r {
|
|
|
|
set o {}
|
2021-06-09 20:13:24 +08:00
|
|
|
if {$::singledb} {
|
|
|
|
set maxdb 1
|
|
|
|
} else {
|
|
|
|
set maxdb 16
|
|
|
|
}
|
|
|
|
for {set db 0} {$db < $maxdb} {incr db} {
|
|
|
|
if {!$::singledb} {
|
|
|
|
{*}$r select $db
|
|
|
|
}
|
2015-08-05 18:27:15 +08:00
|
|
|
foreach k [lsort [{*}$r keys *]] {
|
|
|
|
set type [{*}$r type $k]
|
|
|
|
append o [csvstring $db] , [csvstring $k] , [csvstring $type] ,
|
|
|
|
switch $type {
|
|
|
|
string {
|
|
|
|
append o [csvstring [{*}$r get $k]] "\n"
|
2010-07-27 20:42:11 +08:00
|
|
|
}
|
2015-08-05 18:27:15 +08:00
|
|
|
list {
|
|
|
|
foreach e [{*}$r lrange $k 0 -1] {
|
|
|
|
append o [csvstring $e] ,
|
|
|
|
}
|
|
|
|
append o "\n"
|
2010-07-27 20:42:11 +08:00
|
|
|
}
|
2015-08-05 18:27:15 +08:00
|
|
|
set {
|
|
|
|
foreach e [lsort [{*}$r smembers $k]] {
|
|
|
|
append o [csvstring $e] ,
|
|
|
|
}
|
|
|
|
append o "\n"
|
2010-07-27 20:42:11 +08:00
|
|
|
}
|
2015-08-05 18:27:15 +08:00
|
|
|
zset {
|
|
|
|
foreach e [{*}$r zrange $k 0 -1 withscores] {
|
|
|
|
append o [csvstring $e] ,
|
|
|
|
}
|
|
|
|
append o "\n"
|
2010-07-27 20:42:11 +08:00
|
|
|
}
|
2015-08-05 18:27:15 +08:00
|
|
|
hash {
|
|
|
|
set fields [{*}$r hgetall $k]
|
|
|
|
set newfields {}
|
Hfe serialization listpack (#13243)
Add RDB de/serialization for HFE
This PR adds two new RDB types: `RDB_TYPE_HASH_METADATA` and
`RDB_TYPE_HASH_LISTPACK_TTL` to save HFE data.
When the hash RAM encoding is dict, it will be saved in the former, and
when it is listpack it will be saved in the latter.
Both formats just add the TTL value for each field after the data that
was previously saved, i.e HASH_METADATA will save the number of entries
and, for each entry, key, value and TTL, whereas listpack is saved as a
blob.
On read, the usual dict <--> listpack conversion takes place if
required.
In addition, when reading a hash that was saved as a dict fields are
actively expired if expiry is due. Currently this slao holds for
listpack encoding, but it is supposed to be removed.
TODO:
Remove active expiry on load when loading from listpack format (unless
we'll decide to keep it)
2024-05-17 18:27:02 +08:00
|
|
|
foreach {f v} $fields {
|
2024-05-18 16:34:13 +08:00
|
|
|
set expirylist [{*}$r hexpiretime $k FIELDS 1 $f]
|
Hfe serialization listpack (#13243)
Add RDB de/serialization for HFE
This PR adds two new RDB types: `RDB_TYPE_HASH_METADATA` and
`RDB_TYPE_HASH_LISTPACK_TTL` to save HFE data.
When the hash RAM encoding is dict, it will be saved in the former, and
when it is listpack it will be saved in the latter.
Both formats just add the TTL value for each field after the data that
was previously saved, i.e HASH_METADATA will save the number of entries
and, for each entry, key, value and TTL, whereas listpack is saved as a
blob.
On read, the usual dict <--> listpack conversion takes place if
required.
In addition, when reading a hash that was saved as a dict fields are
actively expired if expiry is due. Currently this slao holds for
listpack encoding, but it is supposed to be removed.
TODO:
Remove active expiry on load when loading from listpack format (unless
we'll decide to keep it)
2024-05-17 18:27:02 +08:00
|
|
|
if {$expirylist eq (-1)} {
|
|
|
|
lappend newfields [list $f $v]
|
|
|
|
} else {
|
|
|
|
set e [lindex $expirylist 0]
|
|
|
|
lappend newfields [list $f $e $v] # TODO: extract the actual ttl value from the list in $e
|
|
|
|
}
|
2015-08-05 18:27:15 +08:00
|
|
|
}
|
|
|
|
set fields [lsort -index 0 $newfields]
|
|
|
|
foreach kv $fields {
|
|
|
|
append o [csvstring [lindex $kv 0]] ,
|
|
|
|
append o [csvstring [lindex $kv 1]] ,
|
|
|
|
}
|
|
|
|
append o "\n"
|
2010-07-27 20:42:11 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-06-09 20:13:24 +08:00
|
|
|
if {!$::singledb} {
|
|
|
|
{*}$r select 9
|
|
|
|
}
|
2010-07-27 20:42:11 +08:00
|
|
|
return $o
|
|
|
|
}
|
|
|
|
|
|
|
|
proc csvstring s {
|
|
|
|
return "\"$s\""
|
|
|
|
}
|
2011-11-16 21:40:50 +08:00
|
|
|
|
|
|
|
proc roundFloat f {
|
|
|
|
format "%.10g" $f
|
|
|
|
}
|
2014-02-17 19:29:54 +08:00
|
|
|
|
2020-05-27 21:12:30 +08:00
|
|
|
set ::last_port_attempted 0
|
2020-05-26 16:00:48 +08:00
|
|
|
proc find_available_port {start count} {
|
2020-05-27 21:12:30 +08:00
|
|
|
set port [expr $::last_port_attempted + 1]
|
|
|
|
for {set attempts 0} {$attempts < $count} {incr attempts} {
|
|
|
|
if {$port < $start || $port >= $start+$count} {
|
|
|
|
set port $start
|
|
|
|
}
|
2021-11-07 19:53:57 +08:00
|
|
|
set fd1 -1
|
2024-07-17 20:42:28 +08:00
|
|
|
proc dummy_accept {chan addr port} {}
|
|
|
|
if {[catch {set fd1 [socket -server dummy_accept -myaddr 127.0.0.1 $port]}] ||
|
|
|
|
[catch {set fd2 [socket -server dummy_accept -myaddr 127.0.0.1 [expr $port+10000]]}]} {
|
2021-11-07 19:53:57 +08:00
|
|
|
if {$fd1 != -1} {
|
2014-06-30 18:07:26 +08:00
|
|
|
close $fd1
|
|
|
|
}
|
2021-11-07 19:53:57 +08:00
|
|
|
} else {
|
|
|
|
close $fd1
|
|
|
|
close $fd2
|
|
|
|
set ::last_port_attempted $port
|
|
|
|
return $port
|
2014-02-17 19:29:54 +08:00
|
|
|
}
|
2020-05-27 21:12:30 +08:00
|
|
|
incr port
|
2014-02-17 19:29:54 +08:00
|
|
|
}
|
2020-05-27 21:12:30 +08:00
|
|
|
error "Can't find a non busy port in the $start-[expr {$start+$count-1}] range."
|
2014-02-17 19:29:54 +08:00
|
|
|
}
|
2014-02-18 00:36:50 +08:00
|
|
|
|
|
|
|
# Test if TERM looks like to support colors
|
|
|
|
proc color_term {} {
|
|
|
|
expr {[info exists ::env(TERM)] && [string match *xterm* $::env(TERM)]}
|
|
|
|
}
|
|
|
|
|
|
|
|
proc colorstr {color str} {
|
|
|
|
if {[color_term]} {
|
|
|
|
set b 0
|
|
|
|
if {[string range $color 0 4] eq {bold-}} {
|
|
|
|
set b 1
|
|
|
|
set color [string range $color 5 end]
|
|
|
|
}
|
|
|
|
switch $color {
|
|
|
|
red {set colorcode {31}}
|
|
|
|
green {set colorcode {32}}
|
|
|
|
yellow {set colorcode {33}}
|
|
|
|
blue {set colorcode {34}}
|
|
|
|
magenta {set colorcode {35}}
|
|
|
|
cyan {set colorcode {36}}
|
|
|
|
white {set colorcode {37}}
|
|
|
|
default {set colorcode {37}}
|
|
|
|
}
|
|
|
|
if {$colorcode ne {}} {
|
2014-08-05 20:50:44 +08:00
|
|
|
return "\033\[$b;${colorcode};49m$str\033\[0m"
|
2014-02-18 00:36:50 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return $str
|
|
|
|
}
|
|
|
|
}
|
2014-07-10 17:24:59 +08:00
|
|
|
|
2020-08-14 21:05:34 +08:00
|
|
|
proc find_valgrind_errors {stderr on_termination} {
|
2020-09-06 16:11:49 +08:00
|
|
|
set fd [open $stderr]
|
|
|
|
set buf [read $fd]
|
|
|
|
close $fd
|
|
|
|
|
|
|
|
# Look for stack trace (" at 0x") and other errors (Invalid, Mismatched, etc).
|
|
|
|
# Look for "Warnings", but not the "set address range perms". These don't indicate any real concern.
|
2020-08-14 21:05:34 +08:00
|
|
|
# corrupt-dump unit, not sure why but it seems they don't indicate any real concern.
|
2020-09-06 16:11:49 +08:00
|
|
|
if {[regexp -- { at 0x} $buf] ||
|
|
|
|
[regexp -- {^(?=.*Warning)(?:(?!set address range perms).)*$} $buf] ||
|
|
|
|
[regexp -- {Invalid} $buf] ||
|
|
|
|
[regexp -- {Mismatched} $buf] ||
|
|
|
|
[regexp -- {uninitialized} $buf] ||
|
|
|
|
[regexp -- {has a fishy} $buf] ||
|
2020-08-14 21:05:34 +08:00
|
|
|
[regexp -- {overlap} $buf]} {
|
|
|
|
return $buf
|
|
|
|
}
|
|
|
|
|
|
|
|
# If the process didn't terminate yet, we can't look for the summary report
|
|
|
|
if {!$on_termination} {
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
|
2021-06-10 20:39:33 +08:00
|
|
|
# Look for the absence of a leak free summary (happens when redis isn't terminated properly).
|
2020-08-14 21:05:34 +08:00
|
|
|
if {(![regexp -- {definitely lost: 0 bytes} $buf] &&
|
2020-09-06 16:11:49 +08:00
|
|
|
![regexp -- {no leaks are possible} $buf])} {
|
|
|
|
return $buf
|
|
|
|
}
|
|
|
|
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
|
2014-07-10 17:24:59 +08:00
|
|
|
# Execute a background process writing random data for the specified number
|
Rdb channel replication (#13732)
This PR is based on:
https://github.com/redis/redis/pull/12109
https://github.com/valkey-io/valkey/pull/60
Closes: https://github.com/redis/redis/issues/11678
**Motivation**
During a full sync, when master is delivering RDB to the replica,
incoming write commands are kept in a replication buffer in order to be
sent to the replica once RDB delivery is completed. If RDB delivery
takes a long time, it might create memory pressure on master. Also, once
a replica connection accumulates replication data which is larger than
output buffer limits, master will kill replica connection. This may
cause a replication failure.
The main benefit of the rdb channel replication is streaming incoming
commands in parallel to the RDB delivery. This approach shifts
replication stream buffering to the replica and reduces load on master.
We do this by opening another connection for RDB delivery. The main
channel on replica will be receiving replication stream while rdb
channel is receiving the RDB.
This feature also helps to reduce master's main process CPU load. By
opening a dedicated connection for the RDB transfer, the bgsave process
has access to the new connection and it will stream RDB directly to the
replicas. Before this change, due to TLS connection restriction, the
bgsave process was writing RDB bytes to a pipe and the main process was
forwarding
it to the replica. This is no longer necessary, the main process can
avoid these expensive socket read/write syscalls. It also means RDB
delivery to replica will be faster as it avoids this step.
In summary, replication will be faster and master's performance during
full syncs will improve.
**Implementation steps**
1. When replica connects to the master, it sends 'rdb-channel-repl' as
part of capability exchange to let master to know replica supports rdb
channel.
2. When replica lacks sufficient data for PSYNC, master sends
+RDBCHANNELSYNC reply with replica's client id. As the next step, the
replica opens a new connection (rdb-channel) and configures it against
the master with the appropriate capabilities and requirements. It also
sends given client id back to master over rdbchannel, so that master can
associate these channels. (initial replica connection will be referred
as main-channel) Then, replica requests fullsync using the RDB channel.
3. Prior to forking, master attaches the replica's main channel to the
replication backlog to deliver replication stream starting at the
snapshot end offset.
4. The master main process sends replication stream via the main
channel, while the bgsave process sends the RDB directly to the replica
via the rdb-channel. Replica accumulates replication stream in a local
buffer, while the RDB is being loaded into the memory.
5. Once the replica completes loading the rdb, it drops the rdb channel
and streams the accumulated replication stream into the db. Sync is
completed.
**Some details**
- Currently, rdbchannel replication is supported only if
`repl-diskless-sync` is enabled on master. Otherwise, replication will
happen over a single connection as in before.
- On replica, there is a limit to replication stream buffering. Replica
uses a new config `replica-full-sync-buffer-limit` to limit number of
bytes to accumulate. If it is not set, replica inherits
`client-output-buffer-limit <replica>` hard limit config. If we reach
this limit, replica stops accumulating. This is not a failure scenario
though. Further accumulation will happen on master side. Depending on
the configured limits on master, master may kill the replica connection.
**API changes in INFO output:**
1. New replica state: `send_bulk_and_stream`. Indicates full sync is
still in progress for this replica. It is receiving replication stream
and rdb in parallel.
```
slave0:ip=127.0.0.1,port=5002,state=send_bulk_and_stream,offset=0,lag=0
```
Replica state changes in steps:
- First, replica sends psync and receives +RDBCHANNELSYNC
:`state=wait_bgsave`
- After replica connects with rdbchannel and delivery starts:
`state=send_bulk_and_stream`
- After full sync: `state=online`
2. On replica side, replication stream buffering metrics:
- replica_full_sync_buffer_size: Currently accumulated replication
stream data in bytes.
- replica_full_sync_buffer_peak: Peak number of bytes that this instance
accumulated in the lifetime of the process.
```
replica_full_sync_buffer_size:20485
replica_full_sync_buffer_peak:1048560
```
**API changes in CLIENT LIST**
In `client list` output, rdbchannel clients will have 'C' flag in
addition to 'S' replica flag:
```
id=11 addr=127.0.0.1:39108 laddr=127.0.0.1:5001 fd=14 name= age=5 idle=5 flags=SC db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=0 multi-mem=0 rbs=1024 rbp=0 obl=0 oll=0 omem=0 tot-mem=1920 events=r cmd=psync user=default redir=-1 resp=2 lib-name= lib-ver= io-thread=0
```
**Config changes:**
- `replica-full-sync-buffer-limit`: Controls how much replication data
replica can accumulate during rdbchannel replication. If it is not set,
a value of 0 means replica will inherit `client-output-buffer-limit
<replica>` hard limit config to limit accumulated data.
- `repl-rdb-channel` config is added as a hidden config. This is mostly
for testing as we need to support both rdbchannel replication and the
older single connection replication (to keep compatibility with older
versions and rdbchannel replication will not be enabled if
repl-diskless-sync is not enabled). it affects both the master (not to
respond to rdb channel requests), and the replica (not to declare
capability)
**Internal API changes:**
Changes that were introduced to Redis replication:
- New replication capability is added to replconf command: `capa
rdb-channel-repl`. Indicates replica is capable of rdb channel
replication. Replica sends it when it connects to master along with
other capabilities.
- If replica needs fullsync, master replies `+RDBCHANNELSYNC
<client-id>` to the replica's PSYNC request.
- When replica opens rdbchannel connection, as part of replconf command,
it sends `rdb-channel 1` to let master know this is rdb channel. Also,
it sends `main-ch-client-id <client-id>` as part of replconf command so
master can associate channels.
**Testing:**
As rdbchannel replication is enabled by default, we run whole test suite
with it. Though, as we need to support both rdbchannel and single
connection replication, we'll be running some tests twice with
`repl-rdb-channel yes/no` config.
**Replica state diagram**
```
* * Replica state machine *
*
* Main channel state
* ┌───────────────────┐
* │RECEIVE_PING_REPLY │
* └────────┬──────────┘
* │ +PONG
* ┌────────▼──────────┐
* │SEND_HANDSHAKE │ RDB channel state
* └────────┬──────────┘ ┌───────────────────────────────┐
* │+OK ┌───► RDB_CH_SEND_HANDSHAKE │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_AUTH_REPLY │ │ REPLCONF main-ch-client-id <clientid>
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_AUTH_REPLY │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_PORT_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_REPLCONF_REPLY│
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_IP_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_FULLRESYNC │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_CAPA_REPLY │ │ │+FULLRESYNC
* └────────┬──────────┘ │ │Rdb delivery
* │ │ ┌──────────────▼────────────────┐
* ┌────────▼──────────┐ │ │ RDB_CH_RDB_LOADING │
* │SEND_PSYNC │ │ └──────────────┬────────────────┘
* └─┬─────────────────┘ │ │ Done loading
* │PSYNC (use cached-master) │ │
* ┌─▼─────────────────┐ │ │
* │RECEIVE_PSYNC_REPLY│ │ ┌────────────►│ Replica streams replication
* └─┬─────────────────┘ │ │ │ buffer into memory
* │ │ │ │
* │+RDBCHANNELSYNC client-id │ │ │
* ├──────┬───────────────────┘ │ │
* │ │ Main channel │ │
* │ │ accumulates repl data │ │
* │ ┌──▼────────────────┐ │ ┌───────▼───────────┐
* │ │ REPL_TRANSFER ├───────┘ │ CONNECTED │
* │ └───────────────────┘ └────▲───▲──────────┘
* │ │ │
* │ │ │
* │ +FULLRESYNC ┌───────────────────┐ │ │
* ├────────────────► REPL_TRANSFER ├────┘ │
* │ └───────────────────┘ │
* │ +CONTINUE │
* └──────────────────────────────────────────────┘
*/
```
-----
This PR also contains changes and ideas from:
https://github.com/valkey-io/valkey/pull/837
https://github.com/valkey-io/valkey/pull/1173
https://github.com/valkey-io/valkey/pull/804
https://github.com/valkey-io/valkey/pull/945
https://github.com/valkey-io/valkey/pull/989
---------
Co-authored-by: Yuan Wang <wangyuancode@163.com>
Co-authored-by: debing.sun <debing.sun@redis.com>
Co-authored-by: Moti Cohen <moticless@gmail.com>
Co-authored-by: naglera <anagler123@gmail.com>
Co-authored-by: Amit Nagler <58042354+naglera@users.noreply.github.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Ping Xie <pingxie@outlook.com>
Co-authored-by: Ran Shidlansik <ranshid@amazon.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: xbasel <103044017+xbasel@users.noreply.github.com>
2025-01-13 20:09:52 +08:00
|
|
|
# of seconds to the specified Redis instance. If key is omitted, a random key
|
|
|
|
# is used for every SET command.
|
Improve replication buffering on replica and fix a related bug (#13904)
With RDB channel replication, we introduced parallel replication stream
and RDB delivery to the replica during a full sync. Currently, after the
replica loads the RDB and begins streaming the accumulated buffer to the
database, it does not read from the master connection during this
period. Although streaming the local buffer is generally a fast
operation, it can take some time if the buffer is large. This PR
introduces buffering during the streaming of the local buffer. One
important consideration is ensuring that we consume more than we read
during this operation; otherwise, it could take indefinitely. To
guarantee that it will eventually complete, we limit the read to at most
half of what we consume, e.g. read at most 1 mb once we consume at least
2 mb.
**Additional changes**
**Bug fix**
- Currently, when replica starts draining accumulated buffer, we call
protectClient() for the master client as we occasionally yield back to
event loop via processEventsWhileBlocked(). So, it prevents freeing the
master client. While we are in this loop, if replica receives "replicaof
newmaster" command, we call replicaSetMaster() which expects to free the
master client and trigger a new connection attempt. As the client object
is protected, its destruction will happen asynchronously. Though, a new
connection attempt to new master will be made immediately. Later, when
the replication buffer is drained, we realize master client was marked
as CLOSE_ASAP, and freeing master client triggers another connection
attempt to the new master. In most cases, we realize something is wrong
in the replication state machine and abort the second attempt later. So,
the bug may go undetected. Fix is not calling protectClient() for the
master client. Instead, trying to detect if master client is
disconnected during processEventsWhileBlocked() and if so, breaking the
loop immediately.
**Related improvement:**
- Currently, the replication buffer is a linked list of buffers, each of
which is 1 MB in size. While consuming the buffer, we process one buffer
at a time and check if we need to yield back to
`processEventsWhileBlocked()`. However, if
`loading-process-events-interval-bytes` is set to less than 1 MB, this
approach doesn't handle it well. To improve this, I've modified the code
to process 16KB at a time and check
`loading-process-events-interval-bytes` more frequently. This way,
depending on the configuration, we may yield back to networking more
often.
- In replication.c, `disklessLoadingRio` will be set before a call to
`emptyData()`. This change should not introduce any behavioral change
but it is logically more correct as emptyData() may yield to networking
and we may need to call rioAbort() on disklessLoadingRio. Otherwise,
failure of main channel may go undetected until a failure on rdb channel
on a corner case.
**Config changes**
- The default value for the `loading-process-events-interval-bytes`
configuration is being lowered from 2MB to 512KB. This configuration
primarily used for testing and controls the frequency of networking
during the loading phase, specifically when loading the RDB or applying
accumulated buffers during a full sync on the replica side.
Before the introduction of RDB channel replication, the 2MB value was
sufficient for occasionally yielding to networking, mainly to reply
-loading to the clients. However, with RDB channel replication, during a
full sync on the replica side (either while loading the RDB or applying
the accumulated buffer), we need to yield back to networking more
frequently to continue accumulating the replication stream. If this
doesn’t happen often enough, the replication stream can accumulate on
the master side, which is undesirable.
To address this, we’ve decided to lower the default value to 512KB. One
concern with frequent yielding to networking is the potential
performance impact, as each call to processEventsWhileBlocked() involves
4 syscalls, which could slow down the RDB loading phase. However,
benchmarking with various configuration values has shown that using
512KB or higher does not negatively impact RDB loading performance.
Based on these results, 512KB is now selected as the default value.
**Test changes**
- Added improved version of a replication test which checks memory usage
on master during full sync.
---------
Co-authored-by: Oran Agra <oran@redislabs.com>
2025-04-03 15:04:29 +08:00
|
|
|
proc start_write_load {host port seconds {key ""} {size 0} {sleep 0}} {
|
2014-07-10 17:24:59 +08:00
|
|
|
set tclsh [info nameofexecutable]
|
Improve replication buffering on replica and fix a related bug (#13904)
With RDB channel replication, we introduced parallel replication stream
and RDB delivery to the replica during a full sync. Currently, after the
replica loads the RDB and begins streaming the accumulated buffer to the
database, it does not read from the master connection during this
period. Although streaming the local buffer is generally a fast
operation, it can take some time if the buffer is large. This PR
introduces buffering during the streaming of the local buffer. One
important consideration is ensuring that we consume more than we read
during this operation; otherwise, it could take indefinitely. To
guarantee that it will eventually complete, we limit the read to at most
half of what we consume, e.g. read at most 1 mb once we consume at least
2 mb.
**Additional changes**
**Bug fix**
- Currently, when replica starts draining accumulated buffer, we call
protectClient() for the master client as we occasionally yield back to
event loop via processEventsWhileBlocked(). So, it prevents freeing the
master client. While we are in this loop, if replica receives "replicaof
newmaster" command, we call replicaSetMaster() which expects to free the
master client and trigger a new connection attempt. As the client object
is protected, its destruction will happen asynchronously. Though, a new
connection attempt to new master will be made immediately. Later, when
the replication buffer is drained, we realize master client was marked
as CLOSE_ASAP, and freeing master client triggers another connection
attempt to the new master. In most cases, we realize something is wrong
in the replication state machine and abort the second attempt later. So,
the bug may go undetected. Fix is not calling protectClient() for the
master client. Instead, trying to detect if master client is
disconnected during processEventsWhileBlocked() and if so, breaking the
loop immediately.
**Related improvement:**
- Currently, the replication buffer is a linked list of buffers, each of
which is 1 MB in size. While consuming the buffer, we process one buffer
at a time and check if we need to yield back to
`processEventsWhileBlocked()`. However, if
`loading-process-events-interval-bytes` is set to less than 1 MB, this
approach doesn't handle it well. To improve this, I've modified the code
to process 16KB at a time and check
`loading-process-events-interval-bytes` more frequently. This way,
depending on the configuration, we may yield back to networking more
often.
- In replication.c, `disklessLoadingRio` will be set before a call to
`emptyData()`. This change should not introduce any behavioral change
but it is logically more correct as emptyData() may yield to networking
and we may need to call rioAbort() on disklessLoadingRio. Otherwise,
failure of main channel may go undetected until a failure on rdb channel
on a corner case.
**Config changes**
- The default value for the `loading-process-events-interval-bytes`
configuration is being lowered from 2MB to 512KB. This configuration
primarily used for testing and controls the frequency of networking
during the loading phase, specifically when loading the RDB or applying
accumulated buffers during a full sync on the replica side.
Before the introduction of RDB channel replication, the 2MB value was
sufficient for occasionally yielding to networking, mainly to reply
-loading to the clients. However, with RDB channel replication, during a
full sync on the replica side (either while loading the RDB or applying
the accumulated buffer), we need to yield back to networking more
frequently to continue accumulating the replication stream. If this
doesn’t happen often enough, the replication stream can accumulate on
the master side, which is undesirable.
To address this, we’ve decided to lower the default value to 512KB. One
concern with frequent yielding to networking is the potential
performance impact, as each call to processEventsWhileBlocked() involves
4 syscalls, which could slow down the RDB loading phase. However,
benchmarking with various configuration values has shown that using
512KB or higher does not negatively impact RDB loading performance.
Based on these results, 512KB is now selected as the default value.
**Test changes**
- Added improved version of a replication test which checks memory usage
on master during full sync.
---------
Co-authored-by: Oran Agra <oran@redislabs.com>
2025-04-03 15:04:29 +08:00
|
|
|
exec $tclsh tests/helpers/gen_write_load.tcl $host $port $seconds $::tls $key $size $sleep &
|
2014-07-10 17:24:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
# Stop a process generating write load executed with start_write_load.
|
|
|
|
proc stop_write_load {handle} {
|
|
|
|
catch {exec /bin/kill -9 $handle}
|
|
|
|
}
|
2018-07-13 23:51:03 +08:00
|
|
|
|
stabilize tests that involved with load handlers (#8967)
When test stop 'load handler' by killing the process that generating the load,
some commands that already in the input buffer, still might be processed by the server.
This may cause some instability in tests, that count on that no more commands
processed after we stop the `load handler'
In this commit, new proc 'wait_load_handlers_disconnected' added, to verify that no more
cammands from any 'load handler' prossesed, by checking that the clients who
genreate the load is disconnceted.
Also, replacing check of dbsize with wait_for_ofs_sync before comparing debug digest, as
it would fail in case the last key the workload wrote was an overridden key (not a new one).
Affected tests
Race fix:
- failover command to specific replica works
- Connect multiple replicas at the same time (issue #141), master diskless=$mdl, replica diskless=$sdl
- AOF rewrite during write load: RDB preamble=$rdbpre
Cleanup and speedup:
- Test replication with blocking lists and sorted sets operations
- Test replication with parallel clients writing in different DBs
- Test replication partial resync: $descr (diskless: $mdl, $sdl, reconnect: $reconnect
2021-05-20 20:29:43 +08:00
|
|
|
proc wait_load_handlers_disconnected {{level 0}} {
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
![string match {*name=LOAD_HANDLER*} [r $level client list]]
|
|
|
|
} else {
|
|
|
|
fail "load_handler(s) still connected after too long time."
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-13 23:52:39 +08:00
|
|
|
proc K { x y } { set x }
|
|
|
|
|
2021-03-29 18:52:02 +08:00
|
|
|
# Shuffle a list with Fisher-Yates algorithm.
|
2018-07-13 23:51:03 +08:00
|
|
|
proc lshuffle {list} {
|
|
|
|
set n [llength $list]
|
2021-03-29 18:52:02 +08:00
|
|
|
while {$n>1} {
|
2018-07-13 23:51:03 +08:00
|
|
|
set j [expr {int(rand()*$n)}]
|
|
|
|
incr n -1
|
2021-03-29 18:52:02 +08:00
|
|
|
if {$n==$j} continue
|
|
|
|
set v [lindex $list $j]
|
|
|
|
lset list $j [lindex $list $n]
|
|
|
|
lset list $n $v
|
2018-07-13 23:51:03 +08:00
|
|
|
}
|
2021-03-29 18:52:02 +08:00
|
|
|
return $list
|
2018-07-13 23:51:03 +08:00
|
|
|
}
|
2019-07-01 20:22:29 +08:00
|
|
|
|
|
|
|
# Execute a background process writing complex data for the specified number
|
|
|
|
# of ops to the specified Redis instance.
|
|
|
|
proc start_bg_complex_data {host port db ops} {
|
|
|
|
set tclsh [info nameofexecutable]
|
2019-09-12 15:56:54 +08:00
|
|
|
exec $tclsh tests/helpers/bg_complex_data.tcl $host $port $db $ops $::tls &
|
2019-07-01 20:22:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
# Stop a process generating write load executed with start_bg_complex_data.
|
|
|
|
proc stop_bg_complex_data {handle} {
|
|
|
|
catch {exec /bin/kill -9 $handle}
|
|
|
|
}
|
2020-09-03 13:47:29 +08:00
|
|
|
|
2022-01-02 15:50:15 +08:00
|
|
|
# Write num keys with the given key prefix and value size (in bytes). If idx is
|
|
|
|
# given, it's the index (AKA level) used with the srv procedure and it specifies
|
|
|
|
# to which Redis instance to write the keys.
|
Replace cluster metadata with slot specific dictionaries (#11695)
This is an implementation of https://github.com/redis/redis/issues/10589 that eliminates 16 bytes per entry in cluster mode, that are currently used to create a linked list between entries in the same slot. Main idea is splitting main dictionary into 16k smaller dictionaries (one per slot), so we can perform all slot specific operations, such as iteration, without any additional info in the `dictEntry`. For Redis cluster, the expectation is that there will be a larger number of keys, so the fixed overhead of 16k dictionaries will be The expire dictionary is also split up so that each slot is logically decoupled, so that in subsequent revisions we will be able to atomically flush a slot of data.
## Important changes
* Incremental rehashing - one big change here is that it's not one, but rather up to 16k dictionaries that can be rehashing at the same time, in order to keep track of them, we introduce a separate queue for dictionaries that are rehashing. Also instead of rehashing a single dictionary, cron job will now try to rehash as many as it can in 1ms.
* getRandomKey - now needs to not only select a random key, from the random bucket, but also needs to select a random dictionary. Fairness is a major concern here, as it's possible that keys can be unevenly distributed across the slots. In order to address this search we introduced binary index tree). With that data structure we are able to efficiently find a random slot using binary search in O(log^2(slot count)) time.
* Iteration efficiency - when iterating dictionary with a lot of empty slots, we want to skip them efficiently. We can do this using same binary index that is used for random key selection, this index allows us to find a slot for a specific key index. For example if there are 10 keys in the slot 0, then we can quickly find a slot that contains 11th key using binary search on top of the binary index tree.
* scan API - in order to perform a scan across the entire DB, the cursor now needs to not only save position within the dictionary but also the slot id. In this change we append slot id into LSB of the cursor so it can be passed around between client and the server. This has interesting side effect, now you'll be able to start scanning specific slot by simply providing slot id as a cursor value. The plan is to not document this as defined behavior, however. It's also worth nothing the SCAN API is now technically incompatible with previous versions, although practically we don't believe it's an issue.
* Checksum calculation optimizations - During command execution, we know that all of the keys are from the same slot (outside of a few notable exceptions such as cross slot scripts and modules). We don't want to compute the checksum multiple multiple times, hence we are relying on cached slot id in the client during the command executions. All operations that access random keys, either should pass in the known slot or recompute the slot.
* Slot info in RDB - in order to resize individual dictionaries correctly, while loading RDB, it's not enough to know total number of keys (of course we could approximate number of keys per slot, but it won't be precise). To address this issue, we've added additional metadata into RDB that contains number of keys in each slot, which can be used as a hint during loading.
* DB size - besides `DBSIZE` API, we need to know size of the DB in many places want, in order to avoid scanning all dictionaries and summing up their sizes in a loop, we've introduced a new field into `redisDb` that keeps track of `key_count`. This way we can keep DBSIZE operation O(1). This is also kept for O(1) expires computation as well.
## Performance
This change improves SET performance in cluster mode by ~5%, most of the gains come from us not having to maintain linked lists for keys in slot, non-cluster mode has same performance. For workloads that rely on evictions, the performance is similar because of the extra overhead for finding keys to evict.
RDB loading performance is slightly reduced, as the slot of each key needs to be computed during the load.
## Interface changes
* Removed `overhead.hashtable.slot-to-keys` to `MEMORY STATS`
* Scan API will now require 64 bits to store the cursor, even on 32 bit systems, as the slot information will be stored.
* New RDB version to support the new op code for SLOT information.
---------
Co-authored-by: Vitaly Arbuzov <arvit@amazon.com>
Co-authored-by: Harkrishn Patro <harkrisp@amazon.com>
Co-authored-by: Roshan Khatri <rvkhatri@amazon.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Oran Agra <oran@redislabs.com>
2023-10-15 14:58:26 +08:00
|
|
|
proc populate {num {prefix key:} {size 3} {idx 0} {prints false} {expires 0}} {
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 14:19:21 +08:00
|
|
|
r $idx deferred 1
|
|
|
|
if {$num > 16} {set pipeline 16} else {set pipeline $num}
|
|
|
|
set val [string repeat A $size]
|
|
|
|
for {set j 0} {$j < $pipeline} {incr j} {
|
Replace cluster metadata with slot specific dictionaries (#11695)
This is an implementation of https://github.com/redis/redis/issues/10589 that eliminates 16 bytes per entry in cluster mode, that are currently used to create a linked list between entries in the same slot. Main idea is splitting main dictionary into 16k smaller dictionaries (one per slot), so we can perform all slot specific operations, such as iteration, without any additional info in the `dictEntry`. For Redis cluster, the expectation is that there will be a larger number of keys, so the fixed overhead of 16k dictionaries will be The expire dictionary is also split up so that each slot is logically decoupled, so that in subsequent revisions we will be able to atomically flush a slot of data.
## Important changes
* Incremental rehashing - one big change here is that it's not one, but rather up to 16k dictionaries that can be rehashing at the same time, in order to keep track of them, we introduce a separate queue for dictionaries that are rehashing. Also instead of rehashing a single dictionary, cron job will now try to rehash as many as it can in 1ms.
* getRandomKey - now needs to not only select a random key, from the random bucket, but also needs to select a random dictionary. Fairness is a major concern here, as it's possible that keys can be unevenly distributed across the slots. In order to address this search we introduced binary index tree). With that data structure we are able to efficiently find a random slot using binary search in O(log^2(slot count)) time.
* Iteration efficiency - when iterating dictionary with a lot of empty slots, we want to skip them efficiently. We can do this using same binary index that is used for random key selection, this index allows us to find a slot for a specific key index. For example if there are 10 keys in the slot 0, then we can quickly find a slot that contains 11th key using binary search on top of the binary index tree.
* scan API - in order to perform a scan across the entire DB, the cursor now needs to not only save position within the dictionary but also the slot id. In this change we append slot id into LSB of the cursor so it can be passed around between client and the server. This has interesting side effect, now you'll be able to start scanning specific slot by simply providing slot id as a cursor value. The plan is to not document this as defined behavior, however. It's also worth nothing the SCAN API is now technically incompatible with previous versions, although practically we don't believe it's an issue.
* Checksum calculation optimizations - During command execution, we know that all of the keys are from the same slot (outside of a few notable exceptions such as cross slot scripts and modules). We don't want to compute the checksum multiple multiple times, hence we are relying on cached slot id in the client during the command executions. All operations that access random keys, either should pass in the known slot or recompute the slot.
* Slot info in RDB - in order to resize individual dictionaries correctly, while loading RDB, it's not enough to know total number of keys (of course we could approximate number of keys per slot, but it won't be precise). To address this issue, we've added additional metadata into RDB that contains number of keys in each slot, which can be used as a hint during loading.
* DB size - besides `DBSIZE` API, we need to know size of the DB in many places want, in order to avoid scanning all dictionaries and summing up their sizes in a loop, we've introduced a new field into `redisDb` that keeps track of `key_count`. This way we can keep DBSIZE operation O(1). This is also kept for O(1) expires computation as well.
## Performance
This change improves SET performance in cluster mode by ~5%, most of the gains come from us not having to maintain linked lists for keys in slot, non-cluster mode has same performance. For workloads that rely on evictions, the performance is similar because of the extra overhead for finding keys to evict.
RDB loading performance is slightly reduced, as the slot of each key needs to be computed during the load.
## Interface changes
* Removed `overhead.hashtable.slot-to-keys` to `MEMORY STATS`
* Scan API will now require 64 bits to store the cursor, even on 32 bit systems, as the slot information will be stored.
* New RDB version to support the new op code for SLOT information.
---------
Co-authored-by: Vitaly Arbuzov <arvit@amazon.com>
Co-authored-by: Harkrishn Patro <harkrisp@amazon.com>
Co-authored-by: Roshan Khatri <rvkhatri@amazon.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Oran Agra <oran@redislabs.com>
2023-10-15 14:58:26 +08:00
|
|
|
if {$expires > 0} {
|
|
|
|
r $idx set $prefix$j $val ex $expires
|
|
|
|
} else {
|
|
|
|
r $idx set $prefix$j $val
|
|
|
|
}
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 14:19:21 +08:00
|
|
|
if {$prints} {puts $j}
|
2020-09-03 13:47:29 +08:00
|
|
|
}
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 14:19:21 +08:00
|
|
|
for {} {$j < $num} {incr j} {
|
Replace cluster metadata with slot specific dictionaries (#11695)
This is an implementation of https://github.com/redis/redis/issues/10589 that eliminates 16 bytes per entry in cluster mode, that are currently used to create a linked list between entries in the same slot. Main idea is splitting main dictionary into 16k smaller dictionaries (one per slot), so we can perform all slot specific operations, such as iteration, without any additional info in the `dictEntry`. For Redis cluster, the expectation is that there will be a larger number of keys, so the fixed overhead of 16k dictionaries will be The expire dictionary is also split up so that each slot is logically decoupled, so that in subsequent revisions we will be able to atomically flush a slot of data.
## Important changes
* Incremental rehashing - one big change here is that it's not one, but rather up to 16k dictionaries that can be rehashing at the same time, in order to keep track of them, we introduce a separate queue for dictionaries that are rehashing. Also instead of rehashing a single dictionary, cron job will now try to rehash as many as it can in 1ms.
* getRandomKey - now needs to not only select a random key, from the random bucket, but also needs to select a random dictionary. Fairness is a major concern here, as it's possible that keys can be unevenly distributed across the slots. In order to address this search we introduced binary index tree). With that data structure we are able to efficiently find a random slot using binary search in O(log^2(slot count)) time.
* Iteration efficiency - when iterating dictionary with a lot of empty slots, we want to skip them efficiently. We can do this using same binary index that is used for random key selection, this index allows us to find a slot for a specific key index. For example if there are 10 keys in the slot 0, then we can quickly find a slot that contains 11th key using binary search on top of the binary index tree.
* scan API - in order to perform a scan across the entire DB, the cursor now needs to not only save position within the dictionary but also the slot id. In this change we append slot id into LSB of the cursor so it can be passed around between client and the server. This has interesting side effect, now you'll be able to start scanning specific slot by simply providing slot id as a cursor value. The plan is to not document this as defined behavior, however. It's also worth nothing the SCAN API is now technically incompatible with previous versions, although practically we don't believe it's an issue.
* Checksum calculation optimizations - During command execution, we know that all of the keys are from the same slot (outside of a few notable exceptions such as cross slot scripts and modules). We don't want to compute the checksum multiple multiple times, hence we are relying on cached slot id in the client during the command executions. All operations that access random keys, either should pass in the known slot or recompute the slot.
* Slot info in RDB - in order to resize individual dictionaries correctly, while loading RDB, it's not enough to know total number of keys (of course we could approximate number of keys per slot, but it won't be precise). To address this issue, we've added additional metadata into RDB that contains number of keys in each slot, which can be used as a hint during loading.
* DB size - besides `DBSIZE` API, we need to know size of the DB in many places want, in order to avoid scanning all dictionaries and summing up their sizes in a loop, we've introduced a new field into `redisDb` that keeps track of `key_count`. This way we can keep DBSIZE operation O(1). This is also kept for O(1) expires computation as well.
## Performance
This change improves SET performance in cluster mode by ~5%, most of the gains come from us not having to maintain linked lists for keys in slot, non-cluster mode has same performance. For workloads that rely on evictions, the performance is similar because of the extra overhead for finding keys to evict.
RDB loading performance is slightly reduced, as the slot of each key needs to be computed during the load.
## Interface changes
* Removed `overhead.hashtable.slot-to-keys` to `MEMORY STATS`
* Scan API will now require 64 bits to store the cursor, even on 32 bit systems, as the slot information will be stored.
* New RDB version to support the new op code for SLOT information.
---------
Co-authored-by: Vitaly Arbuzov <arvit@amazon.com>
Co-authored-by: Harkrishn Patro <harkrisp@amazon.com>
Co-authored-by: Roshan Khatri <rvkhatri@amazon.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Oran Agra <oran@redislabs.com>
2023-10-15 14:58:26 +08:00
|
|
|
if {$expires > 0} {
|
|
|
|
r $idx set $prefix$j $val ex $expires
|
|
|
|
} else {
|
|
|
|
r $idx set $prefix$j $val
|
|
|
|
}
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 14:19:21 +08:00
|
|
|
r $idx read
|
|
|
|
if {$prints} {puts $j}
|
2020-09-03 13:47:29 +08:00
|
|
|
}
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 14:19:21 +08:00
|
|
|
for {set j 0} {$j < $pipeline} {incr j} {
|
|
|
|
r $idx read
|
|
|
|
if {$prints} {puts $j}
|
|
|
|
}
|
|
|
|
r $idx deferred 0
|
2020-09-03 13:47:29 +08:00
|
|
|
}
|
if diskless repl child is killed, make sure to reap the pid (#7742)
Starting redis 6.0 and the changes we made to the diskless master to be
suitable for TLS, I made the master avoid reaping (wait3) the pid of the
child until we know all replicas are done reading their rdb.
I did that in order to avoid a state where the rdb_child_pid is -1 but
we don't yet want to start another fork (still busy serving that data to
replicas).
It turns out that the solution used so far was problematic in case the
fork child was being killed (e.g. by the kernel OOM killer), in that
case there's a chance that we currently disabled the read event on the
rdb pipe, since we're waiting for a replica to become writable again.
and in that scenario the master would have never realized the child
exited, and the replica will remain hung too.
Note that there's no mechanism to detect a hung replica while it's in
rdb transfer state.
The solution here is to add another pipe which is used by the parent to
tell the child it is safe to exit. this mean that when the child exits,
for whatever reason, it is safe to reap it.
Besides that, i'm re-introducing an adjustment to REPLCONF ACK which was
part of #6271 (Accelerate diskless master connections) but was dropped
when that PR was rebased after the TLS fork/pipe changes (5a47794).
Now that RdbPipeCleanup no longer calls checkChildrenDone, and the ACK
has chance to detect that the child exited, it should be the one to call
it so that we don't have to wait for cron (server.hz) to do that.
2020-09-06 21:43:57 +08:00
|
|
|
|
|
|
|
proc get_child_pid {idx} {
|
|
|
|
set pid [srv $idx pid]
|
2020-10-18 19:50:29 +08:00
|
|
|
if {[file exists "/usr/bin/pgrep"]} {
|
2020-09-08 10:45:03 +08:00
|
|
|
set fd [open "|pgrep -P $pid" "r"]
|
|
|
|
set child_pid [string trim [lindex [split [read $fd] \n] 0]]
|
|
|
|
} else {
|
|
|
|
set fd [open "|ps --ppid $pid -o pid" "r"]
|
|
|
|
set child_pid [string trim [lindex [split [read $fd] \n] 1]]
|
|
|
|
}
|
if diskless repl child is killed, make sure to reap the pid (#7742)
Starting redis 6.0 and the changes we made to the diskless master to be
suitable for TLS, I made the master avoid reaping (wait3) the pid of the
child until we know all replicas are done reading their rdb.
I did that in order to avoid a state where the rdb_child_pid is -1 but
we don't yet want to start another fork (still busy serving that data to
replicas).
It turns out that the solution used so far was problematic in case the
fork child was being killed (e.g. by the kernel OOM killer), in that
case there's a chance that we currently disabled the read event on the
rdb pipe, since we're waiting for a replica to become writable again.
and in that scenario the master would have never realized the child
exited, and the replica will remain hung too.
Note that there's no mechanism to detect a hung replica while it's in
rdb transfer state.
The solution here is to add another pipe which is used by the parent to
tell the child it is safe to exit. this mean that when the child exits,
for whatever reason, it is safe to reap it.
Besides that, i'm re-introducing an adjustment to REPLCONF ACK which was
part of #6271 (Accelerate diskless master connections) but was dropped
when that PR was rebased after the TLS fork/pipe changes (5a47794).
Now that RdbPipeCleanup no longer calls checkChildrenDone, and the ACK
has chance to detect that the child exited, it should be the one to call
it so that we don't have to wait for cron (server.hz) to do that.
2020-09-06 21:43:57 +08:00
|
|
|
close $fd
|
|
|
|
|
|
|
|
return $child_pid
|
|
|
|
}
|
2020-10-08 13:33:17 +08:00
|
|
|
|
diskless master, avoid bgsave child hung when fork parent crashes (#11463)
During a diskless sync, if the master main process crashes, the child would
have hung in `write`. This fix closes the read fd on the child side, so that if the
parent crashes, the child will get a write error and exit.
This change also fixes disk-based replication, BGSAVE and AOFRW.
In that case the child wouldn't have been hang, it would have just kept
running until done which may be pointless.
There is a certain degree of risk here. in case there's a BGSAVE child that could
maybe succeed and the parent dies for some reason, the old code would have let
the child keep running and maybe succeed and avoid data loss.
On the other hand, if the parent is restarted, it would have loaded an old rdb file
(or none), and then the child could reach the end and rename the rdb file (data
conflicting with what the parent has), or also have a race with another BGSAVE
child that the new parent started.
Note that i removed a comment saying a write error will be ignored in the child
and handled by the parent (this comment was very old and i don't think relevant).
2022-11-09 16:02:18 +08:00
|
|
|
proc process_is_alive pid {
|
Fix some compile warnings and errors when building with gcc-12 or clang (#12035)
This PR is to fix the compilation warnings and errors generated by the latest
complier toolchain, and to add a new runner of the latest toolchain for daily CI.
## Fix various compilation warnings and errors
1) jemalloc.c
COMPILER: clang-14 with FORTIFY_SOURCE
WARNING:
```
src/jemalloc.c:1028:7: warning: suspicious concatenation of string literals in an array initialization; did you mean to separate the elements with a comma? [-Wstring-concatenation]
"/etc/malloc.conf",
^
src/jemalloc.c:1027:3: note: place parentheses around the string literal to silence warning
"\"name\" of the file referenced by the symbolic link named "
^
```
REASON: the compiler to alert developers to potential issues with string concatenation
that may miss a comma,
just like #9534 which misses a comma.
SOLUTION: use `()` to tell the compiler that these two line strings are continuous.
2) config.h
COMPILER: clang-14 with FORTIFY_SOURCE
WARNING:
```
In file included from quicklist.c:36:
./config.h:319:76: warning: attribute declaration must precede definition [-Wignored-attributes]
char *strcat(char *restrict dest, const char *restrict src) __attribute__((deprecated("please avoid use of unsafe C functions. prefer use of redis_strlcat instead")));
```
REASON: Enabling _FORTIFY_SOURCE will cause the compiler to use `strcpy()` with check,
it results in a deprecated attribute declaration after including <features.h>.
SOLUTION: move the deprecated attribute declaration from config.h to fmacro.h before "#include <features.h>".
3) networking.c
COMPILER: GCC-12
WARNING:
```
networking.c: In function ‘addReplyDouble.part.0’:
networking.c:876:21: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
876 | dbuf[start] = '$';
| ^
networking.c:868:14: note: at offset -5 into destination object ‘dbuf’ of size 5152
868 | char dbuf[MAX_LONG_DOUBLE_CHARS+32];
| ^
networking.c:876:21: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
876 | dbuf[start] = '$';
| ^
networking.c:868:14: note: at offset -6 into destination object ‘dbuf’ of size 5152
868 | char dbuf[MAX_LONG_DOUBLE_CHARS+32];
```
REASON: GCC-12 predicts that digits10() may return 9 or 10 through `return 9 + (v >= 1000000000UL)`.
SOLUTION: add an assert to let the compiler know the possible length;
4) redis-cli.c & redis-benchmark.c
COMPILER: clang-14 with FORTIFY_SOURCE
WARNING:
```
redis-benchmark.c:1621:2: warning: embedding a directive within macro arguments has undefined behavior [-Wembedded-directive] #ifdef USE_OPENSSL
redis-cli.c:3015:2: warning: embedding a directive within macro arguments has undefined behavior [-Wembedded-directive] #ifdef USE_OPENSSL
```
REASON: when _FORTIFY_SOURCE is enabled, the compiler will use the print() with
check, which is a macro. this may result in the use of directives within the macro, which
is undefined behavior.
SOLUTION: move the directives-related code out of `print()`.
5) server.c
COMPILER: gcc-13 with FORTIFY_SOURCE
WARNING:
```
In function 'lookupCommandLogic',
inlined from 'lookupCommandBySdsLogic' at server.c:3139:32:
server.c:3102:66: error: '*(robj **)argv' may be used uninitialized [-Werror=maybe-uninitialized]
3102 | struct redisCommand *base_cmd = dictFetchValue(commands, argv[0]->ptr);
| ~~~~^~~
```
REASON: The compiler thinks that the `argc` returned by `sdssplitlen()` could be 0,
resulting in an empty array of size 0 being passed to lookupCommandLogic.
this should be a false positive, `argc` can't be 0 when strings are not NULL.
SOLUTION: add an assert to let the compiler know that `argc` is positive.
6) sha1.c
COMPILER: gcc-12
WARNING:
```
In function ‘SHA1Update’,
inlined from ‘SHA1Final’ at sha1.c:195:5:
sha1.c:152:13: warning: ‘SHA1Transform’ reading 64 bytes from a region of size 0 [-Wstringop-overread]
152 | SHA1Transform(context->state, &data[i]);
| ^
sha1.c:152:13: note: referencing argument 2 of type ‘const unsigned char[64]’
sha1.c: In function ‘SHA1Final’:
sha1.c:56:6: note: in a call to function ‘SHA1Transform’
56 | void SHA1Transform(uint32_t state[5], const unsigned char buffer[64])
| ^
In function ‘SHA1Update’,
inlined from ‘SHA1Final’ at sha1.c:198:9:
sha1.c:152:13: warning: ‘SHA1Transform’ reading 64 bytes from a region of size 0 [-Wstringop-overread]
152 | SHA1Transform(context->state, &data[i]);
| ^
sha1.c:152:13: note: referencing argument 2 of type ‘const unsigned char[64]’
sha1.c: In function ‘SHA1Final’:
sha1.c:56:6: note: in a call to function ‘SHA1Transform’
56 | void SHA1Transform(uint32_t state[5], const unsigned char buffer[64])
```
REASON: due to the bug[https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80922], when
enable LTO, gcc-12 will not see `diagnostic ignored "-Wstringop-overread"`, resulting in a warning.
SOLUTION: temporarily set SHA1Update to noinline to avoid compiler warnings due
to LTO being enabled until the above gcc bug is fixed.
7) zmalloc.h
COMPILER: GCC-12
WARNING:
```
In function ‘memset’,
inlined from ‘moduleCreateContext’ at module.c:877:5,
inlined from ‘RM_GetDetachedThreadSafeContext’ at module.c:8410:5:
/usr/include/x86_64-linux-gnu/bits/string_fortified.h:59:10: warning: ‘__builtin_memset’ writing 104 bytes into a region of size 0 overflows the destination [-Wstringop-overflow=]
59 | return __builtin___memset_chk (__dest, __ch, __len,
```
REASON: due to the GCC-12 bug [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503],
GCC-12 cannot see alloc_size, which causes GCC to think that the actual size of memory
is 0 when checking with __glibc_objsize0().
SOLUTION: temporarily set malloc-related interfaces to `noinline` to avoid compiler warnings
due to LTO being enabled until the above gcc bug is fixed.
## Other changes
1) Fixed `ps -p [pid]` doesn't output `<defunct>` when using procps 4.x causing `replication
child dies when parent is killed - diskless` test to fail.
2) Add a new fortify CI with GCC-13 and ubuntu-lunar docker image.
2023-04-18 14:53:51 +08:00
|
|
|
if {[catch {exec ps -p $pid -f} err]} {
|
diskless master, avoid bgsave child hung when fork parent crashes (#11463)
During a diskless sync, if the master main process crashes, the child would
have hung in `write`. This fix closes the read fd on the child side, so that if the
parent crashes, the child will get a write error and exit.
This change also fixes disk-based replication, BGSAVE and AOFRW.
In that case the child wouldn't have been hang, it would have just kept
running until done which may be pointless.
There is a certain degree of risk here. in case there's a BGSAVE child that could
maybe succeed and the parent dies for some reason, the old code would have let
the child keep running and maybe succeed and avoid data loss.
On the other hand, if the parent is restarted, it would have loaded an old rdb file
(or none), and then the child could reach the end and rename the rdb file (data
conflicting with what the parent has), or also have a race with another BGSAVE
child that the new parent started.
Note that i removed a comment saying a write error will be ignored in the child
and handled by the parent (this comment was very old and i don't think relevant).
2022-11-09 16:02:18 +08:00
|
|
|
return 0
|
|
|
|
} else {
|
2022-11-13 02:35:34 +08:00
|
|
|
if {[string match "*<defunct>*" $err]} { return 0 }
|
diskless master, avoid bgsave child hung when fork parent crashes (#11463)
During a diskless sync, if the master main process crashes, the child would
have hung in `write`. This fix closes the read fd on the child side, so that if the
parent crashes, the child will get a write error and exit.
This change also fixes disk-based replication, BGSAVE and AOFRW.
In that case the child wouldn't have been hang, it would have just kept
running until done which may be pointless.
There is a certain degree of risk here. in case there's a BGSAVE child that could
maybe succeed and the parent dies for some reason, the old code would have let
the child keep running and maybe succeed and avoid data loss.
On the other hand, if the parent is restarted, it would have loaded an old rdb file
(or none), and then the child could reach the end and rename the rdb file (data
conflicting with what the parent has), or also have a race with another BGSAVE
child that the new parent started.
Note that i removed a comment saying a write error will be ignored in the child
and handled by the parent (this comment was very old and i don't think relevant).
2022-11-09 16:02:18 +08:00
|
|
|
return 1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 14:19:21 +08:00
|
|
|
proc pause_process pid {
|
|
|
|
exec kill -SIGSTOP $pid
|
|
|
|
wait_for_condition 50 100 {
|
|
|
|
[string match {*T*} [lindex [exec ps j $pid] 16]]
|
|
|
|
} else {
|
|
|
|
puts [exec ps j $pid]
|
|
|
|
fail "process didn't stop"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
proc resume_process pid {
|
Rdb channel replication (#13732)
This PR is based on:
https://github.com/redis/redis/pull/12109
https://github.com/valkey-io/valkey/pull/60
Closes: https://github.com/redis/redis/issues/11678
**Motivation**
During a full sync, when master is delivering RDB to the replica,
incoming write commands are kept in a replication buffer in order to be
sent to the replica once RDB delivery is completed. If RDB delivery
takes a long time, it might create memory pressure on master. Also, once
a replica connection accumulates replication data which is larger than
output buffer limits, master will kill replica connection. This may
cause a replication failure.
The main benefit of the rdb channel replication is streaming incoming
commands in parallel to the RDB delivery. This approach shifts
replication stream buffering to the replica and reduces load on master.
We do this by opening another connection for RDB delivery. The main
channel on replica will be receiving replication stream while rdb
channel is receiving the RDB.
This feature also helps to reduce master's main process CPU load. By
opening a dedicated connection for the RDB transfer, the bgsave process
has access to the new connection and it will stream RDB directly to the
replicas. Before this change, due to TLS connection restriction, the
bgsave process was writing RDB bytes to a pipe and the main process was
forwarding
it to the replica. This is no longer necessary, the main process can
avoid these expensive socket read/write syscalls. It also means RDB
delivery to replica will be faster as it avoids this step.
In summary, replication will be faster and master's performance during
full syncs will improve.
**Implementation steps**
1. When replica connects to the master, it sends 'rdb-channel-repl' as
part of capability exchange to let master to know replica supports rdb
channel.
2. When replica lacks sufficient data for PSYNC, master sends
+RDBCHANNELSYNC reply with replica's client id. As the next step, the
replica opens a new connection (rdb-channel) and configures it against
the master with the appropriate capabilities and requirements. It also
sends given client id back to master over rdbchannel, so that master can
associate these channels. (initial replica connection will be referred
as main-channel) Then, replica requests fullsync using the RDB channel.
3. Prior to forking, master attaches the replica's main channel to the
replication backlog to deliver replication stream starting at the
snapshot end offset.
4. The master main process sends replication stream via the main
channel, while the bgsave process sends the RDB directly to the replica
via the rdb-channel. Replica accumulates replication stream in a local
buffer, while the RDB is being loaded into the memory.
5. Once the replica completes loading the rdb, it drops the rdb channel
and streams the accumulated replication stream into the db. Sync is
completed.
**Some details**
- Currently, rdbchannel replication is supported only if
`repl-diskless-sync` is enabled on master. Otherwise, replication will
happen over a single connection as in before.
- On replica, there is a limit to replication stream buffering. Replica
uses a new config `replica-full-sync-buffer-limit` to limit number of
bytes to accumulate. If it is not set, replica inherits
`client-output-buffer-limit <replica>` hard limit config. If we reach
this limit, replica stops accumulating. This is not a failure scenario
though. Further accumulation will happen on master side. Depending on
the configured limits on master, master may kill the replica connection.
**API changes in INFO output:**
1. New replica state: `send_bulk_and_stream`. Indicates full sync is
still in progress for this replica. It is receiving replication stream
and rdb in parallel.
```
slave0:ip=127.0.0.1,port=5002,state=send_bulk_and_stream,offset=0,lag=0
```
Replica state changes in steps:
- First, replica sends psync and receives +RDBCHANNELSYNC
:`state=wait_bgsave`
- After replica connects with rdbchannel and delivery starts:
`state=send_bulk_and_stream`
- After full sync: `state=online`
2. On replica side, replication stream buffering metrics:
- replica_full_sync_buffer_size: Currently accumulated replication
stream data in bytes.
- replica_full_sync_buffer_peak: Peak number of bytes that this instance
accumulated in the lifetime of the process.
```
replica_full_sync_buffer_size:20485
replica_full_sync_buffer_peak:1048560
```
**API changes in CLIENT LIST**
In `client list` output, rdbchannel clients will have 'C' flag in
addition to 'S' replica flag:
```
id=11 addr=127.0.0.1:39108 laddr=127.0.0.1:5001 fd=14 name= age=5 idle=5 flags=SC db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=0 multi-mem=0 rbs=1024 rbp=0 obl=0 oll=0 omem=0 tot-mem=1920 events=r cmd=psync user=default redir=-1 resp=2 lib-name= lib-ver= io-thread=0
```
**Config changes:**
- `replica-full-sync-buffer-limit`: Controls how much replication data
replica can accumulate during rdbchannel replication. If it is not set,
a value of 0 means replica will inherit `client-output-buffer-limit
<replica>` hard limit config to limit accumulated data.
- `repl-rdb-channel` config is added as a hidden config. This is mostly
for testing as we need to support both rdbchannel replication and the
older single connection replication (to keep compatibility with older
versions and rdbchannel replication will not be enabled if
repl-diskless-sync is not enabled). it affects both the master (not to
respond to rdb channel requests), and the replica (not to declare
capability)
**Internal API changes:**
Changes that were introduced to Redis replication:
- New replication capability is added to replconf command: `capa
rdb-channel-repl`. Indicates replica is capable of rdb channel
replication. Replica sends it when it connects to master along with
other capabilities.
- If replica needs fullsync, master replies `+RDBCHANNELSYNC
<client-id>` to the replica's PSYNC request.
- When replica opens rdbchannel connection, as part of replconf command,
it sends `rdb-channel 1` to let master know this is rdb channel. Also,
it sends `main-ch-client-id <client-id>` as part of replconf command so
master can associate channels.
**Testing:**
As rdbchannel replication is enabled by default, we run whole test suite
with it. Though, as we need to support both rdbchannel and single
connection replication, we'll be running some tests twice with
`repl-rdb-channel yes/no` config.
**Replica state diagram**
```
* * Replica state machine *
*
* Main channel state
* ┌───────────────────┐
* │RECEIVE_PING_REPLY │
* └────────┬──────────┘
* │ +PONG
* ┌────────▼──────────┐
* │SEND_HANDSHAKE │ RDB channel state
* └────────┬──────────┘ ┌───────────────────────────────┐
* │+OK ┌───► RDB_CH_SEND_HANDSHAKE │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_AUTH_REPLY │ │ REPLCONF main-ch-client-id <clientid>
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_AUTH_REPLY │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_PORT_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_REPLCONF_REPLY│
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_IP_REPLY │ │ │ +OK
* └────────┬──────────┘ │ ┌──────────────▼────────────────┐
* │+OK │ │ RDB_CH_RECEIVE_FULLRESYNC │
* ┌────────▼──────────┐ │ └──────────────┬────────────────┘
* │RECEIVE_CAPA_REPLY │ │ │+FULLRESYNC
* └────────┬──────────┘ │ │Rdb delivery
* │ │ ┌──────────────▼────────────────┐
* ┌────────▼──────────┐ │ │ RDB_CH_RDB_LOADING │
* │SEND_PSYNC │ │ └──────────────┬────────────────┘
* └─┬─────────────────┘ │ │ Done loading
* │PSYNC (use cached-master) │ │
* ┌─▼─────────────────┐ │ │
* │RECEIVE_PSYNC_REPLY│ │ ┌────────────►│ Replica streams replication
* └─┬─────────────────┘ │ │ │ buffer into memory
* │ │ │ │
* │+RDBCHANNELSYNC client-id │ │ │
* ├──────┬───────────────────┘ │ │
* │ │ Main channel │ │
* │ │ accumulates repl data │ │
* │ ┌──▼────────────────┐ │ ┌───────▼───────────┐
* │ │ REPL_TRANSFER ├───────┘ │ CONNECTED │
* │ └───────────────────┘ └────▲───▲──────────┘
* │ │ │
* │ │ │
* │ +FULLRESYNC ┌───────────────────┐ │ │
* ├────────────────► REPL_TRANSFER ├────┘ │
* │ └───────────────────┘ │
* │ +CONTINUE │
* └──────────────────────────────────────────────┘
*/
```
-----
This PR also contains changes and ideas from:
https://github.com/valkey-io/valkey/pull/837
https://github.com/valkey-io/valkey/pull/1173
https://github.com/valkey-io/valkey/pull/804
https://github.com/valkey-io/valkey/pull/945
https://github.com/valkey-io/valkey/pull/989
---------
Co-authored-by: Yuan Wang <wangyuancode@163.com>
Co-authored-by: debing.sun <debing.sun@redis.com>
Co-authored-by: Moti Cohen <moticless@gmail.com>
Co-authored-by: naglera <anagler123@gmail.com>
Co-authored-by: Amit Nagler <58042354+naglera@users.noreply.github.com>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Co-authored-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Ping Xie <pingxie@outlook.com>
Co-authored-by: Ran Shidlansik <ranshid@amazon.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: xbasel <103044017+xbasel@users.noreply.github.com>
2025-01-13 20:09:52 +08:00
|
|
|
wait_for_condition 50 1000 {
|
|
|
|
[string match "T*" [exec ps -o state= -p $pid]]
|
|
|
|
} else {
|
|
|
|
puts [exec ps j $pid]
|
|
|
|
fail "process was not stopped"
|
|
|
|
}
|
Attempt to solve MacOS CI issues in GH Actions (#12013)
The MacOS CI in github actions often hangs without any logs. GH argues that
it's due to resource utilization, either running out of disk space, memory, or CPU
starvation, and thus the runner is terminated.
This PR contains multiple attempts to resolve this:
1. introducing pause_process instead of SIGSTOP, which waits for the process
to stop before resuming the test, possibly resolving race conditions in some tests,
this was a suspect since there was one test that could result in an infinite loop in that
case, in practice this didn't help, but still a good idea to keep.
2. disable the `save` config in many tests that don't need it, specifically ones that use
heavy writes and could create large files.
3. change the `populate` proc to use short pipeline rather than an infinite one.
4. use `--clients 1` in the macos CI so that we don't risk running multiple resource
demanding tests in parallel.
5. enable `--verbose` to be repeated to elevate verbosity and print more info to stdout
when a test or a server starts.
2023-04-12 14:19:21 +08:00
|
|
|
exec kill -SIGCONT $pid
|
|
|
|
}
|
|
|
|
|
2020-10-08 13:33:17 +08:00
|
|
|
proc cmdrstat {cmd r} {
|
|
|
|
if {[regexp "\r\ncmdstat_$cmd:(.*?)\r\n" [$r info commandstats] _ value]} {
|
|
|
|
set _ $value
|
|
|
|
}
|
|
|
|
}
|
2020-08-14 21:05:34 +08:00
|
|
|
|
2020-12-31 22:53:43 +08:00
|
|
|
proc errorrstat {cmd r} {
|
|
|
|
if {[regexp "\r\nerrorstat_$cmd:(.*?)\r\n" [$r info errorstats] _ value]} {
|
|
|
|
set _ $value
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Added INFO LATENCYSTATS section: latency by percentile distribution/latency by cumulative distribution of latencies (#9462)
# Short description
The Redis extended latency stats track per command latencies and enables:
- exporting the per-command percentile distribution via the `INFO LATENCYSTATS` command.
**( percentile distribution is not mergeable between cluster nodes ).**
- exporting the per-command cumulative latency distributions via the `LATENCY HISTOGRAM` command.
Using the cumulative distribution of latencies we can merge several stats from different cluster nodes
to calculate aggregate metrics .
By default, the extended latency monitoring is enabled since the overhead of keeping track of the
command latency is very small.
If you don't want to track extended latency metrics, you can easily disable it at runtime using the command:
- `CONFIG SET latency-tracking no`
By default, the exported latency percentiles are the p50, p99, and p999.
You can alter them at runtime using the command:
- `CONFIG SET latency-tracking-info-percentiles "0.0 50.0 100.0"`
## Some details:
- The total size per histogram should sit around 40 KiB. We only allocate those 40KiB when a command
was called for the first time.
- With regards to the WRITE overhead As seen below, there is no measurable overhead on the achievable
ops/sec or full latency spectrum on the client. Including also the measured redis-benchmark for unstable
vs this branch.
- We track from 1 nanosecond to 1 second ( everything above 1 second is considered +Inf )
## `INFO LATENCYSTATS` exposition format
- Format: `latency_percentiles_usec_<CMDNAME>:p0=XX,p50....`
## `LATENCY HISTOGRAM [command ...]` exposition format
Return a cumulative distribution of latencies in the format of a histogram for the specified command names.
The histogram is composed of a map of time buckets:
- Each representing a latency range, between 1 nanosecond and roughly 1 second.
- Each bucket covers twice the previous bucket's range.
- Empty buckets are not printed.
- Everything above 1 sec is considered +Inf.
- At max there will be log2(1000000000)=30 buckets
We reply a map for each command in the format:
`<command name> : { `calls`: <total command calls> , `histogram` : { <bucket 1> : latency , < bucket 2> : latency, ... } }`
Co-authored-by: Oran Agra <oran@redislabs.com>
2022-01-05 20:01:05 +08:00
|
|
|
proc latencyrstat_percentiles {cmd r} {
|
|
|
|
if {[regexp "\r\nlatency_percentiles_usec_$cmd:(.*?)\r\n" [$r info latencystats] _ value]} {
|
|
|
|
set _ $value
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Async IO Threads (#13695)
## Introduction
Redis introduced IO Thread in 6.0, allowing IO threads to handle client
request reading, command parsing and reply writing, thereby improving
performance. The current IO thread implementation has a few drawbacks.
- The main thread is blocked during IO thread read/write operations and
must wait for all IO threads to complete their current tasks before it
can continue execution. In other words, the entire process is
synchronous. This prevents the efficient utilization of multi-core CPUs
for parallel processing.
- When the number of clients and requests increases moderately, it
causes all IO threads to reach full CPU utilization due to the busy wait
mechanism used by the IO threads. This makes it challenging for us to
determine which part of Redis has reached its bottleneck.
- When IO threads are enabled with TLS and io-threads-do-reads, a
disconnection of a connection with pending data may result in it being
assigned to multiple IO threads simultaneously. This can cause race
conditions and trigger assertion failures. Related issue:
redis#12540
Therefore, we designed an asynchronous IO threads solution. The IO
threads adopt an event-driven model, with the main thread dedicated to
command processing, meanwhile, the IO threads handle client read and
write operations in parallel.
## Implementation
### Overall
As before, we did not change the fact that all client commands must be
executed on the main thread, because Redis was originally designed to be
single-threaded, and processing commands in a multi-threaded manner
would inevitably introduce numerous race and synchronization issues. But
now each IO thread has independent event loop, therefore, IO threads can
use a multiplexing approach to handle client read and write operations,
eliminating the CPU overhead caused by busy-waiting.
the execution process can be briefly described as follows:
the main thread assigns clients to IO threads after accepting
connections, IO threads will notify the main thread when clients
finish reading and parsing queries, then the main thread processes
queries from IO threads and generates replies, IO threads handle
writing reply to clients after receiving clients list from main thread,
and then continue to handle client read and write events.
### Each IO thread has independent event loop
We now assign each IO thread its own event loop. This approach
eliminates the need for the main thread to perform the costly
`epoll_wait` operation for handling connections (except for specific
ones). Instead, the main thread processes requests from the IO threads
and hands them back once completed, fully offloading read and write
events to the IO threads.
Additionally, all TLS operations, including handling pending data, have
been moved entirely to the IO threads. This resolves the issue where
io-threads-do-reads could not be used with TLS.
### Event-notified client queue
To facilitate communication between the IO threads and the main thread,
we designed an event-notified client queue. Each IO thread and the main
thread have two such queues to store clients waiting to be processed.
These queues are also integrated with the event loop to enable handling.
We use pthread_mutex to ensure the safety of queue operations, as well
as data visibility and ordering, and race conditions are minimized, as
each IO thread and the main thread operate on independent queues,
avoiding thread suspension due to lock contention. And we implemented an
event notifier based on `eventfd` or `pipe` to support event-driven
handling.
### Thread safety
Since the main thread and IO threads can execute in parallel, we must
handle data race issues carefully.
**client->flags**
The primary tasks of IO threads are reading and writing, i.e.
`readQueryFromClient` and `writeToClient`. However, IO threads and the
main thread may concurrently modify or access `client->flags`, leading
to potential race conditions. To address this, we introduced an io-flags
variable to record operations performed by IO threads, thereby avoiding
race conditions on `client->flags`.
**Pause IO thread**
In the main thread, we may want to operate data of IO threads, maybe
uninstall event handler, access or operate query/output buffer or resize
event loop, we need a clean and safe context to do that. We pause IO
thread in `IOThreadBeforeSleep`, do some jobs and then resume it. To
avoid thread suspended, we use busy waiting to confirm the target
status. Besides we use atomic variable to make sure memory visibility
and ordering. We introduce these functions to pause/resume IO Threads as
below.
```
pauseIOThread, resumeIOThread
pauseAllIOThreads, resumeAllIOThreads
pauseIOThreadsRange, resumeIOThreadsRange
```
Testing has shown that `pauseIOThread` is highly efficient, allowing the
main thread to execute nearly 200,000 operations per second during
stress tests. Similarly, `pauseAllIOThreads` with 8 IO threads can
handle up to nearly 56,000 operations per second. But operations
performed between pausing and resuming IO threads must be quick;
otherwise, they could cause the IO threads to reach full CPU
utilization.
**freeClient and freeClientAsync**
The main thread may need to terminate a client currently running on an
IO thread, for example, due to ACL rule changes, reaching the output
buffer limit, or evicting a client. In such cases, we need to pause the
IO thread to safely operate on the client.
**maxclients and maxmemory-clients updating**
When adjusting `maxclients`, we need to resize the event loop for all IO
threads. Similarly, when modifying `maxmemory-clients`, we need to
traverse all clients to calculate their memory usage. To ensure safe
operations, we pause all IO threads during these adjustments.
**Client info reading**
The main thread may need to read a client’s fields to generate a
descriptive string, such as for the `CLIENT LIST` command or logging
purposes. In such cases, we need to pause the IO thread handling that
client. If information for all clients needs to be displayed, all IO
threads must be paused.
**Tracking redirect**
Redis supports the tracking feature and can even send invalidation
messages to a connection with a specified ID. But the target client may
be running on IO thread, directly manipulating the client’s output
buffer is not thread-safe, and the IO thread may not be aware that the
client requires a response. In such cases, we pause the IO thread
handling the client, modify the output buffer, and install a write event
handler to ensure proper handling.
**clientsCron**
In the `clientsCron` function, the main thread needs to traverse all
clients to perform operations such as timeout checks, verifying whether
they have reached the soft output buffer limit, resizing the
output/query buffer, or updating memory usage. To safely operate on a
client, the IO thread handling that client must be paused.
If we were to pause the IO thread for each client individually, the
efficiency would be very low. Conversely, pausing all IO threads
simultaneously would be costly, especially when there are many IO
threads, as clientsCron is invoked relatively frequently.
To address this, we adopted a batched approach for pausing IO threads.
At most, 8 IO threads are paused at a time. The operations mentioned
above are only performed on clients running in the paused IO threads,
significantly reducing overhead while maintaining safety.
### Observability
In the current design, the main thread always assigns clients to the IO
thread with the least clients. To clearly observe the number of clients
handled by each IO thread, we added the new section in INFO output. The
`INFO THREADS` section can show the client count for each IO thread.
```
# Threads
io_thread_0:clients=0
io_thread_1:clients=2
io_thread_2:clients=2
```
Additionally, in the `CLIENT LIST` output, we also added a field to
indicate the thread to which each client is assigned.
`id=244 addr=127.0.0.1:41870 laddr=127.0.0.1:6379 ... resp=2 lib-name=
lib-ver= io-thread=1`
## Trade-off
### Special Clients
For certain special types of clients, keeping them running on IO threads
would result in severe race issues that are difficult to resolve.
Therefore, we chose not to offload these clients to the IO threads.
For replica, monitor, subscribe, and tracking clients, main thread may
directly write them a reply when conditions are met. Race issues are
difficult to resolve, so we have them processed in the main thread. This
includes the Lua debug clients as well, since we may operate connection
directly.
For blocking client, after the IO thread reads and parses a command and
hands it over to the main thread, if the client is identified as a
blocking type, it will be remained in the main thread. Once the blocking
operation completes and the reply is generated, the client is
transferred back to the IO thread to send the reply and wait for event
triggers.
### Clients Eviction
To support client eviction, it is necessary to update each client’s
memory usage promptly during operations such as read, write, or command
execution. However, when a client operates on an IO thread, it is not
feasible to update the memory usage immediately due to the risk of data
races. As a result, memory usage can only be updated either in the main
thread while processing commands or in the `ClientsCron` periodically.
The downside of this approach is that updates might experience a delay
of up to one second, which could impact the precision of memory
management for eviction.
To avoid incorrectly evicting clients. We adopted a best-effort
compensation solution, when we decide to eviction a client, we update
its memory usage again before evicting, if the memory used by the client
does not decrease or memory usage bucket is not changed, then we will
evict it, otherwise, not evict it.
However, we have not completely solved this problem. Due to the delay in
memory usage updates, it may lead us to make incorrect decisions about
the need to evict clients.
### Defragment
In the majority of cases we do NOT use the data from argv directly in
the db.
1. key names
We store a copy that we allocate in the main thread, see `sdsdup()` in
`dbAdd()`.
2. hash key and value
We store key as hfield and store value as sds, see `hfieldNew()` and
`sdsdup()` in `hashTypeSet()`.
3. other datatypes
They don't even use SDS, so there is no reference issues.
But in some cases client the data from argv may be retain by the main
thread.
As a result, during fragmentation cleanup, we need to move allocations
from the IO thread’s arena to the main thread’s arena. We always
allocate new memory in the main thread’s arena, but the memory released
by IO threads may not yet have been reclaimed. This ultimately causes
the fragmentation rate to be higher compared to creating and allocating
entirely within a single thread.
The following cases below will lead to memory allocated by the IO thread
being kept by the main thread.
1. string related command: `append`, `getset`, `mset` and `set`.
If `tryObjectEncoding()` does not change argv, we will keep it directly
in the main thread, see the code in `tryObjectEncoding()`(specifically
`trimStringObjectIfNeeded()`)
2. block related command.
the key names will be kept in `c->db->blocking_keys`.
3. watch command
the key names will be kept in `c->db->watched_keys`.
4. [s]subscribe command
channel name will be kept in `serverPubSubChannels`.
5. script load command
script will be kept in `server.lua_scripts`.
7. some module API: `RM_RetainString`, `RM_HoldString`
Those issues will be handled in other PRs.
## Testing
### Functional Testing
The commit with enabling IO Threads has passed all TCL tests, but we did
some changes:
**Client query buffer**: In the original code, when using a reusable
query buffer, ownership of the query buffer would be released after the
command was processed. However, with IO threads enabled, the client
transitions from an IO thread to the main thread for processing. This
causes the ownership release to occur earlier than the command
execution. As a result, when IO threads are enabled, the client's
information will never indicate that a shared query buffer is in use.
Therefore, we skip the corresponding query buffer tests in this case.
**Defragment**: Add a new defragmentation test to verify the effect of
io threads on defragmentation.
**Command delay**: For deferred clients in TCL tests, due to clients
being assigned to different threads for execution, delays may occur. To
address this, we introduced conditional waiting: the process proceeds to
the next step only when the `client list` contains the corresponding
commands.
### Sanitizer Testing
The commit passed all TCL tests and reported no errors when compiled
with the `fsanitizer=thread` and `fsanitizer=address` options enabled.
But we made the following modifications: we suppressed the sanitizer
warnings for clients with watched keys when updating `client->flags`, we
think IO threads read `client->flags`, but never modify it or read the
`CLIENT_DIRTY_CAS` bit, main thread just only modifies this bit, so
there is no actual data race.
## Others
### IO thread number
In the new multi-threaded design, the main thread is primarily focused
on command processing to improve performance. Typically, the main thread
does not handle regular client I/O operations but is responsible for
clients such as replication and tracking clients. To avoid breaking
changes, we still consider the main thread as the first IO thread.
When the io-threads configuration is set to a low value (e.g., 2),
performance does not show a significant improvement compared to a
single-threaded setup for simple commands (such as SET or GET), as the
main thread does not consume much CPU for these simple operations. This
results in underutilized multi-core capacity. However, for more complex
commands, having a low number of IO threads may still be beneficial.
Therefore, it’s important to adjust the `io-threads` based on your own
performance tests.
Additionally, you can clearly monitor the CPU utilization of the main
thread and IO threads using `top -H -p $redis_pid`. This allows you to
easily identify where the bottleneck is. If the IO thread is the
bottleneck, increasing the `io-threads` will improve performance. If the
main thread is the bottleneck, the overall performance can only be
scaled by increasing the number of shards or replicas.
---------
Co-authored-by: debing.sun <debing.sun@redis.com>
Co-authored-by: oranagra <oran@redislabs.com>
2024-12-23 14:16:40 +08:00
|
|
|
proc get_io_thread_clients {id {client r}} {
|
|
|
|
set pattern "io_thread_$id:clients=(\[0-9\]+)"
|
|
|
|
set info [$client info threads]
|
|
|
|
if {[regexp $pattern $info _ value]} {
|
|
|
|
return $value
|
|
|
|
} else {
|
|
|
|
return -1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-07-31 08:15:39 +08:00
|
|
|
proc generate_fuzzy_traffic_on_key {key type duration} {
|
2020-08-14 21:05:34 +08:00
|
|
|
# Commands per type, blocking commands removed
|
2022-02-05 22:54:16 +08:00
|
|
|
# TODO: extract these from COMMAND DOCS, and improve to include other types
|
2021-11-18 16:47:49 +08:00
|
|
|
set string_commands {APPEND BITCOUNT BITFIELD BITOP BITPOS DECR DECRBY GET GETBIT GETRANGE GETSET INCR INCRBY INCRBYFLOAT MGET MSET MSETNX PSETEX SET SETBIT SETEX SETNX SETRANGE LCS STRLEN}
|
2021-01-29 16:47:28 +08:00
|
|
|
set hash_commands {HDEL HEXISTS HGET HGETALL HINCRBY HINCRBYFLOAT HKEYS HLEN HMGET HMSET HSCAN HSET HSETNX HSTRLEN HVALS HRANDFIELD}
|
|
|
|
set zset_commands {ZADD ZCARD ZCOUNT ZINCRBY ZINTERSTORE ZLEXCOUNT ZPOPMAX ZPOPMIN ZRANGE ZRANGEBYLEX ZRANGEBYSCORE ZRANK ZREM ZREMRANGEBYLEX ZREMRANGEBYRANK ZREMRANGEBYSCORE ZREVRANGE ZREVRANGEBYLEX ZREVRANGEBYSCORE ZREVRANK ZSCAN ZSCORE ZUNIONSTORE ZRANDMEMBER}
|
2020-08-14 21:05:34 +08:00
|
|
|
set list_commands {LINDEX LINSERT LLEN LPOP LPOS LPUSH LPUSHX LRANGE LREM LSET LTRIM RPOP RPOPLPUSH RPUSH RPUSHX}
|
|
|
|
set set_commands {SADD SCARD SDIFF SDIFFSTORE SINTER SINTERSTORE SISMEMBER SMEMBERS SMOVE SPOP SRANDMEMBER SREM SSCAN SUNION SUNIONSTORE}
|
|
|
|
set stream_commands {XACK XADD XCLAIM XDEL XGROUP XINFO XLEN XPENDING XRANGE XREAD XREADGROUP XREVRANGE XTRIM}
|
2025-05-08 02:49:00 +08:00
|
|
|
set vset_commands {VADD VREM}
|
|
|
|
set commands [dict create string $string_commands hash $hash_commands zset $zset_commands list $list_commands set $set_commands stream $stream_commands vectorset $vset_commands]
|
2020-08-14 21:05:34 +08:00
|
|
|
|
|
|
|
set cmds [dict get $commands $type]
|
|
|
|
set start_time [clock seconds]
|
|
|
|
set sent {}
|
|
|
|
set succeeded 0
|
|
|
|
while {([clock seconds]-$start_time) < $duration} {
|
|
|
|
# find a random command for our key type
|
|
|
|
set cmd_idx [expr {int(rand()*[llength $cmds])}]
|
|
|
|
set cmd [lindex $cmds $cmd_idx]
|
|
|
|
# get the command details from redis
|
|
|
|
if { [ catch {
|
|
|
|
set cmd_info [lindex [r command info $cmd] 0]
|
|
|
|
} err ] } {
|
|
|
|
# if we failed, it means redis crashed after the previous command
|
|
|
|
return $sent
|
|
|
|
}
|
|
|
|
# try to build a valid command argument
|
|
|
|
set arity [lindex $cmd_info 1]
|
|
|
|
set arity [expr $arity < 0 ? - $arity: $arity]
|
|
|
|
set firstkey [lindex $cmd_info 3]
|
2021-08-06 03:57:05 +08:00
|
|
|
set lastkey [lindex $cmd_info 4]
|
2020-08-14 21:05:34 +08:00
|
|
|
set i 1
|
|
|
|
if {$cmd == "XINFO"} {
|
|
|
|
lappend cmd "STREAM"
|
|
|
|
lappend cmd $key
|
|
|
|
lappend cmd "FULL"
|
|
|
|
incr i 3
|
|
|
|
}
|
|
|
|
if {$cmd == "XREAD"} {
|
|
|
|
lappend cmd "STREAMS"
|
|
|
|
lappend cmd $key
|
|
|
|
randpath {
|
|
|
|
lappend cmd \$
|
|
|
|
} {
|
|
|
|
lappend cmd [randomValue]
|
|
|
|
}
|
|
|
|
incr i 3
|
|
|
|
}
|
|
|
|
if {$cmd == "XADD"} {
|
|
|
|
lappend cmd $key
|
|
|
|
randpath {
|
|
|
|
lappend cmd "*"
|
|
|
|
} {
|
|
|
|
lappend cmd [randomValue]
|
|
|
|
}
|
|
|
|
lappend cmd [randomValue]
|
|
|
|
lappend cmd [randomValue]
|
|
|
|
incr i 4
|
|
|
|
}
|
2025-05-08 02:49:00 +08:00
|
|
|
if {$cmd == "VADD"} {
|
|
|
|
lappend cmd $key
|
|
|
|
lappend cmd VALUES 3 1 1 1
|
|
|
|
lappend cmd [randomValue]
|
|
|
|
incr i 7
|
|
|
|
}
|
|
|
|
if {$cmd == "VREM"} {
|
|
|
|
lappend cmd $key
|
|
|
|
lappend cmd [randomValue]
|
|
|
|
incr i 2
|
|
|
|
}
|
|
|
|
|
2020-08-14 21:05:34 +08:00
|
|
|
for {} {$i < $arity} {incr i} {
|
2021-08-06 03:57:05 +08:00
|
|
|
if {$i == $firstkey || $i == $lastkey} {
|
2020-08-14 21:05:34 +08:00
|
|
|
lappend cmd $key
|
|
|
|
} else {
|
|
|
|
lappend cmd [randomValue]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
# execute the command, we expect commands to fail on syntax errors
|
|
|
|
lappend sent $cmd
|
|
|
|
if { ! [ catch {
|
|
|
|
r {*}$cmd
|
|
|
|
} err ] } {
|
|
|
|
incr succeeded
|
2021-04-25 18:08:46 +08:00
|
|
|
} else {
|
|
|
|
set err [format "%s" $err] ;# convert to string for pattern matching
|
|
|
|
if {[string match "*SIGTERM*" $err]} {
|
2022-11-22 17:20:24 +08:00
|
|
|
puts "commands caused test to hang:"
|
|
|
|
foreach cmd $sent {
|
|
|
|
foreach arg $cmd {
|
|
|
|
puts -nonewline "[string2printable $arg] "
|
|
|
|
}
|
|
|
|
puts ""
|
|
|
|
}
|
|
|
|
# Re-raise, let handler up the stack take care of this.
|
|
|
|
error $err $::errorInfo
|
2021-04-25 18:08:46 +08:00
|
|
|
}
|
2020-08-14 21:05:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-22 17:20:24 +08:00
|
|
|
# print stats so that we know if we managed to generate commands that actually made sense
|
2020-08-14 21:05:34 +08:00
|
|
|
#if {$::verbose} {
|
|
|
|
# set count [llength $sent]
|
|
|
|
# puts "Fuzzy traffic sent: $count, succeeded: $succeeded"
|
|
|
|
#}
|
|
|
|
|
|
|
|
# return the list of commands we sent
|
|
|
|
return $sent
|
|
|
|
}
|
|
|
|
|
|
|
|
proc string2printable s {
|
|
|
|
set res {}
|
|
|
|
set has_special_chars false
|
|
|
|
foreach i [split $s {}] {
|
|
|
|
scan $i %c int
|
|
|
|
# non printable characters, including space and excluding: " \ $ { }
|
|
|
|
if {$int < 32 || $int > 122 || $int == 34 || $int == 36 || $int == 92} {
|
|
|
|
set has_special_chars true
|
|
|
|
}
|
|
|
|
# TCL8.5 has issues mixing \x notation and normal chars in the same
|
|
|
|
# source code string, so we'll convert the entire string.
|
|
|
|
append res \\x[format %02X $int]
|
|
|
|
}
|
|
|
|
if {!$has_special_chars} {
|
|
|
|
return $s
|
|
|
|
}
|
|
|
|
set res "\"$res\""
|
|
|
|
return $res
|
|
|
|
}
|
2021-02-05 21:56:20 +08:00
|
|
|
|
2021-04-01 13:20:15 +08:00
|
|
|
# Calculation value of Chi-Square Distribution. By this value
|
|
|
|
# we can verify the random distribution sample confidence.
|
|
|
|
# Based on the following wiki:
|
|
|
|
# https://en.wikipedia.org/wiki/Chi-square_distribution
|
|
|
|
#
|
|
|
|
# param res Random sample list
|
|
|
|
# return Value of Chi-Square Distribution
|
|
|
|
#
|
|
|
|
# x2_value: return of chi_square_value function
|
|
|
|
# df: Degrees of freedom, Number of independent values minus 1
|
|
|
|
#
|
|
|
|
# By using x2_value and df to back check the cardinality table,
|
|
|
|
# we can know the confidence of the random sample.
|
|
|
|
proc chi_square_value {res} {
|
2021-02-05 21:56:20 +08:00
|
|
|
unset -nocomplain mydict
|
|
|
|
foreach key $res {
|
|
|
|
dict incr mydict $key 1
|
|
|
|
}
|
|
|
|
|
2021-04-01 13:20:15 +08:00
|
|
|
set x2_value 0
|
|
|
|
set p [expr [llength $res] / [dict size $mydict]]
|
2021-02-05 21:56:20 +08:00
|
|
|
foreach key [dict keys $mydict] {
|
|
|
|
set value [dict get $mydict $key]
|
2021-04-01 13:20:15 +08:00
|
|
|
|
|
|
|
# Aggregate the chi-square value of each element
|
|
|
|
set v [expr {pow($value - $p, 2) / $p}]
|
|
|
|
set x2_value [expr {$x2_value + $v}]
|
2021-02-05 21:56:20 +08:00
|
|
|
}
|
|
|
|
|
2021-04-01 13:20:15 +08:00
|
|
|
return $x2_value
|
2021-02-05 21:56:20 +08:00
|
|
|
}
|
2021-04-20 02:33:26 +08:00
|
|
|
|
|
|
|
#subscribe to Pub/Sub channels
|
|
|
|
proc consume_subscribe_messages {client type channels} {
|
|
|
|
set numsub -1
|
|
|
|
set counts {}
|
|
|
|
|
|
|
|
for {set i [llength $channels]} {$i > 0} {incr i -1} {
|
|
|
|
set msg [$client read]
|
|
|
|
assert_equal $type [lindex $msg 0]
|
|
|
|
|
|
|
|
# when receiving subscribe messages the channels names
|
|
|
|
# are ordered. when receiving unsubscribe messages
|
|
|
|
# they are unordered
|
|
|
|
set idx [lsearch -exact $channels [lindex $msg 1]]
|
|
|
|
if {[string match "*unsubscribe" $type]} {
|
|
|
|
assert {$idx >= 0}
|
|
|
|
} else {
|
|
|
|
assert {$idx == 0}
|
|
|
|
}
|
|
|
|
set channels [lreplace $channels $idx $idx]
|
|
|
|
|
|
|
|
# aggregate the subscription count to return to the caller
|
|
|
|
lappend counts [lindex $msg 2]
|
|
|
|
}
|
|
|
|
|
|
|
|
# we should have received messages for channels
|
|
|
|
assert {[llength $channels] == 0}
|
|
|
|
return $counts
|
|
|
|
}
|
|
|
|
|
|
|
|
proc subscribe {client channels} {
|
|
|
|
$client subscribe {*}$channels
|
|
|
|
consume_subscribe_messages $client subscribe $channels
|
|
|
|
}
|
|
|
|
|
2022-04-17 20:43:22 +08:00
|
|
|
proc ssubscribe {client channels} {
|
|
|
|
$client ssubscribe {*}$channels
|
|
|
|
consume_subscribe_messages $client ssubscribe $channels
|
|
|
|
}
|
|
|
|
|
2021-04-20 02:33:26 +08:00
|
|
|
proc unsubscribe {client {channels {}}} {
|
|
|
|
$client unsubscribe {*}$channels
|
|
|
|
consume_subscribe_messages $client unsubscribe $channels
|
|
|
|
}
|
|
|
|
|
2022-04-17 20:43:22 +08:00
|
|
|
proc sunsubscribe {client {channels {}}} {
|
|
|
|
$client sunsubscribe {*}$channels
|
|
|
|
consume_subscribe_messages $client sunsubscribe $channels
|
|
|
|
}
|
|
|
|
|
2021-04-20 02:33:26 +08:00
|
|
|
proc psubscribe {client channels} {
|
|
|
|
$client psubscribe {*}$channels
|
|
|
|
consume_subscribe_messages $client psubscribe $channels
|
|
|
|
}
|
|
|
|
|
|
|
|
proc punsubscribe {client {channels {}}} {
|
|
|
|
$client punsubscribe {*}$channels
|
|
|
|
consume_subscribe_messages $client punsubscribe $channels
|
2021-04-25 18:08:46 +08:00
|
|
|
}
|
2021-05-30 14:20:32 +08:00
|
|
|
|
2021-06-09 20:13:24 +08:00
|
|
|
proc debug_digest_value {key} {
|
2021-12-19 23:41:51 +08:00
|
|
|
if {[lsearch $::denytags "needs:debug"] >= 0 || $::ignoredigest} {
|
2021-06-09 20:13:24 +08:00
|
|
|
return "dummy-digest-value"
|
|
|
|
}
|
2021-12-19 23:41:51 +08:00
|
|
|
r debug digest-value $key
|
|
|
|
}
|
|
|
|
|
|
|
|
proc debug_digest {{level 0}} {
|
|
|
|
if {[lsearch $::denytags "needs:debug"] >= 0 || $::ignoredigest} {
|
|
|
|
return "dummy-digest"
|
|
|
|
}
|
|
|
|
r $level debug digest
|
2021-06-09 20:13:24 +08:00
|
|
|
}
|
|
|
|
|
Implementing the WAITAOF command (issue #10505) (#11713)
Implementing the WAITAOF functionality which would allow the user to
block until a specified number of Redises have fsynced all previous write
commands to the AOF.
Syntax: `WAITAOF <num_local> <num_replicas> <timeout>`
Response: Array containing two elements: num_local, num_replicas
num_local is always either 0 or 1 representing the local AOF on the master.
num_replicas is the number of replicas that acknowledged the a replication
offset of the last write being fsynced to the AOF.
Returns an error when called on replicas, or when called with non-zero
num_local on a master with AOF disabled, in all other cases the response
just contains number of fsync copies.
Main changes:
* Added code to keep track of replication offsets that are confirmed to have
been fsynced to disk.
* Keep advancing master_repl_offset even when replication is disabled (and
there's no replication backlog, only if there's an AOF enabled).
This way we can use this command and it's mechanisms even when replication
is disabled.
* Extend REPLCONF ACK to `REPLCONF ACK <ofs> FACK <ofs>`, the FACK
will be appended only if there's an AOF on the replica, and already ignored on
old masters (thus backwards compatible)
* WAIT now no longer wait for the replication offset after your last command, but
rather the replication offset after your last write (or read command that caused
propagation, e.g. lazy expiry).
Unrelated changes:
* WAIT command respects CLIENT_DENY_BLOCKING (not just CLIENT_MULTI)
Implementation details:
* Add an atomic var named `fsynced_reploff_pending` that's updated
(usually by the bio thread) and later copied to the main `fsynced_reploff`
variable (only if the AOF base file exists).
I.e. during the initial AOF rewrite it will not be used as the fsynced offset
since the AOF base is still missing.
* Replace close+fsync bio job with new BIO_CLOSE_AOF (AOF specific)
job that will also update fsync offset the field.
* Handle all AOF jobs (BIO_CLOSE_AOF, BIO_AOF_FSYNC) in the same bio
worker thread, to impose ordering on their execution. This solves a
race condition where a job could set `fsynced_reploff_pending` to a higher
value than another pending fsync job, resulting in indicating an offset
for which parts of the data have not yet actually been fsynced.
Imposing an ordering on the jobs guarantees that fsync jobs are executed
in increasing order of replication offset.
* Drain bio jobs when switching `appendfsync` to "always"
This should prevent a write race between updates to `fsynced_reploff_pending`
in the main thread (`flushAppendOnlyFile` when set to ALWAYS fsync), and
those done in the bio thread.
* Drain the pending fsync when starting over a new AOF to avoid race conditions
with the previous AOF offsets overriding the new one (e.g. after switching to
replicate from a new master).
* Make sure to update the fsynced offset at the end of the initial AOF rewrite.
a must in case there are no additional writes that trigger a periodic fsync,
specifically for a replica that does a full sync.
Limitations:
It is possible to write a module and a Lua script that propagate to the AOF and doesn't
propagate to the replication stream. see REDISMODULE_ARGV_NO_REPLICAS and luaRedisSetReplCommand.
These features are incompatible with the WAITAOF command, and can result
in two bad cases. The scenario is that the user executes command that only
propagates to AOF, and then immediately
issues a WAITAOF, and there's no further writes on the replication stream after that.
1. if the the last thing that happened on the replication stream is a PING
(which increased the replication offset but won't trigger an fsync on the replica),
then the client would hang forever (will wait for an fack that the replica will never
send sine it doesn't trigger any fsyncs).
2. if the last thing that happened is a write command that got propagated properly,
then WAITAOF will be released immediately, without waiting for an fsync (since
the offset didn't change)
Refactoring:
* Plumbing to allow bio worker to handle multiple job types
This introduces infrastructure necessary to allow BIO workers to
not have a 1-1 mapping of worker to job-type. This allows in the
future to assign multiple job types to a single worker, either as
a performance/resource optimization, or as a way of enforcing
ordering between specific classes of jobs.
Co-authored-by: Oran Agra <oran@redislabs.com>
2023-03-15 02:26:21 +08:00
|
|
|
proc wait_for_blocked_client {{idx 0}} {
|
2021-06-09 20:13:24 +08:00
|
|
|
wait_for_condition 50 100 {
|
Implementing the WAITAOF command (issue #10505) (#11713)
Implementing the WAITAOF functionality which would allow the user to
block until a specified number of Redises have fsynced all previous write
commands to the AOF.
Syntax: `WAITAOF <num_local> <num_replicas> <timeout>`
Response: Array containing two elements: num_local, num_replicas
num_local is always either 0 or 1 representing the local AOF on the master.
num_replicas is the number of replicas that acknowledged the a replication
offset of the last write being fsynced to the AOF.
Returns an error when called on replicas, or when called with non-zero
num_local on a master with AOF disabled, in all other cases the response
just contains number of fsync copies.
Main changes:
* Added code to keep track of replication offsets that are confirmed to have
been fsynced to disk.
* Keep advancing master_repl_offset even when replication is disabled (and
there's no replication backlog, only if there's an AOF enabled).
This way we can use this command and it's mechanisms even when replication
is disabled.
* Extend REPLCONF ACK to `REPLCONF ACK <ofs> FACK <ofs>`, the FACK
will be appended only if there's an AOF on the replica, and already ignored on
old masters (thus backwards compatible)
* WAIT now no longer wait for the replication offset after your last command, but
rather the replication offset after your last write (or read command that caused
propagation, e.g. lazy expiry).
Unrelated changes:
* WAIT command respects CLIENT_DENY_BLOCKING (not just CLIENT_MULTI)
Implementation details:
* Add an atomic var named `fsynced_reploff_pending` that's updated
(usually by the bio thread) and later copied to the main `fsynced_reploff`
variable (only if the AOF base file exists).
I.e. during the initial AOF rewrite it will not be used as the fsynced offset
since the AOF base is still missing.
* Replace close+fsync bio job with new BIO_CLOSE_AOF (AOF specific)
job that will also update fsync offset the field.
* Handle all AOF jobs (BIO_CLOSE_AOF, BIO_AOF_FSYNC) in the same bio
worker thread, to impose ordering on their execution. This solves a
race condition where a job could set `fsynced_reploff_pending` to a higher
value than another pending fsync job, resulting in indicating an offset
for which parts of the data have not yet actually been fsynced.
Imposing an ordering on the jobs guarantees that fsync jobs are executed
in increasing order of replication offset.
* Drain bio jobs when switching `appendfsync` to "always"
This should prevent a write race between updates to `fsynced_reploff_pending`
in the main thread (`flushAppendOnlyFile` when set to ALWAYS fsync), and
those done in the bio thread.
* Drain the pending fsync when starting over a new AOF to avoid race conditions
with the previous AOF offsets overriding the new one (e.g. after switching to
replicate from a new master).
* Make sure to update the fsynced offset at the end of the initial AOF rewrite.
a must in case there are no additional writes that trigger a periodic fsync,
specifically for a replica that does a full sync.
Limitations:
It is possible to write a module and a Lua script that propagate to the AOF and doesn't
propagate to the replication stream. see REDISMODULE_ARGV_NO_REPLICAS and luaRedisSetReplCommand.
These features are incompatible with the WAITAOF command, and can result
in two bad cases. The scenario is that the user executes command that only
propagates to AOF, and then immediately
issues a WAITAOF, and there's no further writes on the replication stream after that.
1. if the the last thing that happened on the replication stream is a PING
(which increased the replication offset but won't trigger an fsync on the replica),
then the client would hang forever (will wait for an fack that the replica will never
send sine it doesn't trigger any fsyncs).
2. if the last thing that happened is a write command that got propagated properly,
then WAITAOF will be released immediately, without waiting for an fsync (since
the offset didn't change)
Refactoring:
* Plumbing to allow bio worker to handle multiple job types
This introduces infrastructure necessary to allow BIO workers to
not have a 1-1 mapping of worker to job-type. This allows in the
future to assign multiple job types to a single worker, either as
a performance/resource optimization, or as a way of enforcing
ordering between specific classes of jobs.
Co-authored-by: Oran Agra <oran@redislabs.com>
2023-03-15 02:26:21 +08:00
|
|
|
[s $idx blocked_clients] ne 0
|
2021-06-09 20:13:24 +08:00
|
|
|
} else {
|
|
|
|
fail "no blocked clients"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-15 17:07:04 +08:00
|
|
|
proc wait_for_blocked_clients_count {count {maxtries 100} {delay 10} {idx 0}} {
|
2021-06-09 20:13:24 +08:00
|
|
|
wait_for_condition $maxtries $delay {
|
2023-03-15 17:07:04 +08:00
|
|
|
[s $idx blocked_clients] == $count
|
2021-06-09 20:13:24 +08:00
|
|
|
} else {
|
|
|
|
fail "Timeout waiting for blocked clients"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-02-20 17:12:19 +08:00
|
|
|
proc wait_for_watched_clients_count {count {maxtries 100} {delay 10} {idx 0}} {
|
|
|
|
wait_for_condition $maxtries $delay {
|
|
|
|
[s $idx watching_clients] == $count
|
|
|
|
} else {
|
|
|
|
fail "Timeout waiting for watched clients"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-05-30 14:20:32 +08:00
|
|
|
proc read_from_aof {fp} {
|
|
|
|
# Input fp is a blocking binary file descriptor of an opened AOF file.
|
|
|
|
if {[gets $fp count] == -1} return ""
|
|
|
|
set count [string range $count 1 end]
|
|
|
|
|
|
|
|
# Return a list of arguments for the command.
|
|
|
|
set res {}
|
|
|
|
for {set j 0} {$j < $count} {incr j} {
|
|
|
|
read $fp 1
|
|
|
|
set arg [::redis::redis_bulk_read $fp]
|
|
|
|
if {$j == 0} {set arg [string tolower $arg]}
|
|
|
|
lappend res $arg
|
|
|
|
}
|
|
|
|
return $res
|
|
|
|
}
|
|
|
|
|
|
|
|
proc assert_aof_content {aof_path patterns} {
|
|
|
|
set fp [open $aof_path r]
|
|
|
|
fconfigure $fp -translation binary
|
|
|
|
fconfigure $fp -blocking 1
|
|
|
|
|
|
|
|
for {set j 0} {$j < [llength $patterns]} {incr j} {
|
|
|
|
assert_match [lindex $patterns $j] [read_from_aof $fp]
|
|
|
|
}
|
|
|
|
}
|
2021-06-09 20:13:24 +08:00
|
|
|
|
|
|
|
proc config_set {param value {options {}}} {
|
|
|
|
set mayfail 0
|
|
|
|
foreach option $options {
|
|
|
|
switch $option {
|
|
|
|
"mayfail" {
|
|
|
|
set mayfail 1
|
|
|
|
}
|
|
|
|
default {
|
|
|
|
error "Unknown option $option"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if {[catch {r config set $param $value} err]} {
|
|
|
|
if {!$mayfail} {
|
|
|
|
error $err
|
|
|
|
} else {
|
|
|
|
if {$::verbose} {
|
|
|
|
puts "Ignoring CONFIG SET $param $value failure: $err"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-07-29 17:32:28 +08:00
|
|
|
|
Fix crash due to delete entry from compress quicklistNode and wrongly split quicklistNode (#11242)
This PR mainly deals with 2 crashes introduced in #9357,
and fix the QUICKLIST-PACKED-THRESHOLD mess in external test mode.
1. Fix crash due to deleting an entry from a compress quicklistNode
When inserting a large element, we need to create a new quicklistNode first,
and then delete its previous element, if the node where the deleted element is
located is compressed, it will cause a crash.
Now add `dont_compress` to quicklistNode, if we want to use a quicklistNode
after some operation, we can use this flag like following:
```c
node->dont_compress = 1; /* Prevent to be compressed */
some_operation(node); /* This operation might try to compress this node */
some_other_operation(node); /* We can use this node without decompress it */
node->dont_compress = 0; /* Re-able compression */
quicklistCompressNode(node);
```
Perhaps in the future, we could just disable the current entry from being
compressed during the iterator loop, but that would require more work.
2. Fix crash due to wrongly split quicklist
before #9357, the offset param of _quicklistSplitNode() will not negative.
For now, when offset is negative, the split extent will be wrong.
following example:
```c
int orig_start = after ? offset + 1 : 0;
int orig_extent = after ? -1 : offset;
int new_start = after ? 0 : offset;
int new_extent = after ? offset + 1 : -1;
# offset: -2, after: 1, node->count: 2
# current wrong range: [-1,-1] [0,-1]
# correct range: [1,-1] [0, 1]
```
Because only `_quicklistInsert()` splits the quicklistNode and only
`quicklistInsertAfter()`, `quicklistInsertBefore()` call _quicklistInsert(),
so `quicklistReplaceEntry()` and `listTypeInsert()` might occur this crash.
But the iterator of `listTypeInsert()` is alway from head to tail(iter->offset is
always positive), so it is not affected.
The final conclusion is this crash only occur when we insert a large element
with negative index into a list, that affects `LSET` command and `RM_ListSet`
module api.
3. In external test mode, we need to restore quicklist packed threshold after
when the end of test.
4. Show `node->count` in quicklistRepr().
5. Add new tcl proc `config_get_set` to support restoring config in tests.
2022-09-19 14:47:52 +08:00
|
|
|
proc config_get_set {param value {options {}}} {
|
|
|
|
set config [lindex [r config get $param] 1]
|
|
|
|
config_set $param $value $options
|
|
|
|
return $config
|
|
|
|
}
|
|
|
|
|
2021-07-29 17:32:28 +08:00
|
|
|
proc delete_lines_with_pattern {filename tmpfilename pattern} {
|
|
|
|
set fh_in [open $filename r]
|
|
|
|
set fh_out [open $tmpfilename w]
|
|
|
|
while {[gets $fh_in line] != -1} {
|
|
|
|
if {![regexp $pattern $line]} {
|
|
|
|
puts $fh_out $line
|
|
|
|
}
|
|
|
|
}
|
|
|
|
close $fh_in
|
|
|
|
close $fh_out
|
|
|
|
file rename -force $tmpfilename $filename
|
|
|
|
}
|
2021-12-14 03:16:25 +08:00
|
|
|
|
Protected configs and sensitive commands (#9920)
Block sensitive configs and commands by default.
* `enable-protected-configs` - block modification of configs with the new `PROTECTED_CONFIG` flag.
Currently we add this flag to `dbfilename`, and `dir` configs,
all of which are non-mutable configs that can set a file redis will write to.
* `enable-debug-command` - block the `DEBUG` command
* `enable-module-command` - block the `MODULE` command
These have a default value set to `no`, so that these features are not
exposed by default to client connections, and can only be set by modifying the config file.
Users can change each of these to either `yes` (allow all access), or `local` (allow access from
local TCP connections and unix domain connections)
Note that this is a **breaking change** (specifically the part about MODULE command being disabled by default).
I.e. we don't consider DEBUG command being blocked as an issue (people shouldn't have been using it),
and the few configs we protected are unlikely to have been set at runtime anyway.
On the other hand, it's likely to assume some users who use modules, load them from the config file anyway.
Note that's the whole point of this PR, for redis to be more secure by default and reduce the attack surface on
innocent users, so secure defaults will necessarily mean a breaking change.
2021-12-19 16:46:16 +08:00
|
|
|
proc get_nonloopback_addr {} {
|
|
|
|
set addrlist [list {}]
|
|
|
|
catch { set addrlist [exec hostname -I] }
|
|
|
|
return [lindex $addrlist 0]
|
|
|
|
}
|
|
|
|
|
|
|
|
proc get_nonloopback_client {} {
|
|
|
|
return [redis [get_nonloopback_addr] [srv 0 "port"] 0 $::tls]
|
|
|
|
}
|
|
|
|
|
2021-12-14 03:16:25 +08:00
|
|
|
# The following functions and variables are used only when running large-memory
|
|
|
|
# tests. We avoid defining them when not running large-memory tests because the
|
|
|
|
# global variables takes up lots of memory.
|
|
|
|
proc init_large_mem_vars {} {
|
|
|
|
if {![info exists ::str500]} {
|
|
|
|
set ::str500 [string repeat x 500000000] ;# 500mb
|
|
|
|
set ::str500_len [string length $::str500]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Utility function to write big argument into redis client connection
|
|
|
|
proc write_big_bulk {size {prefix ""} {skip_read no}} {
|
|
|
|
init_large_mem_vars
|
|
|
|
|
|
|
|
assert {[string length prefix] <= $size}
|
|
|
|
r write "\$$size\r\n"
|
|
|
|
r write $prefix
|
|
|
|
incr size -[string length $prefix]
|
|
|
|
while {$size >= 500000000} {
|
|
|
|
r write $::str500
|
|
|
|
incr size -500000000
|
|
|
|
}
|
|
|
|
if {$size > 0} {
|
|
|
|
r write [string repeat x $size]
|
|
|
|
}
|
|
|
|
r write "\r\n"
|
|
|
|
if {!$skip_read} {
|
|
|
|
r flush
|
|
|
|
r read
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Utility to read big bulk response (work around Tcl limitations)
|
|
|
|
proc read_big_bulk {code {compare no} {prefix ""}} {
|
|
|
|
init_large_mem_vars
|
|
|
|
|
|
|
|
r readraw 1
|
|
|
|
set resp_len [uplevel 1 $code] ;# get the first line of the RESP response
|
|
|
|
assert_equal [string range $resp_len 0 0] "$"
|
|
|
|
set resp_len [string range $resp_len 1 end]
|
|
|
|
set prefix_len [string length $prefix]
|
|
|
|
if {$compare} {
|
|
|
|
assert {$prefix_len <= $resp_len}
|
|
|
|
assert {$prefix_len <= $::str500_len}
|
|
|
|
}
|
|
|
|
|
|
|
|
set remaining $resp_len
|
|
|
|
while {$remaining > 0} {
|
|
|
|
set l $remaining
|
|
|
|
if {$l > $::str500_len} {set l $::str500_len} ; # can't read more than 2gb at a time, so read 500mb so we can easily verify read data
|
|
|
|
set read_data [r rawread $l]
|
|
|
|
set nbytes [string length $read_data]
|
|
|
|
if {$compare} {
|
|
|
|
set comp_len $nbytes
|
|
|
|
# Compare prefix part
|
|
|
|
if {$remaining == $resp_len} {
|
|
|
|
assert_equal $prefix [string range $read_data 0 [expr $prefix_len - 1]]
|
|
|
|
set read_data [string range $read_data $prefix_len $nbytes]
|
|
|
|
incr comp_len -$prefix_len
|
|
|
|
}
|
|
|
|
# Compare rest of data, evaluate and then assert to avoid huge print in case of failure
|
|
|
|
set data_equal [expr {$read_data == [string range $::str500 0 [expr $comp_len - 1]]}]
|
|
|
|
assert $data_equal
|
|
|
|
}
|
|
|
|
incr remaining -$nbytes
|
|
|
|
}
|
|
|
|
assert_equal [r rawread 2] "\r\n"
|
|
|
|
r readraw 0
|
|
|
|
return $resp_len
|
|
|
|
}
|
2021-12-17 13:56:59 +08:00
|
|
|
|
|
|
|
proc prepare_value {size} {
|
|
|
|
set _v "c"
|
|
|
|
for {set i 1} {$i < $size} {incr i} {
|
|
|
|
append _v 0
|
|
|
|
}
|
|
|
|
return $_v
|
|
|
|
}
|
2021-12-28 03:37:21 +08:00
|
|
|
|
|
|
|
proc memory_usage {key} {
|
|
|
|
set usage [r memory usage $key]
|
|
|
|
if {![string match {*jemalloc*} [s mem_allocator]]} {
|
|
|
|
# libc allocator can sometimes return a different size allocation for the same requested size
|
|
|
|
# this makes tests that rely on MEMORY USAGE unreliable, so instead we return a constant 1
|
|
|
|
set usage 1
|
|
|
|
}
|
|
|
|
return $usage
|
|
|
|
}
|
Build TLS as a loadable module
* Support BUILD_TLS=module to be loaded as a module via config file or
command line. e.g. redis-server --loadmodule redis-tls.so
* Updates to redismodule.h to allow it to be used side by side with
server.h by defining REDISMODULE_CORE_MODULE
* Changes to server.h, redismodule.h and module.c to avoid repeated
type declarations (gcc 4.8 doesn't like these)
* Add a mechanism for non-ABI neutral modules (ones who include
server.h) to refuse loading if they detect not being built together with
redis (release.c)
* Fix wrong signature of RedisModuleDefragFunc, this could break
compilation of a module, but not the ABI
* Move initialization of listeners in server.c to be after loading
the modules
* Config TLS after initialization of listeners
* Init cluster after initialization of listeners
* Add TLS module to CI
* Fix a test suite race conditions:
Now that the listeners are initialized later, it's not sufficient to
wait for the PID message in the log, we need to wait for the "Server
Initialized" message.
* Fix issues with moduleconfigs test as a result from start_server
waiting for "Server Initialized"
* Fix issues with modules/infra test as a result of an additional module
present
Notes about Sentinel:
Sentinel can't really rely on the tls module, since it uses hiredis to
initiate connections and depends on OpenSSL (won't be able to use any
other connection modules for that), so it was decided that when TLS is
built as a module, sentinel does not support TLS at all.
This means that it keeps using redis_tls_ctx and redis_tls_client_ctx directly.
Example code of config in redis-tls.so(may be use in the future):
RedisModuleString *tls_cfg = NULL;
void tlsInfo(RedisModuleInfoCtx *ctx, int for_crash_report) {
UNUSED(for_crash_report);
RedisModule_InfoAddSection(ctx, "");
RedisModule_InfoAddFieldLongLong(ctx, "var", 42);
}
int tlsCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc)
{
if (argc != 2) return RedisModule_WrongArity(ctx);
return RedisModule_ReplyWithString(ctx, argv[1]);
}
RedisModuleString *getStringConfigCommand(const char *name, void *privdata) {
REDISMODULE_NOT_USED(name);
REDISMODULE_NOT_USED(privdata);
return tls_cfg;
}
int setStringConfigCommand(const char *name, RedisModuleString *new, void *privdata, RedisModuleString **err) {
REDISMODULE_NOT_USED(name);
REDISMODULE_NOT_USED(err);
REDISMODULE_NOT_USED(privdata);
if (tls_cfg) RedisModule_FreeString(NULL, tls_cfg);
RedisModule_RetainString(NULL, new);
tls_cfg = new;
return REDISMODULE_OK;
}
int RedisModule_OnLoad(void *ctx, RedisModuleString **argv, int argc)
{
....
if (RedisModule_CreateCommand(ctx,"tls",tlsCommand,"",0,0,0) == REDISMODULE_ERR)
return REDISMODULE_ERR;
if (RedisModule_RegisterStringConfig(ctx, "cfg", "", REDISMODULE_CONFIG_DEFAULT, getStringConfigCommand, setStringConfigCommand, NULL, NULL) == REDISMODULE_ERR)
return REDISMODULE_ERR;
if (RedisModule_LoadConfigs(ctx) == REDISMODULE_ERR) {
if (tls_cfg) {
RedisModule_FreeString(ctx, tls_cfg);
tls_cfg = NULL;
}
return REDISMODULE_ERR;
}
...
}
Co-authored-by: zhenwei pi <pizhenwei@bytedance.com>
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
2022-08-22 15:53:56 +08:00
|
|
|
|
2025-05-08 02:49:00 +08:00
|
|
|
# Test if the server supports the specified command.
|
|
|
|
proc server_has_command {cmd_wanted} {
|
|
|
|
set lowercase_commands {}
|
|
|
|
foreach cmd [r command list] {
|
|
|
|
lappend lowercase_commands [string tolower $cmd]
|
|
|
|
}
|
|
|
|
expr {[lsearch $lowercase_commands [string tolower $cmd_wanted]] != -1}
|
|
|
|
}
|
|
|
|
|
Build TLS as a loadable module
* Support BUILD_TLS=module to be loaded as a module via config file or
command line. e.g. redis-server --loadmodule redis-tls.so
* Updates to redismodule.h to allow it to be used side by side with
server.h by defining REDISMODULE_CORE_MODULE
* Changes to server.h, redismodule.h and module.c to avoid repeated
type declarations (gcc 4.8 doesn't like these)
* Add a mechanism for non-ABI neutral modules (ones who include
server.h) to refuse loading if they detect not being built together with
redis (release.c)
* Fix wrong signature of RedisModuleDefragFunc, this could break
compilation of a module, but not the ABI
* Move initialization of listeners in server.c to be after loading
the modules
* Config TLS after initialization of listeners
* Init cluster after initialization of listeners
* Add TLS module to CI
* Fix a test suite race conditions:
Now that the listeners are initialized later, it's not sufficient to
wait for the PID message in the log, we need to wait for the "Server
Initialized" message.
* Fix issues with moduleconfigs test as a result from start_server
waiting for "Server Initialized"
* Fix issues with modules/infra test as a result of an additional module
present
Notes about Sentinel:
Sentinel can't really rely on the tls module, since it uses hiredis to
initiate connections and depends on OpenSSL (won't be able to use any
other connection modules for that), so it was decided that when TLS is
built as a module, sentinel does not support TLS at all.
This means that it keeps using redis_tls_ctx and redis_tls_client_ctx directly.
Example code of config in redis-tls.so(may be use in the future):
RedisModuleString *tls_cfg = NULL;
void tlsInfo(RedisModuleInfoCtx *ctx, int for_crash_report) {
UNUSED(for_crash_report);
RedisModule_InfoAddSection(ctx, "");
RedisModule_InfoAddFieldLongLong(ctx, "var", 42);
}
int tlsCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc)
{
if (argc != 2) return RedisModule_WrongArity(ctx);
return RedisModule_ReplyWithString(ctx, argv[1]);
}
RedisModuleString *getStringConfigCommand(const char *name, void *privdata) {
REDISMODULE_NOT_USED(name);
REDISMODULE_NOT_USED(privdata);
return tls_cfg;
}
int setStringConfigCommand(const char *name, RedisModuleString *new, void *privdata, RedisModuleString **err) {
REDISMODULE_NOT_USED(name);
REDISMODULE_NOT_USED(err);
REDISMODULE_NOT_USED(privdata);
if (tls_cfg) RedisModule_FreeString(NULL, tls_cfg);
RedisModule_RetainString(NULL, new);
tls_cfg = new;
return REDISMODULE_OK;
}
int RedisModule_OnLoad(void *ctx, RedisModuleString **argv, int argc)
{
....
if (RedisModule_CreateCommand(ctx,"tls",tlsCommand,"",0,0,0) == REDISMODULE_ERR)
return REDISMODULE_ERR;
if (RedisModule_RegisterStringConfig(ctx, "cfg", "", REDISMODULE_CONFIG_DEFAULT, getStringConfigCommand, setStringConfigCommand, NULL, NULL) == REDISMODULE_ERR)
return REDISMODULE_ERR;
if (RedisModule_LoadConfigs(ctx) == REDISMODULE_ERR) {
if (tls_cfg) {
RedisModule_FreeString(ctx, tls_cfg);
tls_cfg = NULL;
}
return REDISMODULE_ERR;
}
...
}
Co-authored-by: zhenwei pi <pizhenwei@bytedance.com>
Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
2022-08-22 15:53:56 +08:00
|
|
|
# forward compatibility, lmap missing in TCL 8.5
|
|
|
|
proc lmap args {
|
|
|
|
set body [lindex $args end]
|
|
|
|
set args [lrange $args 0 end-1]
|
|
|
|
set n 0
|
|
|
|
set pairs [list]
|
|
|
|
foreach {varnames listval} $args {
|
|
|
|
set varlist [list]
|
|
|
|
foreach varname $varnames {
|
|
|
|
upvar 1 $varname var$n
|
|
|
|
lappend varlist var$n
|
|
|
|
incr n
|
|
|
|
}
|
|
|
|
lappend pairs $varlist $listval
|
|
|
|
}
|
|
|
|
set temp [list]
|
|
|
|
foreach {*}$pairs {
|
|
|
|
lappend temp [uplevel 1 $body]
|
|
|
|
}
|
|
|
|
set temp
|
|
|
|
}
|
2023-06-12 19:05:34 +08:00
|
|
|
|
|
|
|
proc format_command {args} {
|
|
|
|
set cmd "*[llength $args]\r\n"
|
|
|
|
foreach a $args {
|
|
|
|
append cmd "$[string length $a]\r\n$a\r\n"
|
|
|
|
}
|
|
|
|
set _ $cmd
|
|
|
|
}
|
|
|
|
|
2024-01-09 09:56:06 +08:00
|
|
|
# Returns whether or not the system supports stack traces
|
|
|
|
proc system_backtrace_supported {} {
|
|
|
|
set system_name [string tolower [exec uname -s]]
|
|
|
|
if {$system_name eq {darwin}} {
|
|
|
|
return 1
|
|
|
|
} elseif {$system_name ne {linux}} {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
|
|
|
|
# libmusl does not support backtrace. Also return 0 on
|
|
|
|
# static binaries (ldd exit code 1) where we can't detect libmusl
|
2024-07-09 18:54:18 +08:00
|
|
|
if {![catch {set ldd [exec ldd src/redis-server]}]} {
|
2024-01-09 09:56:06 +08:00
|
|
|
if {![string match {*libc.*musl*} $ldd]} {
|
|
|
|
return 1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0
|
|
|
|
}
|
2024-05-30 19:09:26 +08:00
|
|
|
|
|
|
|
proc generate_largevalue_test_array {} {
|
|
|
|
array set largevalue {}
|
|
|
|
set largevalue(listpack) "hello"
|
|
|
|
set largevalue(quicklist) [string repeat "x" 8192]
|
|
|
|
return [array get largevalue]
|
|
|
|
}
|