Fix short read issue that causes exit() on replica (#14085)
CI / test-ubuntu-latest (push) Has been cancelled Details
CI / test-sanitizer-address (push) Has been cancelled Details
CI / build-debian-old (push) Has been cancelled Details
CI / build-macos-latest (push) Has been cancelled Details
CI / build-32bit (push) Has been cancelled Details
CI / build-libc-malloc (push) Has been cancelled Details
CI / build-centos-jemalloc (push) Has been cancelled Details
CI / build-old-chain-jemalloc (push) Has been cancelled Details
Codecov / code-coverage (push) Has been cancelled Details
External Server Tests / test-external-standalone (push) Has been cancelled Details
External Server Tests / test-external-cluster (push) Has been cancelled Details
External Server Tests / test-external-nodebug (push) Has been cancelled Details
Spellcheck / Spellcheck (push) Has been cancelled Details

When `repl-diskless-load` is enabled on a replica, and it is in the
process of loading an RDB file, a broken connection detected by the main
channel may trigger a call to rioAbort(). This sets a flag to cause the
rdb channel to fail on the next rioRead() call, allowing it to perform
necessary cleanup.

However, there are specific scenarios where the error is checked using
rioGetReadError(), which does not account for the RIO_ABORT flag (see
[source](79b37ff535/src/rdb.c (L3098))).
As a result, the error goes undetected. The code then proceeds to
validate a module type, fails to find a match, and calls
rdbReportCorruptRDB() which logs the following error and exits the
process:

```
The RDB file contains module data I can't load: no matching module type '_________'
```

To fix this issue, the RIO_ABORT flag has been removed. Now, rioAbort()
sets both read and write error flags, so that subsequent operations and
error checks properly detect the failure.

Additional keys were added to the short read test. It reproduces the
issue with this change. We hit that problematic line once per key. My
guess is that with many smaller keys, the likelihood of the connection
being killed at just the right moment increases.
This commit is contained in:
Ozan Tezcan 2025-05-28 12:43:59 +03:00 committed by GitHub
parent 161326d332
commit 7f60945bc6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 11 additions and 5 deletions

View File

@ -23,7 +23,6 @@
#define RIO_FLAG_READ_ERROR (1<<0)
#define RIO_FLAG_WRITE_ERROR (1<<1)
#define RIO_FLAG_ABORT (1<<2)
#define RIO_TYPE_FILE (1<<0)
#define RIO_TYPE_BUFFER (1<<1)
@ -103,7 +102,7 @@ typedef struct _rio rio;
* if needed. */
static inline size_t rioWrite(rio *r, const void *buf, size_t len) {
if (r->flags & (RIO_FLAG_WRITE_ERROR | RIO_FLAG_ABORT)) return 0;
if (r->flags & (RIO_FLAG_WRITE_ERROR)) return 0;
while (len) {
size_t bytes_to_write = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len;
if (r->update_cksum) r->update_cksum(r,buf,bytes_to_write);
@ -119,7 +118,7 @@ static inline size_t rioWrite(rio *r, const void *buf, size_t len) {
}
static inline size_t rioRead(rio *r, void *buf, size_t len) {
if (r->flags & (RIO_FLAG_READ_ERROR | RIO_FLAG_ABORT)) return 0;
if (r->flags & (RIO_FLAG_READ_ERROR)) return 0;
while (len) {
size_t bytes_to_read = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len;
if (r->read(r,buf,bytes_to_read) == 0) {
@ -142,8 +141,10 @@ static inline int rioFlush(rio *r) {
return r->flush(r);
}
/* Abort RIO asynchronously by setting read and write error flags. Subsequent
* rioRead()/rioWrite() calls will fail, letting the caller terminate safely. */
static inline void rioAbort(rio *r) {
r->flags |= RIO_FLAG_ABORT;
r->flags |= (RIO_FLAG_READ_ERROR | RIO_FLAG_WRITE_ERROR);
}
/* This function allows to know if there was a read error in any past
@ -159,7 +160,7 @@ static inline int rioGetWriteError(rio *r) {
}
static inline void rioClearErrors(rio *r) {
r->flags &= ~(RIO_FLAG_READ_ERROR|RIO_FLAG_WRITE_ERROR|RIO_FLAG_ABORT);
r->flags &= ~(RIO_FLAG_READ_ERROR|RIO_FLAG_WRITE_ERROR);
}
void rioInitWithFile(rio *r, FILE *fp);

View File

@ -116,6 +116,11 @@ tags "modules" {
$master config set dynamic-hz no
$replica config set dynamic-hz no
set start [clock clicks -milliseconds]
# Generate small keys
for {set k 0} {$k < 20000} {incr k} {
r testrdb.set.key keysmall$k [string repeat A [expr {int(rand()*100)}]]
}
# Generate larger keys
for {set k 0} {$k < 30} {incr k} {
r testrdb.set.key key$k [string repeat A [expr {int(rand()*1000000)}]]
}