Fix short read issue that causes exit() on replica (#14085)

When `repl-diskless-load` is enabled on a replica and the replica is in the process of loading an RDB file, a broken connection detected by the main channel may trigger a call to `rioAbort()`. This sets a flag that causes the rdb channel to fail on the next `rioRead()` call, allowing it to perform the necessary cleanup. However, there are specific scenarios where the error is checked using `rioGetReadError()`, which does not account for the `RIO_FLAG_ABORT` flag (see `src/rdb.c`, line 3098, at commit 79b37ff535). As a result, the error goes undetected. The code then proceeds to validate a module type, fails to find a match, and calls `rdbReportCorruptRDB()`, which logs the following error and exits the process:

```
The RDB file contains module data I can't load: no matching module type '_________'
```

To fix this issue, the `RIO_FLAG_ABORT` flag has been removed. Now `rioAbort()` sets both the read and write error flags, so that subsequent operations and error checks properly detect the failure.

Additional small keys were added to the short read test; with this test change, the test reproduces the issue. We hit that problematic line once per key, so my guess is that with many smaller keys, the likelihood of the connection being killed at just the right moment increases.
2025-05-28 12:43:59 +03:00 · 2025-05-28 12:43:59 +03:00 · 7f60945bc6
parent 161326d332
commit 7f60945bc6
2 changed files with 11 additions and 5 deletions
--- a/src/rio.h
+++ b/src/rio.h
@ -23,7 +23,6 @@

 #define RIO_FLAG_READ_ERROR (1<<0)
 #define RIO_FLAG_WRITE_ERROR (1<<1)
-#define RIO_FLAG_ABORT (1<<2)

 #define RIO_TYPE_FILE (1<<0)
 #define RIO_TYPE_BUFFER (1<<1)
@ -103,7 +102,7 @@ typedef struct _rio rio;
 * if needed. */

 static inline size_t rioWrite(rio *r, const void *buf, size_t len) {
-    if (r->flags & (RIO_FLAG_WRITE_ERROR | RIO_FLAG_ABORT)) return 0;
+    if (r->flags & (RIO_FLAG_WRITE_ERROR)) return 0;
    while (len) {
        size_t bytes_to_write = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len;
        if (r->update_cksum) r->update_cksum(r,buf,bytes_to_write);
@ -119,7 +118,7 @@ static inline size_t rioWrite(rio *r, const void *buf, size_t len) {
 }

 static inline size_t rioRead(rio *r, void *buf, size_t len) {
-    if (r->flags & (RIO_FLAG_READ_ERROR | RIO_FLAG_ABORT)) return 0;
+    if (r->flags & (RIO_FLAG_READ_ERROR)) return 0;
    while (len) {
        size_t bytes_to_read = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len;
        if (r->read(r,buf,bytes_to_read) == 0) {
@ -142,8 +141,10 @@ static inline int rioFlush(rio *r) {
    return r->flush(r);
 }

+/* Abort RIO asynchronously by setting read and write error flags. Subsequent
+ * rioRead()/rioWrite() calls will fail, letting the caller terminate safely. */
 static inline void rioAbort(rio *r) {
-    r->flags |= RIO_FLAG_ABORT;
+    r->flags |= (RIO_FLAG_READ_ERROR | RIO_FLAG_WRITE_ERROR);
 }

 /* This function allows to know if there was a read error in any past
@ -159,7 +160,7 @@ static inline int rioGetWriteError(rio *r) {
 }

 static inline void rioClearErrors(rio *r) {
-    r->flags &= ~(RIO_FLAG_READ_ERROR|RIO_FLAG_WRITE_ERROR|RIO_FLAG_ABORT);
+    r->flags &= ~(RIO_FLAG_READ_ERROR|RIO_FLAG_WRITE_ERROR);
 }

 void rioInitWithFile(rio *r, FILE *fp);
--- a/tests/unit/moduleapi/testrdb.tcl
+++ b/tests/unit/moduleapi/testrdb.tcl
@ -116,6 +116,11 @@ tags "modules" {
                    $master config set dynamic-hz no
                    $replica config set dynamic-hz no
                    set start [clock clicks -milliseconds]
+                    # Generate small keys
+                    for {set k 0} {$k < 20000} {incr k} {
+                        r testrdb.set.key keysmall$k [string repeat A [expr {int(rand()*100)}]]
+                    }
+                    # Generate larger keys
                    for {set k 0} {$k < 30} {incr k} {
                        r testrdb.set.key key$k [string repeat A [expr {int(rand()*1000000)}]]
                    }