Avoid performing IO on coverage when child exits due to signal handler (#14072)
CI / test-ubuntu-latest (push) Waiting to run Details
CI / test-sanitizer-address (push) Waiting to run Details
CI / build-debian-old (push) Waiting to run Details
CI / build-macos-latest (push) Waiting to run Details
CI / build-32bit (push) Waiting to run Details
CI / build-libc-malloc (push) Waiting to run Details
CI / build-centos-jemalloc (push) Waiting to run Details
CI / build-old-chain-jemalloc (push) Waiting to run Details
Codecov / code-coverage (push) Waiting to run Details
External Server Tests / test-external-standalone (push) Waiting to run Details
External Server Tests / test-external-cluster (push) Waiting to run Details
External Server Tests / test-external-nodebug (push) Waiting to run Details
Spellcheck / Spellcheck (push) Waiting to run Details

Compiled Redis with COVERAGE_TEST, while using the fork API encountered
the following issue:
- Forked process calls `RedisModule_ExitFromChild` - child process
starts to report its COW while performing IO operations
- Parent process terminates child process with
`RedisModule_KillForkChild`
- Child process signal handler gets called while an IO operation is
called
- exit() is called because COVERAGE_TEST was on during compilation.
- exit() tries to perform more IO operations in its exit handlers.
- process gets deadlocked

Backtrace snippet:
```
#0  futex_wait (private=0, expected=2, futex_word=0x7e1220000c50) at ../sysdeps/nptl/futex-internal.h:146
#1  __GI___lll_lock_wait_private (futex=0x7e1220000c50) at ./nptl/lowlevellock.c:34
#2  0x00007e1234696429 in __GI__IO_flush_all () at ./libio/genops.c:698
#3  0x00007e123469680d in _IO_cleanup () at ./libio/genops.c:843
#4  0x00007e1234647b74 in __run_exit_handlers (status=status@entry=255, listp=<optimized out>, run_list_atexit=run_list_atexit@entry=true, run_dtors=run_dtors@entry=true) at ./stdlib/exit.c:129
#5  0x00007e1234647bbe in __GI_exit (status=status@entry=255) at ./stdlib/exit.c:138
#6  0x00005ef753264e13 in exitFromChild (retcode=255) at /home/jonathan/CLionProjects/redis/src/server.c:263
#7  sigKillChildHandler (sig=<optimized out>) at /home/jonathan/CLionProjects/redis/src/server.c:6794
#8  <signal handler called>
#9  0x00007e1234685b94 in _IO_fgets (buf=buf@entry=0x7e122dafdd90 "KSM:", ' ' <repeats 19 times>, "0 kB\n", n=n@entry=1024, fp=fp@entry=0x7e1220000b70) at ./libio/iofgets.c:47
#10 0x00005ef75326c5e0 in fgets (__stream=<optimized out>, __n=<optimized out>, __s=<optimized out>, __s=<optimized out>, __n=<optimized out>, __stream=<optimized out>) at /usr/include/x86_64-linux-gnu/bits/stdio2.h:200
#11 zmalloc_get_smap_bytes_by_field (field=0x5ef7534c42fd "Private_Dirty:", pid=<optimized out>) at /home/jonathan/CLionProjects/redis/src/zmalloc.c:928
#12 0x00005ef75338ab1f in zmalloc_get_private_dirty (pid=-1) at /home/jonathan/CLionProjects/redis/src/zmalloc.c:978
#13 sendChildInfoGeneric (info_type=CHILD_INFO_TYPE_MODULE_COW_SIZE, keys=0, progress=-1, pname=0x5ef7534c95b2 "Module fork") at /home/jonathan/CLionProjects/redis/src/childinfo.c:71
#14 0x00005ef75337962c in sendChildCowInfo (pname=0x5ef7534c95b2 "Module fork", info_type=CHILD_INFO_TYPE_MODULE_COW_SIZE) at /home/jonathan/CLionProjects/redis/src/server.c:6895
#15 RM_ExitFromChild (retcode=0) at /home/jonathan/CLionProjects/redis/src/module.c:11468
```

Change is to make the exit() _exit() calls conditional based on a
parameter to exitFromChild function.
The signal handler should exit without io operations since it doesn't
know its history.(If we were in the middle of IO operations before it
was called)

---------

Co-authored-by: Yuan Wang <wangyuancode@163.com>
This commit is contained in:
kei-nan 2025-05-28 11:27:52 +03:00 committed by GitHub
parent 79b37ff535
commit 161326d332
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 23 additions and 12 deletions

View File

@ -2587,9 +2587,9 @@ int rewriteAppendOnlyFileBackground(void) {
serverLog(LL_NOTICE,
"Successfully created the temporary AOF base file %s", tmpfile);
sendChildCowInfo(CHILD_INFO_TYPE_AOF_COW_SIZE, "AOF rewrite");
exitFromChild(0);
exitFromChild(0, 0);
} else {
exitFromChild(1);
exitFromChild(1, 0);
}
} else {
/* Parent */

View File

@ -94,7 +94,7 @@ void sendChildInfoGeneric(childInfoType info_type, size_t keys, double progress,
if (write(server.child_info_pipe[1], &data, wlen) != wlen) {
/* Failed writing to parent, it could have been killed, exit. */
serverLog(LL_WARNING,"Child failed reporting info to parent, exiting. %s", strerror(errno));
exitFromChild(1);
exitFromChild(1, 0);
}
}

View File

@ -920,7 +920,7 @@ void ldbEndSession(client *c) {
if (ldb.forked) {
writeToClient(c,0);
serverLog(LL_NOTICE,"Lua debugging session child exiting");
exitFromChild(0);
exitFromChild(0, 0);
} else {
serverLog(LL_NOTICE,
"Redis synchronous debugging eval session ended");

View File

@ -11467,7 +11467,7 @@ void RM_SendChildHeartbeat(double progress) {
*/
int RM_ExitFromChild(int retcode) {
sendChildCowInfo(CHILD_INFO_TYPE_MODULE_COW_SIZE, "Module fork");
exitFromChild(retcode);
exitFromChild(retcode, 0);
return REDISMODULE_OK;
}

View File

@ -1658,7 +1658,7 @@ int rdbSaveBackground(int req, char *filename, rdbSaveInfo *rsi, int rdbflags) {
if (retval == C_OK) {
sendChildCowInfo(CHILD_INFO_TYPE_RDB_COW_SIZE, "RDB");
}
exitFromChild((retval == C_OK) ? 0 : 1);
exitFromChild((retval == C_OK) ? 0 : 1, 0);
} else {
/* Parent */
if (childpid == -1) {
@ -3976,7 +3976,7 @@ int rdbSaveToSlavesSockets(int req, rdbSaveInfo *rsi) {
UNUSED(dummy);
}
zfree(conns);
exitFromChild((retval == C_OK) ? 0 : 1);
exitFromChild((retval == C_OK) ? 0 : 1, 0);
} else {
/* Parent */
if (childpid == -1) {

View File

@ -257,11 +257,19 @@ mstime_t commandTimeSnapshot(void) {
/* After an RDB dump or AOF rewrite we exit from children using _exit() instead of
* exit(), because the latter may interact with the same file objects used by
* the parent process. However if we are testing the coverage normal exit() is
* used in order to obtain the right coverage information. */
void exitFromChild(int retcode) {
* used in order to obtain the right coverage information.
* There is a caveat for when we exit due to a signal.
* In this case we want the function to be async signal safe, so we can't use exit()
*/
void exitFromChild(int retcode, int from_signal) {
#ifdef COVERAGE_TEST
exit(retcode);
if (!from_signal) {
exit(retcode);
} else {
_exit(retcode);
}
#else
UNUSED(from_signal);
_exit(retcode);
#endif
}
@ -6797,7 +6805,10 @@ static void sigKillChildHandler(int sig) {
UNUSED(sig);
int level = server.in_fork_child == CHILD_TYPE_MODULE? LL_VERBOSE: LL_WARNING;
serverLogRawFromHandler(level, "Received SIGUSR1 in child, exiting now.");
exitFromChild(SERVER_CHILD_NOERROR_RETVAL);
/* We don't want to perform any IO in the child when the parent is terminating us.
* We don't know what our stack trace is, it is possible that we were called during an IO operation
* If we were to do another IO operation, we might end up in a deadlock */
exitFromChild(SERVER_CHILD_NOERROR_RETVAL, 1);
}
void setupChildSignalHandlers(void) {

View File

@ -2779,7 +2779,7 @@ mstime_t commandTimeSnapshot(void);
void getRandomHexChars(char *p, size_t len);
void getRandomBytes(unsigned char *p, size_t len);
uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l);
void exitFromChild(int retcode);
void exitFromChild(int retcode, int from_signal);
long long redisPopcount(void *s, long count);
int redisSetProcTitle(char *title);
int validateProcTitleTemplate(const char *template);