From fe3f0aa252158bd45eb0bd68e21b362bf865532a Mon Sep 17 00:00:00 2001
From: "debing.sun" <debing.sun@redis.com>
Date: Mon, 28 Jul 2025 10:53:57 +0800
Subject: [PATCH] Fix some daily CI issues (#14217)

1) Fix the timeout of `Active defrag big keys: standalone`
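Writing all the commands through a pipeline before reading any reply
may cause the write to block once the read buffer becomes full, so the
replies are now read back after every 10000 commands.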

2) Fix the failure of `Main db not affected when fail to diskless load`
test
If the master is killed in a slow environment, then once
`cluster-node-timeout` (3s in our test) has elapsed, keyspace commands
run on the replica will fail with a CLUSTERDOWN error.

3) Fix the failure of `Test shutdown hook` test
ASAN can intercept signals, so I guess that when we send SIGCONT after
SIGTERM to kill the server, it might start doing some work again,
causing the process to close very slowly. Send SIGCONT before SIGTERM
instead.
---
 .../cluster/tests/17-diskless-load-swapdb.tcl | 11 +++++--
 tests/instances.tcl                           |  3 +-
 tests/support/server.tcl                      |  3 +-
 tests/unit/memefficiency.tcl                  | 33 ++++++++++++++-----
 4 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/tests/cluster/tests/17-diskless-load-swapdb.tcl b/tests/cluster/tests/17-diskless-load-swapdb.tcl
index 7a56ec783..cb81b9fdb 100644
--- a/tests/cluster/tests/17-diskless-load-swapdb.tcl
+++ b/tests/cluster/tests/17-diskless-load-swapdb.tcl
@@ -80,7 +80,14 @@ test "Main db not affected when fail to diskless load" {
         fail "Fail to full sync"
     }
 
-    # Replica keys and keys to slots map still both are right
-    assert_equal {1} [$replica get $slot0_key]
+    # Replica keys and keys to slots map still both are right.
+    # CLUSTERDOWN errors are acceptable here because the cluster may be in a transient state
+    # due to the timing relationship with cluster-node-timeout.
+    if {[catch {$replica get $slot0_key} result]} {
+        assert_match "*CLUSTERDOWN*" $result
+    } else {
+        assert_equal {1} $result
+    }
+
     assert_equal $slot0_key [$replica CLUSTER GETKEYSINSLOT 0 1]
 }
diff --git a/tests/instances.tcl b/tests/instances.tcl
index 7406f14c1..05b8507a1 100644
--- a/tests/instances.tcl
+++ b/tests/instances.tcl
@@ -221,9 +221,10 @@ proc is_alive pid {
 }
 
 proc stop_instance pid {
-    catch {exec kill $pid}
     # Node might have been stopped in the test
+    # Send SIGCONT before SIGTERM, otherwise shutdown may be slow with ASAN.
     catch {exec kill -SIGCONT $pid}
+    catch {exec kill $pid}
     if {$::valgrind} {
         set max_wait 120000
     } else {
diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 9640fa547..47e347be4 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -95,9 +95,10 @@ proc kill_server config {
 
     # kill server and wait for the process to be totally exited
     send_data_packet $::test_server_fd server-killing $pid
-    catch {exec kill $pid}
     # Node might have been stopped in the test
+    # Send SIGCONT before SIGTERM, otherwise shutdown may be slow with ASAN.
     catch {exec kill -SIGCONT $pid}
+    catch {exec kill $pid}
     if {$::valgrind} {
         set max_wait 120000
     } else {
diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index af516f7fe..71d0e511d 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -334,31 +334,46 @@ run_solo {defrag} {
             set expected_frag 1.49
             if {$::accurate} {
                 # scale the hash to 1m fields in order to have a measurable the latency
+                set count 0
                 for {set j 10000} {$j < 1000000} {incr j} {
                     $rd hset bighash $j [concat "asdfasdfasdf" $j]
-                }
-                for {set j 10000} {$j < 1000000} {incr j} {
-                    $rd read ; # Discard replies
+
+                    incr count
+                    if {$count % 10000 == 0} {
+                        for {set k 0} {$k < 10000} {incr k} {
+                            $rd read ; # Discard replies
+                        }
+                    }
                 }
                 # creating that big hash, increased used_memory, so the relative frag goes down
                 set expected_frag 1.3
             }
 
             # add a mass of string keys
+            set count 0
             for {set j 0} {$j < 500000} {incr j} {
                 $rd setrange $j 150 a
-            }
-            for {set j 0} {$j < 500000} {incr j} {
-                $rd read ; # Discard replies
+
+                incr count
+                if {$count % 10000 == 0} {
+                    for {set k 0} {$k < 10000} {incr k} {
+                        $rd read ; # Discard replies
+                    }
+                }
             }
             assert_equal [r dbsize] 500016
 
             # create some fragmentation
+            set count 0
             for {set j 0} {$j < 500000} {incr j 2} {
                 $rd del $j
-            }
-            for {set j 0} {$j < 500000} {incr j 2} {
-                $rd read ; # Discard replies
+
+                incr count
+                if {$count % 10000 == 0} {
+                    for {set k 0} {$k < 10000} {incr k} {
+                        $rd read ; # Discard replies
+                    }
+                }
             }
             assert_equal [r dbsize] 250016