mirror of https://github.com/apache/kafka.git
KAFKA-17515 Fix flaky RestoreIntegrationTest.shouldInvokeUserDefinedGlobalStateRestoreListener (#17187)
Found two issues in the flaky tests: (Put the log analysis under Jira comments.) 1) The error "java.nio.file.DirectoryNotEmptyException" occurs if the flush() of kafkaStreams.close() and purgeLocalStreamsState() are triggered in the same time. (The current timeout is 5 sec, which is too short since the CI is unstable and slow). 2) Racing issue: Task to-be restored in ks-1 are rebalanced to ks-2 before entering active restoring state. So no onRestoreSuspend() was triggered. To solve the issues: 1) Remove the timeout in kafkaStreams.close() 2) Ensure all tasks in ks-1 are active restoring before start second KafkaStreams(ks-2) Reviewers: Chia-Ping Tsai <chia7712@gmail.com>
This commit is contained in:
parent
09e3c12057
commit
8f5cf9968f
|
@ -102,6 +102,7 @@ import static org.apache.kafka.streams.Topology.AutoOffsetReset.EARLIEST;
|
||||||
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.purgeLocalStreamsState;
|
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.purgeLocalStreamsState;
|
||||||
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.safeUniqueTestName;
|
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.safeUniqueTestName;
|
||||||
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.startApplicationAndWaitUntilRunning;
|
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.startApplicationAndWaitUntilRunning;
|
||||||
|
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.waitForActiveRestoringTask;
|
||||||
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.waitForCompletion;
|
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.waitForCompletion;
|
||||||
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.waitForStandbyCompletion;
|
import static org.apache.kafka.streams.integration.utils.IntegrationTestUtils.waitForStandbyCompletion;
|
||||||
import static org.apache.kafka.test.TestUtils.waitForCondition;
|
import static org.apache.kafka.test.TestUtils.waitForCondition;
|
||||||
|
@ -579,12 +580,16 @@ public class RestoreIntegrationTest {
|
||||||
validateReceivedMessages(sampleData, outputTopic);
|
validateReceivedMessages(sampleData, outputTopic);
|
||||||
|
|
||||||
// Close kafkaStreams1 (with cleanup) and start it again to force the restoration of the state.
|
// Close kafkaStreams1 (with cleanup) and start it again to force the restoration of the state.
|
||||||
kafkaStreams.close(Duration.ofMillis(5000L));
|
kafkaStreams.close();
|
||||||
IntegrationTestUtils.purgeLocalStreamsState(streamsConfigurations);
|
IntegrationTestUtils.purgeLocalStreamsState(streamsConfigurations);
|
||||||
|
|
||||||
final TestStateRestoreListener kafkaStreams1StateRestoreListener = new TestStateRestoreListener("ks1", RESTORATION_DELAY);
|
final TestStateRestoreListener kafkaStreams1StateRestoreListener = new TestStateRestoreListener("ks1", RESTORATION_DELAY);
|
||||||
kafkaStreams = startKafkaStreams(builder, kafkaStreams1StateRestoreListener, kafkaStreams1Configuration);
|
kafkaStreams = startKafkaStreams(builder, kafkaStreams1StateRestoreListener, kafkaStreams1Configuration);
|
||||||
|
|
||||||
|
// Ensure all the restoring tasks are in active state before starting the new instance.
|
||||||
|
// Otherwise, the tasks which assigned to first kafka streams won't encounter "restoring suspend" after being reassigned to the second instance.
|
||||||
|
waitForActiveRestoringTask(kafkaStreams, 5, IntegrationTestUtils.DEFAULT_TIMEOUT);
|
||||||
|
|
||||||
assertTrue(kafkaStreams1StateRestoreListener.awaitUntilRestorationStarts());
|
assertTrue(kafkaStreams1StateRestoreListener.awaitUntilRestorationStarts());
|
||||||
assertTrue(kafkaStreams1StateRestoreListener.awaitUntilBatchRestoredIsCalled());
|
assertTrue(kafkaStreams1StateRestoreListener.awaitUntilBatchRestoredIsCalled());
|
||||||
|
|
||||||
|
|
|
@ -634,6 +634,18 @@ public class IntegrationTestUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait until enough restoring tasks have been started
|
||||||
|
*/
|
||||||
|
public static void waitForActiveRestoringTask(final KafkaStreams streams,
|
||||||
|
final int expectedTasks,
|
||||||
|
final long timeoutMilliseconds) throws Exception {
|
||||||
|
TestUtils.waitForCondition(() -> streams.metrics().entrySet().stream()
|
||||||
|
.filter(metric -> metric.getKey().name().equals("active-restoring-tasks"))
|
||||||
|
.anyMatch(metric -> ((Number) metric.getValue().metricValue()).intValue() == expectedTasks),
|
||||||
|
timeoutMilliseconds, "Timed out waiting for active restoring task");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wait until enough data (consumer records) has been consumed.
|
* Wait until enough data (consumer records) has been consumed.
|
||||||
*
|
*
|
||||||
|
|
Loading…
Reference in New Issue