mirror of https://github.com/apache/kafka.git
KAFKA-18189: CoordinatorRequestManager log message can include incorrect coordinator disconnect time (#18109)
Fixed logic in markCoordinatorUnknown to ensure the warning log contains the correct number of milliseconds the client has been disconnected. Reviewers: Christo Lolov <lolovc@amazon.com>
This commit is contained in:
parent
bd6d0fbf3d
commit
d09e222846
|
|
@ -139,22 +139,34 @@ public class CoordinatorRequestManager implements RequestManager {
|
|||
}
|
||||
|
||||
/**
|
||||
* Mark the current coordinator null.
|
||||
* Mark the coordinator as "unknown" (i.e. {@code null}) when a disconnect is detected. This detection can occur
|
||||
* in one of two paths:
|
||||
*
|
||||
* @param cause why the coordinator is marked unknown.
|
||||
* @param currentTimeMs the current time in ms.
|
||||
* <ol>
|
||||
* <li>The coordinator was discovered, but then later disconnected</li>
|
||||
* <li>The coordinator has not yet been discovered and/or connected</li>
|
||||
* </ol>
|
||||
*
|
||||
* @param cause String explanation of why the coordinator is marked unknown
|
||||
* @param currentTimeMs Current time in milliseconds
|
||||
*/
|
||||
public void markCoordinatorUnknown(final String cause, final long currentTimeMs) {
|
||||
if (this.coordinator != null) {
|
||||
log.info("Group coordinator {} is unavailable or invalid due to cause: {}. "
|
||||
+ "Rediscovery will be attempted.", this.coordinator, cause);
|
||||
this.coordinator = null;
|
||||
if (coordinator != null || timeMarkedUnknownMs == -1) {
|
||||
timeMarkedUnknownMs = currentTimeMs;
|
||||
totalDisconnectedMin = 0;
|
||||
}
|
||||
|
||||
if (coordinator != null) {
|
||||
log.info(
|
||||
"Group coordinator {} is unavailable or invalid due to cause: {}. Rediscovery will be attempted.",
|
||||
coordinator,
|
||||
cause
|
||||
);
|
||||
coordinator = null;
|
||||
} else {
|
||||
long durationOfOngoingDisconnectMs = Math.max(0, currentTimeMs - timeMarkedUnknownMs);
|
||||
long currDisconnectMin = durationOfOngoingDisconnectMs / COORDINATOR_DISCONNECT_LOGGING_INTERVAL_MS;
|
||||
if (currDisconnectMin > this.totalDisconnectedMin) {
|
||||
if (currDisconnectMin > totalDisconnectedMin) {
|
||||
log.debug("Consumer has been disconnected from the group coordinator for {}ms", durationOfOngoingDisconnectMs);
|
||||
totalDisconnectedMin = currDisconnectMin;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,16 +28,24 @@ import org.apache.kafka.common.requests.AbstractRequest;
|
|||
import org.apache.kafka.common.requests.FindCoordinatorRequest;
|
||||
import org.apache.kafka.common.requests.FindCoordinatorResponse;
|
||||
import org.apache.kafka.common.requests.RequestHeader;
|
||||
import org.apache.kafka.common.utils.LogCaptureAppender;
|
||||
import org.apache.kafka.common.utils.LogContext;
|
||||
import org.apache.kafka.common.utils.MockTime;
|
||||
|
||||
import org.apache.log4j.Level;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
|
@ -75,6 +83,78 @@ public class CoordinatorRequestManagerTest {
|
|||
assertEquals(Collections.emptyList(), pollResult.unsentRequests);
|
||||
}
|
||||
|
||||
/**
|
||||
* This test mimics a client that has been disconnected from the coordinator. When the client remains disconnected
|
||||
* from the coordinator for 60 seconds, the client will begin to emit a warning log every minute thereafter to
|
||||
* alert the user about the ongoing disconnect status. The warning log includes the length of time of the ongoing
|
||||
* disconnect:
|
||||
*
|
||||
* <code>
|
||||
* Consumer has been disconnected from the group coordinator for XXXXXms
|
||||
* </code>
|
||||
*
|
||||
* <p>
|
||||
*
|
||||
* However, the logic used to calculate the length of the disconnect was not correct. This test exercises the
|
||||
* disconnect logic, controlling the logging and system time, to ensure the warning message is correct.
|
||||
*
|
||||
* @see CoordinatorRequestManager#markCoordinatorUnknown(String, long)
|
||||
*/
|
||||
@Test
|
||||
public void testMarkCoordinatorUnknownLoggingAccuracy() {
|
||||
long oneMinute = 60000;
|
||||
|
||||
try (final LogCaptureAppender appender = LogCaptureAppender.createAndRegister()) {
|
||||
// You'd be forgiven for assuming that a warning message would be logged at WARN, but
|
||||
// markCoordinatorUnknown logs the warning at DEBUG. This is partly for historical parity with the
|
||||
// ClassicKafkaConsumer.
|
||||
appender.setClassLogger(CoordinatorRequestManager.class, Level.DEBUG);
|
||||
CoordinatorRequestManager coordinatorRequestManager = setupCoordinatorManager(GROUP_ID);
|
||||
assertFalse(coordinatorRequestManager.coordinator().isPresent());
|
||||
|
||||
// Step 1: mark the coordinator as disconnected right after creation of the CoordinatorRequestManager.
|
||||
// Because the disconnect occurred immediately, no warning should be logged.
|
||||
coordinatorRequestManager.markCoordinatorUnknown("test", time.milliseconds());
|
||||
assertTrue(millisecondsFromLog(appender).isEmpty());
|
||||
|
||||
// Step 2: sleep for one minute and mark the coordinator unknown again. Then verify that the warning was
|
||||
// logged and the reported time is accurate.
|
||||
time.sleep(oneMinute);
|
||||
coordinatorRequestManager.markCoordinatorUnknown("test", time.milliseconds());
|
||||
Optional<Long> firstLogMs = millisecondsFromLog(appender);
|
||||
assertTrue(firstLogMs.isPresent());
|
||||
assertEquals(oneMinute, firstLogMs.get());
|
||||
|
||||
// Step 3: sleep for *another* minute, mark the coordinator unknown again, and verify the accuracy.
|
||||
time.sleep(oneMinute);
|
||||
coordinatorRequestManager.markCoordinatorUnknown("test", time.milliseconds());
|
||||
Optional<Long> secondLogMs = millisecondsFromLog(appender);
|
||||
assertTrue(secondLogMs.isPresent());
|
||||
assertEquals(oneMinute * 2, secondLogMs.get());
|
||||
}
|
||||
}
|
||||
|
||||
private Optional<Long> millisecondsFromLog(LogCaptureAppender appender) {
|
||||
Pattern pattern = Pattern.compile("\\s+(?<millis>\\d+)+ms");
|
||||
List<Long> milliseconds = appender.getMessages().stream()
|
||||
.map(pattern::matcher)
|
||||
.filter(Matcher::find)
|
||||
.map(matcher -> matcher.group("millis"))
|
||||
.filter(Objects::nonNull)
|
||||
.map(millisString -> {
|
||||
try {
|
||||
return Long.parseLong(millisString);
|
||||
} catch (NumberFormatException e) {
|
||||
return null;
|
||||
}
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
// Return the most recent log entry that matches the message in markCoordinatorUnknown, if present.
|
||||
return milliseconds.isEmpty() ? Optional.empty() : Optional.of(milliseconds.get(milliseconds.size() - 1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMarkCoordinatorUnknown() {
|
||||
CoordinatorRequestManager coordinatorManager = setupCoordinatorManager(GROUP_ID);
|
||||
|
|
|
|||
Loading…
Reference in New Issue