mirror of https://github.com/apache/kafka.git
KAFKA-9198; Complete purgatory operations on receiving StopReplica (#7701)
Force completion of delayed operations when receiving a StopReplica request. In the case of a partition reassignment, we cannot rely on receiving a LeaderAndIsr request in order to complete these operations because the leader may no longer be a replica. Previously when this happened, the delayed operations were left to eventually timeout. Reviewers: Stanislav Kozlovski <stanislav_kozlovski@outlook.com>, Ismael Juma <ismael@juma.me.uk> Co-Authored-By: Kun Du <kidkun@users.noreply.github.com>
This commit is contained in:
parent
305f134abf
commit
6ae1af842a
|
@ -29,8 +29,11 @@ import scala.collection._
|
|||
|
||||
case class FetchPartitionStatus(startOffsetMetadata: LogOffsetMetadata, fetchInfo: PartitionData) {
|
||||
|
||||
override def toString = "[startOffsetMetadata: " + startOffsetMetadata + ", " +
|
||||
"fetchInfo: " + fetchInfo + "]"
|
||||
override def toString: String = {
|
||||
"[startOffsetMetadata: " + startOffsetMetadata +
|
||||
", fetchInfo: " + fetchInfo +
|
||||
"]"
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
package kafka.server
|
||||
|
||||
|
||||
import java.util.concurrent.TimeUnit
|
||||
import java.util.concurrent.locks.Lock
|
||||
|
||||
|
@ -86,17 +85,15 @@ class DelayedProduce(delayMs: Long,
|
|||
trace(s"Checking produce satisfaction for $topicPartition, current status $status")
|
||||
// skip those partitions that have already been satisfied
|
||||
if (status.acksPending) {
|
||||
val (hasEnough, error) = replicaManager.getPartition(topicPartition) match {
|
||||
case HostedPartition.Online(partition) =>
|
||||
partition.checkEnoughReplicasReachOffset(status.requiredOffset)
|
||||
|
||||
case HostedPartition.Offline =>
|
||||
(false, Errors.KAFKA_STORAGE_ERROR)
|
||||
|
||||
case HostedPartition.None =>
|
||||
val (hasEnough, error) = replicaManager.getPartitionOrError(topicPartition, expectLeader = true) match {
|
||||
case Left(err) =>
|
||||
// Case A
|
||||
(false, Errors.UNKNOWN_TOPIC_OR_PARTITION)
|
||||
(false, err)
|
||||
|
||||
case Right(partition) =>
|
||||
partition.checkEnoughReplicasReachOffset(status.requiredOffset)
|
||||
}
|
||||
|
||||
// Case B.1 || B.2
|
||||
if (error != Errors.NONE || hasEnough) {
|
||||
status.acksPending = false
|
||||
|
|
|
@ -383,9 +383,20 @@ class ReplicaManager(val config: KafkaConfig,
|
|||
if (logManager.getLog(topicPartition, isFuture = true).isDefined)
|
||||
logManager.asyncDelete(topicPartition, isFuture = true)
|
||||
}
|
||||
|
||||
// If we were the leader, we may have some operations still waiting for completion.
|
||||
// We force completion to prevent them from timing out.
|
||||
completeDelayedFetchOrProduceRequests(topicPartition)
|
||||
|
||||
stateChangeLogger.trace(s"Finished handling stop replica (delete=$deletePartition) for partition $topicPartition")
|
||||
}
|
||||
|
||||
private def completeDelayedFetchOrProduceRequests(topicPartition: TopicPartition): Unit = {
|
||||
val topicPartitionOperationKey = TopicPartitionOperationKey(topicPartition)
|
||||
delayedProducePurgatory.checkAndComplete(topicPartitionOperationKey)
|
||||
delayedFetchPurgatory.checkAndComplete(topicPartitionOperationKey)
|
||||
}
|
||||
|
||||
def stopReplicas(stopReplicaRequest: StopReplicaRequest): (mutable.Map[TopicPartition, Errors], Errors) = {
|
||||
replicaStateChangeLock synchronized {
|
||||
val responseMap = new collection.mutable.HashMap[TopicPartition, Errors]
|
||||
|
@ -454,12 +465,24 @@ class ReplicaManager(val config: KafkaConfig,
|
|||
}
|
||||
|
||||
def getPartitionOrException(topicPartition: TopicPartition, expectLeader: Boolean): Partition = {
|
||||
getPartitionOrError(topicPartition, expectLeader) match {
|
||||
case Left(Errors.KAFKA_STORAGE_ERROR) =>
|
||||
throw new KafkaStorageException(s"Partition $topicPartition is in an offline log directory")
|
||||
|
||||
case Left(error) =>
|
||||
throw error.exception(s"Error while fetching partition state for $topicPartition")
|
||||
|
||||
case Right(partition) => partition
|
||||
}
|
||||
}
|
||||
|
||||
def getPartitionOrError(topicPartition: TopicPartition, expectLeader: Boolean): Either[Errors, Partition] = {
|
||||
getPartition(topicPartition) match {
|
||||
case HostedPartition.Online(partition) =>
|
||||
partition
|
||||
Right(partition)
|
||||
|
||||
case HostedPartition.Offline =>
|
||||
throw new KafkaStorageException(s"Partition $topicPartition is in an offline log directory")
|
||||
Left(Errors.KAFKA_STORAGE_ERROR)
|
||||
|
||||
case HostedPartition.None if metadataCache.contains(topicPartition) =>
|
||||
if (expectLeader) {
|
||||
|
@ -467,13 +490,13 @@ class ReplicaManager(val config: KafkaConfig,
|
|||
// forces clients to refresh metadata to find the new location. This can happen, for example,
|
||||
// during a partition reassignment if a produce request from the client is sent to a broker after
|
||||
// the local replica has been deleted.
|
||||
throw new NotLeaderForPartitionException(s"Broker $localBrokerId is not a replica of $topicPartition")
|
||||
Left(Errors.NOT_LEADER_FOR_PARTITION)
|
||||
} else {
|
||||
throw new ReplicaNotAvailableException(s"Partition $topicPartition is not available")
|
||||
Left(Errors.REPLICA_NOT_AVAILABLE)
|
||||
}
|
||||
|
||||
case HostedPartition.None =>
|
||||
throw new UnknownTopicOrPartitionException(s"Partition $topicPartition doesn't exist")
|
||||
Left(Errors.UNKNOWN_TOPIC_OR_PARTITION)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1484,9 +1507,7 @@ class ReplicaManager(val config: KafkaConfig,
|
|||
}
|
||||
|
||||
partitionsToMakeFollower.foreach { partition =>
|
||||
val topicPartitionOperationKey = TopicPartitionOperationKey(partition.topicPartition)
|
||||
delayedProducePurgatory.checkAndComplete(topicPartitionOperationKey)
|
||||
delayedFetchPurgatory.checkAndComplete(topicPartitionOperationKey)
|
||||
completeDelayedFetchOrProduceRequests(partition.topicPartition)
|
||||
}
|
||||
|
||||
partitionsToMakeFollower.foreach { partition =>
|
||||
|
|
|
@ -49,6 +49,7 @@ import org.apache.kafka.common.{IsolationLevel, Node, TopicPartition}
|
|||
import org.easymock.EasyMock
|
||||
import org.junit.Assert._
|
||||
import org.junit.{After, Before, Test}
|
||||
import org.mockito.Mockito
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.{Map, Seq}
|
||||
|
@ -944,14 +945,14 @@ class ReplicaManagerTest {
|
|||
Optional.of(0))
|
||||
var fetchResult = sendConsumerFetch(replicaManager, tp0, partitionData, Some(clientMetadata))
|
||||
assertNotNull(fetchResult.get)
|
||||
assertEquals(fetchResult.get.error, Errors.NONE)
|
||||
assertEquals(Errors.NONE, fetchResult.get.error)
|
||||
|
||||
// Fetch from follower, with empty ClientMetadata (which implies an older version)
|
||||
partitionData = new FetchRequest.PartitionData(0L, 0L, 100,
|
||||
Optional.of(0))
|
||||
fetchResult = sendConsumerFetch(replicaManager, tp0, partitionData, None)
|
||||
assertNotNull(fetchResult.get)
|
||||
assertEquals(fetchResult.get.error, Errors.NOT_LEADER_FOR_PARTITION)
|
||||
assertEquals(Errors.NOT_LEADER_FOR_PARTITION, fetchResult.get.error)
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -999,7 +1000,7 @@ class ReplicaManagerTest {
|
|||
replicaManager.becomeLeaderOrFollower(0, becomeFollowerRequest, (_, _) => ())
|
||||
|
||||
assertNotNull(fetchResult.get)
|
||||
assertEquals(fetchResult.get.error, Errors.NOT_LEADER_FOR_PARTITION)
|
||||
assertEquals(Errors.NOT_LEADER_FOR_PARTITION, fetchResult.get.error)
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -1048,7 +1049,7 @@ class ReplicaManagerTest {
|
|||
replicaManager.becomeLeaderOrFollower(0, becomeFollowerRequest, (_, _) => ())
|
||||
|
||||
assertNotNull(fetchResult.get)
|
||||
assertEquals(fetchResult.get.error, Errors.FENCED_LEADER_EPOCH)
|
||||
assertEquals(Errors.FENCED_LEADER_EPOCH, fetchResult.get.error)
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -1079,13 +1080,114 @@ class ReplicaManagerTest {
|
|||
Optional.of(1))
|
||||
var fetchResult = sendConsumerFetch(replicaManager, tp0, partitionData, Some(clientMetadata))
|
||||
assertNotNull(fetchResult.get)
|
||||
assertEquals(fetchResult.get.error, Errors.NONE)
|
||||
assertEquals(Errors.NONE, fetchResult.get.error)
|
||||
|
||||
partitionData = new FetchRequest.PartitionData(0L, 0L, 100,
|
||||
Optional.empty())
|
||||
fetchResult = sendConsumerFetch(replicaManager, tp0, partitionData, Some(clientMetadata))
|
||||
assertNotNull(fetchResult.get)
|
||||
assertEquals(fetchResult.get.error, Errors.NONE)
|
||||
assertEquals(Errors.NONE, fetchResult.get.error)
|
||||
}
|
||||
|
||||
@Test
|
||||
def testClearFetchPurgatoryOnStopReplica(): Unit = {
|
||||
// As part of a reassignment, we may send StopReplica to the old leader.
|
||||
// In this case, we should ensure that pending purgatory operations are cancelled
|
||||
// immediately rather than sitting around to timeout.
|
||||
|
||||
val mockTimer = new MockTimer
|
||||
val replicaManager = setupReplicaManagerWithMockedPurgatories(mockTimer, aliveBrokerIds = Seq(0, 1))
|
||||
|
||||
val tp0 = new TopicPartition(topic, 0)
|
||||
val offsetCheckpoints = new LazyOffsetCheckpoints(replicaManager.highWatermarkCheckpoints)
|
||||
replicaManager.createPartition(tp0).createLogIfNotExists(0, isNew = false, isFutureReplica = false, offsetCheckpoints)
|
||||
val partition0Replicas = Seq[Integer](0, 1).asJava
|
||||
|
||||
val becomeLeaderRequest = new LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, 0, 0, brokerEpoch,
|
||||
Seq(new LeaderAndIsrPartitionState()
|
||||
.setTopicName(tp0.topic)
|
||||
.setPartitionIndex(tp0.partition)
|
||||
.setControllerEpoch(0)
|
||||
.setLeader(0)
|
||||
.setLeaderEpoch(1)
|
||||
.setIsr(partition0Replicas)
|
||||
.setZkVersion(0)
|
||||
.setReplicas(partition0Replicas)
|
||||
.setIsNew(true)).asJava,
|
||||
Set(new Node(0, "host1", 0), new Node(1, "host2", 1)).asJava).build()
|
||||
replicaManager.becomeLeaderOrFollower(1, becomeLeaderRequest, (_, _) => ())
|
||||
|
||||
val partitionData = new FetchRequest.PartitionData(0L, 0L, 100,
|
||||
Optional.of(1))
|
||||
val fetchResult = sendConsumerFetch(replicaManager, tp0, partitionData, None, timeout = 10)
|
||||
assertNull(fetchResult.get)
|
||||
|
||||
Mockito.when(replicaManager.metadataCache.contains(tp0)).thenReturn(true)
|
||||
|
||||
// We have a fetch in purgatory, now receive a stop replica request and
|
||||
// assert that the fetch returns with a NOT_LEADER error
|
||||
|
||||
replicaManager.stopReplica(tp0, deletePartition = true)
|
||||
assertNotNull(fetchResult.get)
|
||||
assertEquals(Errors.NOT_LEADER_FOR_PARTITION, fetchResult.get.error)
|
||||
}
|
||||
|
||||
@Test
|
||||
def testClearProducePurgatoryOnStopReplica(): Unit = {
|
||||
val mockTimer = new MockTimer
|
||||
val replicaManager = setupReplicaManagerWithMockedPurgatories(mockTimer, aliveBrokerIds = Seq(0, 1))
|
||||
|
||||
val tp0 = new TopicPartition(topic, 0)
|
||||
val offsetCheckpoints = new LazyOffsetCheckpoints(replicaManager.highWatermarkCheckpoints)
|
||||
replicaManager.createPartition(tp0).createLogIfNotExists(0, isNew = false, isFutureReplica = false, offsetCheckpoints)
|
||||
val partition0Replicas = Seq[Integer](0, 1).asJava
|
||||
|
||||
val becomeLeaderRequest = new LeaderAndIsrRequest.Builder(ApiKeys.LEADER_AND_ISR.latestVersion, 0, 0, brokerEpoch,
|
||||
Seq(new LeaderAndIsrPartitionState()
|
||||
.setTopicName(tp0.topic)
|
||||
.setPartitionIndex(tp0.partition)
|
||||
.setControllerEpoch(0)
|
||||
.setLeader(0)
|
||||
.setLeaderEpoch(1)
|
||||
.setIsr(partition0Replicas)
|
||||
.setZkVersion(0)
|
||||
.setReplicas(partition0Replicas)
|
||||
.setIsNew(true)).asJava,
|
||||
Set(new Node(0, "host1", 0), new Node(1, "host2", 1)).asJava).build()
|
||||
replicaManager.becomeLeaderOrFollower(1, becomeLeaderRequest, (_, _) => ())
|
||||
|
||||
val produceResult = sendProducerAppend(replicaManager, tp0)
|
||||
assertNull(produceResult.get)
|
||||
|
||||
Mockito.when(replicaManager.metadataCache.contains(tp0)).thenReturn(true)
|
||||
|
||||
replicaManager.stopReplica(tp0, deletePartition = true)
|
||||
assertNotNull(produceResult.get)
|
||||
assertEquals(Errors.NOT_LEADER_FOR_PARTITION, produceResult.get.error)
|
||||
}
|
||||
|
||||
private def sendProducerAppend(replicaManager: ReplicaManager,
|
||||
topicPartition: TopicPartition): AtomicReference[PartitionResponse] = {
|
||||
val produceResult = new AtomicReference[PartitionResponse]()
|
||||
def callback(response: Map[TopicPartition, PartitionResponse]): Unit = {
|
||||
produceResult.set(response(topicPartition))
|
||||
}
|
||||
|
||||
val records = MemoryRecords.withRecords(CompressionType.NONE,
|
||||
new SimpleRecord("a".getBytes()),
|
||||
new SimpleRecord("b".getBytes()),
|
||||
new SimpleRecord("c".getBytes())
|
||||
)
|
||||
|
||||
replicaManager.appendRecords(
|
||||
timeout = 10,
|
||||
requiredAcks = -1,
|
||||
internalTopicsAllowed = false,
|
||||
isFromClient = true,
|
||||
entriesPerPartition = Map(topicPartition -> records),
|
||||
responseCallback = callback
|
||||
)
|
||||
produceResult
|
||||
}
|
||||
|
||||
private def sendConsumerFetch(replicaManager: ReplicaManager,
|
||||
|
@ -1370,14 +1472,14 @@ class ReplicaManagerTest {
|
|||
val logProps = new Properties()
|
||||
val mockLogMgr = TestUtils.createLogManager(config.logDirs.map(new File(_)), LogConfig(logProps))
|
||||
val aliveBrokers = aliveBrokerIds.map(brokerId => createBroker(brokerId, s"host$brokerId", brokerId))
|
||||
val metadataCache: MetadataCache = EasyMock.createMock(classOf[MetadataCache])
|
||||
EasyMock.expect(metadataCache.getAliveBrokers).andReturn(aliveBrokers).anyTimes()
|
||||
|
||||
val metadataCache: MetadataCache = Mockito.mock(classOf[MetadataCache])
|
||||
Mockito.when(metadataCache.getAliveBrokers).thenReturn(aliveBrokers)
|
||||
|
||||
aliveBrokerIds.foreach { brokerId =>
|
||||
EasyMock.expect(metadataCache.getAliveBroker(EasyMock.eq(brokerId)))
|
||||
.andReturn(Option(createBroker(brokerId, s"host$brokerId", brokerId)))
|
||||
.anyTimes()
|
||||
Mockito.when(metadataCache.getAliveBroker(brokerId))
|
||||
.thenReturn(Option(createBroker(brokerId, s"host$brokerId", brokerId)))
|
||||
}
|
||||
EasyMock.replay(metadataCache)
|
||||
|
||||
val mockProducePurgatory = new DelayedOperationPurgatory[DelayedProduce](
|
||||
purgatoryName = "Produce", timer, reaperEnabled = false)
|
||||
|
|
Loading…
Reference in New Issue