mirror of https://github.com/apache/kafka.git
KAFKA-19072: Add system test for ELR (#19344)
Verified locally. ``` test_id: kafkatest.tests.core.eligible_leader_replicas_test.EligibleLeaderReplicasTest.test_basic_eligible_leader_replicas.metadata_quorum=ISOLATED_KRAFT.group_protocol=classic status: PASS run time: 2 minutes 6.074 seconds -------------------------------------------------------------------------------- test_id: kafkatest.tests.core.eligible_leader_replicas_test.EligibleLeaderReplicasTest.test_basic_eligible_leader_replicas.metadata_quorum=ISOLATED_KRAFT.group_protocol=consumer status: PASS run time: 2 minutes 6.611 seconds -------------------------------------------------------------------------------- ``` Reviewers: Justine Olshan <jolshan@confluent.io>
This commit is contained in:
parent
cee55dbdec
commit
e3430fef88
|
@ -0,0 +1,161 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from kafkatest.services.kafka import KafkaService, quorum, consumer_group
|
||||
from kafkatest.services.console_consumer import ConsoleConsumer
|
||||
from kafkatest.services.verifiable_producer import VerifiableProducer
|
||||
from kafkatest.utils import is_int
|
||||
|
||||
from ducktape.tests.test import Test
|
||||
from ducktape.mark import matrix
|
||||
from ducktape.mark.resource import cluster
|
||||
from ducktape.utils.util import wait_until
|
||||
|
||||
import time
|
||||
|
||||
class EligibleLeaderReplicasTest(Test):
|
||||
"""
|
||||
Eligible leader replicas test verifies the ELR election can happen after all the replicas are offline.
|
||||
The partition will first clean shutdown 2 replicas and unclean shutdown the leader. Then the two clean shutdown
|
||||
replicas are started. One of these 2 replicas should be in ELR and become the leader.
|
||||
"""
|
||||
|
||||
def __init__(self, test_context):
|
||||
""":type test_context: ducktape.tests.test.TestContext"""
|
||||
super(EligibleLeaderReplicasTest, self).__init__(test_context=test_context)
|
||||
|
||||
self.topic = "input-topic"
|
||||
|
||||
self.num_brokers = 6
|
||||
|
||||
# Test parameters
|
||||
self.num_partitions = 1
|
||||
self.num_seed_messages = 10000
|
||||
|
||||
self.progress_timeout_sec = 60
|
||||
self.consumer_group = "elr-test-consumer-group"
|
||||
self.broker_startup_timeout_sec = 120
|
||||
|
||||
def seed_messages(self, topic, num_seed_messages):
|
||||
seed_timeout_sec = 10000
|
||||
seed_producer = VerifiableProducer(context=self.test_context,
|
||||
num_nodes=1,
|
||||
kafka=self.kafka,
|
||||
topic=topic,
|
||||
message_validator=is_int,
|
||||
max_messages=num_seed_messages,
|
||||
enable_idempotence=True)
|
||||
seed_producer.start()
|
||||
wait_until(lambda: seed_producer.num_acked >= num_seed_messages,
|
||||
timeout_sec=seed_timeout_sec,
|
||||
err_msg="Producer failed to produce messages %d in %ds." % \
|
||||
(self.num_seed_messages, seed_timeout_sec))
|
||||
return seed_producer.acked
|
||||
|
||||
def get_messages_from_topic(self, topic, num_messages, group_protocol):
|
||||
consumer = self.start_consumer(topic, group_id="verifying_consumer", group_protocol=group_protocol)
|
||||
return self.drain_consumer(consumer, num_messages)
|
||||
|
||||
def stop_broker(self, node, clean_shutdown):
|
||||
if clean_shutdown:
|
||||
self.kafka.stop_node(node, clean_shutdown = True, timeout_sec = self.broker_startup_timeout_sec)
|
||||
else:
|
||||
self.kafka.stop_node(node, clean_shutdown = False)
|
||||
gracePeriodSecs = 5
|
||||
brokerSessionTimeoutSecs = 18
|
||||
wait_until(lambda: not self.kafka.pids(node),
|
||||
timeout_sec=brokerSessionTimeoutSecs + gracePeriodSecs,
|
||||
err_msg="Failed to see timely disappearance of process for hard-killed broker %s" % str(node.account))
|
||||
time.sleep(brokerSessionTimeoutSecs + gracePeriodSecs)
|
||||
|
||||
def start_consumer(self, topic_to_read, group_id, group_protocol):
|
||||
consumer = ConsoleConsumer(context=self.test_context,
|
||||
num_nodes=1,
|
||||
kafka=self.kafka,
|
||||
topic=topic_to_read,
|
||||
group_id=group_id,
|
||||
message_validator=is_int,
|
||||
from_beginning=True,
|
||||
isolation_level="read_committed",
|
||||
consumer_properties=consumer_group.maybe_set_group_protocol(group_protocol))
|
||||
consumer.start()
|
||||
# ensure that the consumer is up.
|
||||
wait_until(lambda: (len(consumer.messages_consumed[1]) > 0) == True,
|
||||
timeout_sec=180,
|
||||
err_msg="Consumer failed to consume any messages for %ds" % \
|
||||
60)
|
||||
return consumer
|
||||
|
||||
def drain_consumer(self, consumer, num_messages):
|
||||
# wait until we read at least the expected number of messages.
|
||||
wait_until(lambda: len(consumer.messages_consumed[1]) >= num_messages,
|
||||
timeout_sec=90,
|
||||
err_msg="Consumer consumed only %d out of %d messages in %ds" % \
|
||||
(len(consumer.messages_consumed[1]), num_messages, 90))
|
||||
consumer.stop()
|
||||
return consumer.messages_consumed[1]
|
||||
|
||||
def setup_topics(self):
|
||||
self.kafka.topics = {
|
||||
self.topic: {
|
||||
"partitions": self.num_partitions,
|
||||
"replication-factor": 3,
|
||||
"configs": {
|
||||
"min.insync.replicas": 2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@cluster(num_nodes=9)
|
||||
@matrix(metadata_quorum=[quorum.isolated_kraft],
|
||||
group_protocol=consumer_group.all_group_protocols)
|
||||
def test_basic_eligible_leader_replicas(self, metadata_quorum, group_protocol=None):
|
||||
self.kafka = KafkaService(self.test_context,
|
||||
num_nodes=self.num_brokers,
|
||||
zk=None,
|
||||
controller_num_nodes_override=1)
|
||||
security_protocol = 'PLAINTEXT'
|
||||
|
||||
self.kafka.security_protocol = security_protocol
|
||||
self.kafka.interbroker_security_protocol = security_protocol
|
||||
self.kafka.logs["kafka_data_1"]["collect_default"] = True
|
||||
self.kafka.logs["kafka_data_2"]["collect_default"] = True
|
||||
self.kafka.logs["kafka_operational_logs_debug"]["collect_default"] = True
|
||||
|
||||
self.setup_topics()
|
||||
self.kafka.start(timeout_sec = self.broker_startup_timeout_sec)
|
||||
|
||||
self.kafka.run_features_command("upgrade", "eligible.leader.replicas.version", 1)
|
||||
input_messages = self.seed_messages(self.topic, self.num_seed_messages)
|
||||
isr = self.kafka.isr_idx_list(self.topic, 0)
|
||||
|
||||
self.stop_broker(self.kafka.nodes[isr[1] - 1], True)
|
||||
self.stop_broker(self.kafka.nodes[isr[2] - 1], True)
|
||||
|
||||
wait_until(lambda: len(self.kafka.isr_idx_list(self.topic)) == 1,
|
||||
timeout_sec=60,
|
||||
err_msg="Timed out waiting for the partition to have only 1 ISR")
|
||||
|
||||
self.stop_broker(self.kafka.nodes[isr[0] - 1], False)
|
||||
|
||||
self.kafka.start_node(self.kafka.nodes[isr[1] - 1], timeout_sec = self.broker_startup_timeout_sec)
|
||||
self.kafka.start_node(self.kafka.nodes[isr[2] - 1], timeout_sec = self.broker_startup_timeout_sec)
|
||||
|
||||
output_messages_set = set(self.get_messages_from_topic(self.topic, self.num_seed_messages, group_protocol))
|
||||
input_message_set = set(input_messages)
|
||||
|
||||
assert input_message_set == output_messages_set, \
|
||||
"Input and concurrently consumed output message sets are not equal. Num input messages: %d. Num concurrently_consumed_messages: %d" % \
|
||||
(len(input_message_set), len(output_messages_set))
|
Loading…
Reference in New Issue