# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
from ducktape.mark import matrix
from ducktape.mark.resource import cluster
from kafkatest.services.kafka import quorum
from kafkatest.services.streams import StreamsBrokerDownResilienceService
from kafkatest.tests.streams.base_streams_test import BaseStreamsTest


class StreamsBrokerDownResilience(BaseStreamsTest):
"""
This test validates that Streams is resilient to a broker
being down longer than specified timeouts in configs
"""
inputTopic = "streamsResilienceSource"
outputTopic = "streamsResilienceSink"
client_id = "streams-broker-resilience-verify-consumer"
num_messages = 10000
message = "processed [0-9]* messages"
connected_message = "Discovered group coordinator"

    def __init__(self, test_context):
super(StreamsBrokerDownResilience, self).__init__(test_context,
topics={self.inputTopic: {'partitions': 3, 'replication-factor': 1},
self.outputTopic: {'partitions': 1, 'replication-factor': 1}},
num_brokers=1)

    def setUp(self):
        # do not start kafka here; each test starts the broker itself so it can
        # control when the broker goes down relative to the Streams instances
        pass
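
    # Each test below starts the broker itself, drives the Streams app through a
    # broker outage, and greps the app's log/stdout for the expected messages.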
@cluster(num_nodes=7)
@matrix(metadata_quorum=[quorum.combined_kraft],
group_protocol=["classic", "streams"])
def test_streams_resilient_to_broker_down(self, metadata_quorum, group_protocol):
self.kafka.start()
        # The broker should be down for more than 2x of retries * timeout ms;
        # with (2 * 15000) ms = 30 seconds, we set the downtime to 70 seconds
broker_down_time_in_seconds = 70
processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, self.get_configs(
group_protocol=group_protocol))
processor.start()
self.assert_produce_consume(self.inputTopic,
self.outputTopic,
self.client_id,
"before_broker_stop")
node = self.kafka.leader(self.inputTopic)
self.kafka.stop_node(node)
time.sleep(broker_down_time_in_seconds)
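        # restart the broker while watching the Streams log for the group
        # coordinator rediscovery message that signals the client reconnected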
with processor.node.account.monitor_log(processor.LOG_FILE) as monitor:
self.kafka.start_node(node)
monitor.wait_until(self.connected_message,
timeout_sec=120,
err_msg=("Never saw output '%s' on " % self.connected_message) + str(processor.node.account))
self.assert_produce_consume(self.inputTopic,
self.outputTopic,
self.client_id,
"after_broker_stop",
timeout_sec=120)
self.kafka.stop()

    @cluster(num_nodes=7)
@matrix(metadata_quorum=[quorum.combined_kraft],
group_protocol=["classic", "streams"])
def test_streams_runs_with_broker_down_initially(self, metadata_quorum, group_protocol):
self.kafka.start()
node = self.kafka.leader(self.inputTopic)
self.kafka.stop_node(node)
configs = self.get_configs(group_protocol=group_protocol, extra_configs=",application.id=starting_wo_broker_id")
        # start three streams instances while the broker is still down
processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
processor.start()
processor_2 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
processor_2.start()
processor_3 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
processor_3.start()
broker_unavailable_message = "Node may not be available"
        # verify the streams instances were unable to connect to the broker and kept retrying
self.wait_for_verification(processor, broker_unavailable_message, processor.LOG_FILE, 10)
self.wait_for_verification(processor_2, broker_unavailable_message, processor_2.LOG_FILE, 10)
self.wait_for_verification(processor_3, broker_unavailable_message, processor_3.LOG_FILE, 10)
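
        # bring the broker back and confirm each instance rediscovers the group coordinator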
with processor.node.account.monitor_log(processor.LOG_FILE) as monitor_1:
with processor_2.node.account.monitor_log(processor_2.LOG_FILE) as monitor_2:
with processor_3.node.account.monitor_log(processor_3.LOG_FILE) as monitor_3:
self.kafka.start_node(node)
monitor_1.wait_until(self.connected_message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.connected_message) + str(processor.node.account))
monitor_2.wait_until(self.connected_message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.connected_message) + str(processor_2.node.account))
monitor_3.wait_until(self.connected_message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.connected_message) + str(processor_3.node.account))
with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor_1:
with processor_2.node.account.monitor_log(processor_2.STDOUT_FILE) as monitor_2:
with processor_3.node.account.monitor_log(processor_3.STDOUT_FILE) as monitor_3:
self.assert_produce(self.inputTopic,
"sending_message_after_broker_down_initially",
num_messages=self.num_messages,
timeout_sec=120)
monitor_1.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor.node.account))
monitor_2.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor_2.node.account))
monitor_3.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor_3.node.account))
self.assert_consume(self.client_id,
"consuming_message_after_broker_down_initially",
self.outputTopic,
num_messages=self.num_messages,
timeout_sec=120)
self.kafka.stop()

    @cluster(num_nodes=9)
@matrix(metadata_quorum=[quorum.combined_kraft],
group_protocol=["classic", "streams"])
def test_streams_should_scale_in_while_brokers_down(self, metadata_quorum, group_protocol):
self.kafka.start()
extra_configs = ",application.id=shutdown_with_broker_down"
# TODO KIP-441: consider rewriting the test for HighAvailabilityTaskAssignor
if group_protocol == "classic":
extra_configs += ",internal.task.assignor.class=org.apache.kafka.streams.processor.internals.assignment.LegacyStickyTaskAssignor"
configs = self.get_configs(
group_protocol=group_protocol,
extra_configs=extra_configs
)
processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
processor.start()
processor_2 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
processor_2.start()
processor_3 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        # the third instance joining triggers a rebalance; wait for it to complete once
rebalance = "State transition from REBALANCING to RUNNING"
with processor_3.node.account.monitor_log(processor_3.LOG_FILE) as monitor:
processor_3.start()
monitor.wait_until(rebalance,
timeout_sec=120,
err_msg=("Never saw output '%s' on " % rebalance) + str(processor_3.node.account))
with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor_1:
with processor_2.node.account.monitor_log(processor_2.STDOUT_FILE) as monitor_2:
with processor_3.node.account.monitor_log(processor_3.STDOUT_FILE) as monitor_3:
self.assert_produce(self.inputTopic,
"sending_message_normal_broker_start",
num_messages=self.num_messages,
timeout_sec=120)
monitor_1.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor.node.account))
monitor_2.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor_2.node.account))
monitor_3.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor_3.node.account))
self.assert_consume(self.client_id,
"consuming_message_normal_broker_start",
self.outputTopic,
num_messages=self.num_messages,
timeout_sec=120)
node = self.kafka.leader(self.inputTopic)
self.kafka.stop_node(node)
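        # scale in: shut down two of the three instances while the broker is down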
processor.stop()
processor_2.stop()
shutdown_message = "Complete shutdown of streams resilience test app now"
self.wait_for_verification(processor, shutdown_message, processor.STDOUT_FILE)
self.wait_for_verification(processor_2, shutdown_message, processor_2.STDOUT_FILE)
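
        # restart the broker; the remaining instance should reconnect and keep processing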
with processor_3.node.account.monitor_log(processor_3.LOG_FILE) as monitor_3:
self.kafka.start_node(node)
monitor_3.wait_until(self.connected_message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.connected_message) + str(processor_3.node.account))
self.assert_produce_consume(self.inputTopic,
self.outputTopic,
self.client_id,
"sending_message_after_stopping_streams_instance_bouncing_broker",
num_messages=self.num_messages,
timeout_sec=120)
self.kafka.stop()

    @cluster(num_nodes=9)
@matrix(metadata_quorum=[quorum.combined_kraft],
group_protocol=["classic", "streams"])
def test_streams_should_failover_while_brokers_down(self, metadata_quorum, group_protocol):
self.kafka.start()
extra_configs = ",application.id=shutdown_with_broker_down"
# TODO KIP-441: consider rewriting the test for HighAvailabilityTaskAssignor
if group_protocol == "classic":
extra_configs += ",internal.task.assignor.class=org.apache.kafka.streams.processor.internals.assignment.LegacyStickyTaskAssignor"
configs = self.get_configs(
group_protocol=group_protocol,
extra_configs=extra_configs
)
processor = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
processor.start()
processor_2 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
processor_2.start()
processor_3 = StreamsBrokerDownResilienceService(self.test_context, self.kafka, configs)
        # the third instance joining triggers a rebalance; wait for it to complete once
rebalance = "State transition from REBALANCING to RUNNING"
with processor_3.node.account.monitor_log(processor_3.LOG_FILE) as monitor:
processor_3.start()
monitor.wait_until(rebalance,
timeout_sec=120,
err_msg=("Never saw output '%s' on " % rebalance) + str(processor_3.node.account))
with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor_1:
with processor_2.node.account.monitor_log(processor_2.STDOUT_FILE) as monitor_2:
with processor_3.node.account.monitor_log(processor_3.STDOUT_FILE) as monitor_3:
self.assert_produce(self.inputTopic,
"sending_message_after_normal_broker_start",
num_messages=self.num_messages,
timeout_sec=120)
monitor_1.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor.node.account))
monitor_2.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor_2.node.account))
monitor_3.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor_3.node.account))
self.assert_consume(self.client_id,
"consuming_message_after_normal_broker_start",
self.outputTopic,
num_messages=self.num_messages,
timeout_sec=120)
node = self.kafka.leader(self.inputTopic)
self.kafka.stop_node(node)
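        # hard-bounce (abort and restart) every instance while the broker is down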
processor.abortThenRestart()
processor_2.abortThenRestart()
processor_3.abortThenRestart()
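
        # restart the broker and confirm every bounced instance reconnects to the group coordinator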
with processor.node.account.monitor_log(processor.LOG_FILE) as monitor_1:
with processor_2.node.account.monitor_log(processor_2.LOG_FILE) as monitor_2:
with processor_3.node.account.monitor_log(processor_3.LOG_FILE) as monitor_3:
self.kafka.start_node(node)
monitor_1.wait_until(self.connected_message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.connected_message) + str(processor.node.account))
monitor_2.wait_until(self.connected_message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.connected_message) + str(processor_2.node.account))
monitor_3.wait_until(self.connected_message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.connected_message) + str(processor_3.node.account))
with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor_1:
with processor_2.node.account.monitor_log(processor_2.STDOUT_FILE) as monitor_2:
with processor_3.node.account.monitor_log(processor_3.STDOUT_FILE) as monitor_3:
self.assert_produce(self.inputTopic,
"sending_message_after_hard_bouncing_streams_instance_bouncing_broker",
num_messages=self.num_messages,
timeout_sec=120)
monitor_1.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor.node.account))
monitor_2.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor_2.node.account))
monitor_3.wait_until(self.message,
timeout_sec=120,
err_msg=("Never saw '%s' on " % self.message) + str(processor_3.node.account))
self.assert_consume(self.client_id,
"consuming_message_after_stopping_streams_instance_bouncing_broker",
self.outputTopic,
num_messages=self.num_messages,
timeout_sec=120)
self.kafka.stop()