mirror of https://github.com/apache/kafka.git
				
				
				
			HOTFIX: move rebalanceInProgress check to skip commit during handleCorrupted (#10444)
Minor followup to #10407 -- we need to extract the rebalanceInProgress check down into the commitAndFillInConsumedOffsetsAndMetadataPerTaskMap method which is invoked during handleCorrupted, otherwise we may attempt to commit during a a rebalance which will fail Reviewers: Matthias J. Sax <mjsax@confluent.io>
This commit is contained in:
		
							parent
							
								
									b35704a843
								
							
						
					
					
						commit
						3eff8d39f1
					
				|  | @ -532,6 +532,9 @@ public class TaskManager { | |||
|         // as such we just need to skip those dirty tasks in the checkpoint | ||||
|         final Set<Task> dirtyTasks = new HashSet<>(); | ||||
|         try { | ||||
|             // in handleRevocation we must call commitOffsetsOrTransaction() directly rather than | ||||
|             // commitAndFillInConsumedOffsetsAndMetadataPerTaskMap() to make sure we don't skip the | ||||
|             // offset commit because we are in a rebalance | ||||
|             commitOffsetsOrTransaction(consumedOffsetsPerTask); | ||||
|         } catch (final TaskCorruptedException e) { | ||||
|             log.warn("Some tasks were corrupted when trying to commit offsets, these will be cleaned and revived: {}", | ||||
|  | @ -1002,28 +1005,34 @@ public class TaskManager { | |||
|      */ | ||||
|     int commit(final Collection<Task> tasksToCommit) { | ||||
|         int committed = 0; | ||||
|         if (rebalanceInProgress) { | ||||
|             committed = -1; | ||||
|         } else { | ||||
|             final Map<Task, Map<TopicPartition, OffsetAndMetadata>> consumedOffsetsAndMetadataPerTask = new HashMap<>(); | ||||
|             try { | ||||
|                 committed = commitAndFillInConsumedOffsetsAndMetadataPerTaskMap(tasksToCommit, consumedOffsetsAndMetadataPerTask); | ||||
|             } catch (final TimeoutException timeoutException) { | ||||
|                 consumedOffsetsAndMetadataPerTask | ||||
|                     .keySet() | ||||
|                     .forEach(t -> t.maybeInitTaskTimeoutOrThrow(time.milliseconds(), timeoutException)); | ||||
|             } | ||||
| 
 | ||||
|         final Map<Task, Map<TopicPartition, OffsetAndMetadata>> consumedOffsetsAndMetadataPerTask = new HashMap<>(); | ||||
|         try { | ||||
|             committed = commitAndFillInConsumedOffsetsAndMetadataPerTaskMap(tasksToCommit, consumedOffsetsAndMetadataPerTask); | ||||
|         } catch (final TimeoutException timeoutException) { | ||||
|             consumedOffsetsAndMetadataPerTask | ||||
|                 .keySet() | ||||
|                 .forEach(t -> t.maybeInitTaskTimeoutOrThrow(time.milliseconds(), timeoutException)); | ||||
|         } | ||||
| 
 | ||||
|         return committed; | ||||
|     } | ||||
| 
 | ||||
|     /** | ||||
|      * @throws TaskMigratedException if committing offsets failed (non-EOS) | ||||
|      *                               or if the task producer got fenced (EOS) | ||||
|      * @throws TimeoutException if committing offsets failed due to TimeoutException (non-EOS) | ||||
|      * @throws TaskCorruptedException if committing offsets failed due to TimeoutException (EOS) | ||||
|      * @param consumedOffsetsAndMetadataPerTask an empty map that will be filled in with the prepared offsets | ||||
|      * @return number of committed offsets, or -1 if we are in the middle of a rebalance and cannot commit | ||||
|      */ | ||||
|     private int commitAndFillInConsumedOffsetsAndMetadataPerTaskMap(final Collection<Task> tasksToCommit, | ||||
|                                                                     final Map<Task, Map<TopicPartition, OffsetAndMetadata>> consumedOffsetsAndMetadataPerTask) { | ||||
|         int committed = 0; | ||||
|         if (rebalanceInProgress) { | ||||
|             return -1; | ||||
|         } | ||||
| 
 | ||||
|         int committed = 0; | ||||
|         for (final Task task : tasksToCommit) { | ||||
|             if (task.commitNeeded()) { | ||||
|                 final Map<TopicPartition, OffsetAndMetadata> offsetAndMetadata = task.prepareCommit(); | ||||
|  | @ -1063,6 +1072,9 @@ public class TaskManager { | |||
|     } | ||||
| 
 | ||||
|     /** | ||||
|      * Caution: do not invoke this directly if it's possible a rebalance is occurring, as the commit will fail. If | ||||
|      * this is a possibility, prefer the {@link #commitAndFillInConsumedOffsetsAndMetadataPerTaskMap} instead. | ||||
|      * | ||||
|      * @throws TaskMigratedException   if committing offsets failed due to CommitFailedException (non-EOS) | ||||
|      * @throws TimeoutException        if committing offsets failed due to TimeoutException (non-EOS) | ||||
|      * @throws TaskCorruptedException  if committing offsets failed due to TimeoutException (EOS) | ||||
|  |  | |||
|  | @ -821,6 +821,57 @@ public class TaskManagerTest { | |||
|         verify(consumer); | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void shouldNotAttemptToCommitInHandleCorruptedDuringARebalance() { | ||||
|         final ProcessorStateManager stateManager = EasyMock.createNiceMock(ProcessorStateManager.class); | ||||
|         expect(stateDirectory.listNonEmptyTaskDirectories()).andStubReturn(new File[0]); | ||||
| 
 | ||||
|         final StateMachineTask corruptedActive = new StateMachineTask(taskId00, taskId00Partitions, true, stateManager); | ||||
| 
 | ||||
|         // make sure this will attempt to be committed and throw | ||||
|         final StateMachineTask uncorruptedActive = new StateMachineTask(taskId01, taskId01Partitions, true, stateManager); | ||||
|         final Map<TopicPartition, OffsetAndMetadata> offsets = singletonMap(t1p1, new OffsetAndMetadata(0L, null)); | ||||
|         uncorruptedActive.setCommitNeeded(); | ||||
| 
 | ||||
|         // handleAssignment | ||||
|         final Map<TaskId, Set<TopicPartition>> assignment = new HashMap<>(); | ||||
|         assignment.putAll(taskId00Assignment); | ||||
|         assignment.putAll(taskId01Assignment); | ||||
|         expect(activeTaskCreator.createTasks(anyObject(), eq(assignment))).andStubReturn(asList(corruptedActive, uncorruptedActive)); | ||||
|         topologyBuilder.addSubscribedTopicsFromAssignment(anyObject(), anyString()); | ||||
|         expectLastCall().anyTimes(); | ||||
|         topologyBuilder.addSubscribedTopicsFromMetadata(eq(singleton(topic1)), anyObject()); | ||||
|         expectLastCall().anyTimes(); | ||||
| 
 | ||||
|         expectRestoreToBeCompleted(consumer, changeLogReader); | ||||
| 
 | ||||
|         expect(consumer.assignment()).andStubReturn(union(HashSet::new, taskId00Partitions, taskId01Partitions)); | ||||
| 
 | ||||
|         replay(activeTaskCreator, standbyTaskCreator, topologyBuilder, consumer, changeLogReader, stateDirectory, stateManager); | ||||
| 
 | ||||
|         uncorruptedActive.setCommittableOffsetsAndMetadata(offsets); | ||||
| 
 | ||||
|         taskManager.handleAssignment(assignment, emptyMap()); | ||||
|         assertThat(taskManager.tryToCompleteRestoration(time.milliseconds(), null), is(true)); | ||||
| 
 | ||||
|         assertThat(uncorruptedActive.state(), is(Task.State.RUNNING)); | ||||
| 
 | ||||
|         assertThat(uncorruptedActive.commitPrepared, is(false)); | ||||
|         assertThat(uncorruptedActive.commitNeeded, is(true)); | ||||
|         assertThat(uncorruptedActive.commitCompleted, is(false)); | ||||
| 
 | ||||
|         taskManager.handleRebalanceStart(singleton(topic1)); | ||||
|         assertThat(taskManager.isRebalanceInProgress(), is(true)); | ||||
|         taskManager.handleCorruption(singleton(taskId00)); | ||||
| 
 | ||||
|         assertThat(uncorruptedActive.commitPrepared, is(false)); | ||||
|         assertThat(uncorruptedActive.commitNeeded, is(true)); | ||||
|         assertThat(uncorruptedActive.commitCompleted, is(false)); | ||||
| 
 | ||||
|         assertThat(uncorruptedActive.state(), is(State.RUNNING)); | ||||
|         verify(consumer); | ||||
|     } | ||||
| 
 | ||||
|     @Test | ||||
|     public void shouldCloseAndReviveUncorruptedTasksWhenTimeoutExceptionThrownFromCommitWithALOS() { | ||||
|         final ProcessorStateManager stateManager = EasyMock.createStrictMock(ProcessorStateManager.class); | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue