diff --git a/streams/src/main/java/org/apache/kafka/streams/Topology.java b/streams/src/main/java/org/apache/kafka/streams/Topology.java index 01abf4a8b68..830b050cc66 100644 --- a/streams/src/main/java/org/apache/kafka/streams/Topology.java +++ b/streams/src/main/java/org/apache/kafka/streams/Topology.java @@ -23,12 +23,16 @@ import org.apache.kafka.streams.internals.AutoOffsetResetInternal; import org.apache.kafka.streams.kstream.KStream; import org.apache.kafka.streams.kstream.KTable; import org.apache.kafka.streams.processor.ConnectedStoreProvider; +import org.apache.kafka.streams.processor.Punctuator; import org.apache.kafka.streams.processor.StateStore; import org.apache.kafka.streams.processor.StreamPartitioner; import org.apache.kafka.streams.processor.TimestampExtractor; import org.apache.kafka.streams.processor.TopicNameExtractor; import org.apache.kafka.streams.processor.api.Processor; +import org.apache.kafka.streams.processor.api.ProcessorContext; import org.apache.kafka.streams.processor.api.ProcessorSupplier; +import org.apache.kafka.streams.processor.api.Record; +import org.apache.kafka.streams.processor.api.RecordMetadata; import org.apache.kafka.streams.processor.internals.InternalTopologyBuilder; import org.apache.kafka.streams.processor.internals.StoreDelegatingProcessorSupplier; import org.apache.kafka.streams.query.StateQueryRequest; @@ -38,6 +42,8 @@ import java.util.Objects; import java.util.Set; import java.util.regex.Pattern; +import static org.apache.kafka.streams.internals.ApiUtils.checkSupplier; + /** * A logical representation of a {@code ProcessorTopology}. * A topology is a graph of sources, processors, and sinks. 
@@ -80,13 +86,13 @@ public class Topology { } @Deprecated - private static AutoOffsetResetInternal convertOldToNew(final Topology.AutoOffsetReset resetPolicy) { + private static AutoOffsetResetInternal convertOldToNew(final AutoOffsetReset resetPolicy) { if (resetPolicy == null) { return null; } return new AutoOffsetResetInternal( - resetPolicy == org.apache.kafka.streams.Topology.AutoOffsetReset.EARLIEST + resetPolicy == AutoOffsetReset.EARLIEST ? org.apache.kafka.streams.AutoOffsetReset.earliest() : org.apache.kafka.streams.AutoOffsetReset.latest() ); @@ -572,25 +578,58 @@ public class Topology { /** * Add a {@link Processor processor} that receives and processed records from one or more parent processors or * {@link #addSource(String, String...) sources}. + * The {@link Processor} can emit any number of result records via {@link ProcessorContext#forward(Record)}. * Any record output by this processor will be forwarded to its child processors and * {@link #addSink(String, String, String...) sinks}. * *
By default, the processor is stateless. - * There is three different {@link StateStore state stores}, which can be connected to a processor: + * There are two different {@link StateStore state stores}, which can be added to the {@link Topology} and directly + * connected to a processor, making the processor stateful: *
All state stores which are connected to a processor and all global stores, can be accessed via + * {@link ProcessorContext#getStateStore(String) context.getStateStore(String)} + * using the context provided via + * {@link Processor#init(ProcessorContext) Processor#init()}: + * + *
{@code + * public class MyProcessor implements Processor+ * + * Furthermore, the provided {@link ProcessorContext} gives access to topology, runtime, and + * {@link RecordMetadata record metadata}, and allows to schedule {@link Punctuator punctuations} and to + * request offset commits. * * @param name * the unique name of the processor used to reference this node when adding other processor or * {@link #addSink(String, String, String...) sink} children - * @param supplier + * @param processorSupplier * the supplier used to obtain {@link Processor} instances * @param parentNames * the name of one or more processors or {@link #addSource(String, String...) sources}, @@ -601,13 +640,17 @@ public class Topology { * @throws TopologyException * if the provided processor name is not unique, or * if a parent processor/source name is unknown or specifies a sink + * @throws NullPointerException + * if {@code name}, {@code processorSupplier}, or {@code parentNames} is {@code null}, or + * {@code parentNames} contains a {@code null} parent name * * @see org.apache.kafka.streams.processor.api.ContextualProcessor ContextualProcessor */ public synchronized{ + * private ProcessorContext context; + * private KeyValueStore store; + * + * @Override + * void init(final ProcessorContext context) { + * this.context = context; + * this.store = context.getStateStore("myStore"); + * } + * + * @Override + * void process(final Record record) { + * // can access this.context and this.store + * } + * } + * }
For example, you can use this transformation to set a key for a key-less input record {@code The example below counts the number of tokens of the value string.
* The example below normalizes the String key to upper-case letters and counts the number of tokens of the
* value string.
@@ -262,7 +266,7 @@ public interface KStream The example below splits input records {@code The example below splits input records {@code {@code Foreach} is a terminal operation that may trigger side effects (such as logging or statistics
* collection) and returns {@code void} (cf. {@link #peek(ForeachAction)}).
@@ -412,7 +416,7 @@ public interface KStream {@code Peek} is a non-terminal operation that may trigger side effects (such as logging or statistics
* collection) and returns an unchanged {@code KStream} (cf. {@link #foreach(ForeachAction)}).
@@ -534,7 +538,7 @@ public interface KStream
- * In order for the processor to use state stores, the stores must be added to the topology and connected to the
- * processor using at least one of two strategies (though it's not required to connect global state stores; read-only
- * access to global state stores is available by default).
- *
- * The first strategy is to manually add the {@link StoreBuilder}s via {@link Topology#addStateStore(StoreBuilder, String...)},
- * and specify the store names via {@code stateStoreNames} so they will be connected to the processor.
+ * {@link ProcessorSupplier}) to each input record.
+ * The {@link Processor} can emit any number of result records via {@link ProcessorContext#forward(Record)}
+ * (possibly of a different key and/or value type).
+ *
+ * By default, the processor is stateless (similar to {@link #flatMap(KeyValueMapper, Named)}, however, it also
+ * has access to the {@link Record record's} timestamp and headers), but previously added
+ * {@link StateStore state stores} can be connected by providing their names as additional parameters, making
+ * the processor stateful.
+ * There are two different {@link StateStore state stores}, which can be added to the underlying {@link Topology}:
All state stores that are connected to a processor, as well as all global stores, can be accessed via
+ * {@link ProcessorContext#getStateStore(String) context.getStateStore(String)}
+ * using the context provided via
+ * {@link Processor#init(ProcessorContext) Processor#init()}:
+ *
*
- * With either strategy, within the {@link Processor}, the state is obtained via the {@link ProcessorContext}.
- * To trigger periodic actions via {@link org.apache.kafka.streams.processor.Punctuator#punctuate(long) punctuate()},
- * a schedule must be registered.
- * In contrast to grouping/aggregation and joins, even if the processor is stateful and an upstream operation
+ * was key changing, no auto-repartition is triggered.
* If repartitioning is required, a call to {@link #repartition()} should be performed before {@code process()}.
- *
- * Processing records might result in an internal data redistribution if a key-based operator (like an aggregation
- * or join) is applied to the result {@code KStream}.
- * (cf. {@link #processValues(FixedKeyProcessorSupplier, String...)})
+ * At the same time, this method is considered a key changing operation by itself, and might result in an internal
+ * data redistribution if a key-based operator (like an aggregation or join) is applied to the result
+ * {@code KStream} (cf. {@link #processValues(FixedKeyProcessorSupplier, String...)}).
*
- * @param processorSupplier an instance of {@link ProcessorSupplier} that generates a newly constructed {@link Processor}
- * The supplier should always generate a new instance. Creating a single {@link Processor} object
- * and returning the same object reference in {@link ProcessorSupplier#get()} is a
- * violation of the supplier pattern and leads to runtime exceptions.
- * @param stateStoreNames the names of the state stores used by the processor; not required if the supplier
- * implements {@link ConnectedStoreProvider#stores()}
- * @see #map(KeyValueMapper)
+ * @param processorSupplier
+ * the supplier used to obtain {@link Processor} instances
+ * @param stateStoreNames
+ * the names of state stores that the processor should be able to access
*/
- * In order for the processor to use state stores, the stores must be added to the topology and connected to the
- * processor using at least one of two strategies (though it's not required to connect global state stores; read-only
- * access to global state stores is available by default).
- *
- * The first strategy is to manually add the {@link StoreBuilder}s via {@link Topology#addStateStore(StoreBuilder, String...)},
- * and specify the store names via {@code stateStoreNames} so they will be connected to the processor.
- *
- * With either strategy, within the {@link Processor}, the state is obtained via the {@link ProcessorContext}.
- * To trigger periodic actions via {@link org.apache.kafka.streams.processor.Punctuator#punctuate(long) punctuate()},
- * a schedule must be registered.
- *
- * Processing records might result in an internal data redistribution if a key based operator (like an aggregation
- * or join) is applied to the result {@code KStream}.
- * (cf. {@link #processValues(FixedKeyProcessorSupplier, Named, String...)})
- *
- * @param processorSupplier an instance of {@link ProcessorSupplier} that generates a newly constructed {@link Processor}
- * The supplier should always generate a new instance. Creating a single {@link Processor} object
- * and returning the same object reference in {@link ProcessorSupplier#get()} is a
- * violation of the supplier pattern and leads to runtime exceptions.
- * @param named a {@link Named} config used to name the processor in the topology
- * @param stateStoreNames the names of the state store used by the processor
- * @see #map(KeyValueMapper)
- * @see #processValues(FixedKeyProcessorSupplier, Named, String...)
+ * Takes an additional {@link Named} parameter that is used to name the processor in the topology.
*/
- * In order for the processor to use state stores, the stores must be added to the topology and connected to the
- * processor using at least one of two strategies (though it's not required to connect global state stores; read-only
- * access to global state stores is available by default).
- *
- * The first strategy is to manually add the {@link StoreBuilder}s via {@link Topology#addStateStore(StoreBuilder, String...)},
- * and specify the store names via {@code stateStoreNames} so they will be connected to the processor.
- * Because the key cannot be modified, this method is not a key changing operation and preserves data
+ * co-location with respect to the key (cf. {@link #flatMapValues(ValueMapper)}).
+ * Thus, no internal data redistribution is required if a key-based operator (like an aggregation or join)
+ * is applied to the result {@code KStream}.
*
- * // provide store(s) that will be added and connected to the associated processor
- * // the store name from the builder ("myProcessorState") is used to access the store later via the ProcessorContext
- * Set
- * With either strategy, within the {@link FixedKeyProcessor}, the state is obtained via the {@link FixedKeyProcessorContext}.
- * To trigger periodic actions via {@link org.apache.kafka.streams.processor.Punctuator#punctuate(long) punctuate()},
- * a schedule must be registered.
- *
- * Setting a new value preserves data co-location with respect to the key.
- * Thus, no internal data redistribution is required if a key based operator (like an aggregation or join)
- * is applied to the result {@code KStream}. (cf. {@link #process(ProcessorSupplier, String...)})
- *
- * @param processorSupplier an instance of {@link FixedKeyProcessorSupplier} that generates a newly constructed {@link FixedKeyProcessor}
- * The supplier should always generate a new instance. Creating a single {@link FixedKeyProcessor} object
- * and returning the same object reference in {@link FixedKeyProcessorSupplier#get()} is a
- * violation of the supplier pattern and leads to runtime exceptions.
- * @param stateStoreNames the names of the state store used by the processor
- * @see #mapValues(ValueMapper)
- * @see #process(ProcessorSupplier, Named, String...)
+ * However, because the key cannot be modified, some restrictions apply to a {@link FixedKeyProcessor} compared
+ * to a {@link Processor}: for example, forwarding result records from a {@link Punctuator} is not possible.
*/
- * In order for the processor to use state stores, the stores must be added to the topology and connected to the
- * processor using at least one of two strategies (though it's not required to connect global state stores; read-only
- * access to global state stores is available by default).
- *
- * The first strategy is to manually add the {@link StoreBuilder}s via {@link Topology#addStateStore(StoreBuilder, String...)},
- * and specify the store names via {@code stateStoreNames} so they will be connected to the processor.
- *
- * With either strategy, within the {@link FixedKeyProcessor}, the state is obtained via the {@link FixedKeyProcessorContext}.
- * To trigger periodic actions via {@link org.apache.kafka.streams.processor.Punctuator#punctuate(long) punctuate()},
- * a schedule must be registered.
- *
- * Setting a new value preserves data co-location with respect to the key.
- * Thus, no internal data redistribution is required if a key based operator (like an aggregation or join)
- * is applied to the result {@code KStream}. (cf. {@link #process(ProcessorSupplier, String...)})
- *
- * @param processorSupplier an instance of {@link FixedKeyProcessorSupplier} that generates a newly constructed {@link FixedKeyProcessor}
- * The supplier should always generate a new instance. Creating a single {@link FixedKeyProcessor} object
- * and returning the same object reference in {@link FixedKeyProcessorSupplier#get()} is a
- * violation of the supplier pattern and leads to runtime exceptions.
- * @param named a {@link Named} config used to name the processor in the topology
- * @param stateStoreNames the names of the state store used by the processor
- * @see #mapValues(ValueMapper)
- * @see #process(ProcessorSupplier, Named, String...)
+ * Takes an additional {@link Named} parameter that is used to name the processor in the topology.
*/
{@code
@@ -216,7 +220,7 @@ public interface KStream
{@code
@@ -389,7 +393,7 @@ public interface KStream
+ *
+ *
+ * If the {@code processorSupplier} provides state stores via {@link ConnectedStoreProvider#stores()}, the
+ * corresponding {@link StoreBuilder StoreBuilders} will be added to the topology and connected to this processor
+ * automatically, without the need to provide the store names as parameters to this method.
+ * Additionally, even if a processor is stateless, it can still access all
+ * {@link StreamsBuilder#addGlobalStore global state stores} (read-only).
+ * There is no need to connect global stores to processors.
+ *
+ * {@code
- * // create store
- * StoreBuilder
- * The second strategy is for the given {@link ProcessorSupplier} to implement {@link ConnectedStoreProvider#stores()},
- * which provides the {@link StoreBuilder}s to be automatically added to the topology and connected to the processor.
- * {@code
- * class MyProcessorSupplier implements ProcessorSupplier {
- * // supply processor
- * Processor get() {
- * return new MyProcessor();
+ * @Override
+ * void init(final ProcessorContext
- * {@code
- * class MyProcessor implements Processor {
- * private StateStore state;
- *
- * void init(ProcessorContext context) {
- * this.state = context.getStateStore("myProcessorState");
- * // punctuate each second, can access this.state
- * context.schedule(Duration.ofSeconds(1), PunctuationType.WALL_CLOCK_TIME, new Punctuator(..));
- * }
- *
- * void process(Record
- * Even if any upstream operation was key-changing, no auto-repartition is triggered.
+ *
+ * Furthermore, the provided {@link ProcessorContext} gives access to topology, runtime, and
+ * {@link RecordMetadata record metadata}, and allows scheduling {@link Punctuator punctuations} and
+ * requesting offset commits.
+ *
+ * {@code
- * // create store
- * StoreBuilder
- * The second strategy is for the given {@link ProcessorSupplier} to implement {@link ConnectedStoreProvider#stores()},
- * which provides the {@link StoreBuilder}s to be automatically added to the topology and connected to the processor.
- * {@code
- * class MyProcessorSupplier implements ProcessorSupplier {
- * // supply processor
- * Processor get() {
- * return new MyProcessor();
- * }
- *
- * // provide store(s) that will be added and connected to the associated processor
- * // the store name from the builder ("myProcessorState") is used to access the store later via the ProcessorContext
- * Set
- * {@code
- * class MyProcessor implements Processor {
- * private StateStore state;
- *
- * void init(ProcessorContext context) {
- * this.state = context.getStateStore("myProcessorState");
- * // punctuate each second, can access this.state
- * context.schedule(Duration.ofSeconds(1), PunctuationType.WALL_CLOCK_TIME, new Punctuator(..));
- * }
- *
- * void process(Record
- * Even if any upstream operation was key-changing, no auto-repartition is triggered.
- * If repartitioning is required, a call to {@link #repartition()} should be performed before {@code process()}.
- * {@code
- * // create store
- * StoreBuilder
- * The second strategy is for the given {@link ProcessorSupplier} to implement {@link ConnectedStoreProvider#stores()},
- * which provides the {@link StoreBuilder}s to be automatically added to the topology and connected to the processor.
- * {@code
- * class MyProcessorSupplier implements FixedKeyProcessorSupplier {
- * // supply processor
- * FixedKeyProcessor get() {
- * return new MyProcessor();
- * }
+ *
- * {@code
- * class MyProcessor implements FixedKeyProcessor {
- * private StateStore state;
- *
- * void init(ProcessorContext context) {
- * this.state = context.getStateStore("myProcessorState");
- * // punctuate each second, can access this.state
- * context.schedule(Duration.ofSeconds(1), PunctuationType.WALL_CLOCK_TIME, new Punctuator(..));
- * }
- *
- * void process(FixedKeyRecord
- * Even if any upstream operation was key-changing, no auto-repartition is triggered.
- * If repartitioning is required, a call to {@link #repartition()} should be performed before {@code process()}.
- * {@code
- * // create store
- * StoreBuilder
- * The second strategy is for the given {@link ProcessorSupplier} to implement {@link ConnectedStoreProvider#stores()},
- * which provides the {@link StoreBuilder}s to be automatically added to the topology and connected to the processor.
- * {@code
- * class MyProcessorSupplier implements FixedKeyProcessorSupplier {
- * // supply processor
- * FixedKeyProcessor get() {
- * return new MyProcessor();
- * }
- *
- * // provide store(s) that will be added and connected to the associated processor
- * // the store name from the builder ("myProcessorState") is used to access the store later via the ProcessorContext
- * Set
- * {@code
- * class MyProcessor implements FixedKeyProcessor {
- * private StateStore state;
- *
- * void init(ProcessorContext context) {
- * this.state = context.getStateStore("myProcessorState");
- * // punctuate each second, can access this.state
- * context.schedule(Duration.ofSeconds(1), PunctuationType.WALL_CLOCK_TIME, new Punctuator(..));
- * }
- *
- * void process(FixedKeyRecord
- * Even if any upstream operation was key-changing, no auto-repartition is triggered.
- * If repartitioning is required, a call to {@link #repartition()} should be performed before {@code process()}.
- *