Refactor DataBufferUtils Matcher implementation
The existing implementation was exposed to very poor performance when matching with multiple delimiters against a large buffer with many delimiters. In that case all matchers are invoked many times (as many as the number of delimiters) even though some of them found no match at all on the first pass. The revised implementation uses a single index and advances all matchers together, checking one byte a time, and not letting any one of them search to the end of the entire buffer on a single pass. Closes gh-25915
This commit is contained in:
parent
6946fe2f74
commit
fb4363e4e0
|
@ -30,7 +30,6 @@ import java.nio.channels.WritableByteChannel;
|
|||
import java.nio.file.OpenOption;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Callable;
|
||||
|
@ -568,65 +567,290 @@ public abstract class DataBufferUtils {
|
|||
|
||||
/**
|
||||
* Return a {@link Matcher} for the given delimiter.
|
||||
* The matcher can be used to find the delimiters in data buffers.
|
||||
* The matcher can be used to find the delimiters in a stream of data buffers.
|
||||
* @param delimiter the delimiter bytes to find
|
||||
* @return the matcher
|
||||
* @since 5.2
|
||||
*/
|
||||
public static Matcher matcher(byte[] delimiter) {
|
||||
Assert.isTrue(delimiter.length > 0, "Delimiter must not be empty");
|
||||
return new KnuthMorrisPrattMatcher(delimiter);
|
||||
return createMatcher(delimiter);
|
||||
}
|
||||
|
||||
/** Return a {@link Matcher} for the given delimiters.
|
||||
* The matcher can be used to find the delimiters in data buffers.
|
||||
/**
|
||||
* Return a {@link Matcher} for the given delimiters.
|
||||
* The matcher can be used to find the delimiters in a stream of data buffers.
|
||||
* @param delimiters the delimiters bytes to find
|
||||
* @return the matcher
|
||||
* @since 5.2
|
||||
*/
|
||||
public static Matcher matcher(byte[]... delimiters) {
|
||||
Assert.isTrue(delimiters.length > 0, "Delimiters must not be empty");
|
||||
if (delimiters.length == 1) {
|
||||
return matcher(delimiters[0]);
|
||||
}
|
||||
else {
|
||||
Matcher[] matchers = new Matcher[delimiters.length];
|
||||
for (int i = 0; i < delimiters.length; i++) {
|
||||
matchers[i] = matcher(delimiters[i]);
|
||||
}
|
||||
return new CompositeMatcher(matchers);
|
||||
return (delimiters.length == 1 ? createMatcher(delimiters[0]) : new CompositeMatcher(delimiters));
|
||||
}
|
||||
|
||||
private static NestedMatcher createMatcher(byte[] delimiter) {
|
||||
Assert.isTrue(delimiter.length > 0, "Delimiter must not be empty");
|
||||
switch (delimiter.length) {
|
||||
case 1:
|
||||
return (delimiter[0] == 10 ? SingleByteMatcher.NEWLINE_MATCHER : new SingleByteMatcher(delimiter));
|
||||
case 2:
|
||||
return new TwoByteMatcher(delimiter);
|
||||
default:
|
||||
return new KnuthMorrisPrattMatcher(delimiter);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Defines an object that matches a data buffer against a delimiter.
|
||||
* Contract to find delimiter(s) against one or more data buffers that can
|
||||
* be passed one at a time to the {@link #match(DataBuffer)} method.
|
||||
*
|
||||
* @since 5.2
|
||||
* @see #match(DataBuffer)
|
||||
*/
|
||||
public interface Matcher {
|
||||
|
||||
/**
|
||||
* Returns the position of the final matching delimiter byte that matches the given buffer,
|
||||
* or {@code -1} if not found.
|
||||
* @param dataBuffer the buffer in which to search for the delimiter
|
||||
* @return the position of the final matching delimiter, or {@code -1} if not found.
|
||||
* Find the first matching delimiter and return the index of the last
|
||||
* byte of the delimiter, or {@code -1} if not found.
|
||||
*/
|
||||
int match(DataBuffer dataBuffer);
|
||||
|
||||
/**
|
||||
* Return the delimiter used for this matcher.
|
||||
* @return the delimiter
|
||||
* Return the delimiter from the last invocation of {@link #match(DataBuffer)}.
|
||||
*/
|
||||
byte[] delimiter();
|
||||
|
||||
/**
|
||||
* Resets the state of this matcher.
|
||||
* Reset the state of this matcher.
|
||||
*/
|
||||
void reset();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Matcher that supports searching for multiple delimiters.
|
||||
*/
|
||||
private static class CompositeMatcher implements Matcher {
|
||||
|
||||
private static final byte[] NO_DELIMITER = new byte[0];
|
||||
|
||||
|
||||
private final NestedMatcher[] matchers;
|
||||
|
||||
byte[] longestDelimiter = NO_DELIMITER;
|
||||
|
||||
CompositeMatcher(byte[][] delimiters) {
|
||||
this.matchers = initMatchers(delimiters);
|
||||
}
|
||||
|
||||
private static NestedMatcher[] initMatchers(byte[][] delimiters) {
|
||||
NestedMatcher[] matchers = new NestedMatcher[delimiters.length];
|
||||
for (int i = 0; i < delimiters.length; i++) {
|
||||
matchers[i] = createMatcher(delimiters[i]);
|
||||
}
|
||||
return matchers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int match(DataBuffer dataBuffer) {
|
||||
this.longestDelimiter = NO_DELIMITER;
|
||||
|
||||
for (int pos = dataBuffer.readPosition(); pos < dataBuffer.writePosition(); pos++) {
|
||||
byte b = dataBuffer.getByte(pos);
|
||||
|
||||
for (NestedMatcher matcher : this.matchers) {
|
||||
if (matcher.match(b) && matcher.delimiter().length > this.longestDelimiter.length) {
|
||||
this.longestDelimiter = matcher.delimiter();
|
||||
}
|
||||
}
|
||||
|
||||
if (this.longestDelimiter != NO_DELIMITER) {
|
||||
reset();
|
||||
return pos;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] delimiter() {
|
||||
Assert.state(this.longestDelimiter != NO_DELIMITER, "Illegal state!");
|
||||
return this.longestDelimiter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
for (NestedMatcher matcher : this.matchers) {
|
||||
matcher.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Matcher that can be nested within {@link CompositeMatcher} where multiple
|
||||
* matchers advance together using the same index, one byte at a time.
|
||||
*/
|
||||
private interface NestedMatcher extends Matcher {
|
||||
|
||||
/**
|
||||
* Perform a match against the next byte of the stream and return true
|
||||
* if the delimiter is fully matched.
|
||||
*/
|
||||
boolean match(byte b);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Matcher for a single byte delimiter.
|
||||
*/
|
||||
private static class SingleByteMatcher implements NestedMatcher {
|
||||
|
||||
static SingleByteMatcher NEWLINE_MATCHER = new SingleByteMatcher(new byte[] {10});
|
||||
|
||||
private final byte[] delimiter;
|
||||
|
||||
SingleByteMatcher(byte[] delimiter) {
|
||||
Assert.isTrue(delimiter.length == 1, "Expected a 1 byte delimiter");
|
||||
this.delimiter = delimiter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int match(DataBuffer dataBuffer) {
|
||||
for (int pos = dataBuffer.readPosition(); pos < dataBuffer.writePosition(); pos++) {
|
||||
byte b = dataBuffer.getByte(pos);
|
||||
if (match(b)) {
|
||||
return pos;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean match(byte b) {
|
||||
return this.delimiter[0] == b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] delimiter() {
|
||||
return this.delimiter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Base class for a {@link NestedMatcher}.
|
||||
*/
|
||||
private static abstract class AbstractNestedMatcher implements NestedMatcher {
|
||||
|
||||
private final byte[] delimiter;
|
||||
|
||||
private int matches = 0;
|
||||
|
||||
|
||||
protected AbstractNestedMatcher(byte[] delimiter) {
|
||||
this.delimiter = delimiter;
|
||||
}
|
||||
|
||||
protected void setMatches(int index) {
|
||||
this.matches = index;
|
||||
}
|
||||
|
||||
protected int getMatches() {
|
||||
return this.matches;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int match(DataBuffer dataBuffer) {
|
||||
for (int pos = dataBuffer.readPosition(); pos < dataBuffer.writePosition(); pos++) {
|
||||
byte b = dataBuffer.getByte(pos);
|
||||
if (match(b)) {
|
||||
reset();
|
||||
return pos;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean match(byte b) {
|
||||
if (b == this.delimiter[this.matches]) {
|
||||
this.matches++;
|
||||
return (this.matches == delimiter().length);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] delimiter() {
|
||||
return this.delimiter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
this.matches = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Matcher with a 2 byte delimiter that does not benefit from a
|
||||
* Knuth-Morris-Pratt suffix-prefix table.
|
||||
*/
|
||||
private static class TwoByteMatcher extends AbstractNestedMatcher {
|
||||
|
||||
protected TwoByteMatcher(byte[] delimiter) {
|
||||
super(delimiter);
|
||||
Assert.isTrue(delimiter.length == 2, "Expected a 2-byte delimiter");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Implementation of {@link Matcher} that uses the Knuth-Morris-Pratt algorithm.
|
||||
* @see <a href="https://www.nayuki.io/page/knuth-morris-pratt-string-matching">Knuth-Morris-Pratt string matching</a>
|
||||
*/
|
||||
private static class KnuthMorrisPrattMatcher extends AbstractNestedMatcher {
|
||||
|
||||
private final int[] table;
|
||||
|
||||
public KnuthMorrisPrattMatcher(byte[] delimiter) {
|
||||
super(delimiter);
|
||||
this.table = longestSuffixPrefixTable(delimiter);
|
||||
}
|
||||
|
||||
private static int[] longestSuffixPrefixTable(byte[] delimiter) {
|
||||
int[] result = new int[delimiter.length];
|
||||
result[0] = 0;
|
||||
for (int i = 1; i < delimiter.length; i++) {
|
||||
int j = result[i - 1];
|
||||
while (j > 0 && delimiter[i] != delimiter[j]) {
|
||||
j = result[j - 1];
|
||||
}
|
||||
if (delimiter[i] == delimiter[j]) {
|
||||
j++;
|
||||
}
|
||||
result[i] = j;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean match(byte b) {
|
||||
while (getMatches() > 0 && b != delimiter()[getMatches()]) {
|
||||
setMatches(this.table[getMatches() - 1]);
|
||||
}
|
||||
return super.match(b);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static class ReadableByteChannelGenerator implements Consumer<SynchronousSink<DataBuffer>> {
|
||||
|
||||
private final ReadableByteChannel channel;
|
||||
|
@ -908,124 +1132,4 @@ public abstract class DataBufferUtils {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Implementation of {@link Matcher} that uses the Knuth-Morris-Pratt algorithm.
|
||||
* @see <a href="https://www.nayuki.io/page/knuth-morris-pratt-string-matching">Knuth-Morris-Pratt string matching</a>
|
||||
*/
|
||||
private static class KnuthMorrisPrattMatcher implements Matcher {
|
||||
|
||||
private final byte[] delimiter;
|
||||
|
||||
private final int[] table;
|
||||
|
||||
private int matches = 0;
|
||||
|
||||
public KnuthMorrisPrattMatcher(byte[] delimiter) {
|
||||
this.delimiter = Arrays.copyOf(delimiter, delimiter.length);
|
||||
this.table = longestSuffixPrefixTable(delimiter);
|
||||
}
|
||||
|
||||
private static int[] longestSuffixPrefixTable(byte[] delimiter) {
|
||||
int[] result = new int[delimiter.length];
|
||||
result[0] = 0;
|
||||
for (int i = 1; i < delimiter.length; i++) {
|
||||
int j = result[i - 1];
|
||||
while (j > 0 && delimiter[i] != delimiter[j]) {
|
||||
j = result[j - 1];
|
||||
}
|
||||
if (delimiter[i] == delimiter[j]) {
|
||||
j++;
|
||||
}
|
||||
result[i] = j;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int match(DataBuffer dataBuffer) {
|
||||
for (int i = dataBuffer.readPosition(); i < dataBuffer.writePosition(); i++) {
|
||||
byte b = dataBuffer.getByte(i);
|
||||
|
||||
while (this.matches > 0 && b != this.delimiter[this.matches]) {
|
||||
this.matches = this.table[this.matches - 1];
|
||||
}
|
||||
|
||||
if (b == this.delimiter[this.matches]) {
|
||||
this.matches++;
|
||||
if (this.matches == this.delimiter.length) {
|
||||
reset();
|
||||
return i;
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] delimiter() {
|
||||
return Arrays.copyOf(this.delimiter, this.delimiter.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
this.matches = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Implementation of {@link Matcher} that wraps several other matchers.
|
||||
*/
|
||||
private static class CompositeMatcher implements Matcher {
|
||||
|
||||
private static final byte[] NO_DELIMITER = new byte[0];
|
||||
|
||||
private final Matcher[] matchers;
|
||||
|
||||
byte[] longestDelimiter = NO_DELIMITER;
|
||||
|
||||
public CompositeMatcher(Matcher[] matchers) {
|
||||
this.matchers = matchers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int match(DataBuffer dataBuffer) {
|
||||
this.longestDelimiter = NO_DELIMITER;
|
||||
int bestEndIdx = Integer.MAX_VALUE;
|
||||
|
||||
|
||||
for (Matcher matcher : this.matchers) {
|
||||
int endIdx = matcher.match(dataBuffer);
|
||||
if (endIdx != -1 &&
|
||||
endIdx <= bestEndIdx &&
|
||||
matcher.delimiter().length > this.longestDelimiter.length) {
|
||||
bestEndIdx = endIdx;
|
||||
this.longestDelimiter = matcher.delimiter();
|
||||
}
|
||||
}
|
||||
if (bestEndIdx == Integer.MAX_VALUE) {
|
||||
this.longestDelimiter = NO_DELIMITER;
|
||||
return -1;
|
||||
}
|
||||
else {
|
||||
reset();
|
||||
return bestEndIdx;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] delimiter() {
|
||||
Assert.state(this.longestDelimiter != NO_DELIMITER, "Illegal state!");
|
||||
return this.longestDelimiter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
for (Matcher matcher : this.matchers) {
|
||||
matcher.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -886,18 +886,38 @@ class DataBufferUtilsTests extends AbstractDataBufferAllocatingTests {
|
|||
void matcher2(String displayName, DataBufferFactory bufferFactory) {
|
||||
super.bufferFactory = bufferFactory;
|
||||
|
||||
DataBuffer foo = stringBuffer("fooobar");
|
||||
DataBuffer foo = stringBuffer("foooobar");
|
||||
|
||||
byte[] delims = "oo".getBytes(StandardCharsets.UTF_8);
|
||||
DataBufferUtils.Matcher matcher = DataBufferUtils.matcher(delims);
|
||||
int result = matcher.match(foo);
|
||||
assertThat(result).isEqualTo(2);
|
||||
foo.readPosition(2);
|
||||
result = matcher.match(foo);
|
||||
assertThat(result).isEqualTo(3);
|
||||
foo.readPosition(3);
|
||||
result = matcher.match(foo);
|
||||
assertThat(result).isEqualTo(-1);
|
||||
int endIndex = matcher.match(foo);
|
||||
assertThat(endIndex).isEqualTo(2);
|
||||
foo.readPosition(endIndex + 1);
|
||||
endIndex = matcher.match(foo);
|
||||
assertThat(endIndex).isEqualTo(4);
|
||||
foo.readPosition(endIndex + 1);
|
||||
endIndex = matcher.match(foo);
|
||||
assertThat(endIndex).isEqualTo(-1);
|
||||
|
||||
release(foo);
|
||||
}
|
||||
|
||||
@ParameterizedDataBufferAllocatingTest
|
||||
void matcher3(String displayName, DataBufferFactory bufferFactory) {
|
||||
super.bufferFactory = bufferFactory;
|
||||
|
||||
DataBuffer foo = stringBuffer("foooobar");
|
||||
|
||||
byte[] delims = "oo".getBytes(StandardCharsets.UTF_8);
|
||||
DataBufferUtils.Matcher matcher = DataBufferUtils.matcher(delims);
|
||||
int endIndex = matcher.match(foo);
|
||||
assertThat(endIndex).isEqualTo(2);
|
||||
foo.readPosition(endIndex + 1);
|
||||
endIndex = matcher.match(foo);
|
||||
assertThat(endIndex).isEqualTo(4);
|
||||
foo.readPosition(endIndex + 1);
|
||||
endIndex = matcher.match(foo);
|
||||
assertThat(endIndex).isEqualTo(-1);
|
||||
|
||||
release(foo);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue