Refactor DataBufferUtils Matcher implementation

The existing implementation performed very poorly when matching multiple
delimiters against a large buffer that contains many delimiters. In that case
all matchers were invoked many times (as many times as there are delimiters in
the buffer), even though some of them found no match at all on the first pass.

The revised implementation uses a single index and advances all matchers
together, checking one byte at a time, so that no single matcher searches to
the end of the entire buffer in a single pass.

Closes gh-25915
This commit is contained in:
Rossen Stoyanchev 2020-10-23 13:23:51 +01:00
parent 6946fe2f74
commit fb4363e4e0
2 changed files with 276 additions and 152 deletions

View File

@ -30,7 +30,6 @@ import java.nio.channels.WritableByteChannel;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.Callable;
@ -568,65 +567,290 @@ public abstract class DataBufferUtils {
/**
* Return a {@link Matcher} for the given delimiter.
* The matcher can be used to find the delimiters in data buffers.
* The matcher can be used to find the delimiters in a stream of data buffers.
* @param delimiter the delimiter bytes to find
* @return the matcher
* @since 5.2
*/
public static Matcher matcher(byte[] delimiter) {
Assert.isTrue(delimiter.length > 0, "Delimiter must not be empty");
return new KnuthMorrisPrattMatcher(delimiter);
return createMatcher(delimiter);
}
/** Return a {@link Matcher} for the given delimiters.
* The matcher can be used to find the delimiters in data buffers.
/**
* Return a {@link Matcher} for the given delimiters.
* The matcher can be used to find the delimiters in a stream of data buffers.
* @param delimiters the delimiters bytes to find
* @return the matcher
* @since 5.2
*/
public static Matcher matcher(byte[]... delimiters) {
Assert.isTrue(delimiters.length > 0, "Delimiters must not be empty");
if (delimiters.length == 1) {
return matcher(delimiters[0]);
}
else {
Matcher[] matchers = new Matcher[delimiters.length];
for (int i = 0; i < delimiters.length; i++) {
matchers[i] = matcher(delimiters[i]);
}
return new CompositeMatcher(matchers);
return (delimiters.length == 1 ? createMatcher(delimiters[0]) : new CompositeMatcher(delimiters));
}
/**
 * Choose the matcher implementation based on the length of the delimiter:
 * a single-byte matcher (the shared newline instance when the byte is a
 * line feed), a two-byte matcher, or a Knuth-Morris-Pratt matcher for
 * anything longer.
 * @param delimiter the delimiter bytes to find; must not be empty
 * @return the matcher for the given delimiter
 */
private static NestedMatcher createMatcher(byte[] delimiter) {
	Assert.isTrue(delimiter.length > 0, "Delimiter must not be empty");
	int length = delimiter.length;
	if (length == 1) {
		// Line feed (byte value 10) is served by one shared instance.
		if (delimiter[0] == '\n') {
			return SingleByteMatcher.NEWLINE_MATCHER;
		}
		return new SingleByteMatcher(delimiter);
	}
	if (length == 2) {
		return new TwoByteMatcher(delimiter);
	}
	return new KnuthMorrisPrattMatcher(delimiter);
}
/**
* Defines an object that matches a data buffer against a delimiter.
* Contract to find delimiter(s) against one or more data buffers that can
* be passed one at a time to the {@link #match(DataBuffer)} method.
*
* @since 5.2
* @see #match(DataBuffer)
*/
public interface Matcher {
/**
* Returns the position of the final matching delimiter byte that matches the given buffer,
* or {@code -1} if not found.
* @param dataBuffer the buffer in which to search for the delimiter
* @return the position of the final matching delimiter, or {@code -1} if not found.
* Find the first matching delimiter and return the index of the last
* byte of the delimiter, or {@code -1} if not found.
*/
int match(DataBuffer dataBuffer);
/**
* Return the delimiter used for this matcher.
* @return the delimiter
* Return the delimiter from the last invocation of {@link #match(DataBuffer)}.
*/
byte[] delimiter();
/**
* Resets the state of this matcher.
* Reset the state of this matcher.
*/
void reset();
}
/**
 * Matcher that searches for several delimiters at once. All nested matchers
 * are fed the same byte before the index advances, and the search stops at
 * the first position where at least one of them completes its delimiter.
 */
private static class CompositeMatcher implements Matcher {

	private static final byte[] NO_DELIMITER = new byte[0];

	private final NestedMatcher[] matchers;

	byte[] longestDelimiter = NO_DELIMITER;

	CompositeMatcher(byte[][] delimiters) {
		this.matchers = initMatchers(delimiters);
	}

	/** Create one nested matcher per delimiter, in the given order. */
	private static NestedMatcher[] initMatchers(byte[][] delimiters) {
		NestedMatcher[] result = new NestedMatcher[delimiters.length];
		int slot = 0;
		for (byte[] delimiter : delimiters) {
			result[slot++] = createMatcher(delimiter);
		}
		return result;
	}

	/**
	 * Scan from the read position to the write position and return the index
	 * at which the first delimiter is completed, or {@code -1} if none is.
	 * When several delimiters complete on the same byte, the longest one is
	 * remembered for {@link #delimiter()}.
	 */
	@Override
	public int match(DataBuffer dataBuffer) {
		this.longestDelimiter = NO_DELIMITER;
		int index = dataBuffer.readPosition();
		while (index < dataBuffer.writePosition()) {
			byte current = dataBuffer.getByte(index);
			for (NestedMatcher candidate : this.matchers) {
				if (!candidate.match(current)) {
					continue;
				}
				byte[] found = candidate.delimiter();
				if (found.length > this.longestDelimiter.length) {
					this.longestDelimiter = found;
				}
			}
			if (this.longestDelimiter != NO_DELIMITER) {
				// Clear partial matches in all nested matchers before reporting.
				reset();
				return index;
			}
			index++;
		}
		return -1;
	}

	/**
	 * Return the delimiter found by the last {@link #match(DataBuffer)} call;
	 * fails the state assertion if that call found nothing.
	 */
	@Override
	public byte[] delimiter() {
		boolean found = (this.longestDelimiter != NO_DELIMITER);
		Assert.state(found, "Illegal state!");
		return this.longestDelimiter;
	}

	/** Reset every nested matcher. */
	@Override
	public void reset() {
		for (int i = 0; i < this.matchers.length; i++) {
			this.matchers[i].reset();
		}
	}
}
/**
 * Matcher that can be nested within {@link CompositeMatcher} where multiple
 * matchers advance together using the same index, one byte at a time.
 * <p>Adds a byte-level {@link #match(byte)} operation on top of the
 * buffer-level operations inherited from {@link Matcher}.
 */
private interface NestedMatcher extends Matcher {
/**
 * Perform a match against the next byte of the stream and return true
 * if the delimiter is fully matched.
 * @param b the next byte of the stream
 * @return {@code true} if the delimiter is fully matched with this byte
 */
boolean match(byte b);
}
/**
 * Matcher for a single byte delimiter.
 * <p>Holds no matching state beyond the delimiter itself, which is why
 * {@link #reset()} does nothing and one instance can serve the common
 * newline case.
 */
private static class SingleByteMatcher implements NestedMatcher {

	/**
	 * Shared matcher for the line feed byte (value 10). Declared
	 * {@code final} so the shared instance cannot be swapped out.
	 */
	static final SingleByteMatcher NEWLINE_MATCHER = new SingleByteMatcher(new byte[] {10});

	private final byte[] delimiter;

	/**
	 * Create a matcher for the given delimiter.
	 * @param delimiter the delimiter; must contain exactly one byte
	 */
	SingleByteMatcher(byte[] delimiter) {
		Assert.isTrue(delimiter.length == 1, "Expected a 1 byte delimiter");
		this.delimiter = delimiter;
	}

	/**
	 * Return the index of the first byte between the read and write
	 * positions that equals the delimiter, or {@code -1} if there is none.
	 */
	@Override
	public int match(DataBuffer dataBuffer) {
		for (int pos = dataBuffer.readPosition(); pos < dataBuffer.writePosition(); pos++) {
			byte b = dataBuffer.getByte(pos);
			if (match(b)) {
				return pos;
			}
		}
		return -1;
	}

	/** Return whether the given byte equals the delimiter byte. */
	@Override
	public boolean match(byte b) {
		return this.delimiter[0] == b;
	}

	/**
	 * Return the delimiter array held by this matcher (not a copy), so
	 * callers must treat it as read-only.
	 */
	@Override
	public byte[] delimiter() {
		return this.delimiter;
	}

	/** No-op: there is no partial-match state to clear. */
	@Override
	public void reset() {
	}
}
/**
 * Base class for a {@link NestedMatcher} that keeps track of how many
 * leading bytes of the delimiter have been matched so far.
 * <p>The byte-level matching in this class restarts from the beginning of
 * the delimiter on a mismatch. That is sufficient for delimiters of up to
 * two bytes; subclasses with longer delimiters are expected to adjust the
 * match count themselves (via {@link #setMatches(int)}) before delegating
 * to {@link #match(byte)}.
 */
private abstract static class AbstractNestedMatcher implements NestedMatcher {

	private final byte[] delimiter;

	/** Number of leading delimiter bytes matched so far. */
	private int matches = 0;

	protected AbstractNestedMatcher(byte[] delimiter) {
		this.delimiter = delimiter;
	}

	/** Set the number of leading delimiter bytes considered matched. */
	protected void setMatches(int index) {
		this.matches = index;
	}

	/** Return the number of leading delimiter bytes matched so far. */
	protected int getMatches() {
		return this.matches;
	}

	/**
	 * Feed the bytes between the read and write positions to
	 * {@link #match(byte)} and return the index of the byte that completes
	 * the delimiter, or {@code -1} if the delimiter is not completed. A
	 * partial match is kept for the next buffer; a full match resets the
	 * matcher.
	 */
	@Override
	public int match(DataBuffer dataBuffer) {
		for (int pos = dataBuffer.readPosition(); pos < dataBuffer.writePosition(); pos++) {
			byte b = dataBuffer.getByte(pos);
			if (match(b)) {
				reset();
				return pos;
			}
		}
		return -1;
	}

	/**
	 * Match the next byte of the stream.
	 * @param b the next byte
	 * @return {@code true} if this byte completes the delimiter
	 */
	@Override
	public boolean match(byte b) {
		if (b != this.delimiter[this.matches]) {
			// Mismatch: discard the partial match. Without this, a stale
			// count would let e.g. "\rX\n" be reported as a "\r\n" match.
			this.matches = 0;
			// The same byte may still start a new match (e.g. "\r\r\n").
			if (b != this.delimiter[0]) {
				return false;
			}
		}
		this.matches++;
		return (this.matches == delimiter().length);
	}

	/** Return the delimiter array held by this matcher (not a copy). */
	@Override
	public byte[] delimiter() {
		return this.delimiter;
	}

	/** Discard any partial match. */
	@Override
	public void reset() {
		this.matches = 0;
	}
}
/**
 * Matcher with a 2 byte delimiter that does not benefit from a
 * Knuth-Morris-Pratt suffix-prefix table.
 * <p>Adds no matching logic of its own: all matching behavior is inherited
 * from {@link AbstractNestedMatcher}.
 */
private static class TwoByteMatcher extends AbstractNestedMatcher {
/**
 * Create a matcher for the given delimiter.
 * @param delimiter the delimiter; must contain exactly two bytes
 */
protected TwoByteMatcher(byte[] delimiter) {
super(delimiter);
// Checked after super(...) because the superclass constructor call must come first.
Assert.isTrue(delimiter.length == 2, "Expected a 2-byte delimiter");
}
}
/**
 * {@link Matcher} implementation based on the Knuth-Morris-Pratt algorithm.
 * A table computed once from the delimiter tells how much of a partial match
 * can be kept when the next byte does not continue it.
 * @see <a href="https://www.nayuki.io/page/knuth-morris-pratt-string-matching">Knuth-Morris-Pratt string matching</a>
 */
private static class KnuthMorrisPrattMatcher extends AbstractNestedMatcher {

	private final int[] table;

	public KnuthMorrisPrattMatcher(byte[] delimiter) {
		super(delimiter);
		this.table = longestSuffixPrefixTable(delimiter);
	}

	/**
	 * For every index {@code i}, compute the length of the longest proper
	 * prefix of the delimiter that is also a suffix of
	 * {@code delimiter[0..i]}.
	 */
	private static int[] longestSuffixPrefixTable(byte[] delimiter) {
		int[] lengths = new int[delimiter.length];
		lengths[0] = 0;
		// Carries the value stored for the previous index.
		int prefixLength = 0;
		for (int index = 1; index < delimiter.length; index++) {
			byte current = delimiter[index];
			while (prefixLength > 0 && current != delimiter[prefixLength]) {
				prefixLength = lengths[prefixLength - 1];
			}
			if (current == delimiter[prefixLength]) {
				prefixLength++;
			}
			lengths[index] = prefixLength;
		}
		return lengths;
	}

	/**
	 * Fall back through the table until the given byte continues the
	 * partial match (or nothing is left of it), then let the base class
	 * consume the byte.
	 */
	@Override
	public boolean match(byte b) {
		byte[] pattern = delimiter();
		int matched = getMatches();
		while (matched > 0 && b != pattern[matched]) {
			matched = this.table[matched - 1];
		}
		setMatches(matched);
		return super.match(b);
	}
}
private static class ReadableByteChannelGenerator implements Consumer<SynchronousSink<DataBuffer>> {
private final ReadableByteChannel channel;
@ -908,124 +1132,4 @@ public abstract class DataBufferUtils {
}
}
/**
 * Implementation of {@link Matcher} that uses the Knuth-Morris-Pratt algorithm.
 * <p>The number of delimiter bytes matched so far is kept in a field, so a
 * partial match at the end of one buffer carries over to the next
 * {@link #match(DataBuffer)} call unless {@link #reset()} is invoked.
 * @see <a href="https://www.nayuki.io/page/knuth-morris-pratt-string-matching">Knuth-Morris-Pratt string matching</a>
 */
private static class KnuthMorrisPrattMatcher implements Matcher {
private final byte[] delimiter;
// table[i]: length of the longest proper prefix of the delimiter that is also a suffix of delimiter[0..i].
private final int[] table;
// Number of leading delimiter bytes matched so far.
private int matches = 0;
public KnuthMorrisPrattMatcher(byte[] delimiter) {
// Defensive copy: later changes to the caller's array do not affect this matcher.
this.delimiter = Arrays.copyOf(delimiter, delimiter.length);
this.table = longestSuffixPrefixTable(delimiter);
}
// Builds the fallback table used by match(DataBuffer) on a mismatch.
private static int[] longestSuffixPrefixTable(byte[] delimiter) {
int[] result = new int[delimiter.length];
result[0] = 0;
for (int i = 1; i < delimiter.length; i++) {
int j = result[i - 1];
while (j > 0 && delimiter[i] != delimiter[j]) {
j = result[j - 1];
}
if (delimiter[i] == delimiter[j]) {
j++;
}
result[i] = j;
}
return result;
}
/**
 * Scan the buffer from its read position to its write position.
 * @return the index of the byte that completes the delimiter, or {@code -1}
 * if the delimiter is not completed within this buffer
 */
@Override
public int match(DataBuffer dataBuffer) {
for (int i = dataBuffer.readPosition(); i < dataBuffer.writePosition(); i++) {
byte b = dataBuffer.getByte(i);
// On a mismatch, shrink the partial match to the longest prefix that can still continue.
while (this.matches > 0 && b != this.delimiter[this.matches]) {
this.matches = this.table[this.matches - 1];
}
if (b == this.delimiter[this.matches]) {
this.matches++;
if (this.matches == this.delimiter.length) {
// Full match: clear the state before reporting the end index.
reset();
return i;
}
}
}
return -1;
}
/**
 * Return a copy of the delimiter, so that callers cannot modify the
 * matcher's own array.
 */
@Override
public byte[] delimiter() {
return Arrays.copyOf(this.delimiter, this.delimiter.length);
}
/** Discard any partial match. */
@Override
public void reset() {
this.matches = 0;
}
}
/**
 * Implementation of {@link Matcher} that wraps several other matchers.
 * <p>Each wrapped matcher is run over the buffer independently, one after
 * the other, and the results are then compared.
 */
private static class CompositeMatcher implements Matcher {
// Sentinel meaning "no delimiter found"; compared by identity.
private static final byte[] NO_DELIMITER = new byte[0];
private final Matcher[] matchers;
// Delimiter selected by the last match(DataBuffer) call, or NO_DELIMITER.
byte[] longestDelimiter = NO_DELIMITER;
public CompositeMatcher(Matcher[] matchers) {
this.matchers = matchers;
}
/**
 * Run every wrapped matcher against the buffer and return the end index of
 * the selected match, or {@code -1} if no matcher found its delimiter.
 */
@Override
public int match(DataBuffer dataBuffer) {
this.longestDelimiter = NO_DELIMITER;
int bestEndIdx = Integer.MAX_VALUE;
for (Matcher matcher : this.matchers) {
// Each matcher searches the buffer on its own for its first match.
int endIdx = matcher.match(dataBuffer);
// A candidate replaces the current best only if it ends no later AND its
// delimiter is strictly longer than the one selected so far.
// NOTE(review): since both conditions must hold, an earlier match with a
// shorter delimiter does not replace a later match with a longer one.
if (endIdx != -1 &&
endIdx <= bestEndIdx &&
matcher.delimiter().length > this.longestDelimiter.length) {
bestEndIdx = endIdx;
this.longestDelimiter = matcher.delimiter();
}
}
if (bestEndIdx == Integer.MAX_VALUE) {
this.longestDelimiter = NO_DELIMITER;
return -1;
}
else {
// Clear the state of all wrapped matchers before reporting the match.
reset();
return bestEndIdx;
}
}
/**
 * Return the delimiter selected by the last {@link #match(DataBuffer)}
 * call; fails the state assertion if that call found nothing.
 */
@Override
public byte[] delimiter() {
Assert.state(this.longestDelimiter != NO_DELIMITER, "Illegal state!");
return this.longestDelimiter;
}
/** Reset every wrapped matcher. */
@Override
public void reset() {
for (Matcher matcher : this.matchers) {
matcher.reset();
}
}
}
}

View File

@ -886,18 +886,38 @@ class DataBufferUtilsTests extends AbstractDataBufferAllocatingTests {
void matcher2(String displayName, DataBufferFactory bufferFactory) {
super.bufferFactory = bufferFactory;
DataBuffer foo = stringBuffer("fooobar");
DataBuffer foo = stringBuffer("foooobar");
byte[] delims = "oo".getBytes(StandardCharsets.UTF_8);
DataBufferUtils.Matcher matcher = DataBufferUtils.matcher(delims);
int result = matcher.match(foo);
assertThat(result).isEqualTo(2);
foo.readPosition(2);
result = matcher.match(foo);
assertThat(result).isEqualTo(3);
foo.readPosition(3);
result = matcher.match(foo);
assertThat(result).isEqualTo(-1);
int endIndex = matcher.match(foo);
assertThat(endIndex).isEqualTo(2);
foo.readPosition(endIndex + 1);
endIndex = matcher.match(foo);
assertThat(endIndex).isEqualTo(4);
foo.readPosition(endIndex + 1);
endIndex = matcher.match(foo);
assertThat(endIndex).isEqualTo(-1);
release(foo);
}
@ParameterizedDataBufferAllocatingTest
void matcher3(String displayName, DataBufferFactory bufferFactory) {
	super.bufferFactory = bufferFactory;

	DataBuffer buffer = stringBuffer("foooobar");
	DataBufferUtils.Matcher matcher = DataBufferUtils.matcher("oo".getBytes(StandardCharsets.UTF_8));

	// Two back-to-back "oo" delimiters: each match is followed by moving
	// the read position just past the reported end index.
	int[] expectedEndIndexes = {2, 4};
	for (int expected : expectedEndIndexes) {
		int endIndex = matcher.match(buffer);
		assertThat(endIndex).isEqualTo(expected);
		buffer.readPosition(endIndex + 1);
	}

	// Nothing left to match in the remainder of the buffer.
	int lastResult = matcher.match(buffer);
	assertThat(lastResult).isEqualTo(-1);

	release(buffer);
}