fix/hack remaining filter and analysis issues
This commit is contained in:
parent
df53448856
commit
0f8740a782
|
@ -23,6 +23,7 @@ import org.apache.lucene.index.LeafReader;
|
||||||
import org.apache.lucene.search.DocIdSet;
|
import org.apache.lucene.search.DocIdSet;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.BitSet;
|
||||||
import org.apache.lucene.util.BitDocIdSet;
|
import org.apache.lucene.util.BitDocIdSet;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.apache.lucene.util.RamUsageEstimator;
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
@ -81,11 +82,10 @@ public class DocIdSets {
|
||||||
}
|
}
|
||||||
// TODO: should we use WAH8DocIdSet like Lucene?
|
// TODO: should we use WAH8DocIdSet like Lucene?
|
||||||
FixedBitSet fixedBitSet = new FixedBitSet(reader.maxDoc());
|
FixedBitSet fixedBitSet = new FixedBitSet(reader.maxDoc());
|
||||||
do {
|
it = set.iterator();
|
||||||
fixedBitSet.set(doc);
|
long cost = it.cost();
|
||||||
doc = it.nextDoc();
|
fixedBitSet.or(it);
|
||||||
} while (doc != DocIdSetIterator.NO_MORE_DOCS);
|
return new BitDocIdSet(fixedBitSet, cost);
|
||||||
return new BitDocIdSet(fixedBitSet);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -114,4 +114,29 @@ public class DocIdSets {
|
||||||
set.or(iterator);
|
set.or(iterator);
|
||||||
return set;
|
return set;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new DocIDSet if you have no idea of the cardinality,
|
||||||
|
* and are afraid of the cost of computing the cost.
|
||||||
|
* @deprecated remove usages of this.
|
||||||
|
*/
|
||||||
|
@Deprecated
|
||||||
|
public static BitDocIdSet newDocIDSet(BitSet bs) {
|
||||||
|
final int cost;
|
||||||
|
if (bs instanceof FixedBitSet) {
|
||||||
|
cost = guessCost((FixedBitSet) bs);
|
||||||
|
} else {
|
||||||
|
cost = bs.approximateCardinality();
|
||||||
|
}
|
||||||
|
return new BitDocIdSet(bs, cost);
|
||||||
|
}
|
||||||
|
|
||||||
|
// nocommit: we should instead base this always on cost of clauses and stuff???
|
||||||
|
private static int guessCost(FixedBitSet bs) {
|
||||||
|
if (bs.length() < 8192) {
|
||||||
|
return bs.cardinality();
|
||||||
|
} else {
|
||||||
|
return bs.length() / 8192 * new FixedBitSet(bs.getBits(), 8192).cardinality();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.search.DocIdSet;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.search.Filter;
|
import org.apache.lucene.search.Filter;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.BitSetIterator;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
import org.elasticsearch.common.lucene.docset.AllDocIdSet;
|
import org.elasticsearch.common.lucene.docset.AllDocIdSet;
|
||||||
import org.elasticsearch.common.lucene.docset.DocIdSets;
|
import org.elasticsearch.common.lucene.docset.DocIdSets;
|
||||||
|
@ -179,7 +180,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
|
||||||
|
|
||||||
if (!hasBits) {
|
if (!hasBits) {
|
||||||
if (!fastOrClauses.isEmpty()) {
|
if (!fastOrClauses.isEmpty()) {
|
||||||
DocIdSetIterator it = res.iterator();
|
DocIdSetIterator it = new BitSetIterator(res, 0);
|
||||||
at_least_one_should_clause_iter:
|
at_least_one_should_clause_iter:
|
||||||
for (int setDoc = it.nextDoc(); setDoc != DocIdSetIterator.NO_MORE_DOCS; setDoc = it.nextDoc()) {
|
for (int setDoc = it.nextDoc(); setDoc != DocIdSetIterator.NO_MORE_DOCS; setDoc = it.nextDoc()) {
|
||||||
for (ResultClause fastOrClause : fastOrClauses) {
|
for (ResultClause fastOrClause : fastOrClauses) {
|
||||||
|
@ -199,7 +200,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
|
||||||
if (hasShouldClauses && !hasNonEmptyShouldClause) {
|
if (hasShouldClauses && !hasNonEmptyShouldClause) {
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
return res;
|
return DocIdSets.newDocIDSet(res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -244,7 +245,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
|
||||||
} else {
|
} else {
|
||||||
Bits bits = clause.bits;
|
Bits bits = clause.bits;
|
||||||
// use the "res" to drive the iteration
|
// use the "res" to drive the iteration
|
||||||
DocIdSetIterator it = res.iterator();
|
DocIdSetIterator it = new BitSetIterator(res, 0);
|
||||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||||
if (!bits.get(doc)) {
|
if (!bits.get(doc)) {
|
||||||
res.clear(doc);
|
res.clear(doc);
|
||||||
|
@ -262,7 +263,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
|
||||||
} else {
|
} else {
|
||||||
Bits bits = clause.bits;
|
Bits bits = clause.bits;
|
||||||
// let res drive the iteration
|
// let res drive the iteration
|
||||||
DocIdSetIterator it = res.iterator();
|
DocIdSetIterator it = new BitSetIterator(res, 0);
|
||||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||||
if (bits.get(doc)) {
|
if (bits.get(doc)) {
|
||||||
res.clear(doc);
|
res.clear(doc);
|
||||||
|
@ -277,7 +278,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
|
||||||
// clause must match in order for a doc to be a match. What we do here is checking if matched docs match with
|
// clause must match in order for a doc to be a match. What we do here is checking if matched docs match with
|
||||||
// any should filter. TODO: Add an option to disable the minimum_should_match=1 behaviour
|
// any should filter. TODO: Add an option to disable the minimum_should_match=1 behaviour
|
||||||
if (!slowOrClauses.isEmpty() || !fastOrClauses.isEmpty()) {
|
if (!slowOrClauses.isEmpty() || !fastOrClauses.isEmpty()) {
|
||||||
DocIdSetIterator it = res.iterator();
|
DocIdSetIterator it = new BitSetIterator(res, 0);
|
||||||
at_least_one_should_clause_iter:
|
at_least_one_should_clause_iter:
|
||||||
for (int setDoc = it.nextDoc(); setDoc != DocIdSetIterator.NO_MORE_DOCS; setDoc = it.nextDoc()) {
|
for (int setDoc = it.nextDoc(); setDoc != DocIdSetIterator.NO_MORE_DOCS; setDoc = it.nextDoc()) {
|
||||||
for (ResultClause fastOrClause : fastOrClauses) {
|
for (ResultClause fastOrClause : fastOrClauses) {
|
||||||
|
@ -303,7 +304,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
|
||||||
if (hasShouldClauses && !hasNonEmptyShouldClause) {
|
if (hasShouldClauses && !hasNonEmptyShouldClause) {
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
return res;
|
return DocIdSets.newDocIDSet(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -91,8 +91,10 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
if (version.onOrAfter(Version.LUCENE_4_4)) {
|
if (version.onOrAfter(Version.LUCENE_4_4)) {
|
||||||
return new KeepWordFilter(tokenStream, keepWords);
|
return new KeepWordFilter(tokenStream, keepWords);
|
||||||
|
} else {
|
||||||
|
// nocommit: what happened here?
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
return new KeepWordFilter(version, enablePositionIncrements, tokenStream, keepWords);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -57,7 +57,9 @@ public class LengthTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
if (version.onOrAfter(Version.LUCENE_4_4)) {
|
if (version.onOrAfter(Version.LUCENE_4_4)) {
|
||||||
return new LengthFilter(tokenStream, min, max);
|
return new LengthFilter(tokenStream, min, max);
|
||||||
}
|
} else {
|
||||||
return new LengthFilter(version, enablePositionIncrements, tokenStream, min, max);
|
// nocommit: what happened here?
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -64,8 +64,11 @@ public class StopTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
if (removeTrailing) {
|
if (removeTrailing) {
|
||||||
StopFilter filter = new StopFilter(version, tokenStream, stopWords);
|
StopFilter filter = new StopFilter(tokenStream, stopWords);
|
||||||
filter.setEnablePositionIncrements(enablePositionIncrements);
|
if (enablePositionIncrements == false) {
|
||||||
|
// nocommit: what happened here?
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
return filter;
|
return filter;
|
||||||
} else {
|
} else {
|
||||||
return new SuggestStopFilter(tokenStream, stopWords);
|
return new SuggestStopFilter(tokenStream, stopWords);
|
||||||
|
|
|
@ -19,14 +19,12 @@
|
||||||
|
|
||||||
package org.elasticsearch.index.search.geo;
|
package org.elasticsearch.index.search.geo;
|
||||||
|
|
||||||
import org.apache.lucene.util.BitDocIdSet;
|
|
||||||
|
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.search.DocIdSet;
|
import org.apache.lucene.search.DocIdSet;
|
||||||
import org.apache.lucene.search.Filter;
|
import org.apache.lucene.search.Filter;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.BitSet;
|
import org.apache.lucene.util.BitSet;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.BitDocIdSet;
|
||||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||||
import org.elasticsearch.common.geo.GeoPoint;
|
import org.elasticsearch.common.geo.GeoPoint;
|
||||||
import org.elasticsearch.common.lucene.docset.DocIdSets;
|
import org.elasticsearch.common.lucene.docset.DocIdSets;
|
||||||
|
@ -81,9 +79,9 @@ public class IndexedGeoBoundingBoxFilter {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (main == null) {
|
if (main == null) {
|
||||||
main = (FixedBitSet) set;
|
main = ((BitDocIdSet) set).bits();
|
||||||
} else {
|
} else {
|
||||||
main.or((FixedBitSet) set);
|
main.or(set.iterator());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -92,7 +90,7 @@ public class IndexedGeoBoundingBoxFilter {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
main.and(set.iterator());
|
main.and(set.iterator());
|
||||||
return main;
|
return DocIdSets.newDocIDSet(main);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -129,19 +127,19 @@ public class IndexedGeoBoundingBoxFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public FixedBitSet getDocIdSet(LeafReaderContext context, Bits acceptedDocs) throws IOException {
|
public BitDocIdSet getDocIdSet(LeafReaderContext context, Bits acceptedDocs) throws IOException {
|
||||||
FixedBitSet main;
|
BitSet main;
|
||||||
DocIdSet set = lonFilter.getDocIdSet(context, acceptedDocs);
|
DocIdSet set = lonFilter.getDocIdSet(context, acceptedDocs);
|
||||||
if (DocIdSets.isEmpty(set)) {
|
if (DocIdSets.isEmpty(set)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
main = (FixedBitSet) set;
|
main = ((BitDocIdSet) set).bits();
|
||||||
set = latFilter.getDocIdSet(context, acceptedDocs);
|
set = latFilter.getDocIdSet(context, acceptedDocs);
|
||||||
if (DocIdSets.isEmpty(set)) {
|
if (DocIdSets.isEmpty(set)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
main.and(set.iterator());
|
main.and(set.iterator());
|
||||||
return main;
|
return DocIdSets.newDocIDSet(main);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -75,7 +75,6 @@ public class StopTokenFilterTests extends ElasticsearchTokenStreamTestCase {
|
||||||
tokenizer.setReader(new StringReader("foo bar"));
|
tokenizer.setReader(new StringReader("foo bar"));
|
||||||
TokenStream create = tokenFilter.create(tokenizer);
|
TokenStream create = tokenFilter.create(tokenizer);
|
||||||
assertThat(create, instanceOf(StopFilter.class));
|
assertThat(create, instanceOf(StopFilter.class));
|
||||||
assertThat(((StopFilter)create).getEnablePositionIncrements(), equalTo(true));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -90,7 +89,8 @@ public class StopTokenFilterTests extends ElasticsearchTokenStreamTestCase {
|
||||||
tokenizer.setReader(new StringReader("foo bar"));
|
tokenizer.setReader(new StringReader("foo bar"));
|
||||||
TokenStream create = tokenFilter.create(tokenizer);
|
TokenStream create = tokenFilter.create(tokenizer);
|
||||||
assertThat(create, instanceOf(StopFilter.class));
|
assertThat(create, instanceOf(StopFilter.class));
|
||||||
assertThat(((StopFilter)create).getEnablePositionIncrements(), equalTo(false));
|
// nocommit: was posInc=false actually supported in Lucene 4.3 (other than for ancient back compat)?
|
||||||
|
fail("what happened here, and what to do about it");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue