fix/hack remaining filter and analysis issues

Robert Muir 2014-10-29 01:00:15 -04:00
parent df53448856
commit 0f8740a782
7 changed files with 58 additions and 27 deletions

src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java

@@ -23,6 +23,7 @@ import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BitSet;
 import org.apache.lucene.util.BitDocIdSet;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.RamUsageEstimator;
@@ -81,11 +82,10 @@ public class DocIdSets {
         }
         // TODO: should we use WAH8DocIdSet like Lucene?
         FixedBitSet fixedBitSet = new FixedBitSet(reader.maxDoc());
-        do {
-            fixedBitSet.set(doc);
-            doc = it.nextDoc();
-        } while (doc != DocIdSetIterator.NO_MORE_DOCS);
-        return new BitDocIdSet(fixedBitSet);
+        it = set.iterator();
+        long cost = it.cost();
+        fixedBitSet.or(it);
+        return new BitDocIdSet(fixedBitSet, cost);
     }

     /**
@@ -114,4 +114,29 @@ public class DocIdSets {
         set.or(iterator);
         return set;
     }
+
+    /**
+     * Creates a new DocIDSet if you have no idea of the cardinality,
+     * and are afraid of the cost of computing the cost.
+     * @deprecated remove usages of this.
+     */
+    @Deprecated
+    public static BitDocIdSet newDocIDSet(BitSet bs) {
+        final int cost;
+        if (bs instanceof FixedBitSet) {
+            cost = guessCost((FixedBitSet) bs);
+        } else {
+            cost = bs.approximateCardinality();
+        }
+        return new BitDocIdSet(bs, cost);
+    }
+
+    // nocommit: we should instead base this always on cost of clauses and stuff???
+    private static int guessCost(FixedBitSet bs) {
+        if (bs.length() < 8192) {
+            return bs.cardinality();
+        } else {
+            return bs.length() / 8192 * new FixedBitSet(bs.getBits(), 8192).cardinality();
+        }
+    }
 }
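
A note on the cost estimate above: guessCost() dodges a full cardinality() scan on large sets by counting only the first 8192 bits (the FixedBitSet(long[], int) constructor wraps the existing backing array rather than copying it) and scaling the sample by the number of 8192-bit blocks. Integer division makes the estimate coarse, and it assumes set bits are spread roughly evenly. A minimal, self-contained sketch of the same sampling idea; the class name and toy data are mine, not part of the commit:

    import org.apache.lucene.util.FixedBitSet;

    public class CostEstimateDemo {
        public static void main(String[] args) {
            FixedBitSet bits = new FixedBitSet(1 << 20); // ~1M docs
            for (int doc = 0; doc < bits.length(); doc += 16) {
                bits.set(doc); // every 16th doc set: true cardinality = 65536
            }
            // exact count over the first 8192 bits only, via a view over
            // the same backing long[] (no copy is made)
            int sample = new FixedBitSet(bits.getBits(), 8192).cardinality();
            int estimate = bits.length() / 8192 * sample;
            System.out.println("sample=" + sample         // 512
                    + " estimate=" + estimate             // 128 * 512 = 65536
                    + " exact=" + bits.cardinality());    // 65536
        }
    }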

src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java

@@ -25,6 +25,7 @@ import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.FixedBitSet;
 import org.elasticsearch.common.lucene.docset.AllDocIdSet;
 import org.elasticsearch.common.lucene.docset.DocIdSets;
@@ -179,7 +180,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
         if (!hasBits) {
             if (!fastOrClauses.isEmpty()) {
-                DocIdSetIterator it = res.iterator();
+                DocIdSetIterator it = new BitSetIterator(res, 0);
                 at_least_one_should_clause_iter:
                 for (int setDoc = it.nextDoc(); setDoc != DocIdSetIterator.NO_MORE_DOCS; setDoc = it.nextDoc()) {
                     for (ResultClause fastOrClause : fastOrClauses) {
@@ -199,7 +200,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
             if (hasShouldClauses && !hasNonEmptyShouldClause) {
                 return null;
             } else {
-                return res;
+                return DocIdSets.newDocIDSet(res);
             }
         }
@@ -244,7 +245,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
             } else {
                 Bits bits = clause.bits;
                 // use the "res" to drive the iteration
-                DocIdSetIterator it = res.iterator();
+                DocIdSetIterator it = new BitSetIterator(res, 0);
                 for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
                     if (!bits.get(doc)) {
                         res.clear(doc);
@@ -262,7 +263,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
             } else {
                 Bits bits = clause.bits;
                 // let res drive the iteration
-                DocIdSetIterator it = res.iterator();
+                DocIdSetIterator it = new BitSetIterator(res, 0);
                 for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
                     if (bits.get(doc)) {
                         res.clear(doc);
@@ -277,7 +278,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
         // clause must match in order for a doc to be a match. What we do here is checking if matched docs match with
         // any should filter. TODO: Add an option to have disable minimum_should_match=1 behaviour
         if (!slowOrClauses.isEmpty() || !fastOrClauses.isEmpty()) {
-            DocIdSetIterator it = res.iterator();
+            DocIdSetIterator it = new BitSetIterator(res, 0);
             at_least_one_should_clause_iter:
             for (int setDoc = it.nextDoc(); setDoc != DocIdSetIterator.NO_MORE_DOCS; setDoc = it.nextDoc()) {
                 for (ResultClause fastOrClause : fastOrClauses) {
@@ -303,7 +304,7 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
         if (hasShouldClauses && !hasNonEmptyShouldClause) {
             return null;
         } else {
-            return res;
+            return DocIdSets.newDocIDSet(res);
         }
     }
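
Background on the iterator swap: on this branch the intermediate `res` is a raw FixedBitSet rather than a DocIdSet, so there is no res.iterator() anymore. BitSetIterator adapts any BitSet into a DocIdSetIterator, and its second argument is the cost it will report to callers; the filter passes 0 because it has no estimate at that point. A standalone sketch of the iteration pattern, not code from the commit:

    import java.io.IOException;

    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.BitSetIterator;
    import org.apache.lucene.util.FixedBitSet;

    public class BitSetIteratorDemo {
        public static void main(String[] args) throws IOException {
            FixedBitSet res = new FixedBitSet(16);
            res.set(3);
            res.set(7);
            res.set(12);
            // second argument is the cost announced to consumers; 0 here,
            // mirroring the XBooleanFilter call sites above
            DocIdSetIterator it = new BitSetIterator(res, 0);
            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
                System.out.println("match: doc " + doc); // 3, 7, 12
            }
        }
    }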

src/main/java/org/elasticsearch/index/analysis/KeepWordFilterFactory.java

@@ -91,8 +91,10 @@ public class KeepWordFilterFactory extends AbstractTokenFilterFactory {
     public TokenStream create(TokenStream tokenStream) {
         if (version.onOrAfter(Version.LUCENE_4_4)) {
             return new KeepWordFilter(tokenStream, keepWords);
+        } else {
+            // nocommit: what happened here?
+            throw new UnsupportedOperationException();
         }
-        return new KeepWordFilter(version, enablePositionIncrements, tokenStream, keepWords);
     }
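
For context: Lucene 4.4 dropped the enablePositionIncrements=false option from KeepWordFilter (gaps left by removed tokens are always preserved now), which is why the version-taking constructor call is gone and the pre-4.4 path is stubbed with UnsupportedOperationException. A sketch of the surviving 4.4+ call, assuming the Lucene 5.0-snapshot API this branch builds against; class name and sample data are illustrative:

    import java.io.StringReader;
    import java.util.Arrays;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.KeepWordFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;

    public class KeepWordDemo {
        public static void main(String[] args) throws Exception {
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("foo baz bar quux"));
            CharArraySet keepWords = new CharArraySet(Arrays.asList("foo", "bar"), true);
            // no Version argument, no posInc flag: removed tokens always
            // leave a position-increment gap behind
            TokenStream stream = new KeepWordFilter(tokenizer, keepWords);
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // prints: foo, bar
            }
            stream.end();
            stream.close();
        }
    }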

src/main/java/org/elasticsearch/index/analysis/LengthTokenFilterFactory.java

@@ -57,7 +57,9 @@ public class LengthTokenFilterFactory extends AbstractTokenFilterFactory {
     public TokenStream create(TokenStream tokenStream) {
         if (version.onOrAfter(Version.LUCENE_4_4)) {
             return new LengthFilter(tokenStream, min, max);
-        }
-        return new LengthFilter(version, enablePositionIncrements, tokenStream, min, max);
+        } else {
+            // nocommit: what happened here?
+            throw new UnsupportedOperationException();
+        }
     }
 }
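
Same situation as KeepWordFilterFactory above: LengthFilter lost its enablePositionIncrements option in Lucene 4.4, so only the (TokenStream, min, max) form survives. A hedged sketch of that call (illustrative names, Lucene 5.0-snapshot API assumed):

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.LengthFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class LengthFilterDemo {
        public static void main(String[] args) throws Exception {
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("a bb ccc dddd"));
            // keep tokens of 2..3 chars (bounds inclusive); dropped tokens
            // always leave a position-increment gap in 4.4+
            TokenStream stream = new LengthFilter(tokenizer, 2, 3);
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // prints: bb, ccc
            }
            stream.end();
            stream.close();
        }
    }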

src/main/java/org/elasticsearch/index/analysis/StopTokenFilterFactory.java

@@ -64,8 +64,11 @@ public class StopTokenFilterFactory extends AbstractTokenFilterFactory {
     @Override
     public TokenStream create(TokenStream tokenStream) {
         if (removeTrailing) {
-            StopFilter filter = new StopFilter(version, tokenStream, stopWords);
-            filter.setEnablePositionIncrements(enablePositionIncrements);
+            StopFilter filter = new StopFilter(tokenStream, stopWords);
+            if (enablePositionIncrements == false) {
+                // nocommit: what happened here?
+                throw new UnsupportedOperationException();
+            }
             return filter;
         } else {
             return new SuggestStopFilter(tokenStream, stopWords);
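
Background on this hack: StopFilter.setEnablePositionIncrements was removed in Lucene 4.4 (increments are always enabled), so the factory can now only throw when a setting asks for enablePositionIncrements=false. The removeTrailing flag picks between plain StopFilter and SuggestStopFilter, which spares a trailing stopword on the theory that the user may still be typing it. A self-contained illustration of the difference; SuggestStopFilter lives in the lucene-suggest module, and the class name and data here are mine:

    import java.io.StringReader;
    import java.util.Arrays;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.StopFilter;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.search.suggest.analyzing.SuggestStopFilter;

    public class TrailingStopDemo {
        static void print(TokenStream stream) throws Exception {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + term + "] ");
            }
            stream.end();
            stream.close();
            System.out.println();
        }

        public static void main(String[] args) throws Exception {
            CharArraySet stopWords = new CharArraySet(Arrays.asList("the"), true);

            WhitespaceTokenizer t1 = new WhitespaceTokenizer();
            t1.setReader(new StringReader("ask the"));
            print(new StopFilter(t1, stopWords));        // removeTrailing=true:  [ask]

            WhitespaceTokenizer t2 = new WhitespaceTokenizer();
            t2.setReader(new StringReader("ask the"));
            print(new SuggestStopFilter(t2, stopWords)); // removeTrailing=false: [ask] [the]
        }
    }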

src/main/java/org/elasticsearch/index/search/geo/IndexedGeoBoundingBoxFilter.java

@@ -19,14 +19,12 @@
 package org.elasticsearch.index.search.geo;

-import org.apache.lucene.util.BitDocIdSet;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BitSet;
-import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.BitDocIdSet;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.common.geo.GeoPoint;
 import org.elasticsearch.common.lucene.docset.DocIdSets;
@@ -81,9 +79,9 @@ public class IndexedGeoBoundingBoxFilter {
             }
         } else {
             if (main == null) {
-                main = (FixedBitSet) set;
+                main = ((BitDocIdSet) set).bits();
             } else {
-                main.or((FixedBitSet) set);
+                main.or(set.iterator());
             }
         }
@@ -92,7 +90,7 @@ public class IndexedGeoBoundingBoxFilter {
             return null;
         }
         main.and(set.iterator());
-        return main;
+        return DocIdSets.newDocIDSet(main);
     }

     @Override
@@ -129,19 +127,19 @@ public class IndexedGeoBoundingBoxFilter {
         }

         @Override
-        public FixedBitSet getDocIdSet(LeafReaderContext context, Bits acceptedDocs) throws IOException {
-            FixedBitSet main;
+        public BitDocIdSet getDocIdSet(LeafReaderContext context, Bits acceptedDocs) throws IOException {
+            BitSet main;
             DocIdSet set = lonFilter.getDocIdSet(context, acceptedDocs);
             if (DocIdSets.isEmpty(set)) {
                 return null;
             }
-            main = (FixedBitSet) set;
+            main = ((BitDocIdSet) set).bits();

             set = latFilter.getDocIdSet(context, acceptedDocs);
             if (DocIdSets.isEmpty(set)) {
                 return null;
             }
             main.and(set.iterator());
-            return main;
+            return DocIdSets.newDocIDSet(main);
         }

         @Override
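
The shape of both branches above: unwrap the BitSet from the lon filter's BitDocIdSet, AND the lat set into it through its iterator, then wrap the result with the deprecated newDocIDSet() shim because BitDocIdSet's constructor now wants a cost up front. A standalone sketch of the intersect-via-iterator step (toy data, not the filter itself):

    import java.io.IOException;

    import org.apache.lucene.util.BitSetIterator;
    import org.apache.lucene.util.FixedBitSet;

    public class BitSetIntersectDemo {
        public static void main(String[] args) throws IOException {
            FixedBitSet lonMatches = new FixedBitSet(8);
            lonMatches.set(1); lonMatches.set(3); lonMatches.set(5);

            FixedBitSet latMatches = new FixedBitSet(8);
            latMatches.set(3); latMatches.set(5); latMatches.set(6);

            // and(iterator) clears every bit of lonMatches not also set in
            // latMatches, leaving only docs that match both ranges
            lonMatches.and(new BitSetIterator(latMatches, latMatches.cardinality()));
            System.out.println(lonMatches.cardinality()); // 2 (docs 3 and 5)
        }
    }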

src/test/java/org/elasticsearch/index/analysis/StopTokenFilterTests.java

@@ -75,7 +75,6 @@ public class StopTokenFilterTests extends ElasticsearchTokenStreamTestCase {
         tokenizer.setReader(new StringReader("foo bar"));
         TokenStream create = tokenFilter.create(tokenizer);
         assertThat(create, instanceOf(StopFilter.class));
-        assertThat(((StopFilter)create).getEnablePositionIncrements(), equalTo(true));
     }

     @Test
@@ -90,7 +89,8 @@ public class StopTokenFilterTests extends ElasticsearchTokenStreamTestCase {
         tokenizer.setReader(new StringReader("foo bar"));
         TokenStream create = tokenFilter.create(tokenizer);
         assertThat(create, instanceOf(StopFilter.class));
-        assertThat(((StopFilter)create).getEnablePositionIncrements(), equalTo(false));
+        // nocommit: was posInc=false actually supported in 4.3 in lucene (other than for ancient back compat?)
+        fail("what happened here, and what to do about it");
     }

     @Test