diff options
9 files changed, 83 insertions, 39 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java b/container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java index d5d193f54b4..8a91587daa2 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java @@ -15,6 +15,7 @@ public interface HasIndexItem { @NonNull public String getIndexName(); + /** @return how many phrase words does this item contain */ public int getNumWords(); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java b/container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java index c3689805dd7..e44a86ddd2d 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java @@ -218,10 +218,12 @@ public class PhraseItem extends CompositeIndexedItem { WordItem wordItem = (WordItem) item; buffer.append(wordItem.getWord()); - } else { + } else if (item instanceof PhraseSegmentItem) { PhraseSegmentItem seg = (PhraseSegmentItem) item; seg.appendContentsString(buffer); + } else { + buffer.append(item.toString()); } if (i.hasNext()) { buffer.append(" "); @@ -250,7 +252,6 @@ public class PhraseItem extends CompositeIndexedItem { public int getNumWords() { int numWords = 0; - for (Iterator<Item> j = getItemIterator(); j.hasNext();) { numWords += ((IndexedItem) j.next()).getNumWords(); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java index 3d2e437d34c..1227a7f80cf 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java @@ -82,6 +82,7 @@ public abstract class SegmentItem extends CompositeItem implements BlockItem { return locked; } + @Override public int getNumWords() { return getItemCount(); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java b/container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java index 1157d2763e0..9815bfafc82 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java @@ -91,7 +91,13 @@ public class WordAlternativesItem extends TermItem { @Override public String stringValue() { - return alternatives.get(maxIndex).word; + StringBuilder builder = new StringBuilder(); + builder.append("[ "); + for (Alternative a : alternatives) { + builder.append(a.word).append("(").append(a.exactness).append(") "); + } + builder.append("]"); + return builder.toString(); } @Override @@ -101,7 +107,7 @@ public class WordAlternativesItem extends TermItem { @Override public int getNumWords() { - return alternatives.size(); + return 1; } @Override diff --git a/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java b/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java index 0b3d11158f1..39573e4d71f 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java @@ -157,6 +157,7 @@ public class WordItem extends TermItem { return this.word.equals(other.word); } + @Override public int getNumWords() { return 1; } diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java index fd80d00c98d..655fbf6acc3 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java @@ -37,6 +37,14 @@ import static com.yahoo.prelude.querytransform.CJKSearcher.TERM_ORDER_RELAXATION @Provides(StemmingSearcher.STEMMING) public class StemmingSearcher extends Searcher { + private static class StemContext { + public boolean isCJK = false; + public boolean insidePhrase = false; + public Language language = null; + public IndexFacts.Session indexFacts = null; + public Map<Item, TaggableItem> reverseConnectivity = null; + } + public static final String STEMMING = "Stemming"; public static final CompoundName DISABLE = new CompoundName("nostemming"); private final Linguistics linguistics; @@ -67,7 +75,10 @@ public class StemmingSearcher extends Searcher { for (String field : highlightFields) { StemMode stemMode = indexFacts.getIndex(field).getStemMode(); if (stemMode != StemMode.NONE) { - Item newHighlight = scan(highlight.getHighlightItems().get(field), false, Language.ENGLISH, indexFacts, null); + StemContext context = new StemContext(); + context.language = Language.ENGLISH; + context.indexFacts = indexFacts; + Item newHighlight = scan(highlight.getHighlightItems().get(field), context); highlight.getHighlightItems().put(field, (AndItem)newHighlight); } } @@ -82,8 +93,12 @@ public class StemmingSearcher extends Searcher { if (language == Language.UNKNOWN) { return q.getModel().getQueryTree().getRoot(); } - return scan(q.getModel().getQueryTree().getRoot(), language.isCjk(), language, indexFacts, - createReverseConnectivities(q.getModel().getQueryTree().getRoot())); + StemContext context = new StemContext(); + context.isCJK = language.isCjk(); + context.language = language; + context.indexFacts = indexFacts; + context.reverseConnectivity = createReverseConnectivities(q.getModel().getQueryTree().getRoot()); + return scan(q.getModel().getQueryTree().getRoot(), context); } private Map<Item, TaggableItem> createReverseConnectivities(Item root) { @@ -108,36 +123,38 @@ public class StemmingSearcher extends Searcher { return reverseConnectivity; } - private Item scan(Item item, boolean isCJK, Language l, IndexFacts.Session indexFacts, - Map<Item, TaggableItem> reverseConnectivity) { + private Item scan(Item item, StemContext context) { if (item == null) { return null; - } else if (item instanceof BlockItem) { - return checkBlock((BlockItem) item, isCJK, l, indexFacts, reverseConnectivity); + } + boolean old = context.insidePhrase; + if (item instanceof PhraseItem || item instanceof PhraseSegmentItem) { + context.insidePhrase = true; + } + if (item instanceof BlockItem) { + item = checkBlock((BlockItem) item, context); } else if (item instanceof CompositeItem) { CompositeItem comp = (CompositeItem) item; ListIterator<Item> i = comp.getItemIterator(); while (i.hasNext()) { Item original = i.next(); - Item transformed = scan(original, isCJK, l, indexFacts, reverseConnectivity); + Item transformed = scan(original, context); if (original != transformed) i.set(transformed); } - return item; - } else { - return item; } + context.insidePhrase = old; + return item; } - private Item checkBlock(BlockItem b, boolean isCJK, Language language, - IndexFacts.Session indexFacts, Map<Item, TaggableItem> reverseConnectivity) { + private Item checkBlock(BlockItem b, StemContext context) { if (b instanceof PrefixItem || !b.isWords()) return (Item) b; if (b.isFromQuery() && !b.isStemmed()) { - Index index = indexFacts.getIndex(b.getIndexName()); + Index index = context.indexFacts.getIndex(b.getIndexName()); StemMode stemMode = index.getStemMode(); - if (stemMode != StemMode.NONE) return stem(b, isCJK, language, reverseConnectivity, index); + if (stemMode != StemMode.NONE) return stem(b, context, index); } return (Item) b; } @@ -158,21 +175,20 @@ public class StemmingSearcher extends Searcher { } // The rewriting logic is here - private Item stem(BlockItem current, boolean isCJK, - Language language, Map<Item, TaggableItem> reverseConnectivity, Index index) { + private Item stem(BlockItem current, StemContext context, Index index) { Item blockAsItem = (Item)current; CompositeItem composite; - List<StemList> segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), language); + List<StemList> segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), context.language); String indexName = current.getIndexName(); Substring substring = getOffsets(current); if (segments.size() == 1) { - TaggableItem w = singleWordSegment(current, segments.get(0), index, substring); - setMetaData(current, reverseConnectivity, w); + TaggableItem w = singleWordSegment(current, segments.get(0), index, substring, context.insidePhrase); + setMetaData(current, context.reverseConnectivity, w); return (Item) w; } - if (isCJK) { + if (context.isCJK) { composite = chooseCompositeForCJK(current, ((Item) current).getParent(), indexName); @@ -181,7 +197,7 @@ public class StemmingSearcher extends Searcher { } for (StemList segment : segments) { - TaggableItem w = singleWordSegment(current, segment, index, substring); + TaggableItem w = singleWordSegment(current, segment, index, substring, context.insidePhrase); if (composite instanceof AndSegmentItem) { setSignificance(w, current); @@ -189,7 +205,7 @@ public class StemmingSearcher extends Searcher { composite.addItem((Item) w); } if (composite instanceof AndSegmentItem) { - andSegmentConnectivity(current, reverseConnectivity, composite); + andSegmentConnectivity(current, context.reverseConnectivity, composite); } copyAttributes(blockAsItem, composite); composite.lock(); @@ -197,7 +213,7 @@ public class StemmingSearcher extends Searcher { if (composite instanceof PhraseSegmentItem) { PhraseSegmentItem replacement = (PhraseSegmentItem) composite; setSignificance(replacement, current); - phraseSegmentConnectivity(current, reverseConnectivity, replacement); + phraseSegmentConnectivity(current, context.reverseConnectivity, replacement); } return composite; @@ -265,23 +281,22 @@ public class StemmingSearcher extends Searcher { private TaggableItem singleWordSegment(BlockItem current, StemList segment, Index index, - Substring substring) { + Substring substring, + boolean insidePhrase) { String indexName = current.getIndexName(); - if (index.getLiteralBoost() || index.getStemMode() == StemMode.ALL) { - // Yes, this will create a new WordAlternativesItem even if stemmed - // and original form are identical. This is to decrease complexity - // in accent removal and lowercasing. + if (insidePhrase == false && ((index.getLiteralBoost() || index.getStemMode() == StemMode.ALL))) { List<Alternative> terms = new ArrayList<>(segment.size() + 1); terms.add(new Alternative(current.stringValue(), 1.0d)); for (String term : segment) { terms.add(new Alternative(term, 0.7d)); } WordAlternativesItem alternatives = new WordAlternativesItem(indexName, current.isFromQuery(), substring, terms); - return alternatives; - } else { - WordItem first = singleStemSegment((Item) current, segment.get(0), indexName, substring); - return first; + if (alternatives.getAlternatives().size() > 1) { + return alternatives; + } } + WordItem first = singleStemSegment((Item) current, segment.get(0), indexName, substring); + return first; } private void setMetaData(BlockItem current, Map<Item, TaggableItem> reverseConnectivity, TaggableItem replacement) { diff --git a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java index b4b1142b6d4..2aeab415559 100644 --- a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java @@ -130,6 +130,20 @@ public class StemmingSearcherTestCase { assertTrue("Did not find original word form in query.", foundExpectedBaseForm); } + @Test + public void testMultipleStemming() { + try { + Query q = new Query(QueryTestCase.httpEncode("/search?language=en&search=four&query=trees \"nouns girls\" flowers \"a verbs a\" girls&default-index=foobar")); + executeStemming(q); + assertEquals("AND WORD_ALTERNATIVES foobar:[ tree(0.7) trees(1.0) ] "+ + "foobar:\"noun girl\" WORD_ALTERNATIVES foobar:[ flower(0.7) flowers(1.0) ] "+ + "foobar:\"a verb a\" WORD_ALTERNATIVES foobar:[ girl(0.7) girls(1.0) ]", q.getModel().getQueryTree().getRoot().toString()); + } catch (Exception e) { + System.err.println("got exception: "+ e); + e.printStackTrace(); + } + } + private Execution.Context newExecutionContext() { return new Execution.Context(null, indexFacts, null, null, linguistics); } diff --git a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg index 0c34dade1da..f5b6c5a8541 100644 --- a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg +++ b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg @@ -1,6 +1,6 @@ -indexinfo[3] +indexinfo[4] indexinfo[0].name one -indexinfo[0].command[12] +indexinfo[0].command[14] indexinfo[0].command[0].indexname exactemento indexinfo[0].command[0].command compact-to-term indexinfo[0].command[1].indexname default @@ -45,3 +45,7 @@ indexinfo[2].command[0].indexname default indexinfo[2].command[0].command stem indexinfo[2].command[1].indexname default indexinfo[2].command[1].command literal-boost +indexinfo[3].name four +indexinfo[3].command[1] +indexinfo[3].command[0].indexname foobar +indexinfo[3].command[0].command "stem ALL" diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java index f2c53fb6b31..31a057c158f 100644 --- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java @@ -898,6 +898,7 @@ public class YqlParserTestCase { assertEquals(2, phrase.getItemCount()); assertEquals("forest", ((WordItem) phrase.getItem(0)).getWord()); checkWordAlternativesContent((WordAlternativesItem) phrase.getItem(1)); + assertEquals("foo:\"forest WORD_ALTERNATIVES foo:[ tree(0.7) trees(1.0) ]\"", root.toString()); } private void checkWordAlternativesContent(WordAlternativesItem alternatives) { |