summary refs log tree commit diff stats
path: root/container-search
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@oath.com> 2019-01-24 07:57:27 +0100
committer GitHub <noreply@github.com> 2019-01-24 07:57:27 +0100
commit aa049270fe9cce920d4d6d3d253d8c226d890411 (patch)
tree b348a14eead341a60f69b1cfdd84bd1c058caf6e /container-search
parent 47521adeb37d8d0bc68c90bc67ca994b3a04e2a5 (diff)
parent c149a4b1f57ef4b3164af93edf3a83bdc507a616 (diff)
Merge pull request #8217 from vespa-engine/arnej/avoid-alternatives-inside-phrases-head
Arnej/avoid alternatives inside phrases head MERGEOK
Diffstat (limited to 'container-search')
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java 1
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java 5
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java 1
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java 10
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/WordItem.java 1
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java 81
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java 14
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg 8
-rw-r--r-- container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java 1
9 files changed, 83 insertions, 39 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java b/container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java
index d5d193f54b4..8a91587daa2 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/HasIndexItem.java
@@ -15,6 +15,7 @@ public interface HasIndexItem {
@NonNull
public String getIndexName();
+ /** @return the number of phrase words this item contains */
public int getNumWords();
}
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java b/container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java
index c3689805dd7..e44a86ddd2d 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/PhraseItem.java
@@ -218,10 +218,12 @@ public class PhraseItem extends CompositeIndexedItem {
WordItem wordItem = (WordItem) item;
buffer.append(wordItem.getWord());
- } else {
+ } else if (item instanceof PhraseSegmentItem) {
PhraseSegmentItem seg = (PhraseSegmentItem) item;
seg.appendContentsString(buffer);
+ } else {
+ buffer.append(item.toString());
}
if (i.hasNext()) {
buffer.append(" ");
@@ -250,7 +252,6 @@ public class PhraseItem extends CompositeIndexedItem {
public int getNumWords() {
int numWords = 0;
-
for (Iterator<Item> j = getItemIterator(); j.hasNext();) {
numWords += ((IndexedItem) j.next()).getNumWords();
}
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java
index 3d2e437d34c..1227a7f80cf 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/SegmentItem.java
@@ -82,6 +82,7 @@ public abstract class SegmentItem extends CompositeItem implements BlockItem {
return locked;
}
+ @Override
public int getNumWords() {
return getItemCount();
}
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java b/container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java
index 1157d2763e0..9815bfafc82 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/WordAlternativesItem.java
@@ -91,7 +91,13 @@ public class WordAlternativesItem extends TermItem {
@Override
public String stringValue() {
- return alternatives.get(maxIndex).word;
+ StringBuilder builder = new StringBuilder();
+ builder.append("[ ");
+ for (Alternative a : alternatives) {
+ builder.append(a.word).append("(").append(a.exactness).append(") ");
+ }
+ builder.append("]");
+ return builder.toString();
}
@Override
@@ -101,7 +107,7 @@ public class WordAlternativesItem extends TermItem {
@Override
public int getNumWords() {
- return alternatives.size();
+ return 1;
}
@Override
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java b/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java
index 0b3d11158f1..39573e4d71f 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java
@@ -157,6 +157,7 @@ public class WordItem extends TermItem {
return this.word.equals(other.word);
}
+ @Override
public int getNumWords() {
return 1;
}
diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java
index fd80d00c98d..655fbf6acc3 100644
--- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java
+++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java
@@ -37,6 +37,14 @@ import static com.yahoo.prelude.querytransform.CJKSearcher.TERM_ORDER_RELAXATION
@Provides(StemmingSearcher.STEMMING)
public class StemmingSearcher extends Searcher {
+ private static class StemContext {
+ public boolean isCJK = false;
+ public boolean insidePhrase = false;
+ public Language language = null;
+ public IndexFacts.Session indexFacts = null;
+ public Map<Item, TaggableItem> reverseConnectivity = null;
+ }
+
public static final String STEMMING = "Stemming";
public static final CompoundName DISABLE = new CompoundName("nostemming");
private final Linguistics linguistics;
@@ -67,7 +75,10 @@ public class StemmingSearcher extends Searcher {
for (String field : highlightFields) {
StemMode stemMode = indexFacts.getIndex(field).getStemMode();
if (stemMode != StemMode.NONE) {
- Item newHighlight = scan(highlight.getHighlightItems().get(field), false, Language.ENGLISH, indexFacts, null);
+ StemContext context = new StemContext();
+ context.language = Language.ENGLISH;
+ context.indexFacts = indexFacts;
+ Item newHighlight = scan(highlight.getHighlightItems().get(field), context);
highlight.getHighlightItems().put(field, (AndItem)newHighlight);
}
}
@@ -82,8 +93,12 @@ public class StemmingSearcher extends Searcher {
if (language == Language.UNKNOWN) {
return q.getModel().getQueryTree().getRoot();
}
- return scan(q.getModel().getQueryTree().getRoot(), language.isCjk(), language, indexFacts,
- createReverseConnectivities(q.getModel().getQueryTree().getRoot()));
+ StemContext context = new StemContext();
+ context.isCJK = language.isCjk();
+ context.language = language;
+ context.indexFacts = indexFacts;
+ context.reverseConnectivity = createReverseConnectivities(q.getModel().getQueryTree().getRoot());
+ return scan(q.getModel().getQueryTree().getRoot(), context);
}
private Map<Item, TaggableItem> createReverseConnectivities(Item root) {
@@ -108,36 +123,38 @@ public class StemmingSearcher extends Searcher {
return reverseConnectivity;
}
- private Item scan(Item item, boolean isCJK, Language l, IndexFacts.Session indexFacts,
- Map<Item, TaggableItem> reverseConnectivity) {
+ private Item scan(Item item, StemContext context) {
if (item == null) {
return null;
- } else if (item instanceof BlockItem) {
- return checkBlock((BlockItem) item, isCJK, l, indexFacts, reverseConnectivity);
+ }
+ boolean old = context.insidePhrase;
+ if (item instanceof PhraseItem || item instanceof PhraseSegmentItem) {
+ context.insidePhrase = true;
+ }
+ if (item instanceof BlockItem) {
+ item = checkBlock((BlockItem) item, context);
} else if (item instanceof CompositeItem) {
CompositeItem comp = (CompositeItem) item;
ListIterator<Item> i = comp.getItemIterator();
while (i.hasNext()) {
Item original = i.next();
- Item transformed = scan(original, isCJK, l, indexFacts, reverseConnectivity);
+ Item transformed = scan(original, context);
if (original != transformed)
i.set(transformed);
}
- return item;
- } else {
- return item;
}
+ context.insidePhrase = old;
+ return item;
}
- private Item checkBlock(BlockItem b, boolean isCJK, Language language,
- IndexFacts.Session indexFacts, Map<Item, TaggableItem> reverseConnectivity) {
+ private Item checkBlock(BlockItem b, StemContext context) {
if (b instanceof PrefixItem || !b.isWords()) return (Item) b;
if (b.isFromQuery() && !b.isStemmed()) {
- Index index = indexFacts.getIndex(b.getIndexName());
+ Index index = context.indexFacts.getIndex(b.getIndexName());
StemMode stemMode = index.getStemMode();
- if (stemMode != StemMode.NONE) return stem(b, isCJK, language, reverseConnectivity, index);
+ if (stemMode != StemMode.NONE) return stem(b, context, index);
}
return (Item) b;
}
@@ -158,21 +175,20 @@ public class StemmingSearcher extends Searcher {
}
// The rewriting logic is here
- private Item stem(BlockItem current, boolean isCJK,
- Language language, Map<Item, TaggableItem> reverseConnectivity, Index index) {
+ private Item stem(BlockItem current, StemContext context, Index index) {
Item blockAsItem = (Item)current;
CompositeItem composite;
- List<StemList> segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), language);
+ List<StemList> segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), context.language);
String indexName = current.getIndexName();
Substring substring = getOffsets(current);
if (segments.size() == 1) {
- TaggableItem w = singleWordSegment(current, segments.get(0), index, substring);
- setMetaData(current, reverseConnectivity, w);
+ TaggableItem w = singleWordSegment(current, segments.get(0), index, substring, context.insidePhrase);
+ setMetaData(current, context.reverseConnectivity, w);
return (Item) w;
}
- if (isCJK) {
+ if (context.isCJK) {
composite = chooseCompositeForCJK(current,
((Item) current).getParent(),
indexName);
@@ -181,7 +197,7 @@ public class StemmingSearcher extends Searcher {
}
for (StemList segment : segments) {
- TaggableItem w = singleWordSegment(current, segment, index, substring);
+ TaggableItem w = singleWordSegment(current, segment, index, substring, context.insidePhrase);
if (composite instanceof AndSegmentItem) {
setSignificance(w, current);
@@ -189,7 +205,7 @@ public class StemmingSearcher extends Searcher {
composite.addItem((Item) w);
}
if (composite instanceof AndSegmentItem) {
- andSegmentConnectivity(current, reverseConnectivity, composite);
+ andSegmentConnectivity(current, context.reverseConnectivity, composite);
}
copyAttributes(blockAsItem, composite);
composite.lock();
@@ -197,7 +213,7 @@ public class StemmingSearcher extends Searcher {
if (composite instanceof PhraseSegmentItem) {
PhraseSegmentItem replacement = (PhraseSegmentItem) composite;
setSignificance(replacement, current);
- phraseSegmentConnectivity(current, reverseConnectivity, replacement);
+ phraseSegmentConnectivity(current, context.reverseConnectivity, replacement);
}
return composite;
@@ -265,23 +281,22 @@ public class StemmingSearcher extends Searcher {
private TaggableItem singleWordSegment(BlockItem current,
StemList segment,
Index index,
- Substring substring) {
+ Substring substring,
+ boolean insidePhrase) {
String indexName = current.getIndexName();
- if (index.getLiteralBoost() || index.getStemMode() == StemMode.ALL) {
- // Yes, this will create a new WordAlternativesItem even if stemmed
- // and original form are identical. This is to decrease complexity
- // in accent removal and lowercasing.
+ if (insidePhrase == false && ((index.getLiteralBoost() || index.getStemMode() == StemMode.ALL))) {
List<Alternative> terms = new ArrayList<>(segment.size() + 1);
terms.add(new Alternative(current.stringValue(), 1.0d));
for (String term : segment) {
terms.add(new Alternative(term, 0.7d));
}
WordAlternativesItem alternatives = new WordAlternativesItem(indexName, current.isFromQuery(), substring, terms);
- return alternatives;
- } else {
- WordItem first = singleStemSegment((Item) current, segment.get(0), indexName, substring);
- return first;
+ if (alternatives.getAlternatives().size() > 1) {
+ return alternatives;
+ }
}
+ WordItem first = singleStemSegment((Item) current, segment.get(0), indexName, substring);
+ return first;
}
private void setMetaData(BlockItem current, Map<Item, TaggableItem> reverseConnectivity, TaggableItem replacement) {
diff --git a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java
index b4b1142b6d4..2aeab415559 100644
--- a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java
@@ -130,6 +130,20 @@ public class StemmingSearcherTestCase {
assertTrue("Did not find original word form in query.", foundExpectedBaseForm);
}
+ /**
+ * Stems a query that mixes standalone words and quoted phrases against the
+ * "foobar" index of search definition "four" and checks the resulting query tree:
+ * standalone words are expected as WORD_ALTERNATIVES items, while words inside
+ * phrases are expected as plain stemmed words.
+ *
+ * Exceptions are deliberately not caught here: any failure while building or
+ * stemming the query must fail the test instead of being printed and ignored.
+ */
+ @Test
+ public void testMultipleStemming() throws Exception {
+ Query q = new Query(QueryTestCase.httpEncode("/search?language=en&search=four&query=trees \"nouns girls\" flowers \"a verbs a\" girls&default-index=foobar"));
+ executeStemming(q);
+ assertEquals("AND WORD_ALTERNATIVES foobar:[ tree(0.7) trees(1.0) ] "+
+ "foobar:\"noun girl\" WORD_ALTERNATIVES foobar:[ flower(0.7) flowers(1.0) ] "+
+ "foobar:\"a verb a\" WORD_ALTERNATIVES foobar:[ girl(0.7) girls(1.0) ]", q.getModel().getQueryTree().getRoot().toString());
+ }
+
private Execution.Context newExecutionContext() {
return new Execution.Context(null, indexFacts, null, null, linguistics);
}
diff --git a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg
index 0c34dade1da..f5b6c5a8541 100644
--- a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg
+++ b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/index-info.cfg
@@ -1,6 +1,6 @@
-indexinfo[3]
+indexinfo[4]
indexinfo[0].name one
-indexinfo[0].command[12]
+indexinfo[0].command[14]
indexinfo[0].command[0].indexname exactemento
indexinfo[0].command[0].command compact-to-term
indexinfo[0].command[1].indexname default
@@ -45,3 +45,7 @@ indexinfo[2].command[0].indexname default
indexinfo[2].command[0].command stem
indexinfo[2].command[1].indexname default
indexinfo[2].command[1].command literal-boost
+indexinfo[3].name four
+indexinfo[3].command[1]
+indexinfo[3].command[0].indexname foobar
+indexinfo[3].command[0].command "stem ALL"
diff --git a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
index f2c53fb6b31..31a057c158f 100644
--- a/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
+++ b/container-search/src/test/java/com/yahoo/search/yql/YqlParserTestCase.java
@@ -898,6 +898,7 @@ public class YqlParserTestCase {
assertEquals(2, phrase.getItemCount());
assertEquals("forest", ((WordItem) phrase.getItem(0)).getWord());
checkWordAlternativesContent((WordAlternativesItem) phrase.getItem(1));
+ assertEquals("foo:\"forest WORD_ALTERNATIVES foo:[ tree(0.7) trees(1.0) ]\"", root.toString());
}
private void checkWordAlternativesContent(WordAlternativesItem alternatives) {