summary refs log tree commit diff stats
path: root/container-search
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2023-06-05 09:28:50 +0200
committer GitHub <noreply@github.com> 2023-06-05 09:28:50 +0200
commit 240398303fe83ec1b6a4a1bf407125163310534d (patch)
tree 7cb3d8da514a492d987585357ec82a709b9c2c12 /container-search
parent f8bae81a83fd37fb884633e58c0f55e056bba30f (diff)
parent b18703690547333d559f09f63f40ada4fed6f4d4 (diff)
Merge pull request #27267 from vespa-engine/bratseth/emoji-stemming
Bratseth/emoji stemming
Diffstat (limited to 'container-search')
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java 37
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java 42
2 files changed, 33 insertions, 46 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java
index 7c4bcb38c41..9050b82fd69 100644
--- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java
+++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java
@@ -43,10 +43,8 @@ import com.yahoo.search.Searcher;
import com.yahoo.search.searchchain.Execution;
import com.yahoo.search.searchchain.PhaseNames;
-
import static com.yahoo.prelude.querytransform.CJKSearcher.TERM_ORDER_RELAXATION;
-
/**
* Replaces query terms with their stems
*
@@ -111,9 +109,8 @@ public class StemmingSearcher extends Searcher {
private Item replaceTerms(Query q, IndexFacts.Session indexFacts) {
Language language = q.getModel().getParsingLanguage();
- if (language == Language.UNKNOWN) {
- return q.getModel().getQueryTree().getRoot();
- }
+ if (language == Language.UNKNOWN) return q.getModel().getQueryTree().getRoot();
+
StemContext context = new StemContext();
context.isCJK = language.isCjk();
context.language = language;
@@ -144,9 +141,8 @@ public class StemmingSearcher extends Searcher {
}
private Item scan(Item item, StemContext context) {
- if (item == null) {
- return null;
- }
+ if (item == null) return null;
+
boolean old = context.insidePhrase;
if (item instanceof PhraseItem || item instanceof PhraseSegmentItem) {
context.insidePhrase = true;
@@ -155,7 +151,6 @@ public class StemmingSearcher extends Searcher {
item = checkBlock((BlockItem) item, context);
} else if (item instanceof CompositeItem comp) {
ListIterator<Item> i = comp.getItemIterator();
-
while (i.hasNext()) {
Item original = i.next();
Item transformed = scan(original, context);
@@ -186,7 +181,7 @@ public class StemmingSearcher extends Searcher {
if (i instanceof TermItem) {
return ((TermItem) i).getOrigin(); // this should always be the case
} else {
- getLogger().log(Level.WARNING, "Weird, BlockItem '" + b + "' was a composite containing " +
+ getLogger().log(Level.WARNING, "BlockItem '" + b + "' was a composite containing " +
i.getClass().getName() + ", expected TermItem.");
}
}
@@ -198,24 +193,14 @@ public class StemmingSearcher extends Searcher {
Item blockAsItem = (Item)current;
CompositeItem composite;
List<StemList> segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), context.language);
+ if (segments.isEmpty()) return blockAsItem;
+
String indexName = current.getIndexName();
Substring substring = getOffsets(current);
-
if (segments.size() == 1) {
- getLogger().log(Level.FINE, () -> "Stem '"+current.stringValue()+"' mode "+index.getStemMode()
- +" and language '"+context.language+"' -> '"+segments.get(0)+"'");
TaggableItem w = singleWordSegment(current, segments.get(0), index, substring, context.insidePhrase);
setMetaData(current, context.reverseConnectivity, w);
- return (Item) w;
- } else if (getLogger().isLoggable(Level.FINE)) {
- var buf = new StringBuilder();
- buf.append("Stem '").append(current.stringValue());
- buf.append("' mode ").append(index.getStemMode());
- buf.append(" and language '").append(context.language).append("' ->");
- for (StemList segment : segments) {
- buf.append(" '").append(segment).append("'");
- }
- getLogger().log(Level.FINE, buf.toString());
+ return (Item)w;
}
if (context.isCJK)
@@ -224,7 +209,6 @@ public class StemmingSearcher extends Searcher {
composite = chooseComposite(current, ((Item) current).getParent(), indexName);
for (StemList segment : segments) {
- getLogger().log(Level.FINE, () -> "Stem to multiple segments '"+segment+"'");
TaggableItem w = singleWordSegment(current, segment, index, substring, context.insidePhrase);
if (composite instanceof AndSegmentItem) {
@@ -242,7 +226,6 @@ public class StemmingSearcher extends Searcher {
setSignificance(replacement, current);
phraseSegmentConnectivity(current, context.reverseConnectivity, replacement);
}
-
return composite;
}
@@ -372,8 +355,8 @@ public class StemmingSearcher extends Searcher {
case PHRASE -> createPhraseSegment(current, indexName);
case BOOLEAN_AND -> createAndSegment(current);
default -> throw new IllegalArgumentException("Unknown segmenting rule: " + current.getSegmentingRule() +
- ". This is a bug in Vespa, as the implementation has gotten out of sync." +
- " Please create an issue.");
+ ". This is a bug in Vespa, as the implementation has gotten out of sync." +
+ " Please create an issue.");
};
}
diff --git a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java
index bcb243b4563..d1514267a9b 100644
--- a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java
@@ -23,7 +23,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
- * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias M. Lidal</a>
+ * @author Mathias M. Lidal
*/
public class StemmingSearcherTestCase {
@@ -33,8 +33,8 @@ public class StemmingSearcherTestCase {
@Test
void testStemOnlySomeTerms() {
- assertStem("/search?query=Holes in CVS and Subversion nostem:Found",
- "WEAKAND(100) hole in cvs and subversion nostem:Found");
+ assertStemmed("WEAKAND(100) hole in cvs and subversion nostem:Found", "/search?query=Holes in CVS and Subversion nostem:Found"
+ );
}
@Test
@@ -78,7 +78,7 @@ public class StemmingSearcherTestCase {
@Test
void testDontStemPrefixes() {
- assertStem("/search?query=ist*&language=de", "WEAKAND(100) ist*");
+ assertStemmed("WEAKAND(100) ist*", "/search?query=ist*&language=de");
}
@Test
@@ -90,10 +90,10 @@ public class StemmingSearcherTestCase {
@Test
void testNounStemming() {
- assertStem("/search?query=noun:towers noun:tower noun:tow",
- "WEAKAND(100) noun:tower noun:tower noun:tow");
- assertStem("/search?query=notnoun:towers notnoun:tower notnoun:tow",
- "WEAKAND(100) notnoun:tower notnoun:tower notnoun:tow");
+ assertStemmed("WEAKAND(100) noun:tower noun:tower noun:tow", "/search?query=noun:towers noun:tower noun:tow"
+ );
+ assertStemmed("WEAKAND(100) notnoun:tower notnoun:tower notnoun:tow", "/search?query=notnoun:towers notnoun:tower notnoun:tow"
+ );
}
@SuppressWarnings("deprecation")
@@ -133,11 +133,19 @@ public class StemmingSearcherTestCase {
@Test
void testMultipleStemming() {
- Query q = new Query(QueryTestCase.httpEncode("/search?language=en&search=four&query=trees \"nouns girls\" flowers \"a verbs a\" girls&default-index=foobar"));
- executeStemming(q);
- assertEquals("WEAKAND(100) WORD_ALTERNATIVES foobar:[ tree(0.7) trees(1.0) ] " +
- "foobar:\"noun girl\" WORD_ALTERNATIVES foobar:[ flower(0.7) flowers(1.0) ] " +
- "foobar:\"a verb a\" WORD_ALTERNATIVES foobar:[ girl(0.7) girls(1.0) ]", q.getModel().getQueryTree().getRoot().toString());
+ assertStemmed("WEAKAND(100) WORD_ALTERNATIVES foobar:[ tree(0.7) trees(1.0) ] " +
+ "foobar:\"noun girl\" WORD_ALTERNATIVES foobar:[ flower(0.7) flowers(1.0) ] " +
+ "foobar:\"a verb a\" WORD_ALTERNATIVES foobar:[ girl(0.7) girls(1.0) ]",
+ "/search?language=en&search=four&query=trees \"nouns girls\" flowers \"a verbs a\" girls&default-index=foobar");
+ }
+
+ @Test
+ void testEmojiStemming() {
+ String emoji1 = "\uD83C\uDF49"; // 🍉
+ String emoji2 = "\uD83D\uDE00"; // 😀
+ assertStemmed("WEAKAND(100) " + emoji1, "/search?query=" + emoji1);
+ assertStemmed("WEAKAND(100) (AND " + emoji1 + " " + emoji2 + ")", "/search?query=" + emoji1 + emoji2);
+ assertStemmed("WEAKAND(100) (AND " + emoji1 + " foo " + emoji2 + ")", "/search?query=" + emoji1 + "foo" + emoji2);
}
private Execution.Context newExecutionContext() {
@@ -153,12 +161,8 @@ public class StemmingSearcherTestCase {
newExecutionContext()).search(query);
}
- private void assertStem(String queryString, String expectedQueryTree) {
- assertStemEncoded(QueryTestCase.httpEncode(queryString), expectedQueryTree);
- }
-
- private void assertStemEncoded(String encodedQueryString, String expectedQueryTree) {
- Query query = new Query(encodedQueryString);
+ private void assertStemmed(String expectedQueryTree, String queryString) {
+ Query query = new Query(QueryTestCase.httpEncode(queryString));
executeStemming(query);
assertEquals(expectedQueryTree, query.getModel().getQueryTree().getRoot().toString());
}