diff options
author | Jon Bratseth <bratseth@gmail.com> | 2023-06-05 09:28:50 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-05 09:28:50 +0200 |
commit | 240398303fe83ec1b6a4a1bf407125163310534d (patch) | |
tree | 7cb3d8da514a492d987585357ec82a709b9c2c12 /container-search | |
parent | f8bae81a83fd37fb884633e58c0f55e056bba30f (diff) | |
parent | b18703690547333d559f09f63f40ada4fed6f4d4 (diff) |
Merge pull request #27267 from vespa-engine/bratseth/emoji-stemming
Bratseth/emoji stemming
Diffstat (limited to 'container-search')
2 files changed, 33 insertions, 46 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java index 7c4bcb38c41..9050b82fd69 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java @@ -43,10 +43,8 @@ import com.yahoo.search.Searcher; import com.yahoo.search.searchchain.Execution; import com.yahoo.search.searchchain.PhaseNames; - import static com.yahoo.prelude.querytransform.CJKSearcher.TERM_ORDER_RELAXATION; - /** * Replaces query terms with their stems * @@ -111,9 +109,8 @@ public class StemmingSearcher extends Searcher { private Item replaceTerms(Query q, IndexFacts.Session indexFacts) { Language language = q.getModel().getParsingLanguage(); - if (language == Language.UNKNOWN) { - return q.getModel().getQueryTree().getRoot(); - } + if (language == Language.UNKNOWN) return q.getModel().getQueryTree().getRoot(); + StemContext context = new StemContext(); context.isCJK = language.isCjk(); context.language = language; @@ -144,9 +141,8 @@ public class StemmingSearcher extends Searcher { } private Item scan(Item item, StemContext context) { - if (item == null) { - return null; - } + if (item == null) return null; + boolean old = context.insidePhrase; if (item instanceof PhraseItem || item instanceof PhraseSegmentItem) { context.insidePhrase = true; @@ -155,7 +151,6 @@ public class StemmingSearcher extends Searcher { item = checkBlock((BlockItem) item, context); } else if (item instanceof CompositeItem comp) { ListIterator<Item> i = comp.getItemIterator(); - while (i.hasNext()) { Item original = i.next(); Item transformed = scan(original, context); @@ -186,7 +181,7 @@ public class StemmingSearcher extends Searcher { if (i instanceof TermItem) { return ((TermItem) i).getOrigin(); // this should always be the case } else { - getLogger().log(Level.WARNING, "Weird, BlockItem '" + b + "' was a composite containing " + + getLogger().log(Level.WARNING, "BlockItem '" + b + "' was a composite containing " + i.getClass().getName() + ", expected TermItem."); } } @@ -198,24 +193,14 @@ public class StemmingSearcher extends Searcher { Item blockAsItem = (Item)current; CompositeItem composite; List<StemList> segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), context.language); + if (segments.isEmpty()) return blockAsItem; + String indexName = current.getIndexName(); Substring substring = getOffsets(current); - if (segments.size() == 1) { - getLogger().log(Level.FINE, () -> "Stem '"+current.stringValue()+"' mode "+index.getStemMode() - +" and language '"+context.language+"' -> '"+segments.get(0)+"'"); TaggableItem w = singleWordSegment(current, segments.get(0), index, substring, context.insidePhrase); setMetaData(current, context.reverseConnectivity, w); - return (Item) w; - } else if (getLogger().isLoggable(Level.FINE)) { - var buf = new StringBuilder(); - buf.append("Stem '").append(current.stringValue()); - buf.append("' mode ").append(index.getStemMode()); - buf.append(" and language '").append(context.language).append("' ->"); - for (StemList segment : segments) { - buf.append(" '").append(segment).append("'"); - } - getLogger().log(Level.FINE, buf.toString()); + return (Item)w; } if (context.isCJK) @@ -224,7 +209,6 @@ public class StemmingSearcher extends Searcher { composite = chooseComposite(current, ((Item) current).getParent(), indexName); for (StemList segment : segments) { - getLogger().log(Level.FINE, () -> "Stem to multiple segments '"+segment+"'"); TaggableItem w = singleWordSegment(current, segment, index, substring, context.insidePhrase); if (composite instanceof AndSegmentItem) { @@ -242,7 +226,6 @@ public class StemmingSearcher extends Searcher { setSignificance(replacement, current); phraseSegmentConnectivity(current, context.reverseConnectivity, replacement); } - return composite; } @@ -372,8 +355,8 @@ public class StemmingSearcher extends Searcher { case PHRASE -> createPhraseSegment(current, indexName); case BOOLEAN_AND -> createAndSegment(current); default -> throw new IllegalArgumentException("Unknown segmenting rule: " + current.getSegmentingRule() + - ". This is a bug in Vespa, as the implementation has gotten out of sync." + - " Please create an issue."); + ". This is a bug in Vespa, as the implementation has gotten out of sync." + + " Please create an issue."); }; } diff --git a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java index bcb243b4563..d1514267a9b 100644 --- a/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/querytransform/test/StemmingSearcherTestCase.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; /** - * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias M. Lidal</a> + * @author Mathias M. Lidal */ public class StemmingSearcherTestCase { @@ -33,8 +33,8 @@ public class StemmingSearcherTestCase { @Test void testStemOnlySomeTerms() { - assertStem("/search?query=Holes in CVS and Subversion nostem:Found", - "WEAKAND(100) hole in cvs and subversion nostem:Found"); + assertStemmed("WEAKAND(100) hole in cvs and subversion nostem:Found", "/search?query=Holes in CVS and Subversion nostem:Found" + ); } @Test @@ -78,7 +78,7 @@ public class StemmingSearcherTestCase { @Test void testDontStemPrefixes() { - assertStem("/search?query=ist*&language=de", "WEAKAND(100) ist*"); + assertStemmed("WEAKAND(100) ist*", "/search?query=ist*&language=de"); } @Test @@ -90,10 +90,10 @@ public class StemmingSearcherTestCase { @Test void testNounStemming() { - assertStem("/search?query=noun:towers noun:tower noun:tow", - "WEAKAND(100) noun:tower noun:tower noun:tow"); - assertStem("/search?query=notnoun:towers notnoun:tower notnoun:tow", - "WEAKAND(100) notnoun:tower notnoun:tower notnoun:tow"); + assertStemmed("WEAKAND(100) noun:tower noun:tower noun:tow", "/search?query=noun:towers noun:tower noun:tow" + ); + assertStemmed("WEAKAND(100) notnoun:tower notnoun:tower notnoun:tow", "/search?query=notnoun:towers notnoun:tower notnoun:tow" + ); } @SuppressWarnings("deprecation") @@ -133,11 +133,19 @@ public class StemmingSearcherTestCase { @Test void testMultipleStemming() { - Query q = new Query(QueryTestCase.httpEncode("/search?language=en&search=four&query=trees \"nouns girls\" flowers \"a verbs a\" girls&default-index=foobar")); - executeStemming(q); - assertEquals("WEAKAND(100) WORD_ALTERNATIVES foobar:[ tree(0.7) trees(1.0) ] " + - "foobar:\"noun girl\" WORD_ALTERNATIVES foobar:[ flower(0.7) flowers(1.0) ] " + - "foobar:\"a verb a\" WORD_ALTERNATIVES foobar:[ girl(0.7) girls(1.0) ]", q.getModel().getQueryTree().getRoot().toString()); + assertStemmed("WEAKAND(100) WORD_ALTERNATIVES foobar:[ tree(0.7) trees(1.0) ] " + + "foobar:\"noun girl\" WORD_ALTERNATIVES foobar:[ flower(0.7) flowers(1.0) ] " + + "foobar:\"a verb a\" WORD_ALTERNATIVES foobar:[ girl(0.7) girls(1.0) ]", + "/search?language=en&search=four&query=trees \"nouns girls\" flowers \"a verbs a\" girls&default-index=foobar"); + } + + @Test + void testEmojiStemming() { + String emoji1 = "\uD83C\uDF49"; // 🍉 + String emoji2 = "\uD83D\uDE00"; // 😀 + assertStemmed("WEAKAND(100) " + emoji1, "/search?query=" + emoji1); + assertStemmed("WEAKAND(100) (AND " + emoji1 + " " + emoji2 + ")", "/search?query=" + emoji1 + emoji2); + assertStemmed("WEAKAND(100) (AND " + emoji1 + " foo " + emoji2 + ")", "/search?query=" + emoji1 + "foo" + emoji2); } private Execution.Context newExecutionContext() { @@ -153,12 +161,8 @@ public class StemmingSearcherTestCase { newExecutionContext()).search(query); } - private void assertStem(String queryString, String expectedQueryTree) { - assertStemEncoded(QueryTestCase.httpEncode(queryString), expectedQueryTree); - } - - private void assertStemEncoded(String encodedQueryString, String expectedQueryTree) { - Query query = new Query(encodedQueryString); + private void assertStemmed(String expectedQueryTree, String queryString) { + Query query = new Query(QueryTestCase.httpEncode(queryString)); executeStemming(query); assertEquals(expectedQueryTree, query.getModel().getQueryTree().getRoot().toString()); } |