diff options
author | MariusArhaug <mariusarhaug@hotmail.com> | 2024-06-26 13:01:43 +0200 |
---|---|---|
committer | MariusArhaug <mariusarhaug@hotmail.com> | 2024-06-26 13:01:43 +0200 |
commit | ccd6b8758dee2aeebcaf4eb8b944de5b0a8c32b7 (patch) | |
tree | efc709d5a9231b7464c56823098bd5820b1513ea | |
parent | 28928d915eeaddc4cd1de7a94a285e467d473d74 (diff) |
Handle implicit/explicit set languages for significance searcher
3 files changed, 199 insertions, 38 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java index 3f72e98f18a..8ba8f747019 100644 --- a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java +++ b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java @@ -14,6 +14,7 @@ import com.yahoo.prelude.query.WordItem; import com.yahoo.search.Query; import com.yahoo.search.Result; import com.yahoo.search.Searcher; +import com.yahoo.search.query.ranking.Significance; import com.yahoo.search.result.ErrorMessage; import com.yahoo.search.schema.RankProfile; import com.yahoo.search.schema.Schema; @@ -95,16 +96,60 @@ public class SignificanceSearcher extends Searcher { } private Result calculateAndSetSignificance(Query query, Execution execution) { - Language language = query.getModel().getParsingLanguage(); - Optional<SignificanceModel> model = significanceModelRegistry.getModel(language); - log.log(Level.FINE, () -> "Got model for language %s: %s" - .formatted(language, model.map(SignificanceModel::getId).orElse("<none>"))); + try { + var significanceModel = getSignificanceModelFromQueryLanguage(query); + log.log(Level.FINE, () -> "Got model for language %s: %s" + .formatted(query.getModel().getParsingLanguage(), significanceModel.getId())); - if (model.isEmpty()) return execution.search(query); + setIDF(query.getModel().getQueryTree().getRoot(), significanceModel); - setIDF(query.getModel().getQueryTree().getRoot(), model.get()); + return execution.search(query); + } catch (IllegalArgumentException e) { + var result = new Result(query); + result.hits().addError( + ErrorMessage.createIllegalQuery(e.getMessage())); + return result; + } + } + + private SignificanceModel getSignificanceModelFromQueryLanguage(Query query) throws IllegalArgumentException { + Language explicitLanguage = query.getModel().getLanguage(); + Language implicitLanguage = query.getModel().getParsingLanguage(); + + if (explicitLanguage == null && implicitLanguage == null) { + throw new IllegalArgumentException("No language found in query"); + } + + if (explicitLanguage != null) { + if (explicitLanguage == Language.UNKNOWN) { + return handleFallBackToUnknownLanguage(); + } + var model = significanceModelRegistry.getModel(explicitLanguage); + if (model.isEmpty()) { + throw new IllegalArgumentException("No significance model available for set language " + explicitLanguage); + } + return model.get(); + } + + if (implicitLanguage == Language.UNKNOWN) { + return handleFallBackToUnknownLanguage(); + } + var model = significanceModelRegistry.getModel(implicitLanguage); + if (model.isEmpty()) { + throw new IllegalArgumentException("No significance model available for implicit language " + implicitLanguage); + } + return model.get(); + } + + private SignificanceModel handleFallBackToUnknownLanguage() throws IllegalArgumentException { + var unknownModel = significanceModelRegistry.getModel(Language.UNKNOWN); + var englishModel = significanceModelRegistry.getModel(Language.ENGLISH); + + if (unknownModel.isEmpty() && englishModel.isEmpty()) { + throw new IllegalArgumentException("No significance model available for unknown or english language"); + } - return execution.search(query); + return unknownModel.orElseGet(englishModel::get); } private void setIDF(Item root, SignificanceModel significanceModel) { diff --git a/container-search/src/test/java/com/yahoo/prelude/query/test/QueryLanguageTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/test/QueryLanguageTestCase.java index a05124a42b1..45282de817b 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/test/QueryLanguageTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/test/QueryLanguageTestCase.java @@ -6,6 +6,7 @@ import com.yahoo.prelude.query.NotItem; import com.yahoo.prelude.query.PhraseItem; import com.yahoo.prelude.query.WordItem; import com.yahoo.search.Query; +import com.yahoo.search.query.Model; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -100,6 +101,15 @@ public class QueryLanguageTestCase { private void assertLanguage(Language expectedLanguage, String languageParameter) { Query query = new Query("?query=test&language=" + languageParameter); + Query query2 = new Query("?query=test"); + Model model = query.getModel(); + Model model2 = query2.getModel(); + + Language language1_0 = model.getParsingLanguage(); + Language language1_1 = model.getLanguage(); + Language language2_0 = model2.getParsingLanguage(); + Language language2_1 = model2.getLanguage(); + assertEquals(expectedLanguage, query.getModel().getParsingLanguage()); /* diff --git a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java index be68c87efb3..29e3c002c21 100644 --- a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java +++ b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java @@ -4,6 +4,12 @@ package com.yahoo.search.significance.test; import com.yahoo.component.chain.Chain; import com.yahoo.config.subscription.ConfigGetter; import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.language.opennlp.OpenNlpLinguistics; +import com.yahoo.language.process.*; import com.yahoo.language.significance.SignificanceModel; import com.yahoo.language.significance.SignificanceModelRegistry; import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry; @@ -20,6 +26,7 @@ import com.yahoo.search.significance.SignificanceSearcher; import com.yahoo.vespa.config.search.RankProfilesConfig; import org.junit.jupiter.api.Test; +import java.nio.ByteBuffer; import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; @@ -27,6 +34,7 @@ import java.util.List; import static com.yahoo.test.JunitCompat.assertEquals; +import static java.nio.charset.StandardCharsets.UTF_8; /** * Tests significance term in the search chain. @@ -51,12 +59,90 @@ public class SignificanceSearcherTest { searcher = new SignificanceSearcher(significanceModelRegistry, new SchemaInfo(List.of(schema.build()), List.of())); } + private static class MockLinguistics implements Linguistics { + + private final MockDetector mockDetector; + MockLinguistics(Language language) { + this.mockDetector = new MockDetector(language); + } + + @Override + public Stemmer getStemmer() { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Tokenizer getTokenizer() { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Normalizer getNormalizer() { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Transformer getTransformer() { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Segmenter getSegmenter() { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Detector getDetector() { + return this.mockDetector; + } + + @Override + public GramSplitter getGramSplitter() { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public CharacterClasses getCharacterClasses() { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public boolean equals(Linguistics other) { + return false; + } + } + + private static class MockDetector implements Detector { + + private Language detectionLanguage; + MockDetector(Language detectionLanguage) { + this.detectionLanguage = detectionLanguage; + } + + @Override + public Detection detect(byte[] input, int offset, int length, Hint hint) { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Detection detect(ByteBuffer input, Hint hint) { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public Detection detect(String input, Hint hint) { + return new Detection(detectionLanguage, UTF_8.name(), false); + } + } + private Execution createExecution(SignificanceSearcher searcher) { return new Execution(new Chain<>(searcher), Execution.Context.createContextStub()); } - private Execution createExecution() { - return new Execution(new Chain<>(), Execution.Context.createContextStub()); + private Execution createExecution(SignificanceSearcher searcher, Language language) { + var context = Execution.Context.createContextStub(); + context.setLinguistics(new MockLinguistics(language)); + return new Execution(new Chain<>(searcher), context); } @Test @@ -191,35 +277,6 @@ public class SignificanceSearcherTest { } - @Test - void testSignificanceValueOnEmptyQuery() { - Query q = new Query(); - q.getModel().setLanguage(Language.NORWEGIAN_BOKMAL); - AndItem root = new AndItem(); - WordItem tmp; - tmp = new WordItem("Hei", true); - root.addItem(tmp); - tmp = new WordItem("Verden", true); - root.addItem(tmp); - - - q.getModel().getQueryTree().setRoot(root); - Result r = createExecution(searcher).search(q); - root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot(); - - WordItem w0 = (WordItem) root.getItem(0); - WordItem w1 = (WordItem) root.getItem(1); - - Result r0 = createExecution().search(q); - root = (AndItem) r0.getQuery().getModel().getQueryTree().getRoot(); - - WordItem w0_0 = (WordItem) root.getItem(0); - WordItem w0_1 = (WordItem) root.getItem(1); - - assertEquals(w0_0.getSignificance(), w0.getSignificance()); - assertEquals(w0_1.getSignificance(), w1.getSignificance()); - - } @Test public void failsOnConflictingSignificanceConfiguration() { @@ -252,4 +309,53 @@ public class SignificanceSearcherTest { "(https://docs.vespa.ai/en/reference/schema-reference.html#significance).", errorMessage.getDetailedMessage()); } + + @Test + public void testSignificanceSearcherWithExplictitAndImplictSetLanguages() { + Query q = new Query(); + q.getModel().setLanguage(Language.UNKNOWN); + q.getRanking().setProfile("significance-ranking"); + AndItem root = new AndItem(); + WordItem tmp; + tmp = new WordItem("hello", true); + root.addItem(tmp); + + q.getModel().getQueryTree().setRoot(root); + + SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get(); + var helloFrequency = model.documentFrequency("hello"); + var helloSignificanceValue = SignificanceSearcher.calculateIDF(helloFrequency.corpusSize(), helloFrequency.frequency()); + Result r = createExecution(searcher).search(q); + + root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot(); + WordItem w0 = (WordItem) root.getItem(0); + assertEquals(helloSignificanceValue, w0.getSignificance()); + + + Query q2 = new Query(); + q2.getModel().setLanguage(Language.FRENCH); + q2.getRanking().setProfile("significance-ranking"); + AndItem root2 = new AndItem(); + WordItem tmp2; + tmp2 = new WordItem("hello", true); + root2.addItem(tmp2); + + q2.getModel().getQueryTree().setRoot(root2); + Result r2 = createExecution(searcher).search(q2); + + assertEquals(1, r2.hits().getErrorHit().errors().size()); + + + Query q3 = new Query(); + q3.getRanking().setProfile("significance-ranking"); + WordItem root3 = new WordItem("Я с детства хотел завести собаку, но родители мне не разрешали.", true); + + q3.getModel().getQueryTree().setRoot(root3); + Execution execution = createExecution(searcher, Language.RUSSIAN); + Result r3 = execution.search(q3); + + assertEquals(1, r3.hits().getErrorHit().errors().size()); + + + } } |