diff options
author | Marius Arhaug <mariusarhaug@hotmail.com> | 2024-04-24 15:27:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-24 15:27:32 +0200 |
commit | ff1d604e77b1943a72fc6b585b09db82a5ee791d (patch) | |
tree | 2195429e568fe6de8a680454d22efcac4b6a4120 /container-search | |
parent | 802c854e5190d37914f237d8626949781f3db9c2 (diff) | |
parent | 8f69128279305dacd077b540d5e9be746508efc9 (diff) |
Merge pull request #30871 from vespa-engine/marius/add-significance-searcher
Add significance searcher
Diffstat (limited to 'container-search')
4 files changed, 246 insertions, 1 deletion
diff --git a/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java b/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java index 9ea35339f8d..97220725fec 100644 --- a/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java +++ b/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java @@ -43,7 +43,8 @@ public class LocalProviderSpec { com.yahoo.search.searchers.ValidateFuzzySearcher.class, com.yahoo.search.yql.FieldFiller.class, com.yahoo.search.searchers.InputCheckingSearcher.class, - com.yahoo.search.searchers.ContainerLatencySearcher.class); + com.yahoo.search.searchers.ContainerLatencySearcher.class, + com.yahoo.search.significance.SignificanceSearcher.class); public final String clusterName; diff --git a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java new file mode 100644 index 00000000000..0a42bf8a259 --- /dev/null +++ b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java @@ -0,0 +1,77 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.search.significance; + +import com.yahoo.component.annotation.Inject; +import com.yahoo.component.chain.dependencies.Before; +import com.yahoo.component.chain.dependencies.Provides; +import com.yahoo.language.Language; +import com.yahoo.language.significance.SignificanceModel; +import com.yahoo.language.significance.SignificanceModelRegistry; +import com.yahoo.prelude.query.CompositeItem; +import com.yahoo.prelude.query.Item; +import com.yahoo.prelude.query.NullItem; +import com.yahoo.prelude.query.WordItem; +import com.yahoo.search.Query; +import com.yahoo.search.Result; +import com.yahoo.search.Searcher; +import com.yahoo.search.searchchain.Execution; + +import java.util.Optional; + +import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING; + +/** + * Sets significance values on word items in the query tree. + * + * @author MariusArhaug + */ + +@Provides(SignificanceSearcher.SIGNIFICANCE) +@Before(STEMMING) +public class SignificanceSearcher extends Searcher { + + public final static String SIGNIFICANCE = "Significance"; + private final SignificanceModelRegistry significanceModelRegistry; + + + @Inject + public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry) { + this.significanceModelRegistry = significanceModelRegistry; + } + + @Override + public Result search(Query query, Execution execution) { + Language language = query.getModel().getParsingLanguage(); + Optional<SignificanceModel> model = significanceModelRegistry.getModel(language); + + if (model.isEmpty()) return execution.search(query); + + setIDF(query.getModel().getQueryTree().getRoot(), model.get()); + + return execution.search(query); + } + + private void setIDF(Item root, SignificanceModel significanceModel) { + if (root == null || root instanceof NullItem) return; + + if (root instanceof WordItem) { + + var documentFrequency = significanceModel.documentFrequency(((WordItem) root).getWord()); + long N = documentFrequency.corpusSize(); + 
long nq_i = documentFrequency.frequency(); + double idf = calculateIDF(N, nq_i); + + ((WordItem) root).setSignificance(idf); + } else if (root instanceof CompositeItem) { + for (int i = 0; i < ((CompositeItem) root).getItemCount(); i++) { + setIDF(((CompositeItem) root).getItem(i), significanceModel); + } + } + } + + public static double calculateIDF(long N, long nq_i) { + return Math.log(1 + (N - nq_i + 0.5) / (nq_i + 0.5)); + } +} + + diff --git a/container-search/src/test/java/com/yahoo/search/significance/model/en.json b/container-search/src/test/java/com/yahoo/search/significance/model/en.json new file mode 100644 index 00000000000..50bae5e3451 --- /dev/null +++ b/container-search/src/test/java/com/yahoo/search/significance/model/en.json @@ -0,0 +1,14 @@ +{ + "version" : "1.0", + "id" : "test::1", + "description" : "desc", + "corpus-size" : 10, + "language" : "en", + "word-count" : 4, + "frequencies" : { + "usa" : 2, + "hello": 3, + "world": 5, + "test": 2 + } +} diff --git a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java new file mode 100644 index 00000000000..890db3abb51 --- /dev/null +++ b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java @@ -0,0 +1,153 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.search.significance.test; + +import com.yahoo.component.chain.Chain; +import com.yahoo.language.Language; +import com.yahoo.language.significance.SignificanceModel; +import com.yahoo.language.significance.SignificanceModelRegistry; +import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry; +import com.yahoo.prelude.query.AndItem; +import com.yahoo.prelude.query.WordItem; +import com.yahoo.search.Query; +import com.yahoo.search.Result; +import com.yahoo.search.searchchain.Execution; +import com.yahoo.search.significance.SignificanceSearcher; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; +import java.util.HashMap; + + +import static com.yahoo.test.JunitCompat.assertEquals; + +/** + * Tests significance term in the search chain. + * + * @author MariusArhaug + */ +public class SignificanceSearcherTest { + SignificanceModelRegistry significanceModelRegistry; + SignificanceSearcher searcher; + + public SignificanceSearcherTest() { + HashMap<Language, Path> map = new HashMap<>(); + map.put(Language.ENGLISH, Path.of("src/test/java/com/yahoo/search/significance/model/en.json")); + + significanceModelRegistry = new DefaultSignificanceModelRegistry(map); + searcher = new SignificanceSearcher(significanceModelRegistry); + } + + private Execution createExecution(SignificanceSearcher searcher) { + return new Execution(new Chain<>(searcher), Execution.Context.createContextStub()); + } + + private Execution createExecution() { + return new Execution(new Chain<>(), Execution.Context.createContextStub()); + } + + @Test + void testSignificanceValueOnSimpleQuery() { + + Query q = new Query(); + AndItem root = new AndItem(); + WordItem tmp; + tmp = new WordItem("Hello", true); + root.addItem(tmp); + tmp = new WordItem("world", true); + root.addItem(tmp); + + q.getModel().getQueryTree().setRoot(root); + + SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get(); + var helloFrequency = 
model.documentFrequency("Hello"); + var helloSignificanceValue = SignificanceSearcher.calculateIDF(helloFrequency.corpusSize(), helloFrequency.frequency()); + + var worldFrequency = model.documentFrequency("world"); + var worldSignificanceValue = SignificanceSearcher.calculateIDF(worldFrequency.corpusSize(), worldFrequency.frequency()); + + Result r = createExecution(searcher).search(q); + + root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot(); + WordItem w0 = (WordItem) root.getItem(0); + WordItem w1 = (WordItem) root.getItem(1); + + assertEquals(helloSignificanceValue, w0.getSignificance()); + assertEquals(worldSignificanceValue, w1.getSignificance()); + + } + + @Test + void testSignificanceValueOnRecursiveQuery() { + Query q = new Query(); + AndItem root = new AndItem(); + WordItem child1 = new WordItem("hello", true); + + AndItem child2 = new AndItem(); + WordItem child2_1 = new WordItem("test", true); + + AndItem child3 = new AndItem(); + AndItem child3_1 = new AndItem(); + WordItem child3_1_1 = new WordItem("usa", true); + + root.addItem(child1); + root.addItem(child2); + root.addItem(child3); + + child2.addItem(child2_1); + child3.addItem(child3_1); + child3_1.addItem(child3_1_1); + + q.getModel().getQueryTree().setRoot(root); + + SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get(); + var helloFrequency = model.documentFrequency("hello"); + var helloSignificanceValue = SignificanceSearcher.calculateIDF(helloFrequency.corpusSize(), helloFrequency.frequency()); + + var testFrequency = model.documentFrequency("test"); + var testSignificanceValue = SignificanceSearcher.calculateIDF(testFrequency.corpusSize(), testFrequency.frequency()); + + + + Result r = createExecution(searcher).search(q); + + root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot(); + WordItem w0 = (WordItem) root.getItem(0); + WordItem w1 = (WordItem) ((AndItem) root.getItem(1)).getItem(0); + WordItem w3 = (WordItem) ((AndItem) 
((AndItem) root.getItem(2)).getItem(0)).getItem(0); + + assertEquals(helloSignificanceValue, w0.getSignificance()); + assertEquals(testSignificanceValue, w1.getSignificance()); + assertEquals(SignificanceSearcher.calculateIDF(10, 2), w3.getSignificance()); + + } + + @Test + void testSignificanceValueOnEmptyQuery() { + Query q = new Query(); + q.getModel().setLanguage(Language.NORWEGIAN_BOKMAL); + AndItem root = new AndItem(); + WordItem tmp; + tmp = new WordItem("Hei", true); + root.addItem(tmp); + tmp = new WordItem("Verden", true); + root.addItem(tmp); + + + q.getModel().getQueryTree().setRoot(root); + Result r = createExecution(searcher).search(q); + root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot(); + + WordItem w0 = (WordItem) root.getItem(0); + WordItem w1 = (WordItem) root.getItem(1); + + Result r0 = createExecution().search(q); + root = (AndItem) r0.getQuery().getModel().getQueryTree().getRoot(); + + WordItem w0_0 = (WordItem) root.getItem(0); + WordItem w0_1 = (WordItem) root.getItem(1); + + assertEquals(w0_0.getSignificance(), w0.getSignificance()); + assertEquals(w0_1.getSignificance(), w1.getSignificance()); + + } +} |