summaryrefslogtreecommitdiffstats
path: root/container-search
diff options
context:
space:
mode:
authorMarius Arhaug <mariusarhaug@hotmail.com>2024-04-24 15:27:32 +0200
committerGitHub <noreply@github.com>2024-04-24 15:27:32 +0200
commitff1d604e77b1943a72fc6b585b09db82a5ee791d (patch)
tree2195429e568fe6de8a680454d22efcac4b6a4120 /container-search
parent802c854e5190d37914f237d8626949781f3db9c2 (diff)
parent8f69128279305dacd077b540d5e9be746508efc9 (diff)
Merge pull request #30871 from vespa-engine/marius/add-significance-searcher
Add significance searcher
Diffstat (limited to 'container-search')
-rw-r--r--container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java3
-rw-r--r--container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java77
-rw-r--r--container-search/src/test/java/com/yahoo/search/significance/model/en.json14
-rw-r--r--container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java153
4 files changed, 246 insertions, 1 deletions
diff --git a/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java b/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java
index 9ea35339f8d..97220725fec 100644
--- a/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java
+++ b/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java
@@ -43,7 +43,8 @@ public class LocalProviderSpec {
com.yahoo.search.searchers.ValidateFuzzySearcher.class,
com.yahoo.search.yql.FieldFiller.class,
com.yahoo.search.searchers.InputCheckingSearcher.class,
- com.yahoo.search.searchers.ContainerLatencySearcher.class);
+ com.yahoo.search.searchers.ContainerLatencySearcher.class,
+ com.yahoo.search.significance.SignificanceSearcher.class);
public final String clusterName;
diff --git a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
new file mode 100644
index 00000000000..0a42bf8a259
--- /dev/null
+++ b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
@@ -0,0 +1,77 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.search.significance;
+
+import com.yahoo.component.annotation.Inject;
+import com.yahoo.component.chain.dependencies.Before;
+import com.yahoo.component.chain.dependencies.Provides;
+import com.yahoo.language.Language;
+import com.yahoo.language.significance.SignificanceModel;
+import com.yahoo.language.significance.SignificanceModelRegistry;
+import com.yahoo.prelude.query.CompositeItem;
+import com.yahoo.prelude.query.Item;
+import com.yahoo.prelude.query.NullItem;
+import com.yahoo.prelude.query.WordItem;
+import com.yahoo.search.Query;
+import com.yahoo.search.Result;
+import com.yahoo.search.Searcher;
+import com.yahoo.search.searchchain.Execution;
+
+import java.util.Optional;
+
+import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING;
+
+/**
+ * Sets significance values on word items in the query tree.
+ *
+ * @author MariusArhaug
+ */
+
+@Provides(SignificanceSearcher.SIGNIFICANCE)
+@Before(STEMMING)
+public class SignificanceSearcher extends Searcher {
+
+ public final static String SIGNIFICANCE = "Significance";
+ private final SignificanceModelRegistry significanceModelRegistry;
+
+
+ @Inject
+ public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry) {
+ this.significanceModelRegistry = significanceModelRegistry;
+ }
+
+ @Override
+ public Result search(Query query, Execution execution) {
+ Language language = query.getModel().getParsingLanguage();
+ Optional<SignificanceModel> model = significanceModelRegistry.getModel(language);
+
+ if (model.isEmpty()) return execution.search(query);
+
+ setIDF(query.getModel().getQueryTree().getRoot(), model.get());
+
+ return execution.search(query);
+ }
+
+ private void setIDF(Item root, SignificanceModel significanceModel) {
+ if (root == null || root instanceof NullItem) return;
+
+ if (root instanceof WordItem) {
+
+ var documentFrequency = significanceModel.documentFrequency(((WordItem) root).getWord());
+ long N = documentFrequency.corpusSize();
+ long nq_i = documentFrequency.frequency();
+ double idf = calculateIDF(N, nq_i);
+
+ ((WordItem) root).setSignificance(idf);
+ } else if (root instanceof CompositeItem) {
+ for (int i = 0; i < ((CompositeItem) root).getItemCount(); i++) {
+ setIDF(((CompositeItem) root).getItem(i), significanceModel);
+ }
+ }
+ }
+
+ public static double calculateIDF(long N, long nq_i) {
+ return Math.log(1 + (N - nq_i + 0.5) / (nq_i + 0.5));
+ }
+}
+
+
diff --git a/container-search/src/test/java/com/yahoo/search/significance/model/en.json b/container-search/src/test/java/com/yahoo/search/significance/model/en.json
new file mode 100644
index 00000000000..50bae5e3451
--- /dev/null
+++ b/container-search/src/test/java/com/yahoo/search/significance/model/en.json
@@ -0,0 +1,14 @@
+{
+ "version" : "1.0",
+ "id" : "test::1",
+ "description" : "desc",
+ "corpus-size" : 10,
+ "language" : "en",
+ "word-count" : 4,
+ "frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 2
+ }
+}
diff --git a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
new file mode 100644
index 00000000000..890db3abb51
--- /dev/null
+++ b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
@@ -0,0 +1,153 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.search.significance.test;
+
+import com.yahoo.component.chain.Chain;
+import com.yahoo.language.Language;
+import com.yahoo.language.significance.SignificanceModel;
+import com.yahoo.language.significance.SignificanceModelRegistry;
+import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry;
+import com.yahoo.prelude.query.AndItem;
+import com.yahoo.prelude.query.WordItem;
+import com.yahoo.search.Query;
+import com.yahoo.search.Result;
+import com.yahoo.search.searchchain.Execution;
+import com.yahoo.search.significance.SignificanceSearcher;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+
+
+import static com.yahoo.test.JunitCompat.assertEquals;
+
+/**
+ * Tests significance term in the search chain.
+ *
+ * @author MariusArhaug
+ */
+public class SignificanceSearcherTest {
+ SignificanceModelRegistry significanceModelRegistry;
+ SignificanceSearcher searcher;
+
+ public SignificanceSearcherTest() {
+ HashMap<Language, Path> map = new HashMap<>();
+ map.put(Language.ENGLISH, Path.of("src/test/java/com/yahoo/search/significance/model/en.json"));
+
+ significanceModelRegistry = new DefaultSignificanceModelRegistry(map);
+ searcher = new SignificanceSearcher(significanceModelRegistry);
+ }
+
+ private Execution createExecution(SignificanceSearcher searcher) {
+ return new Execution(new Chain<>(searcher), Execution.Context.createContextStub());
+ }
+
+ private Execution createExecution() {
+ return new Execution(new Chain<>(), Execution.Context.createContextStub());
+ }
+
+ @Test
+ void testSignificanceValueOnSimpleQuery() {
+
+ Query q = new Query();
+ AndItem root = new AndItem();
+ WordItem tmp;
+ tmp = new WordItem("Hello", true);
+ root.addItem(tmp);
+ tmp = new WordItem("world", true);
+ root.addItem(tmp);
+
+ q.getModel().getQueryTree().setRoot(root);
+
+ SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
+ var helloFrequency = model.documentFrequency("Hello");
+ var helloSignificanceValue = SignificanceSearcher.calculateIDF(helloFrequency.corpusSize(), helloFrequency.frequency());
+
+ var worldFrequency = model.documentFrequency("world");
+ var worldSignificanceValue = SignificanceSearcher.calculateIDF(worldFrequency.corpusSize(), worldFrequency.frequency());
+
+ Result r = createExecution(searcher).search(q);
+
+ root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot();
+ WordItem w0 = (WordItem) root.getItem(0);
+ WordItem w1 = (WordItem) root.getItem(1);
+
+ assertEquals(helloSignificanceValue, w0.getSignificance());
+ assertEquals(worldSignificanceValue, w1.getSignificance());
+
+ }
+
+ @Test
+ void testSignificanceValueOnRecursiveQuery() {
+ Query q = new Query();
+ AndItem root = new AndItem();
+ WordItem child1 = new WordItem("hello", true);
+
+ AndItem child2 = new AndItem();
+ WordItem child2_1 = new WordItem("test", true);
+
+ AndItem child3 = new AndItem();
+ AndItem child3_1 = new AndItem();
+ WordItem child3_1_1 = new WordItem("usa", true);
+
+ root.addItem(child1);
+ root.addItem(child2);
+ root.addItem(child3);
+
+ child2.addItem(child2_1);
+ child3.addItem(child3_1);
+ child3_1.addItem(child3_1_1);
+
+ q.getModel().getQueryTree().setRoot(root);
+
+ SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
+ var helloFrequency = model.documentFrequency("hello");
+ var helloSignificanceValue = SignificanceSearcher.calculateIDF(helloFrequency.corpusSize(), helloFrequency.frequency());
+
+ var testFrequency = model.documentFrequency("test");
+ var testSignificanceValue = SignificanceSearcher.calculateIDF(testFrequency.corpusSize(), testFrequency.frequency());
+
+
+
+ Result r = createExecution(searcher).search(q);
+
+ root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot();
+ WordItem w0 = (WordItem) root.getItem(0);
+ WordItem w1 = (WordItem) ((AndItem) root.getItem(1)).getItem(0);
+ WordItem w3 = (WordItem) ((AndItem) ((AndItem) root.getItem(2)).getItem(0)).getItem(0);
+
+ assertEquals(helloSignificanceValue, w0.getSignificance());
+ assertEquals(testSignificanceValue, w1.getSignificance());
+ assertEquals(SignificanceSearcher.calculateIDF(10, 2), w3.getSignificance());
+
+ }
+
+ @Test
+ void testSignificanceValueOnEmptyQuery() {
+ Query q = new Query();
+ q.getModel().setLanguage(Language.NORWEGIAN_BOKMAL);
+ AndItem root = new AndItem();
+ WordItem tmp;
+ tmp = new WordItem("Hei", true);
+ root.addItem(tmp);
+ tmp = new WordItem("Verden", true);
+ root.addItem(tmp);
+
+
+ q.getModel().getQueryTree().setRoot(root);
+ Result r = createExecution(searcher).search(q);
+ root = (AndItem) r.getQuery().getModel().getQueryTree().getRoot();
+
+ WordItem w0 = (WordItem) root.getItem(0);
+ WordItem w1 = (WordItem) root.getItem(1);
+
+ Result r0 = createExecution().search(q);
+ root = (AndItem) r0.getQuery().getModel().getQueryTree().getRoot();
+
+ WordItem w0_0 = (WordItem) root.getItem(0);
+ WordItem w0_1 = (WordItem) root.getItem(1);
+
+ assertEquals(w0_0.getSignificance(), w0.getSignificance());
+ assertEquals(w0_1.getSignificance(), w1.getSignificance());
+
+ }
+}