aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Marius Arhaug <mariusarhaug@hotmail.com> 2024-04-24 15:27:32 +0200
committer GitHub <noreply@github.com> 2024-04-24 15:27:32 +0200
commit ff1d604e77b1943a72fc6b585b09db82a5ee791d (patch)
tree 2195429e568fe6de8a680454d22efcac4b6a4120
parent 802c854e5190d37914f237d8626949781f3db9c2 (diff)
parent 8f69128279305dacd077b540d5e9be746508efc9 (diff)
Merge pull request #30871 from vespa-engine/marius/add-significance-searcher
Add significance searcher
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java 17
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java 1
-rw-r--r-- config-model/src/test/java/com/yahoo/vespa/model/container/search/searchchain/SchemaChainsTest.java 2
-rw-r--r-- config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java 4
-rw-r--r-- container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java 3
-rw-r--r-- container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java 77
-rw-r--r-- container-search/src/test/java/com/yahoo/search/significance/model/en.json 14
-rw-r--r-- container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java 153
-rw-r--r-- linguistics/abi-spec.json 2
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java 4
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java 7
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java 16
12 files changed, 276 insertions, 24 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java b/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java
index eb3f63cdf10..c210c2621a6 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java
@@ -23,19 +23,20 @@ import static com.yahoo.vespa.model.container.xml.ModelIdResolver.SIGNIFICANCE_M
*/
public class SignificanceModelRegistry extends SimpleComponent implements SignificanceConfig.Producer {
- private static final String CLASS = "com.yahoo.search.significance.impl.DefaultSignificanceModelRegistry";
- private static final String BUNDLE = "linguistics";
+ private static final String CLASS = "com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry";
+ private static final String BUNDLE = null;
- private final List<SignificanceModelConfig> configList;
+ private final List<SignificanceModelConfig> configList = new ArrayList<>();
public SignificanceModelRegistry(DeployState deployState, Element spec) {
super(new ComponentModel(BundleInstantiationSpecification.fromStrings(CLASS, CLASS, BUNDLE)));
- configList = new ArrayList<>();
+ if (spec != null) {
- for (Element modelElement : XML.getChildren(spec, "model")) {
- addConfig(
- modelElement.getAttribute("language"),
- Model.fromXml(deployState, modelElement, Set.of(SIGNIFICANCE_MODEL)).modelReference());
+ for (Element modelElement : XML.getChildren(spec, "model")) {
+ addConfig(
+ modelElement.getAttribute("language"),
+ Model.fromXml(deployState, modelElement, Set.of(SIGNIFICANCE_MODEL)).modelReference());
+ }
}
}
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java b/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java
index c6fca8d32c6..56e2a21e38b 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/container/xml/ContainerModelBuilder.java
@@ -778,7 +778,6 @@ public class ContainerModelBuilder extends ConfigModelBuilder<ContainerModel> {
private void addSignificance(DeployState deployState, Element spec, ApplicationContainerCluster cluster) {
Element significanceElement = XML.getChild(spec, "significance");
- if (significanceElement == null) return;
SignificanceModelRegistry significanceModelRegistry = new SignificanceModelRegistry(deployState, significanceElement);
cluster.addComponent(significanceModelRegistry);
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/container/search/searchchain/SchemaChainsTest.java b/config-model/src/test/java/com/yahoo/vespa/model/container/search/searchchain/SchemaChainsTest.java
index 5a316f69a9f..ea43f5c8124 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/container/search/searchchain/SchemaChainsTest.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/container/search/searchchain/SchemaChainsTest.java
@@ -159,7 +159,7 @@ public class SchemaChainsTest extends SchemaChainsTestBase {
@Test
public void require_all_default_chains_are_correct() {
- assertEquals(61, chainsConfig.components().size());
+ assertEquals(63, chainsConfig.components().size());
assertEquals(10, chainsConfig.chains().size());
validateVespaPhasesChain(findChain("vespaPhases"));
validateNativeChain(findChain("native"));
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java b/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java
index acb9426b812..00e95a34287 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java
@@ -49,9 +49,9 @@ public class SignificanceModelTestCase {
private SignificanceConfig assertSignificancePresent(ApplicationContainerCluster cluster) {
- var id = new ComponentId("com.yahoo.search.significance.impl.DefaultSignificanceModelRegistry");
+ var id = new ComponentId("com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry");
var significance = (SignificanceModelRegistry) cluster.getComponentsMap().get(id);
- assertEquals("com.yahoo.search.significance.impl.DefaultSignificanceModelRegistry", significance.getClassId().getName());
+ assertEquals("com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry", significance.getClassId().getName());
var cfgBuilder = new SignificanceConfig.Builder();
significance.getConfig(cfgBuilder);
return cfgBuilder.build();
diff --git a/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java b/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java
index 9ea35339f8d..97220725fec 100644
--- a/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java
+++ b/container-search/src/main/java/com/yahoo/search/searchchain/model/federation/LocalProviderSpec.java
@@ -43,7 +43,8 @@ public class LocalProviderSpec {
com.yahoo.search.searchers.ValidateFuzzySearcher.class,
com.yahoo.search.yql.FieldFiller.class,
com.yahoo.search.searchers.InputCheckingSearcher.class,
- com.yahoo.search.searchers.ContainerLatencySearcher.class);
+ com.yahoo.search.searchers.ContainerLatencySearcher.class,
+ com.yahoo.search.significance.SignificanceSearcher.class);
public final String clusterName;
diff --git a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
new file mode 100644
index 00000000000..0a42bf8a259
--- /dev/null
+++ b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
@@ -0,0 +1,77 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.search.significance;
+
+import com.yahoo.component.annotation.Inject;
+import com.yahoo.component.chain.dependencies.Before;
+import com.yahoo.component.chain.dependencies.Provides;
+import com.yahoo.language.Language;
+import com.yahoo.language.significance.SignificanceModel;
+import com.yahoo.language.significance.SignificanceModelRegistry;
+import com.yahoo.prelude.query.CompositeItem;
+import com.yahoo.prelude.query.Item;
+import com.yahoo.prelude.query.NullItem;
+import com.yahoo.prelude.query.WordItem;
+import com.yahoo.search.Query;
+import com.yahoo.search.Result;
+import com.yahoo.search.Searcher;
+import com.yahoo.search.searchchain.Execution;
+
+import java.util.Optional;
+
+import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING;
+
+/**
+ * Searcher that annotates every {@link WordItem} in the query tree with a significance value:
+ * an inverse document frequency computed from what the {@link SignificanceModel} for the query's
+ * parsing language reports for the word. Queries without a model for their language pass unchanged.
+ *
+ * @author MariusArhaug
+ */
+@Provides(SignificanceSearcher.SIGNIFICANCE)
+@Before(STEMMING)
+public class SignificanceSearcher extends Searcher {
+
+    /** The name this searcher provides, for ordering relative to other searchers in a chain. */
+    public static final String SIGNIFICANCE = "Significance";
+    private final SignificanceModelRegistry significanceModelRegistry;
+
+    @Inject
+    public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry) {
+        this.significanceModelRegistry = significanceModelRegistry;
+    }
+
+    /** Sets significance on the query's word items when a model exists, then continues the chain. */
+    @Override
+    public Result search(Query query, Execution execution) {
+        Language language = query.getModel().getParsingLanguage();
+        Optional<SignificanceModel> model = significanceModelRegistry.getModel(language);
+        if (model.isPresent()) setIDF(query.getModel().getQueryTree().getRoot(), model.get());
+        return execution.search(query);
+    }
+
+    /** Walks the tree below root depth first, setting the significance of each word item found. */
+    private void setIDF(Item root, SignificanceModel significanceModel) {
+        boolean nothingToVisit = root == null || root instanceof NullItem;
+        if (nothingToVisit) return;
+
+        if (root instanceof WordItem) {
+            WordItem word = (WordItem) root;
+            var documentFrequency = significanceModel.documentFrequency(word.getWord());
+            word.setSignificance(calculateIDF(documentFrequency.corpusSize(), documentFrequency.frequency()));
+            return;
+        }
+        if (!(root instanceof CompositeItem)) return; // other item kinds carry no words to score here
+        CompositeItem composite = (CompositeItem) root;
+        for (int i = 0; i < composite.getItemCount(); i++) {
+            setIDF(composite.getItem(i), significanceModel);
+        }
+    }
+
+    /** Returns ln(1 + (N - nq_i + 0.5) / (nq_i + 0.5)), with N the corpus size and nq_i the frequency. */
+    public static double calculateIDF(long N, long nq_i) {
+        double ratio = (N - nq_i + 0.5) / (nq_i + 0.5);
+        return Math.log(1 + ratio);
+    }
+}
+
+
diff --git a/container-search/src/test/java/com/yahoo/search/significance/model/en.json b/container-search/src/test/java/com/yahoo/search/significance/model/en.json
new file mode 100644
index 00000000000..50bae5e3451
--- /dev/null
+++ b/container-search/src/test/java/com/yahoo/search/significance/model/en.json
@@ -0,0 +1,14 @@
+{
+ "version" : "1.0",
+ "id" : "test::1",
+ "description" : "desc",
+ "corpus-size" : 10,
+ "language" : "en",
+ "word-count" : 4,
+ "frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 2
+ }
+}
diff --git a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
new file mode 100644
index 00000000000..890db3abb51
--- /dev/null
+++ b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
@@ -0,0 +1,153 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.search.significance.test;
+
+import com.yahoo.component.chain.Chain;
+import com.yahoo.language.Language;
+import com.yahoo.language.significance.SignificanceModel;
+import com.yahoo.language.significance.SignificanceModelRegistry;
+import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry;
+import com.yahoo.prelude.query.AndItem;
+import com.yahoo.prelude.query.WordItem;
+import com.yahoo.search.Query;
+import com.yahoo.search.Result;
+import com.yahoo.search.searchchain.Execution;
+import com.yahoo.search.significance.SignificanceSearcher;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+
+
+import static com.yahoo.test.JunitCompat.assertEquals;
+
+/**
+ * Verifies that {@link SignificanceSearcher} assigns significance to the word items of a query.
+ * The registry under test holds a single English model, read from the en.json test file,
+ * so a query in any other language exercises the path where no model is found.
+ *
+ * @author MariusArhaug
+ */
+public class SignificanceSearcherTest {
+
+    SignificanceModelRegistry significanceModelRegistry;
+    SignificanceSearcher searcher;
+
+    /** Creates a registry holding only an English model, and a searcher backed by that registry. */
+    public SignificanceSearcherTest() {
+        HashMap<Language, Path> modelPaths = new HashMap<>();
+        modelPaths.put(Language.ENGLISH, Path.of("src/test/java/com/yahoo/search/significance/model/en.json"));
+
+        significanceModelRegistry = new DefaultSignificanceModelRegistry(modelPaths);
+        searcher = new SignificanceSearcher(significanceModelRegistry);
+    }
+
+    /** Returns an execution over a chain containing only the given searcher. */
+    private Execution createExecution(SignificanceSearcher searcher) {
+        return new Execution(new Chain<>(searcher), Execution.Context.createContextStub());
+    }
+
+    /** Returns an execution over an empty chain. */
+    private Execution createExecution() {
+        return new Execution(new Chain<>(), Execution.Context.createContextStub());
+    }
+
+    /** Returns the value calculateIDF yields for the document frequency the model reports for the word. */
+    private static double expectedSignificance(SignificanceModel model, String word) {
+        var frequency = model.documentFrequency(word);
+        return SignificanceSearcher.calculateIDF(frequency.corpusSize(), frequency.frequency());
+    }
+
+    /** Returns the root of the query tree of the query carried by the given result. */
+    private static AndItem rootOf(Result result) {
+        return (AndItem) result.getQuery().getModel().getQueryTree().getRoot();
+    }
+
+    /** Word items directly below the root get the value computed from the model. */
+    @Test
+    void testSignificanceValueOnSimpleQuery() {
+        AndItem root = new AndItem();
+        root.addItem(new WordItem("Hello", true));
+        root.addItem(new WordItem("world", true));
+
+        Query q = new Query();
+        q.getModel().getQueryTree().setRoot(root);
+
+        SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
+        double expectedHello = expectedSignificance(model, "Hello");
+        double expectedWorld = expectedSignificance(model, "world");
+
+        Result result = createExecution(searcher).search(q);
+
+        AndItem resultRoot = rootOf(result);
+        WordItem first = (WordItem) resultRoot.getItem(0);
+        WordItem second = (WordItem) resultRoot.getItem(1);
+
+        assertEquals(expectedHello, first.getSignificance());
+        assertEquals(expectedWorld, second.getSignificance());
+    }
+
+    /** Word items nested one, two and three levels deep all get a value. */
+    @Test
+    void testSignificanceValueOnRecursiveQuery() {
+        // Tree under test: AND(hello, AND(test), AND(AND(usa)))
+        AndItem root = new AndItem();
+        WordItem hello = new WordItem("hello", true);
+        AndItem testBranch = new AndItem();
+        WordItem test = new WordItem("test", true);
+        AndItem usaBranch = new AndItem();
+        AndItem usaInnerBranch = new AndItem();
+        WordItem usa = new WordItem("usa", true);
+
+        root.addItem(hello);
+        root.addItem(testBranch);
+        root.addItem(usaBranch);
+
+        testBranch.addItem(test);
+        usaBranch.addItem(usaInnerBranch);
+        usaInnerBranch.addItem(usa);
+
+        Query q = new Query();
+        q.getModel().getQueryTree().setRoot(root);
+
+        SignificanceModel model = significanceModelRegistry.getModel(Language.ENGLISH).get();
+        double expectedHello = expectedSignificance(model, "hello");
+        double expectedTest = expectedSignificance(model, "test");
+
+        Result result = createExecution(searcher).search(q);
+
+        AndItem resultRoot = rootOf(result);
+        WordItem first = (WordItem) resultRoot.getItem(0);
+        WordItem nested = (WordItem) ((AndItem) resultRoot.getItem(1)).getItem(0);
+        WordItem deeplyNested = (WordItem) ((AndItem) ((AndItem) resultRoot.getItem(2)).getItem(0)).getItem(0);
+
+        assertEquals(expectedHello, first.getSignificance());
+        assertEquals(expectedTest, nested.getSignificance());
+        // en.json lists "usa" with frequency 2 and a corpus size of 10
+        assertEquals(SignificanceSearcher.calculateIDF(10, 2), deeplyNested.getSignificance());
+    }
+
+    /** With no model for the query language, significance equals that seen through an empty chain. */
+    @Test
+    void testSignificanceValueOnEmptyQuery() {
+        // Only English has a model in this test's registry
+        Query q = new Query();
+        q.getModel().setLanguage(Language.NORWEGIAN_BOKMAL);
+
+        AndItem root = new AndItem();
+        root.addItem(new WordItem("Hei", true));
+        root.addItem(new WordItem("Verden", true));
+
+        q.getModel().getQueryTree().setRoot(root);
+
+        AndItem searchedRoot = rootOf(createExecution(searcher).search(q));
+        WordItem searchedFirst = (WordItem) searchedRoot.getItem(0);
+        WordItem searchedSecond = (WordItem) searchedRoot.getItem(1);
+
+        AndItem untouchedRoot = rootOf(createExecution().search(q));
+        WordItem untouchedFirst = (WordItem) untouchedRoot.getItem(0);
+        WordItem untouchedSecond = (WordItem) untouchedRoot.getItem(1);
+
+        assertEquals(untouchedFirst.getSignificance(), searchedFirst.getSignificance());
+        assertEquals(untouchedSecond.getSignificance(), searchedSecond.getSignificance());
+    }
+}
diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json
index a4adacc5905..1ca32a2dd37 100644
--- a/linguistics/abi-spec.json
+++ b/linguistics/abi-spec.json
@@ -816,7 +816,7 @@
"abstract"
],
"methods" : [
- "public abstract com.yahoo.language.significance.SignificanceModel getModel(com.yahoo.language.Language)"
+ "public abstract java.util.Optional getModel(com.yahoo.language.Language)"
],
"fields" : [ ]
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
index 6d8dcc00e0a..95d5b5e69d8 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
@@ -4,10 +4,12 @@ package com.yahoo.language.significance;
import com.yahoo.api.annotations.Beta;
import com.yahoo.language.Language;
+import java.util.Optional;
+
/**
* @author MariusArhaug
*/
@Beta
public interface SignificanceModelRegistry {
- SignificanceModel getModel(Language language);
+ Optional<SignificanceModel> getModel(Language language);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
index d44eab39cdf..1be1d3f13b5 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -11,6 +11,7 @@ import java.nio.file.Path;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;
+import java.util.Optional;
import java.util.function.Supplier;
import static com.yahoo.yolean.Exceptions.uncheck;
@@ -43,12 +44,12 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist
@Override
- public SignificanceModel getModel(Language language) throws IllegalArgumentException {
+ public Optional<SignificanceModel> getModel(Language language) {
if (!models.containsKey(language))
{
- throw new IllegalArgumentException("No model for language " + language);
+ return Optional.empty();
}
- return models.get(language);
+ return Optional.of(models.get(language));
}
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
index d1de63a994d..d4849571b5e 100644
--- a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
@@ -8,9 +8,7 @@ import org.junit.Test;
import java.nio.file.Path;
import java.util.HashMap;
-import static org.junit.jupiter.api.Assertions.assertThrows;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.*;
/**
@@ -27,10 +25,16 @@ public class DefaultSignificanceModelRegistryTest {
DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models);
- var englishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH);
- var norwegianModel = defaultSignificanceModelRegistry.getModel(Language.NORWEGIAN_BOKMAL);
+ var optionalEnglishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH);
+ var optionalNorwegianModel = defaultSignificanceModelRegistry.getModel(Language.NORWEGIAN_BOKMAL);
- assertThrows(IllegalArgumentException.class, () -> defaultSignificanceModelRegistry.getModel(Language.FRENCH));
+ assertTrue(optionalEnglishModel.isPresent());
+ assertTrue(optionalNorwegianModel.isPresent());
+
+ var englishModel = optionalEnglishModel.get();
+ var norwegianModel = optionalNorwegianModel.get();
+
+ assertTrue( defaultSignificanceModelRegistry.getModel(Language.FRENCH).isEmpty());
assertNotNull(englishModel);
assertNotNull(norwegianModel);