summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--client/go/internal/osutil/run_cmd.go2
-rw-r--r--config-model/src/main/java/com/yahoo/schema/RankProfile.java11
-rw-r--r--config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java5
-rw-r--r--config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java1
-rw-r--r--config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java7
-rw-r--r--config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java15
-rw-r--r--config-model/src/main/javacc/SchemaParser.jj23
-rw-r--r--config-model/src/main/resources/schema/containercluster.rnc2
-rw-r--r--config-model/src/test/cfg/significance/services.xml6
-rw-r--r--config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java33
-rw-r--r--config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java3
-rw-r--r--config-model/src/test/schema-test-files/services.xml2
-rw-r--r--configdefinitions/src/vespa/significance.def2
-rw-r--r--container-search/abi-spec.json2
-rw-r--r--container-search/src/main/java/com/yahoo/search/query/Ranking.java10
-rw-r--r--container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java20
-rw-r--r--container-search/src/test/java/com/yahoo/search/significance/model/en.json20
-rw-r--r--container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java26
-rw-r--r--container-search/src/test/resources/config/with_significance/rank-profiles.cfg3
-rw-r--r--dependency-versions/pom.xml8
-rw-r--r--dist/vespa.spec2
-rw-r--r--linguistics/abi-spec.json3
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java2
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java76
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java56
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java43
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java48
-rw-r--r--linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java49
-rw-r--r--linguistics/src/test/models/docv1.json18
-rw-r--r--linguistics/src/test/models/docv2.json31
-rw-r--r--linguistics/src/test/models/en.json6
-rw-r--r--linguistics/src/test/models/no.json17
-rw-r--r--searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp4
-rw-r--r--searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp30
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp31
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h6
-rw-r--r--searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h6
-rw-r--r--searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp14
-rw-r--r--searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h1
39 files changed, 487 insertions, 157 deletions
diff --git a/client/go/internal/osutil/run_cmd.go b/client/go/internal/osutil/run_cmd.go
index 3847dcc912a..ab7bd1069c3 100644
--- a/client/go/internal/osutil/run_cmd.go
+++ b/client/go/internal/osutil/run_cmd.go
@@ -38,7 +38,7 @@ func analyzeError(err error) string {
msg := "died with signal: " + status.Signal().String()
switch status.Signal() {
case syscall.SIGILL:
- msg = msg + " (you probably have an older CPU than required)"
+ msg = msg + " (you probably have an older CPU than required, see https://docs.vespa.ai/en/cpu-support.html)"
}
return msg
}
diff --git a/config-model/src/main/java/com/yahoo/schema/RankProfile.java b/config-model/src/main/java/com/yahoo/schema/RankProfile.java
index 82ed45028b3..cdefbbf8174 100644
--- a/config-model/src/main/java/com/yahoo/schema/RankProfile.java
+++ b/config-model/src/main/java/com/yahoo/schema/RankProfile.java
@@ -141,6 +141,8 @@ public class RankProfile implements Cloneable {
private Boolean strict;
+ private Boolean useSignificanceModel;
+
private final ApplicationPackage applicationPackage;
private final DeployLogger deployLogger;
@@ -216,6 +218,15 @@ public class RankProfile implements Cloneable {
this.strict = strict;
}
+ public void setUseSignificanceModel(Boolean useSignificanceModel) {
+ this.useSignificanceModel = useSignificanceModel;
+ }
+
+ public Boolean useSignificanceModel() {
+ if (useSignificanceModel != null) return useSignificanceModel;
+ return uniquelyInherited(p -> p.useSignificanceModel(), "use-model").orElse(null);
+ }
+
/**
* Adds a profile to those inherited by this.
* The profile must belong to this schema (directly or by inheritance).
diff --git a/config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java b/config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java
index b057624f055..42586fa7d75 100644
--- a/config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java
+++ b/config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java
@@ -186,6 +186,7 @@ public class RawRankProfile {
private RankingExpression globalPhaseRanking;
private final int globalPhaseRerankCount;
private final SerializationContext functionSerializationContext;
+ private Boolean useSignificanceModel;
/**
* Creates a raw rank profile from the given rank profile
@@ -221,6 +222,7 @@ public class RawRankProfile {
rankScoreDropLimit = compiled.getRankScoreDropLimit();
ignoreDefaultRankFeatures = compiled.getIgnoreDefaultRankFeatures();
rankProperties = new ArrayList<>(compiled.getRankProperties());
+ useSignificanceModel = compiled.useSignificanceModel();
Map<String, RankProfile.RankingExpressionFunction> functions = compiled.getFunctions();
List<ExpressionFunction> functionExpressions = functions.values().stream().map(RankProfile.RankingExpressionFunction::function).toList();
@@ -479,6 +481,9 @@ public class RawRankProfile {
if (targetHitsMaxAdjustmentFactor.isPresent()) {
properties.add(new Pair<>("vespa.matching.nns.target_hits_max_adjustment_factor", String.valueOf(targetHitsMaxAdjustmentFactor.getAsDouble())));
}
+ if (useSignificanceModel != null) {
+ properties.add(new Pair<>("vespa.significance.use_model", String.valueOf(useSignificanceModel)));
+ }
if (matchPhaseSettings != null) {
properties.add(new Pair<>("vespa.matchphase.degradation.attribute", matchPhaseSettings.getAttribute()));
properties.add(new Pair<>("vespa.matchphase.degradation.ascendingorder", matchPhaseSettings.getAscending() + ""));
diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java
index 5ccbb7b19a4..77a10862f9c 100644
--- a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java
+++ b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java
@@ -39,6 +39,7 @@ public class ConvertParsedRanking {
profile.inherit(name);
parsed.isStrict().ifPresent(value -> profile.setStrict(value));
+ parsed.isUseSignificanceModel().ifPresent(value -> profile.setUseSignificanceModel(value));
for (var constant : parsed.getConstants().values())
profile.add(constant);
diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java b/config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java
index fbbb0c7fe83..93319e82076 100644
--- a/config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java
+++ b/config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java
@@ -44,6 +44,7 @@ class ParsedRankProfile extends ParsedBlock {
private String inheritedMatchFeatures = null;
private String secondPhaseExpression = null;
private Boolean strict = null;
+ private Boolean useSignificanceModel = null;
private final List<MutateOperation> mutateOperations = new ArrayList<>();
private final List<String> inherited = new ArrayList<>();
private final Map<String, Boolean> fieldsRankFilter = new LinkedHashMap<>();
@@ -96,6 +97,8 @@ class ParsedRankProfile extends ParsedBlock {
Optional<String> getSecondPhaseExpression() { return Optional.ofNullable(this.secondPhaseExpression); }
Optional<Boolean> isStrict() { return Optional.ofNullable(this.strict); }
+ Optional<Boolean> isUseSignificanceModel() { return Optional.ofNullable(this.useSignificanceModel); }
+
void addSummaryFeatures(FeatureList features) { this.summaryFeatures.add(features); }
void addMatchFeatures(FeatureList features) { this.matchFeatures.add(features); }
void addRankFeatures(FeatureList features) { this.rankFeatures.add(features); }
@@ -218,6 +221,10 @@ class ParsedRankProfile extends ParsedBlock {
this.strict = strict;
}
+ void setUseSignificanceModel(boolean useSignificanceModel) {
+ verifyThat(this.useSignificanceModel == null, "already has use-model");
+ this.useSignificanceModel = useSignificanceModel;
+ }
void setTermwiseLimit(double limit) {
verifyThat(termwiseLimit == null, "already has termwise-limit");
this.termwiseLimit = limit;
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java b/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java
index c210c2621a6..693eebd75a8 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java
@@ -33,17 +33,15 @@ public class SignificanceModelRegistry extends SimpleComponent implements Signif
if (spec != null) {
for (Element modelElement : XML.getChildren(spec, "model")) {
- addConfig(
- modelElement.getAttribute("language"),
- Model.fromXml(deployState, modelElement, Set.of(SIGNIFICANCE_MODEL)).modelReference());
+ addConfig(Model.fromXml(deployState, modelElement, Set.of(SIGNIFICANCE_MODEL)).modelReference());
}
}
}
- public void addConfig(String language, ModelReference path) {
+ public void addConfig(ModelReference path) {
configList.add(
- new SignificanceModelConfig(language, path)
+ new SignificanceModelConfig(path)
);
}
@@ -53,19 +51,16 @@ public class SignificanceModelRegistry extends SimpleComponent implements Signif
builder.model(
configList.stream()
.map(config -> new SignificanceConfig.Model.Builder()
- .language(config.language)
.path(config.path)
).toList()
);
}
- class SignificanceModelConfig {
- private final String language;
+ static class SignificanceModelConfig {
private final ModelReference path;
- public SignificanceModelConfig(String language, ModelReference path) {
- this.language = language;
+ public SignificanceModelConfig(ModelReference path) {
this.path = path;
}
diff --git a/config-model/src/main/javacc/SchemaParser.jj b/config-model/src/main/javacc/SchemaParser.jj
index 255cc3cde70..b40f2d0796d 100644
--- a/config-model/src/main/javacc/SchemaParser.jj
+++ b/config-model/src/main/javacc/SchemaParser.jj
@@ -188,6 +188,8 @@ TOKEN :
| < SUFFIX: "suffix" >
| < CONSTANT: "constant">
| < ONNX_MODEL: "onnx-model">
+| < SIGNIFICANCE: "significance">
+| < USE_MODEL: "use-model">
| < INTRAOP_THREADS: "intraop-threads">
| < INTEROP_THREADS: "interop-threads">
| < GPU_DEVICE: "gpu-device">
@@ -1761,7 +1763,8 @@ void rankProfileItem(ParsedSchema schema, ParsedRankProfile profile) : { }
| matchFeatures(profile)
| summaryFeatures(profile)
| onnxModelInProfile(profile)
- | strict(profile) )
+ | strict(profile)
+ | significance(profile))
}
/**
@@ -2115,6 +2118,22 @@ void strict(ParsedRankProfile profile) :
)
}
+void significance(ParsedRankProfile profile) :
+{}
+{
+ <SIGNIFICANCE> lbrace() (significanceItem(profile) (<NL>)*)* <RBRACE>
+ {}
+}
+
+void significanceItem(ParsedRankProfile profile) :
+{}
+{
+ <USE_MODEL> <COLON> (
+ ( <TRUE> { profile.setUseSignificanceModel(true); } ) |
+ ( <FALSE> { profile.setUseSignificanceModel(false); } )
+ )
+}
+
/**
* Consumes a match-features block of a rank profile.
*
@@ -2710,6 +2729,7 @@ String identifierWithDash() :
| <TARGET_HITS_MAX_ADJUSTMENT_FACTOR>
| <TERMWISE_LIMIT>
| <UPPER_BOUND>
+ | <USE_MODEL>
) { return token.image; }
}
@@ -2812,6 +2832,7 @@ String identifier() : { }
| <STEMMING>
| <STRENGTH>
| <STRICT>
+ | <SIGNIFICANCE>
| <STRING>
| <STRUCT>
| <SUBSTRING>
diff --git a/config-model/src/main/resources/schema/containercluster.rnc b/config-model/src/main/resources/schema/containercluster.rnc
index 08092f10020..c79a7b38d09 100644
--- a/config-model/src/main/resources/schema/containercluster.rnc
+++ b/config-model/src/main/resources/schema/containercluster.rnc
@@ -138,7 +138,7 @@ Threadpool = element threadpool {
}
Significance = element significance {
- element model { attribute language { xsd:string } & ModelReference }*
+ element model { ModelReference }*
}
Clients = element clients {
diff --git a/config-model/src/test/cfg/significance/services.xml b/config-model/src/test/cfg/significance/services.xml
index 6991f5498fb..ffdb73bfc2e 100644
--- a/config-model/src/test/cfg/significance/services.xml
+++ b/config-model/src/test/cfg/significance/services.xml
@@ -8,9 +8,9 @@
<container version="1.0">
<search>
<significance>
- <model language="en" model-id="idf-wiki-english" path="models/idf-english-wiki.json.zst"/>
- <model language="no" path="models/idf-norwegian-wiki.json.zst" />
- <model language="ru" url="https://some/uri/blob.json" />
+ <model model-id="idf-wiki-english" path="models/idf-english-wiki.json.zst"/>
+ <model path="models/idf-norwegian-wiki.json.zst" />
+ <model url="https://some/uri/blob.json" />
</significance>
</search>
</container>
diff --git a/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java b/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java
index 5a2dc218da7..34ca6c30a61 100644
--- a/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java
+++ b/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java
@@ -121,6 +121,39 @@ public class SchemaParserTestCase {
}
@Test
+ void significance_can_be_parsed() throws Exception {
+ String input = """
+ schema foo {
+ rank-profile significance-ranking-0 inherits default {
+ significance {
+ use-model: true
+ }
+ }
+ rank-profile significance-ranking-1 {
+ significance {
+ use-model: false
+ }
+ }
+ }
+ """;
+
+ ParsedSchema schema = parseString(input);
+ assertEquals("foo", schema.name());
+ var rplist = schema.getRankProfiles();
+ assertEquals(2, rplist.size());
+
+ var rp0 = rplist.get(0);
+ assertEquals("significance-ranking-0", rp0.name());
+ assertTrue(rp0.isUseSignificanceModel().isPresent());
+ assertTrue(rp0.isUseSignificanceModel().get());
+
+ var rp1 = rplist.get(1);
+ assertEquals("significance-ranking-1", rp1.name());
+ assertTrue(rp1.isUseSignificanceModel().isPresent());
+ assertFalse(rp1.isUseSignificanceModel().get());
+ }
+
+ @Test
void maxOccurrencesCanBeParsed() throws Exception {
String input = joinLines
("schema foo {",
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java b/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java
index 00e95a34287..26e8c67a226 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java
@@ -37,9 +37,6 @@ public class SignificanceModelTestCase {
ApplicationContainerCluster containerCluster = vespaModel.getContainerClusters().get("container");
var significanceConfig = assertSignificancePresent(containerCluster);
assertEquals(3, significanceConfig.model().size());
- assertEquals("en", significanceConfig.model().get(0).language());
- assertEquals("no", significanceConfig.model().get(1).language());
- assertEquals("ru", significanceConfig.model().get(2).language());
assertEquals("models/idf-norwegian-wiki.json.zst", modelReference(significanceConfig.model().get(1), "path").path().orElseThrow().value());
assertEquals("https://some/uri/blob.json", modelReference(significanceConfig.model().get(2), "path").url().orElseThrow().value());
diff --git a/config-model/src/test/schema-test-files/services.xml b/config-model/src/test/schema-test-files/services.xml
index 7333ef5a87b..a413ec7753b 100644
--- a/config-model/src/test/schema-test-files/services.xml
+++ b/config-model/src/test/schema-test-files/services.xml
@@ -168,7 +168,7 @@
</threadpool>
<significance>
- <model language="en" model-id="idf-wiki-simple-english" path="models/idf-simple-english-wiki.json.zst" />
+ <model model-id="idf-wiki-simple-english" path="models/idf-simple-english-wiki.json.zst" />
</significance>
</search>
diff --git a/configdefinitions/src/vespa/significance.def b/configdefinitions/src/vespa/significance.def
index e0cc5b4c611..8d40381a0c9 100644
--- a/configdefinitions/src/vespa/significance.def
+++ b/configdefinitions/src/vespa/significance.def
@@ -1,6 +1,4 @@
# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
namespace=search.significance.config
-model[].language string
model[].path model
-
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json
index 5e66e1bb746..d85f1844b18 100644
--- a/container-search/abi-spec.json
+++ b/container-search/abi-spec.json
@@ -5499,6 +5499,8 @@
"public com.yahoo.search.query.ranking.RankProperties getProperties()",
"public void setListFeatures(boolean)",
"public boolean getListFeatures()",
+ "public void setUseSignificance(boolean)",
+ "public boolean getUseSignificance()",
"public com.yahoo.search.query.ranking.MatchPhase getMatchPhase()",
"public com.yahoo.search.query.ranking.GlobalPhase getGlobalPhase()",
"public com.yahoo.search.query.ranking.Matching getMatching()",
diff --git a/container-search/src/main/java/com/yahoo/search/query/Ranking.java b/container-search/src/main/java/com/yahoo/search/query/Ranking.java
index b1dd5624d18..09de1a24ef9 100644
--- a/container-search/src/main/java/com/yahoo/search/query/Ranking.java
+++ b/container-search/src/main/java/com/yahoo/search/query/Ranking.java
@@ -113,6 +113,8 @@ public class Ranking implements Cloneable {
private SoftTimeout softTimeout = new SoftTimeout();
+ private boolean useSignificance = false;
+
public Ranking(Query parent) {
this.parent = parent;
this.rankFeatures = new RankFeatures(this);
@@ -217,6 +219,14 @@ public class Ranking implements Cloneable {
/** Returns whether rank features should be dumped with the result of this query, default false */
public boolean getListFeatures() { return listFeatures; }
+ /** Set whether to use significance in ranking */
+ @com.yahoo.api.annotations.Beta
+ public void setUseSignificance(boolean useSignificance) { this.useSignificance = useSignificance; }
+
+ /** Returns whether to use significance in ranking */
+ @com.yahoo.api.annotations.Beta
+ public boolean getUseSignificance() { return useSignificance; }
+
/** Returns the match phase rank settings of this. This is never null. */
public MatchPhase getMatchPhase() { return matchPhase; }
diff --git a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
index 0a42bf8a259..6cef576f967 100644
--- a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
+++ b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java
@@ -14,8 +14,11 @@ import com.yahoo.prelude.query.WordItem;
import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.search.Searcher;
+import com.yahoo.search.query.Ranking;
import com.yahoo.search.searchchain.Execution;
+import com.yahoo.vespa.config.search.RankProfilesConfig;
+import java.util.HashMap;
import java.util.Optional;
import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING;
@@ -32,15 +35,30 @@ public class SignificanceSearcher extends Searcher {
public final static String SIGNIFICANCE = "Significance";
private final SignificanceModelRegistry significanceModelRegistry;
+ private final RankProfilesConfig rankProfilesConfig;
+
+ private final HashMap<String, Boolean> useModel = new HashMap<>();
@Inject
- public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry) {
+ public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry, RankProfilesConfig rankProfilesConfig) {
this.significanceModelRegistry = significanceModelRegistry;
+ this.rankProfilesConfig = rankProfilesConfig;
+
+ for (RankProfilesConfig.Rankprofile profile : rankProfilesConfig.rankprofile()) {
+ for (RankProfilesConfig.Rankprofile.Fef.Property property : profile.fef().property()) {
+ if (property.name().equals("vespa.significance.use_model")) {
+ useModel.put(profile.name(), Boolean.parseBoolean(property.value()));
+ }
+ }
+ }
}
@Override
public Result search(Query query, Execution execution) {
+ Ranking ranking = query.getRanking();
+ if (!useModel.containsKey(ranking.getProfile()) || !useModel.get(ranking.getProfile())) return execution.search(query);
+
Language language = query.getModel().getParsingLanguage();
Optional<SignificanceModel> model = significanceModelRegistry.getModel(language);
diff --git a/container-search/src/test/java/com/yahoo/search/significance/model/en.json b/container-search/src/test/java/com/yahoo/search/significance/model/en.json
index 50bae5e3451..04010959a58 100644
--- a/container-search/src/test/java/com/yahoo/search/significance/model/en.json
+++ b/container-search/src/test/java/com/yahoo/search/significance/model/en.json
@@ -2,13 +2,17 @@
"version" : "1.0",
"id" : "test::1",
"description" : "desc",
- "corpus-size" : 10,
- "language" : "en",
- "word-count" : 4,
- "frequencies" : {
- "usa" : 2,
- "hello": 3,
- "world": 5,
- "test": 2
+ "languages" : {
+ "en": {
+ "description" : "english model",
+ "document-count" : 10,
+ "language" : "en",
+ "document-frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 2
+ }
+ }
}
}
diff --git a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
index 890db3abb51..ed67798b4b1 100644
--- a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
+++ b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java
@@ -2,6 +2,7 @@
package com.yahoo.search.significance.test;
import com.yahoo.component.chain.Chain;
+import com.yahoo.config.subscription.ConfigGetter;
import com.yahoo.language.Language;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
@@ -12,10 +13,13 @@ import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.search.searchchain.Execution;
import com.yahoo.search.significance.SignificanceSearcher;
+import com.yahoo.vespa.config.search.RankProfilesConfig;
import org.junit.jupiter.api.Test;
import java.nio.file.Path;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import static com.yahoo.test.JunitCompat.assertEquals;
@@ -29,12 +33,24 @@ public class SignificanceSearcherTest {
SignificanceModelRegistry significanceModelRegistry;
SignificanceSearcher searcher;
+ private static final String CONFIG_DIR = "src/test/resources/config/";
+
public SignificanceSearcherTest() {
- HashMap<Language, Path> map = new HashMap<>();
- map.put(Language.ENGLISH, Path.of("src/test/java/com/yahoo/search/significance/model/en.json"));
+ List<Path> models = new ArrayList<>();
+ models.add( Path.of("src/test/java/com/yahoo/search/significance/model/en.json"));
+
+ RankProfilesConfig rpCfg = readConfig("with_significance");
+
+ assertEquals(1, rpCfg.rankprofile().size());
+
+ significanceModelRegistry = new DefaultSignificanceModelRegistry(models);
+ searcher = new SignificanceSearcher(significanceModelRegistry, rpCfg);
+ }
- significanceModelRegistry = new DefaultSignificanceModelRegistry(map);
- searcher = new SignificanceSearcher(significanceModelRegistry);
+ @SuppressWarnings("deprecation")
+ private RankProfilesConfig readConfig(String subDir) {
+ String cfgId = "file:" + CONFIG_DIR + subDir + "/rank-profiles.cfg";
+ return ConfigGetter.getConfig(RankProfilesConfig.class, cfgId);
}
private Execution createExecution(SignificanceSearcher searcher) {
@@ -49,6 +65,7 @@ public class SignificanceSearcherTest {
void testSignificanceValueOnSimpleQuery() {
Query q = new Query();
+ q.getRanking().setProfile("significance-ranking");
AndItem root = new AndItem();
WordItem tmp;
tmp = new WordItem("Hello", true);
@@ -79,6 +96,7 @@ public class SignificanceSearcherTest {
@Test
void testSignificanceValueOnRecursiveQuery() {
Query q = new Query();
+ q.getRanking().setProfile("significance-ranking");
AndItem root = new AndItem();
WordItem child1 = new WordItem("hello", true);
diff --git a/container-search/src/test/resources/config/with_significance/rank-profiles.cfg b/container-search/src/test/resources/config/with_significance/rank-profiles.cfg
new file mode 100644
index 00000000000..1dc1be62862
--- /dev/null
+++ b/container-search/src/test/resources/config/with_significance/rank-profiles.cfg
@@ -0,0 +1,3 @@
+rankprofile[0].name "significance-ranking"
+rankprofile[0].fef.property[0].name "vespa.significance.use_model"
+rankprofile[0].fef.property[0].value "true" \ No newline at end of file
diff --git a/dependency-versions/pom.xml b/dependency-versions/pom.xml
index fd80b4c79d4..65aa5a76795 100644
--- a/dependency-versions/pom.xml
+++ b/dependency-versions/pom.xml
@@ -68,7 +68,7 @@
<assertj.vespa.version>3.25.3</assertj.vespa.version>
<!-- Athenz dependencies. Make sure these dependencies match those in Vespa's internal repositories -->
- <aws-sdk.vespa.version>1.12.710</aws-sdk.vespa.version>
+ <aws-sdk.vespa.version>1.12.711</aws-sdk.vespa.version>
<athenz.vespa.version>1.11.57</athenz.vespa.version>
<!-- Athenz END -->
@@ -171,17 +171,17 @@
<maven-compiler-plugin.vespa.version>3.13.0</maven-compiler-plugin.vespa.version>
<maven-core.vespa.version>3.9.6</maven-core.vespa.version>
<maven-dependency-plugin.vespa.version>3.6.1</maven-dependency-plugin.vespa.version>
- <maven-deploy-plugin.vespa.version>3.1.1</maven-deploy-plugin.vespa.version>
+ <maven-deploy-plugin.vespa.version>3.1.2</maven-deploy-plugin.vespa.version>
<maven-enforcer-plugin.vespa.version>3.4.1</maven-enforcer-plugin.vespa.version>
<maven-failsafe-plugin.vespa.version>3.2.5</maven-failsafe-plugin.vespa.version>
<maven-gpg-plugin.vespa.version>3.2.4</maven-gpg-plugin.vespa.version>
- <maven-install-plugin.vespa.version>3.1.1</maven-install-plugin.vespa.version>
+ <maven-install-plugin.vespa.version>3.1.2</maven-install-plugin.vespa.version>
<maven-jar-plugin.vespa.version>3.4.1</maven-jar-plugin.vespa.version>
<maven-javadoc-plugin.vespa.version>3.6.3</maven-javadoc-plugin.vespa.version>
<maven-plugin-api.vespa.version>${maven-core.vespa.version}</maven-plugin-api.vespa.version>
<maven-plugin-tools.vespa.version>3.12.0</maven-plugin-tools.vespa.version>
<maven-resources-plugin.vespa.version>3.3.1</maven-resources-plugin.vespa.version>
- <maven-resolver.vespa.version>1.9.19</maven-resolver.vespa.version>
+ <maven-resolver.vespa.version>1.9.20</maven-resolver.vespa.version>
<maven-shade-plugin.vespa.version>3.5.3</maven-shade-plugin.vespa.version>
<maven-site-plugin.vespa.version>3.12.1</maven-site-plugin.vespa.version>
<maven-source-plugin.vespa.version>3.3.1</maven-source-plugin.vespa.version>
diff --git a/dist/vespa.spec b/dist/vespa.spec
index fa20df04efb..d7f5fe12a74 100644
--- a/dist/vespa.spec
+++ b/dist/vespa.spec
@@ -205,7 +205,7 @@ Requires: vespa-protobuf = %{_vespa_protobuf_version}
Requires: llvm-libs
%endif
Requires: vespa-onnxruntime = 1.17.3
-Requires: vespa-jllama = 3.0.1-2%{?dist}
+Requires: vespa-jllama >= 3.0.1
%description libs
diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json
index 1ca32a2dd37..ceab5025760 100644
--- a/linguistics/abi-spec.json
+++ b/linguistics/abi-spec.json
@@ -803,7 +803,8 @@
"abstract"
],
"methods" : [
- "public abstract com.yahoo.language.significance.DocumentFrequency documentFrequency(java.lang.String)"
+ "public abstract com.yahoo.language.significance.DocumentFrequency documentFrequency(java.lang.String)",
+ "public abstract java.lang.String getId()"
],
"fields" : [ ]
},
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
index a9f1e48af62..c8a31e1892c 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -9,4 +9,6 @@ import com.yahoo.api.annotations.Beta;
@Beta
public interface SignificanceModel {
DocumentFrequency documentFrequency(String word);
+
+ String getId();
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
index 7ed6f442610..3244b8373ad 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
@@ -1,13 +1,11 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.significance.impl;
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
-import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.language.significance.DocumentFrequency;
import com.yahoo.language.significance.SignificanceModel;
+import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
@@ -18,70 +16,22 @@ import java.util.HashMap;
public class DefaultSignificanceModel implements SignificanceModel {
private final long corpusSize;
private final HashMap<String, Long> frequencies;
- private final Path path;
- @JsonIgnoreProperties(ignoreUnknown = true)
- public static class SignificanceModelFile {
- private final String version;
- private final String id;
- private final String description;
- private final long corpusSize;
- private final String language;
-
- private final long wordCount;
- private final HashMap<String, Long> frequencies;
-
- @JsonCreator
- public SignificanceModelFile(
- @JsonProperty("version") String version,
- @JsonProperty("id") String id,
- @JsonProperty("description") String description,
- @JsonProperty("corpus-size") long corpusSize,
- @JsonProperty("language") String language,
- @JsonProperty("word-count") long wordCount,
- @JsonProperty("frequencies") HashMap<String, Long> frequencies) {
- this.version = version;
- this.id = id;
- this.description = description;
- this.corpusSize = corpusSize;
- this.language = language;
- this.wordCount = wordCount;
- this.frequencies = frequencies;
- }
-
- @JsonProperty("version")
- public String version() { return version; }
-
- @JsonProperty("id")
- public String id() { return id; }
-
- @JsonProperty("description")
- public String description() { return description; }
-
- @JsonProperty("corpus-size")
- public long corpusSize() { return corpusSize; }
-
- @JsonProperty("language")
- public String language() { return language; }
-
- @JsonProperty("frequencies")
- public HashMap<String, Long> frequencies() { return frequencies; }
-
- @JsonProperty("word-count")
- public long wordCount() { return wordCount; }
+ private String id;
+ public DefaultSignificanceModel(DocumentFrequencyFile file, String id) {
+ this.frequencies = file.frequencies();
+ this.corpusSize = file.documentCount();
+ this.id = id;
}
public DefaultSignificanceModel(Path path) {
- this.path = path;
-
ObjectMapper objectMapper = new ObjectMapper();
-
try {
- SignificanceModelFile model = objectMapper.readValue(this.path.toFile(), SignificanceModelFile.class);
- this.corpusSize = model.corpusSize;
- this.frequencies = model.frequencies;
- } catch (Exception e) {
+ var file = objectMapper.readValue(path.toFile(), DocumentFrequencyFile.class);
+ this.frequencies = file.frequencies();
+ this.corpusSize = file.documentCount();
+ } catch (IOException e) {
throw new RuntimeException("Failed to load model from " + path, e);
}
}
@@ -93,4 +43,10 @@ public class DefaultSignificanceModel implements SignificanceModel {
}
return new DocumentFrequency(1, corpusSize);
}
+
+ @Override
+ public String getId() { // returns the id handed to the (DocumentFrequencyFile, String) constructor
+ return this.id; // NOTE(review): the Path constructor never assigns id, so this returns null for models built that way
+ }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
index 1be1d3f13b5..72874c15d9e 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -1,20 +1,21 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.significance.impl;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.component.annotation.Inject;
import com.yahoo.language.Language;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
import com.yahoo.search.significance.config.SignificanceConfig;
+import java.io.IOException;
+import java.io.UncheckedIOException;
import java.nio.file.Path;
import java.util.EnumMap;
-import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
-import java.util.function.Supplier;
-import static com.yahoo.yolean.Exceptions.uncheck;
/**
* Default implementation of {@link SignificanceModelRegistry}.
* This implementation loads models lazily and caches them.
@@ -24,24 +25,35 @@ import static com.yahoo.yolean.Exceptions.uncheck;
public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
private final Map<Language, SignificanceModel> models;
+
@Inject
- public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
- private DefaultSignificanceModelRegistry(Builder b) {
+ public DefaultSignificanceModelRegistry(SignificanceConfig cfg) {
this.models = new EnumMap<>(Language.class);
- b.models.forEach((language, path) -> {
- models.put(language,
- uncheck(() -> new DefaultSignificanceModel(path)));
- });
+ for (var model : cfg.model()) {
+ addModel(model.path());
+ }
}
- public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) {
+ public DefaultSignificanceModelRegistry(List<Path> models) {
this.models = new EnumMap<>(Language.class);
- map.forEach((language, path) -> {
- models.put(language,
- uncheck(() -> new DefaultSignificanceModel(path)));
- });
+ for (var path : models) {
+ addModel(path);
+ }
}
+ public void addModel(Path path) { // reads one model file and registers a model for every language entry it contains
+ ObjectMapper objectMapper = new ObjectMapper();
+ try {
+ SignificanceModelFile file = objectMapper.readValue(path.toFile(), SignificanceModelFile.class);
+ for (var pair : file.languages().entrySet()) { // NOTE(review): throws NullPointerException if languages() is null, presumably when the file has no "languages" property - confirm
+ this.models.put( // put() replaces a model already registered for the same language, so files added later win
+ Language.fromLanguageTag(pair.getKey()), // the key of each entry is used as a language tag
+ new DefaultSignificanceModel(pair.getValue(), file.id())); // every language in a file gets that file's id
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException("Failed to load model from " + path, e); // read/parse failures are rethrown unchecked with the path in the message
+ }
+ }
@Override
public Optional<SignificanceModel> getModel(Language language) {
@@ -51,20 +63,4 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist
}
return Optional.of(models.get(language));
}
-
-
- public static final class Builder {
- private final Map<Language, Path> models = new EnumMap<>(Language.class);
-
- public Builder() {}
- public Builder(SignificanceConfig cfg) {
- for (var model : cfg.model()) {
- addModel(Language.fromLanguageTag(model.language()), model.path());
- }
- }
-
- public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
- public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
new file mode 100644
index 00000000000..b62754ac8ad
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
@@ -0,0 +1,43 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.HashMap;
+
+/**
+ * Immutable, Jackson-mapped holder for one block of document frequency data.
+ * It binds the JSON properties "description", "document-count" and
+ * "document-frequencies"; any other property in the input is ignored
+ * (ignoreUnknown = true). SignificanceModelFile holds one instance per
+ * language key.
+ *
+ * @author MariusArhaug
+ */
+@JsonIgnoreProperties(ignoreUnknown = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class DocumentFrequencyFile {
+ private final String description;
+
+ private final int documentCount; // NOTE(review): int here, while DefaultSignificanceModel keeps this value in a long corpusSize - confirm the int range is sufficient
+
+
+ private final HashMap<String, Long> frequencies; // bound from "document-frequencies": string key -> long count
+
+ @JsonCreator
+ public DocumentFrequencyFile(
+ @JsonProperty("description") String description,
+ @JsonProperty("document-count") int documentCount,
+ @JsonProperty("document-frequencies") HashMap<String, Long> frequencies) {
+ this.description = description;
+ this.documentCount = documentCount;
+ this.frequencies = frequencies; // the passed map is stored as-is, not copied
+ }
+
+ @JsonProperty("description")
+ public String description() { return description; }
+
+ @JsonProperty("document-count")
+ public int documentCount() { return documentCount; }
+
+ @JsonProperty("document-frequencies")
+ public HashMap<String, Long> frequencies() { return frequencies; } // returns the internal map itself
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
new file mode 100644
index 00000000000..902613379f0
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
@@ -0,0 +1,48 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.HashMap;
+import java.util.List;
+
+/**
+ * Immutable, Jackson-mapped holder for a whole significance model file.
+ * It binds the JSON properties "version", "id", "description" and
+ * "languages"; any other property in the input is ignored
+ * (ignoreUnknown = true). "languages" maps a string key to the
+ * DocumentFrequencyFile for that entry.
+ *
+ * @author MariusArhaug
+ */
+
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class SignificanceModelFile {
+ private final String version;
+ private final String id;
+ private final String description;
+
+ private final HashMap<String, DocumentFrequencyFile> languages; // bound from "languages": string key -> per-entry frequency data
+
+ @JsonCreator
+ public SignificanceModelFile(
+ @JsonProperty("version") String version,
+ @JsonProperty("id") String id,
+ @JsonProperty("description") String description,
+ @JsonProperty("languages") HashMap<String, DocumentFrequencyFile> languages) {
+ this.version = version;
+ this.id = id;
+ this.description = description;
+ this.languages = languages; // the passed map is stored as-is, not copied
+ }
+
+ @JsonProperty("version")
+ public String version() { return version; }
+
+ @JsonProperty("id")
+ public String id() { return id; }
+
+ @JsonProperty("description")
+ public String description() { return description; }
+
+ @JsonProperty("languages")
+ public HashMap<String, DocumentFrequencyFile> languages() { return languages; } // returns the internal map itself
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
index d4849571b5e..e8594885b9e 100644
--- a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
@@ -6,7 +6,8 @@ import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry;
import org.junit.Test;
import java.nio.file.Path;
-import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
@@ -18,10 +19,10 @@ public class DefaultSignificanceModelRegistryTest {
@Test
public void testDefaultSignificanceModelRegistry() {
- HashMap<Language, Path> models = new HashMap<>();
+ List<Path> models = new ArrayList<>();
- models.put(Language.ENGLISH, Path.of("src/test/models/en.json"));
- models.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json"));
+ models.add(Path.of("src/test/models/docv1.json"));
+ models.add(Path.of("src/test/models/docv2.json"));
DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models);
@@ -39,6 +40,45 @@ public class DefaultSignificanceModelRegistryTest {
assertNotNull(englishModel);
assertNotNull(norwegianModel);
+ assertEquals("test::2", englishModel.getId());
+ assertEquals("test::2", norwegianModel.getId());
+
+ assertEquals(4, englishModel.documentFrequency("test").frequency());
+ assertEquals(14, englishModel.documentFrequency("test").corpusSize());
+
+ assertEquals(3, norwegianModel.documentFrequency("nei").frequency());
+ assertEquals(20, norwegianModel.documentFrequency("nei").corpusSize());
+
+ assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency());
+ assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize());
+
+ }
+
+ @Test
+ public void testDefaultSignificanceModelRegistryInOppsiteOrder() {
+
+ List<Path> models = new ArrayList<>();
+
+ models.add(Path.of("src/test/models/docv2.json"));
+ models.add(Path.of("src/test/models/docv1.json"));
+
+ DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models);
+
+ var optionalEnglishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH);
+ var optionalNorwegianModel = defaultSignificanceModelRegistry.getModel(Language.NORWEGIAN_BOKMAL);
+
+ assertTrue(optionalEnglishModel.isPresent());
+ assertTrue(optionalNorwegianModel.isPresent());
+
+ var englishModel = optionalEnglishModel.get();
+ var norwegianModel = optionalNorwegianModel.get();
+
+ assertNotNull(englishModel);
+ assertNotNull(norwegianModel);
+
+ assertEquals("test::1", englishModel.getId());
+ assertEquals("test::2", norwegianModel.getId());
+
assertEquals(2, englishModel.documentFrequency("test").frequency());
assertEquals(10, englishModel.documentFrequency("test").corpusSize());
@@ -47,6 +87,5 @@ public class DefaultSignificanceModelRegistryTest {
assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency());
assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize());
-
}
}
diff --git a/linguistics/src/test/models/docv1.json b/linguistics/src/test/models/docv1.json
new file mode 100644
index 00000000000..04010959a58
--- /dev/null
+++ b/linguistics/src/test/models/docv1.json
@@ -0,0 +1,18 @@
+{
+ "version" : "1.0",
+ "id" : "test::1",
+ "description" : "desc",
+ "languages" : {
+ "en": {
+ "description" : "english model",
+ "document-count" : 10,
+ "language" : "en",
+ "document-frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 2
+ }
+ }
+ }
+}
diff --git a/linguistics/src/test/models/docv2.json b/linguistics/src/test/models/docv2.json
new file mode 100644
index 00000000000..c00d02fb744
--- /dev/null
+++ b/linguistics/src/test/models/docv2.json
@@ -0,0 +1,31 @@
+{
+ "version" : "2.0",
+ "id" : "test::2",
+ "description" : "desc",
+ "languages" : {
+ "en": {
+ "description" : "english model",
+ "document-count" : 14,
+ "document-frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 4,
+ "additional": 2
+ }
+ },
+ "nb": {
+ "description" : "norwegian model",
+ "document-count" : 20,
+ "document-frequencies" : {
+ "usa" : 2,
+ "hello": 10,
+ "verden": 5,
+ "test": 2,
+ "norge": 11,
+ "ja": 12,
+ "nei": 3
+ }
+ }
+ }
+}
diff --git a/linguistics/src/test/models/en.json b/linguistics/src/test/models/en.json
index 50bae5e3451..87b7b2faa08 100644
--- a/linguistics/src/test/models/en.json
+++ b/linguistics/src/test/models/en.json
@@ -1,11 +1,11 @@
{
"version" : "1.0",
"id" : "test::1",
- "description" : "desc",
- "corpus-size" : 10,
+ "description" : "english model",
+ "document-count" : 10,
"language" : "en",
"word-count" : 4,
- "frequencies" : {
+ "document-frequencies" : {
"usa" : 2,
"hello": 3,
"world": 5,
diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json
deleted file mode 100644
index 5fca8929e74..00000000000
--- a/linguistics/src/test/models/no.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
- "version" : "1.0",
- "id" : "test::2",
- "description" : "norsk beskrivelse",
- "corpus-size" : 20,
- "language" : "nb",
- "word-count" : 7,
- "frequencies" : {
- "usa" : 2,
- "hello": 10,
- "verden": 5,
- "test": 2,
- "norge": 11,
- "ja": 12,
- "nei": 3
- }
-}
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index b1b2235165f..089f5e2e239 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -329,6 +329,10 @@ public:
static search::tensor::DistanceFunctionFactory::UP my_dist_fun = search::tensor::make_distance_function_factory(search::attribute::DistanceMetric::Euclidean, vespalib::eval::CellType::DOUBLE);
return *my_dist_fun;
}
+
+ uint32_t check_consistency(uint32_t) const noexcept override { // stub override used by this test; the docid limit argument is ignored
+ return 0; // always reports zero inconsistencies
+ }
};
class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory {
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
index d50677314df..a1cf86c95cc 100644
--- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
@@ -936,6 +936,36 @@ TYPED_TEST(HnswIndexTest, search_during_remove)
this->expect_top_3_by_docid("{0, 0}", {0, 0}, {7});
}
+TYPED_TEST(HnswIndexTest, inconsistent_index) // covers an index that still has a node for a docid whose vector was cleared
+{
+ this->init(false);
+ this->vectors.clear();
+ this->vectors.set(1, {1, 3}).set(2, {7, 1}).set(3, {6, 5}).set(4, {8, 3}).set(5, {10, 3});
+ this->add_document(1);
+ this->add_document(2);
+ this->add_document(3);
+ this->add_document(4);
+ this->add_document(5);
+ this->expect_entry_point(1, 0);
+ this->expect_level_0(1, {2, 3});
+ this->expect_level_0(2, {1, 3, 4, 5});
+ this->expect_level_0(3, {1, 2, 4});
+ this->expect_level_0(4, {2, 3, 5});
+ this->expect_level_0(5, {2, 4});
+ EXPECT_EQ(0, this->index->check_consistency(6)); // docids 1..5 were all set and added, so nothing is reported
+ // Remove vector for docid 5 but don't update index.
+ this->vectors.clear(5);
+ EXPECT_EQ(1, this->index->check_consistency(6)); // docid 5 is now the single mismatch
+ /*
+ * Removing document 2 causes mutual reconnect for nodes [1, 3, 4, 5]
+ * where nodes 1 and 5 are not previously connected. Distance from
+ * node 1 to node 5 cannot be calculated due to missing vector.
+ */
+ this->remove_document(2);
+ // No reconnect for node without vector
+ this->expect_level_0(5, {4}); // was {2, 4} before the removal: the link to 2 is gone and no new link was added
+}
+
using HnswMultiIndexTest = HnswIndexTest<HnswIndex<HnswIndexType::MULTI>>;
namespace {
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 322965ca06a..b542c422f50 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -679,7 +679,10 @@ HnswIndex<type>::mutual_reconnect(const LinkArrayRef &cluster, uint32_t level)
for (uint32_t j = i + 1; j < cluster.size(); ++j) {
uint32_t n_id_2 = cluster[j];
if ( ! has_link_to(n_list_1, n_id_2)) {
- pairs.emplace_back(n_id_1, n_id_2, calc_distance(*df, n_id_2));
+ auto n_cells_2 = get_vector(n_id_2);
+ if (!n_cells_2.non_existing_attribute_value()) {
+ pairs.emplace_back(n_id_1, n_id_2, df->calc(n_cells_2));
+ }
}
}
}
@@ -1120,6 +1123,32 @@ HnswIndex<type>::count_reachable_nodes() const
return {found_cnt, true};
}
+template <HnswIndexType type>
+uint32_t
+HnswIndex<type>::get_subspaces(uint32_t docid) const noexcept // subspace count the index side holds for docid; compared against the vector store by check_consistency()
+{
+ if constexpr (type == HnswIndexType::SINGLE) {
+ return (docid < _graph.nodes.get_size() && _graph.nodes.get_elem_ref(docid).levels_ref().load_relaxed().valid()) ? 1 : 0; // 1 only when docid is inside the node array and its levels ref is valid, else 0
+ } else {
+ return _id_mapping.get_ids(docid).size(); // number of ids the id mapping holds for docid (presumably one node id per subspace - confirm)
+ }
+}
+
+template <HnswIndexType type>
+uint32_t
+HnswIndex<type>::check_consistency(uint32_t docid_limit) const noexcept // returns how many docids have a different subspace count in the index than in the vector store
+{
+ uint32_t inconsistencies = 0;
+ for (uint32_t docid = 1; docid < docid_limit; ++docid) { // scans docids 1 .. docid_limit - 1; docid 0 is not checked
+ auto index_subspaces = get_subspaces(docid);
+ auto store_subspaces = get_vectors(docid).subspaces();
+ if (index_subspaces != store_subspaces) {
+ ++inconsistencies; // counted once per docid, regardless of how far the two counts differ
+ }
+ }
+ return inconsistencies;
+}
+
template class HnswIndex<HnswIndexType::SINGLE>;
template class HnswIndex<HnswIndexType::MULTI>;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index 616140f426f..4d4440c1bcb 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -193,6 +193,9 @@ protected:
LinkArray filter_valid_nodeids(uint32_t level, const internal::PreparedAddNode::Links &neighbors, uint32_t self_nodeid);
void internal_complete_add(uint32_t docid, internal::PreparedAddDoc &op);
void internal_complete_add_node(uint32_t nodeid, uint32_t docid, uint32_t subspace, internal::PreparedAddNode &prepared_node);
+
+ // Called from writer only.
+ uint32_t get_subspaces(uint32_t docid) const noexcept;
public:
HnswIndex(const DocVectorAccess& vectors, DistanceFunctionFactory::UP distance_ff,
RandomLevelGenerator::UP level_generator, const HnswIndexConfig& cfg);
@@ -248,6 +251,9 @@ public:
uint32_t get_active_nodes() const noexcept { return _graph.get_active_nodes(); }
+ // Called from writer only.
+ uint32_t check_consistency(uint32_t docid_limit) const noexcept override;
+
// Should only be used by unit tests.
HnswTestNode get_node(uint32_t nodeid) const;
void set_node(uint32_t nodeid, const HnswTestNode &node);
diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
index 8462ff05eca..c2bbd17ce63 100644
--- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
@@ -114,6 +114,12 @@ public:
double distance_threshold) const = 0;
virtual DistanceFunctionFactory &distance_function_factory() const = 0;
+
+ /*
+ * Used when checking consistency during load.
+ * Called from writer only.
+ * The HnswIndex implementation returns the number of docids in
+ * [1, docid_limit) whose subspace count differs between the index
+ * and the vector store.
+ */
+ virtual uint32_t check_consistency(uint32_t docid_limit) const noexcept = 0;
};
}
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
index 28c4099c38b..223c9d7d1f2 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
@@ -322,6 +322,9 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor)
if (!load_index()) {
return false;
}
+ if (dense_store == nullptr) {
+ check_consistency(docid_limit);
+ }
} else {
build_index(executor, docid_limit);
}
@@ -329,4 +332,15 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor)
return true;
}
+void
+TensorAttributeLoader::check_consistency(uint32_t docid_limit) // runs the index consistency check, times it, and logs the result
+{
+ auto before = vespalib::steady_clock::now();
+ uint32_t inconsistencies = _index->check_consistency(docid_limit);
+ auto after = vespalib::steady_clock::now();
+ double elapsed = vespalib::to_s(after - before); // duration of the check in seconds, as printed with the "s" suffix below
+ LOG(info, "%u inconsistencies detected after loading index for attribute %s, (check used %6.3fs)",
+ inconsistencies, _attr.getName().c_str(), elapsed); // the count is only logged here (also when it is zero); nothing is returned to the caller
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
index 6bf68957adc..59baaf0b6dc 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
@@ -34,6 +34,7 @@ class TensorAttributeLoader {
void load_tensor_store(search::attribute::BlobSequenceReader& reader, uint32_t docid_limit);
void build_index(vespalib::Executor* executor, uint32_t docid_limit);
bool load_index();
+ void check_consistency(uint32_t docid_limit);
public:
TensorAttributeLoader(TensorAttribute& attr, GenerationHandler& generation_handler, RefVector& ref_vector, TensorStore& store, NearestNeighborIndex* index);