diff options
39 files changed, 487 insertions, 157 deletions
diff --git a/client/go/internal/osutil/run_cmd.go b/client/go/internal/osutil/run_cmd.go index 3847dcc912a..ab7bd1069c3 100644 --- a/client/go/internal/osutil/run_cmd.go +++ b/client/go/internal/osutil/run_cmd.go @@ -38,7 +38,7 @@ func analyzeError(err error) string { msg := "died with signal: " + status.Signal().String() switch status.Signal() { case syscall.SIGILL: - msg = msg + " (you probably have an older CPU than required)" + msg = msg + " (you probably have an older CPU than required, see https://docs.vespa.ai/en/cpu-support.html)" } return msg } diff --git a/config-model/src/main/java/com/yahoo/schema/RankProfile.java b/config-model/src/main/java/com/yahoo/schema/RankProfile.java index 82ed45028b3..cdefbbf8174 100644 --- a/config-model/src/main/java/com/yahoo/schema/RankProfile.java +++ b/config-model/src/main/java/com/yahoo/schema/RankProfile.java @@ -141,6 +141,8 @@ public class RankProfile implements Cloneable { private Boolean strict; + private Boolean useSignificanceModel; + private final ApplicationPackage applicationPackage; private final DeployLogger deployLogger; @@ -216,6 +218,15 @@ public class RankProfile implements Cloneable { this.strict = strict; } + public void setUseSignificanceModel(Boolean useSignificanceModel) { + this.useSignificanceModel = useSignificanceModel; + } + + public Boolean useSignificanceModel() { + if (useSignificanceModel != null) return useSignificanceModel; + return uniquelyInherited(p -> p.useSignificanceModel(), "use-model").orElse(null); + } + /** * Adds a profile to those inherited by this. * The profile must belong to this schema (directly or by inheritance). diff --git a/config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java b/config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java index b057624f055..42586fa7d75 100644 --- a/config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java +++ b/config-model/src/main/java/com/yahoo/schema/derived/RawRankProfile.java @@ -186,6 +186,7 @@ public class RawRankProfile { private RankingExpression globalPhaseRanking; private final int globalPhaseRerankCount; private final SerializationContext functionSerializationContext; + private Boolean useSignificanceModel; /** * Creates a raw rank profile from the given rank profile @@ -221,6 +222,7 @@ public class RawRankProfile { rankScoreDropLimit = compiled.getRankScoreDropLimit(); ignoreDefaultRankFeatures = compiled.getIgnoreDefaultRankFeatures(); rankProperties = new ArrayList<>(compiled.getRankProperties()); + useSignificanceModel = compiled.useSignificanceModel(); Map<String, RankProfile.RankingExpressionFunction> functions = compiled.getFunctions(); List<ExpressionFunction> functionExpressions = functions.values().stream().map(RankProfile.RankingExpressionFunction::function).toList(); @@ -479,6 +481,9 @@ public class RawRankProfile { if (targetHitsMaxAdjustmentFactor.isPresent()) { properties.add(new Pair<>("vespa.matching.nns.target_hits_max_adjustment_factor", String.valueOf(targetHitsMaxAdjustmentFactor.getAsDouble()))); } + if (useSignificanceModel != null) { + properties.add(new Pair<>("vespa.significance.use_model", String.valueOf(useSignificanceModel))); + } if (matchPhaseSettings != null) { properties.add(new Pair<>("vespa.matchphase.degradation.attribute", matchPhaseSettings.getAttribute())); properties.add(new Pair<>("vespa.matchphase.degradation.ascendingorder", matchPhaseSettings.getAscending() + "")); diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java index 5ccbb7b19a4..77a10862f9c 100644 --- a/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java +++ b/config-model/src/main/java/com/yahoo/schema/parser/ConvertParsedRanking.java @@ -39,6 +39,7 @@ public class ConvertParsedRanking { profile.inherit(name); parsed.isStrict().ifPresent(value -> profile.setStrict(value)); + parsed.isUseSignificanceModel().ifPresent(value -> profile.setUseSignificanceModel(value)); for (var constant : parsed.getConstants().values()) profile.add(constant); diff --git a/config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java b/config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java index fbbb0c7fe83..93319e82076 100644 --- a/config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java +++ b/config-model/src/main/java/com/yahoo/schema/parser/ParsedRankProfile.java @@ -44,6 +44,7 @@ class ParsedRankProfile extends ParsedBlock { private String inheritedMatchFeatures = null; private String secondPhaseExpression = null; private Boolean strict = null; + private Boolean useSignificanceModel = null; private final List<MutateOperation> mutateOperations = new ArrayList<>(); private final List<String> inherited = new ArrayList<>(); private final Map<String, Boolean> fieldsRankFilter = new LinkedHashMap<>(); @@ -96,6 +97,8 @@ class ParsedRankProfile extends ParsedBlock { Optional<String> getSecondPhaseExpression() { return Optional.ofNullable(this.secondPhaseExpression); } Optional<Boolean> isStrict() { return Optional.ofNullable(this.strict); } + Optional<Boolean> isUseSignificanceModel() { return Optional.ofNullable(this.useSignificanceModel); } + void addSummaryFeatures(FeatureList features) { this.summaryFeatures.add(features); } void addMatchFeatures(FeatureList features) { this.matchFeatures.add(features); } void addRankFeatures(FeatureList features) { this.rankFeatures.add(features); } @@ -218,6 +221,10 @@ class ParsedRankProfile extends ParsedBlock { this.strict = strict; } + void setUseSignificanceModel(boolean useSignificanceModel) { + verifyThat(this.useSignificanceModel == null, "already has use-model"); + this.useSignificanceModel = useSignificanceModel; + } void setTermwiseLimit(double limit) { verifyThat(termwiseLimit == null, "already has termwise-limit"); this.termwiseLimit = limit; diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java b/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java index c210c2621a6..693eebd75a8 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java @@ -33,17 +33,15 @@ public class SignificanceModelRegistry extends SimpleComponent implements Signif if (spec != null) { for (Element modelElement : XML.getChildren(spec, "model")) { - addConfig( - modelElement.getAttribute("language"), - Model.fromXml(deployState, modelElement, Set.of(SIGNIFICANCE_MODEL)).modelReference()); + addConfig(Model.fromXml(deployState, modelElement, Set.of(SIGNIFICANCE_MODEL)).modelReference()); } } } - public void addConfig(String language, ModelReference path) { + public void addConfig(ModelReference path) { configList.add( - new SignificanceModelConfig(language, path) + new SignificanceModelConfig(path) ); } @@ -53,19 +51,16 @@ public class SignificanceModelRegistry extends SimpleComponent implements Signif builder.model( configList.stream() .map(config -> new SignificanceConfig.Model.Builder() - .language(config.language) .path(config.path) ).toList() ); } - class SignificanceModelConfig { - private final String language; + static class SignificanceModelConfig { private final ModelReference path; - public SignificanceModelConfig(String language, ModelReference path) { - this.language = language; + public SignificanceModelConfig(ModelReference path) { this.path = path; } diff --git a/config-model/src/main/javacc/SchemaParser.jj b/config-model/src/main/javacc/SchemaParser.jj index 255cc3cde70..b40f2d0796d 100644 --- a/config-model/src/main/javacc/SchemaParser.jj +++ b/config-model/src/main/javacc/SchemaParser.jj @@ -188,6 +188,8 @@ TOKEN : | < SUFFIX: "suffix" > | < CONSTANT: "constant"> | < ONNX_MODEL: "onnx-model"> +| < SIGNIFICANCE: "significance"> +| < USE_MODEL: "use-model"> | < INTRAOP_THREADS: "intraop-threads"> | < INTEROP_THREADS: "interop-threads"> | < GPU_DEVICE: "gpu-device"> @@ -1761,7 +1763,8 @@ void rankProfileItem(ParsedSchema schema, ParsedRankProfile profile) : { } | matchFeatures(profile) | summaryFeatures(profile) | onnxModelInProfile(profile) - | strict(profile) ) + | strict(profile) + | significance(profile)) } /** @@ -2115,6 +2118,22 @@ void strict(ParsedRankProfile profile) : ) } +void significance(ParsedRankProfile profile) : +{} +{ + <SIGNIFICANCE> lbrace() (significanceItem(profile) (<NL>)*)* <RBRACE> + {} +} + +void significanceItem(ParsedRankProfile profile) : +{} +{ + <USE_MODEL> <COLON> ( + ( <TRUE> { profile.setUseSignificanceModel(true); } ) | + ( <FALSE> { profile.setUseSignificanceModel(false); } ) + ) +} + /** * Consumes a match-features block of a rank profile. * @@ -2710,6 +2729,7 @@ String identifierWithDash() : | <TARGET_HITS_MAX_ADJUSTMENT_FACTOR> | <TERMWISE_LIMIT> | <UPPER_BOUND> + | <USE_MODEL> ) { return token.image; } } @@ -2812,6 +2832,7 @@ String identifier() : { } | <STEMMING> | <STRENGTH> | <STRICT> + | <SIGNIFICANCE> | <STRING> | <STRUCT> | <SUBSTRING> diff --git a/config-model/src/main/resources/schema/containercluster.rnc b/config-model/src/main/resources/schema/containercluster.rnc index 08092f10020..c79a7b38d09 100644 --- a/config-model/src/main/resources/schema/containercluster.rnc +++ b/config-model/src/main/resources/schema/containercluster.rnc @@ -138,7 +138,7 @@ Threadpool = element threadpool { } Significance = element significance { - element model { attribute language { xsd:string } & ModelReference }* + element model { ModelReference }* } Clients = element clients { diff --git a/config-model/src/test/cfg/significance/services.xml b/config-model/src/test/cfg/significance/services.xml index 6991f5498fb..ffdb73bfc2e 100644 --- a/config-model/src/test/cfg/significance/services.xml +++ b/config-model/src/test/cfg/significance/services.xml @@ -8,9 +8,9 @@ <container version="1.0"> <search> <significance> - <model language="en" model-id="idf-wiki-english" path="models/idf-english-wiki.json.zst"/> - <model language="no" path="models/idf-norwegian-wiki.json.zst" /> - <model language="ru" url="https://some/uri/blob.json" /> + <model model-id="idf-wiki-english" path="models/idf-english-wiki.json.zst"/> + <model path="models/idf-norwegian-wiki.json.zst" /> + <model url="https://some/uri/blob.json" /> </significance> </search> </container> diff --git a/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java b/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java index 5a2dc218da7..34ca6c30a61 100644 --- a/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java +++ b/config-model/src/test/java/com/yahoo/schema/parser/SchemaParserTestCase.java @@ -121,6 +121,39 @@ public class SchemaParserTestCase { } @Test + void significance_can_be_parsed() throws Exception { + String input = """ + schema foo { + rank-profile significance-ranking-0 inherits default { + significance { + use-model: true + } + } + rank-profile significance-ranking-1 { + significance { + use-model: false + } + } + } + """; + + ParsedSchema schema = parseString(input); + assertEquals("foo", schema.name()); + var rplist = schema.getRankProfiles(); + assertEquals(2, rplist.size()); + + var rp0 = rplist.get(0); + assertEquals("significance-ranking-0", rp0.name()); + assertTrue(rp0.isUseSignificanceModel().isPresent()); + assertTrue(rp0.isUseSignificanceModel().get()); + + var rp1 = rplist.get(1); + assertEquals("significance-ranking-1", rp1.name()); + assertTrue(rp1.isUseSignificanceModel().isPresent()); + assertFalse(rp1.isUseSignificanceModel().get()); + } + + @Test void maxOccurrencesCanBeParsed() throws Exception { String input = joinLines ("schema foo {", diff --git a/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java b/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java index 00e95a34287..26e8c67a226 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java @@ -37,9 +37,6 @@ public class SignificanceModelTestCase { ApplicationContainerCluster containerCluster = vespaModel.getContainerClusters().get("container"); var significanceConfig = assertSignificancePresent(containerCluster); assertEquals(3, significanceConfig.model().size()); - assertEquals("en", significanceConfig.model().get(0).language()); - assertEquals("no", significanceConfig.model().get(1).language()); - assertEquals("ru", significanceConfig.model().get(2).language()); assertEquals("models/idf-norwegian-wiki.json.zst", modelReference(significanceConfig.model().get(1), "path").path().orElseThrow().value()); assertEquals("https://some/uri/blob.json", modelReference(significanceConfig.model().get(2), "path").url().orElseThrow().value()); diff --git a/config-model/src/test/schema-test-files/services.xml b/config-model/src/test/schema-test-files/services.xml index 7333ef5a87b..a413ec7753b 100644 --- a/config-model/src/test/schema-test-files/services.xml +++ b/config-model/src/test/schema-test-files/services.xml @@ -168,7 +168,7 @@ </threadpool> <significance> - <model language="en" model-id="idf-wiki-simple-english" path="models/idf-simple-english-wiki.json.zst" /> + <model model-id="idf-wiki-simple-english" path="models/idf-simple-english-wiki.json.zst" /> </significance> </search> diff --git a/configdefinitions/src/vespa/significance.def b/configdefinitions/src/vespa/significance.def index e0cc5b4c611..8d40381a0c9 100644 --- a/configdefinitions/src/vespa/significance.def +++ b/configdefinitions/src/vespa/significance.def @@ -1,6 +1,4 @@ # Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. namespace=search.significance.config -model[].language string model[].path model - diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index 5e66e1bb746..d85f1844b18 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -5499,6 +5499,8 @@ "public com.yahoo.search.query.ranking.RankProperties getProperties()", "public void setListFeatures(boolean)", "public boolean getListFeatures()", + "public void setUseSignificance(boolean)", + "public boolean getUseSignificance()", "public com.yahoo.search.query.ranking.MatchPhase getMatchPhase()", "public com.yahoo.search.query.ranking.GlobalPhase getGlobalPhase()", "public com.yahoo.search.query.ranking.Matching getMatching()", diff --git a/container-search/src/main/java/com/yahoo/search/query/Ranking.java b/container-search/src/main/java/com/yahoo/search/query/Ranking.java index b1dd5624d18..09de1a24ef9 100644 --- a/container-search/src/main/java/com/yahoo/search/query/Ranking.java +++ b/container-search/src/main/java/com/yahoo/search/query/Ranking.java @@ -113,6 +113,8 @@ public class Ranking implements Cloneable { private SoftTimeout softTimeout = new SoftTimeout(); + private boolean useSignificance = false; + public Ranking(Query parent) { this.parent = parent; this.rankFeatures = new RankFeatures(this); @@ -217,6 +219,14 @@ public class Ranking implements Cloneable { /** Returns whether rank features should be dumped with the result of this query, default false */ public boolean getListFeatures() { return listFeatures; } + /** Set whether to use significance in ranking */ + @com.yahoo.api.annotations.Beta + public void setUseSignificance(boolean useSignificance) { this.useSignificance = useSignificance; } + + /** Returns whether to use significance in ranking */ + @com.yahoo.api.annotations.Beta + public boolean getUseSignificance() { return useSignificance; } + /** Returns the match phase rank settings of this. This is never null. */ public MatchPhase getMatchPhase() { return matchPhase; } diff --git a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java index 0a42bf8a259..6cef576f967 100644 --- a/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java +++ b/container-search/src/main/java/com/yahoo/search/significance/SignificanceSearcher.java @@ -14,8 +14,11 @@ import com.yahoo.prelude.query.WordItem; import com.yahoo.search.Query; import com.yahoo.search.Result; import com.yahoo.search.Searcher; +import com.yahoo.search.query.Ranking; import com.yahoo.search.searchchain.Execution; +import com.yahoo.vespa.config.search.RankProfilesConfig; +import java.util.HashMap; import java.util.Optional; import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING; @@ -32,15 +35,30 @@ public class SignificanceSearcher extends Searcher { public final static String SIGNIFICANCE = "Significance"; private final SignificanceModelRegistry significanceModelRegistry; + private final RankProfilesConfig rankProfilesConfig; + + private final HashMap<String, Boolean> useModel = new HashMap<>(); @Inject - public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry) { + public SignificanceSearcher(SignificanceModelRegistry significanceModelRegistry, RankProfilesConfig rankProfilesConfig) { this.significanceModelRegistry = significanceModelRegistry; + this.rankProfilesConfig = rankProfilesConfig; + + for (RankProfilesConfig.Rankprofile profile : rankProfilesConfig.rankprofile()) { + for (RankProfilesConfig.Rankprofile.Fef.Property property : profile.fef().property()) { + if (property.name().equals("vespa.significance.use_model")) { + useModel.put(profile.name(), Boolean.parseBoolean(property.value())); + } + } + } } @Override public Result search(Query query, Execution execution) { + Ranking ranking = query.getRanking(); + if (!useModel.containsKey(ranking.getProfile()) || !useModel.get(ranking.getProfile())) return execution.search(query); + Language language = query.getModel().getParsingLanguage(); Optional<SignificanceModel> model = significanceModelRegistry.getModel(language); diff --git a/container-search/src/test/java/com/yahoo/search/significance/model/en.json b/container-search/src/test/java/com/yahoo/search/significance/model/en.json index 50bae5e3451..04010959a58 100644 --- a/container-search/src/test/java/com/yahoo/search/significance/model/en.json +++ b/container-search/src/test/java/com/yahoo/search/significance/model/en.json @@ -2,13 +2,17 @@ "version" : "1.0", "id" : "test::1", "description" : "desc", - "corpus-size" : 10, - "language" : "en", - "word-count" : 4, - "frequencies" : { - "usa" : 2, - "hello": 3, - "world": 5, - "test": 2 + "languages" : { + "en": { + "description" : "english model", + "document-count" : 10, + "language" : "en", + "document-frequencies" : { + "usa" : 2, + "hello": 3, + "world": 5, + "test": 2 + } + } } } diff --git a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java index 890db3abb51..ed67798b4b1 100644 --- a/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java +++ b/container-search/src/test/java/com/yahoo/search/significance/test/SignificanceSearcherTest.java @@ -2,6 +2,7 @@ package com.yahoo.search.significance.test; import com.yahoo.component.chain.Chain; +import com.yahoo.config.subscription.ConfigGetter; import com.yahoo.language.Language; import com.yahoo.language.significance.SignificanceModel; import com.yahoo.language.significance.SignificanceModelRegistry; @@ -12,10 +13,13 @@ import com.yahoo.search.Query; import com.yahoo.search.Result; import com.yahoo.search.searchchain.Execution; import com.yahoo.search.significance.SignificanceSearcher; +import com.yahoo.vespa.config.search.RankProfilesConfig; import org.junit.jupiter.api.Test; import java.nio.file.Path; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import static com.yahoo.test.JunitCompat.assertEquals; @@ -29,12 +33,24 @@ public class SignificanceSearcherTest { SignificanceModelRegistry significanceModelRegistry; SignificanceSearcher searcher; + private static final String CONFIG_DIR = "src/test/resources/config/"; + public SignificanceSearcherTest() { - HashMap<Language, Path> map = new HashMap<>(); - map.put(Language.ENGLISH, Path.of("src/test/java/com/yahoo/search/significance/model/en.json")); + List<Path> models = new ArrayList<>(); + models.add( Path.of("src/test/java/com/yahoo/search/significance/model/en.json")); + + RankProfilesConfig rpCfg = readConfig("with_significance"); + + assertEquals(1, rpCfg.rankprofile().size()); + + significanceModelRegistry = new DefaultSignificanceModelRegistry(models); + searcher = new SignificanceSearcher(significanceModelRegistry, rpCfg); + } - significanceModelRegistry = new DefaultSignificanceModelRegistry(map); - searcher = new SignificanceSearcher(significanceModelRegistry); + @SuppressWarnings("deprecation") + private RankProfilesConfig readConfig(String subDir) { + String cfgId = "file:" + CONFIG_DIR + subDir + "/rank-profiles.cfg"; + return ConfigGetter.getConfig(RankProfilesConfig.class, cfgId); } private Execution createExecution(SignificanceSearcher searcher) { @@ -49,6 +65,7 @@ public class SignificanceSearcherTest { void testSignificanceValueOnSimpleQuery() { Query q = new Query(); + q.getRanking().setProfile("significance-ranking"); AndItem root = new AndItem(); WordItem tmp; tmp = new WordItem("Hello", true); @@ -79,6 +96,7 @@ public class SignificanceSearcherTest { @Test void testSignificanceValueOnRecursiveQuery() { Query q = new Query(); + q.getRanking().setProfile("significance-ranking"); AndItem root = new AndItem(); WordItem child1 = new WordItem("hello", true); diff --git a/container-search/src/test/resources/config/with_significance/rank-profiles.cfg b/container-search/src/test/resources/config/with_significance/rank-profiles.cfg new file mode 100644 index 00000000000..1dc1be62862 --- /dev/null +++ b/container-search/src/test/resources/config/with_significance/rank-profiles.cfg @@ -0,0 +1,3 @@ +rankprofile[0].name "significance-ranking" +rankprofile[0].fef.property[0].name "vespa.significance.use_model" +rankprofile[0].fef.property[0].value "true"
\ No newline at end of file diff --git a/dependency-versions/pom.xml b/dependency-versions/pom.xml index fd80b4c79d4..65aa5a76795 100644 --- a/dependency-versions/pom.xml +++ b/dependency-versions/pom.xml @@ -68,7 +68,7 @@ <assertj.vespa.version>3.25.3</assertj.vespa.version> <!-- Athenz dependencies. Make sure these dependencies match those in Vespa's internal repositories --> - <aws-sdk.vespa.version>1.12.710</aws-sdk.vespa.version> + <aws-sdk.vespa.version>1.12.711</aws-sdk.vespa.version> <athenz.vespa.version>1.11.57</athenz.vespa.version> <!-- Athenz END --> @@ -171,17 +171,17 @@ <maven-compiler-plugin.vespa.version>3.13.0</maven-compiler-plugin.vespa.version> <maven-core.vespa.version>3.9.6</maven-core.vespa.version> <maven-dependency-plugin.vespa.version>3.6.1</maven-dependency-plugin.vespa.version> - <maven-deploy-plugin.vespa.version>3.1.1</maven-deploy-plugin.vespa.version> + <maven-deploy-plugin.vespa.version>3.1.2</maven-deploy-plugin.vespa.version> <maven-enforcer-plugin.vespa.version>3.4.1</maven-enforcer-plugin.vespa.version> <maven-failsafe-plugin.vespa.version>3.2.5</maven-failsafe-plugin.vespa.version> <maven-gpg-plugin.vespa.version>3.2.4</maven-gpg-plugin.vespa.version> - <maven-install-plugin.vespa.version>3.1.1</maven-install-plugin.vespa.version> + <maven-install-plugin.vespa.version>3.1.2</maven-install-plugin.vespa.version> <maven-jar-plugin.vespa.version>3.4.1</maven-jar-plugin.vespa.version> <maven-javadoc-plugin.vespa.version>3.6.3</maven-javadoc-plugin.vespa.version> <maven-plugin-api.vespa.version>${maven-core.vespa.version}</maven-plugin-api.vespa.version> <maven-plugin-tools.vespa.version>3.12.0</maven-plugin-tools.vespa.version> <maven-resources-plugin.vespa.version>3.3.1</maven-resources-plugin.vespa.version> - <maven-resolver.vespa.version>1.9.19</maven-resolver.vespa.version> + <maven-resolver.vespa.version>1.9.20</maven-resolver.vespa.version> <maven-shade-plugin.vespa.version>3.5.3</maven-shade-plugin.vespa.version> <maven-site-plugin.vespa.version>3.12.1</maven-site-plugin.vespa.version> <maven-source-plugin.vespa.version>3.3.1</maven-source-plugin.vespa.version> diff --git a/dist/vespa.spec b/dist/vespa.spec index fa20df04efb..d7f5fe12a74 100644 --- a/dist/vespa.spec +++ b/dist/vespa.spec @@ -205,7 +205,7 @@ Requires: vespa-protobuf = %{_vespa_protobuf_version} Requires: llvm-libs %endif Requires: vespa-onnxruntime = 1.17.3 -Requires: vespa-jllama = 3.0.1-2%{?dist} +Requires: vespa-jllama >= 3.0.1 %description libs diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index 1ca32a2dd37..ceab5025760 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -803,7 +803,8 @@ "abstract" ], "methods" : [ - "public abstract com.yahoo.language.significance.DocumentFrequency documentFrequency(java.lang.String)" + "public abstract com.yahoo.language.significance.DocumentFrequency documentFrequency(java.lang.String)", + "public abstract java.lang.String getId()" ], "fields" : [ ] }, diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java index a9f1e48af62..c8a31e1892c 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java @@ -9,4 +9,6 @@ import com.yahoo.api.annotations.Beta; @Beta public interface SignificanceModel { DocumentFrequency documentFrequency(String word); + + String getId(); } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java index 7ed6f442610..3244b8373ad 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java @@ -1,13 +1,11 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.significance.impl; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.language.significance.DocumentFrequency; import com.yahoo.language.significance.SignificanceModel; +import java.io.IOException; import java.nio.file.Path; import java.util.HashMap; @@ -18,70 +16,22 @@ import java.util.HashMap; public class DefaultSignificanceModel implements SignificanceModel { private final long corpusSize; private final HashMap<String, Long> frequencies; - private final Path path; - @JsonIgnoreProperties(ignoreUnknown = true) - public static class SignificanceModelFile { - private final String version; - private final String id; - private final String description; - private final long corpusSize; - private final String language; - - private final long wordCount; - private final HashMap<String, Long> frequencies; - - @JsonCreator - public SignificanceModelFile( - @JsonProperty("version") String version, - @JsonProperty("id") String id, - @JsonProperty("description") String description, - @JsonProperty("corpus-size") long corpusSize, - @JsonProperty("language") String language, - @JsonProperty("word-count") long wordCount, - @JsonProperty("frequencies") HashMap<String, Long> frequencies) { - this.version = version; - this.id = id; - this.description = description; - this.corpusSize = corpusSize; - this.language = language; - this.wordCount = wordCount; - this.frequencies = frequencies; - } - - @JsonProperty("version") - public String version() { return version; } - - @JsonProperty("id") - public String id() { return id; } - - @JsonProperty("description") - public String description() { return description; } - - @JsonProperty("corpus-size") - public long corpusSize() { return corpusSize; } - - @JsonProperty("language") - public String language() { return language; } - - @JsonProperty("frequencies") - public HashMap<String, Long> frequencies() { return frequencies; } - - @JsonProperty("word-count") - public long wordCount() { return wordCount; } + private String id; + public DefaultSignificanceModel(DocumentFrequencyFile file, String id) { + this.frequencies = file.frequencies(); + this.corpusSize = file.documentCount(); + this.id = id; } public DefaultSignificanceModel(Path path) { - this.path = path; - ObjectMapper objectMapper = new ObjectMapper(); - try { - SignificanceModelFile model = objectMapper.readValue(this.path.toFile(), SignificanceModelFile.class); - this.corpusSize = model.corpusSize; - this.frequencies = model.frequencies; - } catch (Exception e) { + var file = objectMapper.readValue(path.toFile(), DocumentFrequencyFile.class); + this.frequencies = file.frequencies(); + this.corpusSize = file.documentCount(); + } catch (IOException e) { throw new RuntimeException("Failed to load model from " + path, e); } } @@ -93,4 +43,10 @@ public class DefaultSignificanceModel implements SignificanceModel { } return new DocumentFrequency(1, corpusSize); } + + @Override + public String getId() { + return this.id; + } + } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java index 1be1d3f13b5..72874c15d9e 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java @@ -1,20 +1,21 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.significance.impl; +import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.component.annotation.Inject; import com.yahoo.language.Language; import com.yahoo.language.significance.SignificanceModel; import com.yahoo.language.significance.SignificanceModelRegistry; import com.yahoo.search.significance.config.SignificanceConfig; +import java.io.IOException; +import java.io.UncheckedIOException; import java.nio.file.Path; import java.util.EnumMap; -import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.function.Supplier; -import static com.yahoo.yolean.Exceptions.uncheck; /** * Default implementation of {@link SignificanceModelRegistry}. * This implementation loads models lazily and caches them. @@ -24,24 +25,35 @@ import static com.yahoo.yolean.Exceptions.uncheck; public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry { private final Map<Language, SignificanceModel> models; + @Inject - public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); } - private DefaultSignificanceModelRegistry(Builder b) { + public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this.models = new EnumMap<>(Language.class); - b.models.forEach((language, path) -> { - models.put(language, - uncheck(() -> new DefaultSignificanceModel(path))); - }); + for (var model : cfg.model()) { + addModel(model.path()); + } } - public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) { + public DefaultSignificanceModelRegistry(List<Path> models) { this.models = new EnumMap<>(Language.class); - map.forEach((language, path) -> { - models.put(language, - uncheck(() -> new DefaultSignificanceModel(path))); - }); + for (var path : models) { + addModel(path); + } } + public void addModel(Path path) { + ObjectMapper objectMapper = new ObjectMapper(); + try { + SignificanceModelFile file = objectMapper.readValue(path.toFile(), SignificanceModelFile.class); + for (var pair : file.languages().entrySet()) { + this.models.put( + Language.fromLanguageTag(pair.getKey()), + new DefaultSignificanceModel(pair.getValue(), file.id())); + } + } catch (IOException e) { + throw new UncheckedIOException("Failed to load model from " + path, e); + } + } @Override public Optional<SignificanceModel> getModel(Language language) { @@ -51,20 +63,4 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist } return Optional.of(models.get(language)); } - - - public static final class Builder { - private final Map<Language, Path> models = new EnumMap<>(Language.class); - - public Builder() {} - public Builder(SignificanceConfig cfg) { - for (var model : cfg.model()) { - addModel(Language.fromLanguageTag(model.language()), model.path()); - } - } - - public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; } - public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); } - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java new file mode 100644 index 00000000000..b62754ac8ad --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java @@ -0,0 +1,43 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance.impl; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.HashMap; + +/** + * + * @author MariusArhaug + */ +@JsonIgnoreProperties(ignoreUnknown = true) +@JsonInclude(JsonInclude.Include.NON_NULL) +public class DocumentFrequencyFile { + private final String description; + + private final int documentCount; + + + private final HashMap<String, Long> frequencies; + + @JsonCreator + public DocumentFrequencyFile( + @JsonProperty("description") String description, + @JsonProperty("document-count") int documentCount, + @JsonProperty("document-frequencies") HashMap<String, Long> frequencies) { + this.description = description; + this.documentCount = documentCount; + this.frequencies = frequencies; + } + + @JsonProperty("description") + public String description() { return description; } + + @JsonProperty("document-count") + public int documentCount() { return documentCount; } + + @JsonProperty("document-frequencies") + public HashMap<String, Long> frequencies() { return frequencies; } +} diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java new file mode 100644 index 00000000000..902613379f0 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java @@ -0,0 +1,48 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance.impl; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.HashMap; +import java.util.List; + +/** + * + * @author MariusArhaug + */ + +@JsonIgnoreProperties(ignoreUnknown = true) +public class SignificanceModelFile { + private final String version; + private final String id; + private final String description; + + private final HashMap<String, DocumentFrequencyFile> languages; + + @JsonCreator + public SignificanceModelFile( + @JsonProperty("version") String version, + @JsonProperty("id") String id, + @JsonProperty("description") String description, + @JsonProperty("languages") HashMap<String, DocumentFrequencyFile> languages) { + this.version = version; + this.id = id; + this.description = description; + this.languages = languages; + } + + @JsonProperty("version") + public String version() { return version; } + + @JsonProperty("id") + public String id() { return id; } + + @JsonProperty("description") + public String description() { return description; } + + @JsonProperty("languages") + public HashMap<String, DocumentFrequencyFile> languages() { return languages; } +} diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java index d4849571b5e..e8594885b9e 100644 --- a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java +++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java @@ -6,7 +6,8 @@ import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry; import org.junit.Test; import java.nio.file.Path; -import java.util.HashMap; +import java.util.ArrayList; +import java.util.List; import static org.junit.jupiter.api.Assertions.*; @@ -18,10 +19,10 @@ public class DefaultSignificanceModelRegistryTest { @Test public void testDefaultSignificanceModelRegistry() { - HashMap<Language, Path> models = new HashMap<>(); + List<Path> models = new ArrayList<>(); - models.put(Language.ENGLISH, Path.of("src/test/models/en.json")); - models.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json")); + models.add(Path.of("src/test/models/docv1.json")); + models.add(Path.of("src/test/models/docv2.json")); DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models); @@ -39,6 +40,45 @@ public class DefaultSignificanceModelRegistryTest { assertNotNull(englishModel); assertNotNull(norwegianModel); + assertEquals("test::2", englishModel.getId()); + assertEquals("test::2", norwegianModel.getId()); + + assertEquals(4, englishModel.documentFrequency("test").frequency()); + assertEquals(14, englishModel.documentFrequency("test").corpusSize()); + + assertEquals(3, norwegianModel.documentFrequency("nei").frequency()); + assertEquals(20, norwegianModel.documentFrequency("nei").corpusSize()); + + assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency()); + assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize()); + + } + + @Test + public void testDefaultSignificanceModelRegistryInOppsiteOrder() { + + List<Path> models = new ArrayList<>(); + + models.add(Path.of("src/test/models/docv2.json")); + models.add(Path.of("src/test/models/docv1.json")); + + DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models); + + var optionalEnglishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH); + var optionalNorwegianModel = defaultSignificanceModelRegistry.getModel(Language.NORWEGIAN_BOKMAL); + + assertTrue(optionalEnglishModel.isPresent()); + assertTrue(optionalNorwegianModel.isPresent()); + + var englishModel = optionalEnglishModel.get(); + var norwegianModel = optionalNorwegianModel.get(); + + assertNotNull(englishModel); + assertNotNull(norwegianModel); + + assertEquals("test::1", englishModel.getId()); + assertEquals("test::2", norwegianModel.getId()); + assertEquals(2, englishModel.documentFrequency("test").frequency()); assertEquals(10, englishModel.documentFrequency("test").corpusSize()); @@ -47,6 +87,5 @@ public class DefaultSignificanceModelRegistryTest { assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency()); assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize()); - } } diff --git a/linguistics/src/test/models/docv1.json b/linguistics/src/test/models/docv1.json new file mode 100644 index 00000000000..04010959a58 --- /dev/null +++ b/linguistics/src/test/models/docv1.json @@ -0,0 +1,18 @@ +{ + "version" : "1.0", + "id" : "test::1", + "description" : "desc", + "languages" : { + "en": { + "description" : "english model", + "document-count" : 10, + "language" : "en", + "document-frequencies" : { + "usa" : 2, + "hello": 3, + "world": 5, + "test": 2 + } + } + } +} diff --git a/linguistics/src/test/models/docv2.json b/linguistics/src/test/models/docv2.json new file mode 100644 index 00000000000..c00d02fb744 --- /dev/null +++ b/linguistics/src/test/models/docv2.json @@ -0,0 +1,31 @@ +{ + "version" : "2.0", + "id" : "test::2", + "description" : "desc", + "languages" : { + "en": { + "description" : "english model", + "document-count" : 14, + "document-frequencies" : { + "usa" : 2, + "hello": 3, + "world": 5, + "test": 4, + "additional": 2 + } + }, + "nb": { + "description" : "norwegian model", + "document-count" : 20, + "document-frequencies" : { + "usa" : 2, + "hello": 10, + "verden": 5, + "test": 2, + "norge": 11, + "ja": 12, + "nei": 3 + } + } + } +} diff --git a/linguistics/src/test/models/en.json b/linguistics/src/test/models/en.json index 50bae5e3451..87b7b2faa08 100644 --- a/linguistics/src/test/models/en.json +++ b/linguistics/src/test/models/en.json @@ -1,11 +1,11 @@ { "version" : "1.0", "id" : "test::1", - "description" : "desc", - "corpus-size" : 10, + "description" : "english model", + "document-count" : 10, "language" : "en", "word-count" : 4, - "frequencies" : { + "document-frequencies" : { "usa" : 2, "hello": 3, "world": 5, diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json deleted file mode 100644 index 5fca8929e74..00000000000 --- a/linguistics/src/test/models/no.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "version" : "1.0", - "id" : "test::2", - "description" : "norsk beskrivelse", - "corpus-size" : 20, - "language" : "nb", - "word-count" : 7, - "frequencies" : { - "usa" : 2, - "hello": 10, - "verden": 5, - "test": 2, - "norge": 11, - "ja": 12, - "nei": 3 - } -} diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index b1b2235165f..089f5e2e239 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -329,6 +329,10 @@ public: static search::tensor::DistanceFunctionFactory::UP my_dist_fun = search::tensor::make_distance_function_factory(search::attribute::DistanceMetric::Euclidean, vespalib::eval::CellType::DOUBLE); return *my_dist_fun; } + + uint32_t check_consistency(uint32_t) const noexcept override { + return 0; + } }; class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory { diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index d50677314df..a1cf86c95cc 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -936,6 +936,36 @@ TYPED_TEST(HnswIndexTest, search_during_remove) this->expect_top_3_by_docid("{0, 0}", {0, 0}, {7}); } +TYPED_TEST(HnswIndexTest, inconsistent_index) +{ + this->init(false); + this->vectors.clear(); + this->vectors.set(1, {1, 3}).set(2, {7, 1}).set(3, {6, 5}).set(4, {8, 3}).set(5, {10, 3}); + this->add_document(1); + this->add_document(2); + this->add_document(3); + this->add_document(4); + this->add_document(5); + this->expect_entry_point(1, 0); + this->expect_level_0(1, {2, 3}); + this->expect_level_0(2, {1, 3, 4, 5}); + this->expect_level_0(3, {1, 2, 4}); + this->expect_level_0(4, {2, 3, 5}); + this->expect_level_0(5, {2, 4}); + EXPECT_EQ(0, this->index->check_consistency(6)); + // Remove vector for docid 5 but don't update index. + this->vectors.clear(5); + EXPECT_EQ(1, this->index->check_consistency(6)); + /* + * Removing document 2 causes mutual reconnect for nodes [1, 3, 4, 5] + * where nodes 1 and 5 are not previously connected. Distance from + * node 1 to node 5 cannot be calculated due to missing vector. + */ + this->remove_document(2); + // No reconnect for node without vector + this->expect_level_0(5, {4}); +} + using HnswMultiIndexTest = HnswIndexTest<HnswIndex<HnswIndexType::MULTI>>; namespace { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 322965ca06a..b542c422f50 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -679,7 +679,10 @@ HnswIndex<type>::mutual_reconnect(const LinkArrayRef &cluster, uint32_t level) for (uint32_t j = i + 1; j < cluster.size(); ++j) { uint32_t n_id_2 = cluster[j]; if ( ! has_link_to(n_list_1, n_id_2)) { - pairs.emplace_back(n_id_1, n_id_2, calc_distance(*df, n_id_2)); + auto n_cells_2 = get_vector(n_id_2); + if (!n_cells_2.non_existing_attribute_value()) { + pairs.emplace_back(n_id_1, n_id_2, df->calc(n_cells_2)); + } } } } @@ -1120,6 +1123,32 @@ HnswIndex<type>::count_reachable_nodes() const return {found_cnt, true}; } +template <HnswIndexType type> +uint32_t +HnswIndex<type>::get_subspaces(uint32_t docid) const noexcept +{ + if constexpr (type == HnswIndexType::SINGLE) { + return (docid < _graph.nodes.get_size() && _graph.nodes.get_elem_ref(docid).levels_ref().load_relaxed().valid()) ? 1 : 0; + } else { + return _id_mapping.get_ids(docid).size(); + } +} + +template <HnswIndexType type> +uint32_t +HnswIndex<type>::check_consistency(uint32_t docid_limit) const noexcept +{ + uint32_t inconsistencies = 0; + for (uint32_t docid = 1; docid < docid_limit; ++docid) { + auto index_subspaces = get_subspaces(docid); + auto store_subspaces = get_vectors(docid).subspaces(); + if (index_subspaces != store_subspaces) { + ++inconsistencies; + } + } + return inconsistencies; +} + template class HnswIndex<HnswIndexType::SINGLE>; template class HnswIndex<HnswIndexType::MULTI>; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 616140f426f..4d4440c1bcb 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -193,6 +193,9 @@ protected: LinkArray filter_valid_nodeids(uint32_t level, const internal::PreparedAddNode::Links &neighbors, uint32_t self_nodeid); void internal_complete_add(uint32_t docid, internal::PreparedAddDoc &op); void internal_complete_add_node(uint32_t nodeid, uint32_t docid, uint32_t subspace, internal::PreparedAddNode &prepared_node); + + // Called from writer only. + uint32_t get_subspaces(uint32_t docid) const noexcept; public: HnswIndex(const DocVectorAccess& vectors, DistanceFunctionFactory::UP distance_ff, RandomLevelGenerator::UP level_generator, const HnswIndexConfig& cfg); @@ -248,6 +251,9 @@ public: uint32_t get_active_nodes() const noexcept { return _graph.get_active_nodes(); } + // Called from writer only. + uint32_t check_consistency(uint32_t docid_limit) const noexcept override; + // Should only be used by unit tests. HnswTestNode get_node(uint32_t nodeid) const; void set_node(uint32_t nodeid, const HnswTestNode &node); diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h index 8462ff05eca..c2bbd17ce63 100644 --- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h @@ -114,6 +114,12 @@ public: double distance_threshold) const = 0; virtual DistanceFunctionFactory &distance_function_factory() const = 0; + + /* + * Used when checking consistency during load. + * Called from writer only. + */ + virtual uint32_t check_consistency(uint32_t docid_limit) const noexcept = 0; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp index 28c4099c38b..223c9d7d1f2 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp @@ -322,6 +322,9 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor) if (!load_index()) { return false; } + if (dense_store == nullptr) { + check_consistency(docid_limit); + } } else { build_index(executor, docid_limit); } @@ -329,4 +332,15 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor) return true; } +void +TensorAttributeLoader::check_consistency(uint32_t docid_limit) +{ + auto before = vespalib::steady_clock::now(); + uint32_t inconsistencies = _index->check_consistency(docid_limit); + auto after = vespalib::steady_clock::now(); + double elapsed = vespalib::to_s(after - before); + LOG(info, "%u inconsistencies detected after loading index for attribute %s, (check used %6.3fs)", + inconsistencies, _attr.getName().c_str(), elapsed); +} + } diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h index 6bf68957adc..59baaf0b6dc 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h @@ -34,6 +34,7 @@ class TensorAttributeLoader { void load_tensor_store(search::attribute::BlobSequenceReader& reader, uint32_t docid_limit); void build_index(vespalib::Executor* executor, uint32_t docid_limit); bool load_index(); + void check_consistency(uint32_t docid_limit); public: TensorAttributeLoader(TensorAttribute& attr, GenerationHandler& generation_handler, RefVector& ref_vector, TensorStore& store, NearestNeighborIndex* index); |