diff options
author | MariusArhaug <mariusarhaug@hotmail.com> | 2024-04-24 11:29:23 +0200 |
---|---|---|
committer | MariusArhaug <mariusarhaug@hotmail.com> | 2024-04-24 15:33:21 +0200 |
commit | aaae1863d50586fb86685d3cbb6c73f589d3370c (patch) | |
tree | c0b0f9d8c4126da56aacadea264d66d2110b4e5c | |
parent | ff1d604e77b1943a72fc6b585b09db82a5ee791d (diff) |
Update significance model field and logic from architect meeting
17 files changed, 256 insertions, 138 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java b/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java index c210c2621a6..ce9dc2bce0f 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/container/component/SignificanceModelRegistry.java @@ -32,18 +32,15 @@ public class SignificanceModelRegistry extends SimpleComponent implements Signif super(new ComponentModel(BundleInstantiationSpecification.fromStrings(CLASS, CLASS, BUNDLE))); if (spec != null) { - for (Element modelElement : XML.getChildren(spec, "model")) { - addConfig( - modelElement.getAttribute("language"), - Model.fromXml(deployState, modelElement, Set.of(SIGNIFICANCE_MODEL)).modelReference()); - } + for (Element modelElement : XML.getChildren(spec, "model")) { + addConfig(Model.fromXml(deployState, modelElement, Set.of(SIGNIFICANCE_MODEL)).modelReference()); } } - public void addConfig(String language, ModelReference path) { + public void addConfig(ModelReference path) { configList.add( - new SignificanceModelConfig(language, path) + new SignificanceModelConfig(path) ); } @@ -53,7 +50,6 @@ public class SignificanceModelRegistry extends SimpleComponent implements Signif builder.model( configList.stream() .map(config -> new SignificanceConfig.Model.Builder() - .language(config.language) .path(config.path) ).toList() ); @@ -61,11 +57,9 @@ public class SignificanceModelRegistry extends SimpleComponent implements Signif class SignificanceModelConfig { - private final String language; private final ModelReference path; - public SignificanceModelConfig(String language, ModelReference path) { - this.language = language; + public SignificanceModelConfig(ModelReference path) { this.path = path; } diff --git a/config-model/src/main/resources/schema/containercluster.rnc b/config-model/src/main/resources/schema/containercluster.rnc index 08092f10020..c79a7b38d09 100644 --- a/config-model/src/main/resources/schema/containercluster.rnc +++ b/config-model/src/main/resources/schema/containercluster.rnc @@ -138,7 +138,7 @@ Threadpool = element threadpool { } Significance = element significance { - element model { attribute language { xsd:string } & ModelReference }* + element model { ModelReference }* } Clients = element clients { diff --git a/config-model/src/test/cfg/significance/services.xml b/config-model/src/test/cfg/significance/services.xml index 6991f5498fb..ffdb73bfc2e 100644 --- a/config-model/src/test/cfg/significance/services.xml +++ b/config-model/src/test/cfg/significance/services.xml @@ -8,9 +8,9 @@ <container version="1.0"> <search> <significance> - <model language="en" model-id="idf-wiki-english" path="models/idf-english-wiki.json.zst"/> - <model language="no" path="models/idf-norwegian-wiki.json.zst" /> - <model language="ru" url="https://some/uri/blob.json" /> + <model model-id="idf-wiki-english" path="models/idf-english-wiki.json.zst"/> + <model path="models/idf-norwegian-wiki.json.zst" /> + <model url="https://some/uri/blob.json" /> </significance> </search> </container> diff --git a/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java b/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java index 00e95a34287..26e8c67a226 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/significance/test/SignificanceModelTestCase.java @@ -37,9 +37,6 @@ public class SignificanceModelTestCase { ApplicationContainerCluster containerCluster = vespaModel.getContainerClusters().get("container"); var significanceConfig = assertSignificancePresent(containerCluster); assertEquals(3, significanceConfig.model().size()); - assertEquals("en", significanceConfig.model().get(0).language()); - assertEquals("no", significanceConfig.model().get(1).language()); - assertEquals("ru", significanceConfig.model().get(2).language()); assertEquals("models/idf-norwegian-wiki.json.zst", modelReference(significanceConfig.model().get(1), "path").path().orElseThrow().value()); assertEquals("https://some/uri/blob.json", modelReference(significanceConfig.model().get(2), "path").url().orElseThrow().value()); diff --git a/config-model/src/test/schema-test-files/services.xml b/config-model/src/test/schema-test-files/services.xml index 7333ef5a87b..a413ec7753b 100644 --- a/config-model/src/test/schema-test-files/services.xml +++ b/config-model/src/test/schema-test-files/services.xml @@ -168,7 +168,7 @@ </threadpool> <significance> - <model language="en" model-id="idf-wiki-simple-english" path="models/idf-simple-english-wiki.json.zst" /> + <model model-id="idf-wiki-simple-english" path="models/idf-simple-english-wiki.json.zst" /> </significance> </search> diff --git a/configdefinitions/src/vespa/significance.def b/configdefinitions/src/vespa/significance.def index e0cc5b4c611..8d40381a0c9 100644 --- a/configdefinitions/src/vespa/significance.def +++ b/configdefinitions/src/vespa/significance.def @@ -1,6 +1,4 @@ # Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. namespace=search.significance.config -model[].language string model[].path model - diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index 1ca32a2dd37..ceab5025760 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -803,7 +803,8 @@ "abstract" ], "methods" : [ - "public abstract com.yahoo.language.significance.DocumentFrequency documentFrequency(java.lang.String)" + "public abstract com.yahoo.language.significance.DocumentFrequency documentFrequency(java.lang.String)", + "public abstract java.lang.String getId()" ], "fields" : [ ] }, diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java index a9f1e48af62..c8a31e1892c 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java @@ -9,4 +9,6 @@ import com.yahoo.api.annotations.Beta; @Beta public interface SignificanceModel { DocumentFrequency documentFrequency(String word); + + String getId(); } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java index 7ed6f442610..3244b8373ad 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java @@ -1,13 +1,11 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.significance.impl; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.language.significance.DocumentFrequency; import com.yahoo.language.significance.SignificanceModel; +import java.io.IOException; import java.nio.file.Path; import java.util.HashMap; @@ -18,70 +16,22 @@ import java.util.HashMap; public class DefaultSignificanceModel implements SignificanceModel { private final long corpusSize; private final HashMap<String, Long> frequencies; - private final Path path; - @JsonIgnoreProperties(ignoreUnknown = true) - public static class SignificanceModelFile { - private final String version; - private final String id; - private final String description; - private final long corpusSize; - private final String language; - - private final long wordCount; - private final HashMap<String, Long> frequencies; - - @JsonCreator - public SignificanceModelFile( - @JsonProperty("version") String version, - @JsonProperty("id") String id, - @JsonProperty("description") String description, - @JsonProperty("corpus-size") long corpusSize, - @JsonProperty("language") String language, - @JsonProperty("word-count") long wordCount, - @JsonProperty("frequencies") HashMap<String, Long> frequencies) { - this.version = version; - this.id = id; - this.description = description; - this.corpusSize = corpusSize; - this.language = language; - this.wordCount = wordCount; - this.frequencies = frequencies; - } - - @JsonProperty("version") - public String version() { return version; } - - @JsonProperty("id") - public String id() { return id; } - - @JsonProperty("description") - public String description() { return description; } - - @JsonProperty("corpus-size") - public long corpusSize() { return corpusSize; } - - @JsonProperty("language") - public String language() { return language; } - - @JsonProperty("frequencies") - public HashMap<String, Long> frequencies() { return frequencies; } - - @JsonProperty("word-count") - public long wordCount() { return wordCount; } + private String id; + public DefaultSignificanceModel(DocumentFrequencyFile file, String id) { + this.frequencies = file.frequencies(); + this.corpusSize = file.documentCount(); + this.id = id; } public DefaultSignificanceModel(Path path) { - this.path = path; - ObjectMapper objectMapper = new ObjectMapper(); - try { - SignificanceModelFile model = objectMapper.readValue(this.path.toFile(), SignificanceModelFile.class); - this.corpusSize = model.corpusSize; - this.frequencies = model.frequencies; - } catch (Exception e) { + var file = objectMapper.readValue(path.toFile(), DocumentFrequencyFile.class); + this.frequencies = file.frequencies(); + this.corpusSize = file.documentCount(); + } catch (IOException e) { throw new RuntimeException("Failed to load model from " + path, e); } } @@ -93,4 +43,10 @@ public class DefaultSignificanceModel implements SignificanceModel { } return new DocumentFrequency(1, corpusSize); } + + @Override + public String getId() { + return this.id; + } + } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java index 1be1d3f13b5..7bf05c71342 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java @@ -1,20 +1,20 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.significance.impl; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationConfig; import com.yahoo.component.annotation.Inject; import com.yahoo.language.Language; import com.yahoo.language.significance.SignificanceModel; import com.yahoo.language.significance.SignificanceModelRegistry; import com.yahoo.search.significance.config.SignificanceConfig; +import java.io.IOException; import java.nio.file.Path; import java.util.EnumMap; -import java.util.HashMap; +import java.util.List; import java.util.Map; -import java.util.Optional; -import java.util.function.Supplier; -import static com.yahoo.yolean.Exceptions.uncheck; /** * Default implementation of {@link SignificanceModelRegistry}. * This implementation loads models lazily and caches them. @@ -24,24 +24,45 @@ import static com.yahoo.yolean.Exceptions.uncheck; public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry { private final Map<Language, SignificanceModel> models; + @Inject - public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); } - private DefaultSignificanceModelRegistry(Builder b) { + public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this.models = new EnumMap<>(Language.class); - b.models.forEach((language, path) -> { - models.put(language, - uncheck(() -> new DefaultSignificanceModel(path))); - }); + ObjectMapper objectMapper = new ObjectMapper(); + for (var model : cfg.model()) { + try { + SignificanceModelFile file = objectMapper.readValue(model.path().toFile(), SignificanceModelFile.class); + + for (var pair : file.languages().entrySet()) { + addModel( + Language.fromLanguageTag(pair.getKey()), new DefaultSignificanceModel(pair.getValue(), file.id())); + } + } catch (IOException e) { + throw new RuntimeException("Failed to load model from " + model.path(), e); + } + } } - public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) { + public DefaultSignificanceModelRegistry(List<Path> models) { this.models = new EnumMap<>(Language.class); - map.forEach((language, path) -> { - models.put(language, - uncheck(() -> new DefaultSignificanceModel(path))); - }); + + ObjectMapper objectMapper = new ObjectMapper(); + for (var model : models) { + try { + SignificanceModelFile file = objectMapper.readValue(model.toFile(), SignificanceModelFile.class); + for (var pair : file.languages().entrySet()) { + addModel( + Language.fromLanguageTag(pair.getKey()), new DefaultSignificanceModel(pair.getValue(), file.id())); + } + } catch (IOException e) { + throw new RuntimeException("Failed to load model from " + model, e); + } + } } + public void addModel(Language lang, SignificanceModel model) { + this.models.put(lang, model); + } @Override public Optional<SignificanceModel> getModel(Language language) { @@ -51,20 +72,4 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist } return Optional.of(models.get(language)); } - - - public static final class Builder { - private final Map<Language, Path> models = new EnumMap<>(Language.class); - - public Builder() {} - public Builder(SignificanceConfig cfg) { - for (var model : cfg.model()) { - addModel(Language.fromLanguageTag(model.language()), model.path()); - } - } - - public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; } - public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); } - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java new file mode 100644 index 00000000000..c2ad1de8fd8 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java @@ -0,0 +1,51 @@ +package com.yahoo.language.significance.impl; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.HashMap; + +@JsonIgnoreProperties(ignoreUnknown = true) +@JsonInclude(JsonInclude.Include.NON_NULL) +public class DocumentFrequencyFile { + private final String description; + + private final String language; + + private final int documentCount; + + private final int wordCount; + + private final HashMap<String, Long> frequencies; + + @JsonCreator + public DocumentFrequencyFile( + @JsonProperty("description") String description, + @JsonProperty("document-count") int documentCount, + @JsonProperty("word-count") int wordCount, + @JsonProperty("language") String language, + @JsonProperty("document-frequencies") HashMap<String, Long> frequencies) { + this.description = description; + this.documentCount = documentCount; + this.wordCount = wordCount; + this.language = language; + this.frequencies = frequencies; + } + + @JsonProperty("description") + public String description() { return description; } + + @JsonProperty("document-count") + public int documentCount() { return documentCount; } + + @JsonProperty("language") + public String language() { return language; } + + @JsonProperty("word-count") + public int wordCount() { return wordCount; } + + @JsonProperty("document-frequencies") + public HashMap<String, Long> frequencies() { return frequencies; } +} diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java new file mode 100644 index 00000000000..cb923fe8861 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java @@ -0,0 +1,43 @@ +package com.yahoo.language.significance.impl; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.HashMap; +import java.util.List; + + +@JsonIgnoreProperties(ignoreUnknown = true) +public class SignificanceModelFile { + private final String version; + private final String id; + private final String description; + + private final HashMap<String, DocumentFrequencyFile> languages; + + @JsonCreator + public SignificanceModelFile( + @JsonProperty("version") String version, + @JsonProperty("id") String id, + @JsonProperty("description") String description, + @JsonProperty("languages") HashMap<String, DocumentFrequencyFile> languages) { + this.version = version; + this.id = id; + this.description = description; + this.languages = languages; + } + + @JsonProperty("version") + public String version() { return version; } + + @JsonProperty("id") + public String id() { return id; } + + @JsonProperty("description") + public String description() { return description; } + + @JsonProperty("languages") + public HashMap<String, DocumentFrequencyFile> languages() { return languages; } +} diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java index d4849571b5e..833caca3b24 100644 --- a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java +++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java @@ -6,7 +6,8 @@ import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry; import org.junit.Test; import java.nio.file.Path; -import java.util.HashMap; +import java.util.ArrayList; +import java.util.List; import static org.junit.jupiter.api.Assertions.*; @@ -18,10 +19,10 @@ public class DefaultSignificanceModelRegistryTest { @Test public void testDefaultSignificanceModelRegistry() { - HashMap<Language, Path> models = new HashMap<>(); + List<Path> models = new ArrayList<>(); - models.put(Language.ENGLISH, Path.of("src/test/models/en.json")); - models.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json")); + models.add(Path.of("src/test/models/docv1.json")); + models.add(Path.of("src/test/models/docv2.json")); DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models); @@ -39,6 +40,39 @@ public class DefaultSignificanceModelRegistryTest { assertNotNull(englishModel); assertNotNull(norwegianModel); + assertEquals("test::2", englishModel.getId()); + assertEquals("test::2", norwegianModel.getId()); + + assertEquals(4, englishModel.documentFrequency("test").frequency()); + assertEquals(14, englishModel.documentFrequency("test").corpusSize()); + + assertEquals(3, norwegianModel.documentFrequency("nei").frequency()); + assertEquals(20, norwegianModel.documentFrequency("nei").corpusSize()); + + assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency()); + assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize()); + + } + + @Test + public void testDefaultSignificanceModelRegistryInOppsiteOrder() { + + List<Path> models = new ArrayList<>(); + + models.add(Path.of("src/test/models/docv2.json")); + models.add(Path.of("src/test/models/docv1.json")); + + DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models); + + var englishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH); + var norwegianModel = defaultSignificanceModelRegistry.getModel(Language.NORWEGIAN_BOKMAL); + + assertNotNull(englishModel); + assertNotNull(norwegianModel); + + assertEquals("test::1", englishModel.getId()); + assertEquals("test::2", norwegianModel.getId()); + assertEquals(2, englishModel.documentFrequency("test").frequency()); assertEquals(10, englishModel.documentFrequency("test").corpusSize()); @@ -47,6 +81,5 @@ public class DefaultSignificanceModelRegistryTest { assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency()); assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize()); - } } diff --git a/linguistics/src/test/models/docv1.json b/linguistics/src/test/models/docv1.json new file mode 100644 index 00000000000..dd211840ede --- /dev/null +++ b/linguistics/src/test/models/docv1.json @@ -0,0 +1,19 @@ +{ + "version" : "1.0", + "id" : "test::1", + "description" : "desc", + "languages" : { + "en": { + "description" : "english model", + "document-count" : 10, + "language" : "en", + "word-count" : 4, + "document-frequencies" : { + "usa" : 2, + "hello": 3, + "world": 5, + "test": 2 + } + } + } +} diff --git a/linguistics/src/test/models/docv2.json b/linguistics/src/test/models/docv2.json new file mode 100644 index 00000000000..c7f621b9acb --- /dev/null +++ b/linguistics/src/test/models/docv2.json @@ -0,0 +1,35 @@ +{ + "version" : "2.0", + "id" : "test::2", + "description" : "desc", + "languages" : { + "en": { + "description" : "english model", + "document-count" : 14, + "language" : "en", + "word-count" : 5, + "document-frequencies" : { + "usa" : 2, + "hello": 3, + "world": 5, + "test": 4, + "additional": 2 + } + }, + "nb": { + "description" : "norwegian model", + "document-count" : 20, + "language" : "en", + "word-count" : 7, + "document-frequencies" : { + "usa" : 2, + "hello": 10, + "verden": 5, + "test": 2, + "norge": 11, + "ja": 12, + "nei": 3 + } + } + } +} diff --git a/linguistics/src/test/models/en.json b/linguistics/src/test/models/en.json index 50bae5e3451..a46b6a5b51f 100644 --- a/linguistics/src/test/models/en.json +++ b/linguistics/src/test/models/en.json @@ -1,14 +1,15 @@ { "version" : "1.0", "id" : "test::1", - "description" : "desc", - "corpus-size" : 10, + "description" : "english model", + "document-count" : 10, "language" : "en", "word-count" : 4, - "frequencies" : { + "document-frequencies" : { "usa" : 2, "hello": 3, "world": 5, "test": 2 } } +}
\ No newline at end of file diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json deleted file mode 100644 index 5fca8929e74..00000000000 --- a/linguistics/src/test/models/no.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "version" : "1.0", - "id" : "test::2", - "description" : "norsk beskrivelse", - "corpus-size" : 20, - "language" : "nb", - "word-count" : 7, - "frequencies" : { - "usa" : 2, - "hello": 10, - "verden": 5, - "test": 2, - "norge": 11, - "ja": 12, - "nei": 3 - } -} |