diff options
author | Marius Arhaug <mariusarhaug@hotmail.com> | 2024-04-09 16:33:04 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-09 16:33:04 +0200 |
commit | 07010100192978eea266f7cb15b315b57a95438e (patch) | |
tree | a232aba9475b273058179872a7ca251b42e39d5c /linguistics | |
parent | e1f023d0e14c3351948beed1ee0af6e466581251 (diff) | |
parent | 07eedf3b30af36fc05da6c98778ecda23bd0d304 (diff) |
Merge pull request #30816 from vespa-engine/marius/add-significance-model-registry
Add significance model registry to linguistics
Diffstat (limited to 'linguistics')
12 files changed, 385 insertions, 1 deletions
diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index 0bd4638bb05..9f91c32cf62 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -774,5 +774,49 @@ "public abstract java.lang.String accentDrop(java.lang.String, com.yahoo.language.Language)" ], "fields" : [ ] + }, + "com.yahoo.language.significance.DocumentFrequency" : { + "superClass" : "java.lang.Record", + "interfaces" : [ ], + "attributes" : [ + "public", + "final", + "record" + ], + "methods" : [ + "public void <init>(long, long)", + "public final java.lang.String toString()", + "public final int hashCode()", + "public final boolean equals(java.lang.Object)", + "public long frequency()", + "public long corpusSize()" + ], + "fields" : [ ] + }, + "com.yahoo.language.significance.SignificanceModel" : { + "superClass" : "java.lang.Object", + "interfaces" : [ ], + "attributes" : [ + "public", + "interface", + "abstract" + ], + "methods" : [ + "public abstract com.yahoo.language.significance.DocumentFrequency documentFrequency(java.lang.String)" + ], + "fields" : [ ] + }, + "com.yahoo.language.significance.SignificanceModelRegistry" : { + "superClass" : "java.lang.Object", + "interfaces" : [ ], + "attributes" : [ + "public", + "interface", + "abstract" + ], + "methods" : [ + "public abstract com.yahoo.language.significance.SignificanceModel getModel(com.yahoo.language.Language)" + ], + "fields" : [ ] } }
\ No newline at end of file diff --git a/linguistics/pom.xml b/linguistics/pom.xml index 48ea0a765a6..d07ff5d9fdb 100644 --- a/linguistics/pom.xml +++ b/linguistics/pom.xml @@ -56,7 +56,26 @@ <groupId>com.google.inject</groupId> <artifactId>guice</artifactId> <scope>provided</scope> - + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.junit.vintage</groupId> + <artifactId>junit-vintage-engine</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter</artifactId> + <scope>test</scope> </dependency> </dependencies> <build> diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java new file mode 100644 index 00000000000..ff6de32fdaf --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java @@ -0,0 +1,14 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance; + +import com.yahoo.api.annotations.Beta; + +/** + * + * @author MariusArhaug + */ + +@Beta +public record DocumentFrequency(long frequency, long corpusSize) { + +} diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java new file mode 100644 index 00000000000..a9f1e48af62 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java @@ -0,0 +1,12 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. +package com.yahoo.language.significance; + +import com.yahoo.api.annotations.Beta; + +/** + * @author MariusArhaug + */ +@Beta +public interface SignificanceModel { + DocumentFrequency documentFrequency(String word); +} diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java new file mode 100644 index 00000000000..6d8dcc00e0a --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java @@ -0,0 +1,13 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance; + +import com.yahoo.api.annotations.Beta; +import com.yahoo.language.Language; + +/** + * @author MariusArhaug + */ +@Beta +public interface SignificanceModelRegistry { + SignificanceModel getModel(Language language); +} diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java new file mode 100644 index 00000000000..7ed6f442610 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java @@ -0,0 +1,96 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.significance.impl; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.yahoo.language.significance.DocumentFrequency; +import com.yahoo.language.significance.SignificanceModel; + +import java.nio.file.Path; +import java.util.HashMap; + +/** + * + * @author MariusArhaug + */ +public class DefaultSignificanceModel implements SignificanceModel { + private final long corpusSize; + private final HashMap<String, Long> frequencies; + private final Path path; + + @JsonIgnoreProperties(ignoreUnknown = true) + public static class SignificanceModelFile { + private final String version; + private final String id; + private final String description; + private final long corpusSize; + private final String language; + + private final long wordCount; + private final HashMap<String, Long> frequencies; + + @JsonCreator + public SignificanceModelFile( + @JsonProperty("version") String version, + @JsonProperty("id") String id, + @JsonProperty("description") String description, + @JsonProperty("corpus-size") long corpusSize, + @JsonProperty("language") String language, + @JsonProperty("word-count") long wordCount, + @JsonProperty("frequencies") HashMap<String, Long> frequencies) { + this.version = version; + this.id = id; + this.description = description; + this.corpusSize = corpusSize; + this.language = language; + this.wordCount = wordCount; + this.frequencies = frequencies; + } + + @JsonProperty("version") + public String version() { return version; } + + @JsonProperty("id") + public String id() { return id; } + + @JsonProperty("description") + public String description() { return description; } + + @JsonProperty("corpus-size") + public long corpusSize() { return corpusSize; } + + @JsonProperty("language") + public String language() { return language; } + + 
@JsonProperty("frequencies") + public HashMap<String, Long> frequencies() { return frequencies; } + + @JsonProperty("word-count") + public long wordCount() { return wordCount; } + + } + + public DefaultSignificanceModel(Path path) { + this.path = path; + + ObjectMapper objectMapper = new ObjectMapper(); + + try { + SignificanceModelFile model = objectMapper.readValue(this.path.toFile(), SignificanceModelFile.class); + this.corpusSize = model.corpusSize; + this.frequencies = model.frequencies; + } catch (Exception e) { + throw new RuntimeException("Failed to load model from " + path, e); + } + } + + @Override + public DocumentFrequency documentFrequency(String word) { + if (frequencies.containsKey(word)) { + return new DocumentFrequency(frequencies.get(word), corpusSize); + } + return new DocumentFrequency(1, corpusSize); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java new file mode 100644 index 00000000000..d44eab39cdf --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java @@ -0,0 +1,69 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance.impl; + +import com.yahoo.component.annotation.Inject; +import com.yahoo.language.Language; +import com.yahoo.language.significance.SignificanceModel; +import com.yahoo.language.significance.SignificanceModelRegistry; +import com.yahoo.search.significance.config.SignificanceConfig; + +import java.nio.file.Path; +import java.util.EnumMap; +import java.util.HashMap; +import java.util.Map; +import java.util.function.Supplier; + +import static com.yahoo.yolean.Exceptions.uncheck; +/** + * Default implementation of {@link SignificanceModelRegistry}. 
+ * This implementation loads all configured models eagerly at construction time and keeps them in memory. + * + * @author MariusArhaug + */ +public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry { + + private final Map<Language, SignificanceModel> models; + @Inject + public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); } + private DefaultSignificanceModelRegistry(Builder b) { + this.models = new EnumMap<>(Language.class); + b.models.forEach((language, path) -> { + models.put(language, + uncheck(() -> new DefaultSignificanceModel(path))); + }); + } + + public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) { + this.models = new EnumMap<>(Language.class); + map.forEach((language, path) -> { + models.put(language, + uncheck(() -> new DefaultSignificanceModel(path))); + }); + } + + + @Override + public SignificanceModel getModel(Language language) throws IllegalArgumentException { + if (!models.containsKey(language)) + { + throw new IllegalArgumentException("No model for language " + language); + } + return models.get(language); + } + + + public static final class Builder { + private final Map<Language, Path> models = new EnumMap<>(Language.class); + + public Builder() {} + public Builder(SignificanceConfig cfg) { + for (var model : cfg.model()) { + addModel(Language.fromLanguageTag(model.language()), model.path()); + } + } + + public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; } + public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); } + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/significance/package-info.java b/linguistics/src/main/java/com/yahoo/language/significance/package-info.java new file mode 100644 index 00000000000..5c2f773452f --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/package-info.java @@ -0,0 +1,7 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. +@ExportPackage +@PublicApi +package com.yahoo.language.significance; + +import com.yahoo.api.annotations.PublicApi; +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java new file mode 100644 index 00000000000..d1de63a994d --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java @@ -0,0 +1,48 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance; + +import com.yahoo.language.Language; +import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry; +import org.junit.Test; + +import java.nio.file.Path; +import java.util.HashMap; + +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + + +/** + * @author MariusArhaug + */ +public class DefaultSignificanceModelRegistryTest { + + @Test + public void testDefaultSignificanceModelRegistry() { + HashMap<Language, Path> models = new HashMap<>(); + + models.put(Language.ENGLISH, Path.of("src/test/models/en.json")); + models.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json")); + + DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models); + + var englishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH); + var norwegianModel = defaultSignificanceModelRegistry.getModel(Language.NORWEGIAN_BOKMAL); + + assertThrows(IllegalArgumentException.class, () -> defaultSignificanceModelRegistry.getModel(Language.FRENCH)); + + assertNotNull(englishModel); + assertNotNull(norwegianModel); + + assertEquals(2, 
englishModel.documentFrequency("test").frequency()); + assertEquals(10, englishModel.documentFrequency("test").corpusSize()); + + assertEquals(3, norwegianModel.documentFrequency("nei").frequency()); + assertEquals(20, norwegianModel.documentFrequency("nei").corpusSize()); + + assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency()); + assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize()); + + } +} diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java new file mode 100644 index 00000000000..137f8d4513a --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java @@ -0,0 +1,31 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance; + +import com.yahoo.language.significance.impl.DefaultSignificanceModel; +import org.junit.jupiter.api.Test; + +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.assertEquals; + + +/** + * @author MariusArhaug + + */ +public class DefaultSignificanceModelTest { + + @Test + public void testDocumentFrequency() { + DefaultSignificanceModel significanceModel = new DefaultSignificanceModel(Path.of("src/test/models/en.json")); + + assertEquals(2, significanceModel.documentFrequency("test").frequency()); + assertEquals(10, significanceModel.documentFrequency("test").corpusSize()); + + assertEquals(3, significanceModel.documentFrequency("hello").frequency()); + assertEquals(10, significanceModel.documentFrequency("hello").corpusSize()); + + assertEquals(1, significanceModel.documentFrequency("non-existent-word").frequency()); // unknown words fall back to a frequency of 1 + assertEquals(10, significanceModel.documentFrequency("non-existent-word").corpusSize()); // corpus size is still reported on the fallback path + } +} diff --git a/linguistics/src/test/models/en.json 
b/linguistics/src/test/models/en.json new file mode 100644 index 00000000000..50bae5e3451 --- /dev/null +++ b/linguistics/src/test/models/en.json @@ -0,0 +1,14 @@ +{ + "version" : "1.0", + "id" : "test::1", + "description" : "desc", + "corpus-size" : 10, + "language" : "en", + "word-count" : 4, + "frequencies" : { + "usa" : 2, + "hello": 3, + "world": 5, + "test": 2 + } +} diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json new file mode 100644 index 00000000000..5fca8929e74 --- /dev/null +++ b/linguistics/src/test/models/no.json @@ -0,0 +1,17 @@ +{ + "version" : "1.0", + "id" : "test::2", + "description" : "norsk beskrivelse", + "corpus-size" : 20, + "language" : "nb", + "word-count" : 7, + "frequencies" : { + "usa" : 2, + "hello": 10, + "verden": 5, + "test": 2, + "norge": 11, + "ja": 12, + "nei": 3 + } +} |