From 501f69bef60ebe61beb52ef369c158c38b976c8b Mon Sep 17 00:00:00 2001 From: MariusArhaug Date: Thu, 4 Apr 2024 15:42:35 +0200 Subject: add significance model registry to linguistics --- linguistics/pom.xml | 17 +++- .../significance/DefaultSignificanceModel.java | 93 ++++++++++++++++++++++ .../DefaultSignificanceModelRegistry.java | 79 ++++++++++++++++++ .../language/significance/DocumentFrequency.java | 14 ++++ .../language/significance/SignificanceModel.java | 9 +++ .../significance/SignificanceModelRegistry.java | 11 +++ .../DefaultSignificanceModelRegistryTest.java | 40 ++++++++++ .../significance/DefaultSignificanceModelTest.java | 29 +++++++ linguistics/src/test/models/en.json | 14 ++++ linguistics/src/test/models/no.json | 17 ++++ 10 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModel.java create mode 100644 linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModelRegistry.java create mode 100644 linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java create mode 100644 linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java create mode 100644 linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java create mode 100644 linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java create mode 100644 linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java create mode 100644 linguistics/src/test/models/en.json create mode 100644 linguistics/src/test/models/no.json (limited to 'linguistics') diff --git a/linguistics/pom.xml b/linguistics/pom.xml index 48ea0a765a6..a358141af21 100644 --- a/linguistics/pom.xml +++ b/linguistics/pom.xml @@ -56,7 +56,22 @@ com.google.inject guice provided - + + + com.fasterxml.jackson.core + jackson-databind + provided + + + com.fasterxml.jackson.core + 
jackson-core + provided + + + com.yahoo.vespa + flags + 8-SNAPSHOT + compile
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModel.java
new file mode 100644
index 00000000000..5cc82264b2b
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModel.java
@@ -0,0 +1,109 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.HashMap;
+
+/**
+ * Significance model backed by a JSON model file, which is read once when the model is constructed.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModel implements SignificanceModel {
+
+    /** Frequency reported for words that are absent from the model file. */
+    private static final long UNKNOWN_WORD_FREQUENCY = 1;
+
+    private final long corpusSize;
+    private final HashMap<String, Long> frequencies;
+    private final Path path;
+
+    /** Java representation of a model file; JSON properties not listed here are ignored. */
+    @JsonIgnoreProperties(ignoreUnknown = true)
+    public static class SignificanceModelFile {
+        private final String version;
+        private final String id;
+        private final String description;
+        private final long corpusSize;
+        private final String language;
+        private final HashMap<String, Long> frequencies;
+        private final long tokenCount;
+
+        @JsonCreator
+        public SignificanceModelFile(
+                @JsonProperty("version") String version,
+                @JsonProperty("id") String id,
+                @JsonProperty("description") String description,
+                @JsonProperty("corpus_size") long corpusSize,
+                @JsonProperty("language") String language,
+                @JsonProperty("frequencies") HashMap<String, Long> frequencies,
+                @JsonProperty("token_count") long tokenCount) {
+            this.version = version;
+            this.id = id;
+            this.description = description;
+            this.corpusSize = corpusSize;
+            this.language = language;
+            this.frequencies = frequencies;
+            this.tokenCount = tokenCount;
+        }
+
+        @JsonProperty("version")
+        public String version() { return version; }
+
+        @JsonProperty("id")
+        public String id() { return id; }
+
+        @JsonProperty("description")
+        public String description() { return description; }
+
+        @JsonProperty("corpus_size")
+        public long corpusSize() { return corpusSize; }
+
+        @JsonProperty("language")
+        public String language() { return language; }
+
+        @JsonProperty("frequencies")
+        public HashMap<String, Long> frequencies() { return frequencies; }
+
+        @JsonProperty("token_count")
+        public long tokenCount() { return tokenCount; }
+    }
+
+    /**
+     * Loads the model stored in the given JSON file.
+     *
+     * @param path location of the model file
+     * @throws RuntimeException if the file cannot be read or parsed
+     */
+    public DefaultSignificanceModel(Path path) {
+        this.path = path;
+        SignificanceModelFile model = readModelFile(path);
+        this.corpusSize = model.corpusSize();
+        // "frequencies" may be missing from the file; an empty table keeps lookups from failing with an NPE.
+        this.frequencies = model.frequencies() != null ? model.frequencies() : new HashMap<>();
+    }
+
+    /** Parses the model file, wrapping I/O and JSON errors in an unchecked exception that names the file. */
+    private static SignificanceModelFile readModelFile(Path path) {
+        try {
+            return new ObjectMapper().readValue(path.toFile(), SignificanceModelFile.class);
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to load model from " + path, e);
+        }
+    }
+
+    /**
+     * Returns the frequency of the given word together with the corpus size of this model.
+     * Words that are not in the model are reported with a frequency of 1.
+     */
+    @Override
+    public DocumentFrequency documentFrequency(String word) {
+        return new DocumentFrequency(frequencies.getOrDefault(word, UNKNOWN_WORD_FREQUENCY), corpusSize);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModelRegistry.java
new file mode 100644
index 00000000000..59a50e2c36a
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModelRegistry.java
@@ -0,0 +1,79 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.component.annotation.Inject;
+import com.yahoo.language.Language;
+import com.yahoo.search.significance.config.SignificanceConfig;
+
+import java.nio.file.Path;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Supplier;
+
+import static com.yahoo.yolean.Exceptions.uncheck;
+
+/**
+ * Default implementation of {@link SignificanceModelRegistry}.
+ * All models are loaded when the registry is constructed and are then served from memory.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
+
+    private final Map<Language, SignificanceModel> models;
+
+    @Inject
+    public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
+
+    private DefaultSignificanceModelRegistry(Builder b) { this.models = loadModels(b.models); }
+
+    /** Creates a registry holding the models read from the given language-to-file mapping. */
+    public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) { this.models = loadModels(map); }
+
+    /** Returns the model registered for the given language, or null if there is none. */
+    @Override
+    public SignificanceModel getModel(Language language) {
+        return models.get(language);
+    }
+
+    /** Reads the model file of every language in the mapping; shared by all constructors. */
+    private static Map<Language, SignificanceModel> loadModels(Map<Language, Path> paths) {
+        return withContextClassloader(() -> {
+            var loaded = new EnumMap<Language, SignificanceModel>(Language.class);
+            paths.forEach((language, path) -> loaded.put(language, uncheck(() -> new DefaultSignificanceModel(path))));
+            return loaded;
+        });
+    }
+
+    /** Runs the supplier with the context class loader set to the one that loaded {@link SignificanceModel}. */
+    private static <R> R withContextClassloader(Supplier<R> r) {
+        var original = Thread.currentThread().getContextClassLoader();
+        Thread.currentThread().setContextClassLoader(SignificanceModel.class.getClassLoader());
+        try {
+            return r.get();
+        } finally {
+            // Restore the previous class loader also when loading fails
+            Thread.currentThread().setContextClassLoader(original);
+        }
+    }
+
+    /** Collects language-to-model-file mappings, from config and/or explicit {@link #addModel} calls. */
+    public static final class Builder {
+
+        private final Map<Language, Path> models = new EnumMap<>(Language.class);
+
+        public Builder() {}
+
+        public Builder(SignificanceConfig cfg) {
+            for (var model : cfg.model()) {
+                addModel(Language.fromLanguageTag(model.language()), model.path());
+            }
+        }
+
+        public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
+
+        public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
new file mode 100644
index 00000000000..a94beacfd64
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
@@ -0,0 +1,14 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+/**
+ * A frequency paired with a corpus size, as returned by
+ * {@link SignificanceModel#documentFrequency(String)}.
+ *
+ * @param frequency the frequency value
+ * @param corpusSize the corpus size
+ *
+ * @author MariusArhaug
+ */
+public record DocumentFrequency(long frequency, long corpusSize) {
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
new file mode 100644
index 00000000000..415eccac93b
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -0,0 +1,9 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+/** Provides the {@link DocumentFrequency} of individual words.
+ * @author MariusArhaug
+ */
+public interface SignificanceModel {
+    DocumentFrequency documentFrequency(String word);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
new file mode 100644
index 00000000000..d7f0aac9949
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
@@ -0,0 +1,16 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.language.Language;
+
+/**
+ * Gives access to significance models by language.
+ *
+ * @author MariusArhaug
+ */
+public interface SignificanceModelRegistry {
+
+    /** Returns the significance model for the given language. */
+    SignificanceModel getModel(Language language);
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
new file mode 100644
index 00000000000..698d507c7e8
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
@@ -0,0 +1,40 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+/**
+ * Checks that the registry loads one model per registered language and hands back the right one.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelRegistryTest {
+
+    @Test
+    public void testDefaultSignificanceModelRegistry() {
+        HashMap<Language, Path> modelFiles = new HashMap<>();
+        modelFiles.put(Language.ENGLISH, Path.of("src/test/models/en.json"));
+        modelFiles.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json"));
+
+        DefaultSignificanceModelRegistry registry = new DefaultSignificanceModelRegistry(modelFiles);
+
+        var englishModel = registry.getModel(Language.ENGLISH);
+        var norwegianModel = registry.getModel(Language.NORWEGIAN_BOKMAL);
+
+        assertNotNull(englishModel);
+        assertNotNull(norwegianModel);
+
+        assertEquals(2, englishModel.documentFrequency("test").frequency());
+        assertEquals(10, englishModel.documentFrequency("test").corpusSize());
+
+        assertEquals(3, norwegianModel.documentFrequency("nei").frequency());
+        assertEquals(20, norwegianModel.documentFrequency("nei").corpusSize());
+    }
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java
new file mode 100644
index 00000000000..38ca24855f8
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java
@@ -0,0 +1,29 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import org.junit.Test;
+
+import java.nio.file.Path;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Checks frequency and corpus size lookups against the en.json test model, for known and unknown words.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelTest {
+
+    @Test
+    public void testDocumentFrequency() {
+        DefaultSignificanceModel significanceModel = new DefaultSignificanceModel(Path.of("src/test/models/en.json"));
+        assertEquals(2, significanceModel.documentFrequency("test").frequency());
+        assertEquals(10, significanceModel.documentFrequency("test").corpusSize());
+
+        assertEquals(3, significanceModel.documentFrequency("hello").frequency());
+        assertEquals(10, significanceModel.documentFrequency("hello").corpusSize());
+
+        assertEquals(1, significanceModel.documentFrequency("non-existent-word").frequency());
+        assertEquals(10, significanceModel.documentFrequency("non-existent-word").corpusSize());
+    }
+}
diff --git a/linguistics/src/test/models/en.json b/linguistics/src/test/models/en.json
new file mode 100644
index 00000000000..3fb1f58e8df
--- /dev/null
+++ b/linguistics/src/test/models/en.json
@@ -0,0 +1,14 @@
+{
+  "version" : "1.0",
+  "id" : "test::1",
+  "description" : "desc",
+  "corpus_size" : 10,
+  "language" : "en",
+  "frequencies" : {
+    "usa" : 2,
+    "hello": 3,
+    "world": 5,
+    "test": 2
+  },
+  "token_count" : 4
+}
diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json
new file mode 100644
index 00000000000..6e5cd6cd7d5
--- /dev/null
+++ b/linguistics/src/test/models/no.json
@@ -0,0 +1,17 @@
+{
+  "version" : "1.0",
+  "id" : "test::2",
+  "description" : "norsk beskrivelse",
+  "corpus_size" : 20,
+  "language" : "nb",
+  "frequencies" : {
+    "usa" : 2,
+    "hello": 10,
+    "verden": 5,
+    "test": 2,
+    "norge": 11,
+    "ja": 12,
+    "nei": 3
+  },
+  "token_count" : 4
+}
-- 
cgit v1.2.3