summary refs log tree commit diff stats
diff options
context:
space:
mode:
author MariusArhaug <mariusarhaug@hotmail.com> 2024-04-04 15:42:35 +0200
committer MariusArhaug <mariusarhaug@hotmail.com> 2024-04-04 15:54:00 +0200
commit 501f69bef60ebe61beb52ef369c158c38b976c8b (patch)
tree a9ca7b4fabe2b97fb31bfb1d642c548164958308
parent 800b4beb92627d2faad8681ae9a1f04347731c28 (diff)
add significance model registry to linguistics
-rw-r--r-- linguistics/pom.xml 17
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModel.java 93
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModelRegistry.java 79
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java 14
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java 9
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java 11
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java 40
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java 29
-rw-r--r-- linguistics/src/test/models/en.json 14
-rw-r--r-- linguistics/src/test/models/no.json 17
10 files changed, 322 insertions, 1 deletions
diff --git a/linguistics/pom.xml b/linguistics/pom.xml
index 48ea0a765a6..a358141af21 100644
--- a/linguistics/pom.xml
+++ b/linguistics/pom.xml
@@ -56,7 +56,22 @@
<groupId>com.google.inject</groupId>
<artifactId>guice</artifactId>
<scope>provided</scope>
-
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <!-- NOTE(review): none of the Java sources added in this change import from 'flags'; confirm this dependency is needed. -->
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>flags</artifactId>
+ <!-- Follow the version of this build rather than pinning a literal snapshot version. -->
+ <version>${project.version}</version>
+ <scope>compile</scope>
</dependency>
</dependencies>
<build>
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModel.java
new file mode 100644
index 00000000000..5cc82264b2b
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModel.java
@@ -0,0 +1,93 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+
+/**
+ * A {@link SignificanceModel} backed by a JSON model file which is read once, at construction time.
+ *
+ * The file is deserialized into a {@link SignificanceModelFile}; only its corpus size and its
+ * per-word frequencies are retained for lookups.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModel implements SignificanceModel {
+
+    /** Frequency reported for words which have no usable entry in the model. */
+    private static final long UNKNOWN_WORD_FREQUENCY = 1;
+
+    /** One mapper shared by all instances; it is never reconfigured after creation. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private final long corpusSize;
+    private final HashMap<String, Long> frequencies;
+
+    /**
+     * The deserialized content of a model file.
+     * JSON properties not listed here are ignored when reading.
+     */
+    @JsonIgnoreProperties(ignoreUnknown = true)
+    public static class SignificanceModelFile {
+
+        private final String version;
+        private final String id;
+        private final String description;
+        private final long corpusSize;
+        private final String language;
+        private final HashMap<String, Long> frequencies;
+        private final long tokenCount;
+
+        @JsonCreator
+        public SignificanceModelFile(
+                @JsonProperty("version") String version,
+                @JsonProperty("id") String id,
+                @JsonProperty("description") String description,
+                @JsonProperty("corpus_size") long corpusSize,
+                @JsonProperty("language") String language,
+                @JsonProperty("frequencies") HashMap<String, Long> frequencies,
+                @JsonProperty("token_count") long tokenCount) {
+            this.version = version;
+            this.id = id;
+            this.description = description;
+            this.corpusSize = corpusSize;
+            this.language = language;
+            this.frequencies = frequencies;
+            this.tokenCount = tokenCount;
+        }
+
+        @JsonProperty("version")
+        public String version() { return version; }
+
+        @JsonProperty("id")
+        public String id() { return id; }
+
+        @JsonProperty("description")
+        public String description() { return description; }
+
+        @JsonProperty("corpus_size")
+        public long corpusSize() { return corpusSize; }
+
+        @JsonProperty("language")
+        public String language() { return language; }
+
+        @JsonProperty("frequencies")
+        public HashMap<String, Long> frequencies() { return frequencies; }
+
+        @JsonProperty("token_count")
+        public long tokenCount() { return tokenCount; }
+
+    }
+
+    /**
+     * Loads the model stored in the given file.
+     *
+     * @param path the JSON model file to read
+     * @throws RuntimeException if the file cannot be read or deserialized
+     */
+    public DefaultSignificanceModel(Path path) {
+        SignificanceModelFile model = readModelFile(path);
+        this.corpusSize = model.corpusSize();
+        // A file without a "frequencies" object is treated as a model with no known words,
+        // so that lookups return the default frequency instead of failing on a null map.
+        this.frequencies = model.frequencies() != null ? model.frequencies() : new HashMap<>();
+    }
+
+    /** Deserializes the given file, wrapping any failure in an exception which names the file. */
+    private static SignificanceModelFile readModelFile(Path path) {
+        try {
+            SignificanceModelFile model = MAPPER.readValue(path.toFile(), SignificanceModelFile.class);
+            model.corpusSize(); // a file holding just JSON null yields no model: fail here, with the path in the message
+            return model;
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to load model from " + path, e);
+        }
+    }
+
+    /**
+     * Returns the frequency stored for the given word together with the corpus size of this model.
+     * Words which are absent from the model, or mapped to null in it, get the frequency 1.
+     *
+     * @param word the word to look up
+     * @return the document frequency of the word, never null
+     */
+    @Override
+    public DocumentFrequency documentFrequency(String word) {
+        // Single lookup; the null check covers both a missing key and a key mapped to null
+        Long frequency = frequencies.get(word);
+        return new DocumentFrequency(frequency != null ? frequency : UNKNOWN_WORD_FREQUENCY, corpusSize);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModelRegistry.java
new file mode 100644
index 00000000000..59a50e2c36a
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DefaultSignificanceModelRegistry.java
@@ -0,0 +1,79 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.component.annotation.Inject;
+import com.yahoo.language.Language;
+import com.yahoo.search.significance.config.SignificanceConfig;
+
+import java.nio.file.Path;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Supplier;
+
+import static com.yahoo.yolean.Exceptions.uncheck;
+/**
+ * Default implementation of {@link SignificanceModelRegistry}.
+ * All models are loaded eagerly when the registry is constructed, and are kept in memory
+ * for as long as the registry exists.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
+
+    private final Map<Language, SignificanceModel> models;
+
+    /** Creates a registry holding one model for each model entry in the given config. */
+    @Inject
+    public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
+
+    private DefaultSignificanceModelRegistry(Builder b) {
+        this.models = loadModels(b.models);
+    }
+
+    /**
+     * Creates a registry holding one model for each entry in the given map.
+     *
+     * @param map the model file to load for each language
+     */
+    public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) {
+        this.models = loadModels(map);
+    }
+
+    /**
+     * Returns the model registered for the given language.
+     *
+     * @return the model, or null if no model is registered for this language
+     */
+    @Override
+    public SignificanceModel getModel(Language language) {
+        return models.get(language);
+    }
+
+    /** Loads the model file of every entry, while the context class loader is switched. */
+    private static Map<Language, SignificanceModel> loadModels(Map<Language, Path> paths) {
+        return withContextClassloader(() -> {
+            var loaded = new EnumMap<Language, SignificanceModel>(Language.class);
+            paths.forEach((language, path) ->
+                    loaded.put(language, uncheck(() -> new DefaultSignificanceModel(path))));
+            return loaded;
+        });
+    }
+
+    /**
+     * Runs the given supplier with the class loader of {@link SignificanceModel} installed as the
+     * thread context class loader, and restores the previous one afterwards, also on failure.
+     */
+    // NOTE(review): presumably needed so that model deserialization resolves classes from this bundle - confirm
+    private static <R> R withContextClassloader(Supplier<R> r) {
+        var original = Thread.currentThread().getContextClassLoader();
+        Thread.currentThread().setContextClassLoader(SignificanceModel.class.getClassLoader());
+        try {
+            return r.get();
+        } finally {
+            Thread.currentThread().setContextClassLoader(original);
+        }
+    }
+
+    /** Collects the model file to use for each language before creating a registry. */
+    public static final class Builder {
+
+        private final Map<Language, Path> models = new EnumMap<>(Language.class);
+
+        public Builder() {}
+
+        /** Adds every model listed in the given config, keyed by the language of its language tag. */
+        public Builder(SignificanceConfig cfg) {
+            for (var model : cfg.model()) {
+                addModel(Language.fromLanguageTag(model.language()), model.path());
+            }
+        }
+
+        /** Registers the model file of a language, replacing any file registered for it earlier. */
+        public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
+
+        public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
+
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
new file mode 100644
index 00000000000..a94beacfd64
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
@@ -0,0 +1,14 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+/**
+ * A frequency count paired with the size of the corpus it belongs to.
+ *
+ * The canonical constructor, the accessors, equals, hashCode and toString are those
+ * generated for the record.
+ *
+ * @param frequency the frequency count
+ * @param corpusSize the size of the corpus the count belongs to
+ * @author MariusArhaug
+ */
+public record DocumentFrequency(long frequency, long corpusSize) {
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
new file mode 100644
index 00000000000..415eccac93b
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -0,0 +1,9 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+/**
+ * A source of {@link DocumentFrequency} values for words.
+ *
+ * @author MariusArhaug
+ */
+public interface SignificanceModel {
+ /** Returns the document frequency this model holds for the given word. */
+ DocumentFrequency documentFrequency(String word);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
new file mode 100644
index 00000000000..d7f0aac9949
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
@@ -0,0 +1,11 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.language.Language;
+
+/**
+ * A lookup from {@link Language} to {@link SignificanceModel}.
+ *
+ * @author MariusArhaug
+ */
+public interface SignificanceModelRegistry {
+ /**
+ * Returns the significance model for the given language.
+ * NOTE(review): the default implementation returns null when no model is registered for
+ * the language - confirm whether that is the intended contract for all implementations.
+ */
+ SignificanceModel getModel(Language language);
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
new file mode 100644
index 00000000000..698d507c7e8
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
@@ -0,0 +1,40 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.language.Language;
+import org.junit.Test;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+/**
+ * Verifies that a {@link DefaultSignificanceModelRegistry} created from a map of model files
+ * returns, for each language, a model answering with the content of that language's file.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelRegistryTest {
+
+    @Test
+    public void testDefaultSignificanceModelRegistry() {
+        // Declared as HashMap since that is the parameter type of the constructor under test
+        HashMap<Language, Path> modelPaths = new HashMap<>();
+        modelPaths.put(Language.ENGLISH, Path.of("src/test/models/en.json"));
+        modelPaths.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json"));
+        DefaultSignificanceModelRegistry registry = new DefaultSignificanceModelRegistry(modelPaths);
+
+        var english = registry.getModel(Language.ENGLISH);
+        var norwegian = registry.getModel(Language.NORWEGIAN_BOKMAL);
+        assertNotNull(english);
+        assertNotNull(norwegian);
+
+        // Expected values are those found in en.json
+        var englishTest = english.documentFrequency("test");
+        assertEquals(2, englishTest.frequency());
+        assertEquals(10, englishTest.corpusSize());
+
+        // Expected values are those found in no.json
+        var norwegianNei = norwegian.documentFrequency("nei");
+        assertEquals(3, norwegianNei.frequency());
+        assertEquals(20, norwegianNei.corpusSize());
+    }
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java
new file mode 100644
index 00000000000..38ca24855f8
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java
@@ -0,0 +1,29 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import org.junit.Test;
+
+import java.nio.file.Path;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Verifies the frequencies returned by a {@link DefaultSignificanceModel} loaded from
+ * the en.json test model, both for words in the model and for a word missing from it.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelTest {
+
+    @Test
+    public void testDocumentFrequency() {
+        DefaultSignificanceModel significanceModel = new DefaultSignificanceModel(Path.of("src/test/models/en.json"));
+
+        assertEquals(2, significanceModel.documentFrequency("test").frequency());
+        assertEquals(10, significanceModel.documentFrequency("test").corpusSize());
+
+        assertEquals(3, significanceModel.documentFrequency("hello").frequency());
+        assertEquals(10, significanceModel.documentFrequency("hello").corpusSize());
+
+        // A word missing from the model gets frequency 1, with the corpus size of the model
+        assertEquals(1, significanceModel.documentFrequency("non-existent-word").frequency());
+        assertEquals(10, significanceModel.documentFrequency("non-existent-word").corpusSize());
+    }
+}
diff --git a/linguistics/src/test/models/en.json b/linguistics/src/test/models/en.json
new file mode 100644
index 00000000000..3fb1f58e8df
--- /dev/null
+++ b/linguistics/src/test/models/en.json
@@ -0,0 +1,14 @@
+{
+ "version" : "1.0",
+ "id" : "test::1",
+ "description" : "desc",
+ "corpus_size" : 10,
+ "language" : "en",
+ "frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 2
+ },
+ "token_count" : 4
+}
diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json
new file mode 100644
index 00000000000..6e5cd6cd7d5
--- /dev/null
+++ b/linguistics/src/test/models/no.json
@@ -0,0 +1,17 @@
+{
+ "version" : "1.0",
+ "id" : "test::2",
+ "description" : "norsk beskrivelse",
+ "corpus_size" : 20,
+ "language" : "nb",
+ "frequencies" : {
+ "usa" : 2,
+ "hello": 10,
+ "verden": 5,
+ "test": 2,
+ "norge": 11,
+ "ja": 12,
+ "nei": 3
+ },
+ "token_count" : 4
+}