summaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
authorMarius Arhaug <mariusarhaug@hotmail.com>2024-04-09 16:33:04 +0200
committerGitHub <noreply@github.com>2024-04-09 16:33:04 +0200
commit07010100192978eea266f7cb15b315b57a95438e (patch)
treea232aba9475b273058179872a7ca251b42e39d5c /linguistics
parente1f023d0e14c3351948beed1ee0af6e466581251 (diff)
parent07eedf3b30af36fc05da6c98778ecda23bd0d304 (diff)
Merge pull request #30816 from vespa-engine/marius/add-significance-model-registry
Add significance model registry to linguistics
Diffstat (limited to 'linguistics')
-rw-r--r--linguistics/abi-spec.json44
-rw-r--r--linguistics/pom.xml21
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java14
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java12
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java13
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java96
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java69
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/package-info.java7
-rw-r--r--linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java48
-rw-r--r--linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java31
-rw-r--r--linguistics/src/test/models/en.json14
-rw-r--r--linguistics/src/test/models/no.json17
12 files changed, 385 insertions, 1 deletions
diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json
index 0bd4638bb05..9f91c32cf62 100644
--- a/linguistics/abi-spec.json
+++ b/linguistics/abi-spec.json
@@ -774,5 +774,49 @@
"public abstract java.lang.String accentDrop(java.lang.String, com.yahoo.language.Language)"
],
"fields" : [ ]
+ },
+ "com.yahoo.language.significance.DocumentFrequency" : {
+ "superClass" : "java.lang.Record",
+ "interfaces" : [ ],
+ "attributes" : [
+ "public",
+ "final",
+ "record"
+ ],
+ "methods" : [
+ "public void <init>(long, long)",
+ "public final java.lang.String toString()",
+ "public final int hashCode()",
+ "public final boolean equals(java.lang.Object)",
+ "public long frequency()",
+ "public long corpusSize()"
+ ],
+ "fields" : [ ]
+ },
+ "com.yahoo.language.significance.SignificanceModel" : {
+ "superClass" : "java.lang.Object",
+ "interfaces" : [ ],
+ "attributes" : [
+ "public",
+ "interface",
+ "abstract"
+ ],
+ "methods" : [
+ "public abstract com.yahoo.language.significance.DocumentFrequency documentFrequency(java.lang.String)"
+ ],
+ "fields" : [ ]
+ },
+ "com.yahoo.language.significance.SignificanceModelRegistry" : {
+ "superClass" : "java.lang.Object",
+ "interfaces" : [ ],
+ "attributes" : [
+ "public",
+ "interface",
+ "abstract"
+ ],
+ "methods" : [
+ "public abstract com.yahoo.language.significance.SignificanceModel getModel(com.yahoo.language.Language)"
+ ],
+ "fields" : [ ]
}
} \ No newline at end of file
diff --git a/linguistics/pom.xml b/linguistics/pom.xml
index 48ea0a765a6..d07ff5d9fdb 100644
--- a/linguistics/pom.xml
+++ b/linguistics/pom.xml
@@ -56,7 +56,26 @@
<groupId>com.google.inject</groupId>
<artifactId>guice</artifactId>
<scope>provided</scope>
-
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.junit.vintage</groupId>
+ <artifactId>junit-vintage-engine</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter</artifactId>
+ <scope>test</scope>
</dependency>
</dependencies>
<build>
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
new file mode 100644
index 00000000000..ff6de32fdaf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
@@ -0,0 +1,14 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+
+/**
+ * The frequency recorded for a word, paired with the size of the corpus that frequency refers to.
+ *
+ * @param frequency  the frequency recorded for the word
+ * @param corpusSize the size of the corpus the frequency was taken from
+ * @author MariusArhaug
+ */
+
+@Beta
+public record DocumentFrequency(long frequency, long corpusSize) {
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
new file mode 100644
index 00000000000..a9f1e48af62
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -0,0 +1,12 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+
+/**
+ * A model which can be asked for the {@link DocumentFrequency} of a word.
+ *
+ * @author MariusArhaug
+ */
+@Beta
+public interface SignificanceModel {
+ /**
+  * Returns the document frequency of the given word.
+  *
+  * @param word the word to look up
+  * @return the frequency of the word together with the corpus size
+  */
+ DocumentFrequency documentFrequency(String word);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
new file mode 100644
index 00000000000..6d8dcc00e0a
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
@@ -0,0 +1,13 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+import com.yahoo.language.Language;
+
+/**
+ * A registry which hands out the {@link SignificanceModel} belonging to a {@link Language}.
+ *
+ * @author MariusArhaug
+ */
+@Beta
+public interface SignificanceModelRegistry {
+ /**
+  * Returns the significance model for the given language.
+  * NOTE(review): DefaultSignificanceModelRegistry throws IllegalArgumentException when it has no
+  * model for the language; whether every implementation must do the same is not specified here.
+  *
+  * @param language the language to get a model for
+  * @return the model for the language
+  */
+ SignificanceModel getModel(Language language);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
new file mode 100644
index 00000000000..7ed6f442610
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
@@ -0,0 +1,96 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.yahoo.language.significance.DocumentFrequency;
+import com.yahoo.language.significance.SignificanceModel;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+
+/**
+ * A {@link SignificanceModel} backed by a JSON model file.
+ * The file is read once, when the model is constructed; lookups afterwards are served from memory.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModel implements SignificanceModel {
+
+    /** Shared JSON mapper, so that loading several models does not build a new mapper each time. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    /** Frequency reported for words the model has no count for. */
+    private static final long DEFAULT_FREQUENCY = 1;
+
+    private final long corpusSize;
+    private final HashMap<String, Long> frequencies;
+    private final Path path;
+
+    /**
+     * The on-disk JSON representation of a model. Unknown JSON properties are ignored.
+     */
+    @JsonIgnoreProperties(ignoreUnknown = true)
+    public static class SignificanceModelFile {
+        private final String version;
+        private final String id;
+        private final String description;
+        private final long corpusSize;
+        private final String language;
+
+        private final long wordCount;
+        private final HashMap<String, Long> frequencies;
+
+        @JsonCreator
+        public SignificanceModelFile(
+                @JsonProperty("version") String version,
+                @JsonProperty("id") String id,
+                @JsonProperty("description") String description,
+                @JsonProperty("corpus-size") long corpusSize,
+                @JsonProperty("language") String language,
+                @JsonProperty("word-count") long wordCount,
+                @JsonProperty("frequencies") HashMap<String, Long> frequencies) {
+            this.version = version;
+            this.id = id;
+            this.description = description;
+            this.corpusSize = corpusSize;
+            this.language = language;
+            this.wordCount = wordCount;
+            this.frequencies = frequencies;
+        }
+
+        @JsonProperty("version")
+        public String version() { return version; }
+
+        @JsonProperty("id")
+        public String id() { return id; }
+
+        @JsonProperty("description")
+        public String description() { return description; }
+
+        @JsonProperty("corpus-size")
+        public long corpusSize() { return corpusSize; }
+
+        @JsonProperty("language")
+        public String language() { return language; }
+
+        @JsonProperty("frequencies")
+        public HashMap<String, Long> frequencies() { return frequencies; }
+
+        @JsonProperty("word-count")
+        public long wordCount() { return wordCount; }
+
+    }
+
+    /**
+     * Loads the model stored in the given JSON file.
+     *
+     * @param path the location of the model file
+     * @throws RuntimeException if the file cannot be read or parsed; the cause is preserved
+     */
+    public DefaultSignificanceModel(Path path) {
+        this.path = path;
+
+        try {
+            SignificanceModelFile model = MAPPER.readValue(this.path.toFile(), SignificanceModelFile.class);
+            HashMap<String, Long> loaded = model.frequencies;
+            this.corpusSize = model.corpusSize;
+            // A file without a "frequencies" object yields null here; fall back to an empty
+            // table so that lookups return the default frequency instead of throwing.
+            this.frequencies = loaded != null ? loaded : new HashMap<>();
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to load model from " + path, e);
+        }
+    }
+
+    /**
+     * Returns the frequency recorded for the word, or a frequency of 1 when the model
+     * has no count for it, together with the corpus size of this model.
+     *
+     * @param word the word to look up
+     * @return the frequency of the word and the corpus size of this model
+     */
+    @Override
+    public DocumentFrequency documentFrequency(String word) {
+        // Single lookup; a missing (or explicitly null) count falls back to the default.
+        Long count = frequencies.get(word);
+        return new DocumentFrequency(count != null ? count : DEFAULT_FREQUENCY, corpusSize);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
new file mode 100644
index 00000000000..d44eab39cdf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -0,0 +1,69 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.yahoo.component.annotation.Inject;
+import com.yahoo.language.Language;
+import com.yahoo.language.significance.SignificanceModel;
+import com.yahoo.language.significance.SignificanceModelRegistry;
+import com.yahoo.search.significance.config.SignificanceConfig;
+
+import java.nio.file.Path;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Supplier;
+
+import static com.yahoo.yolean.Exceptions.uncheck;
+/**
+ * Default implementation of {@link SignificanceModelRegistry}.
+ * All models are loaded when the registry is constructed and are kept for its lifetime.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
+
+    private final Map<Language, SignificanceModel> models;
+
+    /** Creates a registry holding the models listed in the given config. */
+    @Inject
+    public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
+
+    private DefaultSignificanceModelRegistry(Builder b) {
+        this.models = loadModels(b.models);
+    }
+
+    /**
+     * Creates a registry holding one model per entry of the given map.
+     *
+     * @param map the model file to load for each language
+     */
+    public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) {
+        this.models = loadModels(map);
+    }
+
+    /** Loads the model file of every language in the given map. */
+    private static Map<Language, SignificanceModel> loadModels(Map<Language, Path> paths) {
+        Map<Language, SignificanceModel> loaded = new EnumMap<>(Language.class);
+        paths.forEach((language, path) ->
+                loaded.put(language, uncheck(() -> new DefaultSignificanceModel(path))));
+        return loaded;
+    }
+
+    /**
+     * Returns the model registered for the given language.
+     *
+     * @param language the language to get the model for
+     * @return the model for the language
+     * @throws IllegalArgumentException if no model is registered for the language
+     */
+    @Override
+    public SignificanceModel getModel(Language language) throws IllegalArgumentException {
+        SignificanceModel model = models.get(language);
+        if (model == null) {
+            throw new IllegalArgumentException("No model for language " + language);
+        }
+        return model;
+    }
+
+    /** Collects the model file to use for each language before a registry is created. */
+    public static final class Builder {
+        private final Map<Language, Path> models = new EnumMap<>(Language.class);
+
+        public Builder() {}
+
+        /** Creates a builder pre-populated with every model listed in the given config. */
+        public Builder(SignificanceConfig cfg) {
+            for (var model : cfg.model()) {
+                addModel(Language.fromLanguageTag(model.language()), model.path());
+            }
+        }
+
+        /** Sets the model file for a language, replacing any file set for it earlier. */
+        public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
+
+        /** Creates a registry, loading every model added to this builder. */
+        public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/package-info.java b/linguistics/src/main/java/com/yahoo/language/significance/package-info.java
new file mode 100644
index 00000000000..5c2f773452f
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/package-info.java
@@ -0,0 +1,7 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
new file mode 100644
index 00000000000..d1de63a994d
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
@@ -0,0 +1,48 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry;
+import org.junit.Test;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+
+/**
+ * Verifies that {@link DefaultSignificanceModelRegistry} returns the right model per language.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelRegistryTest {
+
+    @Test
+    public void testDefaultSignificanceModelRegistry() {
+        var modelPaths = new HashMap<Language, Path>();
+        modelPaths.put(Language.ENGLISH, Path.of("src/test/models/en.json"));
+        modelPaths.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json"));
+
+        var registry = new DefaultSignificanceModelRegistry(modelPaths);
+
+        // Languages with a model file resolve to a model ...
+        var english = registry.getModel(Language.ENGLISH);
+        var norwegian = registry.getModel(Language.NORWEGIAN_BOKMAL);
+
+        // ... while a language without one is rejected.
+        assertThrows(IllegalArgumentException.class, () -> registry.getModel(Language.FRENCH));
+
+        assertNotNull(english);
+        assertNotNull(norwegian);
+
+        // Word present in the English model.
+        var englishTest = english.documentFrequency("test");
+        assertEquals(2, englishTest.frequency());
+        assertEquals(10, englishTest.corpusSize());
+
+        // Word present in the Norwegian model.
+        var norwegianNei = norwegian.documentFrequency("nei");
+        assertEquals(3, norwegianNei.frequency());
+        assertEquals(20, norwegianNei.corpusSize());
+
+        // Word absent from the Norwegian model.
+        var norwegianUnknown = norwegian.documentFrequency("non-existent-word");
+        assertEquals(1, norwegianUnknown.frequency());
+        assertEquals(20, norwegianUnknown.corpusSize());
+    }
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java
new file mode 100644
index 00000000000..137f8d4513a
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelTest.java
@@ -0,0 +1,31 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.language.significance.impl.DefaultSignificanceModel;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+
+/**
+ * Verifies the lookups of a {@link DefaultSignificanceModel} loaded from a test model file.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelTest {
+
+    @Test
+    public void testDocumentFrequency() {
+        DefaultSignificanceModel significanceModel = new DefaultSignificanceModel(Path.of("src/test/models/en.json"));
+
+        assertEquals(2, significanceModel.documentFrequency("test").frequency());
+        assertEquals(10, significanceModel.documentFrequency("test").corpusSize());
+
+        assertEquals(3, significanceModel.documentFrequency("hello").frequency());
+        assertEquals(10, significanceModel.documentFrequency("hello").corpusSize());
+
+        // Words absent from the model get a frequency of 1 and still report the model's corpus size.
+        assertEquals(1, significanceModel.documentFrequency("non-existent-word").frequency());
+        assertEquals(10, significanceModel.documentFrequency("non-existent-word").corpusSize());
+    }
+}
diff --git a/linguistics/src/test/models/en.json b/linguistics/src/test/models/en.json
new file mode 100644
index 00000000000..50bae5e3451
--- /dev/null
+++ b/linguistics/src/test/models/en.json
@@ -0,0 +1,14 @@
+{
+ "version" : "1.0",
+ "id" : "test::1",
+ "description" : "desc",
+ "corpus-size" : 10,
+ "language" : "en",
+ "word-count" : 4,
+ "frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 2
+ }
+}
diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json
new file mode 100644
index 00000000000..5fca8929e74
--- /dev/null
+++ b/linguistics/src/test/models/no.json
@@ -0,0 +1,17 @@
+{
+ "version" : "1.0",
+ "id" : "test::2",
+ "description" : "norsk beskrivelse",
+ "corpus-size" : 20,
+ "language" : "nb",
+ "word-count" : 7,
+ "frequencies" : {
+ "usa" : 2,
+ "hello": 10,
+ "verden": 5,
+ "test": 2,
+ "norge": 11,
+ "ja": 12,
+ "nei": 3
+ }
+}