summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java14
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java12
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java13
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java96
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java69
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/package-info.java7
6 files changed, 211 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
new file mode 100644
index 00000000000..ff6de32fdaf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
@@ -0,0 +1,14 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+
+/**
+ * Holds a frequency count together with a corpus size.
+ * @author MariusArhaug
+ */
+
+@Beta
+public record DocumentFrequency(long frequency, long corpusSize) {
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
new file mode 100644
index 00000000000..a9f1e48af62
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -0,0 +1,12 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+
+/** Looks up the {@link DocumentFrequency} of a word.
+ * @author MariusArhaug
+ */
+@Beta
+public interface SignificanceModel {
+ DocumentFrequency documentFrequency(String word);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
new file mode 100644
index 00000000000..6d8dcc00e0a
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
@@ -0,0 +1,13 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+import com.yahoo.language.Language;
+
+/** Provides the {@link SignificanceModel} to use for a given {@link Language}.
+ * @author MariusArhaug
+ */
+@Beta
+public interface SignificanceModelRegistry {
+ SignificanceModel getModel(Language language);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
new file mode 100644
index 00000000000..7ed6f442610
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
@@ -0,0 +1,96 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.yahoo.language.significance.DocumentFrequency;
+import com.yahoo.language.significance.SignificanceModel;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+
+/**
+ * A {@link SignificanceModel} whose word frequencies and corpus size are read from a JSON model file.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModel implements SignificanceModel {
+    private final long corpusSize;
+    private final HashMap<String, Long> frequencies;
+    private final Path path;
+
+    /** The JSON layout of a model file as bound by Jackson; unknown properties are ignored. */
+    @JsonIgnoreProperties(ignoreUnknown = true)
+    public static class SignificanceModelFile {
+        private final String version;
+        private final String id;
+        private final String description;
+        private final long corpusSize;
+        private final String language;
+        private final long wordCount;
+        private final HashMap<String, Long> frequencies;
+
+        @JsonCreator
+        public SignificanceModelFile(
+                @JsonProperty("version") String version,
+                @JsonProperty("id") String id,
+                @JsonProperty("description") String description,
+                @JsonProperty("corpus-size") long corpusSize,
+                @JsonProperty("language") String language,
+                @JsonProperty("word-count") long wordCount,
+                @JsonProperty("frequencies") HashMap<String, Long> frequencies) {
+            this.version = version;
+            this.id = id;
+            this.description = description;
+            this.corpusSize = corpusSize;
+            this.language = language;
+            this.wordCount = wordCount;
+            this.frequencies = frequencies;
+        }
+
+        @JsonProperty("version")
+        public String version() { return version; }
+
+        @JsonProperty("id")
+        public String id() { return id; }
+
+        @JsonProperty("description")
+        public String description() { return description; }
+
+        @JsonProperty("corpus-size")
+        public long corpusSize() { return corpusSize; }
+
+        @JsonProperty("language")
+        public String language() { return language; }
+
+        @JsonProperty("frequencies")
+        public HashMap<String, Long> frequencies() { return frequencies; }
+
+        @JsonProperty("word-count")
+        public long wordCount() { return wordCount; }
+    }
+
+    /** Loads the model from the JSON file at the given path; throws RuntimeException if it cannot be read or has no frequencies. */
+    public DefaultSignificanceModel(Path path) {
+        this.path = path;
+        try {
+            SignificanceModelFile model = new ObjectMapper().readValue(path.toFile(), SignificanceModelFile.class);
+            if (model.frequencies == null) { // fail at load time rather than with an NPE on the first lookup
+                throw new IllegalArgumentException("Model file has no 'frequencies' entry");
+            }
+            this.corpusSize = model.corpusSize;
+            this.frequencies = model.frequencies;
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to load model from " + path, e);
+        }
+    }
+
+    /** Returns the frequency stored for the word, or a frequency of 1 if the model has no count for it. */
+    @Override
+    public DocumentFrequency documentFrequency(String word) {
+        Long frequency = frequencies.get(word); // single lookup; null when the word is absent or mapped to null
+        return new DocumentFrequency(frequency != null ? frequency : 1, corpusSize);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
new file mode 100644
index 00000000000..d44eab39cdf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -0,0 +1,69 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.yahoo.component.annotation.Inject;
+import com.yahoo.language.Language;
+import com.yahoo.language.significance.SignificanceModel;
+import com.yahoo.language.significance.SignificanceModelRegistry;
+import com.yahoo.search.significance.config.SignificanceConfig;
+
+import java.nio.file.Path;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Supplier;
+
+import static com.yahoo.yolean.Exceptions.uncheck;
+/**
+ * Default implementation of {@link SignificanceModelRegistry}.
+ * All configured models are loaded eagerly when the registry is constructed and kept in memory.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
+
+    private final Map<Language, SignificanceModel> models;
+
+    /** Creates a registry holding the models listed in the given config. */
+    @Inject
+    public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
+    private DefaultSignificanceModelRegistry(Builder b) { this.models = loadModels(b.models); }
+
+    /** Creates a registry holding one model per entry, loaded from the mapped file path. */
+    public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) { this.models = loadModels(map); }
+
+    /** Loads the model file of every language up front; shared by all constructors. */
+    private static Map<Language, SignificanceModel> loadModels(Map<Language, Path> paths) {
+        Map<Language, SignificanceModel> loaded = new EnumMap<>(Language.class);
+        paths.forEach((language, path) -> loaded.put(language, uncheck(() -> new DefaultSignificanceModel(path))));
+        return loaded;
+    }
+
+    /**
+     * Returns the model registered for the given language.
+     *
+     * @throws IllegalArgumentException if no model is registered for the language
+     */
+    @Override
+    public SignificanceModel getModel(Language language) throws IllegalArgumentException {
+        SignificanceModel model = models.get(language);
+        if (model == null) {
+            throw new IllegalArgumentException("No model for language " + language);
+        }
+        return model;
+    }
+
+    /** Collects language-to-model-file mappings from which a registry is built. */
+    public static final class Builder {
+        private final Map<Language, Path> models = new EnumMap<>(Language.class);
+
+        public Builder() {}
+        public Builder(SignificanceConfig cfg) {
+            for (var model : cfg.model()) {
+                addModel(Language.fromLanguageTag(model.language()), model.path());
+            }
+        }
+        public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
+        public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/package-info.java b/linguistics/src/main/java/com/yahoo/language/significance/package-info.java
new file mode 100644
index 00000000000..5c2f773452f
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/package-info.java
@@ -0,0 +1,7 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;