diff options
Diffstat (limited to 'linguistics/src/main/java/com')
6 files changed, 211 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
new file mode 100644
index 00000000000..ff6de32fdaf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/DocumentFrequency.java
@@ -0,0 +1,17 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+
+/**
+ * Holds a frequency value together with a corpus size.
+ *
+ * @param frequency  the frequency value
+ * @param corpusSize the corpus size reported alongside the frequency
+ * @author MariusArhaug
+ */
+
+@Beta
+public record DocumentFrequency(long frequency, long corpusSize) {
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
new file mode 100644
index 00000000000..a9f1e48af62
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -0,0 +1,20 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+
+/**
+ * Maps a word to its {@link DocumentFrequency}.
+ *
+ * @author MariusArhaug
+ */
+@Beta
+public interface SignificanceModel {
+    /**
+     * Returns the document frequency for the given word.
+     *
+     * @param word the word to look up
+     * @return the {@link DocumentFrequency} for {@code word}
+     */
+    DocumentFrequency documentFrequency(String word);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
new file mode 100644
index 00000000000..6d8dcc00e0a
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModelRegistry.java
@@ -0,0 +1,13 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.Beta;
+import com.yahoo.language.Language;
+
+/**
+ * @author MariusArhaug
+ */
+@Beta
+public interface SignificanceModelRegistry {
+    SignificanceModel getModel(Language language);
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
new file mode 100644
index 00000000000..7ed6f442610
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
@@ -0,0 +1,122 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.yahoo.language.significance.DocumentFrequency;
+import com.yahoo.language.significance.SignificanceModel;
+
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A {@link SignificanceModel} whose word frequencies are read from a JSON model file at construction time.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModel implements SignificanceModel {
+
+    /** JSON mapper created once and reused for every model file that is loaded. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    /** Frequency reported for a word that has no usable entry in the model file. */
+    private static final long UNKNOWN_WORD_FREQUENCY = 1;
+
+    private final long corpusSize;
+    private final Map<String, Long> frequencies;
+    private final Path path;
+
+    /** JSON binding for the contents of a model file; properties not declared here are ignored. */
+    @JsonIgnoreProperties(ignoreUnknown = true)
+    public static class SignificanceModelFile {
+        private final String version;
+        private final String id;
+        private final String description;
+        private final long corpusSize;
+        private final String language;
+
+        private final long wordCount;
+        private final HashMap<String, Long> frequencies;
+
+        @JsonCreator
+        public SignificanceModelFile(
+                @JsonProperty("version") String version,
+                @JsonProperty("id") String id,
+                @JsonProperty("description") String description,
+                @JsonProperty("corpus-size") long corpusSize,
+                @JsonProperty("language") String language,
+                @JsonProperty("word-count") long wordCount,
+                @JsonProperty("frequencies") HashMap<String, Long> frequencies) {
+            this.version = version;
+            this.id = id;
+            this.description = description;
+            this.corpusSize = corpusSize;
+            this.language = language;
+            this.wordCount = wordCount;
+            this.frequencies = frequencies;
+        }
+
+        @JsonProperty("version")
+        public String version() { return version; }
+
+        @JsonProperty("id")
+        public String id() { return id; }
+
+        @JsonProperty("description")
+        public String description() { return description; }
+
+        @JsonProperty("corpus-size")
+        public long corpusSize() { return corpusSize; }
+
+        @JsonProperty("language")
+        public String language() { return language; }
+
+        @JsonProperty("frequencies")
+        public HashMap<String, Long> frequencies() { return frequencies; }
+
+        @JsonProperty("word-count")
+        public long wordCount() { return wordCount; }
+
+    }
+
+    /**
+     * Loads the model stored at the given path.
+     *
+     * @param path location of the JSON model file
+     * @throws RuntimeException if the file cannot be read or parsed, or if it lacks a "frequencies" entry
+     */
+    public DefaultSignificanceModel(Path path) {
+        this.path = path;
+
+        try {
+            SignificanceModelFile model = MAPPER.readValue(this.path.toFile(), SignificanceModelFile.class);
+            // Reject a file without frequencies here rather than failing on the first lookup.
+            if (model.frequencies() == null) {
+                throw new IllegalArgumentException("Model file has no 'frequencies' entry");
+            }
+            this.corpusSize = model.corpusSize();
+            this.frequencies = model.frequencies();
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to load model from " + path, e);
+        }
+    }
+
+    /**
+     * Returns the frequency stored for the word, paired with the corpus size of this model.
+     * A word without a usable entry is reported with a frequency of 1.
+     *
+     * @param word the word to look up
+     * @return the word's frequency and this model's corpus size
+     */
+    @Override
+    public DocumentFrequency documentFrequency(String word) {
+        // One lookup instead of containsKey + get; it also covers a word mapped to JSON null,
+        // which would otherwise throw when the boxed value is unboxed.
+        Long frequency = frequencies.get(word);
+        long effective = (frequency != null) ? frequency : UNKNOWN_WORD_FREQUENCY;
+        return new DocumentFrequency(effective, corpusSize);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
new file mode 100644
index 00000000000..d44eab39cdf
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -0,0 +1,69 @@
+// 
Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.yahoo.component.annotation.Inject;
+import com.yahoo.language.Language;
+import com.yahoo.language.significance.SignificanceModel;
+import com.yahoo.language.significance.SignificanceModelRegistry;
+import com.yahoo.search.significance.config.SignificanceConfig;
+
+import java.nio.file.Path;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Supplier;
+
+import static com.yahoo.yolean.Exceptions.uncheck;
+
+/**
+ * Default implementation of {@link SignificanceModelRegistry}.
+ * All models are loaded when the registry is constructed and are kept for its lifetime.
+ *
+ * @author MariusArhaug
+ */
+public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
+
+    private final Map<Language, SignificanceModel> models;
+
+    /** Creates a registry holding the models listed in the given config. */
+    @Inject
+    public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
+
+    private DefaultSignificanceModelRegistry(Builder b) { this.models = load(b.models); }
+
+    /** Creates a registry with one model per map entry, loaded from that entry's path. */
+    public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) { this.models = load(map); }
+
+    /** Loads a model from every path; shared by the constructors so they cannot drift apart. */
+    private static Map<Language, SignificanceModel> load(Map<Language, Path> paths) {
+        Map<Language, SignificanceModel> loaded = new EnumMap<>(Language.class);
+        paths.forEach((language, path) -> loaded.put(language, uncheck(() -> new DefaultSignificanceModel(path))));
+        return loaded;
+    }
+
+    /** Returns the model for the language; throws IllegalArgumentException if none is registered. */
+    @Override
+    public SignificanceModel getModel(Language language) throws IllegalArgumentException {
+        // One lookup: every stored value comes from a constructor call, so null means "absent".
+        SignificanceModel model = models.get(language);
+        if (model == null) {
+            throw new IllegalArgumentException("No model for language " + language);
+        }
+        return model;
+    }
+
+    /** Collects the language-to-path mappings that a registry is built from. */
+    public static final class Builder {
+        private final Map<Language, Path> models = new EnumMap<>(Language.class);
+
+        public Builder() {}
+        public Builder(SignificanceConfig cfg) {
+            for (var model : cfg.model()) {
+                addModel(Language.fromLanguageTag(model.language()), model.path());
+            }
+        }
+
+        public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
+        public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/package-info.java b/linguistics/src/main/java/com/yahoo/language/significance/package-info.java
new file mode 100644
index 00000000000..5c2f773452f
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/package-info.java
@@ -0,0 +1,7 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.language.significance;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;