diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language')
7 files changed, 154 insertions, 101 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java index 7e7ee44bf74..b82450bc443 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java @@ -6,22 +6,23 @@ import com.yahoo.language.Language; import java.util.List; /** - * Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a - * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK - * processing). + * A segmenter splits a string into separate segments (such as words) without applying any further + * processing (such as stemming) on each segment. + * + * This is useful when token processing should be done separately from segmentation, such as in + * linguistic processing of queries, where token processing depends on field settings in a specific + * schema, while segmentation only depends on language and happens before schema-specific processing. * * @author Mathias Mølster Lidal */ public interface Segmenter { /** - * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized - * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only - * contains word-characters, any punctuation and spacing tokens will be removed. + * Returns a list of segments produced from a string. * - * @param input the text to segment. - * @param language language of input text. - * @return the list of segments. + * @param input the text to segment + * @param language the language of the input text + * @return the resulting list of segments * @throws ProcessingException if an exception is encountered during processing */ List<String> segment(String input, Language language); diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java index a9f1e48af62..c8a31e1892c 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java @@ -9,4 +9,6 @@ import com.yahoo.api.annotations.Beta; @Beta public interface SignificanceModel { DocumentFrequency documentFrequency(String word); + + String getId(); } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java index 7ed6f442610..6e024c3025e 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java @@ -1,15 +1,15 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.significance.impl; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; -import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.language.significance.DocumentFrequency; import com.yahoo.language.significance.SignificanceModel; +import java.io.IOException; import java.nio.file.Path; import java.util.HashMap; +import java.util.Map; +import java.util.TreeMap; /** * @@ -17,71 +17,23 @@ import java.util.HashMap; */ public class DefaultSignificanceModel implements SignificanceModel { private final long corpusSize; - private final HashMap<String, Long> frequencies; - private final Path path; + private final Map<String, Long> frequencies; - @JsonIgnoreProperties(ignoreUnknown = true) - public static class SignificanceModelFile { - private final String version; - private final String id; - private final String description; - private final long corpusSize; - private final String language; - - private final long wordCount; - private final HashMap<String, Long> frequencies; - - @JsonCreator - public SignificanceModelFile( - @JsonProperty("version") String version, - @JsonProperty("id") String id, - @JsonProperty("description") String description, - @JsonProperty("corpus-size") long corpusSize, - @JsonProperty("language") String language, - @JsonProperty("word-count") long wordCount, - @JsonProperty("frequencies") HashMap<String, Long> frequencies) { - this.version = version; - this.id = id; - this.description = description; - this.corpusSize = corpusSize; - this.language = language; - this.wordCount = wordCount; - this.frequencies = frequencies; - } - - @JsonProperty("version") - public String version() { return version; } - - @JsonProperty("id") - public String id() { return id; } - - @JsonProperty("description") - public String description() { return description; } - - @JsonProperty("corpus-size") - public long corpusSize() { return corpusSize; } - - @JsonProperty("language") - public String language() { return language; } - - @JsonProperty("frequencies") - public HashMap<String, Long> frequencies() { return frequencies; } - - @JsonProperty("word-count") - public long wordCount() { return wordCount; } + private String id; + public DefaultSignificanceModel(DocumentFrequencyFile file, String id) { + this.frequencies = file.frequencies(); + this.corpusSize = file.documentCount(); + this.id = id; } public DefaultSignificanceModel(Path path) { - this.path = path; - ObjectMapper objectMapper = new ObjectMapper(); - try { - SignificanceModelFile model = objectMapper.readValue(this.path.toFile(), SignificanceModelFile.class); - this.corpusSize = model.corpusSize; - this.frequencies = model.frequencies; - } catch (Exception e) { + var file = objectMapper.readValue(path.toFile(), DocumentFrequencyFile.class); + this.frequencies = file.frequencies(); + this.corpusSize = file.documentCount(); + } catch (IOException e) { throw new RuntimeException("Failed to load model from " + path, e); } } @@ -93,4 +45,10 @@ public class DefaultSignificanceModel implements SignificanceModel { } return new DocumentFrequency(1, corpusSize); } + + @Override + public String getId() { + return this.id; + } + } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java index 1be1d3f13b5..72874c15d9e 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java @@ -1,20 +1,21 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.significance.impl; +import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.component.annotation.Inject; import com.yahoo.language.Language; import com.yahoo.language.significance.SignificanceModel; import com.yahoo.language.significance.SignificanceModelRegistry; import com.yahoo.search.significance.config.SignificanceConfig; +import java.io.IOException; +import java.io.UncheckedIOException; import java.nio.file.Path; import java.util.EnumMap; -import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.function.Supplier; -import static com.yahoo.yolean.Exceptions.uncheck; /** * Default implementation of {@link SignificanceModelRegistry}. * This implementation loads models lazily and caches them. @@ -24,24 +25,35 @@ import static com.yahoo.yolean.Exceptions.uncheck; public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry { private final Map<Language, SignificanceModel> models; + @Inject - public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); } - private DefaultSignificanceModelRegistry(Builder b) { + public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this.models = new EnumMap<>(Language.class); - b.models.forEach((language, path) -> { - models.put(language, - uncheck(() -> new DefaultSignificanceModel(path))); - }); + for (var model : cfg.model()) { + addModel(model.path()); + } } - public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) { + public DefaultSignificanceModelRegistry(List<Path> models) { this.models = new EnumMap<>(Language.class); - map.forEach((language, path) -> { - models.put(language, - uncheck(() -> new DefaultSignificanceModel(path))); - }); + for (var path : models) { + addModel(path); + } } + public void addModel(Path path) { + ObjectMapper objectMapper = new ObjectMapper(); + try { + SignificanceModelFile file = objectMapper.readValue(path.toFile(), SignificanceModelFile.class); + for (var pair : file.languages().entrySet()) { + this.models.put( + Language.fromLanguageTag(pair.getKey()), + new DefaultSignificanceModel(pair.getValue(), file.id())); + } + } catch (IOException e) { + throw new UncheckedIOException("Failed to load model from " + path, e); + } + } @Override public Optional<SignificanceModel> getModel(Language language) { @@ -51,20 +63,4 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist } return Optional.of(models.get(language)); } - - - public static final class Builder { - private final Map<Language, Path> models = new EnumMap<>(Language.class); - - public Builder() {} - public Builder(SignificanceConfig cfg) { - for (var model : cfg.model()) { - addModel(Language.fromLanguageTag(model.language()), model.path()); - } - } - - public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; } - public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); } - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java new file mode 100644 index 00000000000..34e73e1b547 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java @@ -0,0 +1,44 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance.impl; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.HashMap; +import java.util.Map; +import java.util.TreeMap; + +/** + * + * @author MariusArhaug + */ +@JsonIgnoreProperties(ignoreUnknown = true) +@JsonInclude(JsonInclude.Include.NON_NULL) +public class DocumentFrequencyFile { + private final String description; + + private final long documentCount; + + private final Map<String, Long> frequencies; + + @JsonCreator + public DocumentFrequencyFile( + @JsonProperty("description") String description, + @JsonProperty("document-count") long documentCount, + @JsonProperty("document-frequencies") Map<String, Long> frequencies) { + this.description = description; + this.documentCount = documentCount; + this.frequencies = frequencies; + } + + @JsonProperty("description") + public String description() { return description; } + + @JsonProperty("document-count") + public long documentCount() { return documentCount; } + + @JsonProperty("document-frequencies") + public Map<String, Long> frequencies() { return frequencies; } +} diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java new file mode 100644 index 00000000000..94030108671 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java @@ -0,0 +1,52 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.significance.impl; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonProperty; + +import java.util.HashMap; +import java.util.List; + +/** + * + * @author MariusArhaug + */ + +@JsonIgnoreProperties(ignoreUnknown = true) +public class SignificanceModelFile { + private final String version; + private final String id; + private final String description; + + private final HashMap<String, DocumentFrequencyFile> languages; + + @JsonCreator + public SignificanceModelFile( + @JsonProperty("version") String version, + @JsonProperty("id") String id, + @JsonProperty("description") String description, + @JsonProperty("languages") HashMap<String, DocumentFrequencyFile> languages) { + this.version = version; + this.id = id; + this.description = description; + this.languages = languages; + } + + @JsonProperty("version") + public String version() { return version; } + + @JsonProperty("id") + public String id() { return id; } + + @JsonProperty("description") + public String description() { return description; } + + @JsonProperty("languages") + public HashMap<String, DocumentFrequencyFile> languages() { return languages; } + + public void addLanguage(String language, DocumentFrequencyFile documentFrequencyFile) { + languages.put(language, documentFrequencyFile); + } +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index f8d0dc83abc..662d4a807c5 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -47,7 +47,7 @@ public class SimpleDetector implements Detector { } public Language guessLanguage(String input) { - if (input == null || input.length() == 0) { + if (input == null || input.isEmpty()) { return Language.UNKNOWN; } |