aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Segmenter.java19
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java2
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java80
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java56
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java44
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java52
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java2
7 files changed, 154 insertions, 101 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
index 7e7ee44bf74..b82450bc443 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -6,22 +6,23 @@ import com.yahoo.language.Language;
import java.util.List;
/**
- * Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
- * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
- * processing).
+ * A segmenter splits a string into separate segments (such as words) without applying any further
+ * processing (such as stemming) on each segment.
+ *
+ * This is useful when token processing should be done separately from segmentation, such as in
+ * linguistic processing of queries, where token processing depends on field settings in a specific
+ * schema, while segmentation only depends on language and happens before schema-specific processing.
*
* @author Mathias Mølster Lidal
*/
public interface Segmenter {
/**
- * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized
- * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only
- * contains word-characters, any punctuation and spacing tokens will be removed.
+ * Returns a list of segments produced from a string.
*
- * @param input the text to segment.
- * @param language language of input text.
- * @return the list of segments.
+ * @param input the text to segment
+ * @param language the language of the input text
+ * @return the resulting list of segments
* @throws ProcessingException if an exception is encountered during processing
*/
List<String> segment(String input, Language language);
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
index a9f1e48af62..c8a31e1892c 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -9,4 +9,6 @@ import com.yahoo.api.annotations.Beta;
@Beta
public interface SignificanceModel {
DocumentFrequency documentFrequency(String word);
+
+ /**
+ * Returns the id of this model.
+ * NOTE(review): presumably the id declared in the model file the model was loaded from;
+ * confirm for each implementation.
+ */
+ String getId();
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
index 7ed6f442610..6e024c3025e 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
@@ -1,15 +1,15 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.significance.impl;
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
-import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.language.significance.DocumentFrequency;
import com.yahoo.language.significance.SignificanceModel;
+import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
/**
*
@@ -17,71 +17,23 @@ import java.util.HashMap;
*/
public class DefaultSignificanceModel implements SignificanceModel {
private final long corpusSize;
- private final HashMap<String, Long> frequencies;
- private final Path path;
+ private final Map<String, Long> frequencies;
- @JsonIgnoreProperties(ignoreUnknown = true)
- public static class SignificanceModelFile {
- private final String version;
- private final String id;
- private final String description;
- private final long corpusSize;
- private final String language;
-
- private final long wordCount;
- private final HashMap<String, Long> frequencies;
-
- @JsonCreator
- public SignificanceModelFile(
- @JsonProperty("version") String version,
- @JsonProperty("id") String id,
- @JsonProperty("description") String description,
- @JsonProperty("corpus-size") long corpusSize,
- @JsonProperty("language") String language,
- @JsonProperty("word-count") long wordCount,
- @JsonProperty("frequencies") HashMap<String, Long> frequencies) {
- this.version = version;
- this.id = id;
- this.description = description;
- this.corpusSize = corpusSize;
- this.language = language;
- this.wordCount = wordCount;
- this.frequencies = frequencies;
- }
-
- @JsonProperty("version")
- public String version() { return version; }
-
- @JsonProperty("id")
- public String id() { return id; }
-
- @JsonProperty("description")
- public String description() { return description; }
-
- @JsonProperty("corpus-size")
- public long corpusSize() { return corpusSize; }
-
- @JsonProperty("language")
- public String language() { return language; }
-
- @JsonProperty("frequencies")
- public HashMap<String, Long> frequencies() { return frequencies; }
-
- @JsonProperty("word-count")
- public long wordCount() { return wordCount; }
+ private String id;
+ /**
+ * Creates a model from already parsed document frequency data.
+ *
+ * @param file the parsed data; its frequencies map is used as-is (not copied) and its
+ *             document count becomes the corpus size of this model
+ * @param id the id returned by {@link #getId()}
+ */
+ public DefaultSignificanceModel(DocumentFrequencyFile file, String id) {
+ this.frequencies = file.frequencies();
+ this.corpusSize = file.documentCount();
+ this.id = id;
}
+ /**
+ * Creates a model by reading the file at the given path as a {@link DocumentFrequencyFile}.
+ *
+ * NOTE(review): this constructor never assigns {@code id}, so {@link #getId()} returns null
+ * for models created this way.
+ *
+ * @param path the file to read
+ * @throws RuntimeException if reading or parsing the file fails with an IOException
+ */
public DefaultSignificanceModel(Path path) {
- this.path = path;
-
ObjectMapper objectMapper = new ObjectMapper();
-
try {
- SignificanceModelFile model = objectMapper.readValue(this.path.toFile(), SignificanceModelFile.class);
- this.corpusSize = model.corpusSize;
- this.frequencies = model.frequencies;
- } catch (Exception e) {
+ var file = objectMapper.readValue(path.toFile(), DocumentFrequencyFile.class);
+ this.frequencies = file.frequencies();
+ this.corpusSize = file.documentCount();
+ } catch (IOException e) {
throw new RuntimeException("Failed to load model from " + path, e);
}
}
@@ -93,4 +45,10 @@ public class DefaultSignificanceModel implements SignificanceModel {
}
return new DocumentFrequency(1, corpusSize);
}
+
+ /** Returns the id passed at construction, or null if this model was created from a path. */
+ @Override
+ public String getId() {
+ return this.id;
+ }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
index 1be1d3f13b5..72874c15d9e 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -1,20 +1,21 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.significance.impl;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.component.annotation.Inject;
import com.yahoo.language.Language;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
import com.yahoo.search.significance.config.SignificanceConfig;
+import java.io.IOException;
+import java.io.UncheckedIOException;
import java.nio.file.Path;
import java.util.EnumMap;
-import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
-import java.util.function.Supplier;
-import static com.yahoo.yolean.Exceptions.uncheck;
/**
* Default implementation of {@link SignificanceModelRegistry}.
* This implementation loads all models at construction time and keeps them in a map keyed by language.
@@ -24,24 +25,35 @@ import static com.yahoo.yolean.Exceptions.uncheck;
public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
private final Map<Language, SignificanceModel> models;
+
+ /**
+ * Creates a registry and loads every model file listed in the given config.
+ *
+ * @param cfg the config whose model entries each supply the path of a model file to load
+ */
@Inject
- public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
- private DefaultSignificanceModelRegistry(Builder b) {
+ public DefaultSignificanceModelRegistry(SignificanceConfig cfg) {
this.models = new EnumMap<>(Language.class);
- b.models.forEach((language, path) -> {
- models.put(language,
- uncheck(() -> new DefaultSignificanceModel(path)));
- });
+ for (var model : cfg.model()) {
+ addModel(model.path());
+ }
}
+ /**
+ * Creates a registry and loads every model file in the given list, in list order.
+ *
+ * @param models the paths of the model files to load
+ */
- public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) {
+ public DefaultSignificanceModelRegistry(List<Path> models) {
this.models = new EnumMap<>(Language.class);
- map.forEach((language, path) -> {
- models.put(language,
- uncheck(() -> new DefaultSignificanceModel(path)));
- });
+ for (var path : models) {
+ addModel(path);
+ }
}
+ /**
+ * Reads the file at the given path as a {@link SignificanceModelFile} and registers one
+ * {@link DefaultSignificanceModel} per language entry in it, each carrying the file's id.
+ * A model already registered for the same language is replaced.
+ *
+ * @param path the model file to load
+ * @throws UncheckedIOException if reading or parsing the file fails
+ */
+ public void addModel(Path path) {
+ ObjectMapper objectMapper = new ObjectMapper();
+ try {
+ SignificanceModelFile file = objectMapper.readValue(path.toFile(), SignificanceModelFile.class);
+ for (var pair : file.languages().entrySet()) {
+ // Each key is a language tag; the value holds that language's frequency data.
+ this.models.put(
+ Language.fromLanguageTag(pair.getKey()),
+ new DefaultSignificanceModel(pair.getValue(), file.id()));
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException("Failed to load model from " + path, e);
+ }
+ }
@Override
public Optional<SignificanceModel> getModel(Language language) {
@@ -51,20 +63,4 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist
}
return Optional.of(models.get(language));
}
-
-
- public static final class Builder {
- private final Map<Language, Path> models = new EnumMap<>(Language.class);
-
- public Builder() {}
- public Builder(SignificanceConfig cfg) {
- for (var model : cfg.model()) {
- addModel(Language.fromLanguageTag(model.language()), model.path());
- }
- }
-
- public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
- public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
new file mode 100644
index 00000000000..34e73e1b547
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
@@ -0,0 +1,44 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Jackson-mapped frequency data for a single language of a significance model file:
+ * a description, a document count and a map of document frequencies.
+ * Unknown properties are ignored when reading, and null values are left out when writing.
+ *
+ * @author MariusArhaug
+ */
+@JsonIgnoreProperties(ignoreUnknown = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class DocumentFrequencyFile {
+ private final String description;
+
+ private final long documentCount;
+
+ // NOTE(review): presumably keyed by word; confirm against DefaultSignificanceModel.documentFrequency
+ private final Map<String, Long> frequencies;
+
+ /**
+ * Creates an instance from the values of the corresponding JSON properties.
+ *
+ * @param description the value of the "description" property
+ * @param documentCount the value of the "document-count" property
+ * @param frequencies the value of the "document-frequencies" property; stored as-is, not copied
+ */
+ @JsonCreator
+ public DocumentFrequencyFile(
+ @JsonProperty("description") String description,
+ @JsonProperty("document-count") long documentCount,
+ @JsonProperty("document-frequencies") Map<String, Long> frequencies) {
+ this.description = description;
+ this.documentCount = documentCount;
+ this.frequencies = frequencies;
+ }
+
+ /** Returns the description, written as the "description" property. */
+ @JsonProperty("description")
+ public String description() { return description; }
+
+ /** Returns the document count, written as the "document-count" property. */
+ @JsonProperty("document-count")
+ public long documentCount() { return documentCount; }
+
+ /** Returns the frequencies map given at construction, written as "document-frequencies". */
+ @JsonProperty("document-frequencies")
+ public Map<String, Long> frequencies() { return frequencies; }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
new file mode 100644
index 00000000000..94030108671
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
@@ -0,0 +1,52 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.HashMap;
+import java.util.List;
+
+/**
+ * Jackson-mapped content of a significance model file: a version, an id, a description and
+ * one {@link DocumentFrequencyFile} per language key.
+ * Unknown properties are ignored when reading.
+ *
+ * @author MariusArhaug
+ */
+
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class SignificanceModelFile {
+ private final String version;
+ private final String id;
+ private final String description;
+
+ private final HashMap<String, DocumentFrequencyFile> languages;
+
+ /**
+ * Creates an instance from the values of the corresponding JSON properties.
+ *
+ * @param version the value of the "version" property
+ * @param id the value of the "id" property
+ * @param description the value of the "description" property
+ * @param languages the value of the "languages" property; stored as-is (not copied),
+ *                  or replaced by an empty map when null
+ */
+ @JsonCreator
+ public SignificanceModelFile(
+ @JsonProperty("version") String version,
+ @JsonProperty("id") String id,
+ @JsonProperty("description") String description,
+ @JsonProperty("languages") HashMap<String, DocumentFrequencyFile> languages) {
+ this.version = version;
+ this.id = id;
+ this.description = description;
+ // A missing "languages" property arrives here as null; fall back to an empty map so that
+ // languages() and addLanguage() never hit a NullPointerException.
+ this.languages = (languages != null) ? languages : new HashMap<>();
+ }
+
+ /** Returns the version, written as the "version" property. */
+ @JsonProperty("version")
+ public String version() { return version; }
+
+ /** Returns the id, written as the "id" property. */
+ @JsonProperty("id")
+ public String id() { return id; }
+
+ /** Returns the description, written as the "description" property. */
+ @JsonProperty("description")
+ public String description() { return description; }
+
+ /** Returns the live, never-null map from language key to that language's frequency data. */
+ @JsonProperty("languages")
+ public HashMap<String, DocumentFrequencyFile> languages() { return languages; }
+
+ /**
+ * Adds the frequency data for a language, replacing any entry already stored under that key.
+ *
+ * @param language the language key to store the data under
+ * @param documentFrequencyFile the frequency data for that language
+ */
+ public void addLanguage(String language, DocumentFrequencyFile documentFrequencyFile) {
+ languages.put(language, documentFrequencyFile);
+ }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index f8d0dc83abc..662d4a807c5 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -47,7 +47,7 @@ public class SimpleDetector implements Detector {
}
public Language guessLanguage(String input) {
- if (input == null || input.length() == 0) {
+ if (input == null || input.isEmpty()) {
return Language.UNKNOWN;
}