aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/Segmenter.java | 19
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java | 2
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java | 80
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java | 56
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java | 44
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java | 52
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java | 2
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java | 49
-rw-r--r-- linguistics/src/test/models/docv1.json | 18
-rw-r--r-- linguistics/src/test/models/docv2.json | 31
-rw-r--r-- linguistics/src/test/models/en.json | 6
-rw-r--r-- linguistics/src/test/models/no.json | 17
12 files changed, 250 insertions, 126 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
index 7e7ee44bf74..b82450bc443 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -6,22 +6,23 @@ import com.yahoo.language.Language;
import java.util.List;
/**
- * Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
- * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
- * processing).
+ * A segmenter splits a string into separate segments (such as words) without applying any further
+ * processing (such as stemming) on each segment.
+ *
+ * This is useful when token processing should be done separately from segmentation, such as in
+ * linguistic processing of queries, where token processing depends on field settings in a specific
+ * schema, while segmentation only depends on language and happens before schema-specific processing.
*
* @author Mathias Mølster Lidal
*/
public interface Segmenter {
/**
- * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized
- * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only
- * contains word-characters, any punctuation and spacing tokens will be removed.
+ * Returns a list of segments produced from a string.
*
- * @param input the text to segment.
- * @param language language of input text.
- * @return the list of segments.
+ * @param input the text to segment
+ * @param language the language of the input text
+ * @return the resulting list of segments
* @throws ProcessingException if an exception is encountered during processing
*/
List<String> segment(String input, Language language);
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
index a9f1e48af62..c8a31e1892c 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -9,4 +9,6 @@ import com.yahoo.api.annotations.Beta;
@Beta
public interface SignificanceModel {
DocumentFrequency documentFrequency(String word);
+
+ String getId();
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
index 7ed6f442610..6e024c3025e 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
@@ -1,15 +1,15 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.significance.impl;
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
-import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.language.significance.DocumentFrequency;
import com.yahoo.language.significance.SignificanceModel;
+import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
/**
*
@@ -17,71 +17,23 @@ import java.util.HashMap;
*/
public class DefaultSignificanceModel implements SignificanceModel {
private final long corpusSize;
- private final HashMap<String, Long> frequencies;
- private final Path path;
+ private final Map<String, Long> frequencies;
- @JsonIgnoreProperties(ignoreUnknown = true)
- public static class SignificanceModelFile {
- private final String version;
- private final String id;
- private final String description;
- private final long corpusSize;
- private final String language;
-
- private final long wordCount;
- private final HashMap<String, Long> frequencies;
-
- @JsonCreator
- public SignificanceModelFile(
- @JsonProperty("version") String version,
- @JsonProperty("id") String id,
- @JsonProperty("description") String description,
- @JsonProperty("corpus-size") long corpusSize,
- @JsonProperty("language") String language,
- @JsonProperty("word-count") long wordCount,
- @JsonProperty("frequencies") HashMap<String, Long> frequencies) {
- this.version = version;
- this.id = id;
- this.description = description;
- this.corpusSize = corpusSize;
- this.language = language;
- this.wordCount = wordCount;
- this.frequencies = frequencies;
- }
-
- @JsonProperty("version")
- public String version() { return version; }
-
- @JsonProperty("id")
- public String id() { return id; }
-
- @JsonProperty("description")
- public String description() { return description; }
-
- @JsonProperty("corpus-size")
- public long corpusSize() { return corpusSize; }
-
- @JsonProperty("language")
- public String language() { return language; }
-
- @JsonProperty("frequencies")
- public HashMap<String, Long> frequencies() { return frequencies; }
-
- @JsonProperty("word-count")
- public long wordCount() { return wordCount; }
+ private String id;
+ public DefaultSignificanceModel(DocumentFrequencyFile file, String id) {
+ this.frequencies = file.frequencies();
+ this.corpusSize = file.documentCount();
+ this.id = id;
}
public DefaultSignificanceModel(Path path) {
- this.path = path;
-
ObjectMapper objectMapper = new ObjectMapper();
-
try {
- SignificanceModelFile model = objectMapper.readValue(this.path.toFile(), SignificanceModelFile.class);
- this.corpusSize = model.corpusSize;
- this.frequencies = model.frequencies;
- } catch (Exception e) {
+ var file = objectMapper.readValue(path.toFile(), DocumentFrequencyFile.class);
+ this.frequencies = file.frequencies();
+ this.corpusSize = file.documentCount();
+ } catch (IOException e) {
throw new RuntimeException("Failed to load model from " + path, e);
}
}
@@ -93,4 +45,10 @@ public class DefaultSignificanceModel implements SignificanceModel {
}
return new DocumentFrequency(1, corpusSize);
}
+
+ @Override
+ public String getId() {
+ return this.id;
+ }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
index 1be1d3f13b5..72874c15d9e 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -1,20 +1,21 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.significance.impl;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.component.annotation.Inject;
import com.yahoo.language.Language;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
import com.yahoo.search.significance.config.SignificanceConfig;
+import java.io.IOException;
+import java.io.UncheckedIOException;
import java.nio.file.Path;
import java.util.EnumMap;
-import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Optional;
-import java.util.function.Supplier;
-import static com.yahoo.yolean.Exceptions.uncheck;
/**
* Default implementation of {@link SignificanceModelRegistry}.
* This implementation loads every given model file eagerly in its constructor and keeps the loaded models in memory, keyed by language.
@@ -24,24 +25,35 @@ import static com.yahoo.yolean.Exceptions.uncheck;
public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
private final Map<Language, SignificanceModel> models;
+
@Inject
- public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
- private DefaultSignificanceModelRegistry(Builder b) {
+ public DefaultSignificanceModelRegistry(SignificanceConfig cfg) {
this.models = new EnumMap<>(Language.class);
- b.models.forEach((language, path) -> {
- models.put(language,
- uncheck(() -> new DefaultSignificanceModel(path)));
- });
+ for (var model : cfg.model()) {
+ addModel(model.path());
+ }
}
- public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) {
+ public DefaultSignificanceModelRegistry(List<Path> models) {
this.models = new EnumMap<>(Language.class);
- map.forEach((language, path) -> {
- models.put(language,
- uncheck(() -> new DefaultSignificanceModel(path)));
- });
+ for (var path : models) {
+ addModel(path);
+ }
}
+ public void addModel(Path path) {
+ ObjectMapper objectMapper = new ObjectMapper();
+ try {
+ SignificanceModelFile file = objectMapper.readValue(path.toFile(), SignificanceModelFile.class);
+ for (var pair : file.languages().entrySet()) {
+ this.models.put(
+ Language.fromLanguageTag(pair.getKey()),
+ new DefaultSignificanceModel(pair.getValue(), file.id()));
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException("Failed to load model from " + path, e);
+ }
+ }
@Override
public Optional<SignificanceModel> getModel(Language language) {
@@ -51,20 +63,4 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist
}
return Optional.of(models.get(language));
}
-
-
- public static final class Builder {
- private final Map<Language, Path> models = new EnumMap<>(Language.class);
-
- public Builder() {}
- public Builder(SignificanceConfig cfg) {
- for (var model : cfg.model()) {
- addModel(Language.fromLanguageTag(model.language()), model.path());
- }
- }
-
- public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
- public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
new file mode 100644
index 00000000000..34e73e1b547
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
@@ -0,0 +1,44 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
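+ * JSON (Jackson) binding for the document frequency data of a single language: a description, the document count and a map from word to document frequency.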
+ * @author MariusArhaug
+ */
+@JsonIgnoreProperties(ignoreUnknown = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class DocumentFrequencyFile {
+ private final String description;
+
+ private final long documentCount;
+
+ private final Map<String, Long> frequencies;
+
+ @JsonCreator
+ public DocumentFrequencyFile(
+ @JsonProperty("description") String description,
+ @JsonProperty("document-count") long documentCount,
+ @JsonProperty("document-frequencies") Map<String, Long> frequencies) {
+ this.description = description;
+ this.documentCount = documentCount;
+ this.frequencies = frequencies;
+ }
+
+ @JsonProperty("description")
+ public String description() { return description; }
+
+ @JsonProperty("document-count")
+ public long documentCount() { return documentCount; }
+
+ @JsonProperty("document-frequencies")
+ public Map<String, Long> frequencies() { return frequencies; }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
new file mode 100644
index 00000000000..94030108671
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
@@ -0,0 +1,52 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.HashMap;
+import java.util.List;
+
+/**
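+ * JSON (Jackson) binding for a significance model file: version, id, description and a map from language tag to that language's {@link DocumentFrequencyFile}.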
+ * @author MariusArhaug
+ */
+
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class SignificanceModelFile {
+ private final String version;
+ private final String id;
+ private final String description;
+
+ private final HashMap<String, DocumentFrequencyFile> languages;
+
+ @JsonCreator
+ public SignificanceModelFile(
+ @JsonProperty("version") String version,
+ @JsonProperty("id") String id,
+ @JsonProperty("description") String description,
+ @JsonProperty("languages") HashMap<String, DocumentFrequencyFile> languages) {
+ this.version = version;
+ this.id = id;
+ this.description = description;
+ this.languages = languages;
+ }
+
+ @JsonProperty("version")
+ public String version() { return version; }
+
+ @JsonProperty("id")
+ public String id() { return id; }
+
+ @JsonProperty("description")
+ public String description() { return description; }
+
+ @JsonProperty("languages")
+ public HashMap<String, DocumentFrequencyFile> languages() { return languages; }
+
+ public void addLanguage(String language, DocumentFrequencyFile documentFrequencyFile) {
+ languages.put(language, documentFrequencyFile);
+ }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index f8d0dc83abc..662d4a807c5 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -47,7 +47,7 @@ public class SimpleDetector implements Detector {
}
public Language guessLanguage(String input) {
- if (input == null || input.length() == 0) {
+ if (input == null || input.isEmpty()) {
return Language.UNKNOWN;
}
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
index d4849571b5e..e8594885b9e 100644
--- a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
@@ -6,7 +6,8 @@ import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry;
import org.junit.Test;
import java.nio.file.Path;
-import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
@@ -18,10 +19,10 @@ public class DefaultSignificanceModelRegistryTest {
@Test
public void testDefaultSignificanceModelRegistry() {
- HashMap<Language, Path> models = new HashMap<>();
+ List<Path> models = new ArrayList<>();
- models.put(Language.ENGLISH, Path.of("src/test/models/en.json"));
- models.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json"));
+ models.add(Path.of("src/test/models/docv1.json"));
+ models.add(Path.of("src/test/models/docv2.json"));
DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models);
@@ -39,6 +40,45 @@ public class DefaultSignificanceModelRegistryTest {
assertNotNull(englishModel);
assertNotNull(norwegianModel);
+ assertEquals("test::2", englishModel.getId());
+ assertEquals("test::2", norwegianModel.getId());
+
+ assertEquals(4, englishModel.documentFrequency("test").frequency());
+ assertEquals(14, englishModel.documentFrequency("test").corpusSize());
+
+ assertEquals(3, norwegianModel.documentFrequency("nei").frequency());
+ assertEquals(20, norwegianModel.documentFrequency("nei").corpusSize());
+
+ assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency());
+ assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize());
+
+ }
+
+ @Test
+ public void testDefaultSignificanceModelRegistryInOppsiteOrder() {
+
+ List<Path> models = new ArrayList<>();
+
+ models.add(Path.of("src/test/models/docv2.json"));
+ models.add(Path.of("src/test/models/docv1.json"));
+
+ DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models);
+
+ var optionalEnglishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH);
+ var optionalNorwegianModel = defaultSignificanceModelRegistry.getModel(Language.NORWEGIAN_BOKMAL);
+
+ assertTrue(optionalEnglishModel.isPresent());
+ assertTrue(optionalNorwegianModel.isPresent());
+
+ var englishModel = optionalEnglishModel.get();
+ var norwegianModel = optionalNorwegianModel.get();
+
+ assertNotNull(englishModel);
+ assertNotNull(norwegianModel);
+
+ assertEquals("test::1", englishModel.getId());
+ assertEquals("test::2", norwegianModel.getId());
+
assertEquals(2, englishModel.documentFrequency("test").frequency());
assertEquals(10, englishModel.documentFrequency("test").corpusSize());
@@ -47,6 +87,5 @@ public class DefaultSignificanceModelRegistryTest {
assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency());
assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize());
-
}
}
diff --git a/linguistics/src/test/models/docv1.json b/linguistics/src/test/models/docv1.json
new file mode 100644
index 00000000000..04010959a58
--- /dev/null
+++ b/linguistics/src/test/models/docv1.json
@@ -0,0 +1,18 @@
+{
+ "version" : "1.0",
+ "id" : "test::1",
+ "description" : "desc",
+ "languages" : {
+ "en": {
+ "description" : "english model",
+ "document-count" : 10,
+ "language" : "en",
+ "document-frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 2
+ }
+ }
+ }
+}
diff --git a/linguistics/src/test/models/docv2.json b/linguistics/src/test/models/docv2.json
new file mode 100644
index 00000000000..c00d02fb744
--- /dev/null
+++ b/linguistics/src/test/models/docv2.json
@@ -0,0 +1,31 @@
+{
+ "version" : "2.0",
+ "id" : "test::2",
+ "description" : "desc",
+ "languages" : {
+ "en": {
+ "description" : "english model",
+ "document-count" : 14,
+ "document-frequencies" : {
+ "usa" : 2,
+ "hello": 3,
+ "world": 5,
+ "test": 4,
+ "additional": 2
+ }
+ },
+ "nb": {
+ "description" : "norwegian model",
+ "document-count" : 20,
+ "document-frequencies" : {
+ "usa" : 2,
+ "hello": 10,
+ "verden": 5,
+ "test": 2,
+ "norge": 11,
+ "ja": 12,
+ "nei": 3
+ }
+ }
+ }
+}
diff --git a/linguistics/src/test/models/en.json b/linguistics/src/test/models/en.json
index 50bae5e3451..87b7b2faa08 100644
--- a/linguistics/src/test/models/en.json
+++ b/linguistics/src/test/models/en.json
@@ -1,11 +1,11 @@
{
"version" : "1.0",
"id" : "test::1",
- "description" : "desc",
- "corpus-size" : 10,
+ "description" : "english model",
+ "document-count" : 10,
"language" : "en",
"word-count" : 4,
- "frequencies" : {
+ "document-frequencies" : {
"usa" : 2,
"hello": 3,
"world": 5,
diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json
deleted file mode 100644
index 5fca8929e74..00000000000
--- a/linguistics/src/test/models/no.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
- "version" : "1.0",
- "id" : "test::2",
- "description" : "norsk beskrivelse",
- "corpus-size" : 20,
- "language" : "nb",
- "word-count" : 7,
- "frequencies" : {
- "usa" : 2,
- "hello": 10,
- "verden": 5,
- "test": 2,
- "norge": 11,
- "ja": 12,
- "nei": 3
- }
-}