about summary refs log tree commit diff stats
path: root/linguistics
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/Segmenter.java 19
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java 7
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java 4
3 files changed, 17 insertions, 13 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
index 7e7ee44bf74..b82450bc443 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -6,22 +6,23 @@ import com.yahoo.language.Language;
import java.util.List;
/**
- * Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
- * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
- * processing).
+ * A segmenter splits a string into separate segments (such as words) without applying any further
+ * processing (such as stemming) on each segment.
+ *
+ * This is useful when token processing should be done separately from segmentation, such as in
+ * linguistic processing of queries, where token processing depends on field settings in a specific
+ * schema, while segmentation only depends on language and happens before schema-specific processing.
*
* @author Mathias Mølster Lidal
*/
public interface Segmenter {
/**
- * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized
- * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only
- * contains word-characters, any punctuation and spacing tokens will be removed.
+ * Returns a list of segments produced from a string.
*
- * @param input the text to segment.
- * @param language language of input text.
- * @return the list of segments.
+ * @param input the text to segment
+ * @param language the language of the input text
+ * @return the resulting list of segments
* @throws ProcessingException if an exception is encountered during processing
*/
List<String> segment(String input, Language language);
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
index b62754ac8ad..9b7cbae834a 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
@@ -17,15 +17,14 @@ import java.util.HashMap;
public class DocumentFrequencyFile {
private final String description;
- private final int documentCount;
-
+ private final long documentCount;
private final HashMap<String, Long> frequencies;
@JsonCreator
public DocumentFrequencyFile(
@JsonProperty("description") String description,
- @JsonProperty("document-count") int documentCount,
+ @JsonProperty("document-count") long documentCount,
@JsonProperty("document-frequencies") HashMap<String, Long> frequencies) {
this.description = description;
this.documentCount = documentCount;
@@ -36,7 +35,7 @@ public class DocumentFrequencyFile {
public String description() { return description; }
@JsonProperty("document-count")
- public int documentCount() { return documentCount; }
+ public long documentCount() { return documentCount; }
@JsonProperty("document-frequencies")
public HashMap<String, Long> frequencies() { return frequencies; }
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
index 902613379f0..94030108671 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
@@ -45,4 +45,8 @@ public class SignificanceModelFile {
@JsonProperty("languages")
public HashMap<String, DocumentFrequencyFile> languages() { return languages; }
+
+ public void addLanguage(String language, DocumentFrequencyFile documentFrequencyFile) {
+ languages.put(language, documentFrequencyFile);
+ }
}