diff options
Diffstat (limited to 'linguistics')
3 files changed, 17 insertions, 13 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java index 7e7ee44bf74..b82450bc443 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java @@ -6,22 +6,23 @@ import com.yahoo.language.Language; import java.util.List; /** - * Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a - * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK - * processing). + * A segmenter splits a string into separate segments (such as words) without applying any further + * processing (such as stemming) on each segment. + * + * This is useful when token processing should be done separately from segmentation, such as in + * linguistic processing of queries, where token processing depends on field settings in a specific + * schema, while segmentation only depends on language and happens before schema-specific processing. * * @author Mathias Mølster Lidal */ public interface Segmenter { /** - * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized - * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only - * contains word-characters, any punctuation and spacing tokens will be removed. + * Returns a list of segments produced from a string. * - * @param input the text to segment. - * @param language language of input text. - * @return the list of segments. + * @param input the text to segment + * @param language the language of the input text + * @return the resulting list of segments * @throws ProcessingException if an exception is encountered during processing */ List<String> segment(String input, Language language); diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java index b62754ac8ad..9b7cbae834a 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java @@ -17,15 +17,14 @@ import java.util.HashMap; public class DocumentFrequencyFile { private final String description; - private final int documentCount; - + private final long documentCount; private final HashMap<String, Long> frequencies; @JsonCreator public DocumentFrequencyFile( @JsonProperty("description") String description, - @JsonProperty("document-count") int documentCount, + @JsonProperty("document-count") long documentCount, @JsonProperty("document-frequencies") HashMap<String, Long> frequencies) { this.description = description; this.documentCount = documentCount; @@ -36,7 +35,7 @@ public class DocumentFrequencyFile { public String description() { return description; } @JsonProperty("document-count") - public int documentCount() { return documentCount; } + public long documentCount() { return documentCount; } @JsonProperty("document-frequencies") public HashMap<String, Long> frequencies() { return frequencies; } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java index 902613379f0..94030108671 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java @@ -45,4 +45,8 @@ public class SignificanceModelFile { @JsonProperty("languages") public HashMap<String, DocumentFrequencyFile> languages() { return languages; } + + public void addLanguage(String language, DocumentFrequencyFile documentFrequencyFile) { + languages.put(language, documentFrequencyFile); + } } |