diff options
Diffstat (limited to 'linguistics')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/Segmenter.java | 19 |
1 files changed, 10 insertions, 9 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java index 7e7ee44bf74..5240737ae45 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java @@ -6,22 +6,23 @@ import com.yahoo.language.Language; import java.util.List; /** - * Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a - * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK - * processing). + * A segmenter splits a string into separate segments (such as words) without applying any further + * processing (such as stemming) on each segment. + * + * This is useful when token processing should be done separately from segmentation, such as in + * linguistic processing of queries, where token processing depends on field settings in a specific + * schema, while segmentation only depends on language and happens before schema-specific processing. * * @author Mathias Mølster Lidal */ public interface Segmenter { /** - * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized - * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only - * contains word-characters, any punctuation and spacing tokens will be removed. + * Returns a list of segments produced from a string. * - * @param input the text to segment. - * @param language language of input text. - * @return the list of segments. + * @param input the text to segment + * @param language the language of the input text + * @return the resulting list of segments * @throws ProcessingException if an exception is encountered during processing */ List<String> segment(String input, Language language); |