diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process/Segmenter.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/Segmenter.java | 29 |
1 files changed, 29 insertions, 0 deletions
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;

import com.yahoo.language.Language;

import java.util.List;

/**
 * <p>Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
 * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
 * processing).</p>
 *
 * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
 */
public interface Segmenter {

    // NOTE(review): the earlier wording of this Javadoc described the returned tokens as being "in unprocessed form
    // (i.e. lowercased, normalized and stemmed if applicable)", which contradicts itself. Confirm against an
    // implementation whether tokens are returned as-is or processed, then state that explicitly in the Javadoc.

    /**
     * Splits the input string into tokens and returns the tokens as a list of strings. It is assumed that the input
     * only contains word characters; any punctuation and spacing tokens will be removed.
     *
     * <p>See {@link StemMode} for the list of stemming options.</p>
     *
     * @param input the text to segment.
     * @param language language of input text.
     * @return the list of segments.
     * @throws ProcessingException if an exception is encountered during processing
     */
    List<String> segment(String input, Language language);

}