aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process/Segmenter.java')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Segmenter.java29
1 files changed, 29 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
new file mode 100644
index 00000000000..73764e06ef6
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+
+import java.util.List;
+
+/**
+ * <p>Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
+ * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
+ * processing).</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public interface Segmenter {
+
+ /**
+ * Splits the input string into tokens and returns them as a list. NOTE(review): confirm whether the tokens are
+ * unprocessed or lowercased, normalized and stemmed if applicable (see {@link StemMode} for the stemming options).
+ * The input is assumed to contain only word characters; any punctuation and spacing tokens will be removed.
+ *
+ * @param input the text to segment
+ * @param language the language of the input text
+ * @return the list of segments
+ * @throws ProcessingException if an exception is encountered during processing
+ */
+ List<String> segment(String input, Language language);
+
+}