diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process/Token.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/Token.java | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Token.java b/linguistics/src/main/java/com/yahoo/language/process/Token.java new file mode 100644 index 00000000000..f1dc6639e11 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Token.java @@ -0,0 +1,56 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +/** + * Interface providing access to a single token produced by the tokenizer. + * + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public interface Token { + + /** Returns the type of this token - word, space or punctuation etc. */ + TokenType getType(); + + /** Returns the original form of this token */ + String getOrig(); + + /** Returns the number of stem forms available for this token. */ + int getNumStems(); + + /** Returns the stem at position i */ + String getStem(int i); + + /** + * Returns the number of components, if this token is a compound word + * (e.g. german "kommunikationsfehler". Otherwise, return 0 + * + * @return number of components, or 0 if none + */ + int getNumComponents(); + + /** Returns a component token of this */ + Token getComponent(int i); + + /** Returns the offset position of this token */ + long getOffset(); + + /** Returns the script of this token */ + TokenScript getScript(); + + /** + * Returns token string in a form suitable for indexing: The + * most lowercased variant of the most processed token form available. + * If called on a compound token this returns a lowercased form of the + * entire word. + * + * @return token string value + */ + String getTokenString(); + + /** Returns whether this is an instance of a declared special token (e.g. c++) */ + boolean isSpecialToken(); + + /** Whether this token should be indexed */ + boolean isIndexable(); + +} |