linguistics/src/main/java/com/yahoo/language/process/Token.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;

/**
 * A single token produced by the tokenizer.
 *
 * <p>A token exposes its type, its original text, any stem forms, and - for
 * compound words - its component tokens.</p>
 *
 * @author Mathias Mølster Lidal
 */
public interface Token {

    /**
     * Returns the type of this token - word, space, punctuation etc.
     *
     * @return the token type
     */
    TokenType getType();

    /**
     * Returns the original form of this token.
     *
     * @return the original token text
     */
    String getOrig();

    /**
     * Returns the number of stem forms available for this token.
     *
     * @return the number of stems
     */
    int getNumStems();

    /**
     * Returns the stem at position i.
     *
     * @param i the position of the stem to return; presumably in the range
     *          0 to {@link #getNumStems()} - 1 (TODO confirm against implementations)
     * @return the stem at the given position
     */
    String getStem(int i);

    /**
     * Returns the number of components if this token is a compound word
     * (e.g. German "kommunikationsfehler"). Otherwise, returns 0.
     *
     * @return number of components, or 0 if none
     */
    int getNumComponents();

    /**
     * Returns a component token of this token.
     *
     * @param i the position of the component to return; presumably in the range
     *          0 to {@link #getNumComponents()} - 1 (TODO confirm against implementations)
     * @return the component token at the given position
     */
    Token getComponent(int i);

    /**
     * Returns the offset position of this token.
     *
     * <p>NOTE(review): the unit and origin of the offset (e.g. characters from the
     * start of the tokenized input) are not specified here - confirm with the tokenizer.</p>
     *
     * @return the offset of this token
     */
    long getOffset();

    /**
     * Returns the script of this token.
     *
     * @return the token script
     */
    TokenScript getScript();

    /**
     * Returns the token string in a form suitable for indexing: the
     * most lowercased variant of the most processed token form available.
     * If called on a compound token, this returns a lowercased form of the
     * entire word.
     *
     * @return token string value
     */
    String getTokenString();

    /**
     * Returns whether this is an instance of a declared special token (e.g. c++).
     *
     * @return true if this is a declared special token
     */
    boolean isSpecialToken();

    /**
     * Returns whether this token should be indexed.
     *
     * @return true if this token should be indexed
     */
    boolean isIndexable();

}