blob: 73c0ac857ab445fa9465680bd799982fff23c771 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;
/**
* A single token produced by the tokenizer.
* <p>
* Gives access to the token's type, original form, stem forms, compound components,
* offset, script and the string form suitable for indexing.
*
* @author Mathias Mølster Lidal
*/
public interface Token {
/**
* Returns the type of this token - word, space or punctuation etc.
*
* @return the type of this token
*/
TokenType getType();
/**
* Returns the original form of this token.
*
* @return the original token string
*/
String getOrig();
/**
* Returns the number of stem forms available for this token.
*
* @return the number of stems retrievable through {@link #getStem(int)}
*/
int getNumStems();
/**
* Returns the stem at position i.
*
* @param i the position of the stem to return.
*          NOTE(review): presumably valid from 0 to {@code getNumStems() - 1};
*          behavior for other values is not specified here - TODO confirm
* @return the stem at the given position
*/
String getStem(int i);
/**
* Returns the number of components, if this token is a compound word
* (e.g. German "kommunikationsfehler"). Otherwise, returns 0.
*
* @return number of components, or 0 if none
*/
int getNumComponents();
/**
* Returns a component token of this.
*
* @param i the position of the component to return.
*          NOTE(review): presumably valid from 0 to {@code getNumComponents() - 1};
*          behavior for other values is not specified here - TODO confirm
* @return the component token at the given position
*/
Token getComponent(int i);
/**
* Returns the offset position of this token.
* NOTE(review): the unit and origin of the offset are not specified by this
* interface - confirm against the implementations.
*
* @return the offset position of this token
*/
long getOffset();
/**
* Returns the script of this token.
*
* @return the script of this token
*/
TokenScript getScript();
/**
* Returns token string in a form suitable for indexing: The
* most lowercased variant of the most processed token form available.
* If called on a compound token this returns a lowercased form of the
* entire word.
*
* @return token string value
*/
String getTokenString();
/**
* Returns whether this is an instance of a declared special token (e.g. c++).
*
* @return true if this token is a declared special token
*/
boolean isSpecialToken();
/**
* Returns whether this token should be indexed.
*
* @return true if this token should be indexed
*/
boolean isIndexable();
}
|