1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenScript;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.assertEquals;
/**
 * Test helper which runs a tokenizer over an input string and compares the
 * resulting leaf tokens against expected values.
 *
 * <p>Unless changed through the fluent setters, tokenization is done with a
 * {@link SimpleLinguistics} instance, {@link Language#ENGLISH},
 * {@link StemMode#NONE} and accent dropping turned off.</p>
 *
 * @author bratseth
 */
public class TokenizerTester {

    // Settings handed to the tokenizer on each tokenize(...) call.
    private boolean accentDrop = false;
    private Language language = Language.ENGLISH;
    private Linguistics linguistics = new SimpleLinguistics();
    private StemMode stemMode = StemMode.NONE;

    /**
     * Tokenizes the input and asserts that the token strings of the leaf tokens,
     * in encounter order, equal the expected strings.
     *
     * @param input the text to tokenize
     * @param expectedTokenStrings the expected leaf token strings, in order
     */
    public void assertTokens(String input, String ... expectedTokenStrings) {
        List<String> collected = new ArrayList<>();
        for (Token root : tokenize(input)) {
            findTokenStrings(root, collected);
        }
        List<String> expected = Arrays.asList(expectedTokenStrings);
        assertEquals(expected, collected);
    }

    /**
     * Tokenizes the input and asserts that the scripts of the leaf tokens,
     * in encounter order, equal the expected scripts.
     *
     * @param input the text to tokenize
     * @param expectedTokenScripts the expected leaf token scripts, in order
     */
    public void assertTokenScripts(String input, TokenScript... expectedTokenScripts) {
        List<TokenScript> collected = new ArrayList<>();
        for (Token root : tokenize(input)) {
            findTokenScripts(root, collected);
        }
        List<TokenScript> expected = Arrays.asList(expectedTokenScripts);
        assertEquals(expected, collected);
    }

    /**
     * Appends the token strings found under the given token to {@code out}.
     * A token which reports itself as special, or which has no components,
     * contributes its own token string; any other token is expanded by
     * visiting its components in index order.
     *
     * @param token the token to inspect
     * @param out the list receiving the token strings
     * @return the same {@code out} list that was passed in
     */
    public List<String> findTokenStrings(Token token, List<String> out) {
        int componentCount = token.getNumComponents();
        boolean leaf = token.isSpecialToken() || componentCount == 0;
        if (leaf) {
            out.add(token.getTokenString());
            return out;
        }
        for (int index = 0; index < componentCount; index++) {
            findTokenStrings(token.getComponent(index), out);
        }
        return out;
    }

    /**
     * Appends the scripts found under the given token to {@code out}.
     * A token which reports itself as special, or which has no components,
     * contributes its own script; any other token is expanded by
     * visiting its components in index order.
     *
     * @param token the token to inspect
     * @param out the list receiving the scripts
     * @return the same {@code out} list that was passed in
     */
    public List<TokenScript> findTokenScripts(Token token, List<TokenScript> out) {
        int componentCount = token.getNumComponents();
        boolean leaf = token.isSpecialToken() || componentCount == 0;
        if (leaf) {
            out.add(token.getScript());
            return out;
        }
        for (int index = 0; index < componentCount; index++) {
            findTokenScripts(token.getComponent(index), out);
        }
        return out;
    }

    /**
     * Tokenizes the input with the tokenizer of the configured linguistics,
     * passing along the configured language, stem mode and accent drop flag.
     *
     * @param input the text to tokenize
     * @return the tokens produced by the tokenizer
     */
    public Iterable<Token> tokenize(String input) {
        return linguistics.getTokenizer()
                          .tokenize(input, language, stemMode, accentDrop);
    }

    /** Sets the accent drop flag passed to the tokenizer and returns this for chaining. */
    public TokenizerTester setAccentDrop(boolean accentDrop) {
        this.accentDrop = accentDrop;
        return this;
    }

    /** Sets the language passed to the tokenizer and returns this for chaining. */
    public TokenizerTester setLanguage(Language language) {
        this.language = language;
        return this;
    }

    /** Sets the linguistics instance supplying the tokenizer and returns this for chaining. */
    public TokenizerTester setLinguistics(Linguistics linguistics) {
        this.linguistics = linguistics;
        return this;
    }

    /** Sets the stem mode passed to the tokenizer and returns this for chaining. */
    public TokenizerTester setStemMode(StemMode stemMode) {
        this.stemMode = stemMode;
        return this;
    }

}
|