1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;
import com.yahoo.language.Language;
import com.yahoo.language.simple.SimpleNormalizer;
import com.yahoo.language.simple.SimpleToken;
import com.yahoo.language.simple.SimpleTokenizer;
import org.junit.Test;
import org.mockito.Mockito;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
import static org.junit.Assert.assertEquals;
/**
 * Unit tests for {@link StemmerImpl}, covering lower-casing and normalization of stems,
 * filtering of non-indexable input, and the treatment of special tokens that carry components.
 *
 * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
 */
public class StemmerImplTestCase {

    /** Stems are expected in lower case and in composed form. */
    @Test
    public void requireThatStemIsNormalizedAndLowerCased() {
        // Upper-case input is expected back in lower case.
        assertStem("FOO", Arrays.asList("foo"));
        // 'a' followed by the combining ring above (U+030A) is expected back as the single code point U+00E5.
        assertStem("a\u030A", Arrays.asList("\u00E5"));
    }

    /** Punctuation and whitespace around the words must not show up among the stems. */
    @Test
    public void requireThatOnlyIndexableTokensAreReturned() {
        assertStem("foo. (bar)!", Arrays.asList("foo", "bar"));
    }

    /**
     * A token with components yields one stem list per component while it is not flagged as special,
     * and a single stem list for the whole token once it is flagged as special.
     */
    @Test
    public void requireThatSpecialTokensAreNotDecompounded() {
        // "c++" made up of the components "c", "p" and "p".
        SimpleToken compound = alphabeticToken("c++")
                .addComponent(alphabeticToken("c"))
                .addComponent(alphabeticToken("p"))
                .addComponent(alphabeticToken("p"));

        // The mocked tokenizer hands back the compound token regardless of the arguments it receives.
        Tokenizer mockedTokenizer = Mockito.mock(Tokenizer.class);
        Mockito.when(mockedTokenizer.tokenize(Mockito.anyString(),
                                              Mockito.<Language>any(),
                                              Mockito.<StemMode>any(),
                                              Mockito.anyBoolean()))
               .thenReturn(Arrays.<Token>asList(compound));
        Stemmer stemmer = new StemmerImpl(mockedTokenizer);

        // Not special: each component is stemmed on its own.
        compound.setSpecialToken(false);
        List<StemList> perComponent = Arrays.asList(new StemList("c"),
                                                    new StemList("p"),
                                                    new StemList("p"));
        assertEquals(perComponent, stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));

        // Special: the token is kept in one piece.
        compound.setSpecialToken(true);
        List<StemList> wholeToken = Arrays.asList(new StemList("c++"));
        assertEquals(wholeToken, stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));
    }

    /** Creates an alphabetic token whose original text and token string are both {@code text}. */
    private static SimpleToken alphabeticToken(String text) {
        return new SimpleToken(text).setType(TokenType.ALPHABETIC)
                                    .setTokenString(text);
    }

    /**
     * Stems {@code input} as English in {@link StemMode#ALL} using a {@link SimpleTokenizer} backed by a
     * {@link SimpleNormalizer}, and asserts that the first stem of each returned entry equals
     * {@code expectedStems}, in order.
     *
     * @param input         the text to stem
     * @param expectedStems the expected first stem of each returned entry
     */
    private static void assertStem(String input, List<String> expectedStems) {
        Stemmer stemmer = new StemmerImpl(new SimpleTokenizer(new SimpleNormalizer()));
        List<String> firstStems = new ArrayList<>();
        for (StemList stems : stemmer.stem(input, StemMode.ALL, Language.ENGLISH)) {
            // Only the first stem of each entry takes part in the comparison.
            firstStems.add(stems.get(0));
        }
        assertEquals(expectedStems, firstStems);
    }

}
|