1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;
import com.yahoo.language.Language;
import com.yahoo.language.simple.SimpleNormalizer;
import com.yahoo.language.simple.SimpleToken;
import com.yahoo.language.simple.SimpleTokenizer;
import org.junit.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.assertEquals;
/**
* @author Simon Thoresen Hult
*/
public class SegmenterImplTestCase {

    /** Segmenter under test, backed by the simple tokenizer/normalizer pair. */
    private static final Segmenter SEGMENTER = new SegmenterImpl(new SimpleTokenizer(new SimpleNormalizer()));

    @Test
    public void requireThatNonIndexableCharactersAreDelimiters() {
        // Apostrophes, punctuation and whitespace all act as segment boundaries.
        assertSegments("i've", Arrays.asList("i", "ve"));
        assertSegments("foo bar. baz", Arrays.asList("foo", "bar", "baz"));
        assertSegments("1,2, 3 4", Arrays.asList("1", "2", "3", "4"));
    }

    @Test
    public void requireThatAdjacentIndexableTokenTypesAreNotSplit() {
        // Letter/digit transitions within a token do not introduce a boundary.
        assertSegments("a1,2b,c3,4d", Arrays.asList("a1", "2b", "c3", "4d"));
    }

    @Test
    public void requireThatSegmentationReturnsOriginalForm() {
        // Segments keep their surface form: no normalization, no lowercasing.
        assertSegments("a\u030A", Arrays.asList("a\u030A"));
        assertSegments("FOO BAR", Arrays.asList("FOO", "BAR"));
    }

    @Test
    public void requireThatEmptyInputIsPreserved() {
        assertSegments("", Arrays.asList(""));
    }

    /** Asserts that segmenting {@code input} as English yields exactly {@code expectedSegments}. */
    private static void assertSegments(String input, List<String> expectedSegments) {
        assertEquals(expectedSegments, SEGMENTER.segment(input, Language.ENGLISH));
    }

    @Test
    public void requireThatEmptyStringsAreSuppressed() {
        // Tokens whose original form is empty must not surface as segments.
        Tokenizer fancyTokenizer = new FancyTokenizer();
        Segmenter fancySegmenter = new SegmenterImpl(fancyTokenizer);
        List<String> expectedSegments = Arrays.asList("juice", "\u00BD", "oz");
        String input = "juice \u00BD oz";
        assertEquals(expectedSegments, fancySegmenter.segment(input, Language.ENGLISH));
    }

    /**
     * Test tokenizer that delegates to {@link SimpleTokenizer} but expands the
     * vulgar-fraction character "\u00BD" into three tokens ("1", fraction slash, "2"),
     * where only the last token carries the original form.
     */
    private static class FancyTokenizer implements Tokenizer {

        private Tokenizer backend = new SimpleTokenizer(new SimpleNormalizer());

        FancyTokenizer() {}

        public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
            List<Token> result = new ArrayList<>();
            for (Token token : backend.tokenize(input, language, stemMode, removeAccents)) {
                if ("\u00BD".equals(token.getOrig())) {
                    // Emulate a tokenizer turning the "1/2" symbol into three tokens: ["1", "/", "2"].
                    // The first two carry an empty original form; the last keeps the symbol itself.
                    result.add(derive("", "1", TokenType.NUMERIC, token));
                    result.add(derive("", "\u2044", TokenType.SYMBOL, token));
                    result.add(derive(token.getOrig(), "2", TokenType.NUMERIC, token));
                } else {
                    result.add(token);
                }
            }
            return result;
        }

        /** Builds a token with the given original form, token string and type, copying script and offset from {@code template}. */
        private static Token derive(String orig, String tokenString, TokenType type, Token template) {
            return new SimpleToken(orig).
                    setTokenString(tokenString).
                    setType(type).
                    setScript(template.getScript()).
                    setOffset(template.getOffset());
        }

    }

}
|