aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/test/java/com/yahoo/language/process
diff options
context:
space:
mode:
author: Arne Juul <arnej@verizonmedia.com> 2020-08-24 07:21:14 +0000
committer: Arne Juul <arnej@verizonmedia.com> 2020-08-24 07:23:05 +0000
commit: b720e164d9e88c33450b2bae2186e3214e4804bf (patch)
tree: b7a32c2d71c62b030ae1f8029aa7a0ae331790c9 /linguistics/src/test/java/com/yahoo/language/process
parent: ef1f0e04884a31f55011374b4fff0dcbe9fa7e30 (diff)
handle plugin tokenizer returning tokens with empty original string
Diffstat (limited to 'linguistics/src/test/java/com/yahoo/language/process')
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java 51
1 files changed, 51 insertions, 0 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
index bbe424b7f14..736568b402a 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
@@ -3,9 +3,11 @@ package com.yahoo.language.process;
import com.yahoo.language.Language;
import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleToken;
import com.yahoo.language.simple.SimpleTokenizer;
import org.junit.Test;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -36,8 +38,57 @@ public class SegmenterImplTestCase {
assertSegments("FOO BAR", Arrays.asList("FOO", "BAR"));
}
+ @Test
+ public void requireThatEmptyInputIsPreserved() {
+ assertSegments("", Arrays.asList(""));
+ }
+
// Segments the input as English using the shared SEGMENTER and asserts that the result equals expectedSegments.
private static void assertSegments(String input, List<String> expectedSegments) {
assertEquals(expectedSegments, SEGMENTER.segment(input, Language.ENGLISH));
}
+ @Test
+ public void requireThatEmptyStringsAreSuppressed() {
+     // FancyTokenizer replaces the U+00BD token with three tokens, two of which have an
+     // empty original string; the segments are still expected to be the three input words.
+     Segmenter segmenter = new SegmenterImpl(new FancyTokenizer());
+     String text = "juice \u00BD oz";
+     List<String> expected = Arrays.asList("juice", "\u00BD", "oz");
+     assertEquals(expected, segmenter.segment(text, Language.ENGLISH));
+ }
+
+ /**
+  * Test tokenizer that delegates to a SimpleTokenizer and replaces every token whose
+  * original string is U+00BD (the one-half symbol) with three tokens:
+  * "1" (NUMERIC), U+2044 FRACTION SLASH (SYMBOL) and "2" (NUMERIC).
+  * The first two replacement tokens have an empty original string; only the last one
+  * carries the original string of the replaced token. All three reuse the replaced
+  * token's script and offset. Every other token is passed through unchanged.
+  */
+ private static class FancyTokenizer implements Tokenizer {
+     private final Tokenizer backend = new SimpleTokenizer(new SimpleNormalizer());
+
+     FancyTokenizer() {}
+
+     @Override
+     public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
+         List<Token> output = new ArrayList<>();
+         for (Token token : backend.tokenize(input, language, stemMode, removeAccents)) {
+             if ("\u00BD".equals(token.getOrig())) {
+                 // emulate tokenizer turning the "1/2" symbol into three tokens ["1", "\u2044", "2"],
+                 // where only the last token keeps the original string
+                 output.add(replacementToken(token, "", "1", TokenType.NUMERIC));
+                 output.add(replacementToken(token, "", "\u2044", TokenType.SYMBOL));
+                 output.add(replacementToken(token, token.getOrig(), "2", TokenType.NUMERIC));
+             } else {
+                 output.add(token);
+             }
+         }
+         return output;
+     }
+
+     /** Builds a token with the given original string, token string and type, copying script and offset from source. */
+     private static Token replacementToken(Token source, String orig, String tokenString, TokenType type) {
+         return new SimpleToken(orig).
+                 setTokenString(tokenString).
+                 setType(type).
+                 setScript(source.getScript()).
+                 setOffset(source.getOffset());
+     }
+ }
}