summaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
author Arne Juul <arnej@verizonmedia.com> 2020-08-24 07:21:14 +0000
committer Arne Juul <arnej@verizonmedia.com> 2020-08-24 07:23:05 +0000
commit b720e164d9e88c33450b2bae2186e3214e4804bf (patch)
tree b7a32c2d71c62b030ae1f8029aa7a0ae331790c9 /linguistics
parent ef1f0e04884a31f55011374b4fff0dcbe9fa7e30 (diff)
handle plugin tokenizer returning tokens with empty original string
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java | 5
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java | 51
2 files changed, 55 insertions, 1 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
index a5e665b9444..9301e73aa5d 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java
@@ -33,7 +33,10 @@ public class SegmenterImpl implements Segmenter {
int len;
if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) {
if (token.isIndexable()) {
- out.add(token.getOrig());
+ String orig = token.getOrig();
+ if (! orig.isEmpty()) {
+ out.add(orig);
+ }
}
} else {
for (int i = 0; i < len; ++i) {
diff --git a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
index bbe424b7f14..736568b402a 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
@@ -3,9 +3,11 @@ package com.yahoo.language.process;
import com.yahoo.language.Language;
import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleToken;
import com.yahoo.language.simple.SimpleTokenizer;
import org.junit.Test;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -36,8 +38,57 @@ public class SegmenterImplTestCase {
assertSegments("FOO BAR", Arrays.asList("FOO", "BAR"));
}
+ @Test
+ public void requireThatEmptyInputIsPreserved() {
+ assertSegments("", Arrays.asList(""));
+ }
+
private static void assertSegments(String input, List<String> expectedSegments) {
assertEquals(expectedSegments, SEGMENTER.segment(input, Language.ENGLISH));
}
+ @Test
+ public void requireThatEmptyStringsAreSuppressed() {
+ Tokenizer fancyTokenizer = new FancyTokenizer();
+ Segmenter fancySegmenter = new SegmenterImpl(fancyTokenizer);
+ List<String> expectedSegments = Arrays.asList("juice", "\u00BD", "oz");
+ String input = "juice \u00BD oz";
+ assertEquals(expectedSegments, fancySegmenter.segment(input, Language.ENGLISH));
+ }
+
+ private static class FancyTokenizer implements Tokenizer {
+ private Tokenizer backend = new SimpleTokenizer(new SimpleNormalizer());
+
+ FancyTokenizer() {}
+
+ public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
+ List<Token> output = new ArrayList<>();
+ for (Token token : backend.tokenize(input, language, stemMode, removeAccents)) {
+ if ("\u00BD".equals(token.getOrig())) {
+ // emulate tokenizer turning "1/2" symbol into three tokens ["1", "/", "2"]
+ Token nt1 = new SimpleToken("").
+ setTokenString("1").
+ setType(TokenType.NUMERIC).
+ setScript(token.getScript()).
+ setOffset(token.getOffset());
+ output.add(nt1);
+ Token nt2 = new SimpleToken("").
+ setTokenString("\u2044").
+ setType(TokenType.SYMBOL).
+ setScript(token.getScript()).
+ setOffset(token.getOffset());
+ output.add(nt2);
+ Token nt3 = new SimpleToken(token.getOrig()).
+ setTokenString("2").
+ setType(TokenType.NUMERIC).
+ setScript(token.getScript()).
+ setOffset(token.getOffset());
+ output.add(nt3);
+ } else {
+ output.add(token);
+ }
+ }
+ return output;
+ }
+ }
}