diff options
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java | 5 | ||||
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java | 51 |
2 files changed, 55 insertions, 1 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java index a5e665b9444..9301e73aa5d 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java +++ b/linguistics/src/main/java/com/yahoo/language/process/SegmenterImpl.java @@ -33,7 +33,10 @@ public class SegmenterImpl implements Segmenter { int len; if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) { if (token.isIndexable()) { - out.add(token.getOrig()); + String orig = token.getOrig(); + if (! orig.isEmpty()) { + out.add(orig); + } } } else { for (int i = 0; i < len; ++i) { diff --git a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java index bbe424b7f14..736568b402a 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java @@ -3,9 +3,11 @@ package com.yahoo.language.process; import com.yahoo.language.Language; import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleToken; import com.yahoo.language.simple.SimpleTokenizer; import org.junit.Test; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -36,8 +38,57 @@ public class SegmenterImplTestCase { assertSegments("FOO BAR", Arrays.asList("FOO", "BAR")); } + @Test + public void requireThatEmptyInputIsPreserved() { + assertSegments("", Arrays.asList("")); + } + private static void assertSegments(String input, List<String> expectedSegments) { assertEquals(expectedSegments, SEGMENTER.segment(input, Language.ENGLISH)); } + @Test + public void requireThatEmptyStringsAreSuppressed() { + Tokenizer fancyTokenizer = new FancyTokenizer(); + Segmenter fancySegmenter = new SegmenterImpl(fancyTokenizer); + List<String> expectedSegments = 
Arrays.asList("juice", "\u00BD", "oz"); + String input = "juice \u00BD oz"; + assertEquals(expectedSegments, fancySegmenter.segment(input, Language.ENGLISH)); + } + + private static class FancyTokenizer implements Tokenizer { + private Tokenizer backend = new SimpleTokenizer(new SimpleNormalizer()); + + FancyTokenizer() {} + + public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { + List<Token> output = new ArrayList<>(); + for (Token token : backend.tokenize(input, language, stemMode, removeAccents)) { + if ("\u00BD".equals(token.getOrig())) { + // emulate tokenizer turning "1/2" symbol into three tokens ["1", "/", "2"] + Token nt1 = new SimpleToken(""). + setTokenString("1"). + setType(TokenType.NUMERIC). + setScript(token.getScript()). + setOffset(token.getOffset()); + output.add(nt1); + Token nt2 = new SimpleToken(""). + setTokenString("\u2044"). + setType(TokenType.SYMBOL). + setScript(token.getScript()). + setOffset(token.getOffset()); + output.add(nt2); + Token nt3 = new SimpleToken(token.getOrig()). + setTokenString("2"). + setType(TokenType.NUMERIC). + setScript(token.getScript()). + setOffset(token.getOffset()); + output.add(nt3); + } else { + output.add(token); + } + } + return output; + } + } } |