diff options
Diffstat (limited to 'linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java')
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java | 43 |
1 files changed, 43 insertions, 0 deletions
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;

import com.yahoo.language.Language;
import com.yahoo.language.simple.SimpleNormalizer;
import com.yahoo.language.simple.SimpleTokenizer;
import org.junit.Test;

import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link SegmenterImpl}, exercised through a {@link SimpleTokenizer} that is
 * configured with a {@link SimpleNormalizer}. Every input is segmented as
 * {@link Language#ENGLISH}.
 *
 * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
 */
public class SegmenterImplTestCase {

    /** The segmenter under test; a single instance is shared by all test methods. */
    private static final Segmenter SEGMENTER = new SegmenterImpl(new SimpleTokenizer(new SimpleNormalizer()));

    /** Apostrophes, spaces, periods and commas separate segments and are not returned themselves. */
    @Test
    public void requireThatNonIndexableCharactersAreDelimiters() {
        assertSegments("i've", "i", "ve");
        assertSegments("foo bar. baz", "foo", "bar", "baz");
        assertSegments("1,2, 3 4", "1", "2", "3", "4");
    }

    /** A letter directly followed by a digit (or the reverse) stays within one segment. */
    @Test
    public void requireThatAdjacentIndexableTokenTypesAreNotSplit() {
        assertSegments("a1,2b,c3,4d", "a1", "2b", "c3", "4d");
    }

    /** Segments are expected to match the input text exactly, not a normalized or lowercased form. */
    @Test
    public void requireThatSegmentationReturnsOriginalForm() {
        // 'a' followed by U+030A (combining ring above) must come back as the same two code points.
        assertSegments("a\u030A", "a\u030A");
        assertSegments("FOO BAR", "FOO", "BAR");
    }

    /**
     * Segments {@code input} as English and asserts that the result equals the expected segments.
     *
     * @param input            the text handed to the segmenter
     * @param expectedSegments the segments expected back, in order
     */
    private static void assertSegments(String input, String... expectedSegments) {
        List<String> expected = Arrays.asList(expectedSegments);
        List<String> actual = SEGMENTER.segment(input, Language.ENGLISH);
        assertEquals(expected, actual);
    }

}