summary refs log tree commit diff stats
path: root/linguistics/src/test/java/com/yahoo/language/process
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/test/java/com/yahoo/language/process')
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java | 66
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java | 150
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java | 35
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java | 27
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java | 43
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java | 73
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java | 27
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java | 68
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java | 38
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java | 233
10 files changed, 760 insertions, 0 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java
new file mode 100644
index 00000000000..3cb82572976
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java
@@ -0,0 +1,66 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.Linguistics;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Shared base for tokenizer tests: keeps the arguments passed to the tokenizer as settable state and offers
+ * helpers for comparing the flattened token strings of an input against an expected list.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public abstract class AbstractTokenizerTestCase {
+
+    private boolean accentDrop = false;
+    private Language language = Language.ENGLISH;
+    private Linguistics linguistics;
+    private StemMode stemMode = StemMode.NONE;
+
+    /**
+     * Tokenizes {@code input} with the current settings and asserts that the collected token strings equal
+     * {@code expectedTokenStrings}.
+     */
+    public void assertTokenStrings(String input, List<String> expectedTokenStrings) {
+        List<String> collected = new ArrayList<>();
+        for (Token root : tokenize(input)) {
+            findTokenStrings(root, collected);
+        }
+        assertEquals(expectedTokenStrings, collected);
+    }
+
+    /**
+     * Appends the token strings found under {@code token} to {@code out}. A special token, or a token without
+     * components, contributes its own token string; any other token is replaced by its components, recursively.
+     *
+     * @return the {@code out} list that was passed in
+     */
+    public List<String> findTokenStrings(Token token, List<String> out) {
+        int componentCount = token.getNumComponents();
+        boolean descend = !token.isSpecialToken() && componentCount != 0;
+        if (!descend) {
+            out.add(token.getTokenString());
+            return out;
+        }
+        for (int index = 0; index < componentCount; ++index) {
+            findTokenStrings(token.getComponent(index), out);
+        }
+        return out;
+    }
+
+    /** Runs the tokenizer of the configured linguistics over {@code input} using the current settings. */
+    public Iterable<Token> tokenize(String input) {
+        Tokenizer tokenizer = linguistics.getTokenizer();
+        return tokenizer.tokenize(input, language, stemMode, accentDrop);
+    }
+
+    /** Sets the accent-drop flag handed to the tokenizer; returns this for chaining. */
+    public AbstractTokenizerTestCase setAccentDrop(boolean accentDrop) {
+        this.accentDrop = accentDrop;
+        return this;
+    }
+
+    /** Sets the language handed to the tokenizer; returns this for chaining. */
+    public AbstractTokenizerTestCase setLanguage(Language language) {
+        this.language = language;
+        return this;
+    }
+
+    /** Sets the linguistics instance whose tokenizer is used; returns this for chaining. */
+    public AbstractTokenizerTestCase setLinguistics(Linguistics linguistics) {
+        this.linguistics = linguistics;
+        return this;
+    }
+
+    /** Sets the stem mode handed to the tokenizer; returns this for chaining. */
+    public AbstractTokenizerTestCase setStemMode(StemMode stemMode) {
+        this.stemMode = stemMode;
+        return this;
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
new file mode 100644
index 00000000000..8233ef1b8f0
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -0,0 +1,150 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.simple.SimpleLinguistics;
+import org.junit.Test;
+
+import java.util.Iterator;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.*;
+
+/**
+ * Tests gram splitting of text through the gram splitter supplied by {@code SimpleLinguistics}.
+ *
+ * @author bratseth
+ */
+public class GramSplitterTestCase {
+
+    private static final GramSplitter gramSplitter = new SimpleLinguistics().getGramSplitter();
+
+    /** A single word without spaces, split with gram sizes 1 through 3. */
+    @Test
+    public void testNoSpaces() {
+        String word = "engulbillesang";
+        assertGramSplit(word, 1, "[e, n, g, u, l, b, i, l, l, e, s, a, n, g]");
+        assertGramSplit(word, 2, "[en, ng, gu, ul, lb, bi, il, ll, le, es, sa, an, ng]");
+        assertGramSplit(word, 3, "[eng, ngu, gul, ulb, lbi, bil, ill, lle, les, esa, san, ang]");
+    }
+
+    /** The same letters separated by spaces, split with gram sizes 1 through 3. */
+    @Test
+    public void testWithSpaces() {
+        String phrase = "en gul bille sang";
+        assertGramSplit(phrase, 1, "[e, n, g, u, l, b, i, l, l, e, s, a, n, g]");
+        assertGramSplit(phrase, 2, "[en, gu, ul, bi, il, ll, le, sa, an, ng]");
+        assertGramSplit(phrase, 3, "[en, gul, bil, ill, lle, san, ang]");
+    }
+
+    /** Empty input and inputs no longer than the gram size. */
+    @Test
+    public void testCornerCases() {
+        String empty = "";
+        assertGramSplit(empty, 1, "[]");
+        assertGramSplit(empty, 2, "[]");
+
+        String oneLetter = "e";
+        assertGramSplit(oneLetter, 1, "[e]");
+        assertGramSplit(oneLetter, 2, "[e]");
+
+        String twoLetters = "en";
+        assertGramSplit(twoLetters, 1, "[e, n]");
+        assertGramSplit(twoLetters, 2, "[en]");
+        assertGramSplit(twoLetters, 3, "[en]");
+    }
+
+    /** Short words separated and surrounded by spaces. */
+    @Test
+    public void testSpaceCornerCases() {
+        String plain = "e en e";
+        assertGramSplit(plain, 1, "[e, e, n, e]");
+        assertGramSplit(plain, 2, "[e, en, e]");
+        assertGramSplit(plain, 3, "[e, en, e]");
+
+        String padded = " e en e ";
+        assertGramSplit(padded, 1, "[e, e, n, e]");
+        assertGramSplit(padded, 2, "[e, en, e]");
+        assertGramSplit(padded, 3, "[e, en, e]");
+
+        // NOTE(review): these three calls repeat the three above verbatim - confirm whether a differently
+        // padded input was intended here.
+        assertGramSplit(padded, 1, "[e, e, n, e]");
+        assertGramSplit(padded, 2, "[e, en, e]");
+        assertGramSplit(padded, 3, "[e, en, e]");
+
+        assertGramSplit("a b c", 4, "[a, b, c]");
+    }
+
+    /** Mixed-case input: the expected grams keep the casing of the input. */
+    @Test
+    public void testWithCasing() {
+        String title = "This is the Black Eyed Peas";
+        assertGramSplit(title, 2, "[Th, hi, is, is, th, he, Bl, la, ac, ck, Ey, ye, ed, Pe, ea, as]");
+        assertGramSplit(title, 3, "[Thi, his, is, the, Bla, lac, ack, Eye, yed, Pea, eas]");
+        assertGramSplit(title, 4, "[This, is, the, Blac, lack, Eyed, Peas]");
+        assertGramSplit(title, 5, "[This, is, the, Black, Eyed, Peas]");
+        assertGramSplit(title, 6, "[This, is, the, Black, Eyed, Peas]");
+    }
+
+    /** Input with commas and an exclamation mark: none of the expected grams contain punctuation. */
+    @Test
+    public void testWithPunctuation() {
+        String sentence = "this is, in a sense, more than the sum of parts!";
+        assertGramSplit(sentence, 2,
+                        "[th, hi, is, is, in, a, se, en, ns, se, mo, or, re, th, ha, an, th, he, su, um, of, pa, ar, rt, ts]");
+        assertGramSplit(sentence, 3,
+                        "[thi, his, is, in, a, sen, ens, nse, mor, ore, tha, han, the, sum, of, par, art, rts]");
+        assertGramSplit(sentence, 4,
+                        "[this, is, in, a, sens, ense, more, than, the, sum, of, part, arts]");
+        assertGramSplit(sentence, 5,
+                        "[this, is, in, a, sense, more, than, the, sum, of, parts]");
+        assertGramSplit(sentence, 6,
+                        "[this, is, in, a, sense, more, than, the, sum, of, parts]");
+    }
+
+    /** Accented letters appear unchanged in the expected grams; the apostrophe acts as a separator. */
+    @Test
+    public void testAccents() {
+        String french = "caf\u00e9 de l'h\u00f4tel";
+        assertGramSplit(french, 2, "[ca, af, f\u00e9, de, l, h\u00f4, \u00f4t, te, el]");
+        assertGramSplit(french, 3, "[caf, af\u00e9, de, l, h\u00f4t, \u00f4te, tel]");
+        assertGramSplit(french, 4, "[caf\u00e9, de, l, h\u00f4te, \u00f4tel]");
+        assertGramSplit(french, 5, "[caf\u00e9, de, l, h\u00f4tel]");
+        assertGramSplit(french, 6, "[caf\u00e9, de, l, h\u00f4tel]");
+    }
+
+    /** CJK input containing full-width punctuation, split with gram sizes 2 and 3. */
+    @Test
+    public void testChinese() {
+        String input = "\u77f3\u5ba4\u8a69\u58eb\u65bd\u6c0f\uff0c\u55dc\u7345\uff0c\u8a93\u98df\u5341\u7345\u3002" +
+                       "\u65bd\u6c0f\u6642\u6642\u9069\u5e02\u8996\u7345\uff0c\u5341\u6642\uff0c\u9069\u5341\u7345" +
+                       "\u9069\u5e02\u3002";
+
+        String expectedBigrams = "[\u77f3\u5ba4, \u5ba4\u8a69, \u8a69\u58eb, \u58eb\u65bd, \u65bd\u6c0f, " +
+                                 "\u55dc\u7345, \u8a93\u98df, \u98df\u5341, \u5341\u7345, \u65bd\u6c0f, " +
+                                 "\u6c0f\u6642, \u6642\u6642, \u6642\u9069, \u9069\u5e02, \u5e02\u8996, " +
+                                 "\u8996\u7345, \u5341\u6642, \u9069\u5341, \u5341\u7345, \u7345\u9069, " +
+                                 "\u9069\u5e02]";
+        assertGramSplit(input, 2, expectedBigrams);
+
+        String expectedTrigrams = "[\u77f3\u5ba4\u8a69, \u5ba4\u8a69\u58eb, \u8a69\u58eb\u65bd, \u58eb\u65bd\u6c0f, " +
+                                  "\u55dc\u7345, \u8a93\u98df\u5341, \u98df\u5341\u7345, \u65bd\u6c0f\u6642, " +
+                                  "\u6c0f\u6642\u6642, \u6642\u6642\u9069, \u6642\u9069\u5e02, \u9069\u5e02\u8996, " +
+                                  "\u5e02\u8996\u7345, \u5341\u6642, \u9069\u5341\u7345, \u5341\u7345\u9069, " +
+                                  "\u7345\u9069\u5e02]";
+        assertGramSplit(input, 3, expectedTrigrams);
+    }
+
+    /** A gram size of 0 is expected to be rejected with an IllegalArgumentException. */
+    @Test(expected = IllegalArgumentException.class)
+    public void testInvalidSplitSize() {
+        gramSplitter.split("en", 0);
+    }
+
+    /** A null input is expected to be rejected with a NullPointerException. */
+    @Test(expected = NullPointerException.class)
+    public void testInvalidSplitNull() {
+        gramSplitter.split(null, 1);
+    }
+
+    /** Calls next() without a preceding hasNext(), and hasNext() several times in a row. */
+    @Test
+    public void testUnusualIteratorUse() {
+        String text = "en gul bille sang";
+        Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 3);
+
+        assertNextGram(grams, text, "en");
+        assertTrue(grams.hasNext());
+        assertTrue(grams.hasNext());
+        assertNextGram(grams, text, "gul");
+        assertNextGram(grams, text, "bil");
+        assertNextGram(grams, text, "ill");
+        assertNextGram(grams, text, "lle");
+        assertTrue(grams.hasNext());
+        assertTrue(grams.hasNext());
+        assertNextGram(grams, text, "san");
+        assertNextGram(grams, text, "ang");
+        assertFalse(grams.hasNext());
+        assertFalse(grams.hasNext());
+    }
+
+    /** Advances the iterator by one gram and asserts the text it extracts from {@code text}. */
+    private void assertNextGram(Iterator<GramSplitter.Gram> grams, String text, String expected) {
+        assertThat(grams.next().extractFrom(text), is(expected));
+    }
+
+    /** Splits {@code input} into grams of {@code gramSize} and compares the string form of the extracted list. */
+    private void assertGramSplit(String input, int gramSize, String expected) {
+        String actual = gramSplitter.split(input, gramSize).toExtractedList().toString();
+        assertThat(actual, is(expected));
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java
new file mode 100644
index 00000000000..771487d0e71
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java
@@ -0,0 +1,35 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.simple.SimpleLinguistics;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests the normalizer supplied by {@code SimpleLinguistics}.
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public class NormalizationTestCase {
+
+    private final Normalizer normalizer = new SimpleLinguistics().getNormalizer();
+
+    /** The empty string normalizes to the empty string. */
+    @Test
+    public void testEmptyStringNormalization() {
+        assertNormalize("", "");
+    }
+
+    /** Full-width Latin letters normalize to their plain ASCII counterparts. */
+    @Test
+    public void testDoubleWidthAscii() {
+        String fullWidth = "\uff41\uff42\uff43\uff44\uff45\uff46\uff47\uff48\uff49";
+        assertNormalize(fullWidth, "abcdefghi");
+    }
+
+    /** The "fi" ligature (U+FB01) normalizes to the two letters "fi". */
+    @Test
+    public void testLigature() {
+        assertNormalize("\uFB01nance", "finance");
+    }
+
+    /** Asserts that normalizing {@code input} yields {@code exp}. */
+    private void assertNormalize(String input, String exp) {
+        String normalized = normalizer.normalize(input);
+        assertEquals(exp, normalized);
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java
new file mode 100644
index 00000000000..a70a3dc24c5
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java
@@ -0,0 +1,27 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
+
+/**
+ * Tests that ProcessingException exposes the message and cause it was constructed with.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class ProcessingExceptionTestCase {
+
+    /** The single-argument constructor stores the message. */
+    @Test
+    public void requireThatMessageCanBeSet() {
+        ProcessingException exception = new ProcessingException("foo");
+        assertEquals("foo", exception.getMessage());
+    }
+
+    /** The two-argument constructor stores the message and the very same cause instance. */
+    @Test
+    public void requireThatMessageAndCauseCanBeSet() {
+        Throwable cause = new Throwable();
+        ProcessingException exception = new ProcessingException("bar", cause);
+        assertEquals("bar", exception.getMessage());
+        assertSame(cause, exception.getCause());
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
new file mode 100644
index 00000000000..8e7e52358f9
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleTokenizer;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests SegmenterImpl on top of a SimpleTokenizer using a SimpleNormalizer.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class SegmenterImplTestCase {
+
+    private static final Segmenter SEGMENTER = new SegmenterImpl(new SimpleTokenizer(new SimpleNormalizer()));
+
+    /** Apostrophes, whitespace, periods and commas split the input and are not returned as segments. */
+    @Test
+    public void requireThatNonIndexableCharactersAreDelimiters() {
+        assertSegments("i've", Arrays.asList("i", "ve"));
+        assertSegments("foo bar. baz", Arrays.asList("foo", "bar", "baz"));
+        assertSegments("1,2, 3 4", Arrays.asList("1", "2", "3", "4"));
+    }
+
+    /** Adjacent letters and digits stay together in one segment. */
+    @Test
+    public void requireThatAdjacentIndexableTokenTypesAreNotSplit() {
+        assertSegments("a1,2b,c3,4d", Arrays.asList("a1", "2b", "c3", "4d"));
+    }
+
+    /** Segments keep the input's combining characters and upper case. */
+    @Test
+    public void requireThatSegmentationReturnsOriginalForm() {
+        assertSegments("a\u030A", Arrays.asList("a\u030A"));
+        assertSegments("FOO BAR", Arrays.asList("FOO", "BAR"));
+    }
+
+    /** Segments {@code input} as English text and compares the result to {@code expectedSegments}. */
+    private static void assertSegments(String input, List<String> expectedSegments) {
+        List<String> actualSegments = SEGMENTER.segment(input, Language.ENGLISH);
+        assertEquals(expectedSegments, actualSegments);
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java
new file mode 100644
index 00000000000..9a592781998
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java
@@ -0,0 +1,73 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import static org.junit.Assert.*;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Functional testing of StemList.
+ *
+ * @author steinar
+ */
+public class StemListTestCase {
+
+    private StemList stems;
+
+    /** Gives every test a fresh, empty list. */
+    @Before
+    public void setUp() throws Exception {
+        stems = new StemList();
+    }
+
+    /** Drops the reference to the list after each test. */
+    @After
+    public void tearDown() throws Exception {
+        stems = null;
+    }
+
+    /** Adding a stem that is already present does not grow the list. */
+    @Test
+    public void testSize() {
+        assertEquals(0, stems.size());
+        addStems("a", "b", "a");
+        assertEquals(2, stems.size());
+    }
+
+    /**
+     * Setting a slot to a stem already held elsewhere returns that stem and leaves the slot unchanged;
+     * setting it to a new stem returns the previous content and replaces it.
+     */
+    @Test
+    public void testSet() {
+        addStems("a", "b", "c", "d");
+
+        String returnedForExisting = stems.set(2, "a");
+        assertEquals("a", returnedForExisting);
+        assertEquals("c", stems.get(2));
+
+        String returnedForNew = stems.set(2, "z");
+        assertEquals("c", returnedForNew);
+        assertEquals("z", stems.get(2));
+    }
+
+    /** Only stems not yet present increase the size. */
+    @Test
+    public void testAdd() {
+        addStems("a", "b", "c", "d");
+        assertEquals(4, stems.size());
+        addStems("a");
+        assertEquals(4, stems.size());
+        addStems("z");
+        assertEquals(5, stems.size());
+    }
+
+    /** Removing by index returns the removed stem and shrinks the list by one. */
+    @Test
+    public void testremove() {
+        addStems("a", "b", "c", "d");
+        String removed = stems.remove(2);
+        assertEquals("c", removed);
+        assertEquals(3, stems.size());
+    }
+
+    /** Adds the given stems to the list under test, one at a time and in order. */
+    private void addStems(String... values) {
+        for (String value : values) {
+            stems.add(value);
+        }
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java
new file mode 100644
index 00000000000..13cd8a82e36
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java
@@ -0,0 +1,27 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests the numeric-value lookup of StemMode.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class StemModeTestCase {
+
+    /** Every mode is found again when looked up by its own value. */
+    @Test
+    @SuppressWarnings("deprecation")
+    public void requireThatValueOfWorks() {
+        StemMode[] allModes = StemMode.values();
+        for (int index = 0; index < allModes.length; index++) {
+            StemMode expected = allModes[index];
+            StemMode lookedUp = StemMode.valueOf(expected.getValue());
+            assertEquals(expected, lookedUp);
+        }
+    }
+
+    /** Looking up the value -1 yields NONE. */
+    @Test
+    @SuppressWarnings("deprecation")
+    public void requireThatValueOfUnknownIsNone() {
+        StemMode lookedUp = StemMode.valueOf(-1);
+        assertEquals(StemMode.NONE, lookedUp);
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java
new file mode 100644
index 00000000000..d81aaaafcc8
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java
@@ -0,0 +1,68 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleToken;
+import com.yahoo.language.simple.SimpleTokenizer;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.ArrayList;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests StemmerImpl, both on top of a SimpleTokenizer and on top of a mocked tokenizer.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class StemmerImplTestCase {
+
+    /** Stems are lower-cased, and a combining ring above is composed into a single character. */
+    @Test
+    public void requireThatStemIsNormalizedAndLowerCased() {
+        assertStem("FOO", Arrays.asList("foo"));
+        assertStem("a\u030A", Arrays.asList("\u00E5"));
+    }
+
+    /** Punctuation and whitespace in the input produce no stems. */
+    @Test
+    public void requireThatOnlyIndexableTokensAreReturned() {
+        assertStem("foo. (bar)!", Arrays.asList("foo", "bar"));
+    }
+
+    /**
+     * A token with components is stemmed component by component unless it is flagged as a special token,
+     * in which case it is stemmed as a whole.
+     */
+    @Test
+    public void requireThatSpecialTokensAreNotDecompounded() {
+        SimpleToken token = alphabeticToken("c++").addComponent(alphabeticToken("c"))
+                                                  .addComponent(alphabeticToken("p"))
+                                                  .addComponent(alphabeticToken("p"));
+
+        // The mocked tokenizer hands back the token built above, whatever it is asked to tokenize.
+        Tokenizer tokenizer = Mockito.mock(Tokenizer.class);
+        List<Token> tokens = Arrays.<Token>asList(token);
+        Mockito.when(tokenizer.tokenize(Mockito.anyString(), Mockito.<Language>any(), Mockito.<StemMode>any(),
+                                        Mockito.anyBoolean()))
+               .thenReturn(tokens);
+        Stemmer stemmer = new StemmerImpl(tokenizer);
+
+        token.setSpecialToken(false);
+        List<StemList> expectedComponents = Arrays.asList(new StemList("c"),
+                                                          new StemList("p"),
+                                                          new StemList("p"));
+        assertEquals(expectedComponents, stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));
+
+        token.setSpecialToken(true);
+        List<StemList> expectedWhole = Arrays.asList(new StemList("c++"));
+        assertEquals(expectedWhole, stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));
+    }
+
+    /** Creates an ALPHABETIC token whose original text and token string are both {@code text}. */
+    private static SimpleToken alphabeticToken(String text) {
+        return new SimpleToken(text).setType(TokenType.ALPHABETIC)
+                                    .setTokenString(text);
+    }
+
+    /** Stems {@code input} as English with StemMode.ALL and compares the first stem of each word. */
+    private static void assertStem(String input, List<String> expectedStems) {
+        Stemmer stemmer = new StemmerImpl(new SimpleTokenizer(new SimpleNormalizer()));
+        List<String> firstStems = new ArrayList<>();
+        for (StemList stemsOfWord : stemmer.stem(input, StemMode.ALL, Language.ENGLISH)) {
+            firstStems.add(stemsOfWord.get(0));
+        }
+        assertEquals(expectedStems, firstStems);
+    }
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
new file mode 100644
index 00000000000..1a92f5a750e
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests the numeric-value lookup and the indexable flag of TokenType.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+public class TokenTypeTestCase {
+
+    /** Every type is found again when looked up by its own value. */
+    @Test
+    @SuppressWarnings("deprecation")
+    public void requireThatValueOfWorks() {
+        TokenType[] allTypes = TokenType.values();
+        for (int index = 0; index < allTypes.length; index++) {
+            TokenType expected = allTypes[index];
+            TokenType lookedUp = TokenType.valueOf(expected.getValue());
+            assertEquals(expected, lookedUp);
+        }
+    }
+
+    /** Looking up the value -1 yields UNKNOWN. */
+    @Test
+    @SuppressWarnings("deprecation")
+    public void requireThatValueOfUnknownIsUnknown() {
+        TokenType lookedUp = TokenType.valueOf(-1);
+        assertEquals(TokenType.UNKNOWN, lookedUp);
+    }
+
+    /** ALPHABETIC and NUMERIC are indexable; every other type is not. */
+    @Test
+    public void requireThatOnlyAlphaNumericsAreIndexable() {
+        for (TokenType candidate : TokenType.values()) {
+            boolean alphaNumeric = candidate == TokenType.ALPHABETIC || candidate == TokenType.NUMERIC;
+            if (!alphaNumeric) {
+                assertFalse(candidate.isIndexable());
+            } else {
+                assertTrue(candidate.isIndexable());
+            }
+        }
+    }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
new file mode 100644
index 00000000000..6506b41fc79
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java
@@ -0,0 +1,233 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.process;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.simple.SimpleTokenizer;
+import org.junit.Test;
+
+import java.util.*;
+
+import static com.yahoo.language.LinguisticsCase.toLowerCase;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.*;
+
+/**
+ * Test of tokenization, with stemming and accent removal
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ */
+public class TokenizationTestCase {
+
+    private final Tokenizer tokenizer = new SimpleTokenizer();
+
+    /** Checks both the lower-cased indexable tokens and the original form of every token. */
+    @Test
+    public void testTokenizer() {
+        assertTokenize("This is a test, 123",
+                       Arrays.asList("this", "is", "a", "test", "123"),
+                       Arrays.asList("This", " ", "is", " ", "a", " ", "test", ",", " ", "123"));
+    }
+
+    /** An underscore separates indexable tokens. */
+    @Test
+    public void testUnderScoreTokenization() {
+        assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, Arrays.asList("ugcapi", "1"), null);
+    }
+
+    /** Underscore, period, slash and hyphen all separate indexable tokens. */
+    @Test
+    public void testPhrasesWithPunctuation() {
+        assertTokenize("PHY_101.html a space/time or space-time course", Language.ENGLISH, StemMode.NONE,
+                       false,
+                       Arrays.asList("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"),
+                       null);
+        assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("phy", "101"), null);
+        assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("101", "3"), null);
+    }
+
+    /** Full-width Latin letters yield the corresponding lower-cased ASCII token strings. */
+    @Test
+    public void testDoubleWidthTokenization() {
+        // "sony"
+        assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false,
+                       Arrays.asList("sony"), null);
+        assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false,
+                       Arrays.asList("sony"), null);
+        // "SONY"
+        assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false,
+                       Arrays.asList("sony"), null);
+        assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false,
+                       Arrays.asList("sony"), null);
+        // "on"
+        assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.NONE, false,
+                       Arrays.asList("on"), null);
+        assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false,
+                       Arrays.asList("on"), null);
+        // "ON"
+        assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false,
+                       Arrays.asList("on"), null);
+        assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false,
+                       Arrays.asList("on"), null);
+    }
+
+    /** Tokenizes 100000 repetitions of a word plus a space and samples the offset of every 100th token. */
+    @Test
+    public void testLargeTextTokenization() {
+        StringBuilder sb = new StringBuilder();
+        String s = "teststring ";
+        for (int i = 0; i < 100000; i++) {
+            sb.append(s);
+        }
+
+        String input = sb.toString();
+
+        int numTokens = 0;
+        List<Long> pos = new ArrayList<>();
+        for (Token t : tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false)) {
+            numTokens++;
+            if ((numTokens % 100) == 0) {
+                pos.add(t.getOffset());
+            }
+        }
+
+        // JUnit takes the expected value before the actual one; the failure message depends on that order.
+        assertEquals("Check that all tokens have been tokenized", 200000, numTokens);
+        assertTrue("Increasing token pos", assertMonoIncr(pos));
+    }
+
+    /** A single 65536-character word comes back as exactly one token. */
+    @Test
+    public void testLargeTokenGuard() {
+        StringBuilder str = new StringBuilder();
+        for (int i = 0; i < 128 * 256; i++) {
+            str.append("ab");
+        }
+        Iterator<Token> it = tokenizer.tokenize(str.toString(), Language.ENGLISH, StemMode.NONE, false).iterator();
+        assertTrue(it.hasNext());
+        assertNotNull(it.next().getTokenString());
+        assertFalse(it.hasNext());
+    }
+
+    /** Exercises the token iterator on empty input and on a three-word input. */
+    @Test
+    public void testTokenIterator() {
+        Iterator<Token> it = tokenizer.tokenize("", Language.ENGLISH, StemMode.NONE, false).iterator();
+        assertFalse(it.hasNext());
+        try {
+            it.next();
+            fail("Expected NoSuchElementException from next() on an exhausted iterator");
+        } catch (NoSuchElementException expected) {
+            // success
+        }
+
+        it = tokenizer.tokenize("", Language.ENGLISH, StemMode.NONE, false).iterator();
+        assertFalse(it.hasNext());
+
+        // Three words and two separating spaces: five tokens in total.
+        it = tokenizer.tokenize("one two three", Language.ENGLISH, StemMode.NONE, false).iterator();
+        assertNotNull(it.next());
+        assertNotNull(it.next());
+        assertNotNull(it.next());
+        assertNotNull(it.next());
+        assertNotNull(it.next());
+        assertFalse(it.hasNext());
+    }
+
+    /** Checks the offset and original length of every token of a German sentence. */
+    @Test
+    public void testGetOffsetLength() {
+        String input = "Deka-Chef Weber r\u00e4umt Kommunikationsfehler ein";
+        long[] expOffset = { 0, 4, 5, 9, 10, 15, 16, 21, 22, 42, 43 };
+        int[] len = { 4, 1, 4, 1, 5, 1, 5, 1, 20, 1, 3 };
+
+        int idx = 0;
+        for (Token token : tokenizer.tokenize(input, Language.GERMAN, StemMode.SHORTEST, false)) {
+            assertThat("Token offset for token #" + idx, token.getOffset(), is(expOffset[idx]));
+            assertThat("Token len for token #" + idx, token.getOrig().length(), is(len[idx]));
+            idx++;
+        }
+        // Without this check the test would pass if the tokenizer produced too few tokens.
+        assertEquals("Number of tokens", expOffset.length, idx);
+    }
+
+    /** Walks the full component tree of every token produced for a short symbol-heavy input. */
+    @Test
+    public void testRecursiveDecompose() {
+        for (Token t : tokenizer.tokenize("\u00a510%", Language.ENGLISH, StemMode.SHORTEST, false)) {
+            recurseDecompose(t);
+        }
+    }
+
+    /** A token with an empty token string must not be indexable, for every combination of settings tried. */
+    @Test
+    public void testIndexability() {
+        String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
+        for (StemMode stemMode : new StemMode[] { StemMode.NONE,
+                                                  StemMode.SHORTEST }) {
+            for (Language l : new Language[] { Language.INDONESIAN,
+                                               Language.ENGLISH, Language.ARABIC }) {
+                for (boolean accentDrop : new boolean[] { true, false }) {
+                    for (Token token : tokenizer.tokenize(input,
+                                                          l, stemMode, accentDrop)) {
+                        if (token.getTokenString().length() == 0) {
+                            assertFalse(token.isIndexable());
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /** Asserts a non-negative offset and a readable original form for the token and all its descendants. */
+    private void recurseDecompose(Token t) {
+        assertTrue(t.getOffset() >= 0);
+        // A length is never negative; this call fails only if getOrig() returns null.
+        assertTrue(t.getOrig().length() >= 0);
+
+        int numComp = t.getNumComponents();
+        for (int i = 0; i < numComp; i++) {
+            Token comp = t.getComponent(i);
+            recurseDecompose(comp);
+        }
+    }
+
+    /**
+     * Returns whether the values never decrease from one element to the next (equal neighbours are accepted).
+     * Despite its name this method asserts nothing itself; the caller must assert on the returned value.
+     */
+    private boolean assertMonoIncr(Iterable<Long> n) {
+        long trailing = -1;
+        for (long i : n) {
+            if (i < trailing) {
+                return false;
+            }
+            trailing = i;
+        }
+        return true;
+    }
+
+    /** Shorthand for English, no stemming and no accent dropping. */
+    private void assertTokenize(String input, List<String> indexed, List<String> orig) {
+        assertTokenize(input, Language.ENGLISH, StemMode.NONE, false, indexed, orig);
+    }
+
+    /**
+     * <p>Compare the results of running an input string through the tokenizer with an "index" truth, and an optional
+     * "orig" truth.</p>
+     *
+     * <p>For a token with components, the indexable components are compared instead of the token itself. Token
+     * strings are lower-cased before they are compared.</p>
+     *
+     * @param input      The text to process, passed to tokenizer.
+     * @param language   The language tag, passed to tokenizer.
+     * @param stemMode   The stem mode, passed to tokenizer.
+     * @param accentDrop Passed to the tokenizer.
+     * @param indexed    Compared to the "TokenString" result from the tokenizer, for indexable tokens only.
+     * @param orig       Compared to the "Orig" result from the tokenizer; skipped when null.
+     */
+    private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop,
+                                List<String> indexed, List<String> orig) {
+        int i = 0;
+        int j = 0;
+        for (Token token : tokenizer.tokenize(input, language, stemMode, accentDrop)) {
+            if (token.getNumComponents() > 0) {
+                for (int comp = 0; comp < token.getNumComponents(); comp++) {
+                    Token t = token.getComponent(comp);
+                    if (t.getType().isIndexable()) {
+                        assertThat("comp index: " + i, toLowerCase(t.getTokenString()), is(indexed.get(i++)));
+                    }
+                }
+            } else {
+                if (token.getType().isIndexable()) {
+                    assertThat("exp index: " + i, toLowerCase(token.getTokenString()), is(indexed.get(i++)));
+                }
+            }
+            if (orig != null) {
+                assertThat("orig index: " + j, token.getOrig(), is(orig.get(j++)));
+            }
+        }
+        assertThat("indexed length", i, is(indexed.size()));
+        if (orig != null) {
+            assertThat("orig length", j, is(orig.size()));
+        }
+    }
+
+}