diff options
Diffstat (limited to 'linguistics/src/test')
20 files changed, 1485 insertions, 0 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java b/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java new file mode 100644 index 00000000000..c99c4009c4c --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/LanguageTestCase.java @@ -0,0 +1,107 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +/** + * @author Rich Pito + */ +public class LanguageTestCase { + + @Test + public void requireThatSpecificLanguagesAreCjk() { + List<Language> cjk = Arrays.asList(Language.CHINESE_SIMPLIFIED, + Language.CHINESE_TRADITIONAL, + Language.JAPANESE, + Language.KOREAN, + Language.THAI); + for (Language language : cjk) { + assertTrue(language.toString(), language.isCjk()); + } + for (Language language : Language.values()) { + if (cjk.contains(language)) { + continue; + } + assertFalse(language.toString(), language.isCjk()); + } + } + + @Test + public void requireThatLanguageTagsAreRecognized() { + assertLanguage(Language.ARABIC, "ar"); + assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-hans"); + assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-Hans"); + assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-foo-CN"); + assertLanguage(Language.CHINESE_SIMPLIFIED, "zh-CN"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-foo"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-hant"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-Hant"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-Hant-TW"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-Hant-HK"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-foo-TW"); + assertLanguage(Language.CHINESE_TRADITIONAL, "zh-TW"); + assertLanguage(Language.CROATIAN, "hr"); + assertLanguage(Language.DANISH, "da"); + 
assertLanguage(Language.DUTCH, "nl"); + assertLanguage(Language.ENGLISH, "en"); + assertLanguage(Language.ENGLISH, "en-CA"); + assertLanguage(Language.ENGLISH, "en-GB"); + assertLanguage(Language.ENGLISH, "en-US"); + assertLanguage(Language.ENGLISH, "en-Latn-i-oed-1992"); + assertLanguage(Language.FINNISH, "fi"); + assertLanguage(Language.FRENCH, "fr"); + assertLanguage(Language.FRENCH, "fr-FR"); + assertLanguage(Language.GERMAN, "de"); + assertLanguage(Language.GERMAN, "de-DE"); + assertLanguage(Language.GREEK, "el"); + assertLanguage(Language.ITALIAN, "it"); + assertLanguage(Language.ITALIAN, "it-IT"); + assertLanguage(Language.JAPANESE, "ja"); + assertLanguage(Language.KOREAN, "ko"); + assertLanguage(Language.NORWEGIAN_BOKMAL, "no"); + assertLanguage(Language.NORWEGIAN_BOKMAL, "nb"); + assertLanguage(Language.POLISH, "pl"); + assertLanguage(Language.PORTUGUESE, "pt"); + assertLanguage(Language.ROMANIAN, "ro"); + assertLanguage(Language.RUSSIAN, "ru"); + assertLanguage(Language.SPANISH, "es"); + assertLanguage(Language.SPANISH, "es-ES"); + assertLanguage(Language.SPANISH, "es-419"); + assertLanguage(Language.SWEDISH, "sv"); + assertLanguage(Language.THAI, "th"); + assertLanguage(Language.TURKISH, "tr"); + assertLanguage(Language.VIETNAMESE, "vi"); + + assertLanguage(Language.UNKNOWN, null); + assertLanguage(Language.UNKNOWN, ""); + assertLanguage(Language.UNKNOWN, "und"); + assertLanguage(Language.UNKNOWN, "z-foo"); + assertLanguage(Language.UNKNOWN, "ojeroierhoiherohjdadsfodsfoifiopeoipefwoipfwe"); + assertLanguage(Language.UNKNOWN, "#$_^@#$_@%#$)%@$%^--@&&&#-%^_^%"); + } + + @Test + public void requireThatLanguageIsGuessedCorrectlyFromEncodings() { + assertSame(Language.UNKNOWN, Language.fromEncoding(null)); + assertSame(Language.UNKNOWN, Language.fromEncoding("lkij")); + assertSame(Language.UNKNOWN, Language.fromEncoding("(/)(###)")); + + assertSame(Language.CHINESE_SIMPLIFIED, Language.fromEncoding("GB2312")); + assertSame(Language.CHINESE_TRADITIONAL, 
Language.fromEncoding("BIG5")); + assertSame(Language.JAPANESE, Language.fromEncoding("EUC-jp")); + assertSame(Language.JAPANESE, Language.fromEncoding("ISO-2022-jp")); + assertSame(Language.JAPANESE, Language.fromEncoding("Shift-JIS")); + assertSame(Language.KOREAN, Language.fromEncoding("EUC-kr")); + } + + private static void assertLanguage(Language expected, String str) { + assertSame(expected, Language.fromLanguageTag(str)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java b/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java new file mode 100644 index 00000000000..910627584ce --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/LocaleFactoryTestCase.java @@ -0,0 +1,52 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language; + +import org.junit.Test; + +import java.util.Locale; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + */ +public class LocaleFactoryTestCase { + + @Test + public void requireThatLocaleCanBeCreatedFromLanguageTag() { + assertLocale("zh", "zh", "", ""); + assertLocale("zh-CN", "zh", "", "CN"); + assertLocale("zh-foo-CN", "zh", "", "CN"); + assertLocale("zh-Hans", "zh", "Hans", ""); + assertLocale("zh-TW", "zh", "", "TW"); + assertLocale("zh-foo-TW", "zh", "", "TW"); + assertLocale("zh-Hant", "zh", "Hant", ""); + assertLocale("ja", "ja", "", ""); + assertLocale("ko", "ko", "", ""); + assertLocale("en", "en", "", ""); + assertLocale("en-NO", "en", "", "NO"); + assertLocale("de", "de", "", ""); + assertLocale("es", "es", "", ""); + assertLocale("es-419", "es", "", "419"); + + try { + LocaleFactory.fromLanguageTag(null); + fail(); + } catch (NullPointerException e) { + + } + + assertLocale("", "", "", ""); + assertLocale("z-foo", "", "", ""); + 
assertLocale("ojeroierhoiherohjdadsfodsfoifiopeoipefwoipfwe", "", "", ""); + } + + private static void assertLocale(String tag, String language, String variant, String country) { + Locale locale = LocaleFactory.fromLanguageTag(tag); + assertEquals(language, locale.getLanguage()); + assertEquals(country, locale.getCountry()); + assertEquals(variant, locale.getVariant()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java new file mode 100644 index 00000000000..aa8102fe9f2 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/detect/AbstractDetectorTestCase.java @@ -0,0 +1,61 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.detect; + +import com.yahoo.language.Language; +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.nio.charset.Charset; + +import static org.junit.Assert.*; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class AbstractDetectorTestCase { + + private static final Detection DETECTION = new Detection(Language.ARABIC, "encoding", true); + private static final Charset UTF8 = Charset.forName("UTF-8"); + + @Test + public void requireThatDetectStringForwardsUtf8Bytes() { + Hint hint = Hint.newCountryHint("no"); + MyDetector detector = new MyDetector(); + Detection detection = detector.detect("69", hint); + assertSame(DETECTION, detection); + assertArrayEquals("69".getBytes(UTF8), detector.input); + assertEquals(0, detector.offset); + assertEquals(2, detector.length); + assertSame(hint, detector.hint); + } + + @Test + public void requireThatDetectByteBufferForwardsUtf8Bytes() { + byte[] buf = new byte[] { 6, 9 }; + Hint hint = Hint.newCountryHint("no"); + MyDetector detector = new MyDetector(); + Detection detection = detector.detect(ByteBuffer.wrap(buf), 
hint); + assertSame(DETECTION, detection); + assertArrayEquals(buf, detector.input); + assertEquals(0, detector.offset); + assertEquals(2, detector.length); + assertSame(hint, detector.hint); + } + + private static class MyDetector extends AbstractDetector { + + byte[] input; + int offset; + int length; + Hint hint; + + @Override + public Detection detect(byte[] input, int offset, int length, Hint hint) { + this.input = input; + this.offset = offset; + this.length = length; + this.hint = hint; + return DETECTION; + } + } +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java new file mode 100644 index 00000000000..3cb82572976 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/AbstractTokenizerTestCase.java @@ -0,0 +1,66 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public abstract class AbstractTokenizerTestCase { + + private boolean accentDrop = false; + private Language language = Language.ENGLISH; + private Linguistics linguistics; + private StemMode stemMode = StemMode.NONE; + + public void assertTokenStrings(String input, List<String> expectedTokenStrings) { + List<String> actual = new ArrayList<>(); + for (Token token : tokenize(input)) { + findTokenStrings(token, actual); + } + assertEquals(expectedTokenStrings, actual); + } + + public List<String> findTokenStrings(Token token, List<String> out) { + int numComponents = token.getNumComponents(); + if (token.isSpecialToken() || numComponents == 0) { + out.add(token.getTokenString()); + } 
else { + for (int i = 0; i < numComponents; ++i) { + findTokenStrings(token.getComponent(i), out); + } + } + return out; + } + + public Iterable<Token> tokenize(String input) { + return linguistics.getTokenizer().tokenize(input, language, stemMode, accentDrop); + } + + public AbstractTokenizerTestCase setAccentDrop(boolean accentDrop) { + this.accentDrop = accentDrop; + return this; + } + + public AbstractTokenizerTestCase setLanguage(Language language) { + this.language = language; + return this; + } + + public AbstractTokenizerTestCase setLinguistics(Linguistics linguistics) { + this.linguistics = linguistics; + return this; + } + + public AbstractTokenizerTestCase setStemMode(StemMode stemMode) { + this.stemMode = stemMode; + return this; + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java new file mode 100644 index 00000000000..8233ef1b8f0 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java @@ -0,0 +1,150 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +import com.yahoo.language.simple.SimpleLinguistics; +import org.junit.Test; + +import java.util.Iterator; + +import static org.hamcrest.CoreMatchers.is; +import static org.junit.Assert.*; + +/** + * @author bratseth + */ +public class GramSplitterTestCase { + + private static final GramSplitter gramSplitter = new SimpleLinguistics().getGramSplitter(); + + @Test + public void testNoSpaces() { + // no spaces + assertGramSplit("engulbillesang", 1, "[e, n, g, u, l, b, i, l, l, e, s, a, n, g]"); + assertGramSplit("engulbillesang", 2, "[en, ng, gu, ul, lb, bi, il, ll, le, es, sa, an, ng]"); + assertGramSplit("engulbillesang", 3, "[eng, ngu, gul, ulb, lbi, bil, ill, lle, les, esa, san, ang]"); + } + + @Test + public void testWithSpaces() { + // with spaces + assertGramSplit("en gul bille sang", 1, "[e, n, g, u, l, b, i, l, l, e, s, a, n, g]"); + assertGramSplit("en gul bille sang", 2, "[en, gu, ul, bi, il, ll, le, sa, an, ng]"); + assertGramSplit("en gul bille sang", 3, "[en, gul, bil, ill, lle, san, ang]"); + } + + @Test + public void testCornerCases() { + // corner cases + assertGramSplit("", 1, "[]"); + assertGramSplit("", 2, "[]"); + assertGramSplit("e", 1, "[e]"); + assertGramSplit("e", 2, "[e]"); + assertGramSplit("en", 1, "[e, n]"); + assertGramSplit("en", 2, "[en]"); + assertGramSplit("en", 3, "[en]"); + } + + @Test + public void testSpaceCornerCases() { + // space corner cases + assertGramSplit("e en e", 1, "[e, e, n, e]"); + assertGramSplit("e en e", 2, "[e, en, e]"); + assertGramSplit("e en e", 3, "[e, en, e]"); + assertGramSplit(" e en e ", 1, "[e, e, n, e]"); + assertGramSplit(" e en e ", 2, "[e, en, e]"); + assertGramSplit(" e en e ", 3, "[e, en, e]"); + assertGramSplit(" e en e ", 1, "[e, e, n, e]"); + assertGramSplit(" e en e ", 2, "[e, en, e]"); + assertGramSplit(" e en e ", 3, "[e, en, e]"); + assertGramSplit("a b c", 4, "[a, b, c]"); + } + + @Test + public void testWithCasing() { + assertGramSplit("This is the 
Black Eyed Peas", 2, + "[Th, hi, is, is, th, he, Bl, la, ac, ck, Ey, ye, ed, Pe, ea, as]"); + assertGramSplit("This is the Black Eyed Peas", 3, + "[Thi, his, is, the, Bla, lac, ack, Eye, yed, Pea, eas]"); + assertGramSplit("This is the Black Eyed Peas", 4, + "[This, is, the, Blac, lack, Eyed, Peas]"); + assertGramSplit("This is the Black Eyed Peas", 5, + "[This, is, the, Black, Eyed, Peas]"); + assertGramSplit("This is the Black Eyed Peas", 6, + "[This, is, the, Black, Eyed, Peas]"); + } + + @Test + public void testWithPunctuation() { + assertGramSplit("this is, in a sense, more than the sum of parts!", 2, + "[th, hi, is, is, in, a, se, en, ns, se, mo, or, re, th, ha, an, th, he, su, um, of, pa, ar, rt, ts]"); + assertGramSplit("this is, in a sense, more than the sum of parts!", 3, + "[thi, his, is, in, a, sen, ens, nse, mor, ore, tha, han, the, sum, of, par, art, rts]"); + assertGramSplit("this is, in a sense, more than the sum of parts!", 4, + "[this, is, in, a, sens, ense, more, than, the, sum, of, part, arts]"); + assertGramSplit("this is, in a sense, more than the sum of parts!", 5, + "[this, is, in, a, sense, more, than, the, sum, of, parts]"); + assertGramSplit("this is, in a sense, more than the sum of parts!", 6, + "[this, is, in, a, sense, more, than, the, sum, of, parts]"); + } + + @Test + public void testAccents() { + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 2, "[ca, af, f\u00e9, de, l, h\u00f4, \u00f4t, te, el]"); + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 3, "[caf, af\u00e9, de, l, h\u00f4t, \u00f4te, tel]"); + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 4, "[caf\u00e9, de, l, h\u00f4te, \u00f4tel]"); + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 5, "[caf\u00e9, de, l, h\u00f4tel]"); + assertGramSplit("caf\u00e9 de l'h\u00f4tel", 6, "[caf\u00e9, de, l, h\u00f4tel]"); + } + + @Test + public void testChinese() { + String input = "\u77f3\u5ba4\u8a69\u58eb\u65bd\u6c0f\uff0c\u55dc\u7345\uff0c\u8a93\u98df\u5341\u7345\u3002" + + 
"\u65bd\u6c0f\u6642\u6642\u9069\u5e02\u8996\u7345\uff0c\u5341\u6642\uff0c\u9069\u5341\u7345" + + "\u9069\u5e02\u3002"; + assertGramSplit(input, 2, "[\u77f3\u5ba4, \u5ba4\u8a69, \u8a69\u58eb, \u58eb\u65bd, \u65bd\u6c0f, " + + "\u55dc\u7345, \u8a93\u98df, \u98df\u5341, \u5341\u7345, \u65bd\u6c0f, " + + "\u6c0f\u6642, \u6642\u6642, \u6642\u9069, \u9069\u5e02, \u5e02\u8996, " + + "\u8996\u7345, \u5341\u6642, \u9069\u5341, \u5341\u7345, \u7345\u9069, " + + "\u9069\u5e02]"); + assertGramSplit(input, 3, "[\u77f3\u5ba4\u8a69, \u5ba4\u8a69\u58eb, \u8a69\u58eb\u65bd, \u58eb\u65bd\u6c0f, " + + "\u55dc\u7345, \u8a93\u98df\u5341, \u98df\u5341\u7345, \u65bd\u6c0f\u6642, " + + "\u6c0f\u6642\u6642, \u6642\u6642\u9069, \u6642\u9069\u5e02, \u9069\u5e02\u8996, " + + "\u5e02\u8996\u7345, \u5341\u6642, \u9069\u5341\u7345, \u5341\u7345\u9069, " + + "\u7345\u9069\u5e02]"); + } + + @Test(expected = IllegalArgumentException.class) + public void testInvalidSplitSize() { + gramSplitter.split("en", 0); + } + + @Test(expected = NullPointerException.class) + public void testInvalidSplitNull() { + gramSplitter.split(null, 1); + } + + @Test + public void testUnusualIteratorUse() { + String text = "en gul bille sang"; + Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 3); + + assertThat(grams.next().extractFrom(text), is("en")); + assertTrue(grams.hasNext()); + assertTrue(grams.hasNext()); + assertThat(grams.next().extractFrom(text), is("gul")); + assertThat(grams.next().extractFrom(text), is("bil")); + assertThat(grams.next().extractFrom(text), is("ill")); + assertThat(grams.next().extractFrom(text), is("lle")); + assertTrue(grams.hasNext()); + assertTrue(grams.hasNext()); + assertThat(grams.next().extractFrom(text), is("san")); + assertThat(grams.next().extractFrom(text), is("ang")); + assertFalse(grams.hasNext()); + assertFalse(grams.hasNext()); + } + + private void assertGramSplit(String input, int gramSize, String expected) { + assertThat(gramSplitter.split(input, 
gramSize).toExtractedList().toString(), is(expected)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java new file mode 100644 index 00000000000..771487d0e71 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/NormalizationTestCase.java @@ -0,0 +1,35 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.simple.SimpleLinguistics; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + */ +public class NormalizationTestCase { + + private final Normalizer normalizer = new SimpleLinguistics().getNormalizer(); + + @Test + public void testEmptyStringNormalization() { + assertEquals("", normalizer.normalize("")); + } + + @Test + public void testDoubleWidthAscii() { + assertNormalize("\uff41\uff42\uff43\uff44\uff45\uff46\uff47\uff48\uff49", "abcdefghi"); + } + + @Test + public void testLigature() { + assertNormalize("\uFB01nance", "finance"); + } + + private void assertNormalize(String input, String exp) { + assertEquals(exp, normalizer.normalize(input)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java new file mode 100644 index 00000000000..a70a3dc24c5 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/ProcessingExceptionTestCase.java @@ -0,0 +1,27 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertSame; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class ProcessingExceptionTestCase { + + @Test + public void requireThatMessageCanBeSet() { + assertEquals("foo", new ProcessingException("foo").getMessage()); + } + + @Test + public void requireThatMessageAndCauseCanBeSet() { + Throwable t = new Throwable(); + ProcessingException e = new ProcessingException("bar", t); + assertEquals("bar", e.getMessage()); + assertSame(t, e.getCause()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java new file mode 100644 index 00000000000..8e7e52358f9 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/SegmenterImplTestCase.java @@ -0,0 +1,43 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleTokenizer; +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class SegmenterImplTestCase { + + private final static Segmenter SEGMENTER = new SegmenterImpl(new SimpleTokenizer(new SimpleNormalizer())); + + @Test + public void requireThatNonIndexableCharactersAreDelimiters() { + assertSegments("i've", Arrays.asList("i", "ve")); + assertSegments("foo bar. 
baz", Arrays.asList("foo", "bar", "baz")); + assertSegments("1,2, 3 4", Arrays.asList("1", "2", "3", "4")); + } + + @Test + public void requireThatAdjacentIndexableTokenTypesAreNotSplit() { + assertSegments("a1,2b,c3,4d", Arrays.asList("a1", "2b", "c3", "4d")); + } + + @Test + public void requireThatSegmentationReturnsOriginalForm() { + assertSegments("a\u030A", Arrays.asList("a\u030A")); + assertSegments("FOO BAR", Arrays.asList("FOO", "BAR")); + } + + private static void assertSegments(String input, List<String> expectedSegments) { + assertEquals(expectedSegments, SEGMENTER.segment(input, Language.ENGLISH)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java new file mode 100644 index 00000000000..9a592781998 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/StemListTestCase.java @@ -0,0 +1,73 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import static org.junit.Assert.*; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Functional testing of StemList. 
+ * + * @author steinar + */ +public class StemListTestCase { + + private StemList stems; + + @Before + public void setUp() throws Exception { + stems = new StemList(); + } + + @After + public void tearDown() throws Exception { + stems = null; + } + + @Test + public void testSize() { + assertEquals(0, stems.size()); + stems.add("a"); + stems.add("b"); + stems.add("a"); + assertEquals(2, stems.size()); + } + + @Test + public void testSet() { + stems.add("a"); + stems.add("b"); + stems.add("c"); + stems.add("d"); + assertEquals("a", stems.set(2, "a")); + assertEquals("c", stems.get(2)); + assertEquals("c", stems.set(2, "z")); + assertEquals("z", stems.get(2)); + } + + @Test + public void testAdd() { + stems.add("a"); + stems.add("b"); + stems.add("c"); + stems.add("d"); + assertEquals(4, stems.size()); + stems.add("a"); + assertEquals(4, stems.size()); + stems.add("z"); + assertEquals(5, stems.size()); + } + + @Test + public void testremove() { + stems.add("a"); + stems.add("b"); + stems.add("c"); + stems.add("d"); + assertEquals("c", stems.remove(2)); + assertEquals(3, stems.size()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java new file mode 100644 index 00000000000..13cd8a82e36 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/StemModeTestCase.java @@ -0,0 +1,27 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class StemModeTestCase { + + @Test + @SuppressWarnings("deprecation") + public void requireThatValueOfWorks() { + for (StemMode mode : StemMode.values()) { + assertEquals(mode, StemMode.valueOf(mode.getValue())); + } + } + + @Test + @SuppressWarnings("deprecation") + public void requireThatValueOfUnknownIsNone() { + assertEquals(StemMode.NONE, StemMode.valueOf(-1)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java new file mode 100644 index 00000000000..d81aaaafcc8 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java @@ -0,0 +1,68 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleToken; +import com.yahoo.language.simple.SimpleTokenizer; +import org.junit.Test; +import org.mockito.Mockito; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; + +import static org.junit.Assert.assertEquals; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class StemmerImplTestCase { + + @Test + public void requireThatStemIsNormalizedAndLowerCased() { + assertStem("FOO", Arrays.asList("foo")); + assertStem("a\u030A", Arrays.asList("\u00E5")); + } + + @Test + public void requireThatOnlyIndexableTokensAreReturned() { + assertStem("foo. 
(bar)!", Arrays.asList("foo", "bar")); + } + + @Test + public void requireThatSpecialTokensAreNotDecompounded() { + SimpleToken token = new SimpleToken("c++").setType(TokenType.ALPHABETIC) + .setTokenString("c++") + .addComponent(new SimpleToken("c").setType(TokenType.ALPHABETIC) + .setTokenString("c")) + .addComponent(new SimpleToken("p").setType(TokenType.ALPHABETIC) + .setTokenString("p")) + .addComponent(new SimpleToken("p").setType(TokenType.ALPHABETIC) + .setTokenString("p")); + Tokenizer tokenizer = Mockito.mock(Tokenizer.class); + Mockito.when(tokenizer.tokenize(Mockito.anyString(), Mockito.<Language>any(), Mockito.<StemMode>any(), + Mockito.anyBoolean())) + .thenReturn(Arrays.<Token>asList(token)); + Stemmer stemmer = new StemmerImpl(tokenizer); + + token.setSpecialToken(false); + assertEquals(Arrays.asList(new StemList("c"), + new StemList("p"), + new StemList("p")), + stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH)); + + token.setSpecialToken(true); + assertEquals(Arrays.asList(new StemList("c++")), + stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH)); + } + + private static void assertStem(String input, List<String> expectedStems) { + Stemmer stemmer = new StemmerImpl(new SimpleTokenizer(new SimpleNormalizer())); + List<String> got = new ArrayList<>(); + for (StemList word : stemmer.stem(input, StemMode.ALL, Language.ENGLISH)) { + got.add(word.get(0)); + } + assertEquals(expectedStems, got); + } +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java new file mode 100644 index 00000000000..1a92f5a750e --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java @@ -0,0 +1,38 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.process; + +import org.junit.Test; + +import static org.junit.Assert.*; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class TokenTypeTestCase { + + @Test + @SuppressWarnings("deprecation") + public void requireThatValueOfWorks() { + for (TokenType type : TokenType.values()) { + assertEquals(type, TokenType.valueOf(type.getValue())); + } + } + + @Test + @SuppressWarnings("deprecation") + public void requireThatValueOfUnknownIsUnknown() { + assertEquals(TokenType.UNKNOWN, TokenType.valueOf(-1)); + } + + @Test + public void requireThatOnlyAlphaNumericsAreIndexable() { + for (TokenType type : TokenType.values()) { + if (type == TokenType.ALPHABETIC || type == TokenType.NUMERIC) { + assertTrue(type.isIndexable()); + } else { + assertFalse(type.isIndexable()); + } + } + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java new file mode 100644 index 00000000000..6506b41fc79 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java @@ -0,0 +1,233 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.language.process;

import com.yahoo.language.Language;
import com.yahoo.language.simple.SimpleTokenizer;
import org.junit.Test;

import java.util.*;

import static com.yahoo.language.LinguisticsCase.toLowerCase;
import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.*;

/**
 * Test of tokenization, with stemming and accent removal
 *
 * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
 */
public class TokenizationTestCase {

    private final Tokenizer tokenizer = new SimpleTokenizer();

    /** Checks both the lower-cased indexable tokens and the original token strings of a plain sentence. */
    @Test
    public void testTokenizer() {
        assertTokenize("This is a test, 123",
                       Arrays.asList("this", "is", "a", "test", "123"),
                       Arrays.asList("This", " ", "is", " ", "a", " ", "test", ",", " ", "123"));
    }

    /** An underscore is expected to separate indexable tokens. */
    @Test
    public void testUnderScoreTokenization() {
        assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, Arrays.asList("ugcapi", "1"), null);
    }

    /** Punctuation such as '_', '.', '/' and '-' is expected to separate indexable tokens. */
    @Test
    public void testPhrasesWithPunctuation() {
        assertTokenize("PHY_101.html a space/time or space-time course", Language.ENGLISH, StemMode.NONE,
                       false,
                       Arrays.asList("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"),
                       null);
        assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("phy", "101"), null);
        assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("101", "3"), null);
    }

    /** Full-width letters are expected to come out as their lower-cased ASCII counterparts. */
    @Test
    public void testDoubleWidthTokenization() {
        // "sony"
        assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false,
                       Arrays.asList("sony"), null);
        assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false,
                       Arrays.asList("sony"), null);
        // "SONY"
        assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false,
                       Arrays.asList("sony"), null);
        assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false,
                       Arrays.asList("sony"), null);
        // "on"
        assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.NONE, false,
                       Arrays.asList("on"), null);
        assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false,
                       Arrays.asList("on"), null);
        // "ON"
        assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false,
                       Arrays.asList("on"), null);
        assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false,
                       Arrays.asList("on"), null);
    }

    /**
     * Tokenizes a large repetitive text and checks the token count and that sampled offsets never decrease.
     */
    @Test
    public void testLargeTextTokenization() {
        final int repetitions = 100000;
        StringBuilder sb = new StringBuilder();
        String s = "teststring ";
        for (int i = 0; i < repetitions; i++) {
            sb.append(s);
        }

        String input = sb.toString();

        int numTokens = 0;
        List<Long> pos = new ArrayList<>();
        for (Token t : tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false)) {
            numTokens++;
            // Sample every 100th offset; checking all of them is not needed to detect a regression.
            if ((numTokens % 100) == 0) {
                pos.add(t.getOffset());
            }
        }

        // Each repetition contributes one word token and one whitespace token.
        // JUnit's argument order is (message, expected, actual).
        assertEquals("Check that all tokens have been tokenized", 2 * repetitions, numTokens);
        assertTrue("Increasing token pos", isMonotonicallyNonDecreasing(pos));
    }

    /** A single very long word must come out as exactly one token with a non-null token string. */
    @Test
    public void testLargeTokenGuard() {
        StringBuilder str = new StringBuilder();
        for (int i = 0; i < 128 * 256; i++) {
            str.append("ab");
        }
        Iterator<Token> it = tokenizer.tokenize(str.toString(), Language.ENGLISH, StemMode.NONE, false).iterator();
        assertTrue(it.hasNext());
        assertNotNull(it.next().getTokenString());
        assertFalse(it.hasNext());
    }

    /** Checks iterator behaviour on empty input and on a three-word input (five tokens including spaces). */
    @Test
    public void testTokenIterator() {
        Iterator<Token> it = tokenizer.tokenize("", Language.ENGLISH, StemMode.NONE, false).iterator();
        assertFalse(it.hasNext());
        try {
            it.next();
            fail();
        } catch (NoSuchElementException expected) {
            // success
        }

        it = tokenizer.tokenize("", Language.ENGLISH, StemMode.NONE, false).iterator();
        assertFalse(it.hasNext());

        it = tokenizer.tokenize("one two three", Language.ENGLISH, StemMode.NONE, false).iterator();
        assertNotNull(it.next());
        assertNotNull(it.next());
        assertNotNull(it.next());
        assertNotNull(it.next());
        assertNotNull(it.next());
        assertFalse(it.hasNext());
    }

    /** Checks the offset and original length of every token of a German sentence. */
    @Test
    public void testGetOffsetLength() {
        String input = "Deka-Chef Weber r\u00e4umt Kommunikationsfehler ein";
        long[] expOffset = { 0, 4, 5, 9, 10, 15, 16, 21, 22, 42, 43 };
        int[] len = { 4, 1, 4, 1, 5, 1, 5, 1, 20, 1, 3 };

        int idx = 0;
        for (Token token : tokenizer.tokenize(input, Language.GERMAN, StemMode.SHORTEST, false)) {
            assertThat("Token offset for token #" + idx, token.getOffset(), is(expOffset[idx]));
            assertThat("Token len for token #" + idx, token.getOrig().length(), is(len[idx]));
            idx++;
        }
        // Without this the test would pass vacuously if the tokenizer produced too few (or no) tokens.
        assertEquals("Number of tokens", expOffset.length, idx);
    }

    /** Walks the component tree of every token produced for a short symbol/number string. */
    @Test
    public void testRecursiveDecompose() {
        for (Token t : tokenizer.tokenize("\u00a510%", Language.ENGLISH, StemMode.SHORTEST, false)) {
            recurseDecompose(t);
        }
    }

    /** A token with an empty token string must never be reported as indexable. */
    @Test
    public void testIndexability() {
        String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
        for (StemMode stemMode : new StemMode[] { StemMode.NONE,
                                                  StemMode.SHORTEST }) {
            for (Language l : new Language[] { Language.INDONESIAN,
                                               Language.ENGLISH, Language.ARABIC }) {
                for (boolean accentDrop : new boolean[] { true, false }) {
                    for (Token token : tokenizer.tokenize(input,
                                                          l, stemMode, accentDrop)) {
                        if (token.getTokenString().length() == 0) {
                            assertFalse(token.isIndexable());
                        }
                    }
                }
            }
        }
    }

    /**
     * Asserts basic sanity of a token and, recursively, of all its components.
     *
     * @param t the token to check
     */
    private void recurseDecompose(Token t) {
        assertTrue(t.getOffset() >= 0);
        // A length is never negative, so the meaningful check is that the original string exists.
        assertNotNull(t.getOrig());

        int numComp = t.getNumComponents();
        for (int i = 0; i < numComp; i++) {
            Token comp = t.getComponent(i);
            recurseDecompose(comp);
        }
    }

    /**
     * Returns whether the given values never decrease; equal neighbours are accepted.
     *
     * @param n the values to check, in order
     * @return false if some value is smaller than its predecessor (or smaller than -1), true otherwise
     */
    private boolean isMonotonicallyNonDecreasing(Iterable<Long> n) {
        long trailing = -1;
        for (long i : n) {
            if (i < trailing) {
                return false;
            }
            trailing = i;
        }
        return true;
    }

    /** Shorthand for English, no stemming and no accent dropping. */
    private void assertTokenize(String input, List<String> indexed, List<String> orig) {
        assertTokenize(input, Language.ENGLISH, StemMode.NONE, false, indexed, orig);
    }

    /**
     * <p>Compare the results of running an input string through the tokenizer with an "index" truth, and an optional
     * "orig" truth.</p>
     *
     * <p>For a token with components, the indexable components are compared instead of the token itself.</p>
     *
     * @param input      The text to process, passed to tokenizer.
     * @param language   The language tag, passed to tokenizer.
     * @param stemMode   Passed to the tokenizer.
     * @param accentDrop Passed to the tokenizer.
     * @param indexed    Compared to the lower-cased "TokenString" result of each indexable token.
     * @param orig       Compared to the "Orig" result from the tokenizer, or null to skip that comparison.
     */
    private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop,
                                List<String> indexed, List<String> orig) {
        int i = 0;
        int j = 0;
        for (Token token : tokenizer.tokenize(input, language, stemMode, accentDrop)) {
            if (token.getNumComponents() > 0) {
                for (int comp = 0; comp < token.getNumComponents(); comp++) {
                    Token t = token.getComponent(comp);
                    if (t.getType().isIndexable()) {
                        assertThat("comp index: " + i, toLowerCase(t.getTokenString()), is(indexed.get(i++)));
                    }
                }
            } else {
                if (token.getType().isIndexable()) {
                    assertThat("exp index: " + i, toLowerCase(token.getTokenString()), is(indexed.get(i++)));
                }
            }
            if (orig != null) {
                assertThat("orig index: " + j, token.getOrig(), is(orig.get(j++)));
            }
        }
        // Make sure no expected entries were left unmatched.
        assertThat("indexed length", i, is(indexed.size()));
        if (orig != null) {
            assertThat("orig length", j, is(orig.size()));
        }
    }

}
// diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java new file mode 100644 index 00000000000..66eee3f73d4 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java @@ -0,0 +1,89 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license.
// See LICENSE in the project root.
package com.yahoo.language.simple;

import com.yahoo.language.Language;
import com.yahoo.language.detect.Detection;
import com.yahoo.text.Utf8;
import org.junit.Test;

import java.nio.charset.Charset;

import static org.junit.Assert.assertEquals;

/**
 * Tests language and encoding detection by {@link SimpleDetector}.
 *
 * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
 */
public class SimpleDetectorTestCase {

    /** Feeds sample texts to the detector and checks the language it reports for each. */
    @Test
    public void requireThatLanguageCanDetected() {
        String plainEnglish = "Hello!";
        assertLanguage(Language.UNKNOWN, plainEnglish);

        // Simplified-Chinese input; the expectation here is CHINESE_TRADITIONAL all the same.
        String simplifiedChinese =
                "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002";
        assertLanguage(Language.CHINESE_TRADITIONAL, simplifiedChinese);

        // Traditional-Chinese input: "I can eat glass (and it doesn't hurt me)",
        // taken from http://www.columbia.edu/kermit/utf8.html
        String traditionalChinese =
                "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u50B7\u8EAB\u9AD4\u3002";
        assertLanguage(Language.CHINESE_TRADITIONAL, traditionalChinese);

        // Four katakana characters, from http://www.japanese-online.com/language/lessons/katakana.htm
        String katakana = "\u30ab\u30bf\u30ab\u30ca";
        assertLanguage(Language.JAPANESE, katakana);

        // Four hiragana characters, from http://www.japanese-online.com/language/lessons/hiragana.htm
        String hiragana = "\u3072\u3089\u304c\u306a";
        assertLanguage(Language.JAPANESE, hiragana);

        // "I can eat glass (and it doesn't hurt me)" from http://www.columbia.edu/kermit/utf8.html.
        // The sentence mixes Japanese and Chinese characters, so the detector has to look through
        // the text to find the Japanese ones.
        String mixedJapanese =
                "\u79c1\u306f\u30ac\u30e9\u30b9\u3092\u98df\u3079\u3089\u308c\u307e\u3059" +
                "\u3002\u305d\u308c\u306f\u79c1\u3092\u50b7\u3064\u3051\u307e\u305b\u3093" +
                "\u3002";
        assertLanguage(Language.JAPANESE, mixedJapanese);

        // Text from an introduction on an Adobe web page; its meaning is not known to the test authors.
        String koreanSyllables = "\ud55c\uae00\uacfc";
        assertLanguage(Language.KOREAN, koreanSyllables);

        // The letter for the sound of "A".
        String koreanLetter = "\u314f";
        assertLanguage(Language.KOREAN, koreanLetter);

        // "I can eat glass (and it doesn't hurt me)" from http://www.columbia.edu/kermit/utf8.html
        String koreanSentence =
                "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
                "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694";
        assertLanguage(Language.KOREAN, koreanSentence);
    }

    /** Feeds raw byte sequences to the detector and checks the encoding it guesses for each. */
    @Test
    public void testEncodingGuess() {
        // Arbitrary bytes above 127 that do not form valid UTF-8.
        byte[] highBytes = new byte[] { (byte)196, (byte)197, (byte)198 };
        assertEquals(Charset.forName("ISO-8859-1"), guessEncoding(highBytes));

        // "I can eat glass (and it doesn't hurt me)" from http://www.columbia.edu/kermit/utf8.html,
        // encoded as UTF-8.
        byte[] koreanUtf8 = Utf8.toBytes("\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
                                         "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
        assertEquals(Utf8.getCharset(), guessEncoding(koreanUtf8));

        // Arbitrary ASCII.
        byte[] ascii = new byte[] { 31, 32, 33 };
        assertEquals(Charset.forName("US-ASCII"), guessEncoding(ascii));

        // A single byte that is not valid in UTF-8.
        byte[] invalidByte = new byte[] { -1 };
        assertEquals(Charset.forName("ISO-8859-1"), guessEncoding(invalidByte));

        // The first byte only of a multi-byte UTF-8 sequence, i.e. a truncated character.
        byte[] truncatedUtf8 = new byte[] { Utf8.toBytes("\u00E5")[0] };
        assertEquals(Charset.forName("ISO-8859-1"), guessEncoding(truncatedUtf8));
    }

    /** Runs a fresh detector over the whole array and returns the encoding it reports. */
    private static Charset guessEncoding(byte[] bytes) {
        Detection detection = new SimpleDetector().detect(bytes, 0, bytes.length, null);
        return detection.getEncoding();
    }

    /** Asserts that a fresh detector reports the given language for the given text. */
    private static void assertLanguage(Language language, String input) {
        Language detected = new SimpleDetector().detect(input, null).getLanguage();
        assertEquals(language, detected);
    }

}
// diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java
b/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java new file mode 100644 index 00000000000..9c9c8b8fcc5 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleNormalizerTestCase.java @@ -0,0 +1,34 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.process.Normalizer; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +public class SimpleNormalizerTestCase { + + private static final Normalizer NORMALIZER = new SimpleNormalizer(); + + @Test + public void requireThatInputIsNfkcNormalized() { + assertNormalize("\u212B", "\u00C5"); + assertNormalize("\u2126", "\u03A9"); + assertNormalize("\u00C5", "\u00C5"); + assertNormalize("\u00F4", "\u00F4"); + assertNormalize("\u1E69", "\u1E69"); + assertNormalize("\u1E0B\u0323", "\u1E0D\u0307"); + assertNormalize("\u0071\u0307\u0323", "q\u0323\u0307"); + assertNormalize("\uFB01", "fi"); + assertNormalize("\u0032\u2075", "25"); + assertNormalize("\u1E9B\u0323", "\u1E69"); + } + + private static void assertNormalize(String input, String expectedNormalForm) { + assertEquals(expectedNormalForm, NORMALIZER.normalize(input)); + } + +}
// \ No newline at end of file diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java new file mode 100644 index 00000000000..b27b70b4dc9 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTestCase.java @@ -0,0 +1,194 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;

import com.yahoo.language.process.TokenScript;
import com.yahoo.language.process.TokenType;
import org.junit.Test;

import static org.junit.Assert.*;

/**
 * Tests the accessors, equals, hashCode and toString of {@link SimpleToken}.
 *
 * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
 */
public class SimpleTokenTestCase {

    /** The original string is returned by getOrig() and takes part in equals(). */
    @Test
    public void requireThatOrigAccessorsWork() {
        SimpleToken token = new SimpleToken("foo");
        assertEquals("foo", token.getOrig());

        assertEquals(token, new SimpleToken("foo"));
        assertFalse(token.equals(new SimpleToken("bar")));
    }

    /** Components are returned in insertion order, and equals() depends on both their presence and order. */
    @Test
    public void requireThatComponentAccessorsWork() {
        SimpleToken token = new SimpleToken("foo");
        assertEquals(0, token.getNumComponents());
        SimpleToken bar = new SimpleToken("bar");
        SimpleToken baz = new SimpleToken("baz");
        token.addComponent(bar);
        token.addComponent(baz);
        assertEquals(2, token.getNumComponents());
        assertSame(bar, token.getComponent(0));
        assertSame(baz, token.getComponent(1));

        // Equality is only reached once "other" holds the same components in the same order.
        SimpleToken other = new SimpleToken("foo");
        assertFalse(token.equals(other));
        other.addComponent(bar);
        assertFalse(token.equals(other));
        other.addComponent(baz);
        assertEquals(token, other);

        // Same components, reversed order: not equal.
        other = new SimpleToken("foo");
        other.addComponent(baz);
        other.addComponent(bar);
        assertFalse(token.equals(other));
    }

    /** A new token has no stems; after setTokenString() the token string is reported as stem 0. */
    @Test
    public void requireThatStemAccessorsWork() {
        SimpleToken token = new SimpleToken("foo");
        assertEquals(0, token.getNumStems());
        assertNull(token.getStem(0));
        token.setTokenString("bar");
        assertEquals(1, token.getNumStems());
        assertEquals("bar", token.getStem(0));
    }

    /** The token string is null until set, and takes part in equals(). */
    @Test
    public void requireThatTokenStringAccessorsWork() {
        SimpleToken token = new SimpleToken("foo");
        assertNull(token.getTokenString());
        token.setTokenString("bar");
        assertEquals("bar", token.getTokenString());

        SimpleToken other = new SimpleToken("foo");
        assertFalse(token.equals(other));
        other.setTokenString("bar");
        assertEquals(token, other);
    }

    /** The type defaults to UNKNOWN, can be set to every enum value, and takes part in equals(). */
    @Test
    public void requireThatTypeAccessorsWork() {
        SimpleToken token = new SimpleToken("foo");
        assertEquals(TokenType.UNKNOWN, token.getType());
        for (TokenType type : TokenType.values()) {
            token.setType(type);
            assertEquals(type, token.getType());
        }

        // "token" now holds the last enum value; "other" may only equal it when set to that same value.
        SimpleToken other = new SimpleToken("foo");
        for (TokenType type : TokenType.values()) {
            other.setType(type);
            if (type == token.getType()) {
                assertEquals(token, other);
            } else {
                assertFalse(token.equals(other));
            }
        }
    }

    /** The script defaults to UNKNOWN, can be set to every enum value, and takes part in equals(). */
    @Test
    public void requireThatScriptAccessorsWork() {
        SimpleToken token = new SimpleToken("foo");
        assertEquals(TokenScript.UNKNOWN, token.getScript());
        for (TokenScript script : TokenScript.values()) {
            token.setScript(script);
            assertEquals(script, token.getScript());
        }

        // "token" now holds the last enum value; "other" may only equal it when set to that same value.
        SimpleToken other = new SimpleToken("foo");
        for (TokenScript script : TokenScript.values()) {
            other.setScript(script);
            if (script == token.getScript()) {
                assertEquals(token, other);
            } else {
                assertFalse(token.equals(other));
            }
        }
    }

    /** The special-token flag defaults to false, can be toggled, and takes part in equals(). */
    @Test
    public void requireThatSpecialTokenAccessorsWork() {
        SimpleToken token = new SimpleToken("foo");
        assertFalse(token.isSpecialToken());
        token.setSpecialToken(true);
        assertTrue(token.isSpecialToken());
        token.setSpecialToken(false);
        assertFalse(token.isSpecialToken());

        SimpleToken other = new SimpleToken("foo");
        other.setSpecialToken(true);
        assertFalse(token.equals(other));
        other.setSpecialToken(false);
        assertEquals(token, other);
    }

    /** The offset defaults to 0, can be set, and takes part in equals(). */
    @Test
    public void requireThatOffsetAccessorsWork() {
        SimpleToken token = new SimpleToken("foo");
        assertEquals(0, token.getOffset());
        token.setOffset(69);
        assertEquals(69, token.getOffset());

        SimpleToken other = new SimpleToken("foo");
        assertFalse(token.equals(other));
        other.setOffset(69);
        assertEquals(token, other);
    }

    /**
     * toString() is compared against a full multi-line dump of the token and its two components.
     */
    @Test
    public void requireThatToStringIsExpressive() {
        SimpleToken token = new SimpleToken("my_orig");
        token.addComponent(new SimpleToken("my_component_1"));
        token.addComponent(new SimpleToken("my_component_2"));
        token.setTokenString("my_token_string");
        token.setType(TokenType.ALPHABETIC);
        token.setScript(TokenScript.ARABIC);
        token.setOffset(1);

        // NOTE(review): every nested line below starts with exactly one space as seen here; this looks like
        // collapsed indentation - confirm the literals against the real SimpleToken.toString() output.
        String expected = "token : SimpleToken {\n" +
                          " components : {\n" +
                          " [0] : SimpleToken {\n" +
                          " components : {\n" +
                          " }\n" +
                          " offset : 0\n" +
                          " orig : 'my_component_1'\n" +
                          " script : UNKNOWN\n" +
                          " special : false\n" +
                          " token string : null\n" +
                          " type : UNKNOWN\n" +
                          " }\n" +
                          " [1] : SimpleToken {\n" +
                          " components : {\n" +
                          " }\n" +
                          " offset : 0\n" +
                          " orig : 'my_component_2'\n" +
                          " script : UNKNOWN\n" +
                          " special : false\n" +
                          " token string : null\n" +
                          " type : UNKNOWN\n" +
                          " }\n" +
                          " }\n" +
                          " offset : 1\n" +
                          " orig : 'my_orig'\n" +
                          " script : ARABIC\n" +
                          " special : false\n" +
                          " token string : 'my_token_string'\n" +
                          " type : ALPHABETIC\n" +
                          "}";
        assertEquals(expected, token.toString());
    }

    /** Two tokens built from the same original string must have the same hash code. */
    @Test
    public void requireThatHashCodeIsImplemented() {
        assertEquals(new SimpleToken("foo").hashCode(), new SimpleToken("foo").hashCode());
    }

    /** A token never equals an unrelated object, and equals another token with the same original string. */
    @Test
    public void requireThatEqualsIsImplemented() {
        assertFalse(new SimpleToken("foo").equals(new Object()));
        assertEquals(new SimpleToken("foo"), new SimpleToken("foo"));
    }

}
// diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java
b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java new file mode 100644 index 00000000000..2d258be7af0 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenTypeTestCase.java @@ -0,0 +1,43 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.process.TokenType; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * Check simple token types. + * + * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a> + */ +public class SimpleTokenTypeTestCase { + + @Test + public final void test() { + assertEquals(TokenType.ALPHABETIC, tokenType('a')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u02c1')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u02c1')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u01c0')); + assertEquals(TokenType.SYMBOL, tokenType('\u20dd')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u0912')); + assertEquals(TokenType.NUMERIC, tokenType('1')); + assertEquals(TokenType.PUNCTUATION, tokenType('.')); + assertEquals(TokenType.PUNCTUATION, tokenType('\u0f3b')); + assertEquals(TokenType.PUNCTUATION, tokenType('\u0f3c')); + assertEquals(TokenType.PUNCTUATION, tokenType('\u203f')); + assertEquals(TokenType.SYMBOL, tokenType('\u2044')); + assertEquals(TokenType.SYMBOL, tokenType('$')); + assertEquals(TokenType.ALPHABETIC, tokenType('\u2132')); + assertEquals(TokenType.ALPHABETIC, tokenType('\uD800', '\uDFC8')); + } + + private static TokenType tokenType(char c) { + return SimpleTokenType.valueOf(c); + } + + private static TokenType tokenType(char high, char low) { + return SimpleTokenType.valueOf(Character.toCodePoint(high, low)); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java new 
// file mode 100644 index 00000000000..8760da56415 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java @@ -0,0 +1,36 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;

import com.yahoo.language.process.AbstractTokenizerTestCase;
import com.yahoo.language.process.StemMode;
import org.junit.Test;

/**
 * Tests the token strings produced through {@link TokenizerTester} with and without stemming.
 *
 * @author <a href="mailto:steinar@yahoo-inc.com">Steinar Knutsen</a>
 * @author bratseth
 */
public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {

    /** Sample text shared by both tests: a combining accent, digits inside words and a surrogate pair. */
    private static final String SAMPLE_TEXT =
            "a\u030a tralalala n4lle. \uD800\uDFC8 (old Persian sign Auramazda, sorry if " +
            "anyone 1s offended by ancien7 gods.Running)";

    /** Without stemming, "offended" is expected to be kept as is. */
    @Test
    public void testTokenizingNoStemming() {
        new TokenizerTester()
                .setStemMode(StemMode.NONE)
                .assertTokens(SAMPLE_TEXT,
                              "\u00E5", " ", "tralalala", " ", "n4lle", ".", " ", "\uD800\uDFC8", " ", "(",
                              "old", " ", "persian", " ", "sign", " ", "auramazda", ",", " ", "sorry", " ",
                              "if", " ", "anyone", " ", "1s", " ", "offended", " ", "by", " ", "ancien7",
                              " ", "gods", ".", "running", ")");
    }

    /** With stemming, "offended" is expected to become "offend"; all other tokens are unchanged. */
    @Test
    public void testTokenizingStemming() {
        new TokenizerTester()
                .setStemMode(StemMode.ALL)
                .assertTokens(SAMPLE_TEXT,
                              "\u00E5", " ", "tralalala", " ", "n4lle", ".", " ", "\uD800\uDFC8", " ", "(",
                              "old", " ", "persian", " ", "sign", " ", "auramazda", ",", " ", "sorry", " ",
                              "if", " ", "anyone", " ", "1s", " ", "offend", " ", "by", " ", "ancien7",
                              " ", "gods", ".", "running", ")");
    }

}
// diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java new file mode 100644 index 00000000000..ea4b85e4bd1 --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTransformerTestCase.java @@ -0,0 +1,40 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;

import com.yahoo.language.Language;
import com.yahoo.language.process.Transformer;
import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Tests accent dropping by {@link SimpleTransformer} for English input.
 *
 * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
 */
public class SimpleTransformerTestCase {

    private final static Transformer TRANSFORMER = new SimpleTransformer();

    /** Text without accents must come back unchanged. */
    @Test
    public void requireThatNonAccentsRemain() {
        assertTransform("foo", "foo");
    }

    /** Each row is { input, expected output of accentDrop }; rows are checked in order. */
    @Test
    public void requireThatTransformerRemovesAccents() {
        String[][] cases = {
            { "\u212B", "A" },
            { "\u2126", "\u03A9" },
            { "\u00C5", "A" },
            { "\u00F4", "o" },
            { "\u1E69", "s" },
            { "\u1E0B\u0323", "d" },
            { "\u0071\u0307\u0323", "q" },
            { "\uFB01", "\uFB01" },
            { "2\u2075", "2\u2075" },
            { "\u1E9B\u0323", "\u017F" },
        };
        for (String[] row : cases) {
            assertTransform(row[0], row[1]);
        }
    }

    /** Asserts that dropping accents from {@code input} (as English) yields {@code expectedTransform}. */
    private static void assertTransform(String input, String expectedTransform) {
        String transformed = TRANSFORMER.accentDrop(input, Language.ENGLISH);
        assertEquals(expectedTransform, transformed);
    }

}
// diff --git
// a/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java b/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java new file mode 100644 index 00000000000..bb59788b26e --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java @@ -0,0 +1,69 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;

import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.assertEquals;

/**
 * Configurable helper that tokenizes a text and compares the resulting token strings
 * with an expected list. Setters return this instance so that calls can be chained.
 *
 * @author bratseth
 */
public class TokenizerTester {

    private boolean accentDrop = false;
    private Language language = Language.ENGLISH;
    private Linguistics linguistics = new SimpleLinguistics();
    private StemMode stemMode = StemMode.NONE;

    /**
     * Tokenizes the input with the current settings and asserts that the flattened
     * token strings equal the expected ones, in order.
     *
     * @param input                the text to tokenize
     * @param expectedTokenStrings the token strings expected, in order
     */
    public void assertTokens(String input, String ... expectedTokenStrings) {
        List<String> collected = new ArrayList<>();
        for (Token root : tokenize(input)) {
            findTokenStrings(root, collected);
        }
        List<String> expected = Arrays.asList(expectedTokenStrings);
        assertEquals(expected, collected);
    }

    /**
     * Appends the token string of the given token to {@code out}, or - for a token that has
     * components and is not a special token - those of its components, recursively.
     *
     * @param token the token to flatten
     * @param out   the list to append to
     * @return {@code out}, for convenience
     */
    public List<String> findTokenStrings(Token token, List<String> out) {
        int componentCount = token.getNumComponents();
        if (token.isSpecialToken() || componentCount == 0) {
            // Special tokens are reported as a whole even when they have components.
            out.add(token.getTokenString());
            return out;
        }
        for (int index = 0; index < componentCount; index++) {
            findTokenStrings(token.getComponent(index), out);
        }
        return out;
    }

    /**
     * Tokenizes the input using the configured linguistics, language, stem mode and accent drop setting.
     *
     * @param input the text to tokenize
     * @return the tokens produced by the tokenizer
     */
    public Iterable<Token> tokenize(String input) {
        return linguistics.getTokenizer().tokenize(input, language, stemMode, accentDrop);
    }

    /** Sets whether accents should be dropped; the default is false. */
    public TokenizerTester setAccentDrop(boolean accentDrop) {
        this.accentDrop = accentDrop;
        return this;
    }

    /** Sets the language passed to the tokenizer; the default is English. */
    public TokenizerTester setLanguage(Language language) {
        this.language = language;
        return this;
    }

    /** Sets the linguistics implementation supplying the tokenizer; the default is SimpleLinguistics. */
    public TokenizerTester setLinguistics(Linguistics linguistics) {
        this.linguistics = linguistics;
        return this;
    }

    /** Sets the stem mode passed to the tokenizer; the default is NONE. */
    public TokenizerTester setStemMode(StemMode stemMode) {
        this.stemMode = stemMode;
        return this;
    }

}