diff options
Diffstat (limited to 'linguistics/src/test')
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java | 32 | ||||
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java | 23 |
2 files changed, 44 insertions, 11 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index 77489f2eb44..cd2a0f73895 100644 --- a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -4,6 +4,7 @@ package com.yahoo.language.opennlp; import com.yahoo.language.Language; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenType; import com.yahoo.language.process.Tokenizer; import org.junit.Test; @@ -151,11 +152,9 @@ public class OpenNlpTokenizationTestCase { String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652"; for (StemMode stemMode : new StemMode[] { StemMode.NONE, StemMode.SHORTEST }) { - for (Language l : new Language[] { Language.INDONESIAN, - Language.ENGLISH, Language.ARABIC }) { + for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) { for (boolean accentDrop : new boolean[] { true, false }) { - for (Token token : tokenizer.tokenize(input, - l, stemMode, accentDrop)) { + for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) { if (token.getTokenString().length() == 0) { assertFalse(token.isIndexable()); } @@ -165,6 +164,31 @@ public class OpenNlpTokenizationTestCase { } } + @Test + public void testTokenTypes() { + testTokenTypes(Language.ENGLISH); + testTokenTypes(Language.SPANISH); + } + + public void testTokenTypes(Language language) { + assertEquals(TokenType.ALPHABETIC, tokenize("word", language).iterator().next().getType()); + assertEquals(TokenType.NUMERIC, tokenize("123", language).iterator().next().getType()); + assertEquals(TokenType.SPACE, tokenize(" ", language).iterator().next().getType()); + assertEquals(TokenType.PUNCTUATION, tokenize(".", language).iterator().next().getType()); + assertEquals(TokenType.ALPHABETIC, tokenize("123word", language).iterator().next().getType()); + + var tokens = tokenize("123 123word word123", language).iterator(); + assertEquals(TokenType.NUMERIC, tokens.next().getType()); + assertEquals(TokenType.SPACE, tokens.next().getType()); + assertEquals(TokenType.ALPHABETIC, tokens.next().getType()); + assertEquals(TokenType.SPACE, tokens.next().getType()); + assertEquals(TokenType.ALPHABETIC, tokens.next().getType()); + } + + private Iterable<Token> tokenize(String input, Language language) { + return tokenizer.tokenize(input, language, StemMode.SHORTEST, true); + } + private void recurseDecompose(Token t) { assertTrue(t.getOffset() >= 0); assertTrue(t.getOrig().length() >= 0); diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java index 5054f5a9bff..fa8419e200f 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java @@ -171,21 +171,30 @@ public class GramSplitterTestCase { public void testChineseComma() { String text = "我喜欢红色、蓝色和紫色"; Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 2); - for (; grams.hasNext(); ) { - System.out.println(grams.next().extractFrom(text)); - } + assertEquals("我喜", grams.next().extractFrom(text)); + assertEquals("喜欢", grams.next().extractFrom(text)); + assertEquals("欢红", grams.next().extractFrom(text)); + assertEquals("红色", grams.next().extractFrom(text)); + assertEquals("蓝色", grams.next().extractFrom(text)); + assertEquals("色和", grams.next().extractFrom(text)); + assertEquals("和紫", grams.next().extractFrom(text)); + assertEquals("紫色", grams.next().extractFrom(text)); } @Test public void testEnglishComma() { String text = "我喜欢红色,蓝色和紫色"; Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 2); - for (; grams.hasNext(); ) { - System.out.println(grams.next().extractFrom(text)); - } + assertEquals("我喜", grams.next().extractFrom(text)); + assertEquals("喜欢", grams.next().extractFrom(text)); + assertEquals("欢红", grams.next().extractFrom(text)); + assertEquals("红色", grams.next().extractFrom(text)); + assertEquals("蓝色", grams.next().extractFrom(text)); + assertEquals("色和", grams.next().extractFrom(text)); + assertEquals("和紫", grams.next().extractFrom(text)); + assertEquals("紫色", grams.next().extractFrom(text)); } - private void assertGramSplits(String input, int gramSize, String ... expected) { assertEquals(Arrays.asList(expected), gramSplitter.split(input, gramSize).toExtractedList()); } |