summaryrefslogtreecommitdiffstats
path: root/opennlp-linguistics
diff options
context:
space:
mode:
Diffstat (limited to 'opennlp-linguistics')
-rw-r--r--opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java12
1 files changed, 10 insertions, 2 deletions
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index a5daf7f0531..ef29ffd51cc 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -150,8 +150,7 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testIndexability() {
String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
- for (StemMode stemMode : new StemMode[] { StemMode.NONE,
- StemMode.SHORTEST }) {
+ for (StemMode stemMode : new StemMode[] { StemMode.NONE, StemMode.SHORTEST }) {
for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) {
for (boolean accentDrop : new boolean[] { true, false }) {
for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) {
@@ -165,6 +164,15 @@ public class OpenNlpTokenizationTestCase {
}
@Test
+ public void testTokenizeEmojis() {
+ String emoji = "\uD83D\uDD2A"; // 🔪
+ Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator();
+ assertTrue(tokens.hasNext());
+ assertEquals(emoji, tokens.next().getTokenString());
+ assertFalse(tokens.hasNext());
+ }
+
+ @Test
public void testTokenTypes() {
testTokenTypes(Language.ENGLISH);
testTokenTypes(Language.SPANISH);