diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 23:08:48 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 23:08:48 +0200 |
commit | cc60531ac22a7e9601055174a02a6e67c428f800 (patch) | |
tree | 8a1f336745c8ae2da36ca55501e1192ad111ac32 /opennlp-linguistics/src/test | |
parent | 179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (diff) |
Always treat each symbol as a separate token
Diffstat (limited to 'opennlp-linguistics/src/test')
-rw-r--r-- | opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java | 17 |
1 file changed, 12 insertions, 5 deletions
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index ef29ffd51cc..78412f94fd4 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -165,11 +165,18 @@ public class OpenNlpTokenizationTestCase {
 
     @Test
     public void testTokenizeEmojis() {
-        String emoji = "\uD83D\uDD2A"; // 🔪
-        Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator();
-        assertTrue(tokens.hasNext());
-        assertEquals(emoji, tokens.next().getTokenString());
-        assertFalse(tokens.hasNext());
+        String emoji1 = "\uD83D\uDD2A"; // 🔪
+        Iterator<Token> tokens1 = tokenizer.tokenize(emoji1, Language.ENGLISH, StemMode.ALL, true).iterator();
+        assertTrue(tokens1.hasNext());
+        assertEquals(emoji1, tokens1.next().getTokenString());
+        assertFalse(tokens1.hasNext());
+
+        String emoji2 = "\uD83D\uDE00"; // 😀
+        Iterator<Token> tokens2 = tokenizer.tokenize(emoji1 + emoji2, Language.ENGLISH, StemMode.ALL, true).iterator();
+        assertTrue(tokens2.hasNext());
+        assertEquals(emoji1, tokens2.next().getTokenString());
+        assertEquals(emoji2, tokens2.next().getTokenString());
+        assertFalse(tokens2.hasNext());
     }
 
     @Test