diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 11:47:54 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 11:47:54 +0200 |
commit | 179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (patch) | |
tree | f24eccc59c5d4f8d2caf8945ad0323c9143f5e53 /opennlp-linguistics | |
parent | c7a07adf43c13165e49e2aa2ef509ecb2526a48c (diff) |
Treat 'other symbols' as letters
The unicode class 'other symbols' contains emojis, math symbols, etc.
Treat these as letter characters to support searching for them.
Diffstat (limited to 'opennlp-linguistics')
-rw-r--r-- | opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java | 12 |
1 files changed, 10 insertions, 2 deletions
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index a5daf7f0531..ef29ffd51cc 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -150,8 +150,7 @@ public class OpenNlpTokenizationTestCase { @Test public void testIndexability() { String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652"; - for (StemMode stemMode : new StemMode[] { StemMode.NONE, - StemMode.SHORTEST }) { + for (StemMode stemMode : new StemMode[] { StemMode.NONE, StemMode.SHORTEST }) { for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) { for (boolean accentDrop : new boolean[] { true, false }) { for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) { @@ -165,6 +164,15 @@ public class OpenNlpTokenizationTestCase { } @Test + public void testTokenizeEmojis() { + String emoji = "\uD83D\uDD2A"; // 🔪 + Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator(); + assertTrue(tokens.hasNext()); + assertEquals(emoji, tokens.next().getTokenString()); + assertFalse(tokens.hasNext()); + } + + @Test public void testTokenTypes() { testTokenTypes(Language.ENGLISH); testTokenTypes(Language.SPANISH); |