aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/test/java/com/yahoo
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@vespa.ai>2023-05-22 11:47:54 +0200
committerJon Bratseth <bratseth@vespa.ai>2023-05-22 11:47:54 +0200
commit179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (patch)
treef24eccc59c5d4f8d2caf8945ad0323c9143f5e53 /linguistics/src/test/java/com/yahoo
parentc7a07adf43c13165e49e2aa2ef509ecb2526a48c (diff)
Treat 'other symbols' as letters
The Unicode class 'other symbols' contains emojis, math symbols, etc. Treat these as letter characters to support searching for them.
Diffstat (limited to 'linguistics/src/test/java/com/yahoo')
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java8
1 files changed, 8 insertions, 0 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index f9ff66ee345..1c2f7377bde 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -33,4 +33,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
" ", "gods", ".", "running", ")");
}
+ @Test
+ public void testTokenizeEmojis() {
+ TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
+ String emoji = "\uD83D\uDD2A"; // 🔪
+ tester.assertTokens(emoji, emoji);
+ tester.assertTokens(emoji + "foo", emoji, "foo");
+ }
+
}