diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 11:47:54 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 11:47:54 +0200 |
commit | 179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (patch) | |
tree | f24eccc59c5d4f8d2caf8945ad0323c9143f5e53 /linguistics/src/test/java/com/yahoo | |
parent | c7a07adf43c13165e49e2aa2ef509ecb2526a48c (diff) |
Treat 'other symbols' as letters
The Unicode class 'other symbols' contains emojis, math symbols, etc.
Treat these as letter characters to support searching for them.
Diffstat (limited to 'linguistics/src/test/java/com/yahoo')
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java index f9ff66ee345..1c2f7377bde 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java @@ -33,4 +33,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase { " ", "gods", ".", "running", ")"); } + @Test + public void testTokenizeEmojis() { + TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL); + String emoji = "\uD83D\uDD2A"; // 🔪 + tester.assertTokens(emoji, emoji); + tester.assertTokens(emoji + "foo", emoji, "foo"); + } + } |