diff options
4 files changed, 32 insertions, 4 deletions
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java index 583e89bacd6..475b7beb879 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java @@ -2580,4 +2580,16 @@ public class ParseTestCase { void testNoGrammar4() { tester.assertParsed("WEAKAND(100) foo bar baz one two 37", "foo -(bar baz \"one two\" 37)", Query.Type.TOKENIZE); } + + @Test + void testEmojis() { + String emoji1 = "\uD83D\uDD2A"; // 🔪 + tester.assertParsed(emoji1, emoji1, Query.Type.ANY); + + String emoji2 = "\uD83D\uDE00"; // 😀 + tester.assertParsed(emoji2, emoji2, Query.Type.ANY); + + tester.assertParsed(emoji1 + emoji2, emoji1 + emoji2, Query.Type.ANY); + } + } diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java index 413dce0d6c1..5946a00b8bf 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java +++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java @@ -13,9 +13,9 @@ public class CharacterClasses { * which are useful to view as letters even though not defined as such in unicode. */ public boolean isLetter(int c) { - if (java.lang.Character.isLetter(c)) return true; + if (Character.isLetter(c)) return true; if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters - // if (c == '_') return true; + if (Character.getType(c) == Character.OTHER_SYMBOL) return true; // emojis searchable // Some CJK punctuation defined as word characters if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' || diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java index f9ff66ee345..1c2f7377bde 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java @@ -33,4 +33,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase { " ", "gods", ".", "running", ")"); } + @Test + public void testTokenizeEmojis() { + TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL); + String emoji = "\uD83D\uDD2A"; // 🔪 + tester.assertTokens(emoji, emoji); + tester.assertTokens(emoji + "foo", emoji, "foo"); + } + } diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index a5daf7f0531..ef29ffd51cc 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -150,8 +150,7 @@ public class OpenNlpTokenizationTestCase { @Test public void testIndexability() { String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652"; - for (StemMode stemMode : new StemMode[] { StemMode.NONE, - StemMode.SHORTEST }) { + for (StemMode stemMode : new StemMode[] { StemMode.NONE, StemMode.SHORTEST }) { for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) { for (boolean accentDrop : new boolean[] { true, false }) { for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) { @@ -165,6 +164,15 @@ public class OpenNlpTokenizationTestCase { } @Test + public void testTokenizeEmojis() { + String emoji = "\uD83D\uDD2A"; // 🔪 + Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator(); + assertTrue(tokens.hasNext()); + assertEquals(emoji, tokens.next().getTokenString()); + assertFalse(tokens.hasNext()); + } + + @Test public void testTokenTypes() { testTokenTypes(Language.ENGLISH); testTokenTypes(Language.SPANISH); |