aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java | 4
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java | 8
2 files changed, 10 insertions, 2 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
index 413dce0d6c1..5946a00b8bf 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -13,9 +13,9 @@ public class CharacterClasses {
* which are useful to view as letters even though not defined as such in unicode.
*/
public boolean isLetter(int c) {
- if (java.lang.Character.isLetter(c)) return true;
+ if (Character.isLetter(c)) return true;
if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
- // if (c == '_') return true;
+ if (Character.getType(c) == Character.OTHER_SYMBOL) return true; // emojis searchable
// Some CJK punctuation defined as word characters
if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index f9ff66ee345..1c2f7377bde 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -33,4 +33,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
" ", "gods", ".", "running", ")");
}
+ @Test
+ public void testTokenizeEmojis() {
+ TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
+ String emoji = "\uD83D\uDD2A"; // U+1F52A (🔪), written as a UTF-16 surrogate pair
+ tester.assertTokens(emoji, emoji); // a lone emoji is expected to come back as one token
+ tester.assertTokens(emoji + "foo", emoji, "foo"); // emoji directly followed by a word: expected as two tokens
+ }
+
}