aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@vespa.ai>2023-05-22 11:47:54 +0200
committerJon Bratseth <bratseth@vespa.ai>2023-05-22 11:47:54 +0200
commit179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (patch)
treef24eccc59c5d4f8d2caf8945ad0323c9143f5e53 /linguistics
parentc7a07adf43c13165e49e2aa2ef509ecb2526a48c (diff)
Threat 'other symbols' as letters
The unicode class 'other symbols' contains emojis, math symbols, etc. Treat these as letter characters to support searching for them.
Diffstat (limited to 'linguistics')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java4
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java8
2 files changed, 10 insertions, 2 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
index 413dce0d6c1..5946a00b8bf 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -13,9 +13,9 @@ public class CharacterClasses {
* which are useful to view as letters even though not defined as such in unicode.
*/
public boolean isLetter(int c) {
- if (java.lang.Character.isLetter(c)) return true;
+ if (Character.isLetter(c)) return true;
if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
- // if (c == '_') return true;
+ if (Character.getType(c) == Character.OTHER_SYMBOL) return true; // emojis searchable
// Some CJK punctuation defined as word characters
if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index f9ff66ee345..1c2f7377bde 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -33,4 +33,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
" ", "gods", ".", "running", ")");
}
+ @Test
+ public void testTokenizeEmojis() {
+ TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
+ String emoji = "\uD83D\uDD2A"; // 🔪
+ tester.assertTokens(emoji, emoji);
+ tester.assertTokens(emoji + "foo", emoji, "foo");
+ }
+
}