summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@vespa.ai> 2023-05-22 11:47:54 +0200
committer Jon Bratseth <bratseth@vespa.ai> 2023-05-22 11:47:54 +0200
commit 179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (patch)
tree f24eccc59c5d4f8d2caf8945ad0323c9143f5e53
parent c7a07adf43c13165e49e2aa2ef509ecb2526a48c (diff)
Treat 'other symbols' as letters
The Unicode class 'other symbols' contains emojis, math symbols, etc. Treat these as letter characters to support searching for them.
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java | 12
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java | 4
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java | 8
-rw-r--r-- opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java | 12
4 files changed, 32 insertions, 4 deletions
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
index 583e89bacd6..475b7beb879 100644
--- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
@@ -2580,4 +2580,16 @@ public class ParseTestCase {
void testNoGrammar4() {
tester.assertParsed("WEAKAND(100) foo bar baz one two 37", "foo -(bar baz \"one two\" 37)", Query.Type.TOKENIZE);
}
+
+ @Test
+ void testEmojis() {
+ String emoji1 = "\uD83D\uDD2A"; // 🔪
+ tester.assertParsed(emoji1, emoji1, Query.Type.ANY);
+
+ String emoji2 = "\uD83D\uDE00"; // 😀
+ tester.assertParsed(emoji2, emoji2, Query.Type.ANY);
+
+ tester.assertParsed(emoji1 + emoji2, emoji1 + emoji2, Query.Type.ANY);
+ }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
index 413dce0d6c1..5946a00b8bf 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -13,9 +13,9 @@ public class CharacterClasses {
* which are useful to view as letters even though not defined as such in unicode.
*/
public boolean isLetter(int c) {
- if (java.lang.Character.isLetter(c)) return true;
+ if (Character.isLetter(c)) return true;
if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
- // if (c == '_') return true;
+ if (Character.getType(c) == Character.OTHER_SYMBOL) return true; // emojis searchable
// Some CJK punctuation defined as word characters
if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index f9ff66ee345..1c2f7377bde 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -33,4 +33,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
" ", "gods", ".", "running", ")");
}
+ @Test
+ public void testTokenizeEmojis() {
+ TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
+ String emoji = "\uD83D\uDD2A"; // 🔪
+ tester.assertTokens(emoji, emoji);
+ tester.assertTokens(emoji + "foo", emoji, "foo");
+ }
+
}
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index a5daf7f0531..ef29ffd51cc 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -150,8 +150,7 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testIndexability() {
String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
- for (StemMode stemMode : new StemMode[] { StemMode.NONE,
- StemMode.SHORTEST }) {
+ for (StemMode stemMode : new StemMode[] { StemMode.NONE, StemMode.SHORTEST }) {
for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) {
for (boolean accentDrop : new boolean[] { true, false }) {
for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) {
@@ -165,6 +164,15 @@ public class OpenNlpTokenizationTestCase {
}
@Test
+ public void testTokenizeEmojis() {
+ String emoji = "\uD83D\uDD2A"; // 🔪
+ Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator();
+ assertTrue(tokens.hasNext());
+ assertEquals(emoji, tokens.next().getTokenString());
+ assertFalse(tokens.hasNext());
+ }
+
+ @Test
public void testTokenTypes() {
testTokenTypes(Language.ENGLISH);
testTokenTypes(Language.SPANISH);