about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java | 12
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java | 4
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java | 8
-rw-r--r-- opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java | 12
4 files changed, 32 insertions, 4 deletions
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
index 583e89bacd6..475b7beb879 100644
--- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
@@ -2580,4 +2580,16 @@ public class ParseTestCase {
void testNoGrammar4() {
tester.assertParsed("WEAKAND(100) foo bar baz one two 37", "foo -(bar baz \"one two\" 37)", Query.Type.TOKENIZE);
}
+
+ /**
+ * Checks that emoji characters survive query parsing with Query.Type.ANY:
+ * each emoji on its own, and the two emojis concatenated, are asserted to
+ * parse to exactly the string that was passed in as the query.
+ */
+ @Test
+ void testEmojis() {
+ String emoji1 = "\uD83D\uDD2A"; // 🔪 (U+1F52A, written as a UTF-16 surrogate pair)
+ tester.assertParsed(emoji1, emoji1, Query.Type.ANY);
+
+ String emoji2 = "\uD83D\uDE00"; // 😀 (U+1F600, written as a UTF-16 surrogate pair)
+ tester.assertParsed(emoji2, emoji2, Query.Type.ANY);
+
+ // Two adjacent emojis: the expected parse is the unchanged concatenation
+ tester.assertParsed(emoji1 + emoji2, emoji1 + emoji2, Query.Type.ANY);
+ }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
index 413dce0d6c1..5946a00b8bf 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -13,9 +13,9 @@ public class CharacterClasses {
* which are useful to view as letters even though not defined as such in unicode.
*/
public boolean isLetter(int c) {
- if (java.lang.Character.isLetter(c)) return true;
+ if (Character.isLetter(c)) return true;
if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
- // if (c == '_') return true;
+ if (Character.getType(c) == Character.OTHER_SYMBOL) return true; // emojis searchable
// Some CJK punctuation defined as word characters
if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index f9ff66ee345..1c2f7377bde 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -33,4 +33,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
" ", "gods", ".", "running", ")");
}
+ /**
+ * Checks tokenization of an emoji with the tester's stem mode set to StemMode.ALL:
+ * first the emoji alone, then the emoji immediately followed by "foo".
+ */
+ @Test
+ public void testTokenizeEmojis() {
+ TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
+ String emoji = "\uD83D\uDD2A"; // 🔪 (U+1F52A, written as a UTF-16 surrogate pair)
+ // NOTE(review): presumably assertTokens(input, expectedTokens...) - confirm against TokenizerTester
+ tester.assertTokens(emoji, emoji); // emoji alone: one token, the emoji itself
+ tester.assertTokens(emoji + "foo", emoji, "foo"); // emoji + word: two separate tokens
+ }
+
}
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index a5daf7f0531..ef29ffd51cc 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -150,8 +150,7 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testIndexability() {
String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
- for (StemMode stemMode : new StemMode[] { StemMode.NONE,
- StemMode.SHORTEST }) {
+ for (StemMode stemMode : new StemMode[] { StemMode.NONE, StemMode.SHORTEST }) {
for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) {
for (boolean accentDrop : new boolean[] { true, false }) {
for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) {
@@ -165,6 +164,15 @@ public class OpenNlpTokenizationTestCase {
}
@Test
+ public void testTokenizeEmojis() {
+ // A lone emoji must come out of the tokenizer as exactly one token with the same text.
+ String emoji = "\uD83D\uDD2A"; // 🔪 (U+1F52A, written as a UTF-16 surrogate pair)
+ // NOTE(review): the last argument is presumably the accentDrop flag, as in testIndexability - confirm
+ Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator();
+ assertTrue(tokens.hasNext());
+ assertEquals(emoji, tokens.next().getTokenString());
+ assertFalse(tokens.hasNext()); // nothing besides the emoji token
+ }
+
+ @Test
public void testTokenTypes() {
testTokenTypes(Language.ENGLISH);
testTokenTypes(Language.SPANISH);