summaryrefslogtreecommitdiffstats
path: root/opennlp-linguistics
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@vespa.ai>2023-05-22 11:47:54 +0200
committerJon Bratseth <bratseth@vespa.ai>2023-05-22 11:47:54 +0200
commit179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (patch)
treef24eccc59c5d4f8d2caf8945ad0323c9143f5e53 /opennlp-linguistics
parentc7a07adf43c13165e49e2aa2ef509ecb2526a48c (diff)
Treat 'other symbols' as letters
The Unicode class 'other symbols' contains emojis, math symbols, etc. Treat these as letter characters to support searching for them.
Diffstat (limited to 'opennlp-linguistics')
-rw-r--r--opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java12
1 files changed, 10 insertions, 2 deletions
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index a5daf7f0531..ef29ffd51cc 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -150,8 +150,7 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testIndexability() {
String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
- for (StemMode stemMode : new StemMode[] { StemMode.NONE,
- StemMode.SHORTEST }) {
+ for (StemMode stemMode : new StemMode[] { StemMode.NONE, StemMode.SHORTEST }) {
for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) {
for (boolean accentDrop : new boolean[] { true, false }) {
for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) {
@@ -165,6 +164,15 @@ public class OpenNlpTokenizationTestCase {
}
@Test
+ public void testTokenizeEmojis() {
+ String emoji = "\uD83D\uDD2A"; // 🔪
+ Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator();
+ assertTrue(tokens.hasNext());
+ assertEquals(emoji, tokens.next().getTokenString());
+ assertFalse(tokens.hasNext());
+ }
+
+ @Test
public void testTokenTypes() {
testTokenTypes(Language.ENGLISH);
testTokenTypes(Language.SPANISH);