summaryrefslogtreecommitdiffstats
path: root/opennlp-linguistics
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@vespa.ai> 2023-06-02 08:50:08 +0200
committer: Jon Bratseth <bratseth@vespa.ai> 2023-06-02 08:50:08 +0200
commit: b18703690547333d559f09f63f40ada4fed6f4d4 (patch)
tree: 6329d27ff2a2b7ff357fbc65e93713e6ffdcc1da /opennlp-linguistics
parent: d799fb136d17e62cc13d7021d409618b58d6d60a (diff)
Don't remove indexable symbols when stemming
Diffstat (limited to 'opennlp-linguistics')
-rw-r--r-- opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java | 12
1 files changed, 12 insertions, 0 deletions
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index 78412f94fd4..33e820fbb9a 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -2,6 +2,7 @@
package com.yahoo.language.opennlp;
import com.yahoo.language.Language;
+import com.yahoo.language.process.StemList;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
@@ -180,6 +181,17 @@ public class OpenNlpTokenizationTestCase {
}
@Test
+ public void testStemEmojis() { // Checks that stemming an emoji-only input keeps the emoji as its single stem
+ var stemmer = new OpenNlpLinguistics().getStemmer();
+ String emoji = "\uD83D\uDD2A"; // 🔪 (U+1F52A, spelled as a UTF-16 surrogate pair)
+ List<StemList> stems = stemmer.stem(emoji, StemMode.ALL, Language.ENGLISH);
+ assertEquals(1, stems.size()); // the one-symbol input should yield exactly one StemList
+ var stemList = stems.get(0);
+ assertEquals(1, stemList.size()); // that StemList should hold exactly one stem
+ assertEquals(emoji, stemList.get(0)); // and the stem should be the emoji itself, unchanged
+ }
+
+ @Test
public void testTokenTypes() {
testTokenTypes(Language.ENGLISH);
testTokenTypes(Language.SPANISH);