diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-06-02 08:50:08 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-06-02 08:50:08 +0200 |
commit | b18703690547333d559f09f63f40ada4fed6f4d4 (patch) | |
tree | 6329d27ff2a2b7ff357fbc65e93713e6ffdcc1da /opennlp-linguistics | |
parent | d799fb136d17e62cc13d7021d409618b58d6d60a (diff) |
Don't remove indexable symbols when stemming
Diffstat (limited to 'opennlp-linguistics')
-rw-r--r-- | opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java | 12 |
1 file changed, 12 insertions, 0 deletions
```diff
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index 78412f94fd4..33e820fbb9a 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -2,6 +2,7 @@
 package com.yahoo.language.opennlp;
 
 import com.yahoo.language.Language;
+import com.yahoo.language.process.StemList;
 import com.yahoo.language.process.StemMode;
 import com.yahoo.language.process.Token;
 import com.yahoo.language.process.TokenType;
@@ -180,6 +181,17 @@ public class OpenNlpTokenizationTestCase {
     }
 
     @Test
+    public void testStemEmojis() {
+        var stemmer = new OpenNlpLinguistics().getStemmer();
+        String emoji = "\uD83D\uDD2A"; // 🔪
+        List<StemList> stems = stemmer.stem(emoji, StemMode.ALL, Language.ENGLISH);
+        assertEquals(1, stems.size());
+        var stemList = stems.get(0);
+        assertEquals(1, stemList.size());
+        assertEquals(emoji, stemList.get(0));
+    }
+
+    @Test
     public void testTokenTypes() {
         testTokenTypes(Language.ENGLISH);
         testTokenTypes(Language.SPANISH);
```