about | summary | refs | log | tree | commit | diff | stats
path: root/opennlp-linguistics
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@vespa.ai> 2023-05-22 23:08:48 +0200
committer: Jon Bratseth <bratseth@vespa.ai> 2023-05-22 23:08:48 +0200
commit: cc60531ac22a7e9601055174a02a6e67c428f800 (patch)
tree: 8a1f336745c8ae2da36ca55501e1192ad111ac32 /opennlp-linguistics
parent: 179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (diff)
Always treat each symbol as a separate token
Diffstat (limited to 'opennlp-linguistics')
-rw-r--r-- opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java 41
-rw-r--r-- opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java 17
2 files changed, 32 insertions, 26 deletions
diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 8080dc92729..5452da71775 100644
--- a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -25,7 +25,6 @@ import java.util.List;
*/
public class OpenNlpTokenizer implements Tokenizer {
- private final static int SPACE_CODE = 32;
private final Normalizer normalizer;
private final Transformer transformer;
private final SimpleTokenizer simpleTokenizer;
@@ -74,26 +73,26 @@ public class OpenNlpTokenizer implements Tokenizer {
}
private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
- switch (language) {
- case DANISH: return SnowballStemmer.ALGORITHM.DANISH;
- case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH;
- case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH;
- case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH;
- case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN;
- case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN;
- case IRISH: return SnowballStemmer.ALGORITHM.IRISH;
- case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN;
- case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN;
- case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN;
- case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE;
- case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN;
- case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN;
- case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH;
- case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH;
- case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH;
- case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH;
- default: return null;
- }
+ return switch (language) {
+ case DANISH -> SnowballStemmer.ALGORITHM.DANISH;
+ case DUTCH -> SnowballStemmer.ALGORITHM.DUTCH;
+ case FINNISH -> SnowballStemmer.ALGORITHM.FINNISH;
+ case FRENCH -> SnowballStemmer.ALGORITHM.FRENCH;
+ case GERMAN -> SnowballStemmer.ALGORITHM.GERMAN;
+ case HUNGARIAN -> SnowballStemmer.ALGORITHM.HUNGARIAN;
+ case IRISH -> SnowballStemmer.ALGORITHM.IRISH;
+ case ITALIAN -> SnowballStemmer.ALGORITHM.ITALIAN;
+ case NORWEGIAN_BOKMAL -> SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case NORWEGIAN_NYNORSK -> SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case PORTUGUESE -> SnowballStemmer.ALGORITHM.PORTUGUESE;
+ case ROMANIAN -> SnowballStemmer.ALGORITHM.ROMANIAN;
+ case RUSSIAN -> SnowballStemmer.ALGORITHM.RUSSIAN;
+ case SPANISH -> SnowballStemmer.ALGORITHM.SPANISH;
+ case SWEDISH -> SnowballStemmer.ALGORITHM.SWEDISH;
+ case TURKISH -> SnowballStemmer.ALGORITHM.TURKISH;
+ case ENGLISH -> SnowballStemmer.ALGORITHM.ENGLISH;
+ default -> null;
+ };
}
}
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index ef29ffd51cc..78412f94fd4 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -165,11 +165,18 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testTokenizeEmojis() {
- String emoji = "\uD83D\uDD2A"; // 🔪
- Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator();
- assertTrue(tokens.hasNext());
- assertEquals(emoji, tokens.next().getTokenString());
- assertFalse(tokens.hasNext());
+ String emoji1 = "\uD83D\uDD2A"; // 🔪
+ Iterator<Token> tokens1 = tokenizer.tokenize(emoji1, Language.ENGLISH, StemMode.ALL, true).iterator();
+ assertTrue(tokens1.hasNext());
+ assertEquals(emoji1, tokens1.next().getTokenString());
+ assertFalse(tokens1.hasNext());
+
+ String emoji2 = "\uD83D\uDE00"; // 😀
+ Iterator<Token> tokens2 = tokenizer.tokenize(emoji1 + emoji2, Language.ENGLISH, StemMode.ALL, true).iterator();
+ assertTrue(tokens2.hasNext());
+ assertEquals(emoji1, tokens2.next().getTokenString());
+ assertEquals(emoji2, tokens2.next().getTokenString());
+ assertFalse(tokens2.hasNext());
}
@Test