From cc60531ac22a7e9601055174a02a6e67c428f800 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Mon, 22 May 2023 23:08:48 +0200 Subject: Always treat each symbol as a separate token --- .../yahoo/language/opennlp/OpenNlpTokenizer.java | 41 +++++++++++----------- .../opennlp/OpenNlpTokenizationTestCase.java | 17 ++++++--- 2 files changed, 32 insertions(+), 26 deletions(-) (limited to 'opennlp-linguistics') diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 8080dc92729..5452da71775 100644 --- a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -25,7 +25,6 @@ import java.util.List; */ public class OpenNlpTokenizer implements Tokenizer { - private final static int SPACE_CODE = 32; private final Normalizer normalizer; private final Transformer transformer; private final SimpleTokenizer simpleTokenizer; @@ -74,26 +73,26 @@ public class OpenNlpTokenizer implements Tokenizer { } private SnowballStemmer.ALGORITHM algorithmFor(Language language) { - switch (language) { - case DANISH: return SnowballStemmer.ALGORITHM.DANISH; - case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH; - case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH; - case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH; - case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN; - case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN; - case IRISH: return SnowballStemmer.ALGORITHM.IRISH; - case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN; - case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN; - case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN; - case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE; - case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN; - case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN; - case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH; - case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH; - case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH; - case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH; - default: return null; - } + return switch (language) { + case DANISH -> SnowballStemmer.ALGORITHM.DANISH; + case DUTCH -> SnowballStemmer.ALGORITHM.DUTCH; + case FINNISH -> SnowballStemmer.ALGORITHM.FINNISH; + case FRENCH -> SnowballStemmer.ALGORITHM.FRENCH; + case GERMAN -> SnowballStemmer.ALGORITHM.GERMAN; + case HUNGARIAN -> SnowballStemmer.ALGORITHM.HUNGARIAN; + case IRISH -> SnowballStemmer.ALGORITHM.IRISH; + case ITALIAN -> SnowballStemmer.ALGORITHM.ITALIAN; + case NORWEGIAN_BOKMAL -> SnowballStemmer.ALGORITHM.NORWEGIAN; + case NORWEGIAN_NYNORSK -> SnowballStemmer.ALGORITHM.NORWEGIAN; + case PORTUGUESE -> SnowballStemmer.ALGORITHM.PORTUGUESE; + case ROMANIAN -> SnowballStemmer.ALGORITHM.ROMANIAN; + case RUSSIAN -> SnowballStemmer.ALGORITHM.RUSSIAN; + case SPANISH -> SnowballStemmer.ALGORITHM.SPANISH; + case SWEDISH -> SnowballStemmer.ALGORITHM.SWEDISH; + case TURKISH -> SnowballStemmer.ALGORITHM.TURKISH; + case ENGLISH -> SnowballStemmer.ALGORITHM.ENGLISH; + default -> null; + }; } } diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index ef29ffd51cc..78412f94fd4 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -165,11 +165,18 @@ public class OpenNlpTokenizationTestCase { @Test public void testTokenizeEmojis() { - String emoji = "\uD83D\uDD2A"; // 🔪 - Iterator tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator(); - assertTrue(tokens.hasNext()); - assertEquals(emoji, tokens.next().getTokenString()); - assertFalse(tokens.hasNext()); + String emoji1 = "\uD83D\uDD2A"; // 🔪 + Iterator tokens1 = tokenizer.tokenize(emoji1, Language.ENGLISH, StemMode.ALL, true).iterator(); + assertTrue(tokens1.hasNext()); + assertEquals(emoji1, tokens1.next().getTokenString()); + assertFalse(tokens1.hasNext()); + + String emoji2 = "\uD83D\uDE00"; // 😀 + Iterator tokens2 = tokenizer.tokenize(emoji1 + emoji2, Language.ENGLISH, StemMode.ALL, true).iterator(); + assertTrue(tokens2.hasNext()); + assertEquals(emoji1, tokens2.next().getTokenString()); + assertEquals(emoji2, tokens2.next().getTokenString()); + assertFalse(tokens2.hasNext()); } @Test -- cgit v1.2.3