diff options
6 files changed, 29 insertions, 8 deletions
diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index a6aa902c688..dc85a2e6f0b 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -724,6 +724,7 @@ "public static final enum com.yahoo.language.process.TokenType SYMBOL", "public static final enum com.yahoo.language.process.TokenType ALPHABETIC", "public static final enum com.yahoo.language.process.TokenType NUMERIC", + "public static final enum com.yahoo.language.process.TokenType INDEXABLE_SYMBOL", "public static final enum com.yahoo.language.process.TokenType MARKER" ] }, diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java index 14c7e9bc144..6c3e0c2ab36 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java +++ b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java @@ -14,6 +14,7 @@ public enum TokenType { SYMBOL(3), ALPHABETIC(4), NUMERIC(5), + INDEXABLE_SYMBOL(6), MARKER(255); private final int value; @@ -34,10 +35,10 @@ public enum TokenType { * @return whether this type of token can be indexed */ public boolean isIndexable() { - switch (this) { - case ALPHABETIC: case NUMERIC: return true; - default: return false; - } + return switch (this) { + case ALPHABETIC, NUMERIC, INDEXABLE_SYMBOL -> true; + default -> false; + }; } /** Translates this from the int code representation returned from {@link #getValue} */ diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java index 5c321e4da9b..8a88ae8f005 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java @@ -31,8 +31,9 @@ public class SimpleTokenType { case Character.MATH_SYMBOL: case Character.CURRENCY_SYMBOL: case Character.MODIFIER_SYMBOL: - case Character.OTHER_SYMBOL: return TokenType.SYMBOL; + case Character.OTHER_SYMBOL: + return TokenType.INDEXABLE_SYMBOL; case Character.OTHER_NUMBER: // "SUPERSCRIPT TWO", diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 2728249333e..d86ca30a632 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -67,7 +67,7 @@ public class SimpleTokenizer implements Tokenizer { for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) { nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE; TokenType nextType = SimpleTokenType.valueOf(nextCode); - if (!prevType.isIndexable() || !nextType.isIndexable()) { + if (isAtTokenBoundary(prevType, nextType)) { String original = input.substring(prev, next); tokens.add(new SimpleToken(original).setOffset(prev) .setType(tokenType) @@ -84,6 +84,12 @@ public class SimpleTokenizer implements Tokenizer { return tokens; } + private boolean isAtTokenBoundary(TokenType prevType, TokenType nextType) { + // Always index each symbol as a token + if (prevType == TokenType.INDEXABLE_SYMBOL || nextType == TokenType.INDEXABLE_SYMBOL) return true; + return !prevType.isIndexable() || !nextType.isIndexable(); + } + private TokenType determineType(TokenType tokenType, TokenType characterType) { if (characterType == TokenType.ALPHABETIC) return TokenType.ALPHABETIC; return tokenType; diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java index 0ce9b327533..70a97cda7e3 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java @@ -23,9 +23,9 @@ public class TokenTypeTestCase { } @Test - public void requireThatOnlyAlphaNumericsAreIndexable() { + public void testIsIndexable() { for (TokenType type : TokenType.values()) { - if (type == TokenType.ALPHABETIC || type == TokenType.NUMERIC) { + if (type == TokenType.ALPHABETIC || type == TokenType.NUMERIC || type == TokenType.INDEXABLE_SYMBOL) { assertTrue(type.isIndexable()); } else { assertFalse(type.isIndexable()); diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index 78412f94fd4..33e820fbb9a 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -2,6 +2,7 @@ package com.yahoo.language.opennlp; import com.yahoo.language.Language; +import com.yahoo.language.process.StemList; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import com.yahoo.language.process.TokenType; @@ -180,6 +181,17 @@ public class OpenNlpTokenizationTestCase { } @Test + public void testStemEmojis() { + var stemmer = new OpenNlpLinguistics().getStemmer(); + String emoji = "\uD83D\uDD2A"; // 🔪 + List<StemList> stems = stemmer.stem(emoji, StemMode.ALL, Language.ENGLISH); + assertEquals(1, stems.size()); + var stemList = stems.get(0); + assertEquals(1, stemList.size()); + assertEquals(emoji, stemList.get(0)); + } + + @Test public void testTokenTypes() { testTokenTypes(Language.ENGLISH); testTokenTypes(Language.SPANISH); |