diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 23:08:48 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 23:08:48 +0200 |
commit | cc60531ac22a7e9601055174a02a6e67c428f800 (patch) | |
tree | 8a1f336745c8ae2da36ca55501e1192ad111ac32 | |
parent | 179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (diff) |
Always treat each symbol as a separate token
9 files changed, 95 insertions, 51 deletions
diff --git a/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java b/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java index c143aa43d53..06ea202b9c3 100644 --- a/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java +++ b/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java @@ -52,7 +52,7 @@ public class NGramTestCase extends AbstractSchemaTestCase { @Test void testInvalidNGramSetting1() throws IOException, ParseException { try { - Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram1.sd"); + ApplicationBuilder.buildFromFile("src/test/examples/invalidngram1.sd"); fail("Should cause an exception"); } catch (IllegalArgumentException e) { @@ -63,7 +63,7 @@ public class NGramTestCase extends AbstractSchemaTestCase { @Test void testInvalidNGramSetting2() throws IOException, ParseException { try { - Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram2.sd"); + ApplicationBuilder.buildFromFile("src/test/examples/invalidngram2.sd"); fail("Should cause an exception"); } catch (IllegalArgumentException e) { @@ -74,7 +74,7 @@ public class NGramTestCase extends AbstractSchemaTestCase { @Test void testInvalidNGramSetting3() throws IOException, ParseException { try { - Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram3.sd"); + ApplicationBuilder.buildFromFile("src/test/examples/invalidngram3.sd"); fail("Should cause an exception"); } catch (IllegalArgumentException e) { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java index c1d415b8e27..01bb606e9ee 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java @@ -107,7 +107,9 @@ public final class Tokenizer { if (i >= source.length()) break; int c = source.codePointAt(i); - if (characterClasses.isLetterOrDigit(c) || (c == '\'' && acceptApostropheAsWordCharacter(currentIndex))) { + if (characterClasses.isSymbol(c)) { // treat each symbol is a separate word + addToken(WORD, Character.toString(c), i, i + 1); + } else if (characterClasses.isLetterOrDigit(c) || (c == '\'' && acceptApostropheAsWordCharacter(currentIndex))) { i = consumeWordOrNumber(i, currentIndex); } else if (Character.isWhitespace(c)) { addToken(SPACE, " ", i, i + 1); diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java index 475b7beb879..c2f533d4cfd 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java @@ -2589,7 +2589,7 @@ public class ParseTestCase { String emoji2 = "\uD83D\uDE00"; // 😀 tester.assertParsed(emoji2, emoji2, Query.Type.ANY); - tester.assertParsed(emoji1 + emoji2, emoji1 + emoji2, Query.Type.ANY); + tester.assertParsed("AND " + emoji1 + " " + emoji2, emoji1 + emoji2, Query.Type.ANY); } } diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java index 5946a00b8bf..f6177262bf9 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java +++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java @@ -15,7 +15,6 @@ public class CharacterClasses { public boolean isLetter(int c) { if (Character.isLetter(c)) return true; if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters - if (Character.getType(c) == Character.OTHER_SYMBOL) return true; // emojis searchable // Some CJK punctuation defined as word characters if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' || @@ -30,6 +29,13 @@ public class CharacterClasses { } /** + * Returns true if the character is in the class "other symbol" - emojis etc. + */ + public boolean isSymbol(int c) { + return Character.getType(c) == Character.OTHER_SYMBOL; + } + + /** * Returns true for code points which should be considered digits - same as java.lang.Character.isDigit */ public boolean isDigit(int c) { diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java index 83110c0021e..210d7ac94ff 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -88,46 +88,54 @@ public class GramSplitter { } private Gram findNext() { - // Skip to next word character - while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) { + // Skip to next indexable character + while (i < input.length() && !isIndexable(input.codePointAt(i))) { i = input.next(i); isFirstAfterSeparator = true; } - if (i >= input.length()) return null; - - UnicodeString gram = input.substring(i, n); - int nonWordChar = indexOfNonWordCodepoint(gram); - if (nonWordChar == 0) throw new RuntimeException("Programming error"); - - if (nonWordChar > 0) - gram = new UnicodeString(gram.toString().substring(0, nonWordChar)); + if (i >= input.length()) return null; // no indexable characters + int tokenStart = i; + UnicodeString gram = input.substring(tokenStart, n); + int tokenEnd = tokenEnd(gram); + gram = new UnicodeString(gram.toString().substring(0, tokenEnd)); if (gram.codePointCount() == n) { // normal case: got a full length gram Gram g = new Gram(i, gram.codePointCount()); i = input.next(i); isFirstAfterSeparator = false; return g; } - else { // gram is too short due either to a non-word separator or end of string - if (isFirstAfterSeparator) { // make a gram anyway + else { // gram is too short due either to being a symbol, being followed by a non-word separator, or end of string + if (isFirstAfterSeparator || ( gram.codePointCount() == 1 && characterClasses.isSymbol(gram.codePointAt(0)))) { // make a gram anyway Gram g = new Gram(i, gram.codePointCount()); i = input.next(i); isFirstAfterSeparator = false; return g; } else { // skip to next - i = input.skip(gram.codePointCount() + 1, i); + i = input.skip(gram.codePointCount(), i); isFirstAfterSeparator = true; return findNext(); } } } - private int indexOfNonWordCodepoint(UnicodeString s) { - for (int i = 0; i < s.length(); i = s.next(i)) { + private boolean isIndexable(int codepoint) { + if (characterClasses.isLetterOrDigit(codepoint)) return true; + if (characterClasses.isSymbol(codepoint)) return true; + return false; + } + + /** Given a string s starting by an indexable character, return the position where that token should end. */ + private int tokenEnd(UnicodeString s) { + if (characterClasses.isSymbol(s.codePointAt(0))) + return s.next(0); // symbols have length 1 + + int i = 0; + for (; i < s.length(); i = s.next(i)) { if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i))) return i; } - return -1; + return i; } @Override diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java index 6cefcfbf67a..a219efce3cd 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java @@ -49,6 +49,17 @@ public class GramSplitterTestCase { } @Test + public void testEmojis() { + String emoji1 = "\uD83D\uDD2A"; // 🔪 + String emoji2 = "\uD83D\uDE00"; // 😀 + assertGramSplit(emoji1, 2, "[" + emoji1+ "]"); + assertGramSplit(emoji1 + emoji2, 2, "[" + emoji1 + ", " + emoji2 + "]"); + assertGramSplit(emoji1 + "." + emoji2, 2, "[" + emoji1 + ", " + emoji2 + "]"); + assertGramSplit("." + emoji1 + "." + emoji2 + ".", 2, "[" + emoji1 + ", " + emoji2 + "]"); + assertGramSplit("foo" + emoji1 + "bar" + emoji2 + "baz", 2, "[fo, oo, " + emoji1 + ", ba, ar, " + emoji2 + ", ba, az]"); + } + + @Test public void testSpaceCornerCases() { // space corner cases assertGramSplit("e en e", 1, "[e, e, n, e]"); diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java index 1c2f7377bde..b4f080405bd 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java @@ -1,10 +1,18 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; +import com.yahoo.language.Language; import com.yahoo.language.process.AbstractTokenizerTestCase; import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; import org.junit.Test; +import java.util.Iterator; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + /** * @author Steinar Knutsen * @author bratseth @@ -36,9 +44,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase { @Test public void testTokenizeEmojis() { TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL); - String emoji = "\uD83D\uDD2A"; // 🔪 - tester.assertTokens(emoji, emoji); - tester.assertTokens(emoji + "foo", emoji, "foo"); + + String emoji1 = "\uD83D\uDD2A"; // 🔪 + String emoji2 = "\uD83D\uDE00"; // 😀 + tester.assertTokens(emoji1, emoji1); + tester.assertTokens(emoji1 + "foo", emoji1, "foo"); + tester.assertTokens(emoji1 + emoji2, emoji1, emoji2); } } diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 8080dc92729..5452da71775 100644 --- a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -25,7 +25,6 @@ import java.util.List; */ public class OpenNlpTokenizer implements Tokenizer { - private final static int SPACE_CODE = 32; private final Normalizer normalizer; private final Transformer transformer; private final SimpleTokenizer simpleTokenizer; @@ -74,26 +73,26 @@ public class OpenNlpTokenizer implements Tokenizer { } private SnowballStemmer.ALGORITHM algorithmFor(Language language) { - switch (language) { - case DANISH: return SnowballStemmer.ALGORITHM.DANISH; - case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH; - case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH; - case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH; - case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN; - case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN; - case IRISH: return SnowballStemmer.ALGORITHM.IRISH; - case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN; - case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN; - case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN; - case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE; - case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN; - case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN; - case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH; - case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH; - case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH; - case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH; - default: return null; - } + return switch (language) { + case DANISH -> SnowballStemmer.ALGORITHM.DANISH; + case DUTCH -> SnowballStemmer.ALGORITHM.DUTCH; + case FINNISH -> SnowballStemmer.ALGORITHM.FINNISH; + case FRENCH -> SnowballStemmer.ALGORITHM.FRENCH; + case GERMAN -> SnowballStemmer.ALGORITHM.GERMAN; + case HUNGARIAN -> SnowballStemmer.ALGORITHM.HUNGARIAN; + case IRISH -> SnowballStemmer.ALGORITHM.IRISH; + case ITALIAN -> SnowballStemmer.ALGORITHM.ITALIAN; + case NORWEGIAN_BOKMAL -> SnowballStemmer.ALGORITHM.NORWEGIAN; + case NORWEGIAN_NYNORSK -> SnowballStemmer.ALGORITHM.NORWEGIAN; + case PORTUGUESE -> SnowballStemmer.ALGORITHM.PORTUGUESE; + case ROMANIAN -> SnowballStemmer.ALGORITHM.ROMANIAN; + case RUSSIAN -> SnowballStemmer.ALGORITHM.RUSSIAN; + case SPANISH -> SnowballStemmer.ALGORITHM.SPANISH; + case SWEDISH -> SnowballStemmer.ALGORITHM.SWEDISH; + case TURKISH -> SnowballStemmer.ALGORITHM.TURKISH; + case ENGLISH -> SnowballStemmer.ALGORITHM.ENGLISH; + default -> null; + }; } } diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index ef29ffd51cc..78412f94fd4 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -165,11 +165,18 @@ public class OpenNlpTokenizationTestCase { @Test public void testTokenizeEmojis() { - String emoji = "\uD83D\uDD2A"; // 🔪 - Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator(); - assertTrue(tokens.hasNext()); - assertEquals(emoji, tokens.next().getTokenString()); - assertFalse(tokens.hasNext()); + String emoji1 = "\uD83D\uDD2A"; // 🔪 + Iterator<Token> tokens1 = tokenizer.tokenize(emoji1, Language.ENGLISH, StemMode.ALL, true).iterator(); + assertTrue(tokens1.hasNext()); + assertEquals(emoji1, tokens1.next().getTokenString()); + assertFalse(tokens1.hasNext()); + + String emoji2 = "\uD83D\uDE00"; // 😀 + Iterator<Token> tokens2 = tokenizer.tokenize(emoji1 + emoji2, Language.ENGLISH, StemMode.ALL, true).iterator(); + assertTrue(tokens2.hasNext()); + assertEquals(emoji1, tokens2.next().getTokenString()); + assertEquals(emoji2, tokens2.next().getTokenString()); + assertFalse(tokens2.hasNext()); } @Test |