aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java6
-rw-r--r--container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java4
-rw-r--r--container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java2
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java8
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java40
-rw-r--r--linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java11
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java17
-rw-r--r--opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java41
-rw-r--r--opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java17
9 files changed, 95 insertions, 51 deletions
diff --git a/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java b/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java
index c143aa43d53..06ea202b9c3 100644
--- a/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java
+++ b/config-model/src/test/java/com/yahoo/schema/processing/NGramTestCase.java
@@ -52,7 +52,7 @@ public class NGramTestCase extends AbstractSchemaTestCase {
@Test
void testInvalidNGramSetting1() throws IOException, ParseException {
try {
- Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram1.sd");
+ ApplicationBuilder.buildFromFile("src/test/examples/invalidngram1.sd");
fail("Should cause an exception");
}
catch (IllegalArgumentException e) {
@@ -63,7 +63,7 @@ public class NGramTestCase extends AbstractSchemaTestCase {
@Test
void testInvalidNGramSetting2() throws IOException, ParseException {
try {
- Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram2.sd");
+ ApplicationBuilder.buildFromFile("src/test/examples/invalidngram2.sd");
fail("Should cause an exception");
}
catch (IllegalArgumentException e) {
@@ -74,7 +74,7 @@ public class NGramTestCase extends AbstractSchemaTestCase {
@Test
void testInvalidNGramSetting3() throws IOException, ParseException {
try {
- Schema schema = ApplicationBuilder.buildFromFile("src/test/examples/invalidngram3.sd");
+ ApplicationBuilder.buildFromFile("src/test/examples/invalidngram3.sd");
fail("Should cause an exception");
}
catch (IllegalArgumentException e) {
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java
index c1d415b8e27..01bb606e9ee 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/parser/Tokenizer.java
@@ -107,7 +107,9 @@ public final class Tokenizer {
if (i >= source.length()) break;
int c = source.codePointAt(i);
- if (characterClasses.isLetterOrDigit(c) || (c == '\'' && acceptApostropheAsWordCharacter(currentIndex))) {
+ if (characterClasses.isSymbol(c)) { // treat each symbol as a separate word
+ addToken(WORD, Character.toString(c), i, i + 1);
+ } else if (characterClasses.isLetterOrDigit(c) || (c == '\'' && acceptApostropheAsWordCharacter(currentIndex))) {
i = consumeWordOrNumber(i, currentIndex);
} else if (Character.isWhitespace(c)) {
addToken(SPACE, " ", i, i + 1);
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
index 475b7beb879..c2f533d4cfd 100644
--- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java
@@ -2589,7 +2589,7 @@ public class ParseTestCase {
String emoji2 = "\uD83D\uDE00"; // 😀
tester.assertParsed(emoji2, emoji2, Query.Type.ANY);
- tester.assertParsed(emoji1 + emoji2, emoji1 + emoji2, Query.Type.ANY);
+ tester.assertParsed("AND " + emoji1 + " " + emoji2, emoji1 + emoji2, Query.Type.ANY);
}
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
index 5946a00b8bf..f6177262bf9 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/CharacterClasses.java
@@ -15,7 +15,6 @@ public class CharacterClasses {
public boolean isLetter(int c) {
if (Character.isLetter(c)) return true;
if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters
- if (Character.getType(c) == Character.OTHER_SYMBOL) return true; // emojis searchable
// Some CJK punctuation defined as word characters
if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' ||
@@ -30,6 +29,13 @@ public class CharacterClasses {
}
/**
+ * Returns true if the character is in the class "other symbol" - emojis etc.
+ */
+ public boolean isSymbol(int c) {
+ return Character.getType(c) == Character.OTHER_SYMBOL;
+ }
+
+ /**
* Returns true for code points which should be considered digits - same as java.lang.Character.isDigit
*/
public boolean isDigit(int c) {
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 83110c0021e..210d7ac94ff 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -88,46 +88,54 @@ public class GramSplitter {
}
private Gram findNext() {
- // Skip to next word character
- while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) {
+ // Skip to next indexable character
+ while (i < input.length() && !isIndexable(input.codePointAt(i))) {
i = input.next(i);
isFirstAfterSeparator = true;
}
- if (i >= input.length()) return null;
-
- UnicodeString gram = input.substring(i, n);
- int nonWordChar = indexOfNonWordCodepoint(gram);
- if (nonWordChar == 0) throw new RuntimeException("Programming error");
-
- if (nonWordChar > 0)
- gram = new UnicodeString(gram.toString().substring(0, nonWordChar));
+ if (i >= input.length()) return null; // no indexable characters
+ int tokenStart = i;
+ UnicodeString gram = input.substring(tokenStart, n);
+ int tokenEnd = tokenEnd(gram);
+ gram = new UnicodeString(gram.toString().substring(0, tokenEnd));
if (gram.codePointCount() == n) { // normal case: got a full length gram
Gram g = new Gram(i, gram.codePointCount());
i = input.next(i);
isFirstAfterSeparator = false;
return g;
}
- else { // gram is too short due either to a non-word separator or end of string
- if (isFirstAfterSeparator) { // make a gram anyway
+ else { // gram is too short: it is a single symbol, is followed by a non-word separator, or runs into the end of the string
+ if (isFirstAfterSeparator || ( gram.codePointCount() == 1 && characterClasses.isSymbol(gram.codePointAt(0)))) { // make a gram anyway
Gram g = new Gram(i, gram.codePointCount());
i = input.next(i);
isFirstAfterSeparator = false;
return g;
} else { // skip to next
- i = input.skip(gram.codePointCount() + 1, i);
+ i = input.skip(gram.codePointCount(), i);
isFirstAfterSeparator = true;
return findNext();
}
}
}
- private int indexOfNonWordCodepoint(UnicodeString s) {
- for (int i = 0; i < s.length(); i = s.next(i)) {
+ private boolean isIndexable(int codepoint) {
+ if (characterClasses.isLetterOrDigit(codepoint)) return true;
+ if (characterClasses.isSymbol(codepoint)) return true;
+ return false;
+ }
+
+ /** Given a string s starting with an indexable character, returns the position where that token should end. */
+ private int tokenEnd(UnicodeString s) {
+ if (characterClasses.isSymbol(s.codePointAt(0)))
+ return s.next(0); // symbols have length 1
+
+ int i = 0;
+ for (; i < s.length(); i = s.next(i)) {
if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i)))
return i;
}
- return -1;
+ return i;
}
@Override
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
index 6cefcfbf67a..a219efce3cd 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -49,6 +49,17 @@ public class GramSplitterTestCase {
}
@Test
+ public void testEmojis() {
+ String emoji1 = "\uD83D\uDD2A"; // 🔪
+ String emoji2 = "\uD83D\uDE00"; // 😀
+ assertGramSplit(emoji1, 2, "[" + emoji1+ "]");
+ assertGramSplit(emoji1 + emoji2, 2, "[" + emoji1 + ", " + emoji2 + "]");
+ assertGramSplit(emoji1 + "." + emoji2, 2, "[" + emoji1 + ", " + emoji2 + "]");
+ assertGramSplit("." + emoji1 + "." + emoji2 + ".", 2, "[" + emoji1 + ", " + emoji2 + "]");
+ assertGramSplit("foo" + emoji1 + "bar" + emoji2 + "baz", 2, "[fo, oo, " + emoji1 + ", ba, ar, " + emoji2 + ", ba, az]");
+ }
+
+ @Test
public void testSpaceCornerCases() {
// space corner cases
assertGramSplit("e en e", 1, "[e, e, n, e]");
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index 1c2f7377bde..b4f080405bd 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -1,10 +1,18 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
+import com.yahoo.language.Language;
import com.yahoo.language.process.AbstractTokenizerTestCase;
import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
import org.junit.Test;
+import java.util.Iterator;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
/**
* @author Steinar Knutsen
* @author bratseth
@@ -36,9 +44,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
@Test
public void testTokenizeEmojis() {
TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
- String emoji = "\uD83D\uDD2A"; // 🔪
- tester.assertTokens(emoji, emoji);
- tester.assertTokens(emoji + "foo", emoji, "foo");
+
+ String emoji1 = "\uD83D\uDD2A"; // 🔪
+ String emoji2 = "\uD83D\uDE00"; // 😀
+ tester.assertTokens(emoji1, emoji1);
+ tester.assertTokens(emoji1 + "foo", emoji1, "foo");
+ tester.assertTokens(emoji1 + emoji2, emoji1, emoji2);
}
}
diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 8080dc92729..5452da71775 100644
--- a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -25,7 +25,6 @@ import java.util.List;
*/
public class OpenNlpTokenizer implements Tokenizer {
- private final static int SPACE_CODE = 32;
private final Normalizer normalizer;
private final Transformer transformer;
private final SimpleTokenizer simpleTokenizer;
@@ -74,26 +73,26 @@ public class OpenNlpTokenizer implements Tokenizer {
}
private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
- switch (language) {
- case DANISH: return SnowballStemmer.ALGORITHM.DANISH;
- case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH;
- case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH;
- case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH;
- case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN;
- case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN;
- case IRISH: return SnowballStemmer.ALGORITHM.IRISH;
- case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN;
- case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN;
- case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN;
- case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE;
- case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN;
- case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN;
- case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH;
- case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH;
- case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH;
- case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH;
- default: return null;
- }
+ return switch (language) {
+ case DANISH -> SnowballStemmer.ALGORITHM.DANISH;
+ case DUTCH -> SnowballStemmer.ALGORITHM.DUTCH;
+ case FINNISH -> SnowballStemmer.ALGORITHM.FINNISH;
+ case FRENCH -> SnowballStemmer.ALGORITHM.FRENCH;
+ case GERMAN -> SnowballStemmer.ALGORITHM.GERMAN;
+ case HUNGARIAN -> SnowballStemmer.ALGORITHM.HUNGARIAN;
+ case IRISH -> SnowballStemmer.ALGORITHM.IRISH;
+ case ITALIAN -> SnowballStemmer.ALGORITHM.ITALIAN;
+ case NORWEGIAN_BOKMAL -> SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case NORWEGIAN_NYNORSK -> SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case PORTUGUESE -> SnowballStemmer.ALGORITHM.PORTUGUESE;
+ case ROMANIAN -> SnowballStemmer.ALGORITHM.ROMANIAN;
+ case RUSSIAN -> SnowballStemmer.ALGORITHM.RUSSIAN;
+ case SPANISH -> SnowballStemmer.ALGORITHM.SPANISH;
+ case SWEDISH -> SnowballStemmer.ALGORITHM.SWEDISH;
+ case TURKISH -> SnowballStemmer.ALGORITHM.TURKISH;
+ case ENGLISH -> SnowballStemmer.ALGORITHM.ENGLISH;
+ default -> null;
+ };
}
}
diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index ef29ffd51cc..78412f94fd4 100644
--- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -165,11 +165,18 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testTokenizeEmojis() {
- String emoji = "\uD83D\uDD2A"; // 🔪
- Iterator<Token> tokens = tokenizer.tokenize(emoji, Language.ENGLISH, StemMode.ALL, true).iterator();
- assertTrue(tokens.hasNext());
- assertEquals(emoji, tokens.next().getTokenString());
- assertFalse(tokens.hasNext());
+ String emoji1 = "\uD83D\uDD2A"; // 🔪
+ Iterator<Token> tokens1 = tokenizer.tokenize(emoji1, Language.ENGLISH, StemMode.ALL, true).iterator();
+ assertTrue(tokens1.hasNext());
+ assertEquals(emoji1, tokens1.next().getTokenString());
+ assertFalse(tokens1.hasNext());
+
+ String emoji2 = "\uD83D\uDE00"; // 😀
+ Iterator<Token> tokens2 = tokenizer.tokenize(emoji1 + emoji2, Language.ENGLISH, StemMode.ALL, true).iterator();
+ assertTrue(tokens2.hasNext());
+ assertEquals(emoji1, tokens2.next().getTokenString());
+ assertEquals(emoji2, tokens2.next().getTokenString());
+ assertFalse(tokens2.hasNext());
}
@Test