about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2022-08-17 22:47:17 +0200
committer GitHub <noreply@github.com> 2022-08-17 22:47:17 +0200
commit b61662345db1cbf9a46b0a0aabac963d905c3041 (patch)
tree 8b8de2fc1739222933b95e1f67879b152fd232a7
parent 06c7ed329cd824312f59e25348d59523f1e3b610 (diff)
parent b24fcd4853b21971f404851147b51bf069864345 (diff)
Merge pull request #23684 from vespa-engine/bratseth/token-types v8.37.26
Determine token types considering all characters
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java 28
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java 49
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java 94
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 26
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java 32
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java 23
6 files changed, 133 insertions, 119 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index dba19b47821..8080dc92729 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -7,24 +7,21 @@ import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.SpecialTokenRegistry;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
-import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;
import com.yahoo.language.simple.SimpleNormalizer;
-import com.yahoo.language.simple.SimpleToken;
-import com.yahoo.language.simple.SimpleTokenType;
import com.yahoo.language.simple.SimpleTokenizer;
import com.yahoo.language.simple.SimpleTransformer;
import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
-import java.util.ArrayList;
import java.util.List;
/**
* Tokenizer using OpenNlp
*
* @author matskin
+ * @author bratseth
*/
public class OpenNlpTokenizer implements Tokenizer {
@@ -51,26 +48,11 @@ public class OpenNlpTokenizer implements Tokenizer {
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
- if (input.isEmpty()) return List.of();
Stemmer stemmer = stemmerFor(language, stemMode);
- if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
-
- List<Token> tokens = new ArrayList<>();
- int nextCode = input.codePointAt(0);
- TokenType prevType = SimpleTokenType.valueOf(nextCode);
- for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) {
- nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE;
- TokenType nextType = SimpleTokenType.valueOf(nextCode);
- if (!prevType.isIndexable() || !nextType.isIndexable()) {
- String original = input.substring(prev, next);
- String token = processToken(original, language, stemMode, removeAccents, stemmer);
- tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token));
- prev = next;
- prevType = nextType;
- }
- next += Character.charCount(nextCode);
- }
- return tokens;
+ if (stemmer == null)
+ return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
+ else
+ return simpleTokenizer.tokenize(input, token -> processToken(token, language, stemMode, removeAccents, stemmer));
}
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 7479e326b45..b6ca219afc8 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -111,45 +111,28 @@ public class SimpleToken implements Token {
}
@Override
- public int hashCode() {
- return orig.hashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof Token)) {
- return false;
- }
- Token rhs = (Token)obj;
- if (!getType().equals(rhs.getType())) {
- return false;
- }
- if (!equalsOpt(getOrig(), rhs.getOrig())) {
- return false;
- }
- if (getOffset() != rhs.getOffset()) {
- return false;
- }
- if (!equalsOpt(getScript(), rhs.getScript())) {
- return false;
- }
- if (!equalsOpt(getTokenString(), rhs.getTokenString())) {
- return false;
- }
- if (isSpecialToken() != rhs.isSpecialToken()) {
- return false;
- }
- if (getNumComponents() != rhs.getNumComponents()) {
- return false;
- }
+ public boolean equals(Object o) {
+ if (!(o instanceof Token other)) return false;
+
+ if (getType() != other.getType()) return false;
+ if (!equalsOpt(getOrig(), other.getOrig())) return false;
+ if (getOffset() != other.getOffset()) return false;
+ if (!equalsOpt(getScript(), other.getScript())) return false;
+ if (!equalsOpt(getTokenString(), other.getTokenString())) return false;
+ if (isSpecialToken() != other.isSpecialToken()) return false;
+ if (getNumComponents() != other.getNumComponents()) return false;
for (int i = 0, len = getNumComponents(); i < len; ++i) {
- if (!equalsOpt(getComponent(i), rhs.getComponent(i))) {
+ if (!equalsOpt(getComponent(i), other.getComponent(i)))
return false;
- }
}
return true;
}
+ @Override
+ public int hashCode() {
+ return orig.hashCode();
+ }
+
private static boolean equalsOpt(Object lhs, Object rhs) {
if (lhs == null || rhs == null) {
return lhs == rhs;
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
index ace2fd3246e..5c321e4da9b 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
@@ -10,58 +10,58 @@ public class SimpleTokenType {
public static TokenType valueOf(int codePoint) {
switch (Character.getType(codePoint)) {
- case Character.NON_SPACING_MARK:
- // "combining grave accent"
- // and "DEVANAGARI VOWEL SIGN SHORT E" etc
- // (letter-like)
- case Character.COMBINING_SPACING_MARK:
- // "DEVANAGARI VOWEL SIGN SHORT O"
- // and similar (letter-like)
- case Character.LETTER_NUMBER:
- // "SMALL ROMAN NUMERAL SIX" etc (letter-like)
- case Character.UPPERCASE_LETTER:
- case Character.LOWERCASE_LETTER:
- case Character.TITLECASE_LETTER:
- case Character.MODIFIER_LETTER:
- case Character.OTHER_LETTER:
- return TokenType.ALPHABETIC;
+ case Character.NON_SPACING_MARK:
+ // "combining grave accent"
+ // and "DEVANAGARI VOWEL SIGN SHORT E" etc
+ // (letter-like)
+ case Character.COMBINING_SPACING_MARK:
+ // "DEVANAGARI VOWEL SIGN SHORT O"
+ // and similar (letter-like)
+ case Character.LETTER_NUMBER:
+ // "SMALL ROMAN NUMERAL SIX" etc (letter-like)
+ case Character.UPPERCASE_LETTER:
+ case Character.LOWERCASE_LETTER:
+ case Character.TITLECASE_LETTER:
+ case Character.MODIFIER_LETTER:
+ case Character.OTHER_LETTER:
+ return TokenType.ALPHABETIC;
- case Character.ENCLOSING_MARK:
- // "enclosing circle" etc is symbol-like
- case Character.MATH_SYMBOL:
- case Character.CURRENCY_SYMBOL:
- case Character.MODIFIER_SYMBOL:
- case Character.OTHER_SYMBOL:
- return TokenType.SYMBOL;
+ case Character.ENCLOSING_MARK:
+ // "enclosing circle" etc is symbol-like
+ case Character.MATH_SYMBOL:
+ case Character.CURRENCY_SYMBOL:
+ case Character.MODIFIER_SYMBOL:
+ case Character.OTHER_SYMBOL:
+ return TokenType.SYMBOL;
- case Character.OTHER_NUMBER:
- // "SUPERSCRIPT TWO",
- // "DINGBAT CIRCLED SANS-SERIF DIGIT THREE"
- // and more numbers that should mostly normalize
- // to digits
- case Character.DECIMAL_DIGIT_NUMBER:
- return TokenType.NUMERIC;
+ case Character.OTHER_NUMBER:
+ // "SUPERSCRIPT TWO",
+ // "DINGBAT CIRCLED SANS-SERIF DIGIT THREE"
+ // and more numbers that should mostly normalize
+ // to digits
+ case Character.DECIMAL_DIGIT_NUMBER:
+ return TokenType.NUMERIC;
- case Character.SPACE_SEPARATOR:
- case Character.LINE_SEPARATOR:
- case Character.PARAGRAPH_SEPARATOR:
- return TokenType.SPACE;
+ case Character.SPACE_SEPARATOR:
+ case Character.LINE_SEPARATOR:
+ case Character.PARAGRAPH_SEPARATOR:
+ return TokenType.SPACE;
- case Character.DASH_PUNCTUATION:
- case Character.START_PUNCTUATION:
- case Character.END_PUNCTUATION:
- case Character.CONNECTOR_PUNCTUATION:
- case Character.OTHER_PUNCTUATION:
- case Character.INITIAL_QUOTE_PUNCTUATION:
- case Character.FINAL_QUOTE_PUNCTUATION:
- return TokenType.PUNCTUATION;
+ case Character.DASH_PUNCTUATION:
+ case Character.START_PUNCTUATION:
+ case Character.END_PUNCTUATION:
+ case Character.CONNECTOR_PUNCTUATION:
+ case Character.OTHER_PUNCTUATION:
+ case Character.INITIAL_QUOTE_PUNCTUATION:
+ case Character.FINAL_QUOTE_PUNCTUATION:
+ return TokenType.PUNCTUATION;
- case Character.CONTROL:
- case Character.FORMAT:
- case Character.SURROGATE:
- case Character.PRIVATE_USE:
- case Character.UNASSIGNED:
- return TokenType.UNKNOWN;
+ case Character.CONTROL:
+ case Character.FORMAT:
+ case Character.SURROGATE:
+ case Character.PRIVATE_USE:
+ case Character.UNASSIGNED:
+ return TokenType.UNKNOWN;
}
throw new UnsupportedOperationException(String.valueOf(Character.getType(codePoint)));
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 3dc28b99144..b791c843357 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -7,8 +7,8 @@ import com.yahoo.language.process.*;
import com.yahoo.language.simple.kstem.KStemmer;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
+import java.util.function.Function;
import java.util.logging.Logger;
import java.util.logging.Level;
@@ -49,30 +49,46 @@ public class SimpleTokenizer implements Tokenizer {
this.specialTokenRegistry = specialTokenRegistry;
}
+ /** Tokenize the input, applying the transform of this to each token string. */
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
- if (input.isEmpty()) return Collections.emptyList();
+ return tokenize(input,
+ token -> processToken(token, language, stemMode, removeAccents));
+ }
+
+ /** Tokenize the input, and apply the given transform to each token string. */
+ public Iterable<Token> tokenize(String input, Function<String, String> tokenProocessor) {
+ if (input.isEmpty()) return List.of();
List<Token> tokens = new ArrayList<>();
int nextCode = input.codePointAt(0);
TokenType prevType = SimpleTokenType.valueOf(nextCode);
+ TokenType tokenType = prevType;
for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) {
nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE;
TokenType nextType = SimpleTokenType.valueOf(nextCode);
if (!prevType.isIndexable() || !nextType.isIndexable()) {
String original = input.substring(prev, next);
- String token = processToken(original, language, stemMode, removeAccents);
tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ .setType(tokenType)
+ .setTokenString(tokenProocessor.apply(original)));
prev = next;
prevType = nextType;
+ tokenType = prevType;
+ }
+ else {
+ tokenType = determineType(tokenType, nextType);
}
next += Character.charCount(nextCode);
}
return tokens;
}
+ private TokenType determineType(TokenType tokenType, TokenType characterType) {
+ if (characterType == TokenType.ALPHABETIC) return TokenType.ALPHABETIC;
+ return tokenType;
+ }
+
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
String original = token;
log.log(Level.FINEST, () -> "processToken '" + original + "'");
diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index 77489f2eb44..cd2a0f73895 100644
--- a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -4,6 +4,7 @@ package com.yahoo.language.opennlp;
import com.yahoo.language.Language;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import org.junit.Test;
@@ -151,11 +152,9 @@ public class OpenNlpTokenizationTestCase {
String input = "tafsirnya\u0648\u0643\u064F\u0646\u0652";
for (StemMode stemMode : new StemMode[] { StemMode.NONE,
StemMode.SHORTEST }) {
- for (Language l : new Language[] { Language.INDONESIAN,
- Language.ENGLISH, Language.ARABIC }) {
+ for (Language l : List.of(Language.INDONESIAN, Language.ENGLISH, Language.ARABIC)) {
for (boolean accentDrop : new boolean[] { true, false }) {
- for (Token token : tokenizer.tokenize(input,
- l, stemMode, accentDrop)) {
+ for (Token token : tokenizer.tokenize(input, l, stemMode, accentDrop)) {
if (token.getTokenString().length() == 0) {
assertFalse(token.isIndexable());
}
@@ -165,6 +164,31 @@ public class OpenNlpTokenizationTestCase {
}
}
+ @Test
+ public void testTokenTypes() {
+ testTokenTypes(Language.ENGLISH);
+ testTokenTypes(Language.SPANISH);
+ }
+
+ public void testTokenTypes(Language language) {
+ assertEquals(TokenType.ALPHABETIC, tokenize("word", language).iterator().next().getType());
+ assertEquals(TokenType.NUMERIC, tokenize("123", language).iterator().next().getType());
+ assertEquals(TokenType.SPACE, tokenize(" ", language).iterator().next().getType());
+ assertEquals(TokenType.PUNCTUATION, tokenize(".", language).iterator().next().getType());
+ assertEquals(TokenType.ALPHABETIC, tokenize("123word", language).iterator().next().getType());
+
+ var tokens = tokenize("123 123word word123", language).iterator();
+ assertEquals(TokenType.NUMERIC, tokens.next().getType());
+ assertEquals(TokenType.SPACE, tokens.next().getType());
+ assertEquals(TokenType.ALPHABETIC, tokens.next().getType());
+ assertEquals(TokenType.SPACE, tokens.next().getType());
+ assertEquals(TokenType.ALPHABETIC, tokens.next().getType());
+ }
+
+ private Iterable<Token> tokenize(String input, Language language) {
+ return tokenizer.tokenize(input, language, StemMode.SHORTEST, true);
+ }
+
private void recurseDecompose(Token t) {
assertTrue(t.getOffset() >= 0);
assertTrue(t.getOrig().length() >= 0);
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
index 5054f5a9bff..fa8419e200f 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -171,21 +171,30 @@ public class GramSplitterTestCase {
public void testChineseComma() {
String text = "我喜欢红色、蓝色和紫色";
Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 2);
- for (; grams.hasNext(); ) {
- System.out.println(grams.next().extractFrom(text));
- }
+ assertEquals("我喜", grams.next().extractFrom(text));
+ assertEquals("喜欢", grams.next().extractFrom(text));
+ assertEquals("欢红", grams.next().extractFrom(text));
+ assertEquals("红色", grams.next().extractFrom(text));
+ assertEquals("蓝色", grams.next().extractFrom(text));
+ assertEquals("色和", grams.next().extractFrom(text));
+ assertEquals("和紫", grams.next().extractFrom(text));
+ assertEquals("紫色", grams.next().extractFrom(text));
}
@Test
public void testEnglishComma() {
String text = "我喜欢红色,蓝色和紫色";
Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 2);
- for (; grams.hasNext(); ) {
- System.out.println(grams.next().extractFrom(text));
- }
+ assertEquals("我喜", grams.next().extractFrom(text));
+ assertEquals("喜欢", grams.next().extractFrom(text));
+ assertEquals("欢红", grams.next().extractFrom(text));
+ assertEquals("红色", grams.next().extractFrom(text));
+ assertEquals("蓝色", grams.next().extractFrom(text));
+ assertEquals("色和", grams.next().extractFrom(text));
+ assertEquals("和紫", grams.next().extractFrom(text));
+ assertEquals("紫色", grams.next().extractFrom(text));
}
-
private void assertGramSplits(String input, int gramSize, String ... expected) {
assertEquals(Arrays.asList(expected), gramSplitter.split(input, gramSize).toExtractedList());
}