about | summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/main/java/com/yahoo/language
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java 28
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java 49
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java 94
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 26
4 files changed, 89 insertions, 108 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index dba19b47821..8080dc92729 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -7,24 +7,21 @@ import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.SpecialTokenRegistry;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
-import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;
import com.yahoo.language.simple.SimpleNormalizer;
-import com.yahoo.language.simple.SimpleToken;
-import com.yahoo.language.simple.SimpleTokenType;
import com.yahoo.language.simple.SimpleTokenizer;
import com.yahoo.language.simple.SimpleTransformer;
import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
-import java.util.ArrayList;
import java.util.List;
/**
* Tokenizer using OpenNlp
*
* @author matskin
+ * @author bratseth
*/
public class OpenNlpTokenizer implements Tokenizer {
@@ -51,26 +48,11 @@ public class OpenNlpTokenizer implements Tokenizer {
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
- if (input.isEmpty()) return List.of();
Stemmer stemmer = stemmerFor(language, stemMode);
- if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
-
- List<Token> tokens = new ArrayList<>();
- int nextCode = input.codePointAt(0);
- TokenType prevType = SimpleTokenType.valueOf(nextCode);
- for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) {
- nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE;
- TokenType nextType = SimpleTokenType.valueOf(nextCode);
- if (!prevType.isIndexable() || !nextType.isIndexable()) {
- String original = input.substring(prev, next);
- String token = processToken(original, language, stemMode, removeAccents, stemmer);
- tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token));
- prev = next;
- prevType = nextType;
- }
- next += Character.charCount(nextCode);
- }
- return tokens;
+ if (stemmer == null)
+ return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
+ else
+ return simpleTokenizer.tokenize(input, token -> processToken(token, language, stemMode, removeAccents, stemmer));
}
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 7479e326b45..b6ca219afc8 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -111,45 +111,28 @@ public class SimpleToken implements Token {
}
@Override
- public int hashCode() {
- return orig.hashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof Token)) {
- return false;
- }
- Token rhs = (Token)obj;
- if (!getType().equals(rhs.getType())) {
- return false;
- }
- if (!equalsOpt(getOrig(), rhs.getOrig())) {
- return false;
- }
- if (getOffset() != rhs.getOffset()) {
- return false;
- }
- if (!equalsOpt(getScript(), rhs.getScript())) {
- return false;
- }
- if (!equalsOpt(getTokenString(), rhs.getTokenString())) {
- return false;
- }
- if (isSpecialToken() != rhs.isSpecialToken()) {
- return false;
- }
- if (getNumComponents() != rhs.getNumComponents()) {
- return false;
- }
+ public boolean equals(Object o) {
+ if (!(o instanceof Token other)) return false;
+
+ if (getType() != other.getType()) return false;
+ if (!equalsOpt(getOrig(), other.getOrig())) return false;
+ if (getOffset() != other.getOffset()) return false;
+ if (!equalsOpt(getScript(), other.getScript())) return false;
+ if (!equalsOpt(getTokenString(), other.getTokenString())) return false;
+ if (isSpecialToken() != other.isSpecialToken()) return false;
+ if (getNumComponents() != other.getNumComponents()) return false;
for (int i = 0, len = getNumComponents(); i < len; ++i) {
- if (!equalsOpt(getComponent(i), rhs.getComponent(i))) {
+ if (!equalsOpt(getComponent(i), other.getComponent(i)))
return false;
- }
}
return true;
}
+ @Override
+ public int hashCode() {
+ return orig.hashCode();
+ }
+
private static boolean equalsOpt(Object lhs, Object rhs) {
if (lhs == null || rhs == null) {
return lhs == rhs;
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
index ace2fd3246e..5c321e4da9b 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
@@ -10,58 +10,58 @@ public class SimpleTokenType {
public static TokenType valueOf(int codePoint) {
switch (Character.getType(codePoint)) {
- case Character.NON_SPACING_MARK:
- // "combining grave accent"
- // and "DEVANAGARI VOWEL SIGN SHORT E" etc
- // (letter-like)
- case Character.COMBINING_SPACING_MARK:
- // "DEVANAGARI VOWEL SIGN SHORT O"
- // and similar (letter-like)
- case Character.LETTER_NUMBER:
- // "SMALL ROMAN NUMERAL SIX" etc (letter-like)
- case Character.UPPERCASE_LETTER:
- case Character.LOWERCASE_LETTER:
- case Character.TITLECASE_LETTER:
- case Character.MODIFIER_LETTER:
- case Character.OTHER_LETTER:
- return TokenType.ALPHABETIC;
+ case Character.NON_SPACING_MARK:
+ // "combining grave accent"
+ // and "DEVANAGARI VOWEL SIGN SHORT E" etc
+ // (letter-like)
+ case Character.COMBINING_SPACING_MARK:
+ // "DEVANAGARI VOWEL SIGN SHORT O"
+ // and similar (letter-like)
+ case Character.LETTER_NUMBER:
+ // "SMALL ROMAN NUMERAL SIX" etc (letter-like)
+ case Character.UPPERCASE_LETTER:
+ case Character.LOWERCASE_LETTER:
+ case Character.TITLECASE_LETTER:
+ case Character.MODIFIER_LETTER:
+ case Character.OTHER_LETTER:
+ return TokenType.ALPHABETIC;
- case Character.ENCLOSING_MARK:
- // "enclosing circle" etc is symbol-like
- case Character.MATH_SYMBOL:
- case Character.CURRENCY_SYMBOL:
- case Character.MODIFIER_SYMBOL:
- case Character.OTHER_SYMBOL:
- return TokenType.SYMBOL;
+ case Character.ENCLOSING_MARK:
+ // "enclosing circle" etc is symbol-like
+ case Character.MATH_SYMBOL:
+ case Character.CURRENCY_SYMBOL:
+ case Character.MODIFIER_SYMBOL:
+ case Character.OTHER_SYMBOL:
+ return TokenType.SYMBOL;
- case Character.OTHER_NUMBER:
- // "SUPERSCRIPT TWO",
- // "DINGBAT CIRCLED SANS-SERIF DIGIT THREE"
- // and more numbers that should mostly normalize
- // to digits
- case Character.DECIMAL_DIGIT_NUMBER:
- return TokenType.NUMERIC;
+ case Character.OTHER_NUMBER:
+ // "SUPERSCRIPT TWO",
+ // "DINGBAT CIRCLED SANS-SERIF DIGIT THREE"
+ // and more numbers that should mostly normalize
+ // to digits
+ case Character.DECIMAL_DIGIT_NUMBER:
+ return TokenType.NUMERIC;
- case Character.SPACE_SEPARATOR:
- case Character.LINE_SEPARATOR:
- case Character.PARAGRAPH_SEPARATOR:
- return TokenType.SPACE;
+ case Character.SPACE_SEPARATOR:
+ case Character.LINE_SEPARATOR:
+ case Character.PARAGRAPH_SEPARATOR:
+ return TokenType.SPACE;
- case Character.DASH_PUNCTUATION:
- case Character.START_PUNCTUATION:
- case Character.END_PUNCTUATION:
- case Character.CONNECTOR_PUNCTUATION:
- case Character.OTHER_PUNCTUATION:
- case Character.INITIAL_QUOTE_PUNCTUATION:
- case Character.FINAL_QUOTE_PUNCTUATION:
- return TokenType.PUNCTUATION;
+ case Character.DASH_PUNCTUATION:
+ case Character.START_PUNCTUATION:
+ case Character.END_PUNCTUATION:
+ case Character.CONNECTOR_PUNCTUATION:
+ case Character.OTHER_PUNCTUATION:
+ case Character.INITIAL_QUOTE_PUNCTUATION:
+ case Character.FINAL_QUOTE_PUNCTUATION:
+ return TokenType.PUNCTUATION;
- case Character.CONTROL:
- case Character.FORMAT:
- case Character.SURROGATE:
- case Character.PRIVATE_USE:
- case Character.UNASSIGNED:
- return TokenType.UNKNOWN;
+ case Character.CONTROL:
+ case Character.FORMAT:
+ case Character.SURROGATE:
+ case Character.PRIVATE_USE:
+ case Character.UNASSIGNED:
+ return TokenType.UNKNOWN;
}
throw new UnsupportedOperationException(String.valueOf(Character.getType(codePoint)));
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 3dc28b99144..b791c843357 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -7,8 +7,8 @@ import com.yahoo.language.process.*;
import com.yahoo.language.simple.kstem.KStemmer;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
+import java.util.function.Function;
import java.util.logging.Logger;
import java.util.logging.Level;
@@ -49,30 +49,46 @@ public class SimpleTokenizer implements Tokenizer {
this.specialTokenRegistry = specialTokenRegistry;
}
+ /** Tokenize the input, applying the transform of this to each token string. */
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
- if (input.isEmpty()) return Collections.emptyList();
+ return tokenize(input,
+ token -> processToken(token, language, stemMode, removeAccents));
+ }
+
+ /** Tokenize the input, and apply the given transform to each token string. */
+ public Iterable<Token> tokenize(String input, Function<String, String> tokenProocessor) {
+ if (input.isEmpty()) return List.of();
List<Token> tokens = new ArrayList<>();
int nextCode = input.codePointAt(0);
TokenType prevType = SimpleTokenType.valueOf(nextCode);
+ TokenType tokenType = prevType;
for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) {
nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE;
TokenType nextType = SimpleTokenType.valueOf(nextCode);
if (!prevType.isIndexable() || !nextType.isIndexable()) {
String original = input.substring(prev, next);
- String token = processToken(original, language, stemMode, removeAccents);
tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ .setType(tokenType)
+ .setTokenString(tokenProocessor.apply(original)));
prev = next;
prevType = nextType;
+ tokenType = prevType;
+ }
+ else {
+ tokenType = determineType(tokenType, nextType);
}
next += Character.charCount(nextCode);
}
return tokens;
}
+ private TokenType determineType(TokenType tokenType, TokenType characterType) {
+ if (characterType == TokenType.ALPHABETIC) return TokenType.ALPHABETIC;
+ return tokenType;
+ }
+
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
String original = token;
log.log(Level.FINEST, () -> "processToken '" + original + "'");