diff options
Diffstat (limited to 'linguistics/src/main')
4 files changed, 89 insertions, 108 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index dba19b47821..8080dc92729 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -7,24 +7,21 @@ import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.SpecialTokenRegistry; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; -import com.yahoo.language.process.TokenType; import com.yahoo.language.process.Tokenizer; import com.yahoo.language.process.Transformer; import com.yahoo.language.simple.SimpleNormalizer; -import com.yahoo.language.simple.SimpleToken; -import com.yahoo.language.simple.SimpleTokenType; import com.yahoo.language.simple.SimpleTokenizer; import com.yahoo.language.simple.SimpleTransformer; import opennlp.tools.stemmer.Stemmer; import opennlp.tools.stemmer.snowball.SnowballStemmer; -import java.util.ArrayList; import java.util.List; /** * Tokenizer using OpenNlp * * @author matskin + * @author bratseth */ public class OpenNlpTokenizer implements Tokenizer { @@ -51,26 +48,11 @@ public class OpenNlpTokenizer implements Tokenizer { @Override public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { - if (input.isEmpty()) return List.of(); Stemmer stemmer = stemmerFor(language, stemMode); - if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); - - List<Token> tokens = new ArrayList<>(); - int nextCode = input.codePointAt(0); - TokenType prevType = SimpleTokenType.valueOf(nextCode); - for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) { - nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE; - TokenType nextType = SimpleTokenType.valueOf(nextCode); - if (!prevType.isIndexable() || !nextType.isIndexable()) { - String original = input.substring(prev, next); - String token = processToken(original, language, stemMode, removeAccents, stemmer); - tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token)); - prev = next; - prevType = nextType; - } - next += Character.charCount(nextCode); - } - return tokens; + if (stemmer == null) + return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); + else + return simpleTokenizer.tokenize(input, token -> processToken(token, language, stemMode, removeAccents, stemmer)); } private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java index 7479e326b45..b6ca219afc8 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java @@ -111,45 +111,28 @@ public class SimpleToken implements Token { } @Override - public int hashCode() { - return orig.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof Token)) { - return false; - } - Token rhs = (Token)obj; - if (!getType().equals(rhs.getType())) { - return false; - } - if (!equalsOpt(getOrig(), rhs.getOrig())) { - return false; - } - if (getOffset() != rhs.getOffset()) { - return false; - } - if (!equalsOpt(getScript(), rhs.getScript())) { - return false; - } - if (!equalsOpt(getTokenString(), rhs.getTokenString())) { - return false; - } - if (isSpecialToken() != rhs.isSpecialToken()) { - return false; - } - if (getNumComponents() != rhs.getNumComponents()) { - return false; - } + public boolean equals(Object o) { + if (!(o instanceof Token other)) return false; + + if (getType() != other.getType()) return false; + if (!equalsOpt(getOrig(), other.getOrig())) return false; + if (getOffset() != other.getOffset()) return false; + if (!equalsOpt(getScript(), other.getScript())) return false; + if (!equalsOpt(getTokenString(), other.getTokenString())) return false; + if (isSpecialToken() != other.isSpecialToken()) return false; + if (getNumComponents() != other.getNumComponents()) return false; for (int i = 0, len = getNumComponents(); i < len; ++i) { - if (!equalsOpt(getComponent(i), rhs.getComponent(i))) { + if (!equalsOpt(getComponent(i), other.getComponent(i))) return false; - } } return true; } + @Override + public int hashCode() { + return orig.hashCode(); + } + private static boolean equalsOpt(Object lhs, Object rhs) { if (lhs == null || rhs == null) { return lhs == rhs; diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java index ace2fd3246e..5c321e4da9b 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java @@ -10,58 +10,58 @@ public class SimpleTokenType { public static TokenType valueOf(int codePoint) { switch (Character.getType(codePoint)) { - case Character.NON_SPACING_MARK: - // "combining grave accent" - // and "DEVANAGARI VOWEL SIGN SHORT E" etc - // (letter-like) - case Character.COMBINING_SPACING_MARK: - // "DEVANAGARI VOWEL SIGN SHORT O" - // and similar (letter-like) - case Character.LETTER_NUMBER: - // "SMALL ROMAN NUMERAL SIX" etc (letter-like) - case Character.UPPERCASE_LETTER: - case Character.LOWERCASE_LETTER: - case Character.TITLECASE_LETTER: - case Character.MODIFIER_LETTER: - case Character.OTHER_LETTER: - return TokenType.ALPHABETIC; + case Character.NON_SPACING_MARK: + // "combining grave accent" + // and "DEVANAGARI VOWEL SIGN SHORT E" etc + // (letter-like) + case Character.COMBINING_SPACING_MARK: + // "DEVANAGARI VOWEL SIGN SHORT O" + // and similar (letter-like) + case Character.LETTER_NUMBER: + // "SMALL ROMAN NUMERAL SIX" etc (letter-like) + case Character.UPPERCASE_LETTER: + case Character.LOWERCASE_LETTER: + case Character.TITLECASE_LETTER: + case Character.MODIFIER_LETTER: + case Character.OTHER_LETTER: + return TokenType.ALPHABETIC; - case Character.ENCLOSING_MARK: - // "enclosing circle" etc is symbol-like - case Character.MATH_SYMBOL: - case Character.CURRENCY_SYMBOL: - case Character.MODIFIER_SYMBOL: - case Character.OTHER_SYMBOL: - return TokenType.SYMBOL; + case Character.ENCLOSING_MARK: + // "enclosing circle" etc is symbol-like + case Character.MATH_SYMBOL: + case Character.CURRENCY_SYMBOL: + case Character.MODIFIER_SYMBOL: + case Character.OTHER_SYMBOL: + return TokenType.SYMBOL; - case Character.OTHER_NUMBER: - // "SUPERSCRIPT TWO", - // "DINGBAT CIRCLED SANS-SERIF DIGIT THREE" - // and more numbers that should mostly normalize - // to digits - case Character.DECIMAL_DIGIT_NUMBER: - return TokenType.NUMERIC; + case Character.OTHER_NUMBER: + // "SUPERSCRIPT TWO", + // "DINGBAT CIRCLED SANS-SERIF DIGIT THREE" + // and more numbers that should mostly normalize + // to digits + case Character.DECIMAL_DIGIT_NUMBER: + return TokenType.NUMERIC; - case Character.SPACE_SEPARATOR: - case Character.LINE_SEPARATOR: - case Character.PARAGRAPH_SEPARATOR: - return TokenType.SPACE; + case Character.SPACE_SEPARATOR: + case Character.LINE_SEPARATOR: + case Character.PARAGRAPH_SEPARATOR: + return TokenType.SPACE; - case Character.DASH_PUNCTUATION: - case Character.START_PUNCTUATION: - case Character.END_PUNCTUATION: - case Character.CONNECTOR_PUNCTUATION: - case Character.OTHER_PUNCTUATION: - case Character.INITIAL_QUOTE_PUNCTUATION: - case Character.FINAL_QUOTE_PUNCTUATION: - return TokenType.PUNCTUATION; + case Character.DASH_PUNCTUATION: + case Character.START_PUNCTUATION: + case Character.END_PUNCTUATION: + case Character.CONNECTOR_PUNCTUATION: + case Character.OTHER_PUNCTUATION: + case Character.INITIAL_QUOTE_PUNCTUATION: + case Character.FINAL_QUOTE_PUNCTUATION: + return TokenType.PUNCTUATION; - case Character.CONTROL: - case Character.FORMAT: - case Character.SURROGATE: - case Character.PRIVATE_USE: - case Character.UNASSIGNED: - return TokenType.UNKNOWN; + case Character.CONTROL: + case Character.FORMAT: + case Character.SURROGATE: + case Character.PRIVATE_USE: + case Character.UNASSIGNED: + return TokenType.UNKNOWN; } throw new UnsupportedOperationException(String.valueOf(Character.getType(codePoint))); } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 3dc28b99144..b791c843357 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -7,8 +7,8 @@ import com.yahoo.language.process.*; import com.yahoo.language.simple.kstem.KStemmer; import java.util.ArrayList; -import java.util.Collections; import java.util.List; +import java.util.function.Function; import java.util.logging.Logger; import java.util.logging.Level; @@ -49,30 +49,46 @@ public class SimpleTokenizer implements Tokenizer { this.specialTokenRegistry = specialTokenRegistry; } + /** Tokenize the input, applying the transform of this to each token string. */ @Override public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { - if (input.isEmpty()) return Collections.emptyList(); + return tokenize(input, + token -> processToken(token, language, stemMode, removeAccents)); + } + + /** Tokenize the input, and apply the given transform to each token string. */ + public Iterable<Token> tokenize(String input, Function<String, String> tokenProocessor) { + if (input.isEmpty()) return List.of(); List<Token> tokens = new ArrayList<>(); int nextCode = input.codePointAt(0); TokenType prevType = SimpleTokenType.valueOf(nextCode); + TokenType tokenType = prevType; for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) { nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE; TokenType nextType = SimpleTokenType.valueOf(nextCode); if (!prevType.isIndexable() || !nextType.isIndexable()) { String original = input.substring(prev, next); - String token = processToken(original, language, stemMode, removeAccents); tokens.add(new SimpleToken(original).setOffset(prev) - .setType(prevType) - .setTokenString(token)); + .setType(tokenType) + .setTokenString(tokenProocessor.apply(original))); prev = next; prevType = nextType; + tokenType = prevType; + } + else { + tokenType = determineType(tokenType, nextType); } next += Character.charCount(nextCode); } return tokens; } + private TokenType determineType(TokenType tokenType, TokenType characterType) { + if (characterType == TokenType.ALPHABETIC) return TokenType.ALPHABETIC; + return tokenType; + } + private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { String original = token; log.log(Level.FINEST, () -> "processToken '" + original + "'"); |