diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java | 139 |
1 files changed, 51 insertions, 88 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 93599fa7dbe..e1185cb2457 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -3,21 +3,32 @@ package com.yahoo.language.opennlp; import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; -import com.yahoo.language.process.*; -import com.yahoo.language.simple.*; +import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenType; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.process.Transformer; +import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleToken; +import com.yahoo.language.simple.SimpleTokenType; +import com.yahoo.language.simple.SimpleTokenizer; +import com.yahoo.language.simple.SimpleTransformer; import opennlp.tools.stemmer.Stemmer; import opennlp.tools.stemmer.snowball.SnowballStemmer; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.logging.Logger; -import java.util.logging.Level; +/** + * Tokenizer using OpenNlp + * + * @author matskin + */ public class OpenNlpTokenizer implements Tokenizer { private final static int SPACE_CODE = 32; - private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName()); private final Normalizer normalizer; private final Transformer transformer; private final SimpleTokenizer simpleTokenizer; @@ -35,10 +46,8 @@ public class OpenNlpTokenizer implements Tokenizer { @Override public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { if (input.isEmpty()) return Collections.emptyList(); - Stemmer stemmer = getStemmerForLanguage(language, stemMode); - if (stemmer == null) { - return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); - } + Stemmer stemmer = stemmerFor(language, stemMode); + if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); List<Token> tokens = new ArrayList<>(); int nextCode = input.codePointAt(0); @@ -49,9 +58,7 @@ public class OpenNlpTokenizer implements Tokenizer { if (!prevType.isIndexable() || !nextType.isIndexable()) { String original = input.substring(prev, next); String token = processToken(original, language, stemMode, removeAccents, stemmer); - tokens.add(new SimpleToken(original).setOffset(prev) - .setType(prevType) - .setTokenString(token)); + tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token)); prev = next; prevType = nextType; } @@ -60,89 +67,45 @@ public class OpenNlpTokenizer implements Tokenizer { return tokens; } - private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) { - log.log(Level.FINEST, () -> "getStemmerForLanguage '"+language+"' mode: "+stemMode); - if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) { - return null; - } - SnowballStemmer.ALGORITHM alg; - switch (language) { - case DANISH: - alg = SnowballStemmer.ALGORITHM.DANISH; - break; - case DUTCH: - alg = SnowballStemmer.ALGORITHM.DUTCH; - break; - case FINNISH: - alg = SnowballStemmer.ALGORITHM.FINNISH; - break; - case FRENCH: - alg = SnowballStemmer.ALGORITHM.FRENCH; - break; - case GERMAN: - alg = SnowballStemmer.ALGORITHM.GERMAN; - break; - case HUNGARIAN: - alg = SnowballStemmer.ALGORITHM.HUNGARIAN; - break; - case IRISH: - alg = SnowballStemmer.ALGORITHM.IRISH; - break; - case ITALIAN: - alg = SnowballStemmer.ALGORITHM.ITALIAN; - break; - case NORWEGIAN_BOKMAL: - case NORWEGIAN_NYNORSK: - alg = SnowballStemmer.ALGORITHM.NORWEGIAN; - break; - case PORTUGUESE: - alg = SnowballStemmer.ALGORITHM.PORTUGUESE; - break; - case ROMANIAN: - alg = SnowballStemmer.ALGORITHM.ROMANIAN; - break; - case RUSSIAN: - alg = SnowballStemmer.ALGORITHM.RUSSIAN; - break; - case SPANISH: - alg = SnowballStemmer.ALGORITHM.SPANISH; - break; - case SWEDISH: - alg = SnowballStemmer.ALGORITHM.SWEDISH; - break; - case TURKISH: - alg = SnowballStemmer.ALGORITHM.TURKISH; - break; - case ENGLISH: - alg = SnowballStemmer.ALGORITHM.ENGLISH; - break; - default: - return null; - - } - return new SnowballStemmer(alg); - } - private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) { - final String original = token; - log.log(Level.FINEST, () -> "processToken '"+original+"'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); - if (stemMode != StemMode.NONE) { - final String oldToken = token; - token = doStemming(token, stemmer); - final String newToken = token; - log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'"); - } - final String result = token; - log.log(Level.FINEST, () -> "processed token is: "+result); - return result; + if (stemMode != StemMode.NONE) + token = stemmer.stem(token).toString(); + return token; + } + + private Stemmer stemmerFor(Language language, StemMode stemMode) { + if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null; + SnowballStemmer.ALGORITHM algorithm = algorithmFor(language); + if (algorithm == null) return null; + return new SnowballStemmer(algorithm); } - private String doStemming(String token, Stemmer stemmer) { - return stemmer.stem(token).toString(); + private SnowballStemmer.ALGORITHM algorithmFor(Language language) { + switch (language) { + case DANISH: return SnowballStemmer.ALGORITHM.DANISH; + case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH; + case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH; + case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH; + case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN; + case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN; + case IRISH: return SnowballStemmer.ALGORITHM.IRISH; + case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN; + case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN; + case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN; + case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE; + case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN; + case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN; + case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH; + case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH; + case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH; + case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH; + default: return null; + } } + } |