about | summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java 139
1 files changed, 51 insertions, 88 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 93599fa7dbe..e1185cb2457 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -3,21 +3,32 @@ package com.yahoo.language.opennlp;
import com.yahoo.language.Language;
import com.yahoo.language.LinguisticsCase;
-import com.yahoo.language.process.*;
-import com.yahoo.language.simple.*;
+import com.yahoo.language.process.Normalizer;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.process.Transformer;
+import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleToken;
+import com.yahoo.language.simple.SimpleTokenType;
+import com.yahoo.language.simple.SimpleTokenizer;
+import com.yahoo.language.simple.SimpleTransformer;
import opennlp.tools.stemmer.Stemmer;
import opennlp.tools.stemmer.snowball.SnowballStemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-import java.util.logging.Logger;
-import java.util.logging.Level;
+/**
+ * Tokenizer using OpenNlp
+ *
+ * @author matskin
+ */
public class OpenNlpTokenizer implements Tokenizer {
private final static int SPACE_CODE = 32;
- private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName());
private final Normalizer normalizer;
private final Transformer transformer;
private final SimpleTokenizer simpleTokenizer;
@@ -35,10 +46,8 @@ public class OpenNlpTokenizer implements Tokenizer {
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
if (input.isEmpty()) return Collections.emptyList();
- Stemmer stemmer = getStemmerForLanguage(language, stemMode);
- if (stemmer == null) {
- return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
- }
+ Stemmer stemmer = stemmerFor(language, stemMode);
+ if (stemmer == null) return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
List<Token> tokens = new ArrayList<>();
int nextCode = input.codePointAt(0);
@@ -49,9 +58,7 @@ public class OpenNlpTokenizer implements Tokenizer {
if (!prevType.isIndexable() || !nextType.isIndexable()) {
String original = input.substring(prev, next);
String token = processToken(original, language, stemMode, removeAccents, stemmer);
- tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ tokens.add(new SimpleToken(original).setOffset(prev).setType(prevType).setTokenString(token));
prev = next;
prevType = nextType;
}
@@ -60,89 +67,45 @@ public class OpenNlpTokenizer implements Tokenizer {
return tokens;
}
- private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
- log.log(Level.FINEST, () -> "getStemmerForLanguage '"+language+"' mode: "+stemMode);
- if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) {
- return null;
- }
- SnowballStemmer.ALGORITHM alg;
- switch (language) {
- case DANISH:
- alg = SnowballStemmer.ALGORITHM.DANISH;
- break;
- case DUTCH:
- alg = SnowballStemmer.ALGORITHM.DUTCH;
- break;
- case FINNISH:
- alg = SnowballStemmer.ALGORITHM.FINNISH;
- break;
- case FRENCH:
- alg = SnowballStemmer.ALGORITHM.FRENCH;
- break;
- case GERMAN:
- alg = SnowballStemmer.ALGORITHM.GERMAN;
- break;
- case HUNGARIAN:
- alg = SnowballStemmer.ALGORITHM.HUNGARIAN;
- break;
- case IRISH:
- alg = SnowballStemmer.ALGORITHM.IRISH;
- break;
- case ITALIAN:
- alg = SnowballStemmer.ALGORITHM.ITALIAN;
- break;
- case NORWEGIAN_BOKMAL:
- case NORWEGIAN_NYNORSK:
- alg = SnowballStemmer.ALGORITHM.NORWEGIAN;
- break;
- case PORTUGUESE:
- alg = SnowballStemmer.ALGORITHM.PORTUGUESE;
- break;
- case ROMANIAN:
- alg = SnowballStemmer.ALGORITHM.ROMANIAN;
- break;
- case RUSSIAN:
- alg = SnowballStemmer.ALGORITHM.RUSSIAN;
- break;
- case SPANISH:
- alg = SnowballStemmer.ALGORITHM.SPANISH;
- break;
- case SWEDISH:
- alg = SnowballStemmer.ALGORITHM.SWEDISH;
- break;
- case TURKISH:
- alg = SnowballStemmer.ALGORITHM.TURKISH;
- break;
- case ENGLISH:
- alg = SnowballStemmer.ALGORITHM.ENGLISH;
- break;
- default:
- return null;
-
- }
- return new SnowballStemmer(alg);
- }
-
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
Stemmer stemmer) {
- final String original = token;
- log.log(Level.FINEST, () -> "processToken '"+original+"'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE) {
- final String oldToken = token;
- token = doStemming(token, stemmer);
- final String newToken = token;
- log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'");
- }
- final String result = token;
- log.log(Level.FINEST, () -> "processed token is: "+result);
- return result;
+ if (stemMode != StemMode.NONE)
+ token = stemmer.stem(token).toString();
+ return token;
+ }
+
+ private Stemmer stemmerFor(Language language, StemMode stemMode) {
+ if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null;
+ SnowballStemmer.ALGORITHM algorithm = algorithmFor(language);
+ if (algorithm == null) return null;
+ return new SnowballStemmer(algorithm);
}
- private String doStemming(String token, Stemmer stemmer) {
- return stemmer.stem(token).toString();
+ private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
+ switch (language) {
+ case DANISH: return SnowballStemmer.ALGORITHM.DANISH;
+ case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH;
+ case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH;
+ case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH;
+ case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN;
+ case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN;
+ case IRISH: return SnowballStemmer.ALGORITHM.IRISH;
+ case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN;
+ case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE;
+ case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN;
+ case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN;
+ case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH;
+ case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH;
+ case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH;
+ case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH;
+ default: return null;
+ }
}
+
}