summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java75
1 files changed, 7 insertions, 68 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 4888fd8676f..068fc0126d7 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -5,8 +5,6 @@ import com.yahoo.language.Language;
import com.yahoo.language.LinguisticsCase;
import com.yahoo.language.process.*;
import com.yahoo.language.simple.kstem.KStemmer;
-import opennlp.tools.stemmer.Stemmer;
-import opennlp.tools.stemmer.snowball.SnowballStemmer;
import java.util.ArrayList;
import java.util.Collections;
@@ -26,6 +24,7 @@ public class SimpleTokenizer implements Tokenizer {
private final static int SPACE_CODE = 32;
private final Normalizer normalizer;
private final Transformer transformer;
+ private final KStemmer stemmer = new KStemmer();
public SimpleTokenizer() {
this(new SimpleNormalizer(), new SimpleTransformer());
@@ -44,8 +43,6 @@ public class SimpleTokenizer implements Tokenizer {
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
if (input.isEmpty()) return Collections.emptyList();
- opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language);
-
List<Token> tokens = new ArrayList<>();
int nextCode = input.codePointAt(0);
TokenType prevType = SimpleTokenType.valueOf(nextCode);
@@ -54,10 +51,10 @@ public class SimpleTokenizer implements Tokenizer {
TokenType nextType = SimpleTokenType.valueOf(nextCode);
if (!prevType.isIndexable() || !nextType.isIndexable()) {
String original = input.substring(prev, next);
- String token = processToken(original, language, stemMode, removeAccents, stemmer);
+ String token = processToken(original, language, stemMode, removeAccents);
tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ .setType(prevType)
+ .setTokenString(token));
prev = next;
prevType = nextType;
}
@@ -66,72 +63,14 @@ public class SimpleTokenizer implements Tokenizer {
return tokens;
}
- private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) {
+ private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE && token != null)
- token = stemmer.stem(token).toString();
+ if (stemMode != StemMode.NONE)
+ token = stemmer.stem(token);
return token;
}
- private static Stemmer getStemmerForLanguage(Language language) {
- Stemmer stemmer = charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString());
- if (language == null) {
- return stemmer;
- }
- SnowballStemmer.ALGORITHM alg;
- switch (language) {
- case DANISH:
- alg = SnowballStemmer.ALGORITHM.DANISH;
- break;
- case DUTCH:
- alg = SnowballStemmer.ALGORITHM.DUTCH;
- break;
- case FINNISH:
- alg = SnowballStemmer.ALGORITHM.FINNISH;
- break;
- case FRENCH:
- alg = SnowballStemmer.ALGORITHM.FRENCH;
- break;
- case GERMAN:
- alg = SnowballStemmer.ALGORITHM.GERMAN;
- break;
- case HUNGARIAN:
- alg = SnowballStemmer.ALGORITHM.HUNGARIAN;
- break;
- case IRISH:
- alg = SnowballStemmer.ALGORITHM.IRISH;
- break;
- case ITALIAN:
- alg = SnowballStemmer.ALGORITHM.ITALIAN;
- break;
- case NORWEGIAN_BOKMAL:
- case NORWEGIAN_NYNORSK:
- alg = SnowballStemmer.ALGORITHM.NORWEGIAN;
- break;
- case PORTUGUESE:
- alg = SnowballStemmer.ALGORITHM.PORTUGUESE;
- break;
- case ROMANIAN:
- alg = SnowballStemmer.ALGORITHM.ROMANIAN;
- break;
- case RUSSIAN:
- alg = SnowballStemmer.ALGORITHM.RUSSIAN;
- break;
- case SPANISH:
- alg = SnowballStemmer.ALGORITHM.SPANISH;
- break;
- case SWEDISH:
- alg = SnowballStemmer.ALGORITHM.SWEDISH;
- break;
- case TURKISH:
- alg = SnowballStemmer.ALGORITHM.TURKISH;
- break;
- default:
- return stemmer;
- }
- return new SnowballStemmer(alg);
- }
}