diff options
author | Jefim Matskin <jefimm@wix.com> | 2018-07-17 16:33:16 +0300 |
---|---|---|
committer | Jefim Matskin <jefimm@wix.com> | 2018-07-17 16:33:16 +0300 |
commit | 6bb3a541aa4059593508f7d45cec1c5d1df3ca9b (patch) | |
tree | 8ea506fb09a947bae3a55c9456c368a7a88ca098 /linguistics/src/main | |
parent | 72b8b369ee55fcc6dd4a10357353c0416a426054 (diff) |
add lang detection and opennlp stemmers
https://github.com/vespa-engine/vespa/issues/6403
Diffstat (limited to 'linguistics/src/main')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java | 10 |
1 files changed, 7 insertions, 3 deletions
```diff
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 02232b61e89..4888fd8676f 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -44,7 +44,7 @@ public class SimpleTokenizer implements Tokenizer {
     public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
         if (input.isEmpty()) return Collections.emptyList();
 
-        opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language, stemMode);
+        opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language);
         List<Token> tokens = new ArrayList<>();
         int nextCode = input.codePointAt(0);
 
@@ -76,7 +76,11 @@ public class SimpleTokenizer implements Tokenizer {
         return token;
     }
 
-    private static Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
+    private static Stemmer getStemmerForLanguage(Language language) {
+        Stemmer stemmer = charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString());
+        if (language == null) {
+            return stemmer;
+        }
         SnowballStemmer.ALGORITHM alg;
         switch (language) {
             case DANISH:
@@ -126,7 +130,7 @@ public class SimpleTokenizer implements Tokenizer {
                 alg = SnowballStemmer.ALGORITHM.TURKISH;
                 break;
             default:
-                return charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString());
+                return stemmer;
         }
         return new SnowballStemmer(alg);
     }
```