From 6bb3a541aa4059593508f7d45cec1c5d1df3ca9b Mon Sep 17 00:00:00 2001 From: Jefim Matskin Date: Tue, 17 Jul 2018 16:33:16 +0300 Subject: add lang detection and opennlp stemmers https://github.com/vespa-engine/vespa/issues/6403 --- .../main/java/com/yahoo/language/simple/SimpleTokenizer.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 02232b61e89..4888fd8676f 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -44,7 +44,7 @@ public class SimpleTokenizer implements Tokenizer { public Iterable tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { if (input.isEmpty()) return Collections.emptyList(); - opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language, stemMode); + opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language); List tokens = new ArrayList<>(); int nextCode = input.codePointAt(0); @@ -76,7 +76,11 @@ public class SimpleTokenizer implements Tokenizer { return token; } - private static Stemmer getStemmerForLanguage(Language language, StemMode stemMode) { + private static Stemmer getStemmerForLanguage(Language language) { + Stemmer stemmer = charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString()); + if (language == null) { + return stemmer; + } SnowballStemmer.ALGORITHM alg; switch (language) { case DANISH: @@ -126,7 +130,7 @@ public class SimpleTokenizer implements Tokenizer { alg = SnowballStemmer.ALGORITHM.TURKISH; break; default: - return charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString()); + return stemmer; } return new SnowballStemmer(alg); } -- cgit v1.2.3