summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author Jefim Matskin <jefimm@wix.com> 2018-07-17 16:33:16 +0300
committer Jefim Matskin <jefimm@wix.com> 2018-07-17 16:33:16 +0300
commit 6bb3a541aa4059593508f7d45cec1c5d1df3ca9b (patch)
tree 8ea506fb09a947bae3a55c9456c368a7a88ca098 /linguistics
parent 72b8b369ee55fcc6dd4a10357353c0416a426054 (diff)
add lang detection and opennlp stemmers
https://github.com/vespa-engine/vespa/issues/6403
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 10
1 files changed, 7 insertions, 3 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 02232b61e89..4888fd8676f 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -44,7 +44,7 @@ public class SimpleTokenizer implements Tokenizer {
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
if (input.isEmpty()) return Collections.emptyList();
- opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language, stemMode);
+ opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language);
List<Token> tokens = new ArrayList<>();
int nextCode = input.codePointAt(0);
@@ -76,7 +76,11 @@ public class SimpleTokenizer implements Tokenizer {
return token;
}
- private static Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
+ private static Stemmer getStemmerForLanguage(Language language) {
+ Stemmer stemmer = charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString());
+ if (language == null) {
+ return stemmer;
+ }
SnowballStemmer.ALGORITHM alg;
switch (language) {
case DANISH:
@@ -126,7 +130,7 @@ public class SimpleTokenizer implements Tokenizer {
alg = SnowballStemmer.ALGORITHM.TURKISH;
break;
default:
- return charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString());
+ return stemmer;
}
return new SnowballStemmer(alg);
}