diff options
author | Jefim Matskin <jefimm@wix.com> | 2018-07-17 16:14:54 +0300 |
---|---|---|
committer | Jefim Matskin <jefimm@wix.com> | 2018-07-17 16:14:54 +0300 |
commit | 72b8b369ee55fcc6dd4a10357353c0416a426054 (patch) | |
tree | ae6d926d86528b7785f7d3c962c41ce36006d3b5 /linguistics/src | |
parent | c8c45e7c9afcd5b8e9a7daed54aa8b1c290eede7 (diff) |
Add language detection and OpenNLP stemmers
https://github.com/vespa-engine/vespa/issues/6403
Diffstat (limited to 'linguistics/src')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java | 5 |
1 file changed, 2 insertions, 3 deletions
```diff
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index e9ad4bf767c..02232b61e89 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -26,7 +26,6 @@ public class SimpleTokenizer implements Tokenizer {
     private final static int SPACE_CODE = 32;
     private final Normalizer normalizer;
     private final Transformer transformer;
-    private static final KStemmer kStemmer = new KStemmer();
 
     public SimpleTokenizer() {
         this(new SimpleNormalizer(), new SimpleTransformer());
@@ -72,7 +71,7 @@ public class SimpleTokenizer implements Tokenizer {
         token = LinguisticsCase.toLowerCase(token);
         if (removeAccents)
             token = transformer.accentDrop(token, language);
-        if (stemMode != StemMode.NONE)
+        if (stemMode != StemMode.NONE && token != null)
             token = stemmer.stem(token).toString();
         return token;
     }
@@ -127,7 +126,7 @@ public class SimpleTokenizer implements Tokenizer {
                 alg = SnowballStemmer.ALGORITHM.TURKISH;
                 break;
             default:
-                return charSequence -> kStemmer.stem(charSequence.toString());
+                return charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString());
         }
         return new SnowballStemmer(alg);
     }
```