summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author Jefim Matskin <jefimm@wix.com> 2018-07-17 16:14:54 +0300
committer Jefim Matskin <jefimm@wix.com> 2018-07-17 16:14:54 +0300
commit 72b8b369ee55fcc6dd4a10357353c0416a426054 (patch)
tree ae6d926d86528b7785f7d3c962c41ce36006d3b5 /linguistics
parent c8c45e7c9afcd5b8e9a7daed54aa8b1c290eede7 (diff)
add lang detection and opennlp stemmers
https://github.com/vespa-engine/vespa/issues/6403
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 5
1 files changed, 2 insertions, 3 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index e9ad4bf767c..02232b61e89 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -26,7 +26,6 @@ public class SimpleTokenizer implements Tokenizer {
private final static int SPACE_CODE = 32;
private final Normalizer normalizer;
private final Transformer transformer;
- private static final KStemmer kStemmer = new KStemmer();
public SimpleTokenizer() {
this(new SimpleNormalizer(), new SimpleTransformer());
@@ -72,7 +71,7 @@ public class SimpleTokenizer implements Tokenizer {
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE)
+ if (stemMode != StemMode.NONE && token != null)
token = stemmer.stem(token).toString();
return token;
}
@@ -127,7 +126,7 @@ public class SimpleTokenizer implements Tokenizer {
alg = SnowballStemmer.ALGORITHM.TURKISH;
break;
default:
- return charSequence -> kStemmer.stem(charSequence.toString());
+ return charSequence -> charSequence == null ? null : new KStemmer().stem(charSequence.toString());
}
return new SnowballStemmer(alg);
}