summaryrefslogtreecommitdiffstats
path: root/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java')
-rw-r--r--opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java99
1 files changed, 99 insertions, 0 deletions
diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
new file mode 100644
index 00000000000..8080dc92729
--- /dev/null
+++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -0,0 +1,99 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.LinguisticsCase;
+import com.yahoo.language.process.Normalizer;
+import com.yahoo.language.process.SpecialTokenRegistry;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.process.Transformer;
+import com.yahoo.language.simple.SimpleNormalizer;
+import com.yahoo.language.simple.SimpleTokenizer;
+import com.yahoo.language.simple.SimpleTransformer;
+import opennlp.tools.stemmer.Stemmer;
+import opennlp.tools.stemmer.snowball.SnowballStemmer;
+
+import java.util.List;
+
+/**
+ * Tokenizer using OpenNlp
+ *
+ * @author matskin
+ * @author bratseth
+ */
+public class OpenNlpTokenizer implements Tokenizer {
+
+ private final static int SPACE_CODE = 32;
+ private final Normalizer normalizer;
+ private final Transformer transformer;
+ private final SimpleTokenizer simpleTokenizer;
+ private final SpecialTokenRegistry specialTokenRegistry;
+
+ public OpenNlpTokenizer() {
+ this(new SimpleNormalizer(), new SimpleTransformer());
+ }
+
+ public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer) {
+ this(normalizer, transformer, new SpecialTokenRegistry(List.of()));
+ }
+
+ public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) {
+ this.normalizer = normalizer;
+ this.transformer = transformer;
+ this.specialTokenRegistry = specialTokenRegistry;
+ this.simpleTokenizer = new SimpleTokenizer(normalizer, transformer, specialTokenRegistry);
+ }
+
+ @Override
+ public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
+ Stemmer stemmer = stemmerFor(language, stemMode);
+ if (stemmer == null)
+ return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
+ else
+ return simpleTokenizer.tokenize(input, token -> processToken(token, language, stemMode, removeAccents, stemmer));
+ }
+
+ private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
+ Stemmer stemmer) {
+ token = normalizer.normalize(token);
+ token = LinguisticsCase.toLowerCase(token);
+ if (removeAccents)
+ token = transformer.accentDrop(token, language);
+ if (stemMode != StemMode.NONE)
+ token = stemmer.stem(token).toString();
+ return token;
+ }
+
+ private Stemmer stemmerFor(Language language, StemMode stemMode) {
+ if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null;
+ SnowballStemmer.ALGORITHM algorithm = algorithmFor(language);
+ if (algorithm == null) return null;
+ return new SnowballStemmer(algorithm);
+ }
+
+ private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
+ switch (language) {
+ case DANISH: return SnowballStemmer.ALGORITHM.DANISH;
+ case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH;
+ case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH;
+ case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH;
+ case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN;
+ case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN;
+ case IRISH: return SnowballStemmer.ALGORITHM.IRISH;
+ case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN;
+ case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN;
+ case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE;
+ case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN;
+ case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN;
+ case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH;
+ case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH;
+ case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH;
+ case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH;
+ default: return null;
+ }
+ }
+
+}