summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java99
1 files changed, 0 insertions, 99 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
deleted file mode 100644
index 8080dc92729..00000000000
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.language.opennlp;
-
-import com.yahoo.language.Language;
-import com.yahoo.language.LinguisticsCase;
-import com.yahoo.language.process.Normalizer;
-import com.yahoo.language.process.SpecialTokenRegistry;
-import com.yahoo.language.process.StemMode;
-import com.yahoo.language.process.Token;
-import com.yahoo.language.process.Tokenizer;
-import com.yahoo.language.process.Transformer;
-import com.yahoo.language.simple.SimpleNormalizer;
-import com.yahoo.language.simple.SimpleTokenizer;
-import com.yahoo.language.simple.SimpleTransformer;
-import opennlp.tools.stemmer.Stemmer;
-import opennlp.tools.stemmer.snowball.SnowballStemmer;
-
-import java.util.List;
-
-/**
- * Tokenizer using OpenNlp
- *
- * @author matskin
- * @author bratseth
- */
-public class OpenNlpTokenizer implements Tokenizer {
-
- private final static int SPACE_CODE = 32;
- private final Normalizer normalizer;
- private final Transformer transformer;
- private final SimpleTokenizer simpleTokenizer;
- private final SpecialTokenRegistry specialTokenRegistry;
-
- public OpenNlpTokenizer() {
- this(new SimpleNormalizer(), new SimpleTransformer());
- }
-
- public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer) {
- this(normalizer, transformer, new SpecialTokenRegistry(List.of()));
- }
-
- public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) {
- this.normalizer = normalizer;
- this.transformer = transformer;
- this.specialTokenRegistry = specialTokenRegistry;
- this.simpleTokenizer = new SimpleTokenizer(normalizer, transformer, specialTokenRegistry);
- }
-
- @Override
- public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
- Stemmer stemmer = stemmerFor(language, stemMode);
- if (stemmer == null)
- return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
- else
- return simpleTokenizer.tokenize(input, token -> processToken(token, language, stemMode, removeAccents, stemmer));
- }
-
- private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
- Stemmer stemmer) {
- token = normalizer.normalize(token);
- token = LinguisticsCase.toLowerCase(token);
- if (removeAccents)
- token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE)
- token = stemmer.stem(token).toString();
- return token;
- }
-
- private Stemmer stemmerFor(Language language, StemMode stemMode) {
- if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null;
- SnowballStemmer.ALGORITHM algorithm = algorithmFor(language);
- if (algorithm == null) return null;
- return new SnowballStemmer(algorithm);
- }
-
- private SnowballStemmer.ALGORITHM algorithmFor(Language language) {
- switch (language) {
- case DANISH: return SnowballStemmer.ALGORITHM.DANISH;
- case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH;
- case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH;
- case FRENCH: return SnowballStemmer.ALGORITHM.FRENCH;
- case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN;
- case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN;
- case IRISH: return SnowballStemmer.ALGORITHM.IRISH;
- case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN;
- case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN;
- case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN;
- case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE;
- case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN;
- case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN;
- case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH;
- case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH;
- case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH;
- case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH;
- default: return null;
- }
- }
-
-}