diff options
Diffstat (limited to 'linguistics/src/main/java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java | 50 | ||||
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java | 70 |
2 files changed, 114 insertions, 6 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index e6ce4eddb59..8ac3237a953 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -1,13 +1,26 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; +import com.google.common.base.Optional; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObject; +import com.optimaize.langdetect.text.TextObjectFactory; import com.yahoo.language.Language; import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.text.Utf8; +import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; /** * Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese, @@ -23,6 +36,27 @@ import java.nio.ByteBuffer; * @author Rich Pito */ public class SimpleDetector implements Detector { + static private TextObjectFactory textObjectFactory; + static private LanguageDetector languageDetector; + + static { + // origin: https://github.com/optimaize/language-detector + //load all languages: + List<LanguageProfile> languageProfiles; + try { + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + //build language detector: + languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + + //create a text object factory + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { @@ -109,10 +143,26 @@ public class SimpleDetector implements Detector { return Language.THAI; } } + if (Language.UNKNOWN.equals(soFar)){ + return detectLangOptimaize(input); + } // got to the end, so return the current best guess return soFar; } + private static Language detectLangOptimaize(String input) { + if (input == null || input.length() == 0) { + return Language.UNKNOWN; + } + TextObject textObject = textObjectFactory.forText(input); + Optional<LdLocale> lang = languageDetector.detect(textObject); + if (lang.isPresent()) { + String language = lang.get().getLanguage(); + return Language.fromLocale(new Locale(language)); + } + return Language.UNKNOWN; + } + private boolean isTrailingOctet(byte i) { return ((i >>> 6) & 3) == 2; } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 068fc0126d7..e9ad4bf767c 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -5,6 +5,8 @@ import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; import com.yahoo.language.process.*; import com.yahoo.language.simple.kstem.KStemmer; +import opennlp.tools.stemmer.Stemmer; +import opennlp.tools.stemmer.snowball.SnowballStemmer; import java.util.ArrayList; import java.util.Collections; @@ -24,7 +26,7 @@ public class SimpleTokenizer implements Tokenizer { private final static int SPACE_CODE = 32; private final Normalizer normalizer; private final Transformer transformer; - private final KStemmer stemmer = new KStemmer(); + private static final KStemmer kStemmer = new KStemmer(); public SimpleTokenizer() { this(new SimpleNormalizer(), new SimpleTransformer()); @@ -43,6 +45,8 @@ public class SimpleTokenizer implements Tokenizer { public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { if (input.isEmpty()) return Collections.emptyList(); + opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language, stemMode); + List<Token> tokens = new ArrayList<>(); int nextCode = input.codePointAt(0); TokenType prevType = SimpleTokenType.valueOf(nextCode); @@ -51,10 +55,10 @@ public class SimpleTokenizer implements Tokenizer { TokenType nextType = SimpleTokenType.valueOf(nextCode); if (!prevType.isIndexable() || !nextType.isIndexable()) { String original = input.substring(prev, next); - String token = processToken(original, language, stemMode, removeAccents); + String token = processToken(original, language, stemMode, removeAccents, stemmer); tokens.add(new SimpleToken(original).setOffset(prev) - .setType(prevType) - .setTokenString(token)); + .setType(prevType) + .setTokenString(token)); prev = next; prevType = nextType; } @@ -63,14 +67,68 @@ public class SimpleTokenizer implements Tokenizer { return tokens; } - private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { + private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) { token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); if (stemMode != StemMode.NONE) - token = stemmer.stem(token); + token = stemmer.stem(token).toString(); return token; } + private static Stemmer getStemmerForLanguage(Language language, StemMode stemMode) { + SnowballStemmer.ALGORITHM alg; + switch (language) { + case DANISH: + alg = SnowballStemmer.ALGORITHM.DANISH; + break; + case DUTCH: + alg = SnowballStemmer.ALGORITHM.DUTCH; + break; + case FINNISH: + alg = SnowballStemmer.ALGORITHM.FINNISH; + break; + case FRENCH: + alg = SnowballStemmer.ALGORITHM.FRENCH; + break; + case GERMAN: + alg = SnowballStemmer.ALGORITHM.GERMAN; + break; + case HUNGARIAN: + alg = SnowballStemmer.ALGORITHM.HUNGARIAN; + break; + case IRISH: + alg = SnowballStemmer.ALGORITHM.IRISH; + break; + case ITALIAN: + alg = SnowballStemmer.ALGORITHM.ITALIAN; + break; + case NORWEGIAN_BOKMAL: + case NORWEGIAN_NYNORSK: + alg = SnowballStemmer.ALGORITHM.NORWEGIAN; + break; + case PORTUGUESE: + alg = SnowballStemmer.ALGORITHM.PORTUGUESE; + break; + case ROMANIAN: + alg = SnowballStemmer.ALGORITHM.ROMANIAN; + break; + case RUSSIAN: + alg = SnowballStemmer.ALGORITHM.RUSSIAN; + break; + case SPANISH: + alg = SnowballStemmer.ALGORITHM.SPANISH; + break; + case SWEDISH: + alg = SnowballStemmer.ALGORITHM.SWEDISH; + break; + case TURKISH: + alg = SnowballStemmer.ALGORITHM.TURKISH; + break; + default: + return charSequence -> kStemmer.stem(charSequence.toString()); + } + return new SnowballStemmer(alg); + } } |