diff options
author | Jefim Matskin <jefimm@wix.com> | 2018-07-17 15:43:44 +0300 |
---|---|---|
committer | Jefim Matskin <jefimm@wix.com> | 2018-07-17 15:43:44 +0300 |
commit | c8c45e7c9afcd5b8e9a7daed54aa8b1c290eede7 (patch) | |
tree | b1cb65200f03490cce360a900b019c64d43eb7c4 | |
parent | 3f59a7da59991ef74adfd5bc334d96095945c575 (diff) |
add lang detection and opennlp stemmers
https://github.com/vespa-engine/vespa/issues/6403
6 files changed, 138 insertions, 6 deletions
diff --git a/linguistics/pom.xml b/linguistics/pom.xml index e4aa7c3049e..f743348dde3 100644 --- a/linguistics/pom.xml +++ b/linguistics/pom.xml @@ -62,6 +62,14 @@ <scope>provided</scope> <classifier>no_aop</classifier> </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools</artifactId> + </dependency> + <dependency> + <groupId>com.optimaize.languagedetector</groupId> + <artifactId>language-detector</artifactId> + </dependency> </dependencies> <build> <plugins> diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index e6ce4eddb59..8ac3237a953 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -1,13 +1,26 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; +import com.google.common.base.Optional; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObject; +import com.optimaize.langdetect.text.TextObjectFactory; import com.yahoo.language.Language; import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.text.Utf8; +import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; /** * Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese, @@ -23,6 +36,27 @@ import java.nio.ByteBuffer; * @author Rich Pito */ public class SimpleDetector implements Detector { + static private TextObjectFactory textObjectFactory; + static private LanguageDetector languageDetector; + + static { + // origin: https://github.com/optimaize/language-detector + //load all languages: + List<LanguageProfile> languageProfiles; + try { + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + //build language detector: + languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + + //create a text object factory + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { @@ -109,10 +143,26 @@ public class SimpleDetector implements Detector { return Language.THAI; } } + if (Language.UNKNOWN.equals(soFar)){ + return detectLangOptimaize(input); + } // got to the end, so return the current best guess return soFar; } + private static Language detectLangOptimaize(String input) { + if (input == null || input.length() == 0) { + return Language.UNKNOWN; + } + TextObject textObject = textObjectFactory.forText(input); + Optional<LdLocale> lang = languageDetector.detect(textObject); + if (lang.isPresent()) { + String language = lang.get().getLanguage(); + return Language.fromLocale(new Locale(language)); + } + return Language.UNKNOWN; + } + private boolean isTrailingOctet(byte i) { return ((i >>> 6) & 3) == 2; } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 068fc0126d7..e9ad4bf767c 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -5,6 +5,8 @@ import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; import com.yahoo.language.process.*; import com.yahoo.language.simple.kstem.KStemmer; +import opennlp.tools.stemmer.Stemmer; +import opennlp.tools.stemmer.snowball.SnowballStemmer; import java.util.ArrayList; import java.util.Collections; @@ -24,7 +26,7 @@ public class SimpleTokenizer implements Tokenizer { private final static int SPACE_CODE = 32; private final Normalizer normalizer; private final Transformer transformer; - private final KStemmer stemmer = new KStemmer(); + private static final KStemmer kStemmer = new KStemmer(); public SimpleTokenizer() { this(new SimpleNormalizer(), new SimpleTransformer()); @@ -43,6 +45,8 @@ public class SimpleTokenizer implements Tokenizer { public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { if (input.isEmpty()) return Collections.emptyList(); + opennlp.tools.stemmer.Stemmer stemmer = getStemmerForLanguage(language, stemMode); + List<Token> tokens = new ArrayList<>(); int nextCode = input.codePointAt(0); TokenType prevType = SimpleTokenType.valueOf(nextCode); @@ -51,10 +55,10 @@ public class SimpleTokenizer implements Tokenizer { TokenType nextType = SimpleTokenType.valueOf(nextCode); if (!prevType.isIndexable() || !nextType.isIndexable()) { String original = input.substring(prev, next); - String token = processToken(original, language, stemMode, removeAccents); + String token = processToken(original, language, stemMode, removeAccents, stemmer); tokens.add(new SimpleToken(original).setOffset(prev) - .setType(prevType) - .setTokenString(token)); + .setType(prevType) + .setTokenString(token)); prev = next; prevType = nextType; } @@ -63,14 +67,68 @@ public class SimpleTokenizer implements Tokenizer { return tokens; } - private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { + private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) { token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); if (stemMode != StemMode.NONE) - token = stemmer.stem(token); + token = stemmer.stem(token).toString(); return token; } + private static Stemmer getStemmerForLanguage(Language language, StemMode stemMode) { + SnowballStemmer.ALGORITHM alg; + switch (language) { + case DANISH: + alg = SnowballStemmer.ALGORITHM.DANISH; + break; + case DUTCH: + alg = SnowballStemmer.ALGORITHM.DUTCH; + break; + case FINNISH: + alg = SnowballStemmer.ALGORITHM.FINNISH; + break; + case FRENCH: + alg = SnowballStemmer.ALGORITHM.FRENCH; + break; + case GERMAN: + alg = SnowballStemmer.ALGORITHM.GERMAN; + break; + case HUNGARIAN: + alg = SnowballStemmer.ALGORITHM.HUNGARIAN; + break; + case IRISH: + alg = SnowballStemmer.ALGORITHM.IRISH; + break; + case ITALIAN: + alg = SnowballStemmer.ALGORITHM.ITALIAN; + break; + case NORWEGIAN_BOKMAL: + case NORWEGIAN_NYNORSK: + alg = SnowballStemmer.ALGORITHM.NORWEGIAN; + break; + case PORTUGUESE: + alg = SnowballStemmer.ALGORITHM.PORTUGUESE; + break; + case ROMANIAN: + alg = SnowballStemmer.ALGORITHM.ROMANIAN; + break; + case RUSSIAN: + alg = SnowballStemmer.ALGORITHM.RUSSIAN; + break; + case SPANISH: + alg = SnowballStemmer.ALGORITHM.SPANISH; + break; + case SWEDISH: + alg = SnowballStemmer.ALGORITHM.SWEDISH; + break; + case TURKISH: + alg = SnowballStemmer.ALGORITHM.TURKISH; + break; + default: + return charSequence -> kStemmer.stem(charSequence.toString()); + } + return new SnowballStemmer(alg); + } } diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java index e36d90b3206..27cfc12da5e 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/TokenizationTestCase.java @@ -64,6 +64,9 @@ public class TokenizationTestCase { Arrays.asList("on"), null); assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false, Arrays.asList("on"), null); + + assertTokenize("наименование", Language.RUSSIAN, StemMode.SHORTEST, false, + Arrays.asList("наименован"), null); } @Test diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java index f9912f6b7a2..e1cac896525 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java @@ -50,6 +50,9 @@ public class SimpleDetectorTestCase { // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)". assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " + "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694"); + + // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F + assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии"); } @Test diff --git a/parent/pom.xml b/parent/pom.xml index 0b141046d8a..34f6a4e1523 100644 --- a/parent/pom.xml +++ b/parent/pom.xml @@ -663,6 +663,16 @@ <artifactId>wiremock-standalone</artifactId> <version>2.6.0</version> </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools</artifactId> + <version>1.8.4</version> + </dependency> + <dependency> + <groupId>com.optimaize.languagedetector</groupId> + <artifactId>language-detector</artifactId> + <version>0.6</version> + </dependency> </dependencies> </dependencyManagement> |