diff options
author | Jefim Matskin <jefimm@wix.com> | 2018-07-17 15:43:44 +0300 |
---|---|---|
committer | Jefim Matskin <jefimm@wix.com> | 2018-07-17 15:43:44 +0300 |
commit | c8c45e7c9afcd5b8e9a7daed54aa8b1c290eede7 (patch) | |
tree | b1cb65200f03490cce360a900b019c64d43eb7c4 /linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java | |
parent | 3f59a7da59991ef74adfd5bc334d96095945c575 (diff) |
add lang detection and opennlp stemmers
https://github.com/vespa-engine/vespa/issues/6403
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java | 50 |
1 files changed, 50 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index e6ce4eddb59..8ac3237a953 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -1,13 +1,26 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; +import com.google.common.base.Optional; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObject; +import com.optimaize.langdetect.text.TextObjectFactory; import com.yahoo.language.Language; import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.text.Utf8; +import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; /** * Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese, @@ -23,6 +36,27 @@ import java.nio.ByteBuffer; * @author Rich Pito */ public class SimpleDetector implements Detector { + static private TextObjectFactory textObjectFactory; + static private LanguageDetector languageDetector; + + static { + // origin: https://github.com/optimaize/language-detector + //load all languages: + List<LanguageProfile> languageProfiles; + try { + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + //build language detector: + languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + + //create a text object factory + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { @@ -109,10 +143,26 @@ public class SimpleDetector implements Detector { return Language.THAI; } } + if (Language.UNKNOWN.equals(soFar)){ + return detectLangOptimaize(input); + } // got to the end, so return the current best guess return soFar; } + private static Language detectLangOptimaize(String input) { + if (input == null || input.length() == 0) { + return Language.UNKNOWN; + } + TextObject textObject = textObjectFactory.forText(input); + Optional<LdLocale> lang = languageDetector.detect(textObject); + if (lang.isPresent()) { + String language = lang.get().getLanguage(); + return Language.fromLocale(new Locale(language)); + } + return Language.UNKNOWN; + } + private boolean isTrailingOctet(byte i) { return ((i >>> 6) & 3) == 2; } |