diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java | 50 |
1 files changed, 0 insertions, 50 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index 8ac3237a953..e6ce4eddb59 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -1,26 +1,13 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; -import com.google.common.base.Optional; -import com.optimaize.langdetect.LanguageDetector; -import com.optimaize.langdetect.LanguageDetectorBuilder; -import com.optimaize.langdetect.i18n.LdLocale; -import com.optimaize.langdetect.ngram.NgramExtractors; -import com.optimaize.langdetect.profiles.LanguageProfile; -import com.optimaize.langdetect.profiles.LanguageProfileReader; -import com.optimaize.langdetect.text.CommonTextObjectFactories; -import com.optimaize.langdetect.text.TextObject; -import com.optimaize.langdetect.text.TextObjectFactory; import com.yahoo.language.Language; import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.text.Utf8; -import java.io.IOException; import java.nio.ByteBuffer; -import java.util.List; -import java.util.Locale; /** * Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese, @@ -36,27 +23,6 @@ import java.util.Locale; * @author Rich Pito */ public class SimpleDetector implements Detector { - static private TextObjectFactory textObjectFactory; - static private LanguageDetector languageDetector; - - static { - // origin: https://github.com/optimaize/language-detector - //load all languages: - List<LanguageProfile> languageProfiles; - try { - languageProfiles = new LanguageProfileReader().readAllBuiltIn(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - //build language detector: - languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) - .withProfiles(languageProfiles) - .build(); - - //create a text object factory - textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); - } @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { @@ -143,26 +109,10 @@ public class SimpleDetector implements Detector { return Language.THAI; } } - if (Language.UNKNOWN.equals(soFar)){ - return detectLangOptimaize(input); - } // got to the end, so return the current best guess return soFar; } - private static Language detectLangOptimaize(String input) { - if (input == null || input.length() == 0) { - return Language.UNKNOWN; - } - TextObject textObject = textObjectFactory.forText(input); - Optional<LdLocale> lang = languageDetector.detect(textObject); - if (lang.isPresent()) { - String language = lang.get().getLanguage(); - return Language.fromLocale(new Locale(language)); - } - return Language.UNKNOWN; - } - private boolean isTrailingOctet(byte i) { return ((i >>> 6) & 3) == 2; } |