diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java | 54 |
1 files changed, 52 insertions, 2 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index e6ce4eddb59..4ae3644d62c 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -1,17 +1,30 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; +import com.google.common.base.Optional; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObject; +import com.optimaize.langdetect.text.TextObjectFactory; import com.yahoo.language.Language; import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.text.Utf8; +import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; /** - * Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese, - * Japanese and Korean are supported. There are two ways to guess a String's langCode, by encoding and by character + * Includes functionality for determining the langCode from a sample or from the encoding. + * There are two ways to guess a String's langCode, by encoding and by character * set. If the encoding is available this is a very good indication of the langCode. If the encoding is not available, * then the actual characters in the string can be used to make an educated guess at the String's langCode. Recall a * String in Java is unicode. Therefore, we can simply look at the unicode blocks of the characters in the string. @@ -23,6 +36,27 @@ import java.nio.ByteBuffer; * @author Rich Pito */ public class SimpleDetector implements Detector { + static private TextObjectFactory textObjectFactory; + static private LanguageDetector languageDetector; + + static { + // origin: https://github.com/optimaize/language-detector + //load all languages: + List<LanguageProfile> languageProfiles; + try { + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + //build language detector: + languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + + //create a text object factory + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { @@ -109,10 +143,26 @@ public class SimpleDetector implements Detector { return Language.THAI; } } + if (Language.UNKNOWN.equals(soFar)){ + return detectLangOptimaize(input); + } // got to the end, so return the current best guess return soFar; } + private static Language detectLangOptimaize(String input) { + if (input == null || input.length() == 0) { + return Language.UNKNOWN; + } + TextObject textObject = textObjectFactory.forText(input); + Optional<LdLocale> lang = languageDetector.detect(textObject); + if (lang.isPresent()) { + String language = lang.get().getLanguage(); + return Language.fromLocale(new Locale(language)); + } + return Language.UNKNOWN; + } + private boolean isTrailingOctet(byte i) { return ((i >>> 6) & 3) == 2; } |