diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java | 102 |
1 files changed, 102 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java new file mode 100644 index 00000000000..7ba061aaef1 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java @@ -0,0 +1,102 @@ +package com.yahoo.language.opennlp; + +import com.google.common.base.Optional; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObjectFactory; +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.language.simple.SimpleDetector; +import com.yahoo.text.Utf8; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; + +/** + * Detects the language of some sample text using SimpleDetector for CJK and Optimaize otherwise. + * + * @author bratseth + */ +public class OptimaizeDetector implements Detector { + + static private Object initGuard = new Object(); + static private TextObjectFactory textObjectFactory = null; + static private LanguageDetector languageDetector = null; + + static private void initOptimaize() { + synchronized (initGuard) { + if ((textObjectFactory != null) && (languageDetector != null)) return; + + // origin: https://github.com/optimaize/language-detector + // load all languages: + List<LanguageProfile> languageProfiles; + try { + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + + //build language detector: + languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + + //create a text object factory + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } + } + + private SimpleDetector simpleDetector = new SimpleDetector(); + + public OptimaizeDetector() { + initOptimaize(); + } + + @Override + public Detection detect(byte[] input, int offset, int length, Hint hint) { + return new Detection(guessLanguage(input, offset, length), simpleDetector.guessEncoding(input), false); + } + + @Override + public Detection detect(ByteBuffer input, Hint hint) { + byte[] buf = new byte[input.remaining()]; + input.get(buf, 0, buf.length); + return detect(buf, 0, buf.length, hint); + } + + @Override + public Detection detect(String input, Hint hint) { + return new Detection(guessLanguage(input), Utf8.getCharset().name(), false); + } + + private Language guessLanguage(byte[] buf, int offset, int length) { + return guessLanguage(Utf8.toString(buf, offset, length)); + } + + public Language guessLanguage(String input) { + if (input == null || input.length() == 0) return Language.UNKNOWN; + + Language result = simpleDetector.guessLanguage(input); + if (result != Language.UNKNOWN) return result; + + return guessLanguageUsingOptimaize(input); + } + + private static Language guessLanguageUsingOptimaize(String input) { + Optional<LdLocale> result = languageDetector.detect(textObjectFactory.forText(input)); + if ( ! result.isPresent()) return Language.UNKNOWN; + + return Language.fromLocale(new Locale(result.get().getLanguage())); + } + +} |