From 66986b9b77b2d699430f29c135b2b30fb2c553f6 Mon Sep 17 00:00:00 2001 From: Jefim Matskin Date: Mon, 23 Jul 2018 14:01:00 +0300 Subject: use com.optimaize.langdetect for lang detection --- linguistics/pom.xml | 4 ++ .../com/yahoo/language/simple/SimpleDetector.java | 54 +++++++++++++++++++++- .../language/simple/SimpleDetectorTestCase.java | 5 ++ 3 files changed, 61 insertions(+), 2 deletions(-) (limited to 'linguistics') diff --git a/linguistics/pom.xml b/linguistics/pom.xml index 1785befbc39..f743348dde3 100644 --- a/linguistics/pom.xml +++ b/linguistics/pom.xml @@ -66,6 +66,10 @@ org.apache.opennlp opennlp-tools + + com.optimaize.languagedetector + language-detector + diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index e6ce4eddb59..4ae3644d62c 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -1,17 +1,30 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; +import com.google.common.base.Optional; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObject; +import com.optimaize.langdetect.text.TextObjectFactory; import com.yahoo.language.Language; import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.text.Utf8; +import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; /** - * Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese, - * Japanese and Korean are supported. There are two ways to guess a String's langCode, by encoding and by character + * Includes functionality for determining the langCode from a sample or from the encoding. + * There are two ways to guess a String's langCode, by encoding and by character * set. If the encoding is available this is a very good indication of the langCode. If the encoding is not available, * then the actual characters in the string can be used to make an educated guess at the String's langCode. Recall a * String in Java is unicode. Therefore, we can simply look at the unicode blocks of the characters in the string. @@ -23,6 +36,27 @@ import java.nio.ByteBuffer; * @author Rich Pito */ public class SimpleDetector implements Detector { + static private TextObjectFactory textObjectFactory; + static private LanguageDetector languageDetector; + + static { + // origin: https://github.com/optimaize/language-detector + //load all languages: + List languageProfiles; + try { + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + //build language detector: + languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + + //create a text object factory + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { @@ -109,10 +143,26 @@ public class SimpleDetector implements Detector { return Language.THAI; } } + if (Language.UNKNOWN.equals(soFar)){ + return detectLangOptimaize(input); + } // got to the end, so return the current best guess return soFar; } + private static Language detectLangOptimaize(String input) { + if (input == null || input.length() == 0) { + return Language.UNKNOWN; + } + TextObject textObject = textObjectFactory.forText(input); + Optional lang = languageDetector.detect(textObject); + if (lang.isPresent()) { + String language = lang.get().getLanguage(); + return Language.fromLocale(new Locale(language)); + } + return Language.UNKNOWN; + } + private boolean isTrailingOctet(byte i) { return ((i >>> 6) & 3) == 2; } diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java index f9912f6b7a2..1905c6d98a9 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java @@ -50,6 +50,11 @@ public class SimpleDetectorTestCase { // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)". assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " + "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694"); + + // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F + assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии"); + // https://he.wikipedia.org/wiki/Yahoo! + assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום"); } @Test -- cgit v1.2.3