about summary refs log tree commit diff stats
path: root/linguistics
diff options
context:
space:
mode:
author Jefim Matskin <jefimm@wix.com> 2018-07-23 14:01:00 +0300
committer Jefim Matskin <jefimm@wix.com> 2018-07-24 22:01:20 +0300
commit 66986b9b77b2d699430f29c135b2b30fb2c553f6 (patch)
tree 738b168ed36236fcc26ddd3213e4f8a1765db1e0 /linguistics
parent ddd952a7bc5e4425cd2039bdad63491d3abf0dfe (diff)
use com.optimaize.langdetect for lang detection
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/pom.xml 4
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java 54
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java 5
3 files changed, 61 insertions, 2 deletions
diff --git a/linguistics/pom.xml b/linguistics/pom.xml
index 1785befbc39..f743348dde3 100644
--- a/linguistics/pom.xml
+++ b/linguistics/pom.xml
@@ -66,6 +66,10 @@
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
</dependency>
+ <dependency>
+ <groupId>com.optimaize.languagedetector</groupId>
+ <artifactId>language-detector</artifactId>
+ </dependency>
</dependencies>
<build>
<plugins>
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index e6ce4eddb59..4ae3644d62c 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -1,17 +1,30 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
+import com.google.common.base.Optional;
+import com.optimaize.langdetect.LanguageDetector;
+import com.optimaize.langdetect.LanguageDetectorBuilder;
+import com.optimaize.langdetect.i18n.LdLocale;
+import com.optimaize.langdetect.ngram.NgramExtractors;
+import com.optimaize.langdetect.profiles.LanguageProfile;
+import com.optimaize.langdetect.profiles.LanguageProfileReader;
+import com.optimaize.langdetect.text.CommonTextObjectFactories;
+import com.optimaize.langdetect.text.TextObject;
+import com.optimaize.langdetect.text.TextObjectFactory;
import com.yahoo.language.Language;
import com.yahoo.language.detect.Detection;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.detect.Hint;
import com.yahoo.text.Utf8;
+import java.io.IOException;
import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Locale;
/**
- * Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese,
- * Japanese and Korean are supported. There are two ways to guess a String's langCode, by encoding and by character
+ * Includes functionality for determining the langCode from a sample or from the encoding.
+ * There are two ways to guess a String's langCode, by encoding and by character
* set. If the encoding is available this is a very good indication of the langCode. If the encoding is not available,
* then the actual characters in the string can be used to make an educated guess at the String's langCode. Recall a
* String in Java is unicode. Therefore, we can simply look at the unicode blocks of the characters in the string.
@@ -23,6 +36,27 @@ import java.nio.ByteBuffer;
* @author Rich Pito
*/
public class SimpleDetector implements Detector {
+ static private TextObjectFactory textObjectFactory;
+ static private LanguageDetector languageDetector;
+
+ static {
+ // origin: https://github.com/optimaize/language-detector
+ //load all languages:
+ List<LanguageProfile> languageProfiles;
+ try {
+ languageProfiles = new LanguageProfileReader().readAllBuiltIn();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ //build language detector:
+ languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
+ .withProfiles(languageProfiles)
+ .build();
+
+ //create a text object factory
+ textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
+ }
@Override
public Detection detect(byte[] input, int offset, int length, Hint hint) {
@@ -109,10 +143,26 @@ public class SimpleDetector implements Detector {
return Language.THAI;
}
}
+ if (Language.UNKNOWN.equals(soFar)){
+ return detectLangOptimaize(input);
+ }
// got to the end, so return the current best guess
return soFar;
}
+ private static Language detectLangOptimaize(String input) {
+ if (input == null || input.length() == 0) {
+ return Language.UNKNOWN;
+ }
+ TextObject textObject = textObjectFactory.forText(input);
+ Optional<LdLocale> lang = languageDetector.detect(textObject);
+ if (lang.isPresent()) {
+ String language = lang.get().getLanguage();
+ return Language.fromLocale(new Locale(language));
+ }
+ return Language.UNKNOWN;
+ }
+
private boolean isTrailingOctet(byte i) {
return ((i >>> 6) & 3) == 2;
}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
index f9912f6b7a2..1905c6d98a9 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
@@ -50,6 +50,11 @@ public class SimpleDetectorTestCase {
// a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
"\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
+
+ // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F
+ assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии");
+ // https://he.wikipedia.org/wiki/Yahoo!
+ assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום");
}
@Test