use com.optimaize.langdetect for lang detection

author: Jefim Matskin <jefimm@wix.com> 2018-07-23 14:01:00 +0300
committer: Jefim Matskin <jefimm@wix.com> 2018-07-24 22:01:20 +0300
commit: 66986b9b77b2d699430f29c135b2b30fb2c553f6 (patch)
tree: 738b168ed36236fcc26ddd3213e4f8a1765db1e0 /linguistics
parent: ddd952a7bc5e4425cd2039bdad63491d3abf0dfe (diff)
3 files changed, 61 insertions, 2 deletions
diff --git a/linguistics/pom.xml b/linguistics/pom.xml
index 1785befbc39..f743348dde3 100644
--- a/linguistics/pom.xml
+++ b/linguistics/pom.xml
@@ -66,6 +66,10 @@
       <groupId>org.apache.opennlp</groupId>
       <artifactId>opennlp-tools</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.optimaize.languagedetector</groupId>
+      <artifactId>language-detector</artifactId>
+    </dependency>
   </dependencies>
   <build>
     <plugins>
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index e6ce4eddb59..4ae3644d62c 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -1,17 +1,30 @@
 // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.language.simple;
 
+import com.google.common.base.Optional;
+import com.optimaize.langdetect.LanguageDetector;
+import com.optimaize.langdetect.LanguageDetectorBuilder;
+import com.optimaize.langdetect.i18n.LdLocale;
+import com.optimaize.langdetect.ngram.NgramExtractors;
+import com.optimaize.langdetect.profiles.LanguageProfile;
+import com.optimaize.langdetect.profiles.LanguageProfileReader;
+import com.optimaize.langdetect.text.CommonTextObjectFactories;
+import com.optimaize.langdetect.text.TextObject;
+import com.optimaize.langdetect.text.TextObjectFactory;
 import com.yahoo.language.Language;
 import com.yahoo.language.detect.Detection;
 import com.yahoo.language.detect.Detector;
 import com.yahoo.language.detect.Hint;
 import com.yahoo.text.Utf8;
 
+import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Locale;
 
 /**
- * Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese,
- * Japanese and Korean are supported.  There are two ways to guess a String's langCode, by encoding and by character
+ * Includes functionality for determining the langCode from a sample or from the encoding.
+ * There are two ways to guess a String's langCode, by encoding and by character
  * set.  If the encoding is available this is a very good indication of the langCode.  If the encoding is not available,
  * then the actual characters in the string can be used to make an educated guess at the String's langCode.  Recall a
  * String in Java is unicode. Therefore, we can simply look at the unicode blocks of the characters in the string.
@@ -23,6 +36,27 @@ import java.nio.ByteBuffer;
  * @author Rich Pito
  */
 public class SimpleDetector implements Detector {
+    // Shared optimaize detector state: assigned exactly once in the static initializer below, hence final.
+    private static final TextObjectFactory textObjectFactory;
+    private static final LanguageDetector languageDetector;
+
+    static {
+        // origin: https://github.com/optimaize/language-detector
+        // read every language profile returned by readAllBuiltIn()
+        List<LanguageProfile> languageProfiles;
+        try {
+            languageProfiles = new LanguageProfileReader().readAllBuiltIn();
+        } catch (IOException e) {
+            // thrown from a static initializer, so callers see ExceptionInInitializerError; keep the cause
+            throw new RuntimeException("Could not load built-in language profiles", e);
+        }
+        // build the language detector over the standard n-gram extractor and the loaded profiles
+        languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
+                .withProfiles(languageProfiles)
+                .build();
+        // create the text object factory (the forDetectingOnLargeText() variant)
+        textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
+    }
 
     @Override
     public Detection detect(byte[] input, int offset, int length, Hint hint) {
@@ -109,10 +143,26 @@ public class SimpleDetector implements Detector {
                 return Language.THAI;
             }
         }
+        if (Language.UNKNOWN.equals(soFar)){
+            return detectLangOptimaize(input);
+        }
         // got to the end, so return the current best guess
         return soFar;
     }
 
+    /** Runs the optimaize detector on input; yields Language.UNKNOWN for null, empty or undetected text. */
+    private static Language detectLangOptimaize(String input) {
+        if (input == null || input.isEmpty()) {
+            return Language.UNKNOWN;
+        }
+        // Optional here is com.google.common.base.Optional, as imported by this file
+        Optional<LdLocale> detected = languageDetector.detect(textObjectFactory.forText(input));
+        if (!detected.isPresent()) {
+            return Language.UNKNOWN;
+        }
+        return Language.fromLocale(new Locale(detected.get().getLanguage()));
+    }
+
     private boolean isTrailingOctet(byte i) {
         return ((i >>> 6) & 3) == 2;
     }
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
index f9912f6b7a2..1905c6d98a9 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
@@ -50,6 +50,11 @@ public class SimpleDetectorTestCase {
         // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
         assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
                                         "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
+
+        // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F
+        assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии");
+        // https://he.wikipedia.org/wiki/Yahoo!
+        assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום");
     }
 
     @Test
author	Jefim Matskin <jefimm@wix.com>	2018-07-23 14:01:00 +0300
committer	Jefim Matskin <jefimm@wix.com>	2018-07-24 22:01:20 +0300
commit	66986b9b77b2d699430f29c135b2b30fb2c553f6 (patch)
tree	738b168ed36236fcc26ddd3213e4f8a1765db1e0 /linguistics
parent	ddd952a7bc5e4425cd2039bdad63491d3abf0dfe (diff)