summary refs log tree commit diff stats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java 50
1 files changed, 50 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index e6ce4eddb59..8ac3237a953 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -1,13 +1,26 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
+import com.google.common.base.Optional;
+import com.optimaize.langdetect.LanguageDetector;
+import com.optimaize.langdetect.LanguageDetectorBuilder;
+import com.optimaize.langdetect.i18n.LdLocale;
+import com.optimaize.langdetect.ngram.NgramExtractors;
+import com.optimaize.langdetect.profiles.LanguageProfile;
+import com.optimaize.langdetect.profiles.LanguageProfileReader;
+import com.optimaize.langdetect.text.CommonTextObjectFactories;
+import com.optimaize.langdetect.text.TextObject;
+import com.optimaize.langdetect.text.TextObjectFactory;
import com.yahoo.language.Language;
import com.yahoo.language.detect.Detection;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.detect.Hint;
import com.yahoo.text.Utf8;
+import java.io.IOException;
import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Locale;
/**
* Includes functionality for determining the langCode from a sample or from the encoding. Currently only Chinese,
@@ -23,6 +36,27 @@ import java.nio.ByteBuffer;
* @author Rich Pito
*/
public class SimpleDetector implements Detector {
+ /** Converts raw input strings into the text objects handed to {@link #languageDetector}. */
+ private static final TextObjectFactory textObjectFactory;
+
+ /** Shared Optimaize detector, built once when this class is initialized. */
+ private static final LanguageDetector languageDetector;
+
+ // One-time setup of the Optimaize detector.
+ // Origin: https://github.com/optimaize/language-detector
+ static {
+     // Load every language profile bundled with the library.
+     List<LanguageProfile> languageProfiles;
+     try {
+         languageProfiles = new LanguageProfileReader().readAllBuiltIn();
+     } catch (IOException e) {
+         // Without profiles no detector can be built, so fail class initialization and keep the cause.
+         throw new RuntimeException("Could not load built-in language profiles", e);
+     }
+
+     // Build the detector from the standard n-gram extractor and the loaded profiles.
+     languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
+             .withProfiles(languageProfiles)
+             .build();
+
+     // Text object factory variant intended for detection on large text.
+     textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
+ }
@Override
public Detection detect(byte[] input, int offset, int length, Hint hint) {
@@ -109,10 +143,26 @@ public class SimpleDetector implements Detector {
return Language.THAI;
}
}
+ if (Language.UNKNOWN.equals(soFar)){
+ return detectLangOptimaize(input);
+ }
// got to the end, so return the current best guess
return soFar;
}
+ /**
+  * Guesses the language of the given text with the Optimaize detector.
+  *
+  * @param input the text to classify; may be {@code null} or empty
+  * @return the language derived from the language code of the detected locale, or
+  *         {@link Language#UNKNOWN} when there is no text or the detector yields no result
+  */
+ private static Language detectLangOptimaize(String input) {
+     boolean nothingToAnalyze = (input == null) || input.isEmpty();
+     if (nothingToAnalyze) {
+         return Language.UNKNOWN;
+     }
+
+     Optional<LdLocale> detected = languageDetector.detect(textObjectFactory.forText(input));
+     if (!detected.isPresent()) {
+         return Language.UNKNOWN;
+     }
+
+     // Only the language code of the detected locale is carried over.
+     Locale locale = new Locale(detected.get().getLanguage());
+     return Language.fromLocale(locale);
+ }
+
/**
 * Tells whether the two most significant bits of the given byte are {@code 10},
 * i.e. the byte has the form {@code 10xxxxxx} used by UTF-8 continuation bytes.
 *
 * @param i the byte to inspect
 * @return {@code true} if the top two bits are {@code 10}
 */
private boolean isTrailingOctet(byte i) {
    // Keep only bits 7 and 6 and compare against the pattern 10 in place.
    final int topTwoBits = i & 0xC0;
    return topTwoBits == 0x80;
}