summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/main/java/com/yahoo
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/Linguistics.java | 3
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java | 102
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java | 5
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java | 8
4 files changed, 118 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
index 75cdba0ab40..9006d855faa 100644
--- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
@@ -101,7 +101,10 @@ public interface Linguistics {
/**
* Returns the name and version of a processor component returned by
* this instance.
+ *
+ * @param component the kind of processor component to get the name and version of
+ * @return a tuple holding the component's name and its version
+ * @deprecated do not use
*/
+ @Deprecated // OK
Tuple2<String, Version> getVersion(Linguistics.Component component);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
new file mode 100644
index 00000000000..7ba061aaef1
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
@@ -0,0 +1,102 @@
+package com.yahoo.language.opennlp;
+
+import com.google.common.base.Optional;
+import com.optimaize.langdetect.LanguageDetector;
+import com.optimaize.langdetect.LanguageDetectorBuilder;
+import com.optimaize.langdetect.i18n.LdLocale;
+import com.optimaize.langdetect.ngram.NgramExtractors;
+import com.optimaize.langdetect.profiles.LanguageProfile;
+import com.optimaize.langdetect.profiles.LanguageProfileReader;
+import com.optimaize.langdetect.text.CommonTextObjectFactories;
+import com.optimaize.langdetect.text.TextObjectFactory;
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.detect.Hint;
+import com.yahoo.language.simple.SimpleDetector;
+import com.yahoo.text.Utf8;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Detects the language of some sample text using SimpleDetector for CJK and Optimaize otherwise.
+ * <p>
+ * The Optimaize detector and text object factory are built once, on first construction of an
+ * instance, and then shared by all instances through static fields.
+ *
+ * @author bratseth
+ */
+public class OptimaizeDetector implements Detector {
+
+    /** Lock guarding the one-time initialization of the shared Optimaize state. Final so the lock can never be swapped. */
+    private static final Object initGuard = new Object();
+
+    // Written only inside initOptimaize() while holding initGuard, but read without the lock in
+    // guessLanguageUsingOptimaize(); volatile makes those unlocked reads see the initialized values.
+    private static volatile TextObjectFactory textObjectFactory = null;
+    private static volatile LanguageDetector languageDetector = null;
+
+    /**
+     * Builds the shared Optimaize language detector and text object factory from the built-in
+     * language profiles, unless both have already been set.
+     *
+     * @throws UncheckedIOException if reading the built-in language profiles fails
+     */
+    private static void initOptimaize() {
+        synchronized (initGuard) {
+            if (textObjectFactory != null && languageDetector != null) return;
+
+            // origin: https://github.com/optimaize/language-detector
+            // load all languages:
+            List<LanguageProfile> languageProfiles;
+            try {
+                languageProfiles = new LanguageProfileReader().readAllBuiltIn();
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+
+            // build language detector:
+            languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
+                                                      .withProfiles(languageProfiles)
+                                                      .build();
+
+            // create a text object factory
+            textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
+        }
+    }
+
+    /** Tried first by guessLanguage; also supplies the encoding guess for byte input. */
+    private final SimpleDetector simpleDetector = new SimpleDetector();
+
+    /** Creates a detector, initializing the shared Optimaize state if that has not been done yet. */
+    public OptimaizeDetector() {
+        initOptimaize();
+    }
+
+    /**
+     * Detects the language of the given byte range, decoded as UTF-8.
+     *
+     * @param input  the bytes to examine
+     * @param offset the index of the first byte used for language detection
+     * @param length the number of bytes used for language detection
+     * @param hint   not used by this implementation
+     * @return a detection holding the guessed language and the encoding guessed by SimpleDetector
+     */
+    @Override
+    public Detection detect(byte[] input, int offset, int length, Hint hint) {
+        // NOTE(review): the encoding guess is given the whole array, not just [offset, offset + length) - confirm this is intended
+        return new Detection(guessLanguage(input, offset, length), simpleDetector.guessEncoding(input), false);
+    }
+
+    /**
+     * Detects the language of the remaining bytes of the given buffer.
+     * The bytes are read with a relative get, so the buffer's position is advanced to its limit.
+     *
+     * @param input the buffer to read the sample from
+     * @param hint  not used by this implementation
+     * @return the detection produced by the byte array variant of this method
+     */
+    @Override
+    public Detection detect(ByteBuffer input, Hint hint) {
+        byte[] buf = new byte[input.remaining()];
+        input.get(buf, 0, buf.length);
+        return detect(buf, 0, buf.length, hint);
+    }
+
+    /**
+     * Detects the language of the given string.
+     *
+     * @param input the text to examine
+     * @param hint  not used by this implementation
+     * @return a detection holding the guessed language and the name of the UTF-8 charset
+     */
+    @Override
+    public Detection detect(String input, Hint hint) {
+        return new Detection(guessLanguage(input), Utf8.getCharset().name(), false);
+    }
+
+    /** Decodes the given byte range as UTF-8 and guesses the language of the resulting string. */
+    private Language guessLanguage(byte[] buf, int offset, int length) {
+        return guessLanguage(Utf8.toString(buf, offset, length));
+    }
+
+    /**
+     * Guesses the language of the given text: SimpleDetector is asked first, and Optimaize is
+     * consulted only when SimpleDetector answers {@code Language.UNKNOWN}.
+     *
+     * @param input the text to examine, may be null
+     * @return the guessed language, or {@code Language.UNKNOWN} if the input is null or empty
+     *         or no language could be determined
+     */
+    public Language guessLanguage(String input) {
+        if (input == null || input.isEmpty()) return Language.UNKNOWN;
+
+        Language result = simpleDetector.guessLanguage(input);
+        if (result != Language.UNKNOWN) return result;
+
+        return guessLanguageUsingOptimaize(input);
+    }
+
+    /**
+     * Runs the shared Optimaize detector on the given text and maps the language code of the
+     * detected locale to a Language, or returns {@code Language.UNKNOWN} when nothing is detected.
+     */
+    private static Language guessLanguageUsingOptimaize(String input) {
+        Optional<LdLocale> result = languageDetector.detect(textObjectFactory.forText(input));
+        if ( ! result.isPresent()) return Language.UNKNOWN;
+
+        return Language.fromLocale(new Locale(result.get().getLanguage()));
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index bcd4492625d..1edfe5c804e 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -68,16 +68,21 @@ public class SimpleDetector implements Detector {
private final boolean enableOptimaize;
+ /**
+ * Creates a detector: passes the flag to initOptimaize and stores it in this instance.
+ *
+ * @param enableOptimaize presumably whether Optimaize-based detection is enabled (initOptimaize is not shown here - confirm)
+ * @deprecated use {@link com.yahoo.language.opennlp.OptimaizeDetector} to enable optimaize
+ */
+ @Deprecated
SimpleDetector(boolean enableOptimaize) {
initOptimaize(enableOptimaize);
this.enableOptimaize = enableOptimaize;
}
+ @SuppressWarnings("deprecation")
public SimpleDetector() {
this(true); // delegates to the deprecated boolean constructor with enableOptimaize = true, hence the suppression above
}
+ /**
+ * Creates a detector from config by passing the config's enableOptimaize value to the boolean constructor.
+ *
+ * @param detector the detector config whose enableOptimaize() value is used
+ * @deprecated use {@link com.yahoo.language.opennlp.OptimaizeDetector} to enable optimaize
+ */
+ @Deprecated
public SimpleDetector(SimpleLinguisticsConfig.Detector detector) {
this(detector.enableOptimaize());
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index 8cbbdeeae1d..b7bf0215ca4 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -32,14 +32,20 @@ public class SimpleLinguistics implements Linguistics {
private final GramSplitter gramSplitter;
@Inject
+ @SuppressWarnings("deprecation")
public SimpleLinguistics() {
this(true); // delegates to the deprecated boolean constructor with enableOptimaize = true, hence the suppression above
}
+
+ /**
+ * Creates an instance backed by a new SimpleDetector constructed with the given flag.
+ *
+ * @param enableOptimaize passed on unchanged to the SimpleDetector constructor
+ * @deprecated use OpenNlpLinguistics to get optimaize
+ */
+ @Deprecated // OK
public SimpleLinguistics(boolean enableOptimaize) {
this(new SimpleDetector(enableOptimaize));
}
+ /**
+ * Creates an instance backed by a new SimpleDetector constructed from the detector section of the given config.
+ *
+ * @param config the config whose detector() value is passed to the SimpleDetector constructor
+ * @deprecated use OpenNlpLinguistics to get optimaize
+ */
+ @Deprecated // OK
public SimpleLinguistics(SimpleLinguisticsConfig config) {
this(new SimpleDetector(config.detector()));
}
@@ -76,6 +82,8 @@ public class SimpleLinguistics implements Linguistics {
/** Returns the character classes instance held in this object's characterClasses field. */
@Override
public CharacterClasses getCharacterClasses() { return characterClasses; }
+ /** @deprecated do not use */
+ @Deprecated // OK
@Override
public Tuple2<String, Version> getVersion(Component component) {
return new Tuple2<>("yahoo", new Version(1, 0));