summaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@oath.com>2018-11-01 11:53:53 +0100
committerJon Bratseth <bratseth@oath.com>2018-11-01 11:53:53 +0100
commit51b9f0949fc1aea864a74160421e538dc99e17fc (patch)
tree55e13afe3c6b93bb099e32f974f113db7258e937 /linguistics
parentf5eb888d310e546e2d98f9028e9c19833475ec5c (diff)
Deprecated methods and add OptimaizeDetector
Diffstat (limited to 'linguistics')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/Linguistics.java3
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java102
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java5
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java8
-rw-r--r--linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def6
-rw-r--r--linguistics/src/main/resources/configdefinitions/simple-linguistics.def1
-rw-r--r--linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java35
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java7
8 files changed, 161 insertions, 6 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
index 75cdba0ab40..9006d855faa 100644
--- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
@@ -101,7 +101,10 @@ public interface Linguistics {
/**
* Returns the name and version of a processor component returned by
* this instance.
+ *
+ * @deprecated do not use
*/
+ @Deprecated // OK
Tuple2<String, Version> getVersion(Linguistics.Component component);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
new file mode 100644
index 00000000000..7ba061aaef1
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
@@ -0,0 +1,102 @@
+package com.yahoo.language.opennlp;
+
+import com.google.common.base.Optional;
+import com.optimaize.langdetect.LanguageDetector;
+import com.optimaize.langdetect.LanguageDetectorBuilder;
+import com.optimaize.langdetect.i18n.LdLocale;
+import com.optimaize.langdetect.ngram.NgramExtractors;
+import com.optimaize.langdetect.profiles.LanguageProfile;
+import com.optimaize.langdetect.profiles.LanguageProfileReader;
+import com.optimaize.langdetect.text.CommonTextObjectFactories;
+import com.optimaize.langdetect.text.TextObjectFactory;
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.detect.Hint;
+import com.yahoo.language.simple.SimpleDetector;
+import com.yahoo.text.Utf8;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Detects the language of some sample text: {@link SimpleDetector} is consulted first (the original
+ * author notes this is what handles CJK), and the Optimaize language-detector library is used
+ * whenever the simple detector answers {@link Language#UNKNOWN}.
+ *
+ * <p>The Optimaize detector and text object factory live in static fields. They are built once,
+ * under a lock, when the first instance is constructed, and are then reused by every instance.</p>
+ *
+ * @author bratseth
+ */
+public class OptimaizeDetector implements Detector {
+
+    /** Monitor guarding the one-time setup of the static Optimaize state. Final so the lock object can never be swapped. */
+    private static final Object initGuard = new Object();
+    private static TextObjectFactory textObjectFactory = null;
+    private static LanguageDetector languageDetector = null;
+
+    /**
+     * Builds the static Optimaize detector and text object factory unless both are already set.
+     *
+     * @throws UncheckedIOException if reading the built-in language profiles fails
+     */
+    private static void initOptimaize() {
+        synchronized (initGuard) {
+            if ((textObjectFactory != null) && (languageDetector != null)) return;
+
+            // origin: https://github.com/optimaize/language-detector
+            // load all languages:
+            List<LanguageProfile> languageProfiles;
+            try {
+                languageProfiles = new LanguageProfileReader().readAllBuiltIn();
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+
+            // build language detector:
+            languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
+                                                      .withProfiles(languageProfiles)
+                                                      .build();
+
+            // create a text object factory
+            textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
+        }
+    }
+
+    /** First-stage detector; also used to guess the encoding of byte input. */
+    private final SimpleDetector simpleDetector = new SimpleDetector();
+
+    /** Creates a detector, setting up the static Optimaize state if no earlier instance has done so. */
+    public OptimaizeDetector() {
+        initOptimaize();
+    }
+
+    /**
+     * Detects language and encoding of {@code length} bytes of {@code input} starting at {@code offset}.
+     *
+     * @param input  the bytes holding the sample
+     * @param offset index of the first byte of the sample
+     * @param length number of bytes in the sample
+     * @param hint   not used by this implementation
+     * @return a detection carrying the guessed language and the guessed encoding name
+     */
+    @Override
+    public Detection detect(byte[] input, int offset, int length, Hint hint) {
+        // Language first, as before: a bad offset/length surfaces from the UTF-8 decoding step.
+        Language language = guessLanguage(input, offset, length);
+
+        // Guess the encoding from the same span the language was detected on, not from the whole array.
+        // The copy is skipped when the span already is the whole array.
+        byte[] sample = (offset == 0 && length == input.length)
+                        ? input
+                        : java.util.Arrays.copyOfRange(input, offset, offset + length);
+        return new Detection(language, simpleDetector.guessEncoding(sample), false);
+    }
+
+    /**
+     * Detects language and encoding of the remaining bytes of {@code input}.
+     * The bytes are read with a relative bulk get, so the buffer's position is advanced to its limit.
+     *
+     * @param input the buffer holding the sample
+     * @param hint  not used by this implementation
+     */
+    @Override
+    public Detection detect(ByteBuffer input, Hint hint) {
+        byte[] buf = new byte[input.remaining()];
+        input.get(buf, 0, buf.length);
+        return detect(buf, 0, buf.length, hint);
+    }
+
+    /**
+     * Detects the language of {@code input}; the reported encoding is always the UTF-8 charset name.
+     *
+     * @param input the sample text
+     * @param hint  not used by this implementation
+     */
+    @Override
+    public Detection detect(String input, Hint hint) {
+        return new Detection(guessLanguage(input), Utf8.getCharset().name(), false);
+    }
+
+    /** Decodes the given span as UTF-8 and guesses the language of the resulting string. */
+    private Language guessLanguage(byte[] buf, int offset, int length) {
+        return guessLanguage(Utf8.toString(buf, offset, length));
+    }
+
+    /**
+     * Guesses the language of {@code input}.
+     *
+     * @param input the sample text, may be null
+     * @return {@link Language#UNKNOWN} for null or empty input; otherwise the simple detector's answer
+     *         if it is not UNKNOWN, else the answer from Optimaize
+     */
+    public Language guessLanguage(String input) {
+        if (input == null || input.isEmpty()) return Language.UNKNOWN;
+
+        Language result = simpleDetector.guessLanguage(input);
+        if (result != Language.UNKNOWN) return result;
+
+        return guessLanguageUsingOptimaize(input);
+    }
+
+    /** Asks Optimaize for the language of the input; UNKNOWN when it reports no result. */
+    private static Language guessLanguageUsingOptimaize(String input) {
+        Optional<LdLocale> result = languageDetector.detect(textObjectFactory.forText(input));
+        if ( ! result.isPresent()) return Language.UNKNOWN;
+
+        return Language.fromLocale(new Locale(result.get().getLanguage()));
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index bcd4492625d..1edfe5c804e 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -68,16 +68,21 @@ public class SimpleDetector implements Detector {
private final boolean enableOptimaize;
+ /** @deprecated use OptimaizeDetector to enable optimaize */
+ @Deprecated
SimpleDetector(boolean enableOptimaize) {
initOptimaize(enableOptimaize);
this.enableOptimaize = enableOptimaize;
}
+ @SuppressWarnings("deprecation")
public SimpleDetector() {
this(true);
}
+ /** @deprecated use OptimaizeDetector to enable optimaize */
+ @Deprecated
public SimpleDetector(SimpleLinguisticsConfig.Detector detector) {
this(detector.enableOptimaize());
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index 8cbbdeeae1d..b7bf0215ca4 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -32,14 +32,20 @@ public class SimpleLinguistics implements Linguistics {
private final GramSplitter gramSplitter;
@Inject
+ @SuppressWarnings("deprecation")
public SimpleLinguistics() {
this(true);
}
+
+ /** @deprecated use OpenNlpLinguistics to get optimaize */
+ @Deprecated // OK
public SimpleLinguistics(boolean enableOptimaize) {
this(new SimpleDetector(enableOptimaize));
}
+ /** @deprecated use OpenNlpLinguistics to get optimaize */
+ @Deprecated // OK
public SimpleLinguistics(SimpleLinguisticsConfig config) {
this(new SimpleDetector(config.detector()));
}
@@ -76,6 +82,8 @@ public class SimpleLinguistics implements Linguistics {
@Override
public CharacterClasses getCharacterClasses() { return characterClasses; }
+ /** @deprecated do not use */
+ @Deprecated // OK
@Override
public Tuple2<String, Version> getVersion(Component component) {
return new Tuple2<>("yahoo", new Version(1, 0));
diff --git a/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def b/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def
new file mode 100644
index 00000000000..13194d471fd
--- /dev/null
+++ b/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def
@@ -0,0 +1,6 @@
+# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=language.opennlp
+
+# Enable Optimaize language detector
+detector.enableOptimaize bool default=true
+
diff --git a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
index d5e7ced7419..1ddca52c443 100644
--- a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
+++ b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
@@ -1,4 +1,5 @@
# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# Deprecated: Do not use
namespace=language.simple
# Enable Optimaize language detector
diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java
new file mode 100644
index 00000000000..ef3248ee0bb
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java
@@ -0,0 +1,35 @@
+package com.yahoo.language.opennlp;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.simple.SimpleDetector;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author bratseth
+ */
+public class OptimaizeDetectorTestCase {
+
+ private static final Detector detector = new OptimaizeDetector();
+
+ @Test
+ public void testDetection() {
+ assertLanguage(Language.UNKNOWN, "Hello!");
+
+ // Test fallback to SimpleDetector
+ assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input
+ "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002");
+
+ // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F
+ assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии");
+ // https://he.wikipedia.org/wiki/Yahoo!
+ assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום");
+ }
+
+ private static void assertLanguage(Language language, String input) {
+ assertEquals(language, detector.detect(input, null).getLanguage());
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
index 1905c6d98a9..0f5fbceccf2 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
@@ -16,7 +16,7 @@ import static org.junit.Assert.assertEquals;
public class SimpleDetectorTestCase {
@Test
- public void requireThatLanguageCanDetected() {
+ public void testDetection() {
assertLanguage(Language.UNKNOWN, "Hello!");
// "Chinese language"
@@ -50,11 +50,6 @@ public class SimpleDetectorTestCase {
// a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
"\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
-
- // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F
- assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии");
- // https://he.wikipedia.org/wiki/Yahoo!
- assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום");
}
@Test