summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@oath.com>2018-11-01 11:53:53 +0100
committerJon Bratseth <bratseth@oath.com>2018-11-01 11:53:53 +0100
commit51b9f0949fc1aea864a74160421e538dc99e17fc (patch)
tree55e13afe3c6b93bb099e32f974f113db7258e937
parentf5eb888d310e546e2d98f9028e9c19833475ec5c (diff)
Deprecate methods and add OptimaizeDetector
-rw-r--r--config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java1
-rw-r--r--config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java3
-rw-r--r--config-model/src/main/javacc/SDParser.jj1
-rw-r--r--container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java2
-rw-r--r--container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java1
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java1
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java1
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java1
-rw-r--r--linguistics/src/main/java/com/yahoo/language/Linguistics.java3
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java102
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java5
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java8
-rw-r--r--linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def6
-rw-r--r--linguistics/src/main/resources/configdefinitions/simple-linguistics.def1
-rw-r--r--linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java35
-rw-r--r--linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java7
16 files changed, 171 insertions, 7 deletions
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java
index dd2ffba20ec..d2d28dadfda 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java
@@ -426,6 +426,7 @@ public class SDField extends Field implements TypedKey, FieldOperationContainer,
}
/** Parse an indexing expression which will use the simple linguistics implementation suitable for testing */
+ @SuppressWarnings("deprecation")
public void parseIndexingScript(String script) {
parseIndexingScript(script, new SimpleLinguistics(false));
}
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java
index cd586960185..d1dc68373db 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java
@@ -12,7 +12,7 @@ import com.yahoo.vespa.indexinglanguage.expressions.StatementExpression;
import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig;
/**
- * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ * @author Einar M R Rosenvinge
*/
public class IndexingOperation implements FieldOperation {
@@ -27,6 +27,7 @@ public class IndexingOperation implements FieldOperation {
}
/** Creates an indexing operation which will use the simple linguistics implementation suitable for testing */
+ @SuppressWarnings("deprecation")
public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine) throws ParseException {
return fromStream(input, multiLine, new SimpleLinguistics(false));
}
diff --git a/config-model/src/main/javacc/SDParser.jj b/config-model/src/main/javacc/SDParser.jj
index d1c67a6d425..9494b1524dd 100644
--- a/config-model/src/main/javacc/SDParser.jj
+++ b/config-model/src/main/javacc/SDParser.jj
@@ -111,6 +111,7 @@ public class SDParser {
*
* @param multiline Whether or not to allow multi-line expressions.
*/
+ @SuppressWarnings("deprecation")
private IndexingOperation newIndexingOperation(boolean multiline) throws ParseException {
return newIndexingOperation(multiline, new SimpleLinguistics(false));
}
diff --git a/container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java b/container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java
index 7bd1b01b3e5..7aefa6f0cf2 100644
--- a/container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java
+++ b/container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java
@@ -18,6 +18,8 @@ import com.yahoo.search.searchchain.Execution;
public final class ParserEnvironment {
private IndexFacts indexFacts = new IndexFacts();
+
+ @SuppressWarnings("deprecation")
private Linguistics linguistics = new SimpleLinguistics(false);
private SpecialTokens specialTokens = new SpecialTokens();
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java
index fbe40494231..db5397e5292 100644
--- a/container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java
+++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java
@@ -61,6 +61,7 @@ public class TestLinguistics implements Linguistics {
}
@Override
+ @Deprecated
public Tuple2<String, Version> getVersion(Linguistics.Component component) {
return linguistics.getVersion(component);
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java
index 231446b9c62..fddbd211e27 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java
@@ -177,6 +177,7 @@ public abstract class Expression extends Selectable {
public abstract DataType createdOutputType();
/** Creates an expression with simple linguistics for testing */
+ @SuppressWarnings("deprecation")
public static Expression fromString(String expression) throws ParseException {
return fromString(expression, new SimpleLinguistics(false));
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java
index 3e9f6ad5032..7addca75d2f 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java
@@ -89,6 +89,7 @@ public final class ScriptExpression extends ExpressionList<StatementExpression>
}
/** Creates an expression with simple linguistics for testing */
+ @SuppressWarnings("deprecation")
public static ScriptExpression fromString(String expression) throws ParseException {
return fromString(expression, new SimpleLinguistics(false));
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java
index 422457d18fa..16d069d84ec 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java
@@ -90,6 +90,7 @@ public final class StatementExpression extends ExpressionList<Expression> {
}
/** Creates an expression with simple linguistics for testing */
+ @SuppressWarnings("deprecation")
public static StatementExpression fromString(String expression) throws ParseException {
return fromString(expression, new SimpleLinguistics(false));
}
diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
index 75cdba0ab40..9006d855faa 100644
--- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
@@ -101,7 +101,10 @@ public interface Linguistics {
/**
* Returns the name and version of a processor component returned by
* this instance.
+ *
+ * @deprecated do not use
*/
+ @Deprecated // OK
Tuple2<String, Version> getVersion(Linguistics.Component component);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
new file mode 100644
index 00000000000..7ba061aaef1
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
@@ -0,0 +1,102 @@
+package com.yahoo.language.opennlp;
+
+import com.google.common.base.Optional;
+import com.optimaize.langdetect.LanguageDetector;
+import com.optimaize.langdetect.LanguageDetectorBuilder;
+import com.optimaize.langdetect.i18n.LdLocale;
+import com.optimaize.langdetect.ngram.NgramExtractors;
+import com.optimaize.langdetect.profiles.LanguageProfile;
+import com.optimaize.langdetect.profiles.LanguageProfileReader;
+import com.optimaize.langdetect.text.CommonTextObjectFactories;
+import com.optimaize.langdetect.text.TextObjectFactory;
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.detect.Hint;
+import com.yahoo.language.simple.SimpleDetector;
+import com.yahoo.text.Utf8;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Detects the language of some sample text using SimpleDetector for CJK and Optimaize otherwise.
+ * SimpleDetector is asked first; Optimaize, whose state is built once and shared by all instances, only when that gives UNKNOWN.
+ * @author bratseth
+ */
+public class OptimaizeDetector implements Detector {
+    // Optimaize state shared by all instances; assigned only in initOptimaize() while holding initGuard
+    private static final Object initGuard = new Object(); // final so the monitor can never be swapped out
+    private static TextObjectFactory textObjectFactory;
+    private static LanguageDetector languageDetector;
+    /** Builds the shared detector and text factory unless both exist; profile read failures become UncheckedIOException. */
+    private static void initOptimaize() {
+        synchronized (initGuard) {
+            if ((textObjectFactory != null) && (languageDetector != null)) return;
+
+            // origin: https://github.com/optimaize/language-detector
+            // load all languages:
+            List<LanguageProfile> languageProfiles;
+            try {
+                languageProfiles = new LanguageProfileReader().readAllBuiltIn();
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+
+            // build language detector:
+            languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
+                                                      .withProfiles(languageProfiles)
+                                                      .build();
+
+            // create a text object factory
+            textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
+        }
+    }
+
+    private final SimpleDetector simpleDetector = new SimpleDetector(); // consulted first; also guesses encoding
+    /** Creates a detector, building the shared Optimaize state if it has not been built yet. */
+    public OptimaizeDetector() {
+        initOptimaize();
+    }
+    /** Detects from bytes [offset, offset+length); NOTE(review): encoding is guessed from the whole array - confirm. */
+    @Override
+    public Detection detect(byte[] input, int offset, int length, Hint hint) {
+        return new Detection(guessLanguage(input, offset, length), simpleDetector.guessEncoding(input), false);
+    }
+    /** Copies the remaining bytes of input into an array (advancing its position) and detects on that copy. */
+    @Override
+    public Detection detect(ByteBuffer input, Hint hint) {
+        byte[] buf = new byte[input.remaining()];
+        input.get(buf, 0, buf.length);
+        return detect(buf, 0, buf.length, hint);
+    }
+    /** Detects the language of input; the encoding reported is always the name of Utf8.getCharset(). */
+    @Override
+    public Detection detect(String input, Hint hint) {
+        return new Detection(guessLanguage(input), Utf8.getCharset().name(), false);
+    }
+    /** Turns the given byte range into a string with Utf8.toString and guesses the language of that. */
+    private Language guessLanguage(byte[] buf, int offset, int length) {
+        return guessLanguage(Utf8.toString(buf, offset, length));
+    }
+    /** Returns UNKNOWN for null or empty input, else SimpleDetector's guess, or Optimaize's when that is UNKNOWN. */
+    public Language guessLanguage(String input) {
+        if (input == null || input.isEmpty()) return Language.UNKNOWN;
+
+        Language result = simpleDetector.guessLanguage(input);
+        if (result != Language.UNKNOWN) return result;
+
+        return guessLanguageUsingOptimaize(input);
+    }
+    /** Maps the locale Optimaize detects, if any, to a Language by its language code; UNKNOWN if none is detected. */
+    private static Language guessLanguageUsingOptimaize(String input) {
+        Optional<LdLocale> result = languageDetector.detect(textObjectFactory.forText(input));
+        if ( ! result.isPresent()) return Language.UNKNOWN;
+
+        return Language.fromLocale(new Locale(result.get().getLanguage()));
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index bcd4492625d..1edfe5c804e 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -68,16 +68,21 @@ public class SimpleDetector implements Detector {
private final boolean enableOptimaize;
+ /** @deprecated use OptimaizeDetector to enable optimaize */
+ @Deprecated
SimpleDetector(boolean enableOptimaize) {
initOptimaize(enableOptimaize);
this.enableOptimaize = enableOptimaize;
}
+ @SuppressWarnings("deprecation")
public SimpleDetector() {
this(true);
}
+ /** @deprecated use OptimaizeDetector to enable optimaize */
+ @Deprecated
public SimpleDetector(SimpleLinguisticsConfig.Detector detector) {
this(detector.enableOptimaize());
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index 8cbbdeeae1d..b7bf0215ca4 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -32,14 +32,20 @@ public class SimpleLinguistics implements Linguistics {
private final GramSplitter gramSplitter;
@Inject
+ @SuppressWarnings("deprecation")
public SimpleLinguistics() {
this(true);
}
+
+ /** @deprecated use OpenNlpLinguistics to get optimaize */
+ @Deprecated // OK
public SimpleLinguistics(boolean enableOptimaize) {
this(new SimpleDetector(enableOptimaize));
}
+ /** @deprecated use OpenNlpLinguistics to get optimaize */
+ @Deprecated // OK
public SimpleLinguistics(SimpleLinguisticsConfig config) {
this(new SimpleDetector(config.detector()));
}
@@ -76,6 +82,8 @@ public class SimpleLinguistics implements Linguistics {
@Override
public CharacterClasses getCharacterClasses() { return characterClasses; }
+ /** @deprecated do not use */
+ @Deprecated // OK
@Override
public Tuple2<String, Version> getVersion(Component component) {
return new Tuple2<>("yahoo", new Version(1, 0));
diff --git a/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def b/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def
new file mode 100644
index 00000000000..13194d471fd
--- /dev/null
+++ b/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def
@@ -0,0 +1,6 @@
+# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=language.opennlp
+
+# Enable Optimaize language detector
+detector.enableOptimaize bool default=true
+
diff --git a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
index d5e7ced7419..1ddca52c443 100644
--- a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
+++ b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
@@ -1,4 +1,5 @@
# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# Deprecated: Do not use
namespace=language.simple
# Enable Optimaize language detector
diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java
new file mode 100644
index 00000000000..ef3248ee0bb
--- /dev/null
+++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java
@@ -0,0 +1,35 @@
+package com.yahoo.language.opennlp;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.simple.SimpleDetector;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author bratseth
+ */
+public class OptimaizeDetectorTestCase {
+ // A single detector instance is shared by every assertion in this class
+ private static final Detector detector = new OptimaizeDetector();
+ /** Checks detection of a short Latin-script string, a Chinese string, and Russian and Hebrew sentences. */
+ @Test
+ public void testDetection() {
+ assertLanguage(Language.UNKNOWN, "Hello!");
+
+ // Handled by SimpleDetector, which OptimaizeDetector consults before Optimaize
+ assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input
+ "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002");
+
+ // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F
+ assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии");
+ // https://he.wikipedia.org/wiki/Yahoo!
+ assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום");
+ }
+ /** Asserts that detecting the language of input, with a null hint, gives the expected language. */
+ private static void assertLanguage(Language language, String input) {
+ assertEquals(language, detector.detect(input, null).getLanguage());
+ }
+
+}
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
index 1905c6d98a9..0f5fbceccf2 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java
@@ -16,7 +16,7 @@ import static org.junit.Assert.assertEquals;
public class SimpleDetectorTestCase {
@Test
- public void requireThatLanguageCanDetected() {
+ public void testDetection() {
assertLanguage(Language.UNKNOWN, "Hello!");
// "Chinese language"
@@ -50,11 +50,6 @@ public class SimpleDetectorTestCase {
// a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)".
assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " +
"\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
-
- // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F
- assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии");
- // https://he.wikipedia.org/wiki/Yahoo!
- assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום");
}
@Test