diff options
16 files changed, 171 insertions, 7 deletions
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java index dd2ffba20ec..d2d28dadfda 100644 --- a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java @@ -426,6 +426,7 @@ public class SDField extends Field implements TypedKey, FieldOperationContainer, } /** Parse an indexing expression which will use the simple linguistics implementatino suitable for testing */ + @SuppressWarnings("deprecation") public void parseIndexingScript(String script) { parseIndexingScript(script, new SimpleLinguistics(false)); } diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java index cd586960185..d1dc68373db 100644 --- a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java @@ -12,7 +12,7 @@ import com.yahoo.vespa.indexinglanguage.expressions.StatementExpression; import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; /** - * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a> + * @author Einar M R Rosenvinge */ public class IndexingOperation implements FieldOperation { @@ -27,6 +27,7 @@ public class IndexingOperation implements FieldOperation { } /** Creates an indexing operation which will use the simple linguistics implementation suitable for testing */ + @SuppressWarnings("deprecation") public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine) throws ParseException { return fromStream(input, multiLine, new SimpleLinguistics(false)); } diff --git a/config-model/src/main/javacc/SDParser.jj b/config-model/src/main/javacc/SDParser.jj index d1c67a6d425..9494b1524dd 100644 --- a/config-model/src/main/javacc/SDParser.jj +++ b/config-model/src/main/javacc/SDParser.jj @@ -111,6 +111,7 @@ public class SDParser { * * @param multiline Whether or not to allow multi-line expressions. */ + @SuppressWarnings("deprecation") private IndexingOperation newIndexingOperation(boolean multiline) throws ParseException { return newIndexingOperation(multiline, new SimpleLinguistics(false)); } diff --git a/container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java b/container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java index 7bd1b01b3e5..7aefa6f0cf2 100644 --- a/container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java +++ b/container-search/src/main/java/com/yahoo/search/query/parser/ParserEnvironment.java @@ -18,6 +18,8 @@ import com.yahoo.search.searchchain.Execution; public final class ParserEnvironment { private IndexFacts indexFacts = new IndexFacts(); + + @SuppressWarnings("deprecation") private Linguistics linguistics = new SimpleLinguistics(false); private SpecialTokens specialTokens = new SpecialTokens(); diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java index fbe40494231..db5397e5292 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/TestLinguistics.java @@ -61,6 +61,7 @@ public class TestLinguistics implements Linguistics { } @Override + @Deprecated public Tuple2<String, Version> getVersion(Linguistics.Component component) { return linguistics.getVersion(component); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java index 231446b9c62..fddbd211e27 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java @@ -177,6 +177,7 @@ public abstract class Expression extends Selectable { public abstract DataType createdOutputType(); /** Creates an expression with simple lingustics for testing */ + @SuppressWarnings("deprecation") public static Expression fromString(String expression) throws ParseException { return fromString(expression, new SimpleLinguistics(false)); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java index 3e9f6ad5032..7addca75d2f 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java @@ -89,6 +89,7 @@ public final class ScriptExpression extends ExpressionList<StatementExpression> } /** Creates an expression with simple lingustics for testing */ + @SuppressWarnings("deprecation") public static ScriptExpression fromString(String expression) throws ParseException { return fromString(expression, new SimpleLinguistics(false)); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java index 422457d18fa..16d069d84ec 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java @@ -90,6 +90,7 @@ public final class StatementExpression extends ExpressionList<Expression> { } /** Creates an expression with simple lingustics for testing */ + @SuppressWarnings("deprecation") public static StatementExpression fromString(String expression) throws ParseException { return fromString(expression, new SimpleLinguistics(false)); } diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java index 75cdba0ab40..9006d855faa 100644 --- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java @@ -101,7 +101,10 @@ public interface Linguistics { /** * Returns the name and version of a processor component returned by * this instance. + * + * @deprecated do not use */ + @Deprecated // OK Tuple2<String, Version> getVersion(Linguistics.Component component); } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java new file mode 100644 index 00000000000..7ba061aaef1 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java @@ -0,0 +1,102 @@ +package com.yahoo.language.opennlp; + +import com.google.common.base.Optional; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObjectFactory; +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.language.simple.SimpleDetector; +import com.yahoo.text.Utf8; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; + +/** + * Detects the language of some sample text using SimpleDetector for CJK and Optimaize otherwise. + * + * @author bratseth + */ +public class OptimaizeDetector implements Detector { + + static private Object initGuard = new Object(); + static private TextObjectFactory textObjectFactory = null; + static private LanguageDetector languageDetector = null; + + static private void initOptimaize() { + synchronized (initGuard) { + if ((textObjectFactory != null) && (languageDetector != null)) return; + + // origin: https://github.com/optimaize/language-detector + // load all languages: + List<LanguageProfile> languageProfiles; + try { + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + + //build language detector: + languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + + //create a text object factory + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } + } + + private SimpleDetector simpleDetector = new SimpleDetector(); + + public OptimaizeDetector() { + initOptimaize(); + } + + @Override + public Detection detect(byte[] input, int offset, int length, Hint hint) { + return new Detection(guessLanguage(input, offset, length), simpleDetector.guessEncoding(input), false); + } + + @Override + public Detection detect(ByteBuffer input, Hint hint) { + byte[] buf = new byte[input.remaining()]; + input.get(buf, 0, buf.length); + return detect(buf, 0, buf.length, hint); + } + + @Override + public Detection detect(String input, Hint hint) { + return new Detection(guessLanguage(input), Utf8.getCharset().name(), false); + } + + private Language guessLanguage(byte[] buf, int offset, int length) { + return guessLanguage(Utf8.toString(buf, offset, length)); + } + + public Language guessLanguage(String input) { + if (input == null || input.length() == 0) return Language.UNKNOWN; + + Language result = simpleDetector.guessLanguage(input); + if (result != Language.UNKNOWN) return result; + + return guessLanguageUsingOptimaize(input); + } + + private static Language guessLanguageUsingOptimaize(String input) { + Optional<LdLocale> result = languageDetector.detect(textObjectFactory.forText(input)); + if ( ! result.isPresent()) return Language.UNKNOWN; + + return Language.fromLocale(new Locale(result.get().getLanguage())); + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index bcd4492625d..1edfe5c804e 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -68,16 +68,21 @@ public class SimpleDetector implements Detector { private final boolean enableOptimaize; + /** @deprecated use OptimaizeDetector to enable optimaize */ + @Deprecated SimpleDetector(boolean enableOptimaize) { initOptimaize(enableOptimaize); this.enableOptimaize = enableOptimaize; } + @SuppressWarnings("deprecation") public SimpleDetector() { this(true); } + /** @deprecated use OptimaizeDetector to enable optimaize */ + @Deprecated public SimpleDetector(SimpleLinguisticsConfig.Detector detector) { this(detector.enableOptimaize()); } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index 8cbbdeeae1d..b7bf0215ca4 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -32,14 +32,20 @@ public class SimpleLinguistics implements Linguistics { private final GramSplitter gramSplitter; @Inject + @SuppressWarnings("deprecation") public SimpleLinguistics() { this(true); } + + /** @deprecated use OpenNlpLinguistics to get optimaize */ + @Deprecated // OK public SimpleLinguistics(boolean enableOptimaize) { this(new SimpleDetector(enableOptimaize)); } + /** @deprecated use OpenNlpLinguistics to get optimaize */ + @Deprecated // OK public SimpleLinguistics(SimpleLinguisticsConfig config) { this(new SimpleDetector(config.detector())); } @@ -76,6 +82,8 @@ public class SimpleLinguistics implements Linguistics { @Override public CharacterClasses getCharacterClasses() { return characterClasses; } + /** @deprecated do not use */ + @Deprecated // OK @Override public Tuple2<String, Version> getVersion(Component component) { return new Tuple2<>("yahoo", new Version(1, 0)); diff --git a/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def b/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def new file mode 100644 index 00000000000..13194d471fd --- /dev/null +++ b/linguistics/src/main/resources/configdefinitions/opennlp-linguistics.def @@ -0,0 +1,6 @@ +# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +namespace=language.opennlp + +# Enable Optimaize language detector +detector.enableOptimaize bool default=true + diff --git a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def index d5e7ced7419..1ddca52c443 100644 --- a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def +++ b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def @@ -1,4 +1,5 @@ # Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +# Deprecated: Do not use namespace=language.simple # Enable Optimaize language detector diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java new file mode 100644 index 00000000000..ef3248ee0bb --- /dev/null +++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OptimaizeDetectorTestCase.java @@ -0,0 +1,35 @@ +package com.yahoo.language.opennlp; + +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.simple.SimpleDetector; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author bratseth + */ +public class OptimaizeDetectorTestCase { + + private static final Detector detector = new OptimaizeDetector(); + + @Test + public void testDetection() { + assertLanguage(Language.UNKNOWN, "Hello!"); + + // Test fallback to SimpleDetector + assertLanguage(Language.CHINESE_TRADITIONAL, // CHINESE_SIMPLIFIED input + "\u6211\u80FD\u541E\u4E0B\u73BB\u7483\u800C\u4E0D\u4F24\u8EAB\u4F53\u3002"); + + // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F + assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии"); + // https://he.wikipedia.org/wiki/Yahoo! + assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום"); + } + + private static void assertLanguage(Language language, String input) { + assertEquals(language, detector.detect(input, null).getLanguage()); + } + +} diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java index 1905c6d98a9..0f5fbceccf2 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleDetectorTestCase.java @@ -16,7 +16,7 @@ import static org.junit.Assert.assertEquals; public class SimpleDetectorTestCase { @Test - public void requireThatLanguageCanDetected() { + public void testDetection() { assertLanguage(Language.UNKNOWN, "Hello!"); // "Chinese language" @@ -50,11 +50,6 @@ public class SimpleDetectorTestCase { // a string from http://www.columbia.edu/kermit/utf8.html that says "I can eat glass (and it doesn't hurt me)". assertLanguage(Language.KOREAN, "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. " + "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694"); - - // from https://ru.wikipedia.org/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F - assertLanguage(Language.RUSSIAN, "Материал из Википедии — свободной энциклопедии"); - // https://he.wikipedia.org/wiki/Yahoo! - assertLanguage(Language.HEBREW, "אתר יאהו! הוא אחד מאתרי האינטרנט הפופולריים ביותר בעולם, עם מעל 500 מיליון כניסות בכל יום"); } @Test |