diff options
10 files changed, 39 insertions, 110 deletions
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java index 049f5392c04..38a4b641fd4 100644 --- a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java @@ -426,7 +426,7 @@ public class SDField extends Field implements TypedKey, FieldOperationContainer, /** Parse an indexing expression which will use the simple linguistics implementatino suitable for testing */ @SuppressWarnings("deprecation") public void parseIndexingScript(String script) { - parseIndexingScript(script, new SimpleLinguistics(false)); + parseIndexingScript(script, new SimpleLinguistics()); } public void parseIndexingScript(String script, Linguistics linguistics) { diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java index d1dc68373db..dece0064fcc 100644 --- a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java @@ -29,7 +29,7 @@ public class IndexingOperation implements FieldOperation { /** Creates an indexing operation which will use the simple linguistics implementation suitable for testing */ @SuppressWarnings("deprecation") public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine) throws ParseException { - return fromStream(input, multiLine, new SimpleLinguistics(false)); + return fromStream(input, multiLine, new SimpleLinguistics()); } public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine, Linguistics linguistics) @@ -51,4 +51,5 @@ public class IndexingOperation implements FieldOperation { } return new IndexingOperation(exp); } + } diff --git a/config-model/src/main/javacc/SDParser.jj b/config-model/src/main/javacc/SDParser.jj index 19c410b4b98..8bcb92b17cb 100644 --- a/config-model/src/main/javacc/SDParser.jj +++ b/config-model/src/main/javacc/SDParser.jj @@ -114,7 +114,7 @@ public class SDParser { */ @SuppressWarnings("deprecation") private IndexingOperation newIndexingOperation(boolean multiline) throws ParseException { - return newIndexingOperation(multiline, new SimpleLinguistics(false)); + return newIndexingOperation(multiline, new SimpleLinguistics()); } /** diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java index fddbd211e27..50dd7611bb0 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java @@ -179,7 +179,7 @@ public abstract class Expression extends Selectable { /** Creates an expression with simple lingustics for testing */ @SuppressWarnings("deprecation") public static Expression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(false)); + return fromString(expression, new SimpleLinguistics()); } public static Expression fromString(String expression, Linguistics linguistics) throws ParseException { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java index 7addca75d2f..320c47103aa 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java @@ -91,7 +91,7 @@ public final class ScriptExpression extends ExpressionList<StatementExpression> /** Creates an expression with simple lingustics for testing */ @SuppressWarnings("deprecation") public static ScriptExpression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(false)); + return fromString(expression, new SimpleLinguistics()); } public static ScriptExpression fromString(String expression, Linguistics linguistics) throws ParseException { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java index 16d069d84ec..cf1e808946d 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java @@ -92,7 +92,7 @@ public final class StatementExpression extends ExpressionList<Expression> { /** Creates an expression with simple lingustics for testing */ @SuppressWarnings("deprecation") public static StatementExpression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(false)); + return fromString(expression, new SimpleLinguistics()); } public static StatementExpression fromString(String expression, Linguistics linguistics) throws ParseException { diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 38181261d6a..1c7c71c00b6 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -1,14 +1,43 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.opennlp; +import com.google.inject.Inject; +import com.yahoo.language.detect.Detector; import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.simple.SimpleDetector; import com.yahoo.language.simple.SimpleLinguistics; +/** + * Returns a linguistics implementation based on OpenNlp, + * and (optionally, default on) Optimaize for language detection. + */ public class OpenNlpLinguistics extends SimpleLinguistics { + private final Detector detector; + + public OpenNlpLinguistics() { + this(true); + } + + @Inject + public OpenNlpLinguistics(OpennlpLinguisticsConfig config) { + this(config.detector().enableOptimaize()); + } + + public OpenNlpLinguistics(boolean enableOptimaize) { + this(enableOptimaize ? new OptimaizeDetector() : new SimpleDetector()); + } + + private OpenNlpLinguistics(Detector detector) { + this.detector = detector; + } + @Override public Tokenizer getTokenizer() { return new OpenNlpTokenizer(getNormalizer(), getTransformer()); } + @Override + public Detector getDetector() { return detector; } + } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index 1edfe5c804e..3de0eb3e997 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -1,26 +1,13 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; -import com.google.common.base.Optional; -import com.optimaize.langdetect.LanguageDetector; -import com.optimaize.langdetect.LanguageDetectorBuilder; -import com.optimaize.langdetect.i18n.LdLocale; -import com.optimaize.langdetect.ngram.NgramExtractors; -import com.optimaize.langdetect.profiles.LanguageProfile; -import com.optimaize.langdetect.profiles.LanguageProfileReader; -import com.optimaize.langdetect.text.CommonTextObjectFactories; -import com.optimaize.langdetect.text.TextObject; -import com.optimaize.langdetect.text.TextObjectFactory; import com.yahoo.language.Language; import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.text.Utf8; -import java.io.IOException; import java.nio.ByteBuffer; -import java.util.List; -import java.util.Locale; /** * Includes functionality for determining the langCode from a sample or from the encoding. @@ -38,55 +25,6 @@ import java.util.Locale; */ public class SimpleDetector implements Detector { - static private Object initGuard = new Object(); - static private TextObjectFactory textObjectFactory = null; - static private LanguageDetector languageDetector = null; - - static private void initOptimaize (boolean useOptimaize) { - if (!useOptimaize) return; - synchronized (initGuard) { - if ((textObjectFactory != null) && (languageDetector != null)) return; - - // origin: https://github.com/optimaize/language-detector - //load all languages: - List<LanguageProfile> languageProfiles; - try { - languageProfiles = new LanguageProfileReader().readAllBuiltIn(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - //build language detector: - languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) - .withProfiles(languageProfiles) - .build(); - - //create a text object factory - textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); - } - } - - private final boolean enableOptimaize; - - /** @deprecated use OptimaizeDetector to enable optimaize */ - @Deprecated - SimpleDetector(boolean enableOptimaize) { - initOptimaize(enableOptimaize); - this.enableOptimaize = enableOptimaize; - - } - - @SuppressWarnings("deprecation") - public SimpleDetector() { - this(true); - } - - /** @deprecated use OptimaizeDetector to enable optimaize */ - @Deprecated - public SimpleDetector(SimpleLinguisticsConfig.Detector detector) { - this(detector.enableOptimaize()); - } - @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { return new Detection(guessLanguage(input, offset, length), guessEncoding(input), false); @@ -172,26 +110,10 @@ public class SimpleDetector implements Detector { return Language.THAI; } } - if (enableOptimaize && Language.UNKNOWN.equals(soFar)){ - return detectLangOptimaize(input); - } // got to the end, so return the current best guess return soFar; } - private static Language detectLangOptimaize(String input) { - if (input == null || input.length() == 0) { - return Language.UNKNOWN; - } - TextObject textObject = textObjectFactory.forText(input); - Optional<LdLocale> lang = languageDetector.detect(textObject); - if (lang.isPresent()) { - String language = lang.get().getLanguage(); - return Language.fromLocale(new Locale(language)); - } - return Language.UNKNOWN; - } - private boolean isTrailingOctet(byte i) { return ((i >>> 6) & 3) == 2; } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index b7bf0215ca4..3c2e70b6677 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -17,7 +17,8 @@ import com.yahoo.language.process.Tokenizer; import com.yahoo.language.process.Transformer; /** - * Factory of pure Java linguistic processor implementations. + * Factory of simple linguistic processor implementations. + * Useful for testing and english-only use cases. * * @author bratseth * @author bjorncs @@ -34,26 +35,9 @@ public class SimpleLinguistics implements Linguistics { @Inject @SuppressWarnings("deprecation") public SimpleLinguistics() { - this(true); - - } - - /** @deprecated use OpenNlpLinguistics to get optimaize */ - @Deprecated // OK - public SimpleLinguistics(boolean enableOptimaize) { - this(new SimpleDetector(enableOptimaize)); - } - - /** @deprecated use OpenNlpLinguistics to get optimaize */ - @Deprecated // OK - public SimpleLinguistics(SimpleLinguisticsConfig config) { - this(new SimpleDetector(config.detector())); - } - - private SimpleLinguistics(Detector detector) { this.normalizer = new SimpleNormalizer(); this.transformer = new SimpleTransformer(); - this.detector = detector; + this.detector = new SimpleDetector(); this.characterClasses = new CharacterClasses(); this.gramSplitter = new GramSplitter(characterClasses); } diff --git a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def deleted file mode 100644 index 1ddca52c443..00000000000 --- a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -# Deprecated: Do not use -namespace=language.simple - -# Enable Optimaize language detector -detector.enableOptimaize bool default=true - |