summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@oath.com> 2018-11-01 11:12:17 +0100
committer: gjoranv <gv@oath.com> 2019-01-21 15:09:25 +0100
commit: 45c66eac03e6d258209f897b2f5da17212a58f41 (patch)
tree: c83a3bb1ec9246e818403d33c14d9fbef23b270b
parent: 953684a791ac6bb080ecd1c16e77fb57c3fcb85a (diff)
Make SimpleLinguistics simple again
- Remove SimpleLinguistics config and optional use of Optimaize
- Add Optimaize to OpennlpLinguistics; on by default and config to disable
-rw-r--r-- config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java 2
-rw-r--r-- config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java 3
-rw-r--r-- config-model/src/main/javacc/SDParser.jj 2
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java 2
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java 2
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java 2
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java 29
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java 78
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java 22
-rw-r--r-- linguistics/src/main/resources/configdefinitions/simple-linguistics.def 7
10 files changed, 39 insertions, 110 deletions
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java
index 049f5392c04..38a4b641fd4 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java
@@ -426,7 +426,7 @@ public class SDField extends Field implements TypedKey, FieldOperationContainer,
/** Parse an indexing expression which will use the simple linguistics implementatino suitable for testing */
@SuppressWarnings("deprecation")
public void parseIndexingScript(String script) {
- parseIndexingScript(script, new SimpleLinguistics(false));
+ parseIndexingScript(script, new SimpleLinguistics());
}
public void parseIndexingScript(String script, Linguistics linguistics) {
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java
index d1dc68373db..dece0064fcc 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java
@@ -29,7 +29,7 @@ public class IndexingOperation implements FieldOperation {
/** Creates an indexing operation which will use the simple linguistics implementation suitable for testing */
@SuppressWarnings("deprecation")
public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine) throws ParseException {
- return fromStream(input, multiLine, new SimpleLinguistics(false));
+ return fromStream(input, multiLine, new SimpleLinguistics());
}
public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine, Linguistics linguistics)
@@ -51,4 +51,5 @@ public class IndexingOperation implements FieldOperation {
}
return new IndexingOperation(exp);
}
+
}
diff --git a/config-model/src/main/javacc/SDParser.jj b/config-model/src/main/javacc/SDParser.jj
index 19c410b4b98..8bcb92b17cb 100644
--- a/config-model/src/main/javacc/SDParser.jj
+++ b/config-model/src/main/javacc/SDParser.jj
@@ -114,7 +114,7 @@ public class SDParser {
*/
@SuppressWarnings("deprecation")
private IndexingOperation newIndexingOperation(boolean multiline) throws ParseException {
- return newIndexingOperation(multiline, new SimpleLinguistics(false));
+ return newIndexingOperation(multiline, new SimpleLinguistics());
}
/**
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java
index fddbd211e27..50dd7611bb0 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java
@@ -179,7 +179,7 @@ public abstract class Expression extends Selectable {
/** Creates an expression with simple lingustics for testing */
@SuppressWarnings("deprecation")
public static Expression fromString(String expression) throws ParseException {
- return fromString(expression, new SimpleLinguistics(false));
+ return fromString(expression, new SimpleLinguistics());
}
public static Expression fromString(String expression, Linguistics linguistics) throws ParseException {
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java
index 7addca75d2f..320c47103aa 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java
@@ -91,7 +91,7 @@ public final class ScriptExpression extends ExpressionList<StatementExpression>
/** Creates an expression with simple lingustics for testing */
@SuppressWarnings("deprecation")
public static ScriptExpression fromString(String expression) throws ParseException {
- return fromString(expression, new SimpleLinguistics(false));
+ return fromString(expression, new SimpleLinguistics());
}
public static ScriptExpression fromString(String expression, Linguistics linguistics) throws ParseException {
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java
index 16d069d84ec..cf1e808946d 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java
@@ -92,7 +92,7 @@ public final class StatementExpression extends ExpressionList<Expression> {
/** Creates an expression with simple lingustics for testing */
@SuppressWarnings("deprecation")
public static StatementExpression fromString(String expression) throws ParseException {
- return fromString(expression, new SimpleLinguistics(false));
+ return fromString(expression, new SimpleLinguistics());
}
public static StatementExpression fromString(String expression, Linguistics linguistics) throws ParseException {
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
index 38181261d6a..1c7c71c00b6 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
@@ -1,14 +1,43 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;
+import com.google.inject.Inject;
+import com.yahoo.language.detect.Detector;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleDetector;
import com.yahoo.language.simple.SimpleLinguistics;
+/**
+ * Returns a linguistics implementation based on OpenNlp,
+ * and (optionally, default on) Optimaize for language detection.
+ */
public class OpenNlpLinguistics extends SimpleLinguistics {
+ private final Detector detector;
+
+ public OpenNlpLinguistics() {
+ this(true);
+ }
+
+ @Inject
+ public OpenNlpLinguistics(OpennlpLinguisticsConfig config) {
+ this(config.detector().enableOptimaize());
+ }
+
+ public OpenNlpLinguistics(boolean enableOptimaize) {
+ this(enableOptimaize ? new OptimaizeDetector() : new SimpleDetector());
+ }
+
+ private OpenNlpLinguistics(Detector detector) {
+ this.detector = detector;
+ }
+
@Override
public Tokenizer getTokenizer() {
return new OpenNlpTokenizer(getNormalizer(), getTransformer());
}
+ @Override
+ public Detector getDetector() { return detector; }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index 1edfe5c804e..3de0eb3e997 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -1,26 +1,13 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
-import com.google.common.base.Optional;
-import com.optimaize.langdetect.LanguageDetector;
-import com.optimaize.langdetect.LanguageDetectorBuilder;
-import com.optimaize.langdetect.i18n.LdLocale;
-import com.optimaize.langdetect.ngram.NgramExtractors;
-import com.optimaize.langdetect.profiles.LanguageProfile;
-import com.optimaize.langdetect.profiles.LanguageProfileReader;
-import com.optimaize.langdetect.text.CommonTextObjectFactories;
-import com.optimaize.langdetect.text.TextObject;
-import com.optimaize.langdetect.text.TextObjectFactory;
import com.yahoo.language.Language;
import com.yahoo.language.detect.Detection;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.detect.Hint;
import com.yahoo.text.Utf8;
-import java.io.IOException;
import java.nio.ByteBuffer;
-import java.util.List;
-import java.util.Locale;
/**
* Includes functionality for determining the langCode from a sample or from the encoding.
@@ -38,55 +25,6 @@ import java.util.Locale;
*/
public class SimpleDetector implements Detector {
- static private Object initGuard = new Object();
- static private TextObjectFactory textObjectFactory = null;
- static private LanguageDetector languageDetector = null;
-
- static private void initOptimaize (boolean useOptimaize) {
- if (!useOptimaize) return;
- synchronized (initGuard) {
- if ((textObjectFactory != null) && (languageDetector != null)) return;
-
- // origin: https://github.com/optimaize/language-detector
- //load all languages:
- List<LanguageProfile> languageProfiles;
- try {
- languageProfiles = new LanguageProfileReader().readAllBuiltIn();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- //build language detector:
- languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
- .withProfiles(languageProfiles)
- .build();
-
- //create a text object factory
- textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
- }
- }
-
- private final boolean enableOptimaize;
-
- /** @deprecated use OptimaizeDetector to enable optimaize */
- @Deprecated
- SimpleDetector(boolean enableOptimaize) {
- initOptimaize(enableOptimaize);
- this.enableOptimaize = enableOptimaize;
-
- }
-
- @SuppressWarnings("deprecation")
- public SimpleDetector() {
- this(true);
- }
-
- /** @deprecated use OptimaizeDetector to enable optimaize */
- @Deprecated
- public SimpleDetector(SimpleLinguisticsConfig.Detector detector) {
- this(detector.enableOptimaize());
- }
-
@Override
public Detection detect(byte[] input, int offset, int length, Hint hint) {
return new Detection(guessLanguage(input, offset, length), guessEncoding(input), false);
@@ -172,26 +110,10 @@ public class SimpleDetector implements Detector {
return Language.THAI;
}
}
- if (enableOptimaize && Language.UNKNOWN.equals(soFar)){
- return detectLangOptimaize(input);
- }
// got to the end, so return the current best guess
return soFar;
}
- private static Language detectLangOptimaize(String input) {
- if (input == null || input.length() == 0) {
- return Language.UNKNOWN;
- }
- TextObject textObject = textObjectFactory.forText(input);
- Optional<LdLocale> lang = languageDetector.detect(textObject);
- if (lang.isPresent()) {
- String language = lang.get().getLanguage();
- return Language.fromLocale(new Locale(language));
- }
- return Language.UNKNOWN;
- }
-
private boolean isTrailingOctet(byte i) {
return ((i >>> 6) & 3) == 2;
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index b7bf0215ca4..3c2e70b6677 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -17,7 +17,8 @@ import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;
/**
- * Factory of pure Java linguistic processor implementations.
+ * Factory of simple linguistic processor implementations.
+ * Useful for testing and english-only use cases.
*
* @author bratseth
* @author bjorncs
@@ -34,26 +35,9 @@ public class SimpleLinguistics implements Linguistics {
@Inject
@SuppressWarnings("deprecation")
public SimpleLinguistics() {
- this(true);
-
- }
-
- /** @deprecated use OpenNlpLinguistics to get optimaize */
- @Deprecated // OK
- public SimpleLinguistics(boolean enableOptimaize) {
- this(new SimpleDetector(enableOptimaize));
- }
-
- /** @deprecated use OpenNlpLinguistics to get optimaize */
- @Deprecated // OK
- public SimpleLinguistics(SimpleLinguisticsConfig config) {
- this(new SimpleDetector(config.detector()));
- }
-
- private SimpleLinguistics(Detector detector) {
this.normalizer = new SimpleNormalizer();
this.transformer = new SimpleTransformer();
- this.detector = detector;
+ this.detector = new SimpleDetector();
this.characterClasses = new CharacterClasses();
this.gramSplitter = new GramSplitter(characterClasses);
}
diff --git a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
deleted file mode 100644
index 1ddca52c443..00000000000
--- a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-# Deprecated: Do not use
-namespace=language.simple
-
-# Enable Optimaize language detector
-detector.enableOptimaize bool default=true
-