summary refs log tree commit diff stats
path: root/linguistics
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@oath.com> 2018-11-01 11:12:17 +0100
committer gjoranv <gv@oath.com> 2019-01-21 15:09:25 +0100
commit 45c66eac03e6d258209f897b2f5da17212a58f41 (patch)
tree c83a3bb1ec9246e818403d33c14d9fbef23b270b /linguistics
parent 953684a791ac6bb080ecd1c16e77fb57c3fcb85a (diff)
Make SimpleLinguistics simple again
- Remove the SimpleLinguistics config and its optional use of Optimaize.
- Add Optimaize to OpenNlpLinguistics; it is on by default, with a config option to disable it.
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java 29
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java 78
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java 22
-rw-r--r-- linguistics/src/main/resources/configdefinitions/simple-linguistics.def 7
4 files changed, 32 insertions, 104 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
index 38181261d6a..1c7c71c00b6 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
@@ -1,14 +1,43 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;
+import com.google.inject.Inject;
+import com.yahoo.language.detect.Detector;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleDetector;
import com.yahoo.language.simple.SimpleLinguistics;
+/**
+ * Returns a linguistics implementation based on OpenNlp,
+ * and (optionally, default on) Optimaize for language detection.
+ */
public class OpenNlpLinguistics extends SimpleLinguistics {
+ private final Detector detector;
+
+ public OpenNlpLinguistics() {
+ this(true);
+ }
+
+ @Inject
+ public OpenNlpLinguistics(OpennlpLinguisticsConfig config) {
+ this(config.detector().enableOptimaize());
+ }
+
+ public OpenNlpLinguistics(boolean enableOptimaize) {
+ this(enableOptimaize ? new OptimaizeDetector() : new SimpleDetector());
+ }
+
+ private OpenNlpLinguistics(Detector detector) {
+ this.detector = detector;
+ }
+
@Override
public Tokenizer getTokenizer() {
return new OpenNlpTokenizer(getNormalizer(), getTransformer());
}
+ @Override
+ public Detector getDetector() { return detector; }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index 1edfe5c804e..3de0eb3e997 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -1,26 +1,13 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
-import com.google.common.base.Optional;
-import com.optimaize.langdetect.LanguageDetector;
-import com.optimaize.langdetect.LanguageDetectorBuilder;
-import com.optimaize.langdetect.i18n.LdLocale;
-import com.optimaize.langdetect.ngram.NgramExtractors;
-import com.optimaize.langdetect.profiles.LanguageProfile;
-import com.optimaize.langdetect.profiles.LanguageProfileReader;
-import com.optimaize.langdetect.text.CommonTextObjectFactories;
-import com.optimaize.langdetect.text.TextObject;
-import com.optimaize.langdetect.text.TextObjectFactory;
import com.yahoo.language.Language;
import com.yahoo.language.detect.Detection;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.detect.Hint;
import com.yahoo.text.Utf8;
-import java.io.IOException;
import java.nio.ByteBuffer;
-import java.util.List;
-import java.util.Locale;
/**
* Includes functionality for determining the langCode from a sample or from the encoding.
@@ -38,55 +25,6 @@ import java.util.Locale;
*/
public class SimpleDetector implements Detector {
- static private Object initGuard = new Object();
- static private TextObjectFactory textObjectFactory = null;
- static private LanguageDetector languageDetector = null;
-
- static private void initOptimaize (boolean useOptimaize) {
- if (!useOptimaize) return;
- synchronized (initGuard) {
- if ((textObjectFactory != null) && (languageDetector != null)) return;
-
- // origin: https://github.com/optimaize/language-detector
- //load all languages:
- List<LanguageProfile> languageProfiles;
- try {
- languageProfiles = new LanguageProfileReader().readAllBuiltIn();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- //build language detector:
- languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
- .withProfiles(languageProfiles)
- .build();
-
- //create a text object factory
- textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
- }
- }
-
- private final boolean enableOptimaize;
-
- /** @deprecated use OptimaizeDetector to enable optimaize */
- @Deprecated
- SimpleDetector(boolean enableOptimaize) {
- initOptimaize(enableOptimaize);
- this.enableOptimaize = enableOptimaize;
-
- }
-
- @SuppressWarnings("deprecation")
- public SimpleDetector() {
- this(true);
- }
-
- /** @deprecated use OptimaizeDetector to enable optimaize */
- @Deprecated
- public SimpleDetector(SimpleLinguisticsConfig.Detector detector) {
- this(detector.enableOptimaize());
- }
-
@Override
public Detection detect(byte[] input, int offset, int length, Hint hint) {
return new Detection(guessLanguage(input, offset, length), guessEncoding(input), false);
@@ -172,26 +110,10 @@ public class SimpleDetector implements Detector {
return Language.THAI;
}
}
- if (enableOptimaize && Language.UNKNOWN.equals(soFar)){
- return detectLangOptimaize(input);
- }
// got to the end, so return the current best guess
return soFar;
}
- private static Language detectLangOptimaize(String input) {
- if (input == null || input.length() == 0) {
- return Language.UNKNOWN;
- }
- TextObject textObject = textObjectFactory.forText(input);
- Optional<LdLocale> lang = languageDetector.detect(textObject);
- if (lang.isPresent()) {
- String language = lang.get().getLanguage();
- return Language.fromLocale(new Locale(language));
- }
- return Language.UNKNOWN;
- }
-
private boolean isTrailingOctet(byte i) {
return ((i >>> 6) & 3) == 2;
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index b7bf0215ca4..3c2e70b6677 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -17,7 +17,8 @@ import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;
/**
- * Factory of pure Java linguistic processor implementations.
+ * Factory of simple linguistic processor implementations.
+ * Useful for testing and english-only use cases.
*
* @author bratseth
* @author bjorncs
@@ -34,26 +35,9 @@ public class SimpleLinguistics implements Linguistics {
@Inject
@SuppressWarnings("deprecation")
public SimpleLinguistics() {
- this(true);
-
- }
-
- /** @deprecated use OpenNlpLinguistics to get optimaize */
- @Deprecated // OK
- public SimpleLinguistics(boolean enableOptimaize) {
- this(new SimpleDetector(enableOptimaize));
- }
-
- /** @deprecated use OpenNlpLinguistics to get optimaize */
- @Deprecated // OK
- public SimpleLinguistics(SimpleLinguisticsConfig config) {
- this(new SimpleDetector(config.detector()));
- }
-
- private SimpleLinguistics(Detector detector) {
this.normalizer = new SimpleNormalizer();
this.transformer = new SimpleTransformer();
- this.detector = detector;
+ this.detector = new SimpleDetector();
this.characterClasses = new CharacterClasses();
this.gramSplitter = new GramSplitter(characterClasses);
}
diff --git a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
deleted file mode 100644
index 1ddca52c443..00000000000
--- a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-# Deprecated: Do not use
-namespace=language.simple
-
-# Enable Optimaize language detector
-detector.enableOptimaize bool default=true
-