summary refs log tree commit diff stats
path: root/linguistics
diff options
context:
space:
mode:
author Bjørn Christian Seime <bjorncs@oath.com> 2018-07-26 12:37:47 +0200
committer Bjørn Christian Seime <bjorncs@oath.com> 2018-07-26 12:37:49 +0200
commit 11abca9b4c5793b442af3e6bb1ae8990768ef8b1 (patch)
tree b2161955d09824f35cd82aae932c8650e843331c /linguistics
parent e854864520d3734ed1baced433b9883f61964cc3 (diff)
Add config for simple-linguistics
Add a config parameter for enabling/disabling optimaize detector
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java 17
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java 31
-rw-r--r-- linguistics/src/main/resources/configdefinitions/simple-linguistics.def 5
3 files changed, 45 insertions, 8 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index 4ae3644d62c..2b31f95675b 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -34,6 +34,7 @@ import java.util.Locale;
* character blocks, so if there are no definitive signs of Japanese then it is assumed that the String is Chinese.
*
* @author Rich Pito
+ * @author bjorncs
*/
public class SimpleDetector implements Detector {
static private TextObjectFactory textObjectFactory;
@@ -58,6 +59,16 @@ public class SimpleDetector implements Detector {
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
}
+ private final boolean enableOptimaize;
+
+ public SimpleDetector() { // no-config constructor
+ this.enableOptimaize = true; // without config, the Optimaize fallback stays enabled
+ }
+
+ public SimpleDetector(SimpleLinguisticsConfig.Detector detector) { // config-driven constructor
+ this.enableOptimaize = detector.enableOptimaize(); // flag is taken verbatim from the detector config
+ }
+
@Override
public Detection detect(byte[] input, int offset, int length, Hint hint) {
return new Detection(guessLanguage(input, offset, length), guessEncoding(input), false);
@@ -75,11 +86,11 @@ public class SimpleDetector implements Detector {
return new Detection(guessLanguage(input), Utf8.getCharset().name(), false);
}
- public static Language guessLanguage(byte[] buf, int offset, int length) {
+ public Language guessLanguage(byte[] buf, int offset, int length) { // converts the byte range with Utf8.toString, then delegates to guessLanguage(String)
return guessLanguage(Utf8.toString(buf, offset, length));
}
- public static Language guessLanguage(String input) {
+ public Language guessLanguage(String input) {
if (input == null || input.length() == 0) {
return Language.UNKNOWN;
}
@@ -143,7 +154,7 @@ public class SimpleDetector implements Detector {
return Language.THAI;
}
}
- if (Language.UNKNOWN.equals(soFar)){
+ if (enableOptimaize && Language.UNKNOWN.equals(soFar)){
return detectLangOptimaize(input);
}
// got to the end, so return the current best guess
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index ad855a18088..cdfd5b4cb58 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
+import com.google.inject.Inject;
import com.yahoo.collections.Tuple2;
import com.yahoo.component.Version;
import com.yahoo.language.Linguistics;
@@ -19,15 +20,35 @@ import com.yahoo.language.process.Transformer;
* Factory of pure Java linguistic processor implementations.
*
* @author bratseth
+ * @author bjorncs
*/
public class SimpleLinguistics implements Linguistics {
// Threadsafe instances
- private final static Normalizer normalizer = new SimpleNormalizer();
- private final static Transformer transformer = new SimpleTransformer();
- private final static Detector detector = new SimpleDetector();
- private final static CharacterClasses characterClasses = new CharacterClasses();
- private final static GramSplitter gramSplitter = new GramSplitter(characterClasses);
+ private final Normalizer normalizer;
+ private final Transformer transformer;
+ private final Detector detector;
+ private final CharacterClasses characterClasses;
+ private final GramSplitter gramSplitter;
+
+ /** Creates an instance whose detector is built without config (Optimaize detection enabled). */
+ @Inject
+ public SimpleLinguistics() {
+ this.normalizer = new SimpleNormalizer();
+ this.transformer = new SimpleTransformer();
+ this.detector = new SimpleDetector();
+ this.characterClasses = new CharacterClasses();
+ this.gramSplitter = new GramSplitter(this.characterClasses); // share the field's instance instead of creating a second CharacterClasses
+ }
+
+ /** Creates an instance whose detector is set up from {@code config.detector()}. */
+ public SimpleLinguistics(SimpleLinguisticsConfig config) {
+ this.normalizer = new SimpleNormalizer();
+ this.transformer = new SimpleTransformer();
+ this.detector = new SimpleDetector(config.detector());
+ this.characterClasses = new CharacterClasses();
+ this.gramSplitter = new GramSplitter(this.characterClasses); // share the field's instance instead of creating a second CharacterClasses
+ }
@Override
public Stemmer getStemmer() { return new StemmerImpl(getTokenizer()); }
diff --git a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
new file mode 100644
index 00000000000..b2e7f6688af
--- /dev/null
+++ b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
@@ -0,0 +1,5 @@
+# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=language.simple
+
+# Enable Optimaize language detector
+detector.enableOptimaize bool default=true
\ No newline at end of file