From 11abca9b4c5793b442af3e6bb1ae8990768ef8b1 Mon Sep 17 00:00:00 2001 From: Bjørn Christian Seime Date: Thu, 26 Jul 2018 12:37:47 +0200 Subject: Add config for simple-linguistics Add a config parameter for enabling/disabling optimaize detector --- .../com/yahoo/language/simple/SimpleDetector.java | 17 +++++++++--- .../yahoo/language/simple/SimpleLinguistics.java | 31 ++++++++++++++++++---- .../configdefinitions/simple-linguistics.def | 5 ++++ 3 files changed, 45 insertions(+), 8 deletions(-) create mode 100644 linguistics/src/main/resources/configdefinitions/simple-linguistics.def diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index 4ae3644d62c..2b31f95675b 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -34,6 +34,7 @@ import java.util.Locale; * character blocks, so if there are no definitive signs of Japanese then it is assumed that the String is Chinese. 
* * @author Rich Pito + * @author bjorncs */ public class SimpleDetector implements Detector { static private TextObjectFactory textObjectFactory; @@ -58,6 +59,16 @@ public class SimpleDetector implements Detector { textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); } + private final boolean enableOptimaize; + + public SimpleDetector() { + this.enableOptimaize = true; + } + + public SimpleDetector(SimpleLinguisticsConfig.Detector detector) { + this.enableOptimaize = detector.enableOptimaize(); + } + @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { return new Detection(guessLanguage(input, offset, length), guessEncoding(input), false); @@ -75,11 +86,11 @@ public class SimpleDetector implements Detector { return new Detection(guessLanguage(input), Utf8.getCharset().name(), false); } - public static Language guessLanguage(byte[] buf, int offset, int length) { + public Language guessLanguage(byte[] buf, int offset, int length) { return guessLanguage(Utf8.toString(buf, offset, length)); } - public static Language guessLanguage(String input) { + public Language guessLanguage(String input) { if (input == null || input.length() == 0) { return Language.UNKNOWN; } @@ -143,7 +154,7 @@ public class SimpleDetector implements Detector { return Language.THAI; } } - if (Language.UNKNOWN.equals(soFar)){ + if (enableOptimaize && Language.UNKNOWN.equals(soFar)){ return detectLangOptimaize(input); } // got to the end, so return the current best guess diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index ad855a18088..cdfd5b4cb58 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. package com.yahoo.language.simple; +import com.google.inject.Inject; import com.yahoo.collections.Tuple2; import com.yahoo.component.Version; import com.yahoo.language.Linguistics; @@ -19,15 +20,35 @@ import com.yahoo.language.process.Transformer; * Factory of pure Java linguistic processor implementations. * * @author bratseth + * @author bjorncs */ public class SimpleLinguistics implements Linguistics { // Threadsafe instances - private final static Normalizer normalizer = new SimpleNormalizer(); - private final static Transformer transformer = new SimpleTransformer(); - private final static Detector detector = new SimpleDetector(); - private final static CharacterClasses characterClasses = new CharacterClasses(); - private final static GramSplitter gramSplitter = new GramSplitter(characterClasses); + private final Normalizer normalizer; + private final Transformer transformer; + private final Detector detector; + private final CharacterClasses characterClasses; + private final GramSplitter gramSplitter; + + @Inject + public SimpleLinguistics() { + // A single CharacterClasses instance is shared by this factory and its GramSplitter + this.normalizer = new SimpleNormalizer(); + this.transformer = new SimpleTransformer(); + this.detector = new SimpleDetector(); + this.characterClasses = new CharacterClasses(); + this.gramSplitter = new GramSplitter(this.characterClasses); + } + + public SimpleLinguistics(SimpleLinguisticsConfig config) { + // A single CharacterClasses instance is shared by this factory and its GramSplitter + this.normalizer = new SimpleNormalizer(); + this.transformer = new SimpleTransformer(); + this.detector = new SimpleDetector(config.detector()); + this.characterClasses = new CharacterClasses(); + this.gramSplitter = new GramSplitter(this.characterClasses); + } @Override public Stemmer getStemmer() { return new StemmerImpl(getTokenizer()); } diff --git a/linguistics/src/main/resources/configdefinitions/simple-linguistics.def
b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def new file mode 100644 index 00000000000..b2e7f6688af --- /dev/null +++ b/linguistics/src/main/resources/configdefinitions/simple-linguistics.def @@ -0,0 +1,5 @@ +# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +namespace=language.simple + +# Enable Optimaize language detector +detector.enableOptimaize bool default=true \ No newline at end of file -- cgit v1.2.3