diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo')
7 files changed, 39 insertions, 161 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/Language.java b/linguistics/src/main/java/com/yahoo/language/Language.java index ab8bcd4459f..655a9003fb1 100644 --- a/linguistics/src/main/java/com/yahoo/language/Language.java +++ b/linguistics/src/main/java/com/yahoo/language/Language.java @@ -529,10 +529,10 @@ public enum Language { } /** - * <p>Convenience method for calling <tt>fromLocale(LocaleFactory.fromLanguageTag(languageTag))</tt>.</p> + * <p>Convenience method for calling <code>fromLocale(LocaleFactory.fromLanguageTag(languageTag))</code>.</p> * - * @param languageTag The language tag for which the <tt>Language</tt> to return. - * @return the corresponding <tt>Language</tt>, or {@link #UNKNOWN} if not known. + * @param languageTag The language tag for which the <code>Language</code> to return. + * @return the corresponding <code>Language</code>, or {@link #UNKNOWN} if not known. */ public static Language fromLanguageTag(String languageTag) { if (languageTag == null) return UNKNOWN; @@ -540,7 +540,7 @@ public enum Language { } /** - * <p>Returns the <tt>Language</tt> whose {@link #languageCode()} is equal to <tt>locale.getLanguage()</tt>, with + * <p>Returns the <code>Language</code> whose {@link #languageCode()} is equal to <code>locale.getLanguage()</code>, with * the following additions:</p> * <ul> * <li>Language code "in" translates to {@link #INDONESIAN}</li> @@ -551,8 +551,8 @@ public enum Language { * is "hans", in which case it translates to {@link #CHINESE_SIMPLIFIED}.</li> * </ul> * - * @param locale The locale for which the <tt>Language</tt> to return. - * @return The corresponding <tt>Language</tt>, or {@link #UNKNOWN} if not known. + * @param locale The locale for which the <code>Language</code> to return. + * @return The corresponding <code>Language</code>, or {@link #UNKNOWN} if not known. */ public static Language fromLocale(Locale locale) { String str = locale.getLanguage(); @@ -582,7 +582,7 @@ public enum Language { /** * Returns the language from an encoding, or {@link #UNKNOWN} if it cannot be determined. * - * @param encoding The name of the encoding to derive the <tt>Language</tt> from. + * @param encoding The name of the encoding to derive the <code>Language</code> from. * @return the language given by the encoding, or {@link #UNKNOWN} if not determined. */ public static Language fromEncoding(String encoding) { diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java index 9006d855faa..ec92c5e857e 100644 --- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java @@ -42,15 +42,6 @@ public interface Linguistics { } /** - * The same as new com.yahoo.language.simple.SimpleLinguistics(). Prefer using that directly. - * - * @deprecated use new com.yahoo.language.simple.SimpleLinguistics() - */ - @Deprecated // OK - // TODO: Remove this field on Vespa 7 - Linguistics SIMPLE = new SimpleLinguistics(); - - /** * Returns a thread-unsafe stemmer or lemmatizer. * This is used at query time to do stemming of search terms to indexes which contains text tokenized * with stemming turned on @@ -98,13 +89,4 @@ public interface Linguistics { /** Returns a thread-unsafe character classes instance. */ CharacterClasses getCharacterClasses(); - /** - * Returns the name and version of a processor component returned by - * this instance. - * - * @deprecated do not use - */ - @Deprecated // OK - Tuple2<String, Version> getVersion(Linguistics.Component component); - } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 38181261d6a..1c7c71c00b6 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -1,14 +1,43 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.opennlp; +import com.google.inject.Inject; +import com.yahoo.language.detect.Detector; import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.simple.SimpleDetector; import com.yahoo.language.simple.SimpleLinguistics; +/** + * Returns a linguistics implementation based on OpenNlp, + * and (optionally, default on) Optimaize for language detection. + */ public class OpenNlpLinguistics extends SimpleLinguistics { + private final Detector detector; + + public OpenNlpLinguistics() { + this(true); + } + + @Inject + public OpenNlpLinguistics(OpennlpLinguisticsConfig config) { + this(config.detector().enableOptimaize()); + } + + public OpenNlpLinguistics(boolean enableOptimaize) { + this(enableOptimaize ? new OptimaizeDetector() : new SimpleDetector()); + } + + private OpenNlpLinguistics(Detector detector) { + this.detector = detector; + } + @Override public Tokenizer getTokenizer() { return new OpenNlpTokenizer(getNormalizer(), getTransformer()); } + @Override + public Detector getDetector() { return detector; } + } diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java index af486f715b0..628f6910c9e 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java +++ b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java @@ -22,26 +22,4 @@ public enum StemMode { this.value = value; } - /** - * Returns the stem mode as an int - * - * @deprecated do not use - */ - // TODO: Remove on Vespa 7 - @Deprecated // OK - public int getValue() { - return value; - } - - @Deprecated // OK - // TODO: Remove on Vespa 7 - public static StemMode valueOf(int value) { - for (StemMode mode : values()) { - if (mode.value == value) { - return mode; - } - } - return NONE; - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java index 1edfe5c804e..3de0eb3e997 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java @@ -1,26 +1,13 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; -import com.google.common.base.Optional; -import com.optimaize.langdetect.LanguageDetector; -import com.optimaize.langdetect.LanguageDetectorBuilder; -import com.optimaize.langdetect.i18n.LdLocale; -import com.optimaize.langdetect.ngram.NgramExtractors; -import com.optimaize.langdetect.profiles.LanguageProfile; -import com.optimaize.langdetect.profiles.LanguageProfileReader; -import com.optimaize.langdetect.text.CommonTextObjectFactories; -import com.optimaize.langdetect.text.TextObject; -import com.optimaize.langdetect.text.TextObjectFactory; import com.yahoo.language.Language; import com.yahoo.language.detect.Detection; import com.yahoo.language.detect.Detector; import com.yahoo.language.detect.Hint; import com.yahoo.text.Utf8; -import java.io.IOException; import java.nio.ByteBuffer; -import java.util.List; -import java.util.Locale; /** * Includes functionality for determining the langCode from a sample or from the encoding. @@ -38,55 +25,6 @@ import java.util.Locale; */ public class SimpleDetector implements Detector { - static private Object initGuard = new Object(); - static private TextObjectFactory textObjectFactory = null; - static private LanguageDetector languageDetector = null; - - static private void initOptimaize (boolean useOptimaize) { - if (!useOptimaize) return; - synchronized (initGuard) { - if ((textObjectFactory != null) && (languageDetector != null)) return; - - // origin: https://github.com/optimaize/language-detector - //load all languages: - List<LanguageProfile> languageProfiles; - try { - languageProfiles = new LanguageProfileReader().readAllBuiltIn(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - //build language detector: - languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) - .withProfiles(languageProfiles) - .build(); - - //create a text object factory - textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); - } - } - - private final boolean enableOptimaize; - - /** @deprecated use OptimaizeDetector to enable optimaize */ - @Deprecated - SimpleDetector(boolean enableOptimaize) { - initOptimaize(enableOptimaize); - this.enableOptimaize = enableOptimaize; - - } - - @SuppressWarnings("deprecation") - public SimpleDetector() { - this(true); - } - - /** @deprecated use OptimaizeDetector to enable optimaize */ - @Deprecated - public SimpleDetector(SimpleLinguisticsConfig.Detector detector) { - this(detector.enableOptimaize()); - } - @Override public Detection detect(byte[] input, int offset, int length, Hint hint) { return new Detection(guessLanguage(input, offset, length), guessEncoding(input), false); @@ -172,26 +110,10 @@ public class SimpleDetector implements Detector { return Language.THAI; } } - if (enableOptimaize && Language.UNKNOWN.equals(soFar)){ - return detectLangOptimaize(input); - } // got to the end, so return the current best guess return soFar; } - private static Language detectLangOptimaize(String input) { - if (input == null || input.length() == 0) { - return Language.UNKNOWN; - } - TextObject textObject = textObjectFactory.forText(input); - Optional<LdLocale> lang = languageDetector.detect(textObject); - if (lang.isPresent()) { - String language = lang.get().getLanguage(); - return Language.fromLocale(new Locale(language)); - } - return Language.UNKNOWN; - } - private boolean isTrailingOctet(byte i) { return ((i >>> 6) & 3) == 2; } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index b7bf0215ca4..389926f1c1b 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -17,7 +17,8 @@ import com.yahoo.language.process.Tokenizer; import com.yahoo.language.process.Transformer; /** - * Factory of pure Java linguistic processor implementations. + * Factory of simple linguistic processor implementations. + * Useful for testing and english-only use cases. * * @author bratseth * @author bjorncs @@ -34,26 +35,9 @@ public class SimpleLinguistics implements Linguistics { @Inject @SuppressWarnings("deprecation") public SimpleLinguistics() { - this(true); - - } - - /** @deprecated use OpenNlpLinguistics to get optimaize */ - @Deprecated // OK - public SimpleLinguistics(boolean enableOptimaize) { - this(new SimpleDetector(enableOptimaize)); - } - - /** @deprecated use OpenNlpLinguistics to get optimaize */ - @Deprecated // OK - public SimpleLinguistics(SimpleLinguisticsConfig config) { - this(new SimpleDetector(config.detector())); - } - - private SimpleLinguistics(Detector detector) { this.normalizer = new SimpleNormalizer(); this.transformer = new SimpleTransformer(); - this.detector = detector; + this.detector = new SimpleDetector(); this.characterClasses = new CharacterClasses(); this.gramSplitter = new GramSplitter(characterClasses); } @@ -82,11 +66,4 @@ public class SimpleLinguistics implements Linguistics { @Override public CharacterClasses getCharacterClasses() { return characterClasses; } - /** @deprecated do not use */ - @Deprecated // OK - @Override - public Tuple2<String, Version> getVersion(Component component) { - return new Tuple2<>("yahoo", new Version(1, 0)); - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java index 7bb13744e20..3b7dcca3bc1 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java @@ -24,16 +24,6 @@ public abstract class CharacterUtils { public static CharacterUtils getInstance() { return JAVA_5; } - - /** - * explicitly returns a version matching java 4 semantics - * @deprecated Only for n-gram backwards compat - */ - // TODO: Remove on Vespa 7 - @Deprecated // OK - public static CharacterUtils getJava4Instance() { - return JAVA_4; - } /** * Returns the code point at the given index of the {@link CharSequence}. |