aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/Language.java14
-rw-r--r--linguistics/src/main/java/com/yahoo/language/Linguistics.java18
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java29
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/StemMode.java22
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java78
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java29
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java10
7 files changed, 39 insertions, 161 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/Language.java b/linguistics/src/main/java/com/yahoo/language/Language.java
index ab8bcd4459f..655a9003fb1 100644
--- a/linguistics/src/main/java/com/yahoo/language/Language.java
+++ b/linguistics/src/main/java/com/yahoo/language/Language.java
@@ -529,10 +529,10 @@ public enum Language {
}
/**
- * <p>Convenience method for calling <tt>fromLocale(LocaleFactory.fromLanguageTag(languageTag))</tt>.</p>
+ * <p>Convenience method for calling <code>fromLocale(LocaleFactory.fromLanguageTag(languageTag))</code>.</p>
*
- * @param languageTag The language tag for which the <tt>Language</tt> to return.
- * @return the corresponding <tt>Language</tt>, or {@link #UNKNOWN} if not known.
+ * @param languageTag The language tag for which the <code>Language</code> to return.
+ * @return the corresponding <code>Language</code>, or {@link #UNKNOWN} if not known.
*/
public static Language fromLanguageTag(String languageTag) {
if (languageTag == null) return UNKNOWN;
@@ -540,7 +540,7 @@ public enum Language {
}
/**
- * <p>Returns the <tt>Language</tt> whose {@link #languageCode()} is equal to <tt>locale.getLanguage()</tt>, with
+ * <p>Returns the <code>Language</code> whose {@link #languageCode()} is equal to <code>locale.getLanguage()</code>, with
* the following additions:</p>
* <ul>
* <li>Language code "in" translates to {@link #INDONESIAN}</li>
@@ -551,8 +551,8 @@ public enum Language {
* is "hans", in which case it translates to {@link #CHINESE_SIMPLIFIED}.</li>
* </ul>
*
- * @param locale The locale for which the <tt>Language</tt> to return.
- * @return The corresponding <tt>Language</tt>, or {@link #UNKNOWN} if not known.
+ * @param locale The locale for which the <code>Language</code> to return.
+ * @return The corresponding <code>Language</code>, or {@link #UNKNOWN} if not known.
*/
public static Language fromLocale(Locale locale) {
String str = locale.getLanguage();
@@ -582,7 +582,7 @@ public enum Language {
/**
* Returns the language from an encoding, or {@link #UNKNOWN} if it cannot be determined.
*
- * @param encoding The name of the encoding to derive the <tt>Language</tt> from.
+ * @param encoding The name of the encoding to derive the <code>Language</code> from.
* @return the language given by the encoding, or {@link #UNKNOWN} if not determined.
*/
public static Language fromEncoding(String encoding) {
diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
index 9006d855faa..ec92c5e857e 100644
--- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
@@ -42,15 +42,6 @@ public interface Linguistics {
}
/**
- * The same as new com.yahoo.language.simple.SimpleLinguistics(). Prefer using that directly.
- *
- * @deprecated use new com.yahoo.language.simple.SimpleLinguistics()
- */
- @Deprecated // OK
- // TODO: Remove this field on Vespa 7
- Linguistics SIMPLE = new SimpleLinguistics();
-
- /**
* Returns a thread-unsafe stemmer or lemmatizer.
* This is used at query time to do stemming of search terms to indexes which contain text tokenized
* with stemming turned on
@@ -98,13 +89,4 @@ public interface Linguistics {
/** Returns a thread-unsafe character classes instance. */
CharacterClasses getCharacterClasses();
- /**
- * Returns the name and version of a processor component returned by
- * this instance.
- *
- * @deprecated do not use
- */
- @Deprecated // OK
- Tuple2<String, Version> getVersion(Linguistics.Component component);
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
index 38181261d6a..1c7c71c00b6 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
@@ -1,14 +1,43 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;
+import com.google.inject.Inject;
+import com.yahoo.language.detect.Detector;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleDetector;
import com.yahoo.language.simple.SimpleLinguistics;
+/**
+ * A linguistics implementation based on OpenNlp, which (optionally, default on)
+ * uses Optimaize for language detection.
+ */
public class OpenNlpLinguistics extends SimpleLinguistics {
+ private final Detector detector;
+
+ public OpenNlpLinguistics() {
+ this(true);
+ }
+
+ @Inject
+ public OpenNlpLinguistics(OpennlpLinguisticsConfig config) {
+ this(config.detector().enableOptimaize());
+ }
+
+ public OpenNlpLinguistics(boolean enableOptimaize) {
+ this(enableOptimaize ? new OptimaizeDetector() : new SimpleDetector());
+ }
+
+ private OpenNlpLinguistics(Detector detector) {
+ this.detector = detector;
+ }
+
@Override
public Tokenizer getTokenizer() {
return new OpenNlpTokenizer(getNormalizer(), getTransformer());
}
+ @Override
+ public Detector getDetector() { return detector; }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
index af486f715b0..628f6910c9e 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/StemMode.java
@@ -22,26 +22,4 @@ public enum StemMode {
this.value = value;
}
- /**
- * Returns the stem mode as an int
- *
- * @deprecated do not use
- */
- // TODO: Remove on Vespa 7
- @Deprecated // OK
- public int getValue() {
- return value;
- }
-
- @Deprecated // OK
- // TODO: Remove on Vespa 7
- public static StemMode valueOf(int value) {
- for (StemMode mode : values()) {
- if (mode.value == value) {
- return mode;
- }
- }
- return NONE;
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index 1edfe5c804e..3de0eb3e997 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -1,26 +1,13 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
-import com.google.common.base.Optional;
-import com.optimaize.langdetect.LanguageDetector;
-import com.optimaize.langdetect.LanguageDetectorBuilder;
-import com.optimaize.langdetect.i18n.LdLocale;
-import com.optimaize.langdetect.ngram.NgramExtractors;
-import com.optimaize.langdetect.profiles.LanguageProfile;
-import com.optimaize.langdetect.profiles.LanguageProfileReader;
-import com.optimaize.langdetect.text.CommonTextObjectFactories;
-import com.optimaize.langdetect.text.TextObject;
-import com.optimaize.langdetect.text.TextObjectFactory;
import com.yahoo.language.Language;
import com.yahoo.language.detect.Detection;
import com.yahoo.language.detect.Detector;
import com.yahoo.language.detect.Hint;
import com.yahoo.text.Utf8;
-import java.io.IOException;
import java.nio.ByteBuffer;
-import java.util.List;
-import java.util.Locale;
/**
* Includes functionality for determining the langCode from a sample or from the encoding.
@@ -38,55 +25,6 @@ import java.util.Locale;
*/
public class SimpleDetector implements Detector {
- static private Object initGuard = new Object();
- static private TextObjectFactory textObjectFactory = null;
- static private LanguageDetector languageDetector = null;
-
- static private void initOptimaize (boolean useOptimaize) {
- if (!useOptimaize) return;
- synchronized (initGuard) {
- if ((textObjectFactory != null) && (languageDetector != null)) return;
-
- // origin: https://github.com/optimaize/language-detector
- //load all languages:
- List<LanguageProfile> languageProfiles;
- try {
- languageProfiles = new LanguageProfileReader().readAllBuiltIn();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- //build language detector:
- languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
- .withProfiles(languageProfiles)
- .build();
-
- //create a text object factory
- textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
- }
- }
-
- private final boolean enableOptimaize;
-
- /** @deprecated use OptimaizeDetector to enable optimaize */
- @Deprecated
- SimpleDetector(boolean enableOptimaize) {
- initOptimaize(enableOptimaize);
- this.enableOptimaize = enableOptimaize;
-
- }
-
- @SuppressWarnings("deprecation")
- public SimpleDetector() {
- this(true);
- }
-
- /** @deprecated use OptimaizeDetector to enable optimaize */
- @Deprecated
- public SimpleDetector(SimpleLinguisticsConfig.Detector detector) {
- this(detector.enableOptimaize());
- }
-
@Override
public Detection detect(byte[] input, int offset, int length, Hint hint) {
return new Detection(guessLanguage(input, offset, length), guessEncoding(input), false);
@@ -172,26 +110,10 @@ public class SimpleDetector implements Detector {
return Language.THAI;
}
}
- if (enableOptimaize && Language.UNKNOWN.equals(soFar)){
- return detectLangOptimaize(input);
- }
// got to the end, so return the current best guess
return soFar;
}
- private static Language detectLangOptimaize(String input) {
- if (input == null || input.length() == 0) {
- return Language.UNKNOWN;
- }
- TextObject textObject = textObjectFactory.forText(input);
- Optional<LdLocale> lang = languageDetector.detect(textObject);
- if (lang.isPresent()) {
- String language = lang.get().getLanguage();
- return Language.fromLocale(new Locale(language));
- }
- return Language.UNKNOWN;
- }
-
private boolean isTrailingOctet(byte i) {
return ((i >>> 6) & 3) == 2;
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index b7bf0215ca4..389926f1c1b 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -17,7 +17,8 @@ import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;
/**
- * Factory of pure Java linguistic processor implementations.
+ * Factory of simple linguistic processor implementations.
+ * Useful for testing and English-only use cases.
*
* @author bratseth
* @author bjorncs
@@ -34,26 +35,9 @@ public class SimpleLinguistics implements Linguistics {
@Inject
@SuppressWarnings("deprecation")
public SimpleLinguistics() {
- this(true);
-
- }
-
- /** @deprecated use OpenNlpLinguistics to get optimaize */
- @Deprecated // OK
- public SimpleLinguistics(boolean enableOptimaize) {
- this(new SimpleDetector(enableOptimaize));
- }
-
- /** @deprecated use OpenNlpLinguistics to get optimaize */
- @Deprecated // OK
- public SimpleLinguistics(SimpleLinguisticsConfig config) {
- this(new SimpleDetector(config.detector()));
- }
-
- private SimpleLinguistics(Detector detector) {
this.normalizer = new SimpleNormalizer();
this.transformer = new SimpleTransformer();
- this.detector = detector;
+ this.detector = new SimpleDetector();
this.characterClasses = new CharacterClasses();
this.gramSplitter = new GramSplitter(characterClasses);
}
@@ -82,11 +66,4 @@ public class SimpleLinguistics implements Linguistics {
@Override
public CharacterClasses getCharacterClasses() { return characterClasses; }
- /** @deprecated do not use */
- @Deprecated // OK
- @Override
- public Tuple2<String, Version> getVersion(Component component) {
- return new Tuple2<>("yahoo", new Version(1, 0));
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java
index 7bb13744e20..3b7dcca3bc1 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/kstem/CharacterUtils.java
@@ -24,16 +24,6 @@ public abstract class CharacterUtils {
public static CharacterUtils getInstance() {
return JAVA_5;
}
-
- /**
- * explicitly returns a version matching java 4 semantics
- * @deprecated Only for n-gram backwards compat
- */
- // TODO: Remove on Vespa 7
- @Deprecated // OK
- public static CharacterUtils getJava4Instance() {
- return JAVA_4;
- }
/**
* Returns the code point at the given index of the {@link CharSequence}.