diff options
Diffstat (limited to 'linguistics/src/main/java')
4 files changed, 118 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/Linguistics.java b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
index 75cdba0ab40..9006d855faa 100644
--- a/linguistics/src/main/java/com/yahoo/language/Linguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/Linguistics.java
@@ -101,7 +101,10 @@ public interface Linguistics {
     /**
      * Returns the name and version of a processor component returned by
      * this instance.
+     *
+     * @deprecated do not use
      */
+    @Deprecated // OK
     Tuple2<String, Version> getVersion(Linguistics.Component component);
 
 }
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
new file mode 100644
index 00000000000..7ba061aaef1
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
@@ -0,0 +1,102 @@
+package com.yahoo.language.opennlp;
+
+import com.google.common.base.Optional;
+import com.optimaize.langdetect.LanguageDetector;
+import com.optimaize.langdetect.LanguageDetectorBuilder;
+import com.optimaize.langdetect.i18n.LdLocale;
+import com.optimaize.langdetect.ngram.NgramExtractors;
+import com.optimaize.langdetect.profiles.LanguageProfile;
+import com.optimaize.langdetect.profiles.LanguageProfileReader;
+import com.optimaize.langdetect.text.CommonTextObjectFactories;
+import com.optimaize.langdetect.text.TextObjectFactory;
+import com.yahoo.language.Language;
+import com.yahoo.language.detect.Detection;
+import com.yahoo.language.detect.Detector;
+import com.yahoo.language.detect.Hint;
+import com.yahoo.language.simple.SimpleDetector;
+import com.yahoo.text.Utf8;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Detects the language of some sample text using SimpleDetector for CJK and Optimaize otherwise.
+ *
+ * @author bratseth
+ */
+public class OptimaizeDetector implements Detector {
+
+    static private Object initGuard = new Object();
+    static private TextObjectFactory textObjectFactory = null;
+    static private LanguageDetector languageDetector = null;
+
+    static private void initOptimaize() {
+        synchronized (initGuard) {
+            if ((textObjectFactory != null) && (languageDetector != null)) return;
+
+            // origin: https://github.com/optimaize/language-detector
+            // load all languages:
+            List<LanguageProfile> languageProfiles;
+            try {
+                languageProfiles = new LanguageProfileReader().readAllBuiltIn();
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+
+            //build language detector:
+            languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
+                    .withProfiles(languageProfiles)
+                    .build();
+
+            //create a text object factory
+            textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
+        }
+    }
+
+    private SimpleDetector simpleDetector = new SimpleDetector();
+
+    public OptimaizeDetector() {
+        initOptimaize();
+    }
+
+    @Override
+    public Detection detect(byte[] input, int offset, int length, Hint hint) {
+        return new Detection(guessLanguage(input, offset, length), simpleDetector.guessEncoding(input), false);
+    }
+
+    @Override
+    public Detection detect(ByteBuffer input, Hint hint) {
+        byte[] buf = new byte[input.remaining()];
+        input.get(buf, 0, buf.length);
+        return detect(buf, 0, buf.length, hint);
+    }
+
+    @Override
+    public Detection detect(String input, Hint hint) {
+        return new Detection(guessLanguage(input), Utf8.getCharset().name(), false);
+    }
+
+    private Language guessLanguage(byte[] buf, int offset, int length) {
+        return guessLanguage(Utf8.toString(buf, offset, length));
+    }
+
+    public Language guessLanguage(String input) {
+        if (input == null || input.length() == 0) return Language.UNKNOWN;
+
+        Language result = simpleDetector.guessLanguage(input);
+        if (result != Language.UNKNOWN) return result;
+
+        return guessLanguageUsingOptimaize(input);
+    }
+
+    private static Language guessLanguageUsingOptimaize(String input) {
+        Optional<LdLocale> result = languageDetector.detect(textObjectFactory.forText(input));
+        if ( ! result.isPresent()) return Language.UNKNOWN;
+
+        return Language.fromLocale(new Locale(result.get().getLanguage()));
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index bcd4492625d..1edfe5c804e 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -68,16 +68,21 @@ public class SimpleDetector implements Detector {
 
     private final boolean enableOptimaize;
 
+    /** @deprecated use OptimaizeDetector to enable optimaize */
+    @Deprecated
     SimpleDetector(boolean enableOptimaize) {
         initOptimaize(enableOptimaize);
         this.enableOptimaize = enableOptimaize;
 
     }
 
+    @SuppressWarnings("deprecation")
     public SimpleDetector() {
         this(true);
     }
 
+    /** @deprecated use OptimaizeDetector to enable optimaize */
+    @Deprecated
     public SimpleDetector(SimpleLinguisticsConfig.Detector detector) {
         this(detector.enableOptimaize());
     }
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index 8cbbdeeae1d..b7bf0215ca4 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -32,14 +32,20 @@ public class SimpleLinguistics implements Linguistics {
     private final GramSplitter gramSplitter;
 
     @Inject
+    @SuppressWarnings("deprecation")
     public SimpleLinguistics() {
         this(true);
 
     }
+
+    /** @deprecated use OpenNlpLinguistics to get optimaize */
+    @Deprecated // OK
     public SimpleLinguistics(boolean enableOptimaize) {
         this(new SimpleDetector(enableOptimaize));
     }
 
+    /** @deprecated use OpenNlpLinguistics to get optimaize */
+    @Deprecated // OK
     public SimpleLinguistics(SimpleLinguisticsConfig config) {
         this(new SimpleDetector(config.detector()));
     }
@@ -76,6 +82,8 @@ public class SimpleLinguistics implements Linguistics {
     @Override
     public CharacterClasses getCharacterClasses() { return characterClasses; }
 
+    /** @deprecated do not use */
+    @Deprecated // OK
     @Override
     public Tuple2<String, Version> getVersion(Component component) {
         return new Tuple2<>("yahoo", new Version(1, 0));