diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java | 92 |
1 files changed, 92 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java new file mode 100644 index 00000000000..849452aeafd --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java @@ -0,0 +1,92 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.language.simple.SimpleDetector; +import opennlp.tools.langdetect.LanguageDetectorConfig; +import opennlp.tools.langdetect.LanguageDetectorME; +import opennlp.tools.langdetect.LanguageDetectorModel; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Detects the language of some sample text using {@link SimpleDetector} for CJK input, and OpenNLP otherwise. + * + * @author jonmv + */ +class OpenNlpDetector implements Detector { + + private static final Object monitor = new Object(); + private static LanguageDetectorModel model; + + private final SimpleDetector simple = new SimpleDetector(); + private final Map<String, Language> languagesByISO3 = new HashMap<>(); + private final LanguageDetectorME detector; + private final LanguageDetectorConfig config; + + OpenNlpDetector() { + detector = new LanguageDetectorME(loadModel()); + config = new LanguageDetectorConfig(); + config.setMinDiff(0.02); + config.setChunkSize(64); + for (Locale locale : Locale.getAvailableLocales()) + languagesByISO3.put(locale.getISO3Language(), Language.fromLocale(locale)); + } + + private static LanguageDetectorModel loadModel() { + synchronized (monitor) { + if (model == null) { + try { + model = new LanguageDetectorModel(OpenNlpDetector.class.getResourceAsStream("/models/langdetect-183.bin")); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + return model; + } + + @Override + public Detection detect(byte[] input, int offset, int length, Hint hint) { + Charset encoding = Charset.forName(simple.guessEncoding(input, offset, length)); + return new Detection(detectLanguage(new String(input, offset, length, encoding)), encoding.name(), false); + } + + @Override + public Detection detect(ByteBuffer input, Hint hint) { + if (input.hasArray()) + return detect(input.array(), input.arrayOffset() + input.position(), input.remaining(), hint); + + byte[] buffer = new byte[input.remaining()]; + input.get(buffer); + return detect(buffer, 0, buffer.length, hint); + } + + @Override + public Detection detect(String input, Hint hint) { + return new Detection(detectLanguage(input), UTF_8.name(), false); + } + + private Language detectLanguage(String input) { + Language simpleGuess = simple.guessLanguage(input); + if (simpleGuess != Language.UNKNOWN) + return simpleGuess; + + var prediction = detector.probingPredictLanguages(input, config).getLanguages()[0]; + return prediction.getConfidence() > 0.03 ? languagesByISO3.getOrDefault(prediction.getLang(), Language.UNKNOWN) + : Language.UNKNOWN; + } + +} |