diff options
author | Jon Marius Venstad <jonmv@users.noreply.github.com> | 2021-12-18 12:05:59 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-18 12:05:59 +0100 |
commit | db8d449a9f8c93df16874123078c280fb346174f (patch) | |
tree | 9d96823262df3e60d5da93f697758154b1ae93b1 /linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java | |
parent | b4f5820672908823982c69260a8a5df3163aa236 (diff) |
Revert "Replace optimaize with OpenNLP language detector [run-systemtest]"
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java | 92 |
1 files changed, 0 insertions, 92 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java deleted file mode 100644 index 849452aeafd..00000000000 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.opennlp; - -import com.yahoo.language.Language; -import com.yahoo.language.detect.Detection; -import com.yahoo.language.detect.Detector; -import com.yahoo.language.detect.Hint; -import com.yahoo.language.simple.SimpleDetector; -import opennlp.tools.langdetect.LanguageDetectorConfig; -import opennlp.tools.langdetect.LanguageDetectorME; -import opennlp.tools.langdetect.LanguageDetectorModel; - -import java.io.IOException; -import java.io.UncheckedIOException; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; - -import static java.nio.charset.StandardCharsets.UTF_8; - -/** - * Detects the language of some sample text using {@link SimpleDetector} for CJK input, and OpenNLP otherwise. - * - * @author jonmv - */ -class OpenNlpDetector implements Detector { - - private static final Object monitor = new Object(); - private static LanguageDetectorModel model; - - private final SimpleDetector simple = new SimpleDetector(); - private final Map<String, Language> languagesByISO3 = new HashMap<>(); - private final LanguageDetectorME detector; - private final LanguageDetectorConfig config; - - OpenNlpDetector() { - detector = new LanguageDetectorME(loadModel()); - config = new LanguageDetectorConfig(); - config.setMinDiff(0.02); - config.setChunkSize(64); - for (Locale locale : Locale.getAvailableLocales()) - languagesByISO3.put(locale.getISO3Language(), Language.fromLocale(locale)); - } - - private static LanguageDetectorModel loadModel() { - synchronized (monitor) { - if (model == null) { - try { - model = new LanguageDetectorModel(OpenNlpDetector.class.getResourceAsStream("/models/langdetect-183.bin")); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - return model; - } - - @Override - public Detection detect(byte[] input, int offset, int length, Hint hint) { - Charset encoding = Charset.forName(simple.guessEncoding(input, offset, length)); - return new Detection(detectLanguage(new String(input, offset, length, encoding)), encoding.name(), false); - } - - @Override - public Detection detect(ByteBuffer input, Hint hint) { - if (input.hasArray()) - return detect(input.array(), input.arrayOffset() + input.position(), input.remaining(), hint); - - byte[] buffer = new byte[input.remaining()]; - input.get(buffer); - return detect(buffer, 0, buffer.length, hint); - } - - @Override - public Detection detect(String input, Hint hint) { - return new Detection(detectLanguage(input), UTF_8.name(), false); - } - - private Language detectLanguage(String input) { - Language simpleGuess = simple.guessLanguage(input); - if (simpleGuess != Language.UNKNOWN) - return simpleGuess; - - var prediction = detector.probingPredictLanguages(input, config).getLanguages()[0]; - return prediction.getConfidence() > 0.03 ? languagesByISO3.getOrDefault(prediction.getLang(), Language.UNKNOWN) - : Language.UNKNOWN; - } - -} |