diff options
author | Jon Marius Venstad <venstad@gmail.com> | 2021-12-20 11:45:07 +0100 |
---|---|---|
committer | Jon Marius Venstad <venstad@gmail.com> | 2021-12-20 11:45:07 +0100 |
commit | 3471a55fc2f4775bfac13f2969f902eaea519e1d (patch) | |
tree | 8cd87f28b39e96cd195f378d32d57dbcfa311274 /linguistics/src/main/java/com/yahoo | |
parent | e4a1aa6c3df7d7e131d0422966e6bdda6b0d71c2 (diff) |
Upper bound on input size, and use opennlp before simple detector
Diffstat (limited to 'linguistics/src/main/java/com/yahoo')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java | 9 |
1 file changed, 3 insertions, 6 deletions
```diff
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java
index 41e7c8a3ee8..cb4580a9c01 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java
@@ -40,6 +40,7 @@ class OpenNlpDetector implements Detector {
         config = new LanguageDetectorConfig();
         config.setMinDiff(0.02);
         config.setChunkSize(64);
+        config.setMaxLength(256);
         for (Locale locale : Locale.getAvailableLocales()) {
             Language language = Language.fromLocale(locale);
             if (language != null)
@@ -83,13 +84,9 @@ class OpenNlpDetector implements Detector {
     }
 
     private Language detectLanguage(String input) {
-        Language simpleGuess = simple.guessLanguage(input);
-        if (simpleGuess != Language.UNKNOWN)
-            return simpleGuess;
-
         var prediction = detector.probingPredictLanguages(input, config).getLanguages()[0];
-        return prediction.getConfidence() > 0.03 ? languagesByISO3.getOrDefault(prediction.getLang(), Language.UNKNOWN)
-                                                 : Language.UNKNOWN;
+        var result = prediction.getConfidence() > 0.02 ? languagesByISO3.get(prediction.getLang()) : null;
+        return result != null ? result : simple.guessLanguage(input.substring(0, Math.min(input.length(), 256)));
     }
 
 }
```