From 3471a55fc2f4775bfac13f2969f902eaea519e1d Mon Sep 17 00:00:00 2001 From: Jon Marius Venstad Date: Mon, 20 Dec 2021 11:45:07 +0100 Subject: Upper bound on input size, and use opennlp before simple detector --- .../main/java/com/yahoo/language/opennlp/OpenNlpDetector.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'linguistics/src/main/java/com/yahoo') diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java index 41e7c8a3ee8..cb4580a9c01 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java @@ -40,6 +40,7 @@ class OpenNlpDetector implements Detector { config = new LanguageDetectorConfig(); config.setMinDiff(0.02); config.setChunkSize(64); + config.setMaxLength(256); for (Locale locale : Locale.getAvailableLocales()) { Language language = Language.fromLocale(locale); if (language != null) @@ -83,13 +84,9 @@ class OpenNlpDetector implements Detector { } private Language detectLanguage(String input) { - Language simpleGuess = simple.guessLanguage(input); - if (simpleGuess != Language.UNKNOWN) - return simpleGuess; - var prediction = detector.probingPredictLanguages(input, config).getLanguages()[0]; - return prediction.getConfidence() > 0.03 ? languagesByISO3.getOrDefault(prediction.getLang(), Language.UNKNOWN) - : Language.UNKNOWN; + var result = prediction.getConfidence() > 0.02 ? languagesByISO3.get(prediction.getLang()) : null; + return result != null ? result : simple.guessLanguage(input.substring(0, Math.min(input.length(), 256))); } } -- cgit v1.2.3