summary refs log tree commit diff stats
path: root/linguistics
diff options
context:
space:
mode:
author: Jon Marius Venstad <venstad@gmail.com> 2021-12-20 11:45:07 +0100
committer: Jon Marius Venstad <venstad@gmail.com> 2021-12-20 11:45:07 +0100
commit: 3471a55fc2f4775bfac13f2969f902eaea519e1d (patch)
tree: 8cd87f28b39e96cd195f378d32d57dbcfa311274 /linguistics
parent: e4a1aa6c3df7d7e131d0422966e6bdda6b0d71c2 (diff)
Upper bound on input size, and use opennlp before simple detector
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java 9
1 files changed, 3 insertions, 6 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java
index 41e7c8a3ee8..cb4580a9c01 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java
@@ -40,6 +40,7 @@ class OpenNlpDetector implements Detector {
config = new LanguageDetectorConfig();
config.setMinDiff(0.02);
config.setChunkSize(64);
+ config.setMaxLength(256);
for (Locale locale : Locale.getAvailableLocales()) {
Language language = Language.fromLocale(locale);
if (language != null)
@@ -83,13 +84,9 @@ class OpenNlpDetector implements Detector {
}
private Language detectLanguage(String input) {
- Language simpleGuess = simple.guessLanguage(input);
- if (simpleGuess != Language.UNKNOWN)
- return simpleGuess;
-
var prediction = detector.probingPredictLanguages(input, config).getLanguages()[0];
- return prediction.getConfidence() > 0.03 ? languagesByISO3.getOrDefault(prediction.getLang(), Language.UNKNOWN)
- : Language.UNKNOWN;
+ var result = prediction.getConfidence() > 0.02 ? languagesByISO3.get(prediction.getLang()) : null;
+ return result != null ? result : simple.guessLanguage(input.substring(0, Math.min(input.length(), 256)));
}
}