diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language')
2 files changed, 32 insertions, 1 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java b/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java
new file mode 100644
index 00000000000..8d1eb51b388
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java
@@ -0,0 +1,49 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+import opennlp.tools.ngram.NGramCharModel;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Language detector context generator which builds its character n-grams directly,
+ * and so avoids using the unnecessarily slow {@link NGramCharModel}.
+ *
+ * @author jonmv
+ */
+public class DefaultLanguageDetectorContextGenerator extends opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator {
+
+    /**
+     * Creates a generator; all arguments are passed unchanged to the superclass constructor.
+     *
+     * @param minLength lower bound on n-gram length, forwarded to the superclass
+     * @param maxLength upper bound on n-gram length, forwarded to the superclass
+     * @param normalizers normalizers forwarded to the superclass
+     */
+    public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength, CharSequenceNormalizer... normalizers) {
+        super(minLength, maxLength, normalizers);
+    }
+
+    /**
+     * Returns the distinct, lower-cased n-grams of the normalized document, for every length
+     * from the inherited {@code minLength} to {@code maxLength}. Lengths are counted in code points.
+     *
+     * @param document the text to extract n-grams from
+     * @return the distinct n-grams, in no particular order
+     */
+    @Override
+    public String[] getContext(CharSequence document) {
+        int[] normalized = normalizer.normalize(document).codePoints().map(Character::toLowerCase).toArray();
+        Set<String> grams = new HashSet<>();
+        for (int i = 0; i < normalized.length; i++) {
+            // i + j may equal the array length: that is the gram ending at the last code point.
+            for (int j = minLength; j <= maxLength && i + j <= normalized.length; j++) {
+                grams.add(new String(normalized, i, j));
+            }
+        }
+        return grams.toArray(new String[0]);
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
index aa4387bcc45..fdca5355008 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
@@ -1,7 +1,6 @@
 // Copyright Yahoo.
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.opennlp; -import opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator; import opennlp.tools.langdetect.LanguageDetectorContextGenerator; import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; |