diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java | 32 |
1 files changed, 32 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java b/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java new file mode 100644 index 00000000000..8d1eb51b388 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java @@ -0,0 +1,32 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import opennlp.tools.ngram.NGramCharModel; +import opennlp.tools.util.normalizer.CharSequenceNormalizer; + +import java.util.HashSet; +import java.util.Set; + +/** + * Avoids using the unnecessarily slow {@link NGramCharModel}. + * + * @author jonmv + */ +public class DefaultLanguageDetectorContextGenerator extends opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator { + + public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength, CharSequenceNormalizer... normalizers) { + super(minLength, maxLength, normalizers); + } + + @Override + public String[] getContext(CharSequence document) { + int[] normalized = normalizer.normalize(document).codePoints().map(Character::toLowerCase).toArray(); + Set<String> grams = new HashSet<>(); + for (int i = 0; i < normalized.length; i++) + for (int j = minLength; j <= maxLength && i + j < normalized.length; j++) + grams.add(new String(normalized, i, j)); + + return grams.toArray(new String[0]); + } + +} |