// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;
import opennlp.tools.ngram.NGramCharModel;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
import java.util.HashSet;
import java.util.Set;
/**
* Avoids using the unnecessarily slow {@link NGramCharModel}.
*
* @author jonmv
*/
public class DefaultLanguageDetectorContextGenerator extends opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator {

    /**
     * Creates a context generator emitting character n-grams of lengths {@code minLength}
     * through {@code maxLength}.
     *
     * @param minLength   shortest n-gram length to emit
     * @param maxLength   longest n-gram length to emit
     * @param normalizers normalizers applied to the document before n-gram extraction
     */
    public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength, CharSequenceNormalizer... normalizers) {
        super(minLength, maxLength, normalizers);
    }

    /**
     * Returns the distinct, lower-cased character n-grams of the normalized document.
     * Works on code points rather than chars, so characters outside the BMP are handled correctly.
     *
     * @param document the text to extract language-detection features from
     * @return the distinct n-grams of lengths {@code minLength..maxLength}
     */
    @Override
    @SuppressWarnings("unchecked")
    public <T extends CharSequence> T[] getContext(CharSequence document) {
        int[] normalized = normalizer.normalize(document).codePoints().map(Character::toLowerCase).toArray();
        Set<String> grams = new HashSet<>();
        for (int i = 0; i < normalized.length; i++)
            // <= so that grams ending at the last code point are included; with the previous
            // strict < bound, a one-character document produced no grams at all, diverging
            // from NGramCharModel which this class is meant to mirror.
            for (int j = minLength; j <= maxLength && i + j <= normalized.length; j++)
                grams.add(new String(normalized, i, j));
        return (T[]) grams.toArray(new String[0]);
    }

}
|