blob: 8d1eb51b3887399c271abf35316bb66b4ebb63ec (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;
import opennlp.tools.ngram.NGramCharModel;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
import java.util.HashSet;
import java.util.Set;
/**
 * Avoids using the unnecessarily slow {@link NGramCharModel}.
 *
 * The context is instead built directly from the lower-cased code points of the normalized document.
 *
 * @author jonmv
 */
public class DefaultLanguageDetectorContextGenerator extends opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator {

    /**
     * Creates a context generator by passing all arguments on to the superclass constructor.
     *
     * @param minLength the shortest n-gram length, in code points, to generate
     * @param maxLength the longest n-gram length, in code points, to generate
     * @param normalizers the normalizers handed to the superclass
     */
    public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength, CharSequenceNormalizer... normalizers) {
        super(minLength, maxLength, normalizers);
    }

    /**
     * Returns the distinct n-grams of the given document, after normalization and lower-casing.
     *
     * Every window of {@code minLength} through {@code maxLength} consecutive code points is included,
     * also those which end at the last code point of the document. Grams are collected in a hash set,
     * so each distinct gram occurs once, and the order of the returned array is unspecified.
     *
     * @param document the text to extract n-grams from
     * @return the distinct n-grams found; empty if the normalized document is shorter than {@code minLength}
     */
    @Override
    public String[] getContext(CharSequence document) {
        // Work on code points rather than chars, so a surrogate pair is never split across two grams.
        int[] normalized = normalizer.normalize(document).codePoints().map(Character::toLowerCase).toArray();
        Set<String> grams = new HashSet<>();
        for (int start = 0; start < normalized.length; start++) {
            // A gram covers [start, start + length), so it fits as long as start + length <= normalized.length;
            // a strict '<' here would drop every gram which ends at the final code point.
            for (int length = minLength; length <= maxLength && start + length <= normalized.length; length++) {
                grams.add(new String(normalized, start, length));
            }
        }
        return grams.toArray(new String[0]);
    }

}
|