opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;

import opennlp.tools.ngram.NGramCharModel;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;

import java.util.HashSet;
import java.util.Set;

/**
 * Avoids using the unnecessarily slow {@link NGramCharModel}.
 *
 * <p>The context of a document is the set of distinct, lower-cased n-grams of its normalized text,
 * for every n-gram length from {@code minLength} to {@code maxLength} inclusive.
 * N-grams are built from Unicode code points, not UTF-16 code units.</p>
 *
 * @author jonmv
 */
public class DefaultLanguageDetectorContextGenerator extends opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator {

    /**
     * Creates a context generator; all arguments are passed unchanged to the superclass.
     *
     * @param minLength the shortest n-gram length to generate
     * @param maxLength the longest n-gram length to generate
     * @param normalizers the normalizers handed to the superclass
     */
    public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength, CharSequenceNormalizer... normalizers) {
        super(minLength, maxLength, normalizers);
    }

    /**
     * Returns the distinct n-grams of the given document, after normalization and lower-casing.
     *
     * @param document the text to generate n-grams from
     * @return the distinct n-grams, as a {@code String[]}, in no particular order;
     *         empty if the normalized document is shorter than {@code minLength} code points
     */
    @Override
    @SuppressWarnings("unchecked") // The array is a String[], so this is only safe when T is String or a supertype of it.
    public <T extends CharSequence> T[] getContext(CharSequence document) {
        int[] normalized = normalizer.normalize(document).codePoints().map(Character::toLowerCase).toArray();
        Set<String> grams = new HashSet<>();
        for (int start = 0; start < normalized.length; start++) {
            // The longest gram starting here which still fits. The bound is inclusive, so grams
            // which end on the very last code point of the text are also part of the context.
            int longest = Math.min(maxLength, normalized.length - start);
            for (int length = minLength; length <= longest; length++)
                grams.add(new String(normalized, start, length));
        }
        return (T[]) grams.toArray(new String[0]);
    }

}