summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
blob: aa4387bcc456ce08e7270e6da0d73c91a69c2fb0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;

import opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator;
import opennlp.tools.langdetect.LanguageDetectorContextGenerator;
import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;

/**
 * Language detector factory whose context generator uses this package's own UrlCharSequenceNormalizer
 * instead of the one shipped with OpenNLP, which has a bad regex.
 * This is a workaround until https://issues.apache.org/jira/browse/OPENNLP-1350 is fixed.
 *
 * @author jonmv
 */
@SuppressWarnings("unused") // Not referenced directly from code: loaded by black magic.
public class LanguageDetectorFactory extends opennlp.tools.langdetect.LanguageDetectorFactory {

    /** Lower length bound passed to the context generator (its n-gram minimum, per the OpenNLP API). */
    private static final int MIN_NGRAM_LENGTH = 1;

    /** Upper length bound passed to the context generator (its n-gram maximum, per the OpenNLP API). */
    private static final int MAX_NGRAM_LENGTH = 3;

    /**
     * Builds a new context generator on each call, configured with the emoji, URL, Twitter, number
     * and shrink normalizers, passed in that order.
     *
     * @return a freshly constructed {@link DefaultLanguageDetectorContextGenerator}
     */
    @Override
    public LanguageDetectorContextGenerator getContextGenerator() {
        LanguageDetectorContextGenerator generator = new DefaultLanguageDetectorContextGenerator(
                MIN_NGRAM_LENGTH,
                MAX_NGRAM_LENGTH,
                EmojiCharSequenceNormalizer.getInstance(),
                // Not imported, so this resolves to the replacement normalizer in this package.
                UrlCharSequenceNormalizer.getInstance(),
                TwitterCharSequenceNormalizer.getInstance(),
                NumberCharSequenceNormalizer.getInstance(),
                ShrinkCharSequenceNormalizer.getInstance());
        return generator;
    }

}