diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java | 29 |
1 files changed, 29 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java new file mode 100644 index 00000000000..aa4387bcc45 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java @@ -0,0 +1,29 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator; +import opennlp.tools.langdetect.LanguageDetectorContextGenerator; +import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; +import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; +import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; +import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; + +/** + * Overrides the UrlCharSequenceNormalizer, which has a bad regex, until fixed: https://issues.apache.org/jira/browse/OPENNLP-1350 + * + * @author jonmv + */ +@SuppressWarnings("unused") // Loaded by black magic. +public class LanguageDetectorFactory extends opennlp.tools.langdetect.LanguageDetectorFactory { + + @Override + public LanguageDetectorContextGenerator getContextGenerator() { + return new DefaultLanguageDetectorContextGenerator(1, 3, + EmojiCharSequenceNormalizer.getInstance(), + UrlCharSequenceNormalizer.getInstance(), + TwitterCharSequenceNormalizer.getInstance(), + NumberCharSequenceNormalizer.getInstance(), + ShrinkCharSequenceNormalizer.getInstance()); + } + +} |