summary refs log tree commit diff stats
path: root/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java 29
1 files changed, 29 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
new file mode 100644
index 00000000000..aa4387bcc45
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
@@ -0,0 +1,29 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+import opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator;
+import opennlp.tools.langdetect.LanguageDetectorContextGenerator;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+
+/**
+ * Language detector factory that uses this package's own UrlCharSequenceNormalizer instead of the OpenNLP one,
+ * which has a bad regex. Temporary, until fixed: https://issues.apache.org/jira/browse/OPENNLP-1350
+ * @author jonmv
+ */
+@SuppressWarnings("unused") // Loaded by black magic.
+public class LanguageDetectorFactory extends opennlp.tools.langdetect.LanguageDetectorFactory {
+    @Override
+    public LanguageDetectorContextGenerator getContextGenerator() {
+        final int minNgramLength = 1;
+        final int maxNgramLength = 3;
+        return new DefaultLanguageDetectorContextGenerator(minNgramLength, maxNgramLength,
+                EmojiCharSequenceNormalizer.getInstance(),
+                UrlCharSequenceNormalizer.getInstance(), // Not imported: resolves to this package's override.
+                TwitterCharSequenceNormalizer.getInstance(),
+                NumberCharSequenceNormalizer.getInstance(),
+                ShrinkCharSequenceNormalizer.getInstance());
+    }
+}