aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
author Jon Marius Venstad <venstad@gmail.com> 2021-12-20 13:42:19 +0100
committer Jon Marius Venstad <venstad@gmail.com> 2021-12-20 13:42:19 +0100
commit a8908be0652c8213dbe232c3a94fe2d74994842c (patch)
tree c7b727ed507d3b667eaca40ff20b6bcbffa6d637 /linguistics
parent 76c192666396a934dc0d419a81e3c67a8e82509d (diff)
Override ngram creation with something less silly
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java 32
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java 1
2 files changed, 32 insertions, 1 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java b/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java
new file mode 100644
index 00000000000..8d1eb51b388
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java
@@ -0,0 +1,32 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+import opennlp.tools.ngram.NGramCharModel;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Language detector context generator that builds the character n-grams of a document directly,
+ * which avoids using the unnecessarily slow {@link NGramCharModel}.
+ *
+ * <p>The document is normalized, lower-cased code point by code point, and every distinct n-gram
+ * whose length (counted in code points) lies between {@code minLength} and {@code maxLength},
+ * inclusive, is emitted.</p>
+ *
+ * @author jonmv
+ */
+public class DefaultLanguageDetectorContextGenerator extends opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator {
+
+    /**
+     * Creates a context generator; all arguments are passed on unchanged to the OpenNLP superclass.
+     *
+     * @param minLength   length, in code points, of the shortest n-gram to generate
+     * @param maxLength   length, in code points, of the longest n-gram to generate
+     * @param normalizers normalizers handed to the superclass
+     */
+    public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength, CharSequenceNormalizer... normalizers) {
+        super(minLength, maxLength, normalizers);
+    }
+
+    /**
+     * Returns the distinct, lower-cased n-grams of the normalized document.
+     *
+     * <p>Grams are collected in a {@link HashSet}, so each gram occurs once and the order of the
+     * returned array is unspecified. A document shorter than {@code minLength} code points after
+     * normalization yields an empty array.</p>
+     *
+     * @param document the text to extract n-grams from
+     * @return the distinct n-grams with lengths in [{@code minLength}, {@code maxLength}]
+     */
+    @Override
+    public String[] getContext(CharSequence document) {
+        // Work on code points rather than chars, so supplementary characters are never split.
+        int[] normalized = normalizer.normalize(document).codePoints().map(Character::toLowerCase).toArray();
+        Set<String> grams = new HashSet<>();
+        for (int start = 0; start < normalized.length; start++) {
+            // "<=" (not "<"): a gram may end exactly on the last code point. The strict comparison
+            // silently dropped every n-gram that touched the end of the document.
+            for (int length = minLength; length <= maxLength && start + length <= normalized.length; length++) {
+                grams.add(new String(normalized, start, length));
+            }
+        }
+
+        return grams.toArray(new String[0]);
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
index aa4387bcc45..fdca5355008 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
@@ -1,7 +1,6 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;
-import opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator;
import opennlp.tools.langdetect.LanguageDetectorContextGenerator;
import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;