summaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
authorArne Juul <arnej@yahooinc.com>2022-10-06 14:01:37 +0000
committerArne Juul <arnej@yahooinc.com>2022-10-06 14:19:34 +0000
commit91629f81f8425b46e71026b1e733dad2f8ea270c (patch)
tree9bccfe6fcb47ce668c576ee29da8afa2ebea1037 /linguistics
parent3cbbac35a188b578f1360ede59de6175b5d43665 (diff)
much simpler CharSequenceNormalizer
Diffstat (limited to 'linguistics')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java10
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java51
-rw-r--r--linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java48
3 files changed, 100 insertions, 9 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
index 305aead056b..0cf4634c6c3 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java
@@ -2,10 +2,6 @@
package com.yahoo.language.opennlp;
import opennlp.tools.langdetect.LanguageDetectorContextGenerator;
-import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
/**
* Overrides the UrlCharSequenceNormalizer, which has a bad regex, until fixed: https://issues.apache.org/jira/browse/OPENNLP-1350
@@ -18,11 +14,7 @@ public class LanguageDetectorFactory extends opennlp.tools.langdetect.LanguageDe
@Override
public LanguageDetectorContextGenerator getContextGenerator() {
return new DefaultLanguageDetectorContextGenerator(1, 3,
- EmojiCharSequenceNormalizer.getInstance(),
- UrlCharSequenceNormalizer.getInstance(),
- TwitterCharSequenceNormalizer.getInstance(),
- NumberCharSequenceNormalizer.getInstance(),
- ShrinkCharSequenceNormalizer.getInstance());
+ // A single Vespa normalizer now takes the place of the five normalizers removed above.
+ VespaCharSequenceNormalizer.getInstance());
}
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java
new file mode 100644
index 00000000000..df8f3fad520
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java
@@ -0,0 +1,51 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+import java.util.function.IntConsumer;
+import java.util.stream.IntStream;
+
+/**
+ * Character sequence normalizer that keeps only letters, as decided by
+ * {@link WordCharDetector#isWordChar(int)}, converts them to lower case, and
+ * replaces every run of non-letter code points with a single space.
+ *
+ * A run of non-letters at the start of the text yields one leading space;
+ * a run at the end of the text yields nothing.
+ *
+ * @author arnej
+ */
+public class VespaCharSequenceNormalizer implements CharSequenceNormalizer {
+
+    private static final VespaCharSequenceNormalizer INSTANCE = new VespaCharSequenceNormalizer();
+
+    /** Returns the shared instance of this normalizer. */
+    public static VespaCharSequenceNormalizer getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Filter replacing sequences of non-letters with a single space.
+     *
+     * The filter is stateful: it remembers whether non-letters were seen since the
+     * last emitted letter. Use a fresh instance per stream, and only on a stream
+     * that is processed in encounter order.
+     */
+    static class OnlyLetters implements IntStream.IntMapMultiConsumer {
+
+        // set when one or more non-letters were skipped since the last emitted letter
+        boolean addSpace = false;
+
+        /**
+         * Forwards the lower-cased code point to the target if it is a word character,
+         * preceded by one space if any non-letters were skipped before it.
+         * Non-letters are never forwarded; they are only remembered.
+         *
+         * @param codepoint the Unicode code point to examine
+         * @param target receiver of the normalized code points
+         */
+        @Override
+        public void accept(int codepoint, IntConsumer target) {
+            if (!WordCharDetector.isWordChar(codepoint)) {
+                addSpace = true;
+                return;
+            }
+            if (addSpace) {
+                target.accept(' ');
+                addSpace = false;
+            }
+            target.accept(Character.toLowerCase(codepoint));
+        }
+    }
+
+    /**
+     * Normalizes the given text as described in the class documentation.
+     *
+     * @param text the text to normalize
+     * @return the given text itself if it is empty, otherwise a new sequence
+     *         holding the normalized text
+     */
+    @Override
+    public CharSequence normalize(CharSequence text) {
+        if (text.isEmpty()) {
+            return text;
+        }
+        // Explicit result type keeps the collector's container type obvious.
+        StringBuilder normalized = text
+                .codePoints()
+                .mapMulti(new OnlyLetters())
+                .collect(StringBuilder::new,
+                         StringBuilder::appendCodePoint,
+                         StringBuilder::append);
+        return normalized;
+    }
+
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java
new file mode 100644
index 00000000000..d7e3f88ae8d
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java
@@ -0,0 +1,48 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+/**
+ * Classifies Unicode code points as word characters or not, based solely on
+ * the general category reported by {@link Character#getType(int)}.
+ */
+class WordCharDetector {
+
+    /**
+     * Tells whether the given code point is a letter of any kind.
+     *
+     * Only the five letter categories (uppercase, lowercase, titlecase, modifier
+     * and other letter) count as word characters. Every remaining category is
+     * currently considered a non-word character: marks (non-spacing, enclosing,
+     * combining spacing), numbers (decimal digit, letter number, other number),
+     * punctuation (connector, dash, start, end, initial quote, final quote, other),
+     * symbols (math, currency, modifier, other), separators (space, line,
+     * paragraph), and control, format, surrogate, private use and unassigned
+     * code points.
+     *
+     * @param codepoint the Unicode code point to classify
+     * @return true if the code point belongs to one of the letter categories
+     */
+    public static boolean isWordChar(int codepoint) {
+        return switch (Character.getType(codepoint)) {
+            case Character.UPPERCASE_LETTER,
+                 Character.LOWERCASE_LETTER,
+                 Character.TITLECASE_LETTER,
+                 Character.MODIFIER_LETTER,
+                 Character.OTHER_LETTER -> true;
+            default -> false;
+        };
+    }
+}