diff options
author | Arne Juul <arnej@yahooinc.com> | 2022-10-06 14:01:37 +0000 |
---|---|---|
committer | Arne Juul <arnej@yahooinc.com> | 2022-10-06 14:19:34 +0000 |
commit | 91629f81f8425b46e71026b1e733dad2f8ea270c (patch) | |
tree | 9bccfe6fcb47ce668c576ee29da8afa2ebea1037 /linguistics | |
parent | 3cbbac35a188b578f1360ede59de6175b5d43665 (diff) |
much simpler CharSequenceNormalizer
Diffstat (limited to 'linguistics')
3 files changed, 100 insertions, 9 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java index 305aead056b..0cf4634c6c3 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java @@ -2,10 +2,6 @@ package com.yahoo.language.opennlp; import opennlp.tools.langdetect.LanguageDetectorContextGenerator; -import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; -import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; -import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; -import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; /** * Overrides the UrlCharSequenceNormalizer, which has a bad regex, until fixed: https://issues.apache.org/jira/browse/OPENNLP-1350 @@ -18,11 +14,7 @@ public class LanguageDetectorFactory extends opennlp.tools.langdetect.LanguageDe @Override public LanguageDetectorContextGenerator getContextGenerator() { return new DefaultLanguageDetectorContextGenerator(1, 3, - EmojiCharSequenceNormalizer.getInstance(), - UrlCharSequenceNormalizer.getInstance(), - TwitterCharSequenceNormalizer.getInstance(), - NumberCharSequenceNormalizer.getInstance(), - ShrinkCharSequenceNormalizer.getInstance()); + VespaCharSequenceNormalizer.getInstance()); } } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java new file mode 100644 index 00000000000..df8f3fad520 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java @@ -0,0 +1,51 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.opennlp; + +import opennlp.tools.util.normalizer.CharSequenceNormalizer; + +import java.util.function.IntConsumer; +import java.util.stream.IntStream; + +/** + * Simple normalizer + * + * @author arnej + */ +public class VespaCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final VespaCharSequenceNormalizer INSTANCE = new VespaCharSequenceNormalizer(); + + public static VespaCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + // filter replacing sequences of non-letters with a single space + static class OnlyLetters implements IntStream.IntMapMultiConsumer { + boolean addSpace = false; + public void accept(int codepoint, IntConsumer target) { + if (WordCharDetector.isWordChar(codepoint)) { + if (addSpace) { + target.accept(' '); + addSpace = false; + } + target.accept(Character.toLowerCase(codepoint)); + } else { + addSpace = true; + } + } + } + + public CharSequence normalize(CharSequence text) { + if (text.isEmpty()) { + return text; + } + var r = text + .codePoints() + .mapMulti(new OnlyLetters()) + .collect(StringBuilder::new, + StringBuilder::appendCodePoint, + StringBuilder::append); + return r; + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java new file mode 100644 index 00000000000..d7e3f88ae8d --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java @@ -0,0 +1,48 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
/**
 * Classifies Unicode code points as word characters or not
 * (helper in package {@code com.yahoo.language.opennlp}).
 *
 * Static utility only; not meant to be instantiated or subclassed.
 */
final class WordCharDetector {

    private WordCharDetector() { }

    /**
     * Tells whether a code point is a word character, based on its Unicode general
     * category as reported by {@link Character#getType(int)}.
     *
     * Only the letter categories count as word characters: lowercase, uppercase,
     * titlecase, modifier and other letters. Every other category is currently
     * considered a non-word character: numbers (decimal digit, letter number, other
     * number), marks (non-spacing, combining spacing, enclosing), all punctuation and
     * symbol categories, separators (space, line, paragraph), and control, format,
     * surrogate, private-use and unassigned code points.
     *
     * @param codepoint the Unicode code point to classify
     * @return true if the code point belongs to one of the letter categories
     */
    public static boolean isWordChar(int codepoint) {
        return switch (Character.getType(codepoint)) {
            case Character.LOWERCASE_LETTER,
                 Character.OTHER_LETTER,
                 Character.TITLECASE_LETTER,
                 Character.UPPERCASE_LETTER,
                 Character.MODIFIER_LETTER -> true;
            default -> false;
        };
    }

}