diff options
Diffstat (limited to 'opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java')
-rw-r--r-- | opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java | 51 |
1 files changed, 51 insertions, 0 deletions
diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java new file mode 100644 index 00000000000..df8f3fad520 --- /dev/null +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java @@ -0,0 +1,51 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import opennlp.tools.util.normalizer.CharSequenceNormalizer; + +import java.util.function.IntConsumer; +import java.util.stream.IntStream; + +/** + * Simple normalizer + * + * @author arnej + */ +public class VespaCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final VespaCharSequenceNormalizer INSTANCE = new VespaCharSequenceNormalizer(); + + public static VespaCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + // filter replacing sequences of non-letters with a single space + static class OnlyLetters implements IntStream.IntMapMultiConsumer { + boolean addSpace = false; + public void accept(int codepoint, IntConsumer target) { + if (WordCharDetector.isWordChar(codepoint)) { + if (addSpace) { + target.accept(' '); + addSpace = false; + } + target.accept(Character.toLowerCase(codepoint)); + } else { + addSpace = true; + } + } + } + + public CharSequence normalize(CharSequence text) { + if (text.isEmpty()) { + return text; + } + var r = text + .codePoints() + .mapMulti(new OnlyLetters()) + .collect(StringBuilder::new, + StringBuilder::appendCodePoint, + StringBuilder::append); + return r; + } + +} |