summary | refs | log | tree | commit | diff | stats
path: root/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java')
-rw-r--r-- opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java 51
1 files changed, 51 insertions, 0 deletions
diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java
new file mode 100644
index 00000000000..df8f3fad520
--- /dev/null
+++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java
@@ -0,0 +1,51 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+import java.util.function.IntConsumer;
+import java.util.stream.IntStream;
+
+/**
+ * Normalizer which keeps only the code points accepted by
+ * {@code WordCharDetector.isWordChar}, converted to lower case, and
+ * replaces each run of other code points that precedes a kept code point
+ * with a single space.
+ *
+ * A run of rejected code points at the very end of the input produces no
+ * output, while a run at the very start produces one leading space.
+ *
+ * @author arnej
+ */
+public class VespaCharSequenceNormalizer implements CharSequenceNormalizer {
+
+    private static final VespaCharSequenceNormalizer INSTANCE = new VespaCharSequenceNormalizer();
+
+    /** Returns the shared instance of this normalizer. */
+    public static VespaCharSequenceNormalizer getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Filter replacing sequences of non-letters with a single space.
+     *
+     * The space is not emitted when the non-letter is seen, but deferred
+     * until the next letter arrives; this is why trailing non-letters
+     * never produce output.
+     *
+     * Instances carry mutable state between calls to {@code accept},
+     * so {@code normalize} creates a new one for every stream it runs.
+     */
+    static class OnlyLetters implements IntStream.IntMapMultiConsumer {
+
+        // set when a non-letter has been skipped and no letter has been emitted since
+        boolean addSpace = false;
+
+        /**
+         * Handles one code point from the source stream.
+         *
+         * @param codepoint the code point to inspect
+         * @param target receives the pending space (if any) followed by the
+         *               lower-cased code point; receives nothing for non-letters
+         */
+        @Override
+        public void accept(int codepoint, IntConsumer target) {
+            if (WordCharDetector.isWordChar(codepoint)) {
+                if (addSpace) {
+                    // flush the single space standing in for the skipped run
+                    target.accept(' ');
+                    addSpace = false;
+                }
+                target.accept(Character.toLowerCase(codepoint));
+            } else {
+                // remember the gap; repeated non-letters collapse into this one flag
+                addSpace = true;
+            }
+        }
+    }
+
+    /**
+     * Normalizes the given text by passing its code points through a fresh
+     * {@link OnlyLetters} filter.
+     *
+     * @param text the text to normalize
+     * @return the given instance itself when it is empty, otherwise a new
+     *         {@code StringBuilder} holding the normalized content
+     */
+    @Override
+    public CharSequence normalize(CharSequence text) {
+        if (text.isEmpty()) {
+            return text;
+        }
+        StringBuilder normalized = text
+                .codePoints()
+                .mapMulti(new OnlyLetters())
+                .collect(StringBuilder::new,
+                         StringBuilder::appendCodePoint,
+                         StringBuilder::append);
+        return normalized;
+    }
+
+}