aboutsummaryrefslogtreecommitdiffstats
path: root/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2022-11-26 00:10:17 +0100
committerHenning Baldersheim <balder@yahoo-inc.com>2022-11-26 23:55:10 +0100
commita8665da65c39d9e4a56c74c2d8e6a7bd61c7c313 (patch)
tree7ef8738fca139dfdab1464c5edfd3d7423427b9b /opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java
parentb36cb57248dfc02bae9dfe7b2cca0ddd551881c6 (diff)
Split out opennlp-linguistics
Diffstat (limited to 'opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java')
-rw-r--r--opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java31
1 files changed, 31 insertions, 0 deletions
diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java
new file mode 100644
index 00000000000..883319e2f8b
--- /dev/null
+++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java
@@ -0,0 +1,31 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Modifies {@link opennlp.tools.util.normalizer.UrlCharSequenceNormalizer} to avoid the bad email regex.
+ *
+ * @author jonmv
+ */
+public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {
+
+    /** Matches "http://" or "https://" followed by one or more of the listed URL characters. */
+    private static final Pattern URL_REGEX =
+            Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
+
+    /**
+     * Matches mail-address-like tokens: a run of local-part characters, an {@code @}, and at
+     * least two domain characters, the first of which is not a dot. The negative lookbehind
+     * only lets a match begin where the preceding character is not itself a local-part
+     * character, i.e. at the start of such a run.
+     */
+    private static final Pattern MAIL_REGEX =
+            Pattern.compile("(?<![-+_.0-9A-Za-z])[-+_.0-9A-Za-z]+@[-0-9A-Za-z]+[-.0-9A-Za-z]+");
+
+    private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();
+
+    /** Returns the shared instance of this normalizer. */
+    public static UrlCharSequenceNormalizer getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Replaces every URL match, and then every mail address match, with a single space.
+     *
+     * @param text the text to normalize
+     * @return the text with each URL and mail address match replaced by one space
+     */
+    @Override
+    public CharSequence normalize(CharSequence text) {
+        // URLs are removed first; the mail pattern then runs on the already-reduced text.
+        String modified = URL_REGEX.matcher(text).replaceAll(" ");
+        return MAIL_REGEX.matcher(modified).replaceAll(" ");
+    }
+
+}