aboutsummaryrefslogtreecommitdiffstats
path: root/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java')
-rw-r--r--opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java31
1 files changed, 31 insertions, 0 deletions
diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java
new file mode 100644
index 00000000000..883319e2f8b
--- /dev/null
+++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java
@@ -0,0 +1,31 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.opennlp;
+
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Replaces URLs and e-mail addresses in text with a single space each.
+ * Modifies {@link opennlp.tools.util.normalizer.UrlCharSequenceNormalizer} to avoid the bad email regex.
+ *
+ * @author jonmv
+ */
+public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {
+    /** Matches {@code http://} and {@code https://} followed by one or more of the listed URL characters. */
+    private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
+    /** Matches mail addresses; the lookbehind only lets a match begin at the start of a run of local-part characters. */
+    private static final Pattern MAIL_REGEX = Pattern.compile("(?<![-+_.0-9A-Za-z])[-+_.0-9A-Za-z]+@[-0-9A-Za-z]+[-.0-9A-Za-z]+");
+
+    private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();
+
+    /** Returns the shared instance of this normalizer. */
+    public static UrlCharSequenceNormalizer getInstance() { return INSTANCE; }
+
+    /** Returns the given text with each URL, and then each e-mail address in that result, replaced by one space. */
+    @Override
+    public CharSequence normalize(CharSequence text) {
+        String withoutUrls = URL_REGEX.matcher(text).replaceAll(" ");
+        return MAIL_REGEX.matcher(withoutUrls).replaceAll(" ");
+    }
+}