blob: dff72625170d8c3b85f832284766247cb1121d9f (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
import java.util.regex.Pattern;
/**
 * A {@link CharSequenceNormalizer} which replaces URLs and email addresses in a text with a single space.
 * This is a modification of {@link opennlp.tools.util.normalizer.UrlCharSequenceNormalizer},
 * made to avoid the bad email regex of that class.
 *
 * @author jonmv
 */
public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {

    /** Matches "http://" or "https://" followed by one or more of the listed URL characters. */
    private static final Pattern URL_PATTERN = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");

    /**
     * Matches email-like tokens: a local part, an '@', and a domain-like part.
     * The negative lookbehind only lets a match begin where the preceding character
     * is not itself a local-part character, i.e., at the start of the local part.
     */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("(?<![-+_.0-9A-Za-z])[-+_.0-9A-Za-z]+@[-0-9A-Za-z]+[-.0-9A-Za-z]+");

    /** The shared instance handed out by {@link #getInstance()}. */
    private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();

    /** Returns the shared instance of this normalizer. */
    public static UrlCharSequenceNormalizer getInstance() {
        return INSTANCE;
    }

    /**
     * Replaces every URL, and then every email address, in the given text with a single space.
     *
     * @param text the text to normalize
     * @return the text with each URL and email match replaced by " "
     */
    public CharSequence normalize(CharSequence text) {
        // URLs are blanked first; the email pattern then runs on the result of that pass.
        String withoutUrls = blankOut(URL_PATTERN, text);
        return blankOut(EMAIL_PATTERN, withoutUrls);
    }

    /** Returns the given text with every match of the given pattern replaced by a single space. */
    private static String blankOut(Pattern pattern, CharSequence text) {
        return pattern.matcher(text).replaceAll(" ");
    }

}
|