aboutsummaryrefslogtreecommitdiffstats
path: root/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java
blob: 883319e2f8b9deeb4fc6b737c425e8510216b369 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;

import opennlp.tools.util.normalizer.CharSequenceNormalizer;

import java.util.regex.Pattern;

/**
 * A {@link CharSequenceNormalizer} which replaces every URL and every email address
 * found in the given text with a single space.
 *
 * This is a modification of {@link opennlp.tools.util.normalizer.UrlCharSequenceNormalizer},
 * made to avoid the bad email regex.
 *
 * @author jonmv
 */
public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {

    /** Matches "http://" or "https://" followed by one or more of the allowed URL characters. */
    private static final Pattern URL_REGEX =
            Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");

    /**
     * Matches "local@domain" style addresses. The leading negative lookbehind only lets a match
     * start where the preceding character is not itself a valid local-part character.
     */
    private static final Pattern MAIL_REGEX =
            Pattern.compile("(?<![-+_.0-9A-Za-z])[-+_.0-9A-Za-z]+@[-0-9A-Za-z]+[-.0-9A-Za-z]+");

    /** The shared instance handed out by {@link #getInstance()}. */
    private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();

    /** Returns the shared instance of this normalizer. */
    public static UrlCharSequenceNormalizer getInstance() {
        return INSTANCE;
    }

    /**
     * Removes URLs first, and then email addresses from what remains.
     *
     * @param text the text to normalize
     * @return the text with each URL and each email address replaced by a single space
     */
    public CharSequence normalize(CharSequence text) {
        return blankOut(MAIL_REGEX, blankOut(URL_REGEX, text));
    }

    /** Replaces each match of the given pattern in the given input with a single space. */
    private static String blankOut(Pattern pattern, CharSequence input) {
        return pattern.matcher(input).replaceAll(" ");
    }

}