aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java
blob: 9efa7007e7b387ad9e0fe5c083474259153b4666 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;

import com.yahoo.language.Language;
import com.yahoo.language.process.Transformer;

import java.text.Normalizer;
import java.util.regex.Pattern;

/**
 * A {@link Transformer} that removes accents from text in two steps: the input is first decomposed with Unicode
 * canonical decomposition (NFD), which splits an accented character into its base character followed by combining
 * marks, and then every character belonging to the "Combining Diacritical Marks" block is deleted.
 *
 * <p>The result is not recomposed afterwards, so it stays in decomposed (NFD) form.</p>
 *
 * @author Simon Thoresen Hult
 */
public class SimpleTransformer implements Transformer {

    /** Matches one or more consecutive characters from the Unicode "Combining Diacritical Marks" block. */
    private static final Pattern COMBINING_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Returns the given text with its combining diacritical marks removed.
     *
     * @param input    the text to strip accents from
     * @param language the language of the text; not consulted by this implementation
     * @return the NFD-normalized input with all matched combining marks deleted
     */
    @Override
    public String accentDrop(String input, Language language) {
        // Decompose first so that each accent becomes a separate combining character the pattern can match.
        String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
        return COMBINING_MARKS.matcher(decomposed).replaceAll("");
    }

}