aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java
blob: 3ab1750bcee77b49c9b6119634fd1c6af7e9020c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;

import com.yahoo.language.Language;
import com.yahoo.language.process.Transformer;

import java.text.Normalizer;
import java.util.regex.Pattern;

/**
 * A {@link Transformer} that drops accents in two steps: the text is first brought to Unicode canonical decomposed
 * form (NFD), which splits each accented character into its base character followed by combining marks, and then
 * every run of characters from the "Combining Diacritical Marks" block is deleted with a regex.
 *
 * <p>Only marks in that one Unicode block (U+0300 to U+036F) are removed, and the returned text is left in
 * decomposed form; it is not recomposed afterwards.</p>
 *
 * @author Simon Thoresen Hult
 */
public class SimpleTransformer implements Transformer {

    /** Matches one or more consecutive characters of the Unicode "Combining Diacritical Marks" block. */
    private static final Pattern COMBINING_DIACRITICS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Returns the given text with its combining diacritical marks removed.
     *
     * @param input    the text to strip accents from; passed directly to {@link Normalizer#normalize}
     * @param language not consulted by this implementation
     * @return the NFD-normalized input with all matched combining marks deleted
     */
    @Override
    public String accentDrop(String input, Language language) {
        // Decompose first so that each accent becomes a separate code point the pattern can match.
        String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
        return COMBINING_DIACRITICS.matcher(decomposed).replaceAll("");
    }

}