blob: fce3344bfad197f07a5453d1bc9ba26327b205f4 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
import com.yahoo.language.Language;
import com.yahoo.language.process.Transformer;
import java.text.Normalizer;
import java.util.regex.Pattern;
/**
 * A {@link Transformer} that removes accents in two steps: the input is first rewritten in Unicode canonical
 * decomposition (NFD), which splits each accented character into its base character followed by combining marks,
 * and every run of characters from the "Combining Diacritical Marks" block is then deleted with a regex.
 *
 * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
 */
public class SimpleTransformer implements Transformer {

    /** Matches one or more consecutive characters belonging to the Unicode "Combining Diacritical Marks" block. */
    private static final Pattern COMBINING_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Returns the given text with its accents removed.
     *
     * <p>The text is not recomposed after the marks are stripped, so whatever remains is returned in its
     * NFD-decomposed form.
     *
     * @param input    the text to strip accents from
     * @param language not consulted by this implementation; the same transformation is applied for every language
     * @return the decomposed input with all matched combining marks removed
     */
    @Override
    public String accentDrop(String input, Language language) {
        // Step 1: canonical decomposition, so that accents become separate combining characters.
        String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
        // Step 2: delete those combining characters, leaving the base characters behind.
        return COMBINING_MARKS.matcher(decomposed).replaceAll("");
    }

}
|