blob: 6187a4c47b2ab2840c151f017ce17be43c657541 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;
import com.yahoo.language.Language;
import com.yahoo.language.process.Transformer;
import java.text.Normalizer;
import java.util.regex.Pattern;
/**
 * A {@link Transformer} that removes accents in two steps: the text is first decomposed to Unicode
 * normalization form NFD, so that accented characters become a base character followed by combining marks,
 * and every character in the Combining Diacritical Marks block is then deleted with a regular expression.
 *
 * @author Simon Thoresen Hult
 */
public class SimpleTransformer implements Transformer {

    /** Matches one or more consecutive characters from the Combining Diacritical Marks block. */
    private static final Pattern COMBINING_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /**
     * Returns the given text with its combining diacritical marks removed.
     *
     * @param input the text to strip accents from
     * @param language not consulted by this implementation
     * @return the NFD-normalized input with all matched combining diacritical marks deleted
     */
    @Override
    public String accentDrop(String input, Language language) {
        String decomposed = Normalizer.normalize(input, Normalizer.Form.NFD);
        return COMBINING_MARKS.matcher(decomposed).replaceAll("");
    }

}
|