diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java | 25 |
1 files changed, 25 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java new file mode 100644 index 00000000000..409ef44986e --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTransformer.java @@ -0,0 +1,25 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.simple; + +import com.yahoo.language.Language; +import com.yahoo.language.process.Transformer; + +import java.text.Normalizer; +import java.util.regex.Pattern; + +/** + * Converts all accented characters into their de-accented counterparts followed by their combining diacritics, then + * strips off the diacritics using a regex. + * + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + */ +public class SimpleTransformer implements Transformer { + + private final static Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); + + @Override + public String accentDrop(String input, Language language) { + return pattern.matcher(Normalizer.normalize(input, Normalizer.Form.NFD)).replaceAll(""); + } + +} |