diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java | 10 |
1 files changed, 9 insertions, 1 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 068fc0126d7..a8470d86869 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -9,6 +9,8 @@ import com.yahoo.language.simple.kstem.KStemmer; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.logging.Logger; +import java.util.logging.Level; /** * <p>A tokenizer which splits on whitespace, normalizes and transforms using the given implementations @@ -25,6 +27,7 @@ public class SimpleTokenizer implements Tokenizer { private final Normalizer normalizer; private final Transformer transformer; private final KStemmer stemmer = new KStemmer(); + private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName()); public SimpleTokenizer() { this(new SimpleNormalizer(), new SimpleTransformer()); @@ -64,12 +67,17 @@ public class SimpleTokenizer implements Tokenizer { } private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { + log.log(Level.FINEST, "processToken '"+token+"'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); - if (stemMode != StemMode.NONE) + if (stemMode != StemMode.NONE) { + String oldToken = token; token = stemmer.stem(token); + log.log(Level.FINEST, "stem '"+oldToken+"' to '"+token+"'"); + } + log.log(Level.FINEST, "processed token is: "+token); return token; } |