diff options
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 07349811bd4..0ebd4e0f638 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -11,9 +11,12 @@ import opennlp.tools.stemmer.snowball.SnowballStemmer; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.logging.Logger; +import java.util.logging.Level; public class OpenNlpTokenizer implements Tokenizer { private final static int SPACE_CODE = 32; + private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName()); private final Normalizer normalizer; private final Transformer transformer; private final SimpleTokenizer simpleTokenizer; @@ -57,6 +60,7 @@ public class OpenNlpTokenizer implements Tokenizer { } private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) { + log.log(Level.FINEST, "getStemmerForLanguage '"+language+"' mode: "+stemMode); if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) { return null; } @@ -120,13 +124,17 @@ public class OpenNlpTokenizer implements Tokenizer { private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) { + log.log(Level.FINEST, "processToken '"+token+"'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); if (stemMode != StemMode.NONE) { + String oldToken = token; token = doStemming(token, stemmer); + log.log(Level.FINEST, "stem '"+oldToken+"' to '"+token+"'"); } + log.log(Level.FINEST, "processed token is: "+token); return token; } |