summary refs log tree commit diff stats
path: root/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java 8
1 files changed, 8 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 07349811bd4..0ebd4e0f638 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -11,9 +11,12 @@ import opennlp.tools.stemmer.snowball.SnowballStemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.logging.Logger;
+import java.util.logging.Level;
public class OpenNlpTokenizer implements Tokenizer {
private final static int SPACE_CODE = 32;
+ private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName());
private final Normalizer normalizer;
private final Transformer transformer;
private final SimpleTokenizer simpleTokenizer;
@@ -57,6 +60,7 @@ public class OpenNlpTokenizer implements Tokenizer {
}
private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
+ log.log(Level.FINEST, "getStemmerForLanguage '"+language+"' mode: "+stemMode);
if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) {
return null;
}
@@ -120,13 +124,17 @@ public class OpenNlpTokenizer implements Tokenizer {
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
Stemmer stemmer) {
+ log.log(Level.FINEST, "processToken '"+token+"'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
if (stemMode != StemMode.NONE) {
+ String oldToken = token;
token = doStemming(token, stemmer);
+ log.log(Level.FINEST, "stem '"+oldToken+"' to '"+token+"'");
}
+ log.log(Level.FINEST, "processed token is: "+token);
return token;
}