summary refs log tree commit diff stats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
diff options
context:
space:
mode:
author Arne Juul <arnej@verizonmedia.com> 2020-04-24 08:34:55 +0000
committer Arne Juul <arnej@verizonmedia.com> 2020-04-24 08:34:55 +0000
commit 976f28680df424bf028eb12a3f413049c17bf098 (patch)
tree d537588846890f6ecb2c5bf6f4edb0b568f82394 /linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
parent 62eb464f5703ee7ba8af7b3ed5573fa706f937f3 (diff)
add more tracing and debug logging of stemming
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 10
1 files changed, 9 insertions, 1 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 068fc0126d7..a8470d86869 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -9,6 +9,8 @@ import com.yahoo.language.simple.kstem.KStemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.logging.Logger;
+import java.util.logging.Level;
/**
* <p>A tokenizer which splits on whitespace, normalizes and transforms using the given implementations
@@ -25,6 +27,7 @@ public class SimpleTokenizer implements Tokenizer {
private final Normalizer normalizer;
private final Transformer transformer;
private final KStemmer stemmer = new KStemmer();
+ private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName());
public SimpleTokenizer() {
this(new SimpleNormalizer(), new SimpleTransformer());
@@ -64,12 +67,17 @@ public class SimpleTokenizer implements Tokenizer {
}
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
+ log.log(Level.FINEST, "processToken '"+token+"'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE)
+ if (stemMode != StemMode.NONE) {
+ String oldToken = token;
token = stemmer.stem(token);
+ log.log(Level.FINEST, "stem '"+oldToken+"' to '"+token+"'");
+ }
+ log.log(Level.FINEST, "processed token is: "+token);
return token;
}