summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author Arne Juul <arnej@verizonmedia.com> 2020-04-24 08:34:55 +0000
committer Arne Juul <arnej@verizonmedia.com> 2020-04-24 08:34:55 +0000
commit 976f28680df424bf028eb12a3f413049c17bf098 (patch)
tree d537588846890f6ecb2c5bf6f4edb0b568f82394 /linguistics
parent 62eb464f5703ee7ba8af7b3ed5573fa706f937f3 (diff)
add more tracing and debug logging of stemming
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java 4
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java 8
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java 4
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 10
4 files changed, 25 insertions, 1 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
index 1c7c71c00b6..0837b25c151 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java
@@ -6,6 +6,8 @@ import com.yahoo.language.detect.Detector;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleDetector;
import com.yahoo.language.simple.SimpleLinguistics;
+import java.util.logging.Logger;
+import java.util.logging.Level;
/**
* Returns a linguistics implementation based on OpenNlp,
@@ -13,6 +15,7 @@ import com.yahoo.language.simple.SimpleLinguistics;
*/
public class OpenNlpLinguistics extends SimpleLinguistics {
+ private static final Logger log = Logger.getLogger(OpenNlpLinguistics.class.getName());
private final Detector detector;
public OpenNlpLinguistics() {
@@ -26,6 +29,7 @@ public class OpenNlpLinguistics extends SimpleLinguistics {
public OpenNlpLinguistics(boolean enableOptimaize) {
this(enableOptimaize ? new OptimaizeDetector() : new SimpleDetector());
+ log.log(Level.FINE, "using "+(enableOptimaize ? "Optimaize" : "Simple")+" detector");
}
private OpenNlpLinguistics(Detector detector) {
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
index 07349811bd4..0ebd4e0f638 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java
@@ -11,9 +11,12 @@ import opennlp.tools.stemmer.snowball.SnowballStemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.logging.Logger;
+import java.util.logging.Level;
public class OpenNlpTokenizer implements Tokenizer {
private final static int SPACE_CODE = 32;
+ private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName());
private final Normalizer normalizer;
private final Transformer transformer;
private final SimpleTokenizer simpleTokenizer;
@@ -57,6 +60,7 @@ public class OpenNlpTokenizer implements Tokenizer {
}
private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
+ log.log(Level.FINEST, "getStemmerForLanguage '"+language+"' mode: "+stemMode);
if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) {
return null;
}
@@ -120,13 +124,17 @@ public class OpenNlpTokenizer implements Tokenizer {
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
Stemmer stemmer) {
+ log.log(Level.FINEST, "processToken '"+token+"'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
if (stemMode != StemMode.NONE) {
+ String oldToken = token;
token = doStemming(token, stemmer);
+ log.log(Level.FINEST, "stem '"+oldToken+"' to '"+token+"'");
}
+ log.log(Level.FINEST, "processed token is: "+token);
return token;
}
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
index ef1d6c966ac..a42c9f0504e 100644
--- a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java
@@ -22,6 +22,8 @@ import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.Locale;
+import java.util.logging.Logger;
+import java.util.logging.Level;
/**
* Detects the language of some sample text using SimpleDetector for CJK and Optimaize otherwise.
@@ -33,6 +35,7 @@ public class OptimaizeDetector implements Detector {
static private Object initGuard = new Object();
static private TextObjectFactory textObjectFactory = null;
static private LanguageDetector languageDetector = null;
+ static private final Logger log = Logger.getLogger(OptimaizeDetector.class.getName());
static private void initOptimaize() {
synchronized (initGuard) {
@@ -96,6 +99,7 @@ public class OptimaizeDetector implements Detector {
private static Language guessLanguageUsingOptimaize(String input) {
Optional<LdLocale> result = languageDetector.detect(textObjectFactory.forText(input));
if ( ! result.isPresent()) return Language.UNKNOWN;
+ log.log(Level.FINE, "guessing language "+result.get()+" from input: "+input);
return Language.fromLocale(new Locale(result.get().getLanguage()));
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 068fc0126d7..a8470d86869 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -9,6 +9,8 @@ import com.yahoo.language.simple.kstem.KStemmer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.logging.Logger;
+import java.util.logging.Level;
/**
* <p>A tokenizer which splits on whitespace, normalizes and transforms using the given implementations
@@ -25,6 +27,7 @@ public class SimpleTokenizer implements Tokenizer {
private final Normalizer normalizer;
private final Transformer transformer;
private final KStemmer stemmer = new KStemmer();
+ private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName());
public SimpleTokenizer() {
this(new SimpleNormalizer(), new SimpleTransformer());
@@ -64,12 +67,17 @@ public class SimpleTokenizer implements Tokenizer {
}
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
+ log.log(Level.FINEST, "processToken '"+token+"'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
- if (stemMode != StemMode.NONE)
+ if (stemMode != StemMode.NONE) {
+ String oldToken = token;
token = stemmer.stem(token);
+ log.log(Level.FINEST, "stem '"+oldToken+"' to '"+token+"'");
+ }
+ log.log(Level.FINEST, "processed token is: "+token);
return token;
}