From 976f28680df424bf028eb12a3f413049c17bf098 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Fri, 24 Apr 2020 08:34:55 +0000 Subject: add more tracing and debug logging of stemming --- .../java/com/yahoo/language/opennlp/OpenNlpLinguistics.java | 4 ++++ .../main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java | 8 ++++++++ .../java/com/yahoo/language/opennlp/OptimaizeDetector.java | 4 ++++ .../main/java/com/yahoo/language/simple/SimpleTokenizer.java | 10 +++++++++- 4 files changed, 25 insertions(+), 1 deletion(-) (limited to 'linguistics/src/main/java/com/yahoo/language') diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 1c7c71c00b6..0837b25c151 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -6,6 +6,8 @@ import com.yahoo.language.detect.Detector; import com.yahoo.language.process.Tokenizer; import com.yahoo.language.simple.SimpleDetector; import com.yahoo.language.simple.SimpleLinguistics; +import java.util.logging.Logger; +import java.util.logging.Level; /** * Returns a linguistics implementation based on OpenNlp, @@ -13,6 +15,7 @@ import com.yahoo.language.simple.SimpleLinguistics; */ public class OpenNlpLinguistics extends SimpleLinguistics { + private static final Logger log = Logger.getLogger(OpenNlpLinguistics.class.getName()); private final Detector detector; public OpenNlpLinguistics() { @@ -26,6 +29,7 @@ public class OpenNlpLinguistics extends SimpleLinguistics { public OpenNlpLinguistics(boolean enableOptimaize) { this(enableOptimaize ? new OptimaizeDetector() : new SimpleDetector()); + log.log(Level.FINE, "using "+(enableOptimaize ? "Optimaize" : "Simple")+" detector"); } private OpenNlpLinguistics(Detector detector) { diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 07349811bd4..0ebd4e0f638 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -11,9 +11,12 @@ import opennlp.tools.stemmer.snowball.SnowballStemmer; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.logging.Logger; +import java.util.logging.Level; public class OpenNlpTokenizer implements Tokenizer { private final static int SPACE_CODE = 32; + private static final Logger log = Logger.getLogger(OpenNlpTokenizer.class.getName()); private final Normalizer normalizer; private final Transformer transformer; private final SimpleTokenizer simpleTokenizer; @@ -57,6 +60,7 @@ public class OpenNlpTokenizer implements Tokenizer { } private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) { + log.log(Level.FINEST, "getStemmerForLanguage '"+language+"' mode: "+stemMode); if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) { return null; } @@ -120,13 +124,17 @@ public class OpenNlpTokenizer implements Tokenizer { private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) { + log.log(Level.FINEST, "processToken '"+token+"'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); if (stemMode != StemMode.NONE) { + String oldToken = token; token = doStemming(token, stemmer); + log.log(Level.FINEST, "stem '"+oldToken+"' to '"+token+"'"); } + log.log(Level.FINEST, "processed token is: "+token); return token; } diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java index ef1d6c966ac..a42c9f0504e 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OptimaizeDetector.java @@ -22,6 +22,8 @@ import java.io.UncheckedIOException; import java.nio.ByteBuffer; import java.util.List; import java.util.Locale; +import java.util.logging.Logger; +import java.util.logging.Level; /** * Detects the language of some sample text using SimpleDetector for CJK and Optimaize otherwise. @@ -33,6 +35,7 @@ public class OptimaizeDetector implements Detector { static private Object initGuard = new Object(); static private TextObjectFactory textObjectFactory = null; static private LanguageDetector languageDetector = null; + static private final Logger log = Logger.getLogger(OptimaizeDetector.class.getName()); static private void initOptimaize() { synchronized (initGuard) { @@ -96,6 +99,7 @@ public class OptimaizeDetector implements Detector { private static Language guessLanguageUsingOptimaize(String input) { Optional result = languageDetector.detect(textObjectFactory.forText(input)); if ( ! result.isPresent()) return Language.UNKNOWN; + log.log(Level.FINE, "guessing language "+result.get()+" from input: "+input); return Language.fromLocale(new Locale(result.get().getLanguage())); } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 068fc0126d7..a8470d86869 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -9,6 +9,8 @@ import com.yahoo.language.simple.kstem.KStemmer; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.logging.Logger; +import java.util.logging.Level; /** *

A tokenizer which splits on whitespace, normalizes and transforms using the given implementations @@ -25,6 +27,7 @@ public class SimpleTokenizer implements Tokenizer { private final Normalizer normalizer; private final Transformer transformer; private final KStemmer stemmer = new KStemmer(); + private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName()); public SimpleTokenizer() { this(new SimpleNormalizer(), new SimpleTransformer()); @@ -64,12 +67,17 @@ public class SimpleTokenizer implements Tokenizer { } private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { + log.log(Level.FINEST, "processToken '"+token+"'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); - if (stemMode != StemMode.NONE) + if (stemMode != StemMode.NONE) { + String oldToken = token; token = stemmer.stem(token); + log.log(Level.FINEST, "stem '"+oldToken+"' to '"+token+"'"); + } + log.log(Level.FINEST, "processed token is: "+token); return token; } -- cgit v1.2.3