diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-05-04 17:23:27 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-05-04 17:23:27 +0200 |
commit | af59be1ed263f1476dd5df0a696f328a7de72ccd (patch) | |
tree | eaa85feb51486663f486adbe21f2905c83c60a98 /linguistics | |
parent | b399aa85883146aa3ba1396769d8e82c88877674 (diff) |
Wire in (but don't use) SpecialTokens
Diffstat (limited to 'linguistics')
6 files changed, 40 insertions, 18 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index e1185cb2457..73518876c3f 100644 --- a/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -4,6 +4,7 @@ package com.yahoo.language.opennlp; import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.SpecialTokenRegistry; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import com.yahoo.language.process.TokenType; @@ -32,15 +33,21 @@ public class OpenNlpTokenizer implements Tokenizer { private final Normalizer normalizer; private final Transformer transformer; private final SimpleTokenizer simpleTokenizer; + private final SpecialTokenRegistry specialTokenRegistry; public OpenNlpTokenizer() { this(new SimpleNormalizer(), new SimpleTransformer()); } public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer) { + this(normalizer, transformer, new SpecialTokenRegistry(List.of())); + } + + public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) { this.normalizer = normalizer; this.transformer = transformer; - simpleTokenizer = new SimpleTokenizer(normalizer, transformer); + this.specialTokenRegistry = specialTokenRegistry; + this.simpleTokenizer = new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); } @Override diff --git a/linguistics/src/main/java/com/yahoo/language/process/SpecialTokenRegistry.java b/linguistics/src/main/java/com/yahoo/language/process/SpecialTokenRegistry.java index b65c3ba663c..60071c3147c 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/SpecialTokenRegistry.java +++ b/linguistics/src/main/java/com/yahoo/language/process/SpecialTokenRegistry.java @@ -12,8 +12,8 @@ import java.util.Map; import java.util.stream.Collectors; /** - * A registry which is responsible for knowing the current - * set of special tokens.Usage of this registry is multithread safe. + * Immutable named lists of "special tokens" - strings which should override the normal tokenizer semantics + * and be tokenized into a single token. * * @author bratseth */ diff --git a/linguistics/src/main/java/com/yahoo/language/process/SpecialTokens.java b/linguistics/src/main/java/com/yahoo/language/process/SpecialTokens.java index c1b05a00377..1170ce9ad1e 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/SpecialTokens.java +++ b/linguistics/src/main/java/com/yahoo/language/process/SpecialTokens.java @@ -11,8 +11,8 @@ import java.util.Objects; import static com.yahoo.language.LinguisticsCase.toLowerCase; /** - * A list of special tokens - string that should be treated as word - * no matter what they contain. Special tokens are case insensitive. + * An immutable list of special tokens - strings which should override the normal tokenizer semantics + * and be tokenized into a single token. Special tokens are case insensitive. * * @author bratseth */ diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java index 57a5b6edb68..ad154d1b003 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java +++ b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java @@ -4,7 +4,7 @@ package com.yahoo.language.process; /** * An enumeration of token types. * - * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a> + * @author Mathias Mølster Lidal */ public enum TokenType { diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java index e1a04b2985d..44f6c74b206 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java @@ -11,10 +11,12 @@ import com.yahoo.language.process.GramSplitter; import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.Segmenter; import com.yahoo.language.process.SegmenterImpl; +import com.yahoo.language.process.SpecialTokenRegistry; import com.yahoo.language.process.Stemmer; import com.yahoo.language.process.StemmerImpl; import com.yahoo.language.process.Tokenizer; import com.yahoo.language.process.Transformer; +import com.yahoo.vespa.configdefinition.SpecialtokensConfig; /** * Factory of simple linguistic processor implementations. @@ -31,21 +33,27 @@ public class SimpleLinguistics implements Linguistics { private final Detector detector; private final CharacterClasses characterClasses; private final GramSplitter gramSplitter; + private final SpecialTokenRegistry specialTokenRegistry; - @Inject public SimpleLinguistics() { + this(new SpecialtokensConfig.Builder().build()); + } + + @Inject + public SimpleLinguistics(SpecialtokensConfig specialTokensConfig) { this.normalizer = new SimpleNormalizer(); this.transformer = new SimpleTransformer(); this.detector = new SimpleDetector(); this.characterClasses = new CharacterClasses(); this.gramSplitter = new GramSplitter(characterClasses); + this.specialTokenRegistry = new SpecialTokenRegistry(specialTokensConfig); } @Override public Stemmer getStemmer() { return new StemmerImpl(getTokenizer()); } @Override - public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer); } + public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); } @Override public Normalizer getNormalizer() { return normalizer; } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 7df432f496d..740307c0cca 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -23,11 +23,13 @@ import java.util.logging.Level; */ public class SimpleTokenizer implements Tokenizer { + private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName()); private final static int SPACE_CODE = 32; + private final Normalizer normalizer; private final Transformer transformer; private final KStemmer stemmer = new KStemmer(); - private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName()); + private final SpecialTokenRegistry specialTokenRegistry; public SimpleTokenizer() { this(new SimpleNormalizer(), new SimpleTransformer()); @@ -38,8 +40,13 @@ public class SimpleTokenizer implements Tokenizer { } public SimpleTokenizer(Normalizer normalizer, Transformer transformer) { + this(normalizer, transformer, new SpecialTokenRegistry(List.of())); + } + + public SimpleTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) { this.normalizer = normalizer; this.transformer = transformer; + this.specialTokenRegistry = specialTokenRegistry; } @Override @@ -56,8 +63,8 @@ public class SimpleTokenizer implements Tokenizer { String original = input.substring(prev, next); String token = processToken(original, language, stemMode, removeAccents); tokens.add(new SimpleToken(original).setOffset(prev) - .setType(prevType) - .setTokenString(token)); + .setType(prevType) + .setTokenString(token)); prev = next; prevType = nextType; } @@ -67,20 +74,20 @@ public class SimpleTokenizer implements Tokenizer { } private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { - final String original = token; - log.log(Level.FINEST, () -> "processToken '"+original+"'"); + String original = token; + log.log(Level.FINEST, () -> "processToken '" + original + "'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); if (stemMode != StemMode.NONE) { - final String oldToken = token; + String oldToken = token; token = stemmer.stem(token); - final String newToken = token; - log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'"); + String newToken = token; + log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'"); } - final String result = token; - log.log(Level.FINEST, () -> "processed token is: "+result); + String result = token; + log.log(Level.FINEST, () -> "processed token is: " + result); return result; } |