diff options
author | Valerij Fredriksen <freva@users.noreply.github.com> | 2021-05-05 18:25:51 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-05-05 18:25:51 +0200 |
commit | a2c9cd4bc04f1a3eaa31524b3970b96be5c2eda9 (patch) | |
tree | 102fd34bdb96e8191734ae88dac64d8e4eab0c7b /linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java | |
parent | 8c61a373af0066fbdf1cca354c24b197c7347321 (diff) | |
parent | 360894c9120b1cb6f89809fedc90f6fc0047b662 (diff) |
Merge pull request #17754 from vespa-engine/revert-17747-bratseth/special-tokens-take-2
Revert "Reapply "Bratseth/special tokens""
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java | 27 |
1 file changed, 10 insertions, 17 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index 740307c0cca..7df432f496d 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -23,13 +23,11 @@ import java.util.logging.Level; */ public class SimpleTokenizer implements Tokenizer { - private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName()); private final static int SPACE_CODE = 32; - private final Normalizer normalizer; private final Transformer transformer; private final KStemmer stemmer = new KStemmer(); - private final SpecialTokenRegistry specialTokenRegistry; + private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName()); public SimpleTokenizer() { this(new SimpleNormalizer(), new SimpleTransformer()); @@ -40,13 +38,8 @@ public class SimpleTokenizer implements Tokenizer { } public SimpleTokenizer(Normalizer normalizer, Transformer transformer) { - this(normalizer, transformer, new SpecialTokenRegistry(List.of())); - } - - public SimpleTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) { this.normalizer = normalizer; this.transformer = transformer; - this.specialTokenRegistry = specialTokenRegistry; } @Override @@ -63,8 +56,8 @@ public class SimpleTokenizer implements Tokenizer { String original = input.substring(prev, next); String token = processToken(original, language, stemMode, removeAccents); tokens.add(new SimpleToken(original).setOffset(prev) - .setType(prevType) - .setTokenString(token)); + .setType(prevType) + .setTokenString(token)); prev = next; prevType = nextType; } @@ -74,20 +67,20 @@ public class SimpleTokenizer implements Tokenizer { } private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { - String original = token; - 
log.log(Level.FINEST, () -> "processToken '" + original + "'"); + final String original = token; + log.log(Level.FINEST, () -> "processToken '"+original+"'"); token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) token = transformer.accentDrop(token, language); if (stemMode != StemMode.NONE) { - String oldToken = token; + final String oldToken = token; token = stemmer.stem(token); - String newToken = token; - log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'"); + final String newToken = token; + log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'"); } - String result = token; - log.log(Level.FINEST, () -> "processed token is: " + result); + final String result = token; + log.log(Level.FINEST, () -> "processed token is: "+result); return result; } |