// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.language.simple; import com.yahoo.language.Language; import com.yahoo.language.LinguisticsCase; import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.SpecialTokenRegistry; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import com.yahoo.language.process.TokenType; import com.yahoo.language.process.Tokenizer; import com.yahoo.language.process.Transformer; import com.yahoo.language.simple.kstem.KStemmer; import java.util.ArrayList; import java.util.List; import java.util.function.Function; /** *
A tokenizer which splits on whitespace, normalizes and transforms using the given implementations * and stems using the kstem algorithm.
* * This class is not thread-safe.
* * @author Mathias Mølster Lidal * @author bratseth */ public class SimpleTokenizer implements Tokenizer { private final static int SPACE_CODE = 32; private final Normalizer normalizer; private final Transformer transformer; private final KStemmer stemmer = new KStemmer(); private final SpecialTokenRegistry specialTokenRegistry; public SimpleTokenizer() { this(new SimpleNormalizer(), new SimpleTransformer()); } public SimpleTokenizer(Normalizer normalizer) { this(normalizer, new SimpleTransformer()); } public SimpleTokenizer(Normalizer normalizer, Transformer transformer) { this(normalizer, transformer, new SpecialTokenRegistry(List.of())); } public SimpleTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) { this.normalizer = normalizer; this.transformer = transformer; this.specialTokenRegistry = specialTokenRegistry; } /** Tokenize the input, applying the transform of this to each token string. */ @Override public Iterable