summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 27
1 files changed, 10 insertions, 17 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 740307c0cca..7df432f496d 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -23,13 +23,11 @@ import java.util.logging.Level;
*/
public class SimpleTokenizer implements Tokenizer {
- private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName());
private final static int SPACE_CODE = 32;
-
private final Normalizer normalizer;
private final Transformer transformer;
private final KStemmer stemmer = new KStemmer();
- private final SpecialTokenRegistry specialTokenRegistry;
+ private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName());
public SimpleTokenizer() {
this(new SimpleNormalizer(), new SimpleTransformer());
@@ -40,13 +38,8 @@ public class SimpleTokenizer implements Tokenizer {
}
public SimpleTokenizer(Normalizer normalizer, Transformer transformer) {
- this(normalizer, transformer, new SpecialTokenRegistry(List.of()));
- }
-
- public SimpleTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) {
this.normalizer = normalizer;
this.transformer = transformer;
- this.specialTokenRegistry = specialTokenRegistry;
}
@Override
@@ -63,8 +56,8 @@ public class SimpleTokenizer implements Tokenizer {
String original = input.substring(prev, next);
String token = processToken(original, language, stemMode, removeAccents);
tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ .setType(prevType)
+ .setTokenString(token));
prev = next;
prevType = nextType;
}
@@ -74,20 +67,20 @@ public class SimpleTokenizer implements Tokenizer {
}
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
- String original = token;
- log.log(Level.FINEST, () -> "processToken '" + original + "'");
+ final String original = token;
+ log.log(Level.FINEST, () -> "processToken '"+original+"'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
if (stemMode != StemMode.NONE) {
- String oldToken = token;
+ final String oldToken = token;
token = stemmer.stem(token);
- String newToken = token;
- log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'");
+ final String newToken = token;
+ log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'");
}
- String result = token;
- log.log(Level.FINEST, () -> "processed token is: " + result);
+ final String result = token;
+ log.log(Level.FINEST, () -> "processed token is: "+result);
return result;
}