aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
committerJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
commit72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
Publish
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java76
1 files changed, 76 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
new file mode 100644
index 00000000000..48a12c54e86
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -0,0 +1,76 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.simple;
+
+import com.yahoo.language.Language;
+import com.yahoo.language.LinguisticsCase;
+import com.yahoo.language.process.*;
+import com.yahoo.language.simple.kstem.KStemmer;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * <p>A tokenizer which splits on whitespace, normalizes and transforms using the given implementations
+ * and stems using the kstem algorithm.</p>
+ *
+ * <p>This is not multithread safe.</p>
+ *
+ * @author <a href="mailto:mathiasm@yahoo-inc.com">Mathias Mølster Lidal</a>
+ * @author bratseth
+ */
+public class SimpleTokenizer implements Tokenizer {
+
+ private final static int SPACE_CODE = 32;
+ private final Normalizer normalizer;
+ private final Transformer transformer;
+ private final KStemmer stemmer = new KStemmer();
+
+ public SimpleTokenizer() {
+ this(new SimpleNormalizer(), new SimpleTransformer());
+ }
+
+ public SimpleTokenizer(Normalizer normalizer) {
+ this(normalizer, new SimpleTransformer());
+ }
+
+ public SimpleTokenizer(Normalizer normalizer, Transformer transformer) {
+ this.normalizer = normalizer;
+ this.transformer = transformer;
+ }
+
+ @Override
+ public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
+ if (input.isEmpty()) return Collections.emptyList();
+
+ List<Token> tokens = new ArrayList<>();
+ int nextCode = input.codePointAt(0);
+ TokenType prevType = SimpleTokenType.valueOf(nextCode);
+ for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) {
+ nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE;
+ TokenType nextType = SimpleTokenType.valueOf(nextCode);
+ if (!prevType.isIndexable() || !nextType.isIndexable()) {
+ String original = input.substring(prev, next);
+ String token = processToken(original, language, stemMode, removeAccents);
+ tokens.add(new SimpleToken(original).setOffset(prev)
+ .setType(prevType)
+ .setTokenString(token));
+ prev = next;
+ prevType = nextType;
+ }
+ next += Character.charCount(nextCode);
+ }
+ return tokens;
+ }
+
+ private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
+ token = normalizer.normalize(token);
+ token = LinguisticsCase.toLowerCase(token);
+ if (removeAccents)
+ token = transformer.accentDrop(token, language);
+ if (stemMode != StemMode.NONE)
+ token = stemmer.stem(token);
+ return token;
+ }
+
+}