summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java26
1 files changed, 21 insertions, 5 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 3dc28b99144..b791c843357 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -7,8 +7,8 @@ import com.yahoo.language.process.*;
import com.yahoo.language.simple.kstem.KStemmer;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
+import java.util.function.Function;
import java.util.logging.Logger;
import java.util.logging.Level;
@@ -49,30 +49,46 @@ public class SimpleTokenizer implements Tokenizer {
this.specialTokenRegistry = specialTokenRegistry;
}
+ /** Tokenize the input, applying this tokenizer's own transform to each token string. */
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
- if (input.isEmpty()) return Collections.emptyList();
+ return tokenize(input,
+ token -> processToken(token, language, stemMode, removeAccents));
+ }
+
+ /** Tokenize the input, and apply the given transform to each token string. */
+ public Iterable<Token> tokenize(String input, Function<String, String> tokenProocessor) {
+ if (input.isEmpty()) return List.of();
List<Token> tokens = new ArrayList<>();
int nextCode = input.codePointAt(0);
TokenType prevType = SimpleTokenType.valueOf(nextCode);
+ TokenType tokenType = prevType;
for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) {
nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE;
TokenType nextType = SimpleTokenType.valueOf(nextCode);
if (!prevType.isIndexable() || !nextType.isIndexable()) {
String original = input.substring(prev, next);
- String token = processToken(original, language, stemMode, removeAccents);
tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ .setType(tokenType)
+ .setTokenString(tokenProocessor.apply(original)));
prev = next;
prevType = nextType;
+ tokenType = prevType;
+ }
+ else {
+ tokenType = determineType(tokenType, nextType);
}
next += Character.charCount(nextCode);
}
return tokens;
}
+ private TokenType determineType(TokenType tokenType, TokenType characterType) {
+ if (characterType == TokenType.ALPHABETIC) return TokenType.ALPHABETIC;
+ return tokenType;
+ }
+
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
String original = token;
log.log(Level.FINEST, () -> "processToken '" + original + "'");