summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java | 12
1 files changed, 9 insertions, 3 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index b791c843357..d86ca30a632 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -57,7 +57,7 @@ public class SimpleTokenizer implements Tokenizer {
}
/** Tokenize the input, and apply the given transform to each token string. */
- public Iterable<Token> tokenize(String input, Function<String, String> tokenProocessor) {
+ public Iterable<Token> tokenize(String input, Function<String, String> tokenProcessor) {
if (input.isEmpty()) return List.of();
List<Token> tokens = new ArrayList<>();
@@ -67,11 +67,11 @@ public class SimpleTokenizer implements Tokenizer {
for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) {
nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE;
TokenType nextType = SimpleTokenType.valueOf(nextCode);
- if (!prevType.isIndexable() || !nextType.isIndexable()) {
+ if (isAtTokenBoundary(prevType, nextType)) {
String original = input.substring(prev, next);
tokens.add(new SimpleToken(original).setOffset(prev)
.setType(tokenType)
- .setTokenString(tokenProocessor.apply(original)));
+ .setTokenString(tokenProcessor.apply(original)));
prev = next;
prevType = nextType;
tokenType = prevType;
@@ -84,6 +84,12 @@ public class SimpleTokenizer implements Tokenizer {
return tokens;
}
+ private boolean isAtTokenBoundary(TokenType prevType, TokenType nextType) {
+ // Report a boundary on either side of an indexable symbol, so that each symbol is indexed as a token of its own
+ if (prevType == TokenType.INDEXABLE_SYMBOL || nextType == TokenType.INDEXABLE_SYMBOL) return true;
+ return !prevType.isIndexable() || !nextType.isIndexable(); // otherwise: a boundary exists when either side is not indexable
+ }
+
private TokenType determineType(TokenType tokenType, TokenType characterType) {
if (characterType == TokenType.ALPHABETIC) return TokenType.ALPHABETIC;
return tokenType;