summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage
diff options
context:
space:
mode:
author jonmv <venstad@gmail.com> 2023-10-20 12:55:27 +0200
committer jonmv <venstad@gmail.com> 2023-10-20 12:55:27 +0200
commit 2ecc9af04be2dbebedbf0032990cd3b699aa35d5 (patch)
tree f361ac72663142129db0c06f6d42d5b8458de1d2 /indexinglanguage
parent dc809366178df3cb3fd0a303a097b333cb4dbac6 (diff)
Take config to mean number of code points in match max length
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 2
1 files changed, 1 insertions, 1 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 40bdaa9db5d..191d067effe 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -72,7 +72,7 @@ public class LinguisticsAnnotator {
Tokenizer tokenizer = factory.getTokenizer();
String input = (text.getString().length() <= config.getMaxTokenizeLength())
? text.getString()
- : Text.safeSubstring(text.getString(), config.getMaxTokenizeLength());
+ : Text.substringByCodepoints(text.getString(), 0, config.getMaxTokenizeLength());
Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());