diff options
author | jonmv <venstad@gmail.com> | 2023-10-20 12:55:27 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2023-10-20 12:55:27 +0200 |
commit | 2ecc9af04be2dbebedbf0032990cd3b699aa35d5 (patch) | |
tree | f361ac72663142129db0c06f6d42d5b8458de1d2 /indexinglanguage | |
parent | dc809366178df3cb3fd0a303a097b333cb4dbac6 (diff) |
Take config to mean number of code points in match max length
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- | indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 2 |
1 file changed, 1 insertion, 1 deletion
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 40bdaa9db5d..191d067effe 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -72,7 +72,7 @@ public class LinguisticsAnnotator {
         Tokenizer tokenizer = factory.getTokenizer();
         String input = (text.getString().length() <= config.getMaxTokenizeLength())
                        ? text.getString()
-                       : Text.safeSubstring(text.getString(), config.getMaxTokenizeLength());
+                       : Text.substringByCodepoints(text.getString(), 0, config.getMaxTokenizeLength());
         Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
                                                     config.getRemoveAccents());
         TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());