diff options
author | jonmv <venstad@gmail.com> | 2023-10-20 12:06:12 +0200 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2023-10-20 12:06:12 +0200 |
commit | dc809366178df3cb3fd0a303a097b333cb4dbac6 (patch) | |
tree | b197adbb0943a5cfe9f80e4c2e90d4df928a740b /indexinglanguage | |
parent | b87b0db14a2078a3c60da99aad498ed62b2bf2db (diff) |
Avoid cutting surrogate pairs when tokenising
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- | indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 3 |
1 file changed, 2 insertions, 1 deletion
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 61ee3069127..40bdaa9db5d 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -12,6 +12,7 @@ import com.yahoo.language.Linguistics;
 import com.yahoo.language.process.StemMode;
 import com.yahoo.language.process.Token;
 import com.yahoo.language.process.Tokenizer;
+import com.yahoo.text.Text;
 
 import java.util.HashMap;
 import java.util.Map;
@@ -71,7 +72,7 @@ public class LinguisticsAnnotator {
         Tokenizer tokenizer = factory.getTokenizer();
         String input = (text.getString().length() <= config.getMaxTokenizeLength())
                        ? text.getString()
-                       : text.getString().substring(0, config.getMaxTokenizeLength());
+                       : Text.safeSubstring(text.getString(), config.getMaxTokenizeLength());
         Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
                                                     config.getRemoveAccents());
         TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());