summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage
diff options
context:
space:
mode:
author: jonmv <venstad@gmail.com> 2023-10-20 12:06:12 +0200
committer: jonmv <venstad@gmail.com> 2023-10-20 12:06:12 +0200
commit: dc809366178df3cb3fd0a303a097b333cb4dbac6 (patch)
tree: b197adbb0943a5cfe9f80e4c2e90d4df928a740b /indexinglanguage
parent: b87b0db14a2078a3c60da99aad498ed62b2bf2db (diff)
Avoid cutting surrogate pairs when tokenising
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 3
1 files changed, 2 insertions, 1 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 61ee3069127..40bdaa9db5d 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -12,6 +12,7 @@ import com.yahoo.language.Linguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.text.Text;
import java.util.HashMap;
import java.util.Map;
@@ -71,7 +72,7 @@ public class LinguisticsAnnotator {
Tokenizer tokenizer = factory.getTokenizer();
String input = (text.getString().length() <= config.getMaxTokenizeLength())
? text.getString()
- : text.getString().substring(0, config.getMaxTokenizeLength());
+ : Text.safeSubstring(text.getString(), config.getMaxTokenizeLength());
Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());