about summary refs log tree commit diff stats
path: root/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
diff options
context:
space:
mode:
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 11
1 files changed, 6 insertions, 5 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 173df65a47e..191d067effe 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -1,4 +1,4 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.indexinglanguage.linguistics;
import com.yahoo.document.annotation.Annotation;
@@ -12,6 +12,7 @@ import com.yahoo.language.Linguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.text.Text;
import java.util.HashMap;
import java.util.Map;
@@ -71,7 +72,7 @@ public class LinguisticsAnnotator {
Tokenizer tokenizer = factory.getTokenizer();
String input = (text.getString().length() <= config.getMaxTokenizeLength())
? text.getString()
- : text.getString().substring(0, config.getMaxTokenizeLength());
+ : Text.substringByCodepoints(text.getString(), 0, config.getMaxTokenizeLength());
Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
@@ -88,9 +89,9 @@ public class LinguisticsAnnotator {
* Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
* original.
*
- * @param termToLowerCase The term to lower case.
- * @param origTerm The original term.
- * @return the created TERM annotation.
+ * @param termToLowerCase the term to lower case
+ * @param origTerm the original term
+ * @return the created TERM annotation
*/
public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
String annotationValue = toLowerCase(termToLowerCase);