diff options
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java')
-rw-r--r-- | indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 11 |
1 files changed, 6 insertions, 5 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 173df65a47e..191d067effe 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespa.indexinglanguage.linguistics; import com.yahoo.document.annotation.Annotation; @@ -12,6 +12,7 @@ import com.yahoo.language.Linguistics; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import com.yahoo.language.process.Tokenizer; +import com.yahoo.text.Text; import java.util.HashMap; import java.util.Map; @@ -71,7 +72,7 @@ public class LinguisticsAnnotator { Tokenizer tokenizer = factory.getTokenizer(); String input = (text.getString().length() <= config.getMaxTokenizeLength()) ? text.getString() - : text.getString().substring(0, config.getMaxTokenizeLength()); + : Text.substringByCodepoints(text.getString(), 0, config.getMaxTokenizeLength()); Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); @@ -88,9 +89,9 @@ public class LinguisticsAnnotator { * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the * original. * - * @param termToLowerCase The term to lower case. - * @param origTerm The original term. - * @return the created TERM annotation. + * @param termToLowerCase the term to lower case + * @param origTerm the original term + * @return the created TERM annotation */ public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) { String annotationValue = toLowerCase(termToLowerCase); |