From 74bffb810050342bd32065a818e4f74b8cd7ce51 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Thu, 25 Jun 2020 14:09:24 +0200 Subject: Surrogate aware gram splitting --- .../com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'indexinglanguage') diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java index d91338e3d3f..adf3e4ecaaa 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java @@ -65,10 +65,10 @@ public final class NGramExpression extends Expression { // annotate gram as a word term String gramString = gram.extractFrom(input.getString()); - typedSpan(gram.getStart(), gram.getLength(), TokenType.ALPHABETIC, spanList). + typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList). annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString)); - lastPosition = gram.getStart() + gram.getLength(); + lastPosition = gram.getStart() + gram.getCodePointCount(); } // handle punctuation at the end if (lastPosition < input.toString().length()) { -- cgit v1.2.3