diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-10-20 17:50:35 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-20 17:50:35 +0200 |
commit | 46bd10fe1fbbbbf388155bf72e73ab26fbd0bfab (patch) | |
tree | 37c8e940004f64ac2fa0fe538749f36ce354db8b | |
parent | 2f60ec76050d005a04dc20b09ab7877bdd3abfb5 (diff) | |
parent | e228115788634d77f5b6354c12c1718252044860 (diff) |
Merge pull request #29045 from vespa-engine/jonmv/fix-text-substring-in-tokenisation
Avoid cutting surrogate pairs when tokenising
4 files changed, 12 insertions, 7 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 61ee3069127..191d067effe 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -12,6 +12,7 @@ import com.yahoo.language.Linguistics; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; import com.yahoo.language.process.Tokenizer; +import com.yahoo.text.Text; import java.util.HashMap; import java.util.Map; @@ -71,7 +72,7 @@ public class LinguisticsAnnotator { Tokenizer tokenizer = factory.getTokenizer(); String input = (text.getString().length() <= config.getMaxTokenizeLength()) ? text.getString() - : text.getString().substring(0, config.getMaxTokenizeLength()); + : Text.substringByCodepoints(text.getString(), 0, config.getMaxTokenizeLength()); Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), config.getRemoveAccents()); TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences()); diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java index 143acc174f0..5ad6a382abd 100644 --- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java +++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java @@ -7,7 +7,7 @@ import java.util.Locale; /** * This class provides a case normalization operation to be used e.g. when - * document search should be case insensitive. + * document search should be case-insensitive. * * @author Simon Thoresen Hult */ diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java index 7c835965a1a..fe931ef34a3 100644 --- a/vespajlib/src/main/java/com/yahoo/text/Text.java +++ b/vespajlib/src/main/java/com/yahoo/text/Text.java @@ -170,15 +170,15 @@ public final class Text { } /** - * Returns a string which is never larger than the given number of characters. + * Returns a string which is never larger than the given number of code points. * If the string is longer than the given length it will be truncated. * If length is 4 or less the string will be truncated to length. * If length is longer than 4, it will be truncated at length-4 with " ..." added at the end. */ public static String truncate(String s, int length) { - if (s.length() <= length) return s; - if (length <= 4) return s.substring(0, length); - return s.substring(0, length - 4) + " ..."; + if (s.codePointCount(0, s.length()) <= length) return s; + if (length <= 4) return substringByCodepoints(s, 0, length); + return substringByCodepoints(s, 0, length - 4) + " ..."; } public static String substringByCodepoints(String s, int fromCP, int toCP) { @@ -208,4 +208,5 @@ public final class Text { public static String format(String format, Object... args) { return String.format(Locale.US, format, args); } + } diff --git a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java index f192f678c13..9bb4668b7cb 100644 --- a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java +++ b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java @@ -104,6 +104,9 @@ public class TextTestCase { assertEquals("", Text.truncate("ab", 0)); assertEquals("ab c", Text.truncate("ab cde", 4)); assertEquals("a ...", Text.truncate("ab cde", 5)); + assertEquals("abc ...", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 7)); + assertEquals("abc\uD83D\uDE48 ...", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 8)); + assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 9)); } @Test @@ -152,6 +155,6 @@ public class TextTestCase { sum = benchmarkIsValid(strings, 100000000); diff = System.nanoTime() - start; System.out.println("Validation num isValid = " + sum + ". Took " + diff + "ns"); - } + } |