From dc809366178df3cb3fd0a303a097b333cb4dbac6 Mon Sep 17 00:00:00 2001 From: jonmv Date: Fri, 20 Oct 2023 12:06:12 +0200 Subject: Avoid cutting surrogate pairs when tokenising --- vespajlib/src/main/java/com/yahoo/text/Text.java | 14 ++++++++++++-- .../src/test/java/com/yahoo/text/TextTestCase.java | 22 +++++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) (limited to 'vespajlib') diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java index 7c835965a1a..e133407a967 100644 --- a/vespajlib/src/main/java/com/yahoo/text/Text.java +++ b/vespajlib/src/main/java/com/yahoo/text/Text.java @@ -177,8 +177,8 @@ public final class Text { */ public static String truncate(String s, int length) { if (s.length() <= length) return s; - if (length <= 4) return s.substring(0, length); - return s.substring(0, length - 4) + " ..."; + if (length <= 4) return safeSubstring(s, length); + return safeSubstring(s, length - 4) + " ..."; } public static String substringByCodepoints(String s, int fromCP, int toCP) { @@ -208,4 +208,14 @@ public final class Text { public static String format(String format, Object... args) { return String.format(Locale.US, format, args); } + + /** Like {@link String#substring(int, int)} from index 0 to {@code length}, but if this would split a surrogate pair at the end, the leading high surrogate is also cut. */ + public static String safeSubstring(String s, int length) { + boolean pairCut = 0 < length + && length < s.length() + && Character.isHighSurrogate(s.charAt(length - 1)) + && Character.isLowSurrogate(s.charAt(length)); + return s.substring(0, length - (pairCut ? 
1 : 0)); + } + } diff --git a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java index f192f678c13..b4324797086 100644 --- a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java +++ b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java @@ -82,6 +82,24 @@ public class TextTestCase { Text.substringByCodepoints(withSurrogates, 4, 8)); } + @Test + public void testSafeSubstring() { + String withSurrogates = "abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef"; + assertEquals("", Text.safeSubstring(withSurrogates, 0)); + assertEquals("a", Text.safeSubstring(withSurrogates, 1)); + assertEquals("ab", Text.safeSubstring(withSurrogates, 2)); + assertEquals("abc", Text.safeSubstring(withSurrogates, 3)); + assertEquals("abc", Text.safeSubstring(withSurrogates, 4)); + assertEquals("abc\uD83D\uDE48", Text.safeSubstring(withSurrogates, 5)); + assertEquals("abc\uD83D\uDE48", Text.safeSubstring(withSurrogates, 6)); + assertEquals("abc\uD83D\uDE48\uD83D\uDE49", Text.safeSubstring(withSurrogates, 7)); + assertEquals("abc\uD83D\uDE48\uD83D\uDE49", Text.safeSubstring(withSurrogates, 8)); + assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4A", Text.safeSubstring(withSurrogates, 9)); + assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Ad", Text.safeSubstring(withSurrogates, 10)); + assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Ade", Text.safeSubstring(withSurrogates, 11)); + assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", Text.safeSubstring(withSurrogates, 12)); + } + @Test public void testIsDisplayable() { assertTrue(Text.isDisplayable('A')); @@ -104,6 +122,8 @@ public class TextTestCase { assertEquals("", Text.truncate("ab", 0)); assertEquals("ab c", Text.truncate("ab cde", 4)); assertEquals("a ...", Text.truncate("ab cde", 5)); + assertEquals("abc\uD83D\uDE48 ...", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 9)); + assertEquals("abc\uD83D\uDE48 ...", 
Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 10)); } @Test @@ -152,6 +172,6 @@ public class TextTestCase { sum = benchmarkIsValid(strings, 100000000); diff = System.nanoTime() - start; System.out.println("Validation num isValid = " + sum + ". Took " + diff + "ns"); - } + } -- cgit v1.2.3