aboutsummaryrefslogtreecommitdiffstats
path: root/vespajlib/src
diff options
context:
space:
mode:
author jonmv <venstad@gmail.com> 2023-10-20 12:06:12 +0200
committer jonmv <venstad@gmail.com> 2023-10-20 12:06:12 +0200
commit dc809366178df3cb3fd0a303a097b333cb4dbac6 (patch)
tree b197adbb0943a5cfe9f80e4c2e90d4df928a740b /vespajlib/src
parent b87b0db14a2078a3c60da99aad498ed62b2bf2db (diff)
Avoid cutting surrogate pairs when tokenising
Diffstat (limited to 'vespajlib/src')
-rw-r--r-- vespajlib/src/main/java/com/yahoo/text/Text.java 14
-rw-r--r-- vespajlib/src/test/java/com/yahoo/text/TextTestCase.java 22
2 files changed, 33 insertions, 3 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java
index 7c835965a1a..e133407a967 100644
--- a/vespajlib/src/main/java/com/yahoo/text/Text.java
+++ b/vespajlib/src/main/java/com/yahoo/text/Text.java
@@ -177,8 +177,8 @@ public final class Text {
*/
public static String truncate(String s, int length) {
if (s.length() <= length) return s;
- if (length <= 4) return s.substring(0, length);
- return s.substring(0, length - 4) + " ...";
+ if (length <= 4) return safeSubstring(s, length); // too short to fit the 4-char ellipsis suffix: return a bare prefix
+ return safeSubstring(s, length - 4) + " ..."; // reserve 4 chars for the suffix; safeSubstring avoids splitting a surrogate pair at the cut
}
public static String substringByCodepoints(String s, int fromCP, int toCP) {
@@ -208,4 +208,14 @@ public final class Text {
public static String format(String format, Object... args) {
return String.format(Locale.US, format, args);
}
+
+ /** Like {@link String#substring(int, int)} from index 0 to {@code length}, but if the cut would fall between the two halves of a surrogate pair, the dangling high surrogate is dropped as well. */
+ public static String safeSubstring(String s, int length) {
+ boolean pairCut = 0 < length
+ && length < s.length()
+ && Character.isHighSurrogate(s.charAt(length - 1))
+ && Character.isLowSurrogate(s.charAt(length));
+ return s.substring(0, length - (pairCut ? 1 : 0)); // end one char earlier when the cut would split a pair
+ }
+
}
diff --git a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
index f192f678c13..b4324797086 100644
--- a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
+++ b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
@@ -83,6 +83,24 @@ public class TextTestCase {
}
@Test
+ public void testSafeSubstring() {
+ String withSurrogates = "abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef"; // three surrogate pairs occupy char indices 3-8
+ assertEquals("", Text.safeSubstring(withSurrogates, 0));
+ assertEquals("a", Text.safeSubstring(withSurrogates, 1));
+ assertEquals("ab", Text.safeSubstring(withSurrogates, 2));
+ assertEquals("abc", Text.safeSubstring(withSurrogates, 3));
+ assertEquals("abc", Text.safeSubstring(withSurrogates, 4)); // cut at 4 falls mid-pair: the dangling high surrogate is dropped
+ assertEquals("abc\uD83D\uDE48", Text.safeSubstring(withSurrogates, 5)); // cut at 5 lands on a pair boundary: the pair is kept whole
+ assertEquals("abc\uD83D\uDE48", Text.safeSubstring(withSurrogates, 6));
+ assertEquals("abc\uD83D\uDE48\uD83D\uDE49", Text.safeSubstring(withSurrogates, 7));
+ assertEquals("abc\uD83D\uDE48\uD83D\uDE49", Text.safeSubstring(withSurrogates, 8));
+ assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4A", Text.safeSubstring(withSurrogates, 9));
+ assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Ad", Text.safeSubstring(withSurrogates, 10));
+ assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Ade", Text.safeSubstring(withSurrogates, 11));
+ assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", Text.safeSubstring(withSurrogates, 12));
+ }
+
+ @Test
public void testIsDisplayable() {
assertTrue(Text.isDisplayable('A'));
assertTrue(Text.isDisplayable('a'));
@@ -104,6 +122,8 @@ public class TextTestCase {
assertEquals("", Text.truncate("ab", 0));
assertEquals("ab c", Text.truncate("ab cde", 4));
assertEquals("a ...", Text.truncate("ab cde", 5));
+ assertEquals("abc\uD83D\uDE48 ...", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 9));
+ assertEquals("abc\uD83D\uDE48 ...", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 10));
}
@Test
@@ -152,6 +172,6 @@ public class TextTestCase {
sum = benchmarkIsValid(strings, 100000000);
diff = System.nanoTime() - start;
System.out.println("Validation num isValid = " + sum + ". Took " + diff + "ns");
-
}
+
}