diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-09-15 11:30:12 +0200 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2023-09-15 11:36:08 +0200 |
commit | 4875240fd78357fafa8ea25a14a67298333506ca (patch) | |
tree | c743ac2d8e744e539533081775219d670c8fcfcf /vespajlib | |
parent | 15a597984592cef5fab77c9a72a3b04a356a4943 (diff) |
- Add utility to do substring extraction by codepoints, instead of java char index.
- Test and use it in SubstringExpression in indexing language.
Diffstat (limited to 'vespajlib')
-rw-r--r-- | vespajlib/src/main/java/com/yahoo/text/Text.java | 19 | ||||
-rw-r--r-- | vespajlib/src/test/java/com/yahoo/text/TextTestCase.java | 31 |
2 files changed, 50 insertions, 0 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java index adf91a9b21e..2f0051d4795 100644 --- a/vespajlib/src/main/java/com/yahoo/text/Text.java +++ b/vespajlib/src/main/java/com/yahoo/text/Text.java @@ -182,6 +182,25 @@ public final class Text { return s.substring(0, length - 4) + " ..."; } + public static String substringByCodepoints(String s, int fromCP, int toCP) { + int len = s.length(); + if ((fromCP >= len) || (fromCP >= toCP)) return ""; + + int from = s.offsetByCodePoints(0, fromCP); + if (from >= len) return ""; + int lenCP = toCP - fromCP; + if (from + lenCP >= len) return s.substring(from); + + try { + int to = s.offsetByCodePoints(from, toCP - fromCP); + return (to >= len) + ? s.substring(from) + : s.substring(from, to); + } catch (IndexOutOfBoundsException e) { + return s.substring(from); + } + } + public static String format(String format, Object... args) { return String.format(Locale.US, format, args); } diff --git a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java index 033918f0bad..2639882230f 100644 --- a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java +++ b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java @@ -50,6 +50,37 @@ public class TextTestCase { validateText(OptionalInt.empty(), new StringBuilder().appendCodePoint(0xD800).appendCodePoint(0xDC00).toString()); } + static private String fromCP(String prefix, int [] codePoints, String suffix) { + StringBuilder sb = new StringBuilder(prefix); + for (int cp : codePoints) { + sb.appendCodePoint(cp); + } + sb.append(suffix); + return sb.toString(); + } + + @Test + public void testSubstringByCodePoint() { + assertEquals("", Text.substringByCodepoints("", 0, 0)); + assertEquals("", Text.substringByCodepoints("abcdef", 0, 0)); + assertEquals("", Text.substringByCodepoints("abcdef", 3, 3)); + assertEquals("", Text.substringByCodepoints("abcdef", 3, 2)); + 
assertEquals("", Text.substringByCodepoints("abcdef", 7, 9)); + assertEquals("abcdef", Text.substringByCodepoints("abcdef", 0, 9)); + assertEquals("a", Text.substringByCodepoints("abcdef", 0, 1)); + assertEquals("cd", Text.substringByCodepoints("abcdef", 2, 4)); + + String withSurrogates = fromCP("abc", new int[]{0x10F000, 0x10F001, 0x10F002}, "def"); + assertEquals(withSurrogates, Text.substringByCodepoints(withSurrogates, 0, 11)); + assertEquals(withSurrogates, Text.substringByCodepoints(withSurrogates, 0, 20)); + assertEquals(fromCP("bc", new int[]{0x10F000, 0x10F001}, ""), + Text.substringByCodepoints(withSurrogates, 1, 5)); + assertEquals(fromCP("", new int[]{0x10F001}, ""), + Text.substringByCodepoints(withSurrogates, 4, 5)); + assertEquals(fromCP("", new int[]{0x10F001, 0x10F002}, "de"), + Text.substringByCodepoints(withSurrogates, 4, 8)); + } + @Test public void testIsDisplayable() { assertTrue(Text.isDisplayable('A')); |