summaryrefslogtreecommitdiffstats
path: root/vespajlib
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2023-09-15 11:30:12 +0200
committerHenning Baldersheim <balder@yahoo-inc.com>2023-09-15 11:36:08 +0200
commit4875240fd78357fafa8ea25a14a67298333506ca (patch)
treec743ac2d8e744e539533081775219d670c8fcfcf /vespajlib
parent15a597984592cef5fab77c9a72a3b04a356a4943 (diff)
- Add utility to do substring extraction by codepoints, instead of java char index.
- Test and use it in SubstringExpression in indexing language.
Diffstat (limited to 'vespajlib')
-rw-r--r--vespajlib/src/main/java/com/yahoo/text/Text.java19
-rw-r--r--vespajlib/src/test/java/com/yahoo/text/TextTestCase.java31
2 files changed, 50 insertions, 0 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java
index adf91a9b21e..2f0051d4795 100644
--- a/vespajlib/src/main/java/com/yahoo/text/Text.java
+++ b/vespajlib/src/main/java/com/yahoo/text/Text.java
@@ -182,6 +182,25 @@ public final class Text {
return s.substring(0, length - 4) + " ...";
}
+ public static String substringByCodepoints(String s, int fromCP, int toCP) {
+ int len = s.length();
+ if ((fromCP >= len) || (fromCP >= toCP)) return "";
+
+ int from = s.offsetByCodePoints(0, fromCP);
+ if (from >= len) return "";
+ int lenCP = toCP - fromCP;
+ if (from + lenCP >= len) return s.substring(from);
+
+ try {
+ int to = s.offsetByCodePoints(from, toCP - fromCP);
+ return (to >= len)
+ ? s.substring(from)
+ : s.substring(from, to);
+ } catch (IndexOutOfBoundsException e) {
+ return s.substring(from);
+ }
+ }
+
/**
 * Formats {@code args} according to {@code format} by delegating to
 * {@link String#format(Locale, String, Object...)} with {@link Locale#US}.
 *
 * @param format the format string
 * @param args   the arguments referenced by the format specifiers
 * @return the formatted string
 */
public static String format(String format, Object... args) {
    final Locale locale = Locale.US;
    return String.format(locale, format, args);
}
diff --git a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
index 033918f0bad..2639882230f 100644
--- a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
+++ b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
@@ -50,6 +50,37 @@ public class TextTestCase {
validateText(OptionalInt.empty(), new StringBuilder().appendCodePoint(0xD800).appendCodePoint(0xDC00).toString());
}
+ static private String fromCP(String prefix, int [] codePoints, String suffix) {
+ StringBuilder sb = new StringBuilder(prefix);
+ for (int cp : codePoints) {
+ sb.appendCodePoint(cp);
+ }
+ sb.append(suffix);
+ return sb.toString();
+ }
+
+ @Test
+ public void testSubstringByCodePoint() {
+ assertEquals("", Text.substringByCodepoints("", 0, 0));
+ assertEquals("", Text.substringByCodepoints("abcdef", 0, 0));
+ assertEquals("", Text.substringByCodepoints("abcdef", 3, 3));
+ assertEquals("", Text.substringByCodepoints("abcdef", 3, 2));
+ assertEquals("", Text.substringByCodepoints("abcdef", 7, 9));
+ assertEquals("abcdef", Text.substringByCodepoints("abcdef", 0, 9));
+ assertEquals("a", Text.substringByCodepoints("abcdef", 0, 1));
+ assertEquals("cd", Text.substringByCodepoints("abcdef", 2, 4));
+
+ String withSurrogates = fromCP("abc", new int[]{0x10F000, 0x10F001, 0x10F002}, "def");
+ assertEquals(withSurrogates, Text.substringByCodepoints(withSurrogates, 0, 11));
+ assertEquals(withSurrogates, Text.substringByCodepoints(withSurrogates, 0, 20));
+ assertEquals(fromCP("bc", new int[]{0x10F000, 0x10F001}, ""),
+ Text.substringByCodepoints(withSurrogates, 1, 5));
+ assertEquals(fromCP("", new int[]{0x10F001}, ""),
+ Text.substringByCodepoints(withSurrogates, 4, 5));
+ assertEquals(fromCP("", new int[]{0x10F001, 0x10F002}, "de"),
+ Text.substringByCodepoints(withSurrogates, 4, 8));
+ }
+
@Test
public void testIsDisplayable() {
assertTrue(Text.isDisplayable('A'));