diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-09-15 11:30:12 +0200 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2023-09-15 11:36:08 +0200 |
commit | 4875240fd78357fafa8ea25a14a67298333506ca (patch) | |
tree | c743ac2d8e744e539533081775219d670c8fcfcf | |
parent | 15a597984592cef5fab77c9a72a3b04a356a4943 (diff) |
- Add utility to do substring extraction by codepoints, instead of java char index.
- Test and use it in SubstringExpression in the indexing language.
4 files changed, 56 insertions, 12 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/SubstringExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/SubstringExpression.java index 6f96a215edb..1f9a341519e 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/SubstringExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/SubstringExpression.java @@ -3,6 +3,7 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; import com.yahoo.document.datatypes.StringFieldValue; +import com.yahoo.text.Text; /** * @author Simon Thoresen Hult @@ -32,15 +33,8 @@ public final class SubstringExpression extends Expression { @Override protected void doExecute(ExecutionContext context) { String input = String.valueOf(context.getValue()); - int len = input.length(); - if (from >= len) { - input = ""; - } else if (to >= len) { - input = input.substring(from); - } else { - input = input.substring(from, to); - } - context.setValue(new StringFieldValue(input)); + String substring = Text.substringByCodepoints(input, from, to); + context.setValue(new StringFieldValue(substring)); } @Override diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/SubstringTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/SubstringTestCase.java index f6c5de398a0..f6bed7831ae 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/SubstringTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/SubstringTestCase.java @@ -26,9 +26,9 @@ public class SubstringTestCase { @Test public void requireThatHashCodeAndEqualsAreImplemented() { Expression exp = new SubstringExpression(6, 9); - assertFalse(exp.equals(new Object())); - assertFalse(exp.equals(new SubstringExpression(66, 99))); - assertFalse(exp.equals(new 
SubstringExpression(6, 99))); + assertNotEquals(exp, new Object()); + assertNotEquals(exp, new SubstringExpression(66, 99)); + assertNotEquals(exp, new SubstringExpression(6, 99)); assertEquals(exp, new SubstringExpression(6, 9)); assertEquals(exp.hashCode(), new SubstringExpression(6, 9).hashCode()); } diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java index adf91a9b21e..2f0051d4795 100644 --- a/vespajlib/src/main/java/com/yahoo/text/Text.java +++ b/vespajlib/src/main/java/com/yahoo/text/Text.java @@ -182,6 +182,25 @@ public final class Text { return s.substring(0, length - 4) + " ..."; } + public static String substringByCodepoints(String s, int fromCP, int toCP) { + int len = s.length(); + if ((fromCP >= len) || (fromCP >= toCP)) return ""; + + int from = s.offsetByCodePoints(0, fromCP); + if (from >= len) return ""; + int lenCP = toCP - fromCP; + if (from + lenCP >= len) return s.substring(from); + + try { + int to = s.offsetByCodePoints(from, toCP - fromCP); + return (to >= len) + ? s.substring(from) + : s.substring(from, to); + } catch (IndexOutOfBoundsException e) { + return s.substring(from); + } + } + public static String format(String format, Object... 
args) { return String.format(Locale.US, format, args); } diff --git a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java index 033918f0bad..2639882230f 100644 --- a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java +++ b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java @@ -50,6 +50,37 @@ public class TextTestCase { validateText(OptionalInt.empty(), new StringBuilder().appendCodePoint(0xD800).appendCodePoint(0xDC00).toString()); } + static private String fromCP(String prefix, int [] codePoints, String suffix) { + StringBuilder sb = new StringBuilder(prefix); + for (int cp : codePoints) { + sb.appendCodePoint(cp); + } + sb.append(suffix); + return sb.toString(); + } + + @Test + public void testSubstringByCodePoint() { + assertEquals("", Text.substringByCodepoints("", 0, 0)); + assertEquals("", Text.substringByCodepoints("abcdef", 0, 0)); + assertEquals("", Text.substringByCodepoints("abcdef", 3, 3)); + assertEquals("", Text.substringByCodepoints("abcdef", 3, 2)); + assertEquals("", Text.substringByCodepoints("abcdef", 7, 9)); + assertEquals("abcdef", Text.substringByCodepoints("abcdef", 0, 9)); + assertEquals("a", Text.substringByCodepoints("abcdef", 0, 1)); + assertEquals("cd", Text.substringByCodepoints("abcdef", 2, 4)); + + String withSurrogates = fromCP("abc", new int[]{0x10F000, 0x10F001, 0x10F002}, "def"); + assertEquals(withSurrogates, Text.substringByCodepoints(withSurrogates, 0, 11)); + assertEquals(withSurrogates, Text.substringByCodepoints(withSurrogates, 0, 20)); + assertEquals(fromCP("bc", new int[]{0x10F000, 0x10F001}, ""), + Text.substringByCodepoints(withSurrogates, 1, 5)); + assertEquals(fromCP("", new int[]{0x10F001}, ""), + Text.substringByCodepoints(withSurrogates, 4, 5)); + assertEquals(fromCP("", new int[]{0x10F001, 0x10F002}, "de"), + Text.substringByCodepoints(withSurrogates, 4, 8)); + } + @Test public void testIsDisplayable() { assertTrue(Text.isDisplayable('A')); 
|