From bac8ab58a18d25db1871d3e933cb0cc018be5439 Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Fri, 29 Oct 2021 15:41:14 +0200 Subject: Use UTF-8 bytewise ordering for StringResultNode comparisons The C++ backend uses `memcmp` ordering of UTF-8 strings for its `StringResultNode` instances and expects the container to feed it nodes in the same order. However, the Java code used `String` internally, which compares UTF-16 codepoints instead of UTF-8 octets. These may not agree on the ordering, particularly in the presence of surrogate pairs. Java `StringResultNode` now uses a raw UTF-8 byte array as its value backing, which has the added benefit that (de-)serializing is effectively a no-op. Some extra `String` roundtrip work needed now to support the various type-erased `ResultNode` functionality, but this is not expected to be called in a hot path. --- .../searchlib/expression/StringResultNode.java | 85 +++++++++++++++------ .../searchlib/expression/ExpressionTestCase.java | 88 +++++++++++++++++++++- vespajlib/abi-spec.json | 1 + .../java/com/yahoo/vespa/objects/Identifiable.java | 9 ++- 4 files changed, 156 insertions(+), 27 deletions(-) diff --git a/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java b/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java index 6a84c7ff950..b90a5aadbb6 100644 --- a/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java +++ b/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java @@ -6,6 +6,8 @@ import com.yahoo.vespa.objects.Deserializer; import com.yahoo.vespa.objects.ObjectVisitor; import com.yahoo.vespa.objects.Serializer; +import java.util.Arrays; + /** * This result holds a string. * @@ -16,18 +18,20 @@ public class StringResultNode extends SingleResultNode { // The global class identifier shared with C++. public static final int classId = registerClass(0x4000 + 53, StringResultNode.class); - private static StringResultNode negativeInfinity = new StringResultNode(""); - private static PositiveInfinityResultNode positiveInfinity = new PositiveInfinityResultNode(); + private static final StringResultNode negativeInfinity = new StringResultNode(""); + private static final PositiveInfinityResultNode positiveInfinity = new PositiveInfinityResultNode(); + + private static final byte[] EMPTY_UTF8_ARRAY = new byte[0]; - // The string value of this node. - private String value; + // The string value of this node, in raw UTF-8 octets. + private byte[] utf8Value; /** * Constructs an empty result node. NOTE: This instance is broken until non-optional member data is set. */ public StringResultNode() { super(); - value = ""; + utf8Value = EMPTY_UTF8_ARRAY; } /** @@ -40,6 +44,19 @@ public class StringResultNode extends SingleResultNode { setValue(value); } + private StringResultNode(byte[] rawUtf8Value) { + super(); + utf8Value = rawUtf8Value; + } + + /** + * Creates a new StringResultNode backed by an underlying byte array. The input is + * presumed to be in valid UTF-8 format, but is _not_ checked for validity. + */ + protected static StringResultNode ofUncheckedUtf8Array(byte[] rawUtf8Value) { + return new StringResultNode(rawUtf8Value); + } + /** * Sets the value of this result. * @@ -50,7 +67,7 @@ public class StringResultNode extends SingleResultNode { if (value == null) { throw new IllegalArgumentException("Value can not be null."); } - this.value = value; + this.utf8Value = Utf8.toBytes(value); return this; } @@ -68,13 +85,14 @@ public class StringResultNode extends SingleResultNode { @Override protected void onDeserialize(Deserializer buf) { - value = getUtf8(buf); + // We expect the UTF-8 we get from the backend to be pre-checked and valid. + utf8Value = getRawUtf8Bytes(buf); } @Override public long getInteger() { try { - return Integer.valueOf(value); + return Integer.valueOf(getString()); } catch (java.lang.NumberFormatException e) { return 0; } @@ -83,7 +101,7 @@ public class StringResultNode extends SingleResultNode { @Override public double getFloat() { try { - return Double.valueOf(value); + return Double.valueOf(getString()); } catch (java.lang.NumberFormatException e) { return 0; } @@ -91,50 +109,53 @@ public class StringResultNode extends SingleResultNode { @Override public String getString() { - return value; + return Utf8.toString(utf8Value); } @Override public byte[] getRaw() { - return Utf8.toBytes(value); + return utf8Value; } @Override protected int onCmp(ResultNode rhs) { return (rhs instanceof PositiveInfinityResultNode) ? -1 - : value.compareTo(rhs.getString()); + : internalNonPositiveInfinityCompareTo(rhs); } @Override public int hashCode() { - return super.hashCode() + value.hashCode(); + return super.hashCode() + Arrays.hashCode(utf8Value); } @Override public void visitMembers(ObjectVisitor visitor) { super.visitMembers(visitor); - visitor.visit("value", value); + visitor.visit("value", getString()); } + @Override public void add(ResultNode rhs) { - value += rhs.getString(); + setValue(getString() + rhs.getString()); } + @Override public void min(ResultNode rhs) { - if (value.compareTo(rhs.getString()) > 0) { - value = rhs.getString(); + if (internalNonPositiveInfinityCompareTo(rhs) > 0) { + set(rhs); } } + @Override public void max(ResultNode rhs) { - if (value.compareTo(rhs.getString()) < 0) { - value = rhs.getString(); + if (internalNonPositiveInfinityCompareTo(rhs) < 0) { + set(rhs); } } public void append(ResultNode rhs) { - value += rhs.getString(); + setValue(getString() + rhs.getString()); } @Override @@ -144,16 +165,34 @@ public class StringResultNode extends SingleResultNode { @Override public void set(ResultNode rhs) { - value = rhs.getString(); + if (rhs instanceof StringResultNode) { + utf8Value = ((StringResultNode) rhs).utf8Value; + } else { + setValue(rhs.getString()); + } } @Override public void negate() { - char a[] = value.toCharArray(); + char[] a = getString().toCharArray(); for (int i = 0; i < a.length; i++) { a[i] = (char)-a[i]; } - value = new String(a); + setValue(new String(a)); + } + + private int internalNonPositiveInfinityCompareTo(ResultNode rhs) { + // Note: this may not necessarily be well-defined _semantically_ unless rhs is + // also a StringResultNode. The C++ implementation explicitly expects rhs to be + // such an instance, but this depends on a classId check that is _not_ done in + // the Java implementation... + // We use getString() instead of getRaw() to support implicit stringification + // (legacy Java implementation behavior), but it's not given that this is always + // the desired outcome. + var rhsAsUtf8 = (rhs instanceof StringResultNode) + ? ((StringResultNode)rhs).utf8Value + : Utf8.toBytes(rhs.getString()); + return Arrays.compareUnsigned(utf8Value, rhsAsUtf8); } /** diff --git a/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java b/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java index c5adfed4974..6e034cce652 100644 --- a/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java +++ b/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java @@ -705,6 +705,21 @@ public class ExpressionTestCase { assertEquals(0, large.compareTo(large)); } + private void compareAllPairwise(ResultNode... orderedNodes) { + for (int i = 0; i < orderedNodes.length; ++i) { + for (int j = 0; j < orderedNodes.length; ++j) { + var ctx = String.format("lhs(i=%d): %s, rhs(j=%d): %s", i, orderedNodes[i], j, orderedNodes[j]); + if (j < i) { + assertTrue(ctx, orderedNodes[i].compareTo(orderedNodes[j]) > 0); + } else if (j > i) { + assertTrue(ctx, orderedNodes[i].compareTo(orderedNodes[j]) < 0); + } else { // j == i + assertEquals(ctx, orderedNodes[i].compareTo(orderedNodes[j]), 0); + } + } + } + } + @Test public void testStringResultNode() { try { @@ -719,16 +734,87 @@ public class ExpressionTestCase { } catch (IllegalArgumentException e) { // expected } - StringResultNode a = new StringResultNode("7.3"); + StringResultNode a = new StringResultNode("42"); + assertEquals(a.getInteger(), 42); + assertEquals(a.getFloat(), 42, delta); + assertEquals(a.getString(), "42"); + + a = new StringResultNode("7.3"); assertEquals(a.getInteger(), 0); assertEquals(a.getFloat(), 7.3, delta); assertEquals(a.getString(), "7.3"); + byte[] raw = a.getRaw(); assertEquals(raw.length, 3); + assertEquals(Arrays.compareUnsigned(raw, Utf8.toBytes("7.3")), 0); + assertResultNode(a); compare(new StringResultNode(), new StringResultNode("z"), new StringResultNode("zz")); compare(new StringResultNode("z"), new StringResultNode("zz"), new StringResultNode("zzz")); compare(new StringResultNode("a"), new StringResultNode("zz"), new PositiveInfinityResultNode()); + assertEquals(new StringResultNode("zz").compareTo(new StringResultNode("zz")), 0); + + // Remove this assertion if we remove support for implicit stringification + compare(new StringResultNode(), new StringResultNode("12"), new IntegerResultNode(13)); + + var node = new StringResultNode("foo"); + node.append(new StringResultNode("bar")); + assertEquals(node.getString(), "foobar"); + node.add(new StringResultNode("baz")); + assertEquals(node.getString(), "foobarbaz"); + + node.set(new StringResultNode("ABC")); + assertEquals(node.getString(), "ABC"); + + node.max(new StringResultNode("ABA")); + assertEquals(node.getString(), "ABC"); + + node.max(new StringResultNode("ABD")); + assertEquals(node.getString(), "ABD"); + + node.min(new StringResultNode("ABE")); + assertEquals(node.getString(), "ABD"); + + node.min(new StringResultNode("ABC")); + assertEquals(node.getString(), "ABC"); + + node.set(new IntegerResultNode(1234)); + assertEquals(node.getString(), "1234"); + } + + @Test + public void string_result_node_comparison_is_memcmp_unsigned_byte_ordered() { + // Comparison is first by shared prefix (if equal), then ties are resolved by checking length. + // This is not actually valid UTF-8, so we depend on unchecked-ness... + compareAllPairwise(StringResultNode.ofUncheckedUtf8Array(new byte[] { 0 }), + StringResultNode.ofUncheckedUtf8Array(new byte[] { 0, 0 }), + StringResultNode.ofUncheckedUtf8Array(new byte[] { 0, 0, 1 }), + StringResultNode.ofUncheckedUtf8Array(new byte[] { 1 }), + StringResultNode.ofUncheckedUtf8Array(new byte[] { 127 }), // 0x7F + StringResultNode.ofUncheckedUtf8Array(new byte[] { -128 }), // 0x80 + StringResultNode.ofUncheckedUtf8Array(new byte[] { -1 }), // 0xFF + StringResultNode.ofUncheckedUtf8Array(new byte[] { -1, 0 })); + } + + // Ensure that the ordering of string nodes on the container matches that of the C++ backend. + // The backend directly orders on the memcmp order of UTF-8 strings, so the container must do the same. + // In particular, this means that UTF-16 java.lang.String sorting is a no-go. + @Test + public void string_result_node_comparison_is_utf8_byte_ordered() { + // UTF-8 representation of "fried shrimp" emoji U+1F364 which requires a UTF-16 surrogate pair (2 chars) + // Its UTF-16 representation is D83C DF64 + var surrogateUtf8 = new byte[] { -16, -97, -115, -92 }; // F0 9F 8D A4 + // UTF-8 representation of U+F0BA, which exists in the Unicode private usage range. Only requires 1 UTF-16 char. + // Its UTF-16 representation is F0BA + var nonSurrogateUtf8 = new byte[] { -17, -126, -70 }; // EF 82 BA + + var s1 = new StringResultNode(Utf8.toString(surrogateUtf8)); + var s2 = new StringResultNode(Utf8.toString(nonSurrogateUtf8)); + // UTF-8: s2 < s1, since 0xEF < 0xF0 + // UTF-16: s1 < s2, since 0xD83C < 0xF0BA + // Assert that ordering is defined by UTF-8, not (surrogated) UTF-16. + assertTrue(s2.compareTo(s1) < 0); + assertTrue(s1.compareTo(s2) > 0); } @Test diff --git a/vespajlib/abi-spec.json b/vespajlib/abi-spec.json index c426195bc37..5eeee267cf6 100644 --- a/vespajlib/abi-spec.json +++ b/vespajlib/abi-spec.json @@ -3428,6 +3428,7 @@ "protected static com.yahoo.vespa.objects.Identifiable deserializeOptional(com.yahoo.vespa.objects.Deserializer)", "protected static boolean equals(java.lang.Object, java.lang.Object)", "public void visitMembers(com.yahoo.vespa.objects.ObjectVisitor)", + "protected static byte[] getRawUtf8Bytes(com.yahoo.vespa.objects.Deserializer)", "protected java.lang.String getUtf8(com.yahoo.vespa.objects.Deserializer)", "protected void putUtf8(com.yahoo.vespa.objects.Serializer, java.lang.String)", "public bridge synthetic java.lang.Object clone()" diff --git a/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java b/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java index 8c11a0cbda1..947b312ac3b 100644 --- a/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java +++ b/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java @@ -354,10 +354,13 @@ public class Identifiable extends Selectable implements Cloneable { } } - protected String getUtf8(Deserializer buf) { + protected static byte[] getRawUtf8Bytes(Deserializer buf) { int len = buf.getInt(null); - byte[] arr = buf.getBytes(null, len); - return Utf8.toString(arr); + return buf.getBytes(null, len); + } + + protected String getUtf8(Deserializer buf) { + return Utf8.toString(getRawUtf8Bytes(buf)); } protected void putUtf8(Serializer buf, String val) { -- cgit v1.2.3