From bac8ab58a18d25db1871d3e933cb0cc018be5439 Mon Sep 17 00:00:00 2001
From: Tor Brede Vekterli <vekterli@yahooinc.com>
Date: Fri, 29 Oct 2021 15:41:14 +0200
Subject: Use UTF-8 bytewise ordering for StringResultNode comparisons

The C++ backend uses `memcmp` ordering of UTF-8 strings for its
`StringResultNode` instances and expects the container to feed it
nodes in the same order. However, the Java code used `String` internally,
which compares UTF-16 codepoints instead of UTF-8 octets. These
may not agree on the ordering, particularly in the presence of
surrogate pairs.

Java `StringResultNode` now uses a raw UTF-8 byte array as its value
backing, which has the added benefit that (de-)serializing is
effectively a no-op. Some extra `String` roundtrip work needed now
to support the various type-erased `ResultNode` functionality, but
this is not expected to be called in a hot path.
---
 .../searchlib/expression/StringResultNode.java     | 85 +++++++++++++++------
 .../searchlib/expression/ExpressionTestCase.java   | 88 +++++++++++++++++++++-
 vespajlib/abi-spec.json                            |  1 +
 .../java/com/yahoo/vespa/objects/Identifiable.java |  9 ++-
 4 files changed, 156 insertions(+), 27 deletions(-)
diff --git a/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java b/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java
index 6a84c7ff950..b90a5aadbb6 100644
--- a/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java
+++ b/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java
@@ -6,6 +6,8 @@ import com.yahoo.vespa.objects.Deserializer;
 import com.yahoo.vespa.objects.ObjectVisitor;
 import com.yahoo.vespa.objects.Serializer;
 
+import java.util.Arrays;
+
 /**
  * This result holds a string.
  *
@@ -16,18 +18,20 @@ public class StringResultNode extends SingleResultNode {
 
     // The global class identifier shared with C++.
     public static final int classId = registerClass(0x4000 + 53, StringResultNode.class);
-    private static StringResultNode negativeInfinity = new StringResultNode("");
-    private static PositiveInfinityResultNode positiveInfinity = new PositiveInfinityResultNode();
+    private static final StringResultNode negativeInfinity = new StringResultNode("");
+    private static final PositiveInfinityResultNode positiveInfinity = new PositiveInfinityResultNode();
+
+    private static final byte[] EMPTY_UTF8_ARRAY = new byte[0];
 
-    // The string value of this node.
-    private String value;
+    // The string value of this node, in raw UTF-8 octets.
+    private byte[] utf8Value;
 
     /**
      * Constructs an empty result node. <b>NOTE:</b> This instance is broken until non-optional member data is set.
      */
     public StringResultNode() {
         super();
-        value = "";
+        utf8Value = EMPTY_UTF8_ARRAY;
     }
 
     /**
@@ -40,6 +44,19 @@ public class StringResultNode extends SingleResultNode {
         setValue(value);
     }
 
+    private StringResultNode(byte[] rawUtf8Value) {
+        super();
+        utf8Value = rawUtf8Value;
+    }
+
+    /**
+     * Creates a new StringResultNode backed by an underlying byte array. The input is
+     * presumed to be in valid UTF-8 format, but is _not_ checked for validity.
+     */
+    protected static StringResultNode ofUncheckedUtf8Array(byte[] rawUtf8Value) {
+        return new StringResultNode(rawUtf8Value);
+    }
+
     /**
      * Sets the value of this result.
      *
@@ -50,7 +67,7 @@ public class StringResultNode extends SingleResultNode {
         if (value == null) {
             throw new IllegalArgumentException("Value can not be null.");
         }
-        this.value = value;
+        this.utf8Value = Utf8.toBytes(value);
         return this;
     }
 
@@ -68,13 +85,14 @@ public class StringResultNode extends SingleResultNode {
 
     @Override
     protected void onDeserialize(Deserializer buf) {
-        value = getUtf8(buf);
+        // We expect the UTF-8 we get from the backend to be pre-checked and valid.
+        utf8Value = getRawUtf8Bytes(buf);
     }
 
     @Override
     public long getInteger() {
         try {
-            return Integer.valueOf(value);
+            return Integer.valueOf(getString());
         } catch (java.lang.NumberFormatException e) {
             return 0;
         }
@@ -83,7 +101,7 @@ public class StringResultNode extends SingleResultNode {
     @Override
     public double getFloat() {
         try {
-            return Double.valueOf(value);
+            return Double.valueOf(getString());
         } catch (java.lang.NumberFormatException e) {
             return 0;
         }
@@ -91,50 +109,53 @@ public class StringResultNode extends SingleResultNode {
 
     @Override
     public String getString() {
-        return value;
+        return Utf8.toString(utf8Value);
     }
 
     @Override
     public byte[] getRaw() {
-        return Utf8.toBytes(value);
+        return utf8Value;
     }
 
     @Override
     protected int onCmp(ResultNode rhs) {
         return (rhs instanceof PositiveInfinityResultNode)
                ? -1
-               : value.compareTo(rhs.getString());
+               : internalNonPositiveInfinityCompareTo(rhs);
     }
 
     @Override
     public int hashCode() {
-        return super.hashCode() + value.hashCode();
+        return super.hashCode() + Arrays.hashCode(utf8Value);
     }
 
     @Override
     public void visitMembers(ObjectVisitor visitor) {
         super.visitMembers(visitor);
-        visitor.visit("value", value);
+        visitor.visit("value", getString());
     }
 
+    @Override
     public void add(ResultNode rhs) {
-        value += rhs.getString();
+        setValue(getString() + rhs.getString());
     }
 
+    @Override
     public void min(ResultNode rhs) {
-        if (value.compareTo(rhs.getString()) > 0) {
-            value = rhs.getString();
+        if (internalNonPositiveInfinityCompareTo(rhs) > 0) {
+            set(rhs);
         }
     }
 
+    @Override
     public void max(ResultNode rhs) {
-        if (value.compareTo(rhs.getString()) < 0) {
-            value = rhs.getString();
+        if (internalNonPositiveInfinityCompareTo(rhs) < 0) {
+            set(rhs);
         }
     }
 
     public void append(ResultNode rhs) {
-        value += rhs.getString();
+        setValue(getString() + rhs.getString());
     }
 
     @Override
@@ -144,16 +165,34 @@ public class StringResultNode extends SingleResultNode {
 
     @Override
     public void set(ResultNode rhs) {
-        value = rhs.getString();
+        if (rhs instanceof StringResultNode) {
+            utf8Value = ((StringResultNode) rhs).utf8Value;
+        } else {
+            setValue(rhs.getString());
+        }
     }
 
     @Override
     public void negate() {
-        char a[] = value.toCharArray();
+        char[] a = getString().toCharArray();
         for (int i = 0; i < a.length; i++) {
             a[i] = (char)-a[i];
         }
-        value = new String(a);
+        setValue(new String(a));
+    }
+
+    private int internalNonPositiveInfinityCompareTo(ResultNode rhs) {
+        // Note: this may not necessarily be well-defined _semantically_ unless rhs is
+        // also a StringResultNode. The C++ implementation explicitly expects rhs to be
+        // such an instance, but this depends on a classId check that is _not_ done in
+        // the Java implementation...
+        // We use getString() instead of getRaw() to support implicit stringification
+        // (legacy Java implementation behavior), but it's not given that this is always
+        // the desired outcome.
+        var rhsAsUtf8 = (rhs instanceof StringResultNode)
+                        ? ((StringResultNode)rhs).utf8Value
+                        : Utf8.toBytes(rhs.getString());
+        return Arrays.compareUnsigned(utf8Value, rhsAsUtf8);
     }
 
     /**
diff --git a/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java b/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java
index c5adfed4974..6e034cce652 100644
--- a/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java
+++ b/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java
@@ -705,6 +705,21 @@ public class ExpressionTestCase {
         assertEquals(0, large.compareTo(large));
     }
 
+    private void compareAllPairwise(ResultNode... orderedNodes) {
+        for (int i = 0; i < orderedNodes.length; ++i) {
+            for (int j = 0; j < orderedNodes.length; ++j) {
+                var ctx = String.format("lhs(i=%d): %s, rhs(j=%d): %s", i, orderedNodes[i], j, orderedNodes[j]);
+                if (j < i) {
+                    assertTrue(ctx, orderedNodes[i].compareTo(orderedNodes[j]) > 0);
+                } else if (j > i) {
+                    assertTrue(ctx, orderedNodes[i].compareTo(orderedNodes[j]) < 0);
+                } else { // j == i
+                    assertEquals(ctx, orderedNodes[i].compareTo(orderedNodes[j]), 0);
+                }
+            }
+        }
+    }
+
     @Test
     public void testStringResultNode() {
         try {
@@ -719,16 +734,87 @@ public class ExpressionTestCase {
         } catch (IllegalArgumentException e) {
             // expected
         }
-        StringResultNode a = new StringResultNode("7.3");
+        StringResultNode a = new StringResultNode("42");
+        assertEquals(a.getInteger(), 42);
+        assertEquals(a.getFloat(), 42, delta);
+        assertEquals(a.getString(), "42");
+
+        a = new StringResultNode("7.3");
         assertEquals(a.getInteger(), 0);
         assertEquals(a.getFloat(), 7.3, delta);
         assertEquals(a.getString(), "7.3");
+
         byte[] raw = a.getRaw();
         assertEquals(raw.length, 3);
+        assertEquals(Arrays.compareUnsigned(raw, Utf8.toBytes("7.3")), 0);
+
         assertResultNode(a);
         compare(new StringResultNode(), new StringResultNode("z"), new StringResultNode("zz"));
         compare(new StringResultNode("z"), new StringResultNode("zz"), new StringResultNode("zzz"));
         compare(new StringResultNode("a"), new StringResultNode("zz"), new PositiveInfinityResultNode());
+        assertEquals(new StringResultNode("zz").compareTo(new StringResultNode("zz")), 0);
+
+        // Remove this assertion if we remove support for implicit stringification
+        compare(new StringResultNode(), new StringResultNode("12"), new IntegerResultNode(13));
+
+        var node = new StringResultNode("foo");
+        node.append(new StringResultNode("bar"));
+        assertEquals(node.getString(), "foobar");
+        node.add(new StringResultNode("baz"));
+        assertEquals(node.getString(), "foobarbaz");
+
+        node.set(new StringResultNode("ABC"));
+        assertEquals(node.getString(), "ABC");
+
+        node.max(new StringResultNode("ABA"));
+        assertEquals(node.getString(), "ABC");
+
+        node.max(new StringResultNode("ABD"));
+        assertEquals(node.getString(), "ABD");
+
+        node.min(new StringResultNode("ABE"));
+        assertEquals(node.getString(), "ABD");
+
+        node.min(new StringResultNode("ABC"));
+        assertEquals(node.getString(), "ABC");
+
+        node.set(new IntegerResultNode(1234));
+        assertEquals(node.getString(), "1234");
+    }
+
+    @Test
+    public void string_result_node_comparison_is_memcmp_unsigned_byte_ordered() {
+        // Comparison is first by shared prefix (if equal), then ties are resolved by checking length.
+        // This is not actually valid UTF-8, so we depend on unchecked-ness...
+        compareAllPairwise(StringResultNode.ofUncheckedUtf8Array(new byte[] { 0 }),
+                           StringResultNode.ofUncheckedUtf8Array(new byte[] { 0, 0 }),
+                           StringResultNode.ofUncheckedUtf8Array(new byte[] { 0, 0, 1 }),
+                           StringResultNode.ofUncheckedUtf8Array(new byte[] { 1 }),
+                           StringResultNode.ofUncheckedUtf8Array(new byte[] { 127 }),  // 0x7F
+                           StringResultNode.ofUncheckedUtf8Array(new byte[] { -128 }), // 0x80
+                           StringResultNode.ofUncheckedUtf8Array(new byte[] { -1 }),   // 0xFF
+                           StringResultNode.ofUncheckedUtf8Array(new byte[] { -1, 0 }));
+    }
+
+    // Ensure that the ordering of string nodes on the container matches that of the C++ backend.
+    // The backend directly orders on the memcmp order of UTF-8 strings, so the container must do the same.
+    // In particular, this means that UTF-16 java.lang.String sorting is a no-go.
+    @Test
+    public void string_result_node_comparison_is_utf8_byte_ordered() {
+        // UTF-8 representation of "fried shrimp" emoji U+1F364 which requires a UTF-16 surrogate pair (2 chars)
+        // Its UTF-16 representation is D83C DF64
+        var surrogateUtf8 = new byte[] { -16, -97, -115, -92 }; // F0 9F 8D A4
+        // UTF-8 representation of U+F0BA, which exists in the Unicode private usage range. Only requires 1 UTF-16 char.
+        // Its UTF-16 representation is F0BA
+        var nonSurrogateUtf8 = new byte[] { -17, -126, -70 }; // EF 82 BA
+
+        var s1 = new StringResultNode(Utf8.toString(surrogateUtf8));
+        var s2 = new StringResultNode(Utf8.toString(nonSurrogateUtf8));
+        //   UTF-8:  s2 < s1, since 0xEF < 0xF0
+        //   UTF-16: s1 < s2, since 0xD83C < 0xF0BA
+        // Assert that ordering is defined by UTF-8, not (surrogated) UTF-16.
+        assertTrue(s2.compareTo(s1) < 0);
+        assertTrue(s1.compareTo(s2) > 0);
     }
 
     @Test
diff --git a/vespajlib/abi-spec.json b/vespajlib/abi-spec.json
index c426195bc37..5eeee267cf6 100644
--- a/vespajlib/abi-spec.json
+++ b/vespajlib/abi-spec.json
@@ -3428,6 +3428,7 @@
       "protected static com.yahoo.vespa.objects.Identifiable deserializeOptional(com.yahoo.vespa.objects.Deserializer)",
       "protected static boolean equals(java.lang.Object, java.lang.Object)",
       "public void visitMembers(com.yahoo.vespa.objects.ObjectVisitor)",
+      "protected static byte[] getRawUtf8Bytes(com.yahoo.vespa.objects.Deserializer)",
       "protected java.lang.String getUtf8(com.yahoo.vespa.objects.Deserializer)",
       "protected void putUtf8(com.yahoo.vespa.objects.Serializer, java.lang.String)",
       "public bridge synthetic java.lang.Object clone()"
diff --git a/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java b/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java
index 8c11a0cbda1..947b312ac3b 100644
--- a/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java
+++ b/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java
@@ -354,10 +354,13 @@ public class Identifiable extends Selectable implements Cloneable {
         }
     }
 
-    protected String getUtf8(Deserializer buf) {
+    protected static byte[] getRawUtf8Bytes(Deserializer buf) {
         int len = buf.getInt(null);
-        byte[] arr = buf.getBytes(null, len);
-        return Utf8.toString(arr);
+        return buf.getBytes(null, len);
+    }
+
+    protected String getUtf8(Deserializer buf) {
+        return Utf8.toString(getRawUtf8Bytes(buf));
     }
 
     protected void putUtf8(Serializer buf, String val) {
-- 
cgit v1.2.3