aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2021-11-01 16:11:31 +0100
committerGitHub <noreply@github.com>2021-11-01 16:11:31 +0100
commit17fbfda757924a90ea71c9260ea22eab1a7c3949 (patch)
tree313864a00656f1aed57456dbbc46be9688d4ef0f
parentf2d3685aff36e286a4aafdb90a019bbad36ddc0d (diff)
parentbac8ab58a18d25db1871d3e933cb0cc018be5439 (diff)
Merge pull request #19813 from vespa-engine/vekterli/use-utf8-ordering-for-string-result-nodes
Use UTF-8 bytewise ordering for Java StringResultNode comparisons
-rw-r--r--searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java85
-rw-r--r--searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java88
-rw-r--r--vespajlib/abi-spec.json1
-rw-r--r--vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java9
4 files changed, 156 insertions, 27 deletions
diff --git a/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java b/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java
index 6a84c7ff950..b90a5aadbb6 100644
--- a/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java
+++ b/searchlib/src/main/java/com/yahoo/searchlib/expression/StringResultNode.java
@@ -6,6 +6,8 @@ import com.yahoo.vespa.objects.Deserializer;
import com.yahoo.vespa.objects.ObjectVisitor;
import com.yahoo.vespa.objects.Serializer;
+import java.util.Arrays;
+
/**
* This result holds a string.
*
@@ -16,18 +18,20 @@ public class StringResultNode extends SingleResultNode {
// The global class identifier shared with C++.
public static final int classId = registerClass(0x4000 + 53, StringResultNode.class);
- private static StringResultNode negativeInfinity = new StringResultNode("");
- private static PositiveInfinityResultNode positiveInfinity = new PositiveInfinityResultNode();
+ private static final StringResultNode negativeInfinity = new StringResultNode("");
+ private static final PositiveInfinityResultNode positiveInfinity = new PositiveInfinityResultNode();
+
+ private static final byte[] EMPTY_UTF8_ARRAY = new byte[0];
- // The string value of this node.
- private String value;
+ // The string value of this node, in raw UTF-8 octets.
+ private byte[] utf8Value;
/**
* Constructs an empty result node. <b>NOTE:</b> This instance is broken until non-optional member data is set.
*/
public StringResultNode() {
super();
- value = "";
+ utf8Value = EMPTY_UTF8_ARRAY;
}
/**
@@ -40,6 +44,19 @@ public class StringResultNode extends SingleResultNode {
setValue(value);
}
+ private StringResultNode(byte[] rawUtf8Value) {
+ super();
+ utf8Value = rawUtf8Value;
+ }
+
+ /**
+ * Creates a new StringResultNode backed by an underlying byte array. The input is
+ * presumed to be in valid UTF-8 format, but is _not_ checked for validity.
+ */
+ protected static StringResultNode ofUncheckedUtf8Array(byte[] rawUtf8Value) {
+ return new StringResultNode(rawUtf8Value);
+ }
+
/**
* Sets the value of this result.
*
@@ -50,7 +67,7 @@ public class StringResultNode extends SingleResultNode {
if (value == null) {
throw new IllegalArgumentException("Value can not be null.");
}
- this.value = value;
+ this.utf8Value = Utf8.toBytes(value);
return this;
}
@@ -68,13 +85,14 @@ public class StringResultNode extends SingleResultNode {
@Override
protected void onDeserialize(Deserializer buf) {
- value = getUtf8(buf);
+ // We expect the UTF-8 we get from the backend to be pre-checked and valid.
+ utf8Value = getRawUtf8Bytes(buf);
}
@Override
public long getInteger() {
try {
- return Integer.valueOf(value);
+ return Integer.valueOf(getString());
} catch (java.lang.NumberFormatException e) {
return 0;
}
@@ -83,7 +101,7 @@ public class StringResultNode extends SingleResultNode {
@Override
public double getFloat() {
try {
- return Double.valueOf(value);
+ return Double.valueOf(getString());
} catch (java.lang.NumberFormatException e) {
return 0;
}
@@ -91,50 +109,53 @@ public class StringResultNode extends SingleResultNode {
@Override
public String getString() {
- return value;
+ return Utf8.toString(utf8Value);
}
@Override
public byte[] getRaw() {
- return Utf8.toBytes(value);
+ return utf8Value;
}
@Override
protected int onCmp(ResultNode rhs) {
return (rhs instanceof PositiveInfinityResultNode)
? -1
- : value.compareTo(rhs.getString());
+ : internalNonPositiveInfinityCompareTo(rhs);
}
@Override
public int hashCode() {
- return super.hashCode() + value.hashCode();
+ return super.hashCode() + Arrays.hashCode(utf8Value);
}
@Override
public void visitMembers(ObjectVisitor visitor) {
super.visitMembers(visitor);
- visitor.visit("value", value);
+ visitor.visit("value", getString());
}
+ @Override
public void add(ResultNode rhs) {
- value += rhs.getString();
+ setValue(getString() + rhs.getString());
}
+ @Override
public void min(ResultNode rhs) {
- if (value.compareTo(rhs.getString()) > 0) {
- value = rhs.getString();
+ if (internalNonPositiveInfinityCompareTo(rhs) > 0) {
+ set(rhs);
}
}
+ @Override
public void max(ResultNode rhs) {
- if (value.compareTo(rhs.getString()) < 0) {
- value = rhs.getString();
+ if (internalNonPositiveInfinityCompareTo(rhs) < 0) {
+ set(rhs);
}
}
public void append(ResultNode rhs) {
- value += rhs.getString();
+ setValue(getString() + rhs.getString());
}
@Override
@@ -144,16 +165,34 @@ public class StringResultNode extends SingleResultNode {
@Override
public void set(ResultNode rhs) {
- value = rhs.getString();
+ if (rhs instanceof StringResultNode) {
+ utf8Value = ((StringResultNode) rhs).utf8Value;
+ } else {
+ setValue(rhs.getString());
+ }
}
@Override
public void negate() {
- char a[] = value.toCharArray();
+ char[] a = getString().toCharArray();
for (int i = 0; i < a.length; i++) {
a[i] = (char)-a[i];
}
- value = new String(a);
+ setValue(new String(a));
+ }
+
+ private int internalNonPositiveInfinityCompareTo(ResultNode rhs) {
+ // Note: this may not necessarily be well-defined _semantically_ unless rhs is
+ // also a StringResultNode. The C++ implementation explicitly expects rhs to be
+ // such an instance, but this depends on a classId check that is _not_ done in
+ // the Java implementation...
+ // We use getString() instead of getRaw() to support implicit stringification
+ // (legacy Java implementation behavior), but it's not given that this is always
+ // the desired outcome.
+ var rhsAsUtf8 = (rhs instanceof StringResultNode)
+ ? ((StringResultNode)rhs).utf8Value
+ : Utf8.toBytes(rhs.getString());
+ return Arrays.compareUnsigned(utf8Value, rhsAsUtf8);
}
/**
diff --git a/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java b/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java
index c5adfed4974..6e034cce652 100644
--- a/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java
+++ b/searchlib/src/test/java/com/yahoo/searchlib/expression/ExpressionTestCase.java
@@ -705,6 +705,21 @@ public class ExpressionTestCase {
assertEquals(0, large.compareTo(large));
}
+ private void compareAllPairwise(ResultNode... orderedNodes) {
+ for (int i = 0; i < orderedNodes.length; ++i) {
+ for (int j = 0; j < orderedNodes.length; ++j) {
+ var ctx = String.format("lhs(i=%d): %s, rhs(j=%d): %s", i, orderedNodes[i], j, orderedNodes[j]);
+ if (j < i) {
+ assertTrue(ctx, orderedNodes[i].compareTo(orderedNodes[j]) > 0);
+ } else if (j > i) {
+ assertTrue(ctx, orderedNodes[i].compareTo(orderedNodes[j]) < 0);
+ } else { // j == i
+ assertEquals(ctx, orderedNodes[i].compareTo(orderedNodes[j]), 0);
+ }
+ }
+ }
+ }
+
@Test
public void testStringResultNode() {
try {
@@ -719,16 +734,87 @@ public class ExpressionTestCase {
} catch (IllegalArgumentException e) {
// expected
}
- StringResultNode a = new StringResultNode("7.3");
+ StringResultNode a = new StringResultNode("42");
+ assertEquals(a.getInteger(), 42);
+ assertEquals(a.getFloat(), 42, delta);
+ assertEquals(a.getString(), "42");
+
+ a = new StringResultNode("7.3");
assertEquals(a.getInteger(), 0);
assertEquals(a.getFloat(), 7.3, delta);
assertEquals(a.getString(), "7.3");
+
byte[] raw = a.getRaw();
assertEquals(raw.length, 3);
+ assertEquals(Arrays.compareUnsigned(raw, Utf8.toBytes("7.3")), 0);
+
assertResultNode(a);
compare(new StringResultNode(), new StringResultNode("z"), new StringResultNode("zz"));
compare(new StringResultNode("z"), new StringResultNode("zz"), new StringResultNode("zzz"));
compare(new StringResultNode("a"), new StringResultNode("zz"), new PositiveInfinityResultNode());
+ assertEquals(new StringResultNode("zz").compareTo(new StringResultNode("zz")), 0);
+
+ // Remove this assertion if we remove support for implicit stringification
+ compare(new StringResultNode(), new StringResultNode("12"), new IntegerResultNode(13));
+
+ var node = new StringResultNode("foo");
+ node.append(new StringResultNode("bar"));
+ assertEquals(node.getString(), "foobar");
+ node.add(new StringResultNode("baz"));
+ assertEquals(node.getString(), "foobarbaz");
+
+ node.set(new StringResultNode("ABC"));
+ assertEquals(node.getString(), "ABC");
+
+ node.max(new StringResultNode("ABA"));
+ assertEquals(node.getString(), "ABC");
+
+ node.max(new StringResultNode("ABD"));
+ assertEquals(node.getString(), "ABD");
+
+ node.min(new StringResultNode("ABE"));
+ assertEquals(node.getString(), "ABD");
+
+ node.min(new StringResultNode("ABC"));
+ assertEquals(node.getString(), "ABC");
+
+ node.set(new IntegerResultNode(1234));
+ assertEquals(node.getString(), "1234");
+ }
+
+ @Test
+ public void string_result_node_comparison_is_memcmp_unsigned_byte_ordered() {
+ // Comparison is first by shared prefix (if equal), then ties are resolved by checking length.
+ // This is not actually valid UTF-8, so we depend on unchecked-ness...
+ compareAllPairwise(StringResultNode.ofUncheckedUtf8Array(new byte[] { 0 }),
+ StringResultNode.ofUncheckedUtf8Array(new byte[] { 0, 0 }),
+ StringResultNode.ofUncheckedUtf8Array(new byte[] { 0, 0, 1 }),
+ StringResultNode.ofUncheckedUtf8Array(new byte[] { 1 }),
+ StringResultNode.ofUncheckedUtf8Array(new byte[] { 127 }), // 0x7F
+ StringResultNode.ofUncheckedUtf8Array(new byte[] { -128 }), // 0x80
+ StringResultNode.ofUncheckedUtf8Array(new byte[] { -1 }), // 0xFF
+ StringResultNode.ofUncheckedUtf8Array(new byte[] { -1, 0 }));
+ }
+
+ // Ensure that the ordering of string nodes on the container matches that of the C++ backend.
+ // The backend directly orders on the memcmp order of UTF-8 strings, so the container must do the same.
+ // In particular, this means that UTF-16 java.lang.String sorting is a no-go.
+ @Test
+ public void string_result_node_comparison_is_utf8_byte_ordered() {
+ // UTF-8 representation of "fried shrimp" emoji U+1F364 which requires a UTF-16 surrogate pair (2 chars)
+ // Its UTF-16 representation is D83C DF64
+ var surrogateUtf8 = new byte[] { -16, -97, -115, -92 }; // F0 9F 8D A4
+ // UTF-8 representation of U+F0BA, which exists in the Unicode private usage range. Only requires 1 UTF-16 char.
+ // Its UTF-16 representation is F0BA
+ var nonSurrogateUtf8 = new byte[] { -17, -126, -70 }; // EF 82 BA
+
+ var s1 = new StringResultNode(Utf8.toString(surrogateUtf8));
+ var s2 = new StringResultNode(Utf8.toString(nonSurrogateUtf8));
+ // UTF-8: s2 < s1, since 0xEF < 0xF0
+ // UTF-16: s1 < s2, since 0xD83C < 0xF0BA
+ // Assert that ordering is defined by UTF-8, not (surrogated) UTF-16.
+ assertTrue(s2.compareTo(s1) < 0);
+ assertTrue(s1.compareTo(s2) > 0);
}
@Test
diff --git a/vespajlib/abi-spec.json b/vespajlib/abi-spec.json
index c426195bc37..5eeee267cf6 100644
--- a/vespajlib/abi-spec.json
+++ b/vespajlib/abi-spec.json
@@ -3428,6 +3428,7 @@
"protected static com.yahoo.vespa.objects.Identifiable deserializeOptional(com.yahoo.vespa.objects.Deserializer)",
"protected static boolean equals(java.lang.Object, java.lang.Object)",
"public void visitMembers(com.yahoo.vespa.objects.ObjectVisitor)",
+ "protected static byte[] getRawUtf8Bytes(com.yahoo.vespa.objects.Deserializer)",
"protected java.lang.String getUtf8(com.yahoo.vespa.objects.Deserializer)",
"protected void putUtf8(com.yahoo.vespa.objects.Serializer, java.lang.String)",
"public bridge synthetic java.lang.Object clone()"
diff --git a/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java b/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java
index 8c11a0cbda1..947b312ac3b 100644
--- a/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java
+++ b/vespajlib/src/main/java/com/yahoo/vespa/objects/Identifiable.java
@@ -354,10 +354,13 @@ public class Identifiable extends Selectable implements Cloneable {
}
}
- protected String getUtf8(Deserializer buf) {
+ protected static byte[] getRawUtf8Bytes(Deserializer buf) {
int len = buf.getInt(null);
- byte[] arr = buf.getBytes(null, len);
- return Utf8.toString(arr);
+ return buf.getBytes(null, len);
+ }
+
+ protected String getUtf8(Deserializer buf) {
+ return Utf8.toString(getRawUtf8Bytes(buf));
}
protected void putUtf8(Serializer buf, String val) {