From 58c464422049066d6908b9bcba1ebdf20e9c76a2 Mon Sep 17 00:00:00 2001
From: Martin Polden <mpolden@mpolden.no>
Date: Fri, 2 Oct 2020 14:34:44 +0200
Subject: Remove custom Utf8.toString implementation

`String::new` is now faster for both ASCII and Unicode strings:

```
Utf8::toString of ascii string took 132 ms
String::new of ascii string took 59 ms
Change = -55.30%
Utf8::toString of unicode string took 410 ms
String::new of unicode string took 280 ms
Change = -31.71%
```

There are at least two reasons for this:

* Java 9 introduced compact strings, which means that `String` is now backed by
a byte array to reduce the memory footprint of ASCII strings.
* Detection of Unicode strings may use HotSpot intrinsics.
---
 vespajlib/src/main/java/com/yahoo/text/Utf8.java   | 25 +++------------
 .../src/test/java/com/yahoo/text/Utf8TestCase.java | 36 ++++++++++++++++++++++
 2 files changed, 40 insertions(+), 21 deletions(-)

(limited to 'vespajlib')

diff --git a/vespajlib/src/main/java/com/yahoo/text/Utf8.java b/vespajlib/src/main/java/com/yahoo/text/Utf8.java
index cb8ca244fe2..a8a0adf1a7d 100644
--- a/vespajlib/src/main/java/com/yahoo/text/Utf8.java
+++ b/vespajlib/src/main/java/com/yahoo/text/Utf8.java
@@ -48,8 +48,7 @@ public final class Utf8 {
      * @return String decoded from UTF-8
      */
     public static String toString(byte[] data, int offset, int length) {
-        String s = toStringAscii(data, offset, length);
-        return s != null ? s : toString(ByteBuffer.wrap(data, offset, length));
+        return toString(ByteBuffer.wrap(data, offset, length));
     }
 
     /**
@@ -118,14 +117,14 @@ public final class Utf8 {
         return utf8 != null ? utf8 : string.getBytes(StandardCharsets.UTF_8);
     }
     /**
-     * Will try an optimistic approach to utf8 decoding.
+     * Decode a UTF-8 string.
      *
      * @param utf8 The string to encode.
      * @return Utf8 encoded array
      */
     public static String toString(byte [] utf8) {
-        String s = toStringAscii(utf8, 0, utf8.length);
-        return s != null ? s : new String(utf8, StandardCharsets.UTF_8);
+        // This is just a wrapper for String::new now. Pre-Java 9 this had a more efficient approach for ASCII strings.
+        return new String(utf8, StandardCharsets.UTF_8);
     }
 
     /**
@@ -145,22 +144,6 @@ public final class Utf8 {
         return utf8;
     }
 
-    private static String toStringAscii(byte [] b, int offset, int length) {
-        if (length > 0) {
-            char [] s = new char[length];
-            for (int i=0; i < length; i++) {
-                if (b[offset + i] >= 0) {
-                    s[i] = (char)b[offset+i];
-                } else {
-                    return null;
-                }
-            }
-            return new String(s);
-        } else {
-            return "";
-        }
-    }
-
     /**
      * Utility method as toBytes(String).
      *
diff --git a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
index 79437af30b9..2ffedee6a17 100644
--- a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
+++ b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
@@ -1,6 +1,7 @@
 // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.text;
 
+import com.google.common.collect.ImmutableMap;
 import org.junit.Ignore;
 import org.junit.Test;
 
@@ -8,7 +9,9 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.charset.CharsetEncoder;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
+import java.util.function.Function;
 
 import static com.yahoo.text.Lowercase.toLowerCase;
 import static com.yahoo.text.Utf8.calculateBytePositions;
@@ -551,4 +554,37 @@ public class Utf8TestCase {
         assertArrayEquals(stringAsUtf8, handEncoded);
     }
 
+    @Test
+    @Ignore
+    public void benchmarkDecoding() {
+        String ascii = "This is just sort of random mix.";
+        String unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8";
+        int iterations = 100_000; // Use 100_000+ for benchmarking
+
+        ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> {
+            long time1 = benchmarkDecoding(Utf8::toString, s, iterations);
+            System.out.printf("Utf8::toString of %s string took %d ms\n", type, time1);
+            long time2 = benchmarkDecoding((b) -> new String(b, StandardCharsets.UTF_8), s, iterations);
+            System.out.printf("String::new of %s string took %d ms\n", type, time2);
+            double change = ((double) time2 / (double) time1) - 1;
+            System.out.printf("Change = %.02f%%\n", change * 100);
+        });
+    }
+
+    private String decode(Function<byte[], String> stringFunction, String s, int iterations) {
+        String res = null;
+        for (int i = 0; i < iterations; i++) {
+            res = stringFunction.apply((s + i).getBytes());
+        }
+        return res;
+    }
+
+    private long benchmarkDecoding(Function<byte[], String> stringFunction, String s, int iterations) {
+        decode(stringFunction, s, iterations); // Warmup
+        long start = System.currentTimeMillis();
+        decode(stringFunction, s, iterations);
+        long end = System.currentTimeMillis();
+        return end - start;
+    }
+
 }
-- 
cgit v1.2.3