Remove custom Utf8.toBytes implementation

`String` optimizations have caught up.
author: Martin Polden <mpolden@mpolden.no> 2020-10-05 11:09:33 +0200
committer: Martin Polden <mpolden@mpolden.no> 2020-10-05 11:10:03 +0200
commit: 17246bdb35ff824c4fa424134e4c5cd7732f0b14 (patch)
tree: 26d920b3641159b8a86cba78465e287a592af434 /vespajlib
parent: ea9dc5a16e3caed2c395c522dbb9a2a94006ce8f (diff)
2 files changed, 46 insertions, 33 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/text/Utf8.java b/vespajlib/src/main/java/com/yahoo/text/Utf8.java
index a8a0adf1a7d..b81f0447d04 100644
--- a/vespajlib/src/main/java/com/yahoo/text/Utf8.java
+++ b/vespajlib/src/main/java/com/yahoo/text/Utf8.java
@@ -106,45 +106,27 @@ public final class Utf8 {
     }
 
     /**
-     * Will try an optimistic approach to utf8 encoding.
-     * That is 4.6x faster that the brute encode for ascii, not accounting for reduced memory footprint and GC.
+     * Encode a string as UTF-8.
      *
      * @param string The string to encode.
      * @return Utf8 encoded array
      */
     public static byte[] toBytes(String string) {
-        byte [] utf8 = toBytesAscii(string);
-        return utf8 != null ? utf8 : string.getBytes(StandardCharsets.UTF_8);
+        // This is just a wrapper for String::getBytes. Pre-Java 9 this had a more efficient approach for ASCII-only strings.
+        return string.getBytes(StandardCharsets.UTF_8);
     }
     /**
      * Decode a UTF-8 string.
      *
-     * @param utf8 The string to encode.
+     * @param utf8 The bytes to decode.
      * @return Utf8 encoded array
      */
     public static String toString(byte [] utf8) {
-        // This is just wrapper for String::new now. Pre-Java 9 this had an more efficient approach for ASCII strings.
+        // This is just a wrapper for String::new. Pre-Java 9 this had a more efficient approach for ASCII-only strings.
         return new String(utf8, StandardCharsets.UTF_8);
     }
 
     /**
-     * If String is purely ascii 7bit it will encode it as a byte array.
-     * @param str The string to encode
-     * @return Utf8 encoded array
-     */
-    private static byte[] toBytesAscii(final CharSequence str) {
-        byte [] utf8 = new byte[str.length()];
-        for (int i=0; i < utf8.length; i++) {
-            char c = str.charAt(i);
-            if ((c < 0) || (c >= 0x80)) {
-                return null;
-            }
-            utf8[i] = (byte)c;
-        }
-        return utf8;
-    }
-
-    /**
      * Utility method as toBytes(String).
      *
      * @param str
diff --git a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
index 2ffedee6a17..97d0717dc0b 100644
--- a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
+++ b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
@@ -557,32 +557,63 @@ public class Utf8TestCase {
     @Test
     @Ignore
     public void benchmarkDecoding() {
-        String ascii = "This is just sort of random mix.";
-        String unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8";
+        byte[] ascii = "This is just sort of random mix.".getBytes();
+        byte[] unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8".getBytes(StandardCharsets.UTF_8);
         int iterations = 100_000; // Use 100_000+ for benchmarking
 
-        ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> {
-            long time1 = benchmarkDecoding(Utf8::toString, s, iterations);
+        ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, b) -> {
+            long time1 = benchmark(() -> decode(Utf8::toString, b, iterations));
             System.out.printf("Utf8::toString of %s string took %d ms\n", type, time1);
-            long time2 = benchmarkDecoding((b) -> new String(b, StandardCharsets.UTF_8), s, iterations);
+            long time2 = benchmark(() -> decode((b1) -> new String(b1, StandardCharsets.UTF_8), b, iterations));
             System.out.printf("String::new of %s string took %d ms\n", type, time2);
             double change = ((double) time2 / (double) time1) - 1;
             System.out.printf("Change = %.02f%%\n", change * 100);
         });
     }
 
-    private String decode(Function<byte[], String> stringFunction, String s, int iterations) {
+    @Test
+    @Ignore
+    public void benchmarkEncoding() {
+        String ascii = "This is just sort of random mix.";
+        String unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8";
+        int iterations = 1_000_000; // Use 1_000_000+ for benchmarking
+
+        ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> {
+            long time1 = benchmark(() -> encode(Utf8::toBytes, s, iterations));
+            System.out.printf("Utf8::toBytes of %s string took %d ms\n", type, time1);
+            long time2 = benchmark(() -> encode((s1) -> s1.getBytes(StandardCharsets.UTF_8), s, iterations));
+            System.out.printf("String::getBytes of %s string took %d ms\n", type, time2);
+            double change = ((double) time2 / (double) time1) - 1;
+            System.out.printf("Change = %.02f%%\n", change * 100);
+        });
+    }
+
+
+    private byte[] encode(Function<String, byte[]> encoder, String s, int iterations) {
+        byte[] res = null;
+        for (int i = 0; i < iterations; i++) {
+            res = encoder.apply(s + i); // Append counter to avoid String cache
+        }
+        return res;
+    }
+
+    private String decode(Function<byte[], String> decoder, byte[] b, int iterations) {
         String res = null;
         for (int i = 0; i < iterations; i++) {
-            res = stringFunction.apply((s + i).getBytes());
+            // Append counter to avoid String cache
+            byte[] counter = String.valueOf(i).getBytes();
+            byte[] result = new byte[b.length + counter.length];
+            System.arraycopy(b, 0, result, 0, b.length);
+            System.arraycopy(counter, 0, result, b.length, counter.length);
+            res = decoder.apply(result);
         }
         return res;
     }
 
-    private long benchmarkDecoding(Function<byte[], String> stringFunction, String s, int iterations) {
-        decode(stringFunction, s, iterations); // Warmup
+    private long benchmark(Runnable r) {
+        r.run(); // Warmup
         long start = System.currentTimeMillis();
-        decode(stringFunction, s, iterations);
+        r.run();
         long end = System.currentTimeMillis();
         return end - start;
     }
author	Martin Polden <mpolden@mpolden.no>	2020-10-05 11:09:33 +0200
committer	Martin Polden <mpolden@mpolden.no>	2020-10-05 11:10:03 +0200
commit	17246bdb35ff824c4fa424134e4c5cd7732f0b14 (patch)
tree	26d920b3641159b8a86cba78465e287a592af434 /vespajlib
parent	ea9dc5a16e3caed2c395c522dbb9a2a94006ce8f (diff)