From 58c464422049066d6908b9bcba1ebdf20e9c76a2 Mon Sep 17 00:00:00 2001 From: Martin Polden Date: Fri, 2 Oct 2020 14:34:44 +0200 Subject: Remove custom Utf8.toString implementation `String::new` is now faster for both ASCII and Unicode strings: ``` Utf8::toString of ascii string took 132 ms String::new of ascii string took 59 ms Change = -55.30% Utf8::toString of unicode string took 410 ms String::new of unicode string took 280 ms Change = -31.71% ``` There are at least two reasons for this: * Java 9 introduced compact strings, which means that `String` is now backed by a byte array to reduce the memory footprint of ASCII strings. * Detection of Unicode strings may use HotSpot intrinsics. --- vespajlib/src/main/java/com/yahoo/text/Utf8.java | 25 +++------------ .../src/test/java/com/yahoo/text/Utf8TestCase.java | 36 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 21 deletions(-) (limited to 'vespajlib') diff --git a/vespajlib/src/main/java/com/yahoo/text/Utf8.java b/vespajlib/src/main/java/com/yahoo/text/Utf8.java index cb8ca244fe2..a8a0adf1a7d 100644 --- a/vespajlib/src/main/java/com/yahoo/text/Utf8.java +++ b/vespajlib/src/main/java/com/yahoo/text/Utf8.java @@ -48,8 +48,7 @@ public final class Utf8 { * @return String decoded from UTF-8 */ public static String toString(byte[] data, int offset, int length) { - String s = toStringAscii(data, offset, length); - return s != null ? s : toString(ByteBuffer.wrap(data, offset, length)); + return toString(ByteBuffer.wrap(data, offset, length)); } /** @@ -118,14 +117,14 @@ public final class Utf8 { return utf8 != null ? utf8 : string.getBytes(StandardCharsets.UTF_8); } /** - * Will try an optimistic approach to utf8 decoding. + * Decode a UTF-8 string. * * @param utf8 The string to encode. * @return Utf8 encoded array */ public static String toString(byte [] utf8) { - String s = toStringAscii(utf8, 0, utf8.length); - return s != null ? 
s : new String(utf8, StandardCharsets.UTF_8); + // This is just a wrapper for String::new now. Pre-Java 9 this had a more efficient approach for ASCII strings. + return new String(utf8, StandardCharsets.UTF_8); } /** @@ -145,22 +144,6 @@ public final class Utf8 { return utf8; } - private static String toStringAscii(byte [] b, int offset, int length) { - if (length > 0) { - char [] s = new char[length]; - for (int i=0; i < length; i++) { - if (b[offset + i] >= 0) { - s[i] = (char)b[offset+i]; - } else { - return null; - } - } - return new String(s); - } else { - return ""; - } - } - /** * Utility method as toBytes(String). * diff --git a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java index 79437af30b9..2ffedee6a17 100644 --- a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java +++ b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.text; +import com.google.common.collect.ImmutableMap; import org.junit.Ignore; import org.junit.Test; @@ -8,7 +9,9 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; +import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.function.Function; import static com.yahoo.text.Lowercase.toLowerCase; import static com.yahoo.text.Utf8.calculateBytePositions; @@ -551,4 +554,37 @@ public class Utf8TestCase { assertArrayEquals(stringAsUtf8, handEncoded); } + @Test + @Ignore + public void benchmarkDecoding() { + String ascii = "This is just sort of random mix."; + String unicode = "This is just sort of random mix. 
\u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8"; + int iterations = 100_000; // Use 100_000+ for benchmarking + + ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> { + long time1 = benchmarkDecoding(Utf8::toString, s, iterations); + System.out.printf("Utf8::toString of %s string took %d ms\n", type, time1); + long time2 = benchmarkDecoding((b) -> new String(b, StandardCharsets.UTF_8), s, iterations); + System.out.printf("String::new of %s string took %d ms\n", type, time2); + double change = ((double) time2 / (double) time1) - 1; + System.out.printf("Change = %.02f%%\n", change * 100); + }); + } + + private String decode(Function<byte[], String> stringFunction, String s, int iterations) { + String res = null; + for (int i = 0; i < iterations; i++) { + res = stringFunction.apply((s + i).getBytes()); + } + return res; + } + + private long benchmarkDecoding(Function<byte[], String> stringFunction, String s, int iterations) { + decode(stringFunction, s, iterations); // Warmup + long start = System.currentTimeMillis(); + decode(stringFunction, s, iterations); + long end = System.currentTimeMillis(); + return end - start; + } + } -- cgit v1.2.3