diff options
author | Martin Polden <mpolden@mpolden.no> | 2020-10-05 11:09:33 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2020-10-05 11:10:03 +0200 |
commit | 17246bdb35ff824c4fa424134e4c5cd7732f0b14 (patch) | |
tree | 26d920b3641159b8a86cba78465e287a592af434 /vespajlib | |
parent | ea9dc5a16e3caed2c395c522dbb9a2a94006ce8f (diff) |
Remove custom Utf8.toBytes implementation
`String` optimizations have caught up.
Diffstat (limited to 'vespajlib')
-rw-r--r-- | vespajlib/src/main/java/com/yahoo/text/Utf8.java | 28 |
-rw-r--r-- | vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java | 51 |
2 files changed, 46 insertions, 33 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/text/Utf8.java b/vespajlib/src/main/java/com/yahoo/text/Utf8.java index a8a0adf1a7d..b81f0447d04 100644 --- a/vespajlib/src/main/java/com/yahoo/text/Utf8.java +++ b/vespajlib/src/main/java/com/yahoo/text/Utf8.java @@ -106,45 +106,27 @@ public final class Utf8 { } /** - * Will try an optimistic approach to utf8 encoding. - * That is 4.6x faster that the brute encode for ascii, not accounting for reduced memory footprint and GC. + * Encode a UTF-8 string. * * @param string The string to encode. * @return Utf8 encoded array */ public static byte[] toBytes(String string) { - byte [] utf8 = toBytesAscii(string); - return utf8 != null ? utf8 : string.getBytes(StandardCharsets.UTF_8); + // This is just wrapper for String::getBytes. Pre-Java 9 this had an more efficient approach for ASCII-only strings. + return string.getBytes(StandardCharsets.UTF_8); } /** * Decode a UTF-8 string. * - * @param utf8 The string to encode. + * @param utf8 The bytes to decode. * @return Utf8 encoded array */ public static String toString(byte [] utf8) { - // This is just wrapper for String::new now. Pre-Java 9 this had an more efficient approach for ASCII strings. + // This is just wrapper for String::new. Pre-Java 9 this had an more efficient approach for ASCII-onlu strings. return new String(utf8, StandardCharsets.UTF_8); } /** - * If String is purely ascii 7bit it will encode it as a byte array. - * @param str The string to encode - * @return Utf8 encoded array - */ - private static byte[] toBytesAscii(final CharSequence str) { - byte [] utf8 = new byte[str.length()]; - for (int i=0; i < utf8.length; i++) { - char c = str.charAt(i); - if ((c < 0) || (c >= 0x80)) { - return null; - } - utf8[i] = (byte)c; - } - return utf8; - } - - /** * Utility method as toBytes(String). 
* * @param str diff --git a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java index 2ffedee6a17..97d0717dc0b 100644 --- a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java +++ b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java @@ -557,32 +557,63 @@ public class Utf8TestCase { @Test @Ignore public void benchmarkDecoding() { - String ascii = "This is just sort of random mix."; - String unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8"; + byte[] ascii = "This is just sort of random mix.".getBytes(); + byte[] unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8".getBytes(StandardCharsets.UTF_8); int iterations = 100_000; // Use 100_000+ for benchmarking - ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> { - long time1 = benchmarkDecoding(Utf8::toString, s, iterations); + ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, b) -> { + long time1 = benchmark(() -> decode(Utf8::toString, b, iterations)); System.out.printf("Utf8::toString of %s string took %d ms\n", type, time1); - long time2 = benchmarkDecoding((b) -> new String(b, StandardCharsets.UTF_8), s, iterations); + long time2 = benchmark(() -> decode((b1) -> new String(b1, StandardCharsets.UTF_8), b, iterations)); System.out.printf("String::new of %s string took %d ms\n", type, time2); double change = ((double) time2 / (double) time1) - 1; System.out.printf("Change = %.02f%%\n", change * 100); }); } - private String decode(Function<byte[], String> stringFunction, String s, int iterations) { + @Test + @Ignore + public void benchmarkEncoding() { + String ascii = "This is just sort of random mix."; + String unicode = "This is just sort of random mix. 
\u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8"; + int iterations = 1_000_000; // Use 1_000_000+ for benchmarking + + ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> { + long time1 = benchmark(() -> encode(Utf8::toBytes, s, iterations)); + System.out.printf("Utf8::toBytes of %s string took %d ms\n", type, time1); + long time2 = benchmark(() -> encode((s1) -> s1.getBytes(StandardCharsets.UTF_8), s, iterations)); + System.out.printf("String::getBytes of %s string took %d ms\n", type, time2); + double change = ((double) time2 / (double) time1) - 1; + System.out.printf("Change = %.02f%%\n", change * 100); + }); + } + + + private byte[] encode(Function<String, byte[]> encoder, String s, int iterations) { + byte[] res = null; + for (int i = 0; i < iterations; i++) { + res = encoder.apply(s + i); // Append counter to avoid String cache + } + return res; + } + + private String decode(Function<byte[], String> decoder, byte[] b, int iterations) { String res = null; for (int i = 0; i < iterations; i++) { - res = stringFunction.apply((s + i).getBytes()); + // Append counter to avoid String cache + byte[] counter = String.valueOf(i).getBytes(); + byte[] result = new byte[b.length + counter.length]; + System.arraycopy(b, 0, result, 0, b.length); + System.arraycopy(counter, 0, result, b.length, counter.length); + res = decoder.apply(result); } return res; } - private long benchmarkDecoding(Function<byte[], String> stringFunction, String s, int iterations) { - decode(stringFunction, s, iterations); // Warmup + private long benchmark(Runnable r) { + r.run(); // Warmup long start = System.currentTimeMillis(); - decode(stringFunction, s, iterations); + r.run(); long end = System.currentTimeMillis(); return end - start; } |