summaryrefslogtreecommitdiffstats
path: root/vespajlib
diff options
context:
space:
mode:
authorMartin Polden <mpolden@mpolden.no>2020-10-05 11:09:33 +0200
committerMartin Polden <mpolden@mpolden.no>2020-10-05 11:10:03 +0200
commit17246bdb35ff824c4fa424134e4c5cd7732f0b14 (patch)
tree26d920b3641159b8a86cba78465e287a592af434 /vespajlib
parentea9dc5a16e3caed2c395c522dbb9a2a94006ce8f (diff)
Remove custom Utf8.toBytes implementation
`String` optimizations have caught up.
Diffstat (limited to 'vespajlib')
-rw-r--r--vespajlib/src/main/java/com/yahoo/text/Utf8.java28
-rw-r--r--vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java51
2 files changed, 46 insertions, 33 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/text/Utf8.java b/vespajlib/src/main/java/com/yahoo/text/Utf8.java
index a8a0adf1a7d..b81f0447d04 100644
--- a/vespajlib/src/main/java/com/yahoo/text/Utf8.java
+++ b/vespajlib/src/main/java/com/yahoo/text/Utf8.java
@@ -106,45 +106,27 @@ public final class Utf8 {
}
/**
- * Will try an optimistic approach to utf8 encoding.
- * That is 4.6x faster that the brute encode for ascii, not accounting for reduced memory footprint and GC.
+ * Encode a string as UTF-8.
*
* @param string The string to encode.
* @return Utf8 encoded array
*/
public static byte[] toBytes(String string) {
- byte [] utf8 = toBytesAscii(string);
- return utf8 != null ? utf8 : string.getBytes(StandardCharsets.UTF_8);
+ // This is just a wrapper for String::getBytes. Pre-Java 9 this had a more efficient approach for ASCII-only strings.
+ return string.getBytes(StandardCharsets.UTF_8);
}
/**
* Decode a UTF-8 string.
*
- * @param utf8 The string to encode.
+ * @param utf8 The bytes to decode.
* @return Utf8 encoded array
*/
public static String toString(byte [] utf8) {
- // This is just wrapper for String::new now. Pre-Java 9 this had an more efficient approach for ASCII strings.
+ // This is just a wrapper for String::new. Pre-Java 9 this had a more efficient approach for ASCII-only strings.
return new String(utf8, StandardCharsets.UTF_8);
}
/**
- * If String is purely ascii 7bit it will encode it as a byte array.
- * @param str The string to encode
- * @return Utf8 encoded array
- */
- private static byte[] toBytesAscii(final CharSequence str) {
- byte [] utf8 = new byte[str.length()];
- for (int i=0; i < utf8.length; i++) {
- char c = str.charAt(i);
- if ((c < 0) || (c >= 0x80)) {
- return null;
- }
- utf8[i] = (byte)c;
- }
- return utf8;
- }
-
- /**
* Utility method as toBytes(String).
*
* @param str
diff --git a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
index 2ffedee6a17..97d0717dc0b 100644
--- a/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
+++ b/vespajlib/src/test/java/com/yahoo/text/Utf8TestCase.java
@@ -557,32 +557,63 @@ public class Utf8TestCase {
@Test
@Ignore
public void benchmarkDecoding() {
- String ascii = "This is just sort of random mix.";
- String unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8";
+ byte[] ascii = "This is just sort of random mix.".getBytes();
+ byte[] unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8".getBytes(StandardCharsets.UTF_8);
int iterations = 100_000; // Use 100_000+ for benchmarking
- ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> {
- long time1 = benchmarkDecoding(Utf8::toString, s, iterations);
+ ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, b) -> {
+ long time1 = benchmark(() -> decode(Utf8::toString, b, iterations));
System.out.printf("Utf8::toString of %s string took %d ms\n", type, time1);
- long time2 = benchmarkDecoding((b) -> new String(b, StandardCharsets.UTF_8), s, iterations);
+ long time2 = benchmark(() -> decode((b1) -> new String(b1, StandardCharsets.UTF_8), b, iterations));
System.out.printf("String::new of %s string took %d ms\n", type, time2);
double change = ((double) time2 / (double) time1) - 1;
System.out.printf("Change = %.02f%%\n", change * 100);
});
}
- private String decode(Function<byte[], String> stringFunction, String s, int iterations) {
+ @Test
+ @Ignore
+ public void benchmarkEncoding() {
+ String ascii = "This is just sort of random mix.";
+ String unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8";
+ int iterations = 1_000_000; // Use 1_000_000+ for benchmarking
+
+ ImmutableMap.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> {
+ long time1 = benchmark(() -> encode(Utf8::toBytes, s, iterations));
+ System.out.printf("Utf8::toBytes of %s string took %d ms\n", type, time1);
+ long time2 = benchmark(() -> encode((s1) -> s1.getBytes(StandardCharsets.UTF_8), s, iterations));
+ System.out.printf("String::getBytes of %s string took %d ms\n", type, time2);
+ double change = ((double) time2 / (double) time1) - 1;
+ System.out.printf("Change = %.02f%%\n", change * 100);
+ });
+ }
+
+
+ private byte[] encode(Function<String, byte[]> encoder, String s, int iterations) {
+ byte[] res = null;
+ for (int i = 0; i < iterations; i++) {
+ res = encoder.apply(s + i); // Append counter to avoid String cache
+ }
+ return res;
+ }
+
+ private String decode(Function<byte[], String> decoder, byte[] b, int iterations) {
String res = null;
for (int i = 0; i < iterations; i++) {
- res = stringFunction.apply((s + i).getBytes());
+ // Append counter to avoid String cache
+ byte[] counter = String.valueOf(i).getBytes();
+ byte[] result = new byte[b.length + counter.length];
+ System.arraycopy(b, 0, result, 0, b.length);
+ System.arraycopy(counter, 0, result, b.length, counter.length);
+ res = decoder.apply(result);
}
return res;
}
- private long benchmarkDecoding(Function<byte[], String> stringFunction, String s, int iterations) {
- decode(stringFunction, s, iterations); // Warmup
+ private long benchmark(Runnable r) {
+ r.run(); // Warmup
long start = System.currentTimeMillis();
- decode(stringFunction, s, iterations);
+ r.run();
long end = System.currentTimeMillis();
return end - start;
}