// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.text;
import org.junit.Ignore;
import org.junit.Test;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Map;
import java.util.function.Function;
import static com.yahoo.text.Utf8.calculateBytePositions;
import static com.yahoo.text.Utf8.calculateStringPositions;
import static org.hamcrest.CoreMatchers.equalTo;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
/**
* @author Bjorn Borud
* @author Steinar Knutsen
*/
public class Utf8TestCase {
private static final String TEST_STRING = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8";
private static final int[] TEST_CODEPOINTS = {0x0, 0x7f, 0x80, 0x7ff, 0x800, 0xd7ff, 0xe000, 0xffff, 0x10000, 0x10ffff,
0x34, 0x355, 0x2567, 0xfff, 0xe987, 0x100abc
};
@Test
public void testSimple() {
String s1 = "test";
String s2 = "f\u00F8rst";
String s3 = "\u00C5pen";
byte[] b4 = { (byte) 0xE5, (byte) 0xA4, (byte) 0x89, (byte) 0xE6,
(byte) 0x85, (byte) 0x8B };
byte[] b1 = Utf8.toBytes(s1);
byte[] b2 = Utf8.toBytes(s2);
byte[] b3 = Utf8.toBytes(s3);
String s4 = Utf8.toString(b4);
assertEquals('t', b1[0]);
assertEquals('e', b1[1]);
assertEquals('s', b1[2]);
assertEquals('t', b1[3]);
assertEquals('f', b2[0]);
assertEquals((byte) 0xC3, b2[1]);
assertEquals((byte) 0xB8, b2[2]);
assertEquals('r', b2[3]);
assertEquals('s', b2[4]);
assertEquals('t', b2[5]);
assertEquals((byte) 0xC3, b3[0]);
assertEquals((byte) 0x85, b3[1]);
assertEquals('p', b3[2]);
assertEquals('e', b3[3]);
assertEquals('n', b3[4]);
assertEquals('\u5909', s4.charAt(0));
assertEquals('\u614B', s4.charAt(1));
String ss1 = Utf8.toString(b1);
String ss2 = Utf8.toString(b2);
String ss3 = Utf8.toString(b3);
byte[] bb4 = Utf8.toBytes(s4);
assertEquals(s1, ss1);
assertEquals(s3, ss3);
assertEquals(s2, ss2);
assertEquals(Utf8.toString(b4), Utf8.toString(bb4));
}
private int javaCountBytes(String str) {
byte[] octets = Utf8.toBytes(str);
return octets.length;
}
private String makeString(int codePoint) {
char[] chars = Character.toChars(codePoint);
return String.valueOf(chars);
}
@Test
public void testByteCounting() {
for (int c : TEST_CODEPOINTS) {
String testCharacter = makeString(c);
assertEquals(javaCountBytes(testCharacter), Utf8.byteCount(testCharacter));
}
assertEquals(javaCountBytes(TEST_STRING), Utf8.byteCount(TEST_STRING));
}
@Test
public void testTotalBytes() {
//Test with a random mix of
assertEquals(1,Utf8.totalBytes((byte)0x05));
assertEquals(4,Utf8.totalBytes((byte)0xF3));
assertEquals(4,Utf8.totalBytes((byte)0xF0));
assertEquals(1,Utf8.totalBytes((byte)0x7F));
assertEquals(2,Utf8.totalBytes((byte)0xC2));
assertEquals(3,Utf8.totalBytes((byte)0xE0));
}
@Test
public void testUnitCounting() {
for (int c : TEST_CODEPOINTS) {
String testCharacter = makeString(c);
byte[] utf8 = Utf8.toBytes(testCharacter);
assertEquals(testCharacter.length(), Utf8.unitCount(utf8));
assertEquals(testCharacter.length(), Utf8.unitCount(utf8[0]));
}
byte[] stringAsUtf8 = Utf8.toBytes(TEST_STRING);
assertEquals(TEST_STRING.length(), Utf8.unitCount(stringAsUtf8));
}
@Test
public void testCumbersomeEncoding() {
String[] a = {"abc", "def", "ghi\u00e8"};
int[] aLens = {3, 3, 5};
CharsetEncoder ce = Utf8.getNewEncoder();
ByteBuffer forWire = ByteBuffer.allocate(500);
for (int i = 0; i < a.length; i++) {
forWire.putInt(aLens[i]);
Utf8.toBytes(a[i], 0,
a[i].length(), forWire, ce);
}
forWire.flip();
int totalLimit = forWire.limit();
for (String anA : a) {
int len = forWire.getInt();
forWire.limit(forWire.position() + len);
String s = Utf8.toString(forWire);
assertEquals(anA, s);
forWire.limit(totalLimit);
}
assertEquals(0, forWire.remaining());
}
@Test
public void basic() {
String foo = "Washington";
int[] indexes = calculateBytePositions(foo);
assertThat(indexes.length, is(foo.length() + 1));
for (int i = 0; i < indexes.length; i++) {
assertThat(indexes[i], is(i));
}
}
@Test
public void decodeBasic() {
byte[] foo = Utf8.toBytes("Washington");
int[] indexes = calculateStringPositions(foo);
assertThat(indexes.length, is(foo.length + 1));
for (int i = 0; i < indexes.length; i++) {
assertThat(indexes[i], is(i));
}
}
@Test
public void highBytes() {
String foo = "\u0128st\u0200e";
//utf-8
// 0xC4A8 0x73 0x74 0xC880 0x65
int[] indexes = calculateBytePositions(foo);
assertThat(indexes.length, is(foo.length() + 1));
assertThat(indexes[0], is(0)); //128
assertThat(indexes[1], is(2)); //s
assertThat(indexes[2], is(3)); //t
assertThat(indexes[3], is(4)); //200
assertThat(indexes[4], is(6)); //e
}
@Test
public void decodeHighBytes() {
byte[] foo = Utf8.toBytes("\u0128st\u0200e");
//utf-8
// 0xC4A8 0x73 0x74 0xC880 0x65
int[] indexes = calculateStringPositions(foo);
assertThat(indexes.length, is(foo.length + 1));
assertThat(indexes[0], is(0)); //128
assertThat(indexes[1], is(0)); //128
assertThat(indexes[2], is(1)); //s
assertThat(indexes[3], is(2)); //t
assertThat(indexes[4], is(3)); //200
assertThat(indexes[5], is(3)); //200
assertThat(indexes[6], is(4)); //e
}
@Test
public void moreHighBytes() {
String foo = "\u0200\u0201\u0202abc\u0300def\u0301g\u07ff\u0800a\uffffa";
//utf-8
//0xC880 0xC881 0xC882 0x61 0x62 0x63 0xCC80 0x64 0x65 0x66 0xCC81 0x67 0xDFBF 0xE0A080 0x61 0xEFBFBF 0x61
int[] indexes = calculateBytePositions(foo);
assertThat(indexes.length, is(foo.length() + 1));
assertThat(indexes[0], is(0)); //200
assertThat(indexes[1], is(2)); //201
assertThat(indexes[2], is(4)); //202
assertThat(indexes[3], is(6)); //a
assertThat(indexes[4], is(7)); //b
assertThat(indexes[5], is(8)); //c
assertThat(indexes[6], is(9)); //300
assertThat(indexes[7], is(11)); //d
assertThat(indexes[8], is(12)); //e
assertThat(indexes[9], is(13)); //f
assertThat(indexes[10], is(14)); //301
assertThat(indexes[11], is(16)); //g
assertThat(indexes[12], is(17)); //7ff
assertThat(indexes[13], is(19)); //800
assertThat(indexes[14], is(22)); //a
assertThat(indexes[15], is(23)); //ffff
assertThat(indexes[16], is(26)); //a
}
@Test
public void decodeMoreHighBytes() {
String foo = "\u0200\u0201\u0202abc\u0300def\u0301g\u07ff\u0800a\uffffa";
//utf-8
//0xC880 0xC881 0xC882 0x61 0x62 0x63 0xCC80 0x64 0x65 0x66 0xCC81 0x67 0xDFBF 0xE0A080 0x61 0xEFBFBF 0x61
int[] indexes = calculateStringPositions(Utf8.toBytes(foo));
assertThat(indexes.length, is(28));
assertThat(indexes[0], is(0)); //200
assertThat(indexes[1], is(0)); //200
assertThat(indexes[2], is(1)); //201
assertThat(indexes[3], is(1)); //201
assertThat(indexes[4], is(2)); //202
assertThat(indexes[5], is(2)); //202
assertThat(indexes[6], is(3)); //a
assertThat(indexes[7], is(4)); //b
assertThat(indexes[8], is(5)); //c
assertThat(indexes[9], is(6)); //300
assertThat(indexes[10], is(6)); //300
assertThat(indexes[11], is(7)); //d
assertThat(indexes[12], is(8)); //e
assertThat(indexes[13], is(9)); //f
assertThat(indexes[14], is(10)); //301
assertThat(indexes[15], is(10)); //301
assertThat(indexes[16], is(11)); //g
assertThat(indexes[17], is(12)); //7ff
assertThat(indexes[18], is(12)); //7ff
assertThat(indexes[19], is(13)); //800
assertThat(indexes[20], is(13)); //800
assertThat(indexes[21], is(13)); //800
assertThat(indexes[22], is(14)); //a
assertThat(indexes[23], is(15)); //ffff
assertThat(indexes[24], is(15)); //ffff
assertThat(indexes[25], is(15)); //ffff
assertThat(indexes[26], is(16)); //a
}
@Test
public void testOptimisticEncoder() {
for (char i=0; i < 256; i++) {
StringBuilder sb = new StringBuilder();
for (char c=0; c < i; c++) {
sb.append(c);
}
assertArrayEquals(Utf8.toBytesStd(sb.toString()), Utf8.toBytes(sb.toString()));
}
}
@Test
public void testLong()
{
for (long l=-0x10000; l < 0x10000; l++) {
assertLongEquals(l);
}
assertLongEquals(Long.MAX_VALUE);
assertLongEquals(Long.MIN_VALUE);
}
private void assertLongEquals(long l) {
byte [] a = Utf8.toBytes(String.valueOf(l));
byte [] b = Utf8.toAsciiBytes(l);
if (!Arrays.equals(a, b)) {
assertArrayEquals(a, b);
}
}
@Test
public void testBoolean() {
assertEquals("true", String.valueOf(true));
assertEquals("false", String.valueOf(false));
assertArrayEquals(Utf8.toAsciiBytes(true), new Utf8String(String.valueOf(true)).getBytes());
assertArrayEquals(Utf8.toAsciiBytes(false), new Utf8String(String.valueOf(false)).getBytes());
}
@Test
public void testInt()
{
for (int l=-0x10000; l < 0x10000; l++) {
byte [] a = Utf8.toBytes(String.valueOf(l));
byte [] b = Utf8.toAsciiBytes(l);
if (!Arrays.equals(a, b)) {
assertArrayEquals(a, b);
}
}
}
@Test
public void testShort()
{
for (short l=-0x1000; l < 0x1000; l++) {
byte [] a = Utf8.toBytes(String.valueOf(l));
byte [] b = Utf8.toAsciiBytes(l);
if (!Arrays.equals(a, b)) {
assertArrayEquals(a, b);
}
}
}
@Test
public void surrogatePairs() {
String foo = "a\uD800\uDC00b";
//unicode
//0x61 0x10000 0x62
//utf-16
//0x61 0xD800DC00 0x62
//utf-8
//0x61 0xF0908080 0x62
int[] indexes = calculateBytePositions(foo);
assertThat(indexes.length, is(foo.length() + 1));
assertThat(indexes[0], is(0)); //a
assertThat(indexes[1], is(1)); //10000
assertThat(indexes[2], is(1)); //10000, second of surrogate pair
assertThat(indexes[3], is(5)); //b
}
@Test
public void decodeSurrogatePairs() {
String foo = "a\uD800\uDC00b";
//unicode
//0x61 0x10000 0x62
//utf-16
//0x61 0xD800DC00 0x62
//utf-8
//0x61 0xF0908080 0x62
int[] indexes = calculateStringPositions(Utf8.toBytes(foo));
assertThat(indexes.length, is(7));
assertThat(indexes[0], is(0)); //a
assertThat(indexes[1], is(1)); //10000
assertThat(indexes[2], is(1)); //10000
assertThat(indexes[3], is(1)); //10000
assertThat(indexes[4], is(1)); //10000
assertThat(indexes[5], is(2)); //b
}
@Test
public void encodeStartEndPositions() {
String foo = "abcde";
int start = 0;
int length = foo.length(); //5
int end = start + length;
int[] indexes = calculateBytePositions(foo);
int byteStart = indexes[start];
int byteEnd = indexes[end];
int byteLength = byteEnd - byteStart;
assertThat(byteStart, equalTo(start));
assertThat(byteEnd, equalTo(end));
assertThat(byteLength, equalTo(length));
}
@Test
public void encodeStartEndPositionsMultibyteCharsAtEnd() {
String foo = "\u0200abcde\uD800\uDC00";
int start = 0;
int length = foo.length(); //8
int end = start + length;
int[] indexes = calculateBytePositions(foo);
int byteStart = indexes[start];
int byteEnd = indexes[end];
int byteLength = byteEnd - byteStart;
//utf-8
//0xC880 a b c d e 0xD800DC00
assertThat(byteStart, equalTo(start));
assertThat(byteEnd, equalTo(11));
assertThat(byteLength, equalTo(11));
}
@Test
public void decodeStartEndPositions() {
byte[] foo = Utf8.toBytes("abcde");
int start = 0;
int length = foo.length; //5
int end = start + length;
int[] indexes = calculateStringPositions(foo);
int stringStart = indexes[start];
int stringEnd = indexes[end];
int stringLength = stringEnd - stringStart;
assertThat(stringStart, equalTo(start));
assertThat(stringEnd, equalTo(end));
assertThat(stringLength, equalTo(length));
}
@Test
public void decodeStartEndPositionsMultibyteCharsAtEnd() {
byte[] foo = Utf8.toBytes("\u0200abcde\uD800\uDC00");
int start = 0;
int length = foo.length; //11
int end = start + length;
int[] indexes = calculateStringPositions(foo);
int stringStart = indexes[start];
int stringEnd = indexes[end];
int stringLength = stringEnd - stringStart;
//utf-8
//0xC880 a b c d e 0xD800DC00
assertThat(stringStart, equalTo(start));
assertThat(stringEnd, equalTo(8));
assertThat(stringLength, equalTo(8));
}
@Test
public void emptyInputStringResultsInArrayWithSingleZero() {
byte[] empty = new byte[] {};
int[] indexes = calculateStringPositions(empty);
assertThat(indexes.length, is(1));
assertThat(indexes[0], is(0));
}
@Test
public void testEncoding() {
for (int c : TEST_CODEPOINTS) {
byte[] encoded = Utf8.encode(c);
String testCharacter = makeString(c);
byte[] utf8 = Utf8.toBytes(testCharacter);
assertArrayEquals(utf8, encoded);
}
byte[] stringAsUtf8 = Utf8.toBytes(TEST_STRING);
byte[] handEncoded = new byte[Utf8.byteCount(TEST_STRING)];
for (int i = 0, j = 0; i < TEST_STRING.length(); i = TEST_STRING.offsetByCodePoints(i, 1)) {
j = Utf8.encode(TEST_STRING.codePointAt(i), handEncoded, j);
}
assertArrayEquals(stringAsUtf8, handEncoded);
}
@Test
public void testStreamEncoding() throws IOException {
for (int c : TEST_CODEPOINTS) {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
Utf8.encode(c, buffer);
byte[] encoded = buffer.toByteArray();
String testCharacter = makeString(c);
byte[] utf8 = Utf8.toBytes(testCharacter);
assertArrayEquals(utf8, encoded);
}
byte[] stringAsUtf8 = Utf8.toBytes(TEST_STRING);
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
for (int i = 0; i < TEST_STRING.length(); i = TEST_STRING.offsetByCodePoints(i, 1)) {
Utf8.encode(TEST_STRING.codePointAt(i), buffer);
}
byte[] handEncoded = buffer.toByteArray();
assertArrayEquals(stringAsUtf8, handEncoded);
}
@Test
public void testByteBufferEncoding() {
for (int c : TEST_CODEPOINTS) {
ByteBuffer buffer = ByteBuffer.allocate(4);
Utf8.encode(c, buffer);
byte[] encoded = new byte[buffer.position()];
buffer.flip();
for (int i = 0; i < encoded.length; ++i) {
encoded[i] = buffer.get();
}
String testCharacter = makeString(c);
byte[] utf8 = Utf8.toBytes(testCharacter);
assertArrayEquals(utf8, encoded);
}
byte[] stringAsUtf8 = Utf8.toBytes(TEST_STRING);
ByteBuffer buffer = ByteBuffer.allocate(TEST_STRING.length() * 4);
for (int i = 0; i < TEST_STRING.length(); i = TEST_STRING.offsetByCodePoints(i, 1)) {
Utf8.encode(TEST_STRING.codePointAt(i), buffer);
}
byte[] handEncoded = new byte[buffer.position()];
buffer.flip();
for (int i = 0; i < handEncoded.length; ++i) {
handEncoded[i] = buffer.get();
}
assertArrayEquals(stringAsUtf8, handEncoded);
}
@Test
@Ignore
public void benchmarkDecoding() {
byte[] ascii = "This is just sort of random mix.".getBytes();
byte[] unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8".getBytes(StandardCharsets.UTF_8);
int iterations = 100_000; // Use 100_000+ for benchmarking
Map.of("ascii", ascii, "unicode", unicode).forEach((type, b) -> {
long time1 = benchmark(() -> decode(Utf8::toString, b, iterations));
System.out.printf("Utf8::toString of %s string took %d ms\n", type, time1);
long time2 = benchmark(() -> decode((b1) -> new String(b1, StandardCharsets.UTF_8), b, iterations));
System.out.printf("String::new of %s string took %d ms\n", type, time2);
double change = ((double) time2 / (double) time1) - 1;
System.out.printf("Change = %.02f%%\n", change * 100);
});
}
@Test
@Ignore
public void benchmarkEncoding() {
String ascii = "This is just sort of random mix.";
String unicode = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8";
int iterations = 1_000_000; // Use 1_000_000+ for benchmarking
Map.of("ascii", ascii, "unicode", unicode).forEach((type, s) -> {
long time1 = benchmark(() -> encode(Utf8::toBytes, s, iterations));
System.out.printf("Utf8::toBytes of %s string took %d ms\n", type, time1);
long time2 = benchmark(() -> encode((s1) -> s1.getBytes(StandardCharsets.UTF_8), s, iterations));
System.out.printf("String::getBytes of %s string took %d ms\n", type, time2);
double change = ((double) time2 / (double) time1) - 1;
System.out.printf("Change = %.02f%%\n", change * 100);
});
}
private byte[] encode(Function encoder, String s, int iterations) {
byte[] res = null;
for (int i = 0; i < iterations; i++) {
res = encoder.apply(s + i); // Append counter to avoid String cache
}
return res;
}
private String decode(Function decoder, byte[] b, int iterations) {
String res = null;
for (int i = 0; i < iterations; i++) {
// Append counter to avoid String cache
byte[] counter = String.valueOf(i).getBytes();
byte[] result = new byte[b.length + counter.length];
System.arraycopy(b, 0, result, 0, b.length);
System.arraycopy(counter, 0, result, b.length, counter.length);
res = decoder.apply(result);
}
return res;
}
private long benchmark(Runnable r) {
r.run(); // Warmup
long start = System.currentTimeMillis();
r.run();
long end = System.currentTimeMillis();
return end - start;
}
}