// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.text; import org.junit.Ignore; import org.junit.Test; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.util.Arrays; import static com.yahoo.text.Lowercase.toLowerCase; import static com.yahoo.text.Utf8.calculateBytePositions; import static com.yahoo.text.Utf8.calculateStringPositions; import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.CoreMatchers.is; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; /** * @author Bjorn Borud * @author Steinar Knutsen */ public class Utf8TestCase { private static final String TEST_STRING = "This is just sort of random mix. \u5370\u57df\u60c5\u5831\u53EF\u4EE5\u6709x\u00e9\u00e8"; private static final int[] TEST_CODEPOINTS = {0x0, 0x7f, 0x80, 0x7ff, 0x800, 0xd7ff, 0xe000, 0xffff, 0x10000, 0x10ffff, 0x34, 0x355, 0x2567, 0xfff, 0xe987, 0x100abc }; public void dumpSome() throws java.io.IOException { int i = 32; int j = 3; int cnt = 0; while (i < 0x110000) { if (i < 0xD800 || i >= 0xE000) ++cnt; i += j; ++j; } System.out.println("allocate "+cnt+" array entries"); int codes[] = new int[cnt]; i = 32; j = 3; cnt = 0; while (i < 0x110000) { if (i < 0xD800 || i >= 0xE000) codes[cnt++] = i; i += j; ++j; } assertEquals(cnt, codes.length); System.out.println("fill "+cnt+" array entries"); String str = new String(codes, 0, cnt); byte[] arr = Utf8.toBytes(str); java.io.FileOutputStream fos = new java.io.FileOutputStream("random-long-utf8.dat"); fos.write(arr); fos.close(); } public void dumpMore() throws java.io.IOException { java.text.Normalizer.Form form = java.text.Normalizer.Form.NFKC; java.io.FileOutputStream fos = new java.io.FileOutputStream("lowercase-table.dat"); for (int i = 0; i < 0x110000; i++) { StringBuilder b = new StringBuilder(); b.appendCodePoint(i); String n1 = b.toString(); String n2 = java.text.Normalizer.normalize(b, form); if (n1.equals(n2)) { String l = toLowerCase(n1); int chars = l.length(); int codes = l.codePointCount(0, chars); if (codes != 1) { System.out.println("codepoint "+i+" transformed into "+codes+" codepoints: "+n1+" -> "+l); } else { int lc = l.codePointAt(0); if (lc != i) { String o = "lowercase( "+i+" )= "+lc+"\n"; byte[] arr = Utf8.toBytes(o); fos.write(arr); } } } } fos.close(); } @Test public void testSimple() { String s1 = "test"; String s2 = "f\u00F8rst"; String s3 = "\u00C5pen"; byte[] b4 = { (byte) 0xE5, (byte) 0xA4, (byte) 0x89, (byte) 0xE6, (byte) 0x85, (byte) 0x8B }; byte[] b1 = Utf8.toBytes(s1); byte[] b2 = Utf8.toBytes(s2); byte[] b3 = Utf8.toBytes(s3); String s4 = Utf8.toString(b4); assertEquals('t', b1[0]); assertEquals('e', b1[1]); assertEquals('s', b1[2]); assertEquals('t', b1[3]); assertEquals('f', b2[0]); assertEquals((byte) 0xC3, b2[1]); assertEquals((byte) 0xB8, b2[2]); assertEquals('r', b2[3]); assertEquals('s', b2[4]); assertEquals('t', b2[5]); assertEquals((byte) 0xC3, b3[0]); assertEquals((byte) 0x85, b3[1]); assertEquals('p', b3[2]); assertEquals('e', b3[3]); assertEquals('n', b3[4]); assertEquals('\u5909', s4.charAt(0)); assertEquals('\u614B', s4.charAt(1)); String ss1 = Utf8.toString(b1); String ss2 = Utf8.toString(b2); String ss3 = Utf8.toString(b3); byte[] bb4 = Utf8.toBytes(s4); assertEquals(s1, ss1); assertEquals(s3, ss3); assertEquals(s2, ss2); assertEquals(Utf8.toString(b4), Utf8.toString(bb4)); } private int javaCountBytes(String str) { byte[] octets = Utf8.toBytes(str); return octets.length; } private String makeString(int codePoint) { char[] chars = Character.toChars(codePoint); return String.valueOf(chars); } @Test public void testByteCounting() { for (int c : TEST_CODEPOINTS) { String testCharacter = makeString(c); assertEquals(javaCountBytes(testCharacter), Utf8.byteCount(testCharacter)); } assertEquals(javaCountBytes(TEST_STRING), Utf8.byteCount(TEST_STRING)); } @Test public void testTotalBytes() { //Test with a random mix of assertEquals(1,Utf8.totalBytes((byte)0x05)); assertEquals(4,Utf8.totalBytes((byte)0xF3)); assertEquals(4,Utf8.totalBytes((byte)0xF0)); assertEquals(1,Utf8.totalBytes((byte)0x7F)); assertEquals(2,Utf8.totalBytes((byte)0xC2)); assertEquals(3,Utf8.totalBytes((byte)0xE0)); } @Test public void testUnitCounting() { for (int c : TEST_CODEPOINTS) { String testCharacter = makeString(c); byte[] utf8 = Utf8.toBytes(testCharacter); assertEquals(testCharacter.length(), Utf8.unitCount(utf8)); assertEquals(testCharacter.length(), Utf8.unitCount(utf8[0])); } byte[] stringAsUtf8 = Utf8.toBytes(TEST_STRING); assertEquals(TEST_STRING.length(), Utf8.unitCount(stringAsUtf8)); } @Test public void testCumbersomeEncoding() { String[] a = {"abc", "def", "ghi\u00e8"}; int[] aLens = {3, 3, 5}; CharsetEncoder ce = Utf8.getNewEncoder(); ByteBuffer forWire = ByteBuffer.allocate(500); for (int i = 0; i < a.length; i++) { forWire.putInt(aLens[i]); Utf8.toBytes(a[i], 0, a[i].length(), forWire, ce); } forWire.flip(); int totalLimit = forWire.limit(); for (String anA : a) { int len = forWire.getInt(); forWire.limit(forWire.position() + len); String s = Utf8.toString(forWire); assertEquals(anA, s); forWire.limit(totalLimit); } assertEquals(0, forWire.remaining()); } @Test public void basic() { String foo = "Washington"; int[] indexes = calculateBytePositions(foo); assertThat(indexes.length, is(foo.length() + 1)); for (int i = 0; i < indexes.length; i++) { assertThat(indexes[i], is(i)); } } @Test public void decodeBasic() { byte[] foo = Utf8.toBytes("Washington"); int[] indexes = calculateStringPositions(foo); assertThat(indexes.length, is(foo.length + 1)); for (int i = 0; i < indexes.length; i++) { assertThat(indexes[i], is(i)); } } @Test public void highBytes() { String foo = "\u0128st\u0200e"; //utf-8 // 0xC4A8 0x73 0x74 0xC880 0x65 int[] indexes = calculateBytePositions(foo); assertThat(indexes.length, is(foo.length() + 1)); assertThat(indexes[0], is(0)); //128 assertThat(indexes[1], is(2)); //s assertThat(indexes[2], is(3)); //t assertThat(indexes[3], is(4)); //200 assertThat(indexes[4], is(6)); //e } @Test public void decodeHighBytes() { byte[] foo = Utf8.toBytes("\u0128st\u0200e"); //utf-8 // 0xC4A8 0x73 0x74 0xC880 0x65 int[] indexes = calculateStringPositions(foo); assertThat(indexes.length, is(foo.length + 1)); assertThat(indexes[0], is(0)); //128 assertThat(indexes[1], is(0)); //128 assertThat(indexes[2], is(1)); //s assertThat(indexes[3], is(2)); //t assertThat(indexes[4], is(3)); //200 assertThat(indexes[5], is(3)); //200 assertThat(indexes[6], is(4)); //e } @Test public void moreHighBytes() { String foo = "\u0200\u0201\u0202abc\u0300def\u0301g\u07ff\u0800a\uffffa"; //utf-8 //0xC880 0xC881 0xC882 0x61 0x62 0x63 0xCC80 0x64 0x65 0x66 0xCC81 0x67 0xDFBF 0xE0A080 0x61 0xEFBFBF 0x61 int[] indexes = calculateBytePositions(foo); assertThat(indexes.length, is(foo.length() + 1)); assertThat(indexes[0], is(0)); //200 assertThat(indexes[1], is(2)); //201 assertThat(indexes[2], is(4)); //202 assertThat(indexes[3], is(6)); //a assertThat(indexes[4], is(7)); //b assertThat(indexes[5], is(8)); //c assertThat(indexes[6], is(9)); //300 assertThat(indexes[7], is(11)); //d assertThat(indexes[8], is(12)); //e assertThat(indexes[9], is(13)); //f assertThat(indexes[10], is(14)); //301 assertThat(indexes[11], is(16)); //g assertThat(indexes[12], is(17)); //7ff assertThat(indexes[13], is(19)); //800 assertThat(indexes[14], is(22)); //a assertThat(indexes[15], is(23)); //ffff assertThat(indexes[16], is(26)); //a } @Test public void decodeMoreHighBytes() { String foo = "\u0200\u0201\u0202abc\u0300def\u0301g\u07ff\u0800a\uffffa"; //utf-8 //0xC880 0xC881 0xC882 0x61 0x62 0x63 0xCC80 0x64 0x65 0x66 0xCC81 0x67 0xDFBF 0xE0A080 0x61 0xEFBFBF 0x61 int[] indexes = calculateStringPositions(Utf8.toBytes(foo)); assertThat(indexes.length, is(28)); assertThat(indexes[0], is(0)); //200 assertThat(indexes[1], is(0)); //200 assertThat(indexes[2], is(1)); //201 assertThat(indexes[3], is(1)); //201 assertThat(indexes[4], is(2)); //202 assertThat(indexes[5], is(2)); //202 assertThat(indexes[6], is(3)); //a assertThat(indexes[7], is(4)); //b assertThat(indexes[8], is(5)); //c assertThat(indexes[9], is(6)); //300 assertThat(indexes[10], is(6)); //300 assertThat(indexes[11], is(7)); //d assertThat(indexes[12], is(8)); //e assertThat(indexes[13], is(9)); //f assertThat(indexes[14], is(10)); //301 assertThat(indexes[15], is(10)); //301 assertThat(indexes[16], is(11)); //g assertThat(indexes[17], is(12)); //7ff assertThat(indexes[18], is(12)); //7ff assertThat(indexes[19], is(13)); //800 assertThat(indexes[20], is(13)); //800 assertThat(indexes[21], is(13)); //800 assertThat(indexes[22], is(14)); //a assertThat(indexes[23], is(15)); //ffff assertThat(indexes[24], is(15)); //ffff assertThat(indexes[25], is(15)); //ffff assertThat(indexes[26], is(16)); //a } @Test public void testOptimisticEncoder() { for (char i=0; i < 256; i++) { StringBuilder sb = new StringBuilder(); for (char c=0; c < i; c++) { sb.append(c); } assertTrue(Arrays.equals(Utf8.toBytesStd(sb.toString()), Utf8.toBytes(sb.toString()))); } } @Test public void testLong() { for (long l=-0x10000; l < 0x10000; l++) { assertLongEquals(l); } assertLongEquals(Long.MAX_VALUE); assertLongEquals(Long.MIN_VALUE); } private void assertLongEquals(long l) { byte [] a = Utf8.toBytes(String.valueOf(l)); byte [] b = Utf8.toAsciiBytes(l); if (!Arrays.equals(a, b)) { assertTrue(Arrays.equals(a, b)); } } @Test public void testBoolean() { assertEquals("true", String.valueOf(true)); assertEquals("false", String.valueOf(false)); assertTrue(Arrays.equals(Utf8.toAsciiBytes(true), new Utf8String(String.valueOf(true)).getBytes())); assertTrue(Arrays.equals(Utf8.toAsciiBytes(false), new Utf8String(String.valueOf(false)).getBytes())); } @Test public void testInt() { for (int l=-0x10000; l < 0x10000; l++) { byte [] a = Utf8.toBytes(String.valueOf(l)); byte [] b = Utf8.toAsciiBytes(l); if (!Arrays.equals(a, b)) { assertTrue(Arrays.equals(a, b)); } } } @Test public void testShort() { for (short l=-0x1000; l < 0x1000; l++) { byte [] a = Utf8.toBytes(String.valueOf(l)); byte [] b = Utf8.toAsciiBytes(l); if (!Arrays.equals(a, b)) { assertTrue(Arrays.equals(a, b)); } } } @Test public void surrogatePairs() { String foo = "a\uD800\uDC00b"; //unicode //0x61 0x10000 0x62 //utf-16 //0x61 0xD800DC00 0x62 //utf-8 //0x61 0xF0908080 0x62 int[] indexes = calculateBytePositions(foo); assertThat(indexes.length, is(foo.length() + 1)); assertThat(indexes[0], is(0)); //a assertThat(indexes[1], is(1)); //10000 assertThat(indexes[2], is(1)); //10000, second of surrogate pair assertThat(indexes[3], is(5)); //b } @Test public void decodeSurrogatePairs() { String foo = "a\uD800\uDC00b"; //unicode //0x61 0x10000 0x62 //utf-16 //0x61 0xD800DC00 0x62 //utf-8 //0x61 0xF0908080 0x62 int[] indexes = calculateStringPositions(Utf8.toBytes(foo)); assertThat(indexes.length, is(7)); assertThat(indexes[0], is(0)); //a assertThat(indexes[1], is(1)); //10000 assertThat(indexes[2], is(1)); //10000 assertThat(indexes[3], is(1)); //10000 assertThat(indexes[4], is(1)); //10000 assertThat(indexes[5], is(2)); //b } @Test public void encodeStartEndPositions() { String foo = "abcde"; int start = 0; int length = foo.length(); //5 int end = start + length; int[] indexes = calculateBytePositions(foo); int byteStart = indexes[start]; int byteEnd = indexes[end]; int byteLength = byteEnd - byteStart; assertThat(byteStart, equalTo(start)); assertThat(byteEnd, equalTo(end)); assertThat(byteLength, equalTo(length)); } @Test public void encodeStartEndPositionsMultibyteCharsAtEnd() { String foo = "\u0200abcde\uD800\uDC00"; int start = 0; int length = foo.length(); //8 int end = start + length; int[] indexes = calculateBytePositions(foo); int byteStart = indexes[start]; int byteEnd = indexes[end]; int byteLength = byteEnd - byteStart; //utf-8 //0xC880 a b c d e 0xD800DC00 assertThat(byteStart, equalTo(start)); assertThat(byteEnd, equalTo(11)); assertThat(byteLength, equalTo(11)); } @Test public void decodeStartEndPositions() { byte[] foo = Utf8.toBytes("abcde"); int start = 0; int length = foo.length; //5 int end = start + length; int[] indexes = calculateStringPositions(foo); int stringStart = indexes[start]; int stringEnd = indexes[end]; int stringLength = stringEnd - stringStart; assertThat(stringStart, equalTo(start)); assertThat(stringEnd, equalTo(end)); assertThat(stringLength, equalTo(length)); } @Test public void decodeStartEndPositionsMultibyteCharsAtEnd() { byte[] foo = Utf8.toBytes("\u0200abcde\uD800\uDC00"); int start = 0; int length = foo.length; //11 int end = start + length; int[] indexes = calculateStringPositions(foo); int stringStart = indexes[start]; int stringEnd = indexes[end]; int stringLength = stringEnd - stringStart; //utf-8 //0xC880 a b c d e 0xD800DC00 assertThat(stringStart, equalTo(start)); assertThat(stringEnd, equalTo(8)); assertThat(stringLength, equalTo(8)); } @Test public void emptyInputStringResultsInArrayWithSingleZero() { byte[] empty = new byte[] {}; int[] indexes = calculateStringPositions(empty); assertThat(indexes.length, is(1)); assertThat(indexes[0], is(0)); } @Test public void testEncoding() { for (int c : TEST_CODEPOINTS) { byte[] encoded = Utf8.encode(c); String testCharacter = makeString(c); byte[] utf8 = Utf8.toBytes(testCharacter); assertArrayEquals(utf8, encoded); } byte[] stringAsUtf8 = Utf8.toBytes(TEST_STRING); byte[] handEncoded = new byte[Utf8.byteCount(TEST_STRING)]; for (int i = 0, j = 0; i < TEST_STRING.length(); i = TEST_STRING.offsetByCodePoints(i, 1)) { j = Utf8.encode(TEST_STRING.codePointAt(i), handEncoded, j); } assertArrayEquals(stringAsUtf8, handEncoded); } @Test public void testStreamEncoding() throws IOException { for (int c : TEST_CODEPOINTS) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); Utf8.encode(c, buffer); byte[] encoded = buffer.toByteArray(); String testCharacter = makeString(c); byte[] utf8 = Utf8.toBytes(testCharacter); assertArrayEquals(utf8, encoded); } byte[] stringAsUtf8 = Utf8.toBytes(TEST_STRING); ByteArrayOutputStream buffer = new ByteArrayOutputStream(); for (int i = 0; i < TEST_STRING.length(); i = TEST_STRING.offsetByCodePoints(i, 1)) { Utf8.encode(TEST_STRING.codePointAt(i), buffer); } byte[] handEncoded = buffer.toByteArray(); assertArrayEquals(stringAsUtf8, handEncoded); } @Test public void testByteBufferEncoding() { for (int c : TEST_CODEPOINTS) { ByteBuffer buffer = ByteBuffer.allocate(4); Utf8.encode(c, buffer); byte[] encoded = new byte[buffer.position()]; buffer.flip(); for (int i = 0; i < encoded.length; ++i) { encoded[i] = buffer.get(); } String testCharacter = makeString(c); byte[] utf8 = Utf8.toBytes(testCharacter); assertArrayEquals(utf8, encoded); } byte[] stringAsUtf8 = Utf8.toBytes(TEST_STRING); ByteBuffer buffer = ByteBuffer.allocate(TEST_STRING.length() * 4); for (int i = 0; i < TEST_STRING.length(); i = TEST_STRING.offsetByCodePoints(i, 1)) { Utf8.encode(TEST_STRING.codePointAt(i), buffer); } byte[] handEncoded = new byte[buffer.position()]; buffer.flip(); for (int i = 0; i < handEncoded.length; ++i) { handEncoded[i] = buffer.get(); } assertArrayEquals(stringAsUtf8, handEncoded); } }