diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2022-06-07 13:13:45 +0200 |
---|---|---|
committer | gjoranv <gv@verizonmedia.com> | 2022-06-08 11:45:31 +0200 |
commit | 5fce9fedea4b8259d8d1bc1d26d47cc3b837b252 (patch) | |
tree | 5ae004ca96fa829088302972cbe33dd00c611a48 /document | |
parent | 40cf39429211d66616f8dce58198ecdfe43735eb (diff) |
Further garbage collection (GC) of document-level compression. Avoids a buffer copy that is no longer needed.
Diffstat (limited to 'document')
-rw-r--r-- | document/doc/document-format.html | 18 | ||||
-rwxr-xr-x | document/src/main/java/com/yahoo/document/BaseStructDataType.java | 3 | ||||
-rw-r--r-- | document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java | 2 | ||||
-rw-r--r-- | document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java | 51 | ||||
-rw-r--r-- | document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java | 2 | ||||
-rw-r--r-- | document/src/test/document/serializecpp-lz4-level9.dat | bin | 336 -> 0 bytes | |||
-rw-r--r-- | document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java | 31 | ||||
-rw-r--r-- | document/src/test/java/com/yahoo/document/DocumentTestCase.java | 28 | ||||
-rw-r--r-- | document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat | bin | 312 -> 0 bytes | |||
-rw-r--r-- | document/src/tests/.gitignore | 1 | ||||
-rw-r--r-- | document/src/tests/data/document-cpp-currentversion-lz4-9.dat | bin | 305 -> 0 bytes |
11 files changed, 33 insertions, 103 deletions
diff --git a/document/doc/document-format.html b/document/doc/document-format.html index 26e6cceca5c..ce985b8a10d 100644 --- a/document/doc/document-format.html +++ b/document/doc/document-format.html @@ -19,7 +19,6 @@ look-ups.</p> <li><b>Robustness</b>. The format shall detect errors gracefully.</li> <li><b>Speed</b>. Deserialization shall be fast, especially for basic fields like <b>DocumentId</b>.</li> <li><b>Size</b>. The serialized format shall be compact and allow for efficient storage and transfer. -That is partly achieved by allowing different kinds of compression. As of now <b>lz4</b> are supported.</li> </ul> </p> @@ -137,26 +136,9 @@ Inventory bits describing the FieldMap element with data:<br> <td>2, 4 or 8</td> <td>Length of data block (see below). NOTE THAT THIS LENGTH INCLUDE ITSELF.</td> </tr> -<tr><td>Compression</td> -<td>Byte</td> -<td>1</td> -<td>Compression method -<br> - 0: No compression<br> - 5: Uncompressable<br> - 6: lz4 <br> -<p>Note that the uncompressable flag is not a configurable option. 
Rather it -will be used in document instances who are configured for compression, but -where compression yields negative results, to avoid later serializations to -retry compression.</p> -</td> <tr><td>Number of fields<td>Integer_1_4</td> <td>1 or 4</td> <td>Number of fields in data array</td> -<tr><td colspan = "4"><b>Below item is present if compression method is not uncompressed or uncompressable</b></td></tr> -<tr><td>Uncompressed data length<td>Integer_2_4_8</td> -<td>2, 4 or 8</td> -<td>Length of data block after decompression</td> <tr><td colspan = "4"><b>Below block is repeated "Number of fields" times</b></td></tr> <tr><td>Field ID<td>Integer_1_4</td> <td>1 or 4</td> diff --git a/document/src/main/java/com/yahoo/document/BaseStructDataType.java b/document/src/main/java/com/yahoo/document/BaseStructDataType.java index a84a08c5677..adecae9d7eb 100755 --- a/document/src/main/java/com/yahoo/document/BaseStructDataType.java +++ b/document/src/main/java/com/yahoo/document/BaseStructDataType.java @@ -1,9 +1,6 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.document; -import com.yahoo.compress.CompressionType; -import com.yahoo.compress.Compressor; - import java.util.Collection; import java.util.LinkedHashMap; import java.util.Map; diff --git a/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java b/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java index 2f2f5647603..d6833a482f1 100644 --- a/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java +++ b/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java @@ -1,7 +1,6 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.document; -import com.yahoo.compress.CompressionType; import com.yahoo.config.subscription.ConfigSubscriber; import com.yahoo.document.annotation.AnnotationReferenceDataType; import com.yahoo.document.annotation.AnnotationType; @@ -434,7 +433,6 @@ public class DocumentTypeManagerConfigurer implements ConfigSubscriber.SingleSub } void createEmptyStructs() { - String docName = docTypeConfig.name(); for (var typeconf : docTypeConfig.structtype()) { if (isPositionStruct(typeconf)) { int geoVersion = usev8geopositions ? 8 : 7; diff --git a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java index 47ae31b53fe..2796609543f 100644 --- a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java +++ b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java @@ -2,8 +2,6 @@ package com.yahoo.document.serialization; import com.yahoo.collections.Tuple2; -import com.yahoo.compress.CompressionType; -import com.yahoo.compress.Compressor; import com.yahoo.document.annotation.AlternateSpanList; import com.yahoo.document.annotation.Annotation; import com.yahoo.document.annotation.AnnotationReference; @@ -69,7 +67,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Optional; import static com.yahoo.text.Utf8.calculateStringPositions; @@ -80,8 +77,7 @@ import static com.yahoo.text.Utf8.calculateStringPositions; */ public class VespaDocumentDeserializer6 extends BufferSerializer implements DocumentDeserializer { - private final Compressor compressor = new Compressor(); - private DocumentTypeManager manager; + private final DocumentTypeManager manager; private short version; private List<SpanNode> spanNodes; private List<Annotation> annotations; @@ -263,21 +259,7 @@ public class VespaDocumentDeserializer6 extends 
BufferSerializer implements Docu } int dataSize = getInt(null); - byte comprCode = getByte(null); - CompressionType compression = CompressionType.valueOf(comprCode); - - int uncompressedSize = 0; - if (compression != CompressionType.NONE && - compression != CompressionType.INCOMPRESSIBLE) - { - // uncompressedsize (full size of FIELDS only, after decompression) - long pSize = getInt2_4_8Bytes(null); - //TODO: Look into how to support data segments larger than INT_MAX bytes - if (pSize > Integer.MAX_VALUE) { - throw new DeserializationException("Uncompressed size of data block is too large."); - } - uncompressedSize = (int) pSize; - } + byte ignoredComprCode = getByte(null); int numberOfFields = getInt1_4Bytes(null); @@ -289,14 +271,12 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu // save a reference to the big buffer we're reading from: GrowableByteBuffer bigBuf = buf; - - byte[] destination = compressor.decompress(compression, getBuf().array(), position(), uncompressedSize, Optional.of(dataSize)); - + GrowableByteBuffer thisStructOnly = GrowableByteBuffer.wrap(getBuf().array(), position(), dataSize); // set position in original buffer to after data position(position() + dataSize); // for a while: deserialize from this buffer instead: - buf = GrowableByteBuffer.wrap(destination); + buf = thisStructOnly; s.clear(); StructDataType type = s.getDataType(); @@ -325,21 +305,7 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu } int dataSize = getInt(null); - byte comprCode = getByte(null); - CompressionType compression = CompressionType.valueOf(comprCode); - - int uncompressedSize = 0; - if (compression != CompressionType.NONE && - compression != CompressionType.INCOMPRESSIBLE) - { - // uncompressedsize (full size of FIELDS only, after decompression) - long pSize = getInt2_4_8Bytes(null); - //TODO: Look into how to support data segments larger than INT_MAX bytes - if (pSize > Integer.MAX_VALUE) { - 
throw new DeserializationException("Uncompressed size of data block is too large."); - } - uncompressedSize = (int) pSize; - } + byte unusedComprCode = getByte(null); int numberOfFields = getInt1_4Bytes(null); @@ -351,14 +317,13 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu // save a reference to the big buffer we're reading from: GrowableByteBuffer bigBuf = buf; - - byte[] destination = compressor.decompress(compression, getBuf().array(), position(), uncompressedSize, Optional.of(dataSize)); + GrowableByteBuffer thisStructOnly = GrowableByteBuffer.wrap(getBuf().array(), position(), dataSize); // set position in original buffer to after data position(position() + dataSize); // for a while: deserialize from this buffer instead: - buf = GrowableByteBuffer.wrap(destination); + buf = thisStructOnly; StructDataType priType = target.getDataType().contentStruct(); @@ -613,7 +578,7 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu DocumentType docType = manager.getDocumentType(new DataTypeName(docTypeName)); if (docType == null) { throw new DeserializationException("No known document type with name " + - new Utf8String(docTypeName).toString()); + new Utf8String(docTypeName)); } return docType; } diff --git a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java index ee36dcb1a9c..2de345b6e35 100644 --- a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java +++ b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java @@ -483,7 +483,7 @@ public class VespaDocumentSerializer6 extends BufferSerializer implements Docume write(tree.getRoot()); { //add all annotations to temporary list and sort it, to get predictable serialization - List<Annotation> tmpAnnotationList = new ArrayList<Annotation>(tree.numAnnotations()); + 
List<Annotation> tmpAnnotationList = new ArrayList<>(tree.numAnnotations()); for (Annotation annotation : tree) { tmpAnnotationList.add(annotation); } diff --git a/document/src/test/document/serializecpp-lz4-level9.dat b/document/src/test/document/serializecpp-lz4-level9.dat Binary files differdeleted file mode 100644 index 1dffaa2d7a7..00000000000 --- a/document/src/test/document/serializecpp-lz4-level9.dat +++ /dev/null diff --git a/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java b/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java index 77b621a3c36..1a5016fb155 100644 --- a/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java +++ b/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java @@ -1,7 +1,6 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.document; -import com.yahoo.compress.CompressionType; import com.yahoo.document.annotation.AbstractTypesTest; import com.yahoo.document.datatypes.Array; import com.yahoo.document.datatypes.BoolFieldValue; @@ -13,7 +12,10 @@ import com.yahoo.document.datatypes.LongFieldValue; import com.yahoo.document.datatypes.Raw; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.datatypes.WeightedSet; -import com.yahoo.document.serialization.*; +import com.yahoo.document.serialization.DocumentDeserializer; +import com.yahoo.document.serialization.DocumentDeserializerFactory; +import com.yahoo.document.serialization.DocumentSerializer; +import com.yahoo.document.serialization.DocumentSerializerFactory; import com.yahoo.io.GrowableByteBuffer; import org.junit.Test; @@ -27,9 +29,10 @@ import java.util.Arrays; import java.util.List; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; import static 
org.junit.Assert.assertNotSame; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; /** * Tests serialization of all versions. @@ -92,7 +95,7 @@ public class DocumentSerializationTestCase extends AbstractTypesTest { doc.setFieldValue("intfield", 5); doc.setFieldValue("floatfield", -9.23); doc.setFieldValue("stringfield", "This is a string."); - doc.setFieldValue("longfield", new LongFieldValue(398420092938472983l)); + doc.setFieldValue("longfield", new LongFieldValue(398420092938472983L)); doc.setFieldValue("doublefield", new DoubleFieldValue(98374532.398820)); doc.setFieldValue("bytefield", new ByteFieldValue(254)); doc.setFieldValue("boolfield", new BoolFieldValue(true)); @@ -120,8 +123,8 @@ public class DocumentSerializationTestCase extends AbstractTypesTest { class TestDoc { - String testFile; - int version; + final String testFile; + final int version; TestDoc(String testFile, int version) { this.testFile = testFile; @@ -133,10 +136,8 @@ public class DocumentSerializationTestCase extends AbstractTypesTest { List<TestDoc> tests = new ArrayList<>(); tests.add(new TestDoc(path + "document-java-currentversion-uncompressed.dat", Document.SERIALIZED_VERSION)); - tests.add(new TestDoc(path + "document-java-currentversion-lz4-9.dat", Document.SERIALIZED_VERSION)); tests.add(new TestDoc(path + "document-java-v8-uncompressed.dat", 8)); tests.add(new TestDoc(cpppath + "document-cpp-currentversion-uncompressed.dat", 7)); - tests.add(new TestDoc(cpppath + "document-cpp-currentversion-lz4-9.dat", 7)); tests.add(new TestDoc(cpppath + "document-cpp-v8-uncompressed.dat", 7)); for (TestDoc test : tests) { File f = new File(test.testFile); @@ -146,7 +147,7 @@ public class DocumentSerializationTestCase extends AbstractTypesTest { int remaining = buffer.length; while (remaining > 0) { int read = fin.read(buffer, pos, remaining); - assertFalse(read == -1); + assertNotEquals(-1, read); pos += read; remaining -= read; } @@ -159,7 +160,7 @@ public class 
DocumentSerializationTestCase extends AbstractTypesTest { assertEquals(new IntegerFieldValue(5), doc.getFieldValue("intfield")); assertEquals(-9.23, ((FloatFieldValue)doc.getFieldValue("floatfield")).getFloat(), 1E-6); assertEquals(new StringFieldValue("This is a string."), doc.getFieldValue("stringfield")); - assertEquals(new LongFieldValue(398420092938472983l), doc.getFieldValue("longfield")); + assertEquals(new LongFieldValue(398420092938472983L), doc.getFieldValue("longfield")); assertEquals(98374532.398820, ((DoubleFieldValue)doc.getFieldValue("doublefield")).getDouble(), 1E-6); assertEquals(new ByteFieldValue((byte)254), doc.getFieldValue("bytefield")); // Todo add cpp serialization @@ -167,20 +168,20 @@ public class DocumentSerializationTestCase extends AbstractTypesTest { ByteBuffer bbuffer = ((Raw)doc.getFieldValue("rawfield")).getByteBuffer(); if (!Arrays.equals("RAW DATA".getBytes(), bbuffer.array())) { System.err.println("Expected 'RAW DATA' but got '" + new String(bbuffer.array()) + "'."); - assertTrue(false); + fail(); } if (test.version > 6) { Document docInDoc = (Document)doc.getFieldValue("docfield"); - assertTrue(docInDoc != null); + assertNotNull(docInDoc); assertEquals(new StringFieldValue("Elvis is dead"), docInDoc.getFieldValue("stringindocfield")); } Array array = (Array)doc.getFieldValue("arrayoffloatfield"); - assertTrue(array != null); + assertNotNull(array); assertEquals(1.0f, ((FloatFieldValue)array.get(0)).getFloat(), 1E-6); assertEquals(2.0f, ((FloatFieldValue)array.get(1)).getFloat(), 1E-6); WeightedSet wset = (WeightedSet)doc.getFieldValue("wsfield"); - assertTrue(wset != null); + assertNotNull(wset); assertEquals(Integer.valueOf(50), wset.get(new StringFieldValue("Weighted 0"))); assertEquals(Integer.valueOf(199), wset.get(new StringFieldValue("Weighted 1"))); } diff --git a/document/src/test/java/com/yahoo/document/DocumentTestCase.java b/document/src/test/java/com/yahoo/document/DocumentTestCase.java index e2bdd846dd1..f1c74ad6efc 
100644 --- a/document/src/test/java/com/yahoo/document/DocumentTestCase.java +++ b/document/src/test/java/com/yahoo/document/DocumentTestCase.java @@ -3,7 +3,6 @@ package com.yahoo.document; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; -import com.yahoo.compress.CompressionType; import com.yahoo.document.datatypes.Array; import com.yahoo.document.datatypes.BoolFieldValue; import com.yahoo.document.datatypes.ByteFieldValue; @@ -29,8 +28,8 @@ import org.junit.Test; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Map; @@ -217,7 +216,7 @@ public class DocumentTestCase extends DocumentTestCaseBase { doc.setFieldValue("long", longVal); } - class VariableIteratorHandler extends FieldPathIteratorHandler { + static class VariableIteratorHandler extends FieldPathIteratorHandler { public String retVal = ""; @@ -462,7 +461,7 @@ public class DocumentTestCase extends DocumentTestCaseBase { } } - class RemoveIteratorHandler extends FieldPathIteratorHandler { + static class RemoveIteratorHandler extends FieldPathIteratorHandler { public ModificationStatus doModify(FieldValue fv) { return ModificationStatus.REMOVED; @@ -639,17 +638,6 @@ public class DocumentTestCase extends DocumentTestCaseBase { } @Test - public void testCppDocCompressed() throws IOException { - docMan = setUpCppDocType(); - byte[] data = readFile("src/test/document/serializecpp-lz4-level9.dat"); - ByteBuffer buf = ByteBuffer.wrap(data); - - Document doc = docMan.createDocument(new GrowableByteBuffer(buf)); - - validateCppDoc(doc); - } - - @Test public void testCppDoc() throws IOException { docMan = setUpCppDocType(); byte[] data = readFile("src/test/document/serializecpp.dat"); @@ -794,7 +782,7 @@ public class DocumentTestCase extends 
DocumentTestCaseBase { BufferSerializer buf = new BufferSerializer(); try { new Document(DocumentDeserializerFactory.create6(docMan, buf.getBuf())); - assertTrue(false); + fail(); } catch (Exception e) { assertTrue(true); } @@ -802,7 +790,7 @@ public class DocumentTestCase extends DocumentTestCaseBase { buf = BufferSerializer.wrap("Hello world".getBytes()); try { new Document(DocumentDeserializerFactory.create6(docMan, buf.getBuf())); - assertTrue(false); + fail(); } catch (Exception e) { assertTrue(true); } @@ -983,7 +971,7 @@ public class DocumentTestCase extends DocumentTestCaseBase { setUpSertestDocType(); Document doc = getSertestDocument(); String json = doc.toJson(); - Map<String, Object> parsed = new ObjectMapper().readValue(json, new TypeReference<Map<String, Object>>() { + Map<String, Object> parsed = new ObjectMapper().readValue(json, new TypeReference<>() { }); assertEquals(parsed.get("id"), "id:ns:sertest::foobar"); assertTrue(parsed.get("fields") instanceof Map); @@ -1218,11 +1206,11 @@ public class DocumentTestCase extends DocumentTestCaseBase { } @Test - public void testDocumentIdWithNonTextCharacterCanBeDeserialized() throws UnsupportedEncodingException { + public void testDocumentIdWithNonTextCharacterCanBeDeserialized() { DocumentIdFixture f = new DocumentIdFixture(); // Document id = "id:a:b::0x7c" - String docId = new String(new byte[]{105, 100, 58, 97, 58, 98, 58, 58, 7, 99}, "UTF-8"); + String docId = new String(new byte[]{105, 100, 58, 97, 58, 98, 58, 58, 7, 99}, StandardCharsets.UTF_8); f.serialize(docId); Document result = f.deserialize(); diff --git a/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat b/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat Binary files differdeleted file mode 100644 index 033844ac09b..00000000000 --- a/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat +++ /dev/null diff --git a/document/src/tests/.gitignore 
b/document/src/tests/.gitignore index 4d5d93dd093..f310f65eb7e 100644 --- a/document/src/tests/.gitignore +++ b/document/src/tests/.gitignore @@ -15,5 +15,4 @@ testrunner *_test document_gtest_runner_app document_testrunner_app -/serializecpp-lz4-level9.dat /serializecppsplit_body.dat diff --git a/document/src/tests/data/document-cpp-currentversion-lz4-9.dat b/document/src/tests/data/document-cpp-currentversion-lz4-9.dat Binary files differ deleted file mode 100644 index 3383d97f253..00000000000 --- a/document/src/tests/data/document-cpp-currentversion-lz4-9.dat +++ /dev/null |