Further garbage-collect (remove) document-level compression. Avoids a buffer copy that is no longer relevant.

author: Henning Baldersheim <balder@yahoo-inc.com> 2022-06-07 13:13:45 +0200
committer: gjoranv <gv@verizonmedia.com> 2022-06-08 11:45:31 +0200
commit: 5fce9fedea4b8259d8d1bc1d26d47cc3b837b252 (patch)
tree: 5ae004ca96fa829088302972cbe33dd00c611a48 /document
parent: 40cf39429211d66616f8dce58198ecdfe43735eb (diff)
11 files changed, 33 insertions, 103 deletions
diff --git a/document/doc/document-format.html b/document/doc/document-format.html
index 26e6cceca5c..ce985b8a10d 100644
--- a/document/doc/document-format.html
+++ b/document/doc/document-format.html
@@ -19,7 +19,6 @@ look-ups.</p>
 <li><b>Robustness</b>. The format shall detect errors gracefully.</li>
 <li><b>Speed</b>. Deserialization shall be fast, especially for basic fields like <b>DocumentId</b>.</li> 
 <li><b>Size</b>. The serialized format shall be compact and allow for efficient storage and transfer.
-That is partly achieved by allowing different kinds of compression. As of now <b>lz4</b> are supported.</li> 
 </ul>
 </p>
 
@@ -137,26 +136,9 @@ Inventory bits describing the FieldMap element with data:<br>
 <td>2, 4 or 8</td>
 <td>Length of data block (see below). NOTE THAT THIS LENGTH INCLUDE ITSELF.</td>
 </tr>
-<tr><td>Compression</td>
-<td>Byte</td>
-<td>1</td>
-<td>Compression method
-<br>
-&nbsp;0: No compression<br>
-&nbsp;5: Uncompressable<br>
-&nbsp;6: lz4 <br>
-<p>Note that the uncompressable flag is not a configurable option. Rather it
-will be used in document instances who are configured for compression, but
-where compression yields negative results, to avoid later serializations to
-retry compression.</p>
-</td>
 <tr><td>Number of fields<td>Integer_1_4</td>
 <td>1 or 4</td>
 <td>Number of fields in data array</td>
-<tr><td colspan = "4"><b>Below item is present if compression method is not uncompressed or uncompressable</b></td></tr>
-<tr><td>Uncompressed data length<td>Integer_2_4_8</td>
-<td>2, 4 or 8</td>
-<td>Length of data block after decompression</td>
 <tr><td colspan = "4"><b>Below block is repeated "Number of fields" times</b></td></tr>
 <tr><td>Field ID<td>Integer_1_4</td>
 <td>1 or 4</td>
diff --git a/document/src/main/java/com/yahoo/document/BaseStructDataType.java b/document/src/main/java/com/yahoo/document/BaseStructDataType.java
index a84a08c5677..adecae9d7eb 100755
--- a/document/src/main/java/com/yahoo/document/BaseStructDataType.java
+++ b/document/src/main/java/com/yahoo/document/BaseStructDataType.java
@@ -1,9 +1,6 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.document;
 
-import com.yahoo.compress.CompressionType;
-import com.yahoo.compress.Compressor;
-
 import java.util.Collection;
 import java.util.LinkedHashMap;
 import java.util.Map;
diff --git a/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java b/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java
index 2f2f5647603..d6833a482f1 100644
--- a/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java
+++ b/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java
@@ -1,7 +1,6 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.document;
 
-import com.yahoo.compress.CompressionType;
 import com.yahoo.config.subscription.ConfigSubscriber;
 import com.yahoo.document.annotation.AnnotationReferenceDataType;
 import com.yahoo.document.annotation.AnnotationType;
@@ -434,7 +433,6 @@ public class DocumentTypeManagerConfigurer implements ConfigSubscriber.SingleSub
             }
 
             void createEmptyStructs() {
-                String docName = docTypeConfig.name();
                 for (var typeconf : docTypeConfig.structtype()) {
                     if (isPositionStruct(typeconf)) {
                         int geoVersion = usev8geopositions ? 8 : 7;
diff --git a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java
index 47ae31b53fe..2796609543f 100644
--- a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java
+++ b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java
@@ -2,8 +2,6 @@
 package com.yahoo.document.serialization;
 
 import com.yahoo.collections.Tuple2;
-import com.yahoo.compress.CompressionType;
-import com.yahoo.compress.Compressor;
 import com.yahoo.document.annotation.AlternateSpanList;
 import com.yahoo.document.annotation.Annotation;
 import com.yahoo.document.annotation.AnnotationReference;
@@ -69,7 +67,6 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Optional;
 
 import static com.yahoo.text.Utf8.calculateStringPositions;
 
@@ -80,8 +77,7 @@ import static com.yahoo.text.Utf8.calculateStringPositions;
  */
 public class VespaDocumentDeserializer6 extends BufferSerializer implements DocumentDeserializer {
 
-    private final Compressor compressor = new Compressor();
-    private DocumentTypeManager manager;
+    private final DocumentTypeManager manager;
     private short version;
     private List<SpanNode> spanNodes;
     private List<Annotation> annotations;
@@ -263,21 +259,7 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
         }
 
         int dataSize = getInt(null);
-        byte comprCode = getByte(null);
-        CompressionType compression = CompressionType.valueOf(comprCode);
-
-        int uncompressedSize = 0;
-        if (compression != CompressionType.NONE &&
-            compression != CompressionType.INCOMPRESSIBLE)
-        {
-            // uncompressedsize (full size of FIELDS only, after decompression)
-            long pSize = getInt2_4_8Bytes(null);
-            //TODO: Look into how to support data segments larger than INT_MAX bytes
-            if (pSize > Integer.MAX_VALUE) {
-                throw new DeserializationException("Uncompressed size of data block is too large.");
-            }
-            uncompressedSize = (int) pSize;
-        }
+        byte ignoredComprCode = getByte(null);
 
         int numberOfFields = getInt1_4Bytes(null);
 
@@ -289,14 +271,12 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
 
         // save a reference to the big buffer we're reading from:
         GrowableByteBuffer bigBuf = buf;
-
-        byte[] destination = compressor.decompress(compression, getBuf().array(), position(), uncompressedSize, Optional.of(dataSize));
-
+        GrowableByteBuffer thisStructOnly = GrowableByteBuffer.wrap(getBuf().array(), position(), dataSize);
         // set position in original buffer to after data
         position(position() + dataSize);
 
         // for a while: deserialize from this buffer instead:
-        buf = GrowableByteBuffer.wrap(destination);
+        buf = thisStructOnly;
 
         s.clear();
         StructDataType type = s.getDataType();
@@ -325,21 +305,7 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
         }
 
         int dataSize = getInt(null);
-        byte comprCode = getByte(null);
-        CompressionType compression = CompressionType.valueOf(comprCode);
-
-        int uncompressedSize = 0;
-        if (compression != CompressionType.NONE &&
-            compression != CompressionType.INCOMPRESSIBLE)
-        {
-            // uncompressedsize (full size of FIELDS only, after decompression)
-            long pSize = getInt2_4_8Bytes(null);
-            //TODO: Look into how to support data segments larger than INT_MAX bytes
-            if (pSize > Integer.MAX_VALUE) {
-                throw new DeserializationException("Uncompressed size of data block is too large.");
-            }
-            uncompressedSize = (int) pSize;
-        }
+        byte unusedComprCode = getByte(null);
 
         int numberOfFields = getInt1_4Bytes(null);
 
@@ -351,14 +317,13 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
 
         // save a reference to the big buffer we're reading from:
         GrowableByteBuffer bigBuf = buf;
-
-        byte[] destination = compressor.decompress(compression, getBuf().array(), position(), uncompressedSize, Optional.of(dataSize));
+        GrowableByteBuffer thisStructOnly = GrowableByteBuffer.wrap(getBuf().array(), position(), dataSize);
 
         // set position in original buffer to after data
         position(position() + dataSize);
 
         // for a while: deserialize from this buffer instead:
-        buf = GrowableByteBuffer.wrap(destination);
+        buf = thisStructOnly;
 
         StructDataType priType = target.getDataType().contentStruct();
 
@@ -613,7 +578,7 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
         DocumentType docType = manager.getDocumentType(new DataTypeName(docTypeName));
         if (docType == null) {
             throw new DeserializationException("No known document type with name " + 
-                                               new Utf8String(docTypeName).toString());
+                                               new Utf8String(docTypeName));
         }
         return docType;
     }
diff --git a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java
index ee36dcb1a9c..2de345b6e35 100644
--- a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java
+++ b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java
@@ -483,7 +483,7 @@ public class VespaDocumentSerializer6 extends BufferSerializer implements Docume
             write(tree.getRoot());
             {
                 //add all annotations to temporary list and sort it, to get predictable serialization
-                List<Annotation> tmpAnnotationList = new ArrayList<Annotation>(tree.numAnnotations());
+                List<Annotation> tmpAnnotationList = new ArrayList<>(tree.numAnnotations());
                 for (Annotation annotation : tree) {
                     tmpAnnotationList.add(annotation);
                 }
diff --git a/document/src/test/document/serializecpp-lz4-level9.dat b/document/src/test/document/serializecpp-lz4-level9.dat
deleted file mode 100644
index 1dffaa2d7a7..00000000000
--- a/document/src/test/document/serializecpp-lz4-level9.dat
+++ /dev/null
diff --git a/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java b/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java
index 77b621a3c36..1a5016fb155 100644
--- a/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java
+++ b/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java
@@ -1,7 +1,6 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.document;
 
-import com.yahoo.compress.CompressionType;
 import com.yahoo.document.annotation.AbstractTypesTest;
 import com.yahoo.document.datatypes.Array;
 import com.yahoo.document.datatypes.BoolFieldValue;
@@ -13,7 +12,10 @@ import com.yahoo.document.datatypes.LongFieldValue;
 import com.yahoo.document.datatypes.Raw;
 import com.yahoo.document.datatypes.StringFieldValue;
 import com.yahoo.document.datatypes.WeightedSet;
-import com.yahoo.document.serialization.*;
+import com.yahoo.document.serialization.DocumentDeserializer;
+import com.yahoo.document.serialization.DocumentDeserializerFactory;
+import com.yahoo.document.serialization.DocumentSerializer;
+import com.yahoo.document.serialization.DocumentSerializerFactory;
 import com.yahoo.io.GrowableByteBuffer;
 import org.junit.Test;
 
@@ -27,9 +29,10 @@ import java.util.Arrays;
 import java.util.List;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 /**
  * Tests serialization of all versions.
@@ -92,7 +95,7 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
             doc.setFieldValue("intfield", 5);
             doc.setFieldValue("floatfield", -9.23);
             doc.setFieldValue("stringfield", "This is a string.");
-            doc.setFieldValue("longfield", new LongFieldValue(398420092938472983l));
+            doc.setFieldValue("longfield", new LongFieldValue(398420092938472983L));
             doc.setFieldValue("doublefield", new DoubleFieldValue(98374532.398820));
             doc.setFieldValue("bytefield", new ByteFieldValue(254));
             doc.setFieldValue("boolfield", new BoolFieldValue(true));
@@ -120,8 +123,8 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
 
         class TestDoc {
 
-            String testFile;
-            int version;
+            final String testFile;
+            final int version;
 
             TestDoc(String testFile, int version) {
                 this.testFile = testFile;
@@ -133,10 +136,8 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
 
         List<TestDoc> tests = new ArrayList<>();
         tests.add(new TestDoc(path + "document-java-currentversion-uncompressed.dat", Document.SERIALIZED_VERSION));
-        tests.add(new TestDoc(path + "document-java-currentversion-lz4-9.dat", Document.SERIALIZED_VERSION));
         tests.add(new TestDoc(path + "document-java-v8-uncompressed.dat", 8));
         tests.add(new TestDoc(cpppath + "document-cpp-currentversion-uncompressed.dat", 7));
-        tests.add(new TestDoc(cpppath + "document-cpp-currentversion-lz4-9.dat", 7));
         tests.add(new TestDoc(cpppath + "document-cpp-v8-uncompressed.dat", 7));
         for (TestDoc test : tests) {
             File f = new File(test.testFile);
@@ -146,7 +147,7 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
             int remaining = buffer.length;
             while (remaining > 0) {
                 int read = fin.read(buffer, pos, remaining);
-                assertFalse(read == -1);
+                assertNotEquals(-1, read);
                 pos += read;
                 remaining -= read;
             }
@@ -159,7 +160,7 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
             assertEquals(new IntegerFieldValue(5), doc.getFieldValue("intfield"));
             assertEquals(-9.23, ((FloatFieldValue)doc.getFieldValue("floatfield")).getFloat(), 1E-6);
             assertEquals(new StringFieldValue("This is a string."), doc.getFieldValue("stringfield"));
-            assertEquals(new LongFieldValue(398420092938472983l), doc.getFieldValue("longfield"));
+            assertEquals(new LongFieldValue(398420092938472983L), doc.getFieldValue("longfield"));
             assertEquals(98374532.398820, ((DoubleFieldValue)doc.getFieldValue("doublefield")).getDouble(), 1E-6);
             assertEquals(new ByteFieldValue((byte)254), doc.getFieldValue("bytefield"));
             // Todo add cpp serialization
@@ -167,20 +168,20 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
             ByteBuffer bbuffer = ((Raw)doc.getFieldValue("rawfield")).getByteBuffer();
             if (!Arrays.equals("RAW DATA".getBytes(), bbuffer.array())) {
                 System.err.println("Expected 'RAW DATA' but got '" + new String(bbuffer.array()) + "'.");
-                assertTrue(false);
+                fail();
             }
             if (test.version > 6) {
                 Document docInDoc = (Document)doc.getFieldValue("docfield");
-                assertTrue(docInDoc != null);
+                assertNotNull(docInDoc);
                 assertEquals(new StringFieldValue("Elvis is dead"),
                              docInDoc.getFieldValue("stringindocfield"));
             }
             Array array = (Array)doc.getFieldValue("arrayoffloatfield");
-            assertTrue(array != null);
+            assertNotNull(array);
             assertEquals(1.0f, ((FloatFieldValue)array.get(0)).getFloat(), 1E-6);
             assertEquals(2.0f, ((FloatFieldValue)array.get(1)).getFloat(), 1E-6);
             WeightedSet wset = (WeightedSet)doc.getFieldValue("wsfield");
-            assertTrue(wset != null);
+            assertNotNull(wset);
             assertEquals(Integer.valueOf(50), wset.get(new StringFieldValue("Weighted 0")));
             assertEquals(Integer.valueOf(199), wset.get(new StringFieldValue("Weighted 1")));
         }
diff --git a/document/src/test/java/com/yahoo/document/DocumentTestCase.java b/document/src/test/java/com/yahoo/document/DocumentTestCase.java
index e2bdd846dd1..f1c74ad6efc 100644
--- a/document/src/test/java/com/yahoo/document/DocumentTestCase.java
+++ b/document/src/test/java/com/yahoo/document/DocumentTestCase.java
@@ -3,7 +3,6 @@ package com.yahoo.document;
 
 import com.fasterxml.jackson.core.type.TypeReference;
 import com.fasterxml.jackson.databind.ObjectMapper;
-import com.yahoo.compress.CompressionType;
 import com.yahoo.document.datatypes.Array;
 import com.yahoo.document.datatypes.BoolFieldValue;
 import com.yahoo.document.datatypes.ByteFieldValue;
@@ -29,8 +28,8 @@ import org.junit.Test;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Map;
 
@@ -217,7 +216,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
         doc.setFieldValue("long", longVal);
     }
 
-    class VariableIteratorHandler extends FieldPathIteratorHandler {
+    static class VariableIteratorHandler extends FieldPathIteratorHandler {
 
         public String retVal = "";
 
@@ -462,7 +461,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
         }
     }
 
-    class RemoveIteratorHandler extends FieldPathIteratorHandler {
+    static class RemoveIteratorHandler extends FieldPathIteratorHandler {
 
         public ModificationStatus doModify(FieldValue fv) {
             return ModificationStatus.REMOVED;
@@ -639,17 +638,6 @@ public class DocumentTestCase extends DocumentTestCaseBase {
     }
 
     @Test
-    public void testCppDocCompressed() throws IOException {
-        docMan = setUpCppDocType();
-        byte[] data = readFile("src/test/document/serializecpp-lz4-level9.dat");
-        ByteBuffer buf = ByteBuffer.wrap(data);
-
-        Document doc = docMan.createDocument(new GrowableByteBuffer(buf));
-
-        validateCppDoc(doc);
-    }
-
-    @Test
     public void testCppDoc() throws IOException {
         docMan = setUpCppDocType();
         byte[] data = readFile("src/test/document/serializecpp.dat");
@@ -794,7 +782,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
         BufferSerializer buf = new BufferSerializer();
         try {
             new Document(DocumentDeserializerFactory.create6(docMan, buf.getBuf()));
-            assertTrue(false);
+            fail();
         } catch (Exception e) {
             assertTrue(true);
         }
@@ -802,7 +790,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
         buf = BufferSerializer.wrap("Hello world".getBytes());
         try {
             new Document(DocumentDeserializerFactory.create6(docMan, buf.getBuf()));
-            assertTrue(false);
+            fail();
         } catch (Exception e) {
             assertTrue(true);
         }
@@ -983,7 +971,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
         setUpSertestDocType();
         Document doc = getSertestDocument();
         String json = doc.toJson();
-        Map<String, Object> parsed = new ObjectMapper().readValue(json, new TypeReference<Map<String, Object>>() {
+        Map<String, Object> parsed = new ObjectMapper().readValue(json, new TypeReference<>() {
         });
         assertEquals(parsed.get("id"), "id:ns:sertest::foobar");
         assertTrue(parsed.get("fields") instanceof Map);
@@ -1218,11 +1206,11 @@ public class DocumentTestCase extends DocumentTestCaseBase {
     }
 
     @Test
-    public void testDocumentIdWithNonTextCharacterCanBeDeserialized() throws UnsupportedEncodingException {
+    public void testDocumentIdWithNonTextCharacterCanBeDeserialized() {
         DocumentIdFixture f = new DocumentIdFixture();
 
         // Document id = "id:a:b::0x7c"
-        String docId = new String(new byte[]{105, 100, 58, 97, 58, 98, 58, 58, 7, 99}, "UTF-8");
+        String docId = new String(new byte[]{105, 100, 58, 97, 58, 98, 58, 58, 7, 99}, StandardCharsets.UTF_8);
         f.serialize(docId);
 
         Document result = f.deserialize();
diff --git a/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat b/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat
deleted file mode 100644
index 033844ac09b..00000000000
--- a/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat
+++ /dev/null
diff --git a/document/src/tests/.gitignore b/document/src/tests/.gitignore
index 4d5d93dd093..f310f65eb7e 100644
--- a/document/src/tests/.gitignore
+++ b/document/src/tests/.gitignore
@@ -15,5 +15,4 @@ testrunner
 *_test
 document_gtest_runner_app
 document_testrunner_app
-/serializecpp-lz4-level9.dat
 /serializecppsplit_body.dat
diff --git a/document/src/tests/data/document-cpp-currentversion-lz4-9.dat b/document/src/tests/data/document-cpp-currentversion-lz4-9.dat
deleted file mode 100644
index 3383d97f253..00000000000
--- a/document/src/tests/data/document-cpp-currentversion-lz4-9.dat
+++ /dev/null
author	Henning Baldersheim <balder@yahoo-inc.com>	2022-06-07 13:13:45 +0200
committer	gjoranv <gv@verizonmedia.com>	2022-06-08 11:45:31 +0200
commit	5fce9fedea4b8259d8d1bc1d26d47cc3b837b252 (patch)
tree	5ae004ca96fa829088302972cbe33dd00c611a48 /document
parent	40cf39429211d66616f8dce58198ecdfe43735eb (diff)