summaryrefslogtreecommitdiffstats
path: root/document
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2022-06-07 13:13:45 +0200
committergjoranv <gv@verizonmedia.com>2022-06-08 11:45:31 +0200
commit5fce9fedea4b8259d8d1bc1d26d47cc3b837b252 (patch)
tree5ae004ca96fa829088302972cbe33dd00c611a48 /document
parent40cf39429211d66616f8dce58198ecdfe43735eb (diff)
Further GC document level compression. Avoids a buffer copy that is no longer relevant.
Diffstat (limited to 'document')
-rw-r--r--document/doc/document-format.html18
-rwxr-xr-xdocument/src/main/java/com/yahoo/document/BaseStructDataType.java3
-rw-r--r--document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java2
-rw-r--r--document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java51
-rw-r--r--document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java2
-rw-r--r--document/src/test/document/serializecpp-lz4-level9.datbin336 -> 0 bytes
-rw-r--r--document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java31
-rw-r--r--document/src/test/java/com/yahoo/document/DocumentTestCase.java28
-rw-r--r--document/src/test/serializeddocuments/document-java-currentversion-lz4-9.datbin312 -> 0 bytes
-rw-r--r--document/src/tests/.gitignore1
-rw-r--r--document/src/tests/data/document-cpp-currentversion-lz4-9.datbin305 -> 0 bytes
11 files changed, 33 insertions, 103 deletions
diff --git a/document/doc/document-format.html b/document/doc/document-format.html
index 26e6cceca5c..ce985b8a10d 100644
--- a/document/doc/document-format.html
+++ b/document/doc/document-format.html
@@ -19,7 +19,6 @@ look-ups.</p>
<li><b>Robustness</b>. The format shall detect errors gracefully.</li>
<li><b>Speed</b>. Deserialization shall be fast, especially for basic fields like <b>DocumentId</b>.</li>
<li><b>Size</b>. The serialized format shall be compact and allow for efficient storage and transfer.
-That is partly achieved by allowing different kinds of compression. As of now <b>lz4</b> are supported.</li>
</ul>
</p>
@@ -137,26 +136,9 @@ Inventory bits describing the FieldMap element with data:<br>
<td>2, 4 or 8</td>
<td>Length of data block (see below). NOTE THAT THIS LENGTH INCLUDE ITSELF.</td>
</tr>
-<tr><td>Compression</td>
-<td>Byte</td>
-<td>1</td>
-<td>Compression method
-<br>
-&nbsp;0: No compression<br>
-&nbsp;5: Uncompressable<br>
-&nbsp;6: lz4 <br>
-<p>Note that the uncompressable flag is not a configurable option. Rather it
-will be used in document instances who are configured for compression, but
-where compression yields negative results, to avoid later serializations to
-retry compression.</p>
-</td>
<tr><td>Number of fields<td>Integer_1_4</td>
<td>1 or 4</td>
<td>Number of fields in data array</td>
-<tr><td colspan = "4"><b>Below item is present if compression method is not uncompressed or uncompressable</b></td></tr>
-<tr><td>Uncompressed data length<td>Integer_2_4_8</td>
-<td>2, 4 or 8</td>
-<td>Length of data block after decompression</td>
<tr><td colspan = "4"><b>Below block is repeated "Number of fields" times</b></td></tr>
<tr><td>Field ID<td>Integer_1_4</td>
<td>1 or 4</td>
diff --git a/document/src/main/java/com/yahoo/document/BaseStructDataType.java b/document/src/main/java/com/yahoo/document/BaseStructDataType.java
index a84a08c5677..adecae9d7eb 100755
--- a/document/src/main/java/com/yahoo/document/BaseStructDataType.java
+++ b/document/src/main/java/com/yahoo/document/BaseStructDataType.java
@@ -1,9 +1,6 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.document;
-import com.yahoo.compress.CompressionType;
-import com.yahoo.compress.Compressor;
-
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.Map;
diff --git a/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java b/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java
index 2f2f5647603..d6833a482f1 100644
--- a/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java
+++ b/document/src/main/java/com/yahoo/document/DocumentTypeManagerConfigurer.java
@@ -1,7 +1,6 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.document;
-import com.yahoo.compress.CompressionType;
import com.yahoo.config.subscription.ConfigSubscriber;
import com.yahoo.document.annotation.AnnotationReferenceDataType;
import com.yahoo.document.annotation.AnnotationType;
@@ -434,7 +433,6 @@ public class DocumentTypeManagerConfigurer implements ConfigSubscriber.SingleSub
}
void createEmptyStructs() {
- String docName = docTypeConfig.name();
for (var typeconf : docTypeConfig.structtype()) {
if (isPositionStruct(typeconf)) {
int geoVersion = usev8geopositions ? 8 : 7;
diff --git a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java
index 47ae31b53fe..2796609543f 100644
--- a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java
+++ b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentDeserializer6.java
@@ -2,8 +2,6 @@
package com.yahoo.document.serialization;
import com.yahoo.collections.Tuple2;
-import com.yahoo.compress.CompressionType;
-import com.yahoo.compress.Compressor;
import com.yahoo.document.annotation.AlternateSpanList;
import com.yahoo.document.annotation.Annotation;
import com.yahoo.document.annotation.AnnotationReference;
@@ -69,7 +67,6 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.Optional;
import static com.yahoo.text.Utf8.calculateStringPositions;
@@ -80,8 +77,7 @@ import static com.yahoo.text.Utf8.calculateStringPositions;
*/
public class VespaDocumentDeserializer6 extends BufferSerializer implements DocumentDeserializer {
- private final Compressor compressor = new Compressor();
- private DocumentTypeManager manager;
+ private final DocumentTypeManager manager;
private short version;
private List<SpanNode> spanNodes;
private List<Annotation> annotations;
@@ -263,21 +259,7 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
}
int dataSize = getInt(null);
- byte comprCode = getByte(null);
- CompressionType compression = CompressionType.valueOf(comprCode);
-
- int uncompressedSize = 0;
- if (compression != CompressionType.NONE &&
- compression != CompressionType.INCOMPRESSIBLE)
- {
- // uncompressedsize (full size of FIELDS only, after decompression)
- long pSize = getInt2_4_8Bytes(null);
- //TODO: Look into how to support data segments larger than INT_MAX bytes
- if (pSize > Integer.MAX_VALUE) {
- throw new DeserializationException("Uncompressed size of data block is too large.");
- }
- uncompressedSize = (int) pSize;
- }
+ byte ignoredComprCode = getByte(null);
int numberOfFields = getInt1_4Bytes(null);
@@ -289,14 +271,12 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
// save a reference to the big buffer we're reading from:
GrowableByteBuffer bigBuf = buf;
-
- byte[] destination = compressor.decompress(compression, getBuf().array(), position(), uncompressedSize, Optional.of(dataSize));
-
+ GrowableByteBuffer thisStructOnly = GrowableByteBuffer.wrap(getBuf().array(), position(), dataSize);
// set position in original buffer to after data
position(position() + dataSize);
// for a while: deserialize from this buffer instead:
- buf = GrowableByteBuffer.wrap(destination);
+ buf = thisStructOnly;
s.clear();
StructDataType type = s.getDataType();
@@ -325,21 +305,7 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
}
int dataSize = getInt(null);
- byte comprCode = getByte(null);
- CompressionType compression = CompressionType.valueOf(comprCode);
-
- int uncompressedSize = 0;
- if (compression != CompressionType.NONE &&
- compression != CompressionType.INCOMPRESSIBLE)
- {
- // uncompressedsize (full size of FIELDS only, after decompression)
- long pSize = getInt2_4_8Bytes(null);
- //TODO: Look into how to support data segments larger than INT_MAX bytes
- if (pSize > Integer.MAX_VALUE) {
- throw new DeserializationException("Uncompressed size of data block is too large.");
- }
- uncompressedSize = (int) pSize;
- }
+ byte unusedComprCode = getByte(null);
int numberOfFields = getInt1_4Bytes(null);
@@ -351,14 +317,13 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
// save a reference to the big buffer we're reading from:
GrowableByteBuffer bigBuf = buf;
-
- byte[] destination = compressor.decompress(compression, getBuf().array(), position(), uncompressedSize, Optional.of(dataSize));
+ GrowableByteBuffer thisStructOnly = GrowableByteBuffer.wrap(getBuf().array(), position(), dataSize);
// set position in original buffer to after data
position(position() + dataSize);
// for a while: deserialize from this buffer instead:
- buf = GrowableByteBuffer.wrap(destination);
+ buf = thisStructOnly;
StructDataType priType = target.getDataType().contentStruct();
@@ -613,7 +578,7 @@ public class VespaDocumentDeserializer6 extends BufferSerializer implements Docu
DocumentType docType = manager.getDocumentType(new DataTypeName(docTypeName));
if (docType == null) {
throw new DeserializationException("No known document type with name " +
- new Utf8String(docTypeName).toString());
+ new Utf8String(docTypeName));
}
return docType;
}
diff --git a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java
index ee36dcb1a9c..2de345b6e35 100644
--- a/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java
+++ b/document/src/main/java/com/yahoo/document/serialization/VespaDocumentSerializer6.java
@@ -483,7 +483,7 @@ public class VespaDocumentSerializer6 extends BufferSerializer implements Docume
write(tree.getRoot());
{
//add all annotations to temporary list and sort it, to get predictable serialization
- List<Annotation> tmpAnnotationList = new ArrayList<Annotation>(tree.numAnnotations());
+ List<Annotation> tmpAnnotationList = new ArrayList<>(tree.numAnnotations());
for (Annotation annotation : tree) {
tmpAnnotationList.add(annotation);
}
diff --git a/document/src/test/document/serializecpp-lz4-level9.dat b/document/src/test/document/serializecpp-lz4-level9.dat
deleted file mode 100644
index 1dffaa2d7a7..00000000000
--- a/document/src/test/document/serializecpp-lz4-level9.dat
+++ /dev/null
Binary files differ
diff --git a/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java b/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java
index 77b621a3c36..1a5016fb155 100644
--- a/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java
+++ b/document/src/test/java/com/yahoo/document/DocumentSerializationTestCase.java
@@ -1,7 +1,6 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.document;
-import com.yahoo.compress.CompressionType;
import com.yahoo.document.annotation.AbstractTypesTest;
import com.yahoo.document.datatypes.Array;
import com.yahoo.document.datatypes.BoolFieldValue;
@@ -13,7 +12,10 @@ import com.yahoo.document.datatypes.LongFieldValue;
import com.yahoo.document.datatypes.Raw;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.document.datatypes.WeightedSet;
-import com.yahoo.document.serialization.*;
+import com.yahoo.document.serialization.DocumentDeserializer;
+import com.yahoo.document.serialization.DocumentDeserializerFactory;
+import com.yahoo.document.serialization.DocumentSerializer;
+import com.yahoo.document.serialization.DocumentSerializerFactory;
import com.yahoo.io.GrowableByteBuffer;
import org.junit.Test;
@@ -27,9 +29,10 @@ import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
/**
* Tests serialization of all versions.
@@ -92,7 +95,7 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
doc.setFieldValue("intfield", 5);
doc.setFieldValue("floatfield", -9.23);
doc.setFieldValue("stringfield", "This is a string.");
- doc.setFieldValue("longfield", new LongFieldValue(398420092938472983l));
+ doc.setFieldValue("longfield", new LongFieldValue(398420092938472983L));
doc.setFieldValue("doublefield", new DoubleFieldValue(98374532.398820));
doc.setFieldValue("bytefield", new ByteFieldValue(254));
doc.setFieldValue("boolfield", new BoolFieldValue(true));
@@ -120,8 +123,8 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
class TestDoc {
- String testFile;
- int version;
+ final String testFile;
+ final int version;
TestDoc(String testFile, int version) {
this.testFile = testFile;
@@ -133,10 +136,8 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
List<TestDoc> tests = new ArrayList<>();
tests.add(new TestDoc(path + "document-java-currentversion-uncompressed.dat", Document.SERIALIZED_VERSION));
- tests.add(new TestDoc(path + "document-java-currentversion-lz4-9.dat", Document.SERIALIZED_VERSION));
tests.add(new TestDoc(path + "document-java-v8-uncompressed.dat", 8));
tests.add(new TestDoc(cpppath + "document-cpp-currentversion-uncompressed.dat", 7));
- tests.add(new TestDoc(cpppath + "document-cpp-currentversion-lz4-9.dat", 7));
tests.add(new TestDoc(cpppath + "document-cpp-v8-uncompressed.dat", 7));
for (TestDoc test : tests) {
File f = new File(test.testFile);
@@ -146,7 +147,7 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
int remaining = buffer.length;
while (remaining > 0) {
int read = fin.read(buffer, pos, remaining);
- assertFalse(read == -1);
+ assertNotEquals(-1, read);
pos += read;
remaining -= read;
}
@@ -159,7 +160,7 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
assertEquals(new IntegerFieldValue(5), doc.getFieldValue("intfield"));
assertEquals(-9.23, ((FloatFieldValue)doc.getFieldValue("floatfield")).getFloat(), 1E-6);
assertEquals(new StringFieldValue("This is a string."), doc.getFieldValue("stringfield"));
- assertEquals(new LongFieldValue(398420092938472983l), doc.getFieldValue("longfield"));
+ assertEquals(new LongFieldValue(398420092938472983L), doc.getFieldValue("longfield"));
assertEquals(98374532.398820, ((DoubleFieldValue)doc.getFieldValue("doublefield")).getDouble(), 1E-6);
assertEquals(new ByteFieldValue((byte)254), doc.getFieldValue("bytefield"));
// Todo add cpp serialization
@@ -167,20 +168,20 @@ public class DocumentSerializationTestCase extends AbstractTypesTest {
ByteBuffer bbuffer = ((Raw)doc.getFieldValue("rawfield")).getByteBuffer();
if (!Arrays.equals("RAW DATA".getBytes(), bbuffer.array())) {
System.err.println("Expected 'RAW DATA' but got '" + new String(bbuffer.array()) + "'.");
- assertTrue(false);
+ fail();
}
if (test.version > 6) {
Document docInDoc = (Document)doc.getFieldValue("docfield");
- assertTrue(docInDoc != null);
+ assertNotNull(docInDoc);
assertEquals(new StringFieldValue("Elvis is dead"),
docInDoc.getFieldValue("stringindocfield"));
}
Array array = (Array)doc.getFieldValue("arrayoffloatfield");
- assertTrue(array != null);
+ assertNotNull(array);
assertEquals(1.0f, ((FloatFieldValue)array.get(0)).getFloat(), 1E-6);
assertEquals(2.0f, ((FloatFieldValue)array.get(1)).getFloat(), 1E-6);
WeightedSet wset = (WeightedSet)doc.getFieldValue("wsfield");
- assertTrue(wset != null);
+ assertNotNull(wset);
assertEquals(Integer.valueOf(50), wset.get(new StringFieldValue("Weighted 0")));
assertEquals(Integer.valueOf(199), wset.get(new StringFieldValue("Weighted 1")));
}
diff --git a/document/src/test/java/com/yahoo/document/DocumentTestCase.java b/document/src/test/java/com/yahoo/document/DocumentTestCase.java
index e2bdd846dd1..f1c74ad6efc 100644
--- a/document/src/test/java/com/yahoo/document/DocumentTestCase.java
+++ b/document/src/test/java/com/yahoo/document/DocumentTestCase.java
@@ -3,7 +3,6 @@ package com.yahoo.document;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
-import com.yahoo.compress.CompressionType;
import com.yahoo.document.datatypes.Array;
import com.yahoo.document.datatypes.BoolFieldValue;
import com.yahoo.document.datatypes.ByteFieldValue;
@@ -29,8 +28,8 @@ import org.junit.Test;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Map;
@@ -217,7 +216,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
doc.setFieldValue("long", longVal);
}
- class VariableIteratorHandler extends FieldPathIteratorHandler {
+ static class VariableIteratorHandler extends FieldPathIteratorHandler {
public String retVal = "";
@@ -462,7 +461,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
}
}
- class RemoveIteratorHandler extends FieldPathIteratorHandler {
+ static class RemoveIteratorHandler extends FieldPathIteratorHandler {
public ModificationStatus doModify(FieldValue fv) {
return ModificationStatus.REMOVED;
@@ -639,17 +638,6 @@ public class DocumentTestCase extends DocumentTestCaseBase {
}
@Test
- public void testCppDocCompressed() throws IOException {
- docMan = setUpCppDocType();
- byte[] data = readFile("src/test/document/serializecpp-lz4-level9.dat");
- ByteBuffer buf = ByteBuffer.wrap(data);
-
- Document doc = docMan.createDocument(new GrowableByteBuffer(buf));
-
- validateCppDoc(doc);
- }
-
- @Test
public void testCppDoc() throws IOException {
docMan = setUpCppDocType();
byte[] data = readFile("src/test/document/serializecpp.dat");
@@ -794,7 +782,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
BufferSerializer buf = new BufferSerializer();
try {
new Document(DocumentDeserializerFactory.create6(docMan, buf.getBuf()));
- assertTrue(false);
+ fail();
} catch (Exception e) {
assertTrue(true);
}
@@ -802,7 +790,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
buf = BufferSerializer.wrap("Hello world".getBytes());
try {
new Document(DocumentDeserializerFactory.create6(docMan, buf.getBuf()));
- assertTrue(false);
+ fail();
} catch (Exception e) {
assertTrue(true);
}
@@ -983,7 +971,7 @@ public class DocumentTestCase extends DocumentTestCaseBase {
setUpSertestDocType();
Document doc = getSertestDocument();
String json = doc.toJson();
- Map<String, Object> parsed = new ObjectMapper().readValue(json, new TypeReference<Map<String, Object>>() {
+ Map<String, Object> parsed = new ObjectMapper().readValue(json, new TypeReference<>() {
});
assertEquals(parsed.get("id"), "id:ns:sertest::foobar");
assertTrue(parsed.get("fields") instanceof Map);
@@ -1218,11 +1206,11 @@ public class DocumentTestCase extends DocumentTestCaseBase {
}
@Test
- public void testDocumentIdWithNonTextCharacterCanBeDeserialized() throws UnsupportedEncodingException {
+ public void testDocumentIdWithNonTextCharacterCanBeDeserialized() {
DocumentIdFixture f = new DocumentIdFixture();
// Document id = "id:a:b::0x7c"
- String docId = new String(new byte[]{105, 100, 58, 97, 58, 98, 58, 58, 7, 99}, "UTF-8");
+ String docId = new String(new byte[]{105, 100, 58, 97, 58, 98, 58, 58, 7, 99}, StandardCharsets.UTF_8);
f.serialize(docId);
Document result = f.deserialize();
diff --git a/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat b/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat
deleted file mode 100644
index 033844ac09b..00000000000
--- a/document/src/test/serializeddocuments/document-java-currentversion-lz4-9.dat
+++ /dev/null
Binary files differ
diff --git a/document/src/tests/.gitignore b/document/src/tests/.gitignore
index 4d5d93dd093..f310f65eb7e 100644
--- a/document/src/tests/.gitignore
+++ b/document/src/tests/.gitignore
@@ -15,5 +15,4 @@ testrunner
*_test
document_gtest_runner_app
document_testrunner_app
-/serializecpp-lz4-level9.dat
/serializecppsplit_body.dat
diff --git a/document/src/tests/data/document-cpp-currentversion-lz4-9.dat b/document/src/tests/data/document-cpp-currentversion-lz4-9.dat
deleted file mode 100644
index 3383d97f253..00000000000
--- a/document/src/tests/data/document-cpp-currentversion-lz4-9.dat
+++ /dev/null
Binary files differ