path: root/searchlib
author     Henning Baldersheim <balder@yahoo-inc.com>    2019-04-11 18:30:51 +0200
committer  GitHub <noreply@github.com>                   2019-04-11 18:30:51 +0200
commit     99c88e4846b752d80ac3590c0ab11adc3d84dc9c (patch)
tree       aff6d10d2e45111987a72c479c824a30d508af72 /searchlib
parent     406df9cbda7f97d0caccdd03a3e1340688d47cbf (diff)
parent     55e53f26ea1379b5ce87685e7b8c04f370b5fca0 (diff)
Merge pull request #9087 from vespa-engine/toregge/factor-out-zc4-posting-writer
Factor out Zc4PostingWriter from Zc4PostingSeqWrite.
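
For orientation, here is a condensed editorial sketch (not part of the commit) of how the new Zc4PostingWriter is driven after this refactoring. It mirrors the call sequences visible in Zc4PostingSeqWrite::open()/close() and FakeZcFilterOcc::setupT() in the diff below; the function name write_one_word, the buffer sizes, the parameter values and the postinglistparams.h include path are illustrative assumptions rather than code from the commit.

// Hypothetical sketch, not part of the commit: drive Zc4PostingWriter against an in-memory buffer.
#include <vespa/searchlib/diskindex/zc4_posting_writer.h>
#include <vespa/searchlib/index/docidandfeatures.h>
#include <vespa/searchlib/index/postinglistcounts.h>
#include <vespa/searchlib/index/postinglistparams.h>   // assumed include path
#include <vector>

using search::diskindex::Zc4PostingWriter;
using search::index::DocIdAndFeatures;
using search::index::PostingListCounts;
using search::index::PostingListParams;

void write_one_word(const std::vector<DocIdAndFeatures> &postings)
{
    PostingListCounts counts;
    Zc4PostingWriter<true> writer(counts);       // big-endian variant, as used by Zc4PostingSeqWrite

    PostingListParams params;
    params.set("docIdLimit", 10000000u);         // illustrative values
    params.set("minSkipDocs", 64u);
    writer.set_posting_list_params(params);
    writer.set_dynamic_k(false);                 // false -> "Zc.4", true -> "Zc.5"

    // Back the encode context with an in-memory compression buffer (no file in this sketch).
    auto &writeContext = writer.get_write_context();
    search::ComprBuffer &cb = writeContext;
    writeContext.allocComprBuf(65536u, 32768u);
    writer.get_encode_context().setupWrite(cb);
    writer.on_open();                            // record bit position of the first word

    for (const auto &features : postings) {
        writer.write_docid_and_features(features);
    }
    writer.flush_word();                         // emits skip info when the word is large enough
    writer.on_close();                           // align, pad and flush the tail bits
}

Feature encoding is optional in the refactored writer: without a set_encode_features() call only document id deltas are written, which is the path the doFeatures == false test code in this diff exercises.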
Diffstat (limited to 'searchlib')
-rw-r--r--  searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt                 2
-rw-r--r--  searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp       270
-rw-r--r--  searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h          53
-rw-r--r--  searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp  222
-rw-r--r--  searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h     66
-rw-r--r--  searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp                   8
-rw-r--r--  searchlib/src/vespa/searchlib/diskindex/zcposting.cpp                568
-rw-r--r--  searchlib/src/vespa/searchlib/diskindex/zcposting.h                    56
-rw-r--r--  searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp      361
-rw-r--r--  searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h          3
10 files changed, 728 insertions, 881 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
index 3619affb54e..b21b799e693 100644
--- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
@@ -18,6 +18,8 @@ vespa_add_library(searchlib_diskindex OBJECT
pagedict4file.cpp
pagedict4randread.cpp
wordnummapper.cpp
+ zc4_posting_writer.cpp
+ zc4_posting_writer_base.cpp
zcbuf.cpp
zcposocc.cpp
zcposocciterators.cpp
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp
new file mode 100644
index 00000000000..0eb59a383a5
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp
@@ -0,0 +1,270 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "zc4_posting_writer.h"
+#include <vespa/searchlib/index/docidandfeatures.h>
+#include <vespa/searchlib/index/postinglistcounts.h>
+
+using search::index::DocIdAndFeatures;
+using search::index::PostingListCounts;
+using search::index::PostingListParams;
+
+namespace search::diskindex
+{
+
+template <bool bigEndian>
+Zc4PostingWriter<bigEndian>::Zc4PostingWriter(PostingListCounts &counts)
+ : Zc4PostingWriterBase(counts),
+ _encode_context(),
+ _encode_features(nullptr)
+{
+ _encode_context.setWriteContext(&_writeContext);
+ _writeContext.setEncodeContext(&_encode_context);
+}
+
+template <bool bigEndian>
+Zc4PostingWriter<bigEndian>::~Zc4PostingWriter()
+{
+}
+
+template <bool bigEndian>
+void
+Zc4PostingWriter<bigEndian>::reset_chunk()
+{
+ _docIds.clear();
+ if (_encode_features != nullptr) {
+ _encode_features->setupWrite(_featureWriteContext);
+ _featureOffset = 0;
+ }
+}
+
+template <bool bigEndian>
+void
+Zc4PostingWriter<bigEndian>::flush_word_with_skip(bool hasMore)
+{
+ assert(_docIds.size() >= _minSkipDocs || !_counts._segments.empty());
+
+ if (_encode_features != nullptr) {
+ _encode_features->flush();
+ }
+ EncodeContext &e = _encode_context;
+
+ uint32_t numDocs = _docIds.size();
+
+ e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS);
+ if (numDocs >= _minChunkDocs) {
+ e.writeBits((hasMore ? 1 : 0), 1);
+ }
+
+ calc_skip_info(_encode_features != nullptr);
+
+ uint32_t docIdsSize = _zcDocIds.size();
+ uint32_t l1SkipSize = _l1Skip.size();
+ uint32_t l2SkipSize = _l2Skip.size();
+ uint32_t l3SkipSize = _l3Skip.size();
+ uint32_t l4SkipSize = _l4Skip.size();
+
+ e.encodeExpGolomb(docIdsSize - 1, K_VALUE_ZCPOSTING_DOCIDSSIZE);
+ e.encodeExpGolomb(l1SkipSize, K_VALUE_ZCPOSTING_L1SKIPSIZE);
+ if (l1SkipSize != 0) {
+ e.encodeExpGolomb(l2SkipSize, K_VALUE_ZCPOSTING_L2SKIPSIZE);
+ if (l2SkipSize != 0) {
+ e.encodeExpGolomb(l3SkipSize, K_VALUE_ZCPOSTING_L3SKIPSIZE);
+ if (l3SkipSize != 0) {
+ e.encodeExpGolomb(l4SkipSize, K_VALUE_ZCPOSTING_L4SKIPSIZE);
+ }
+ }
+ }
+ if (_encode_features != nullptr) {
+ e.encodeExpGolomb(_featureOffset, K_VALUE_ZCPOSTING_FEATURESSIZE);
+ }
+
+ // Encode last document id in chunk or word.
+ if (_dynamicK) {
+ uint32_t docIdK = e.calcDocIdK((_counts._segments.empty() &&
+ !hasMore) ?
+ numDocs : 1,
+ _docIdLimit);
+ e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first,
+ docIdK);
+ } else {
+ e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first,
+ K_VALUE_ZCPOSTING_LASTDOCID);
+ }
+
+ e.smallAlign(8); // Byte align
+
+ uint8_t *docIds = _zcDocIds._mallocStart;
+ e.writeBits(reinterpret_cast<const uint64_t *>(docIds),
+ 0,
+ docIdsSize * 8);
+ if (l1SkipSize > 0) {
+ uint8_t *l1Skip = _l1Skip._mallocStart;
+ e.writeBits(reinterpret_cast<const uint64_t *>(l1Skip),
+ 0,
+ l1SkipSize * 8);
+ }
+ if (l2SkipSize > 0) {
+ uint8_t *l2Skip = _l2Skip._mallocStart;
+ e.writeBits(reinterpret_cast<const uint64_t *>(l2Skip),
+ 0,
+ l2SkipSize * 8);
+ }
+ if (l3SkipSize > 0) {
+ uint8_t *l3Skip = _l3Skip._mallocStart;
+ e.writeBits(reinterpret_cast<const uint64_t *>(l3Skip),
+ 0,
+ l3SkipSize * 8);
+ }
+ if (l4SkipSize > 0) {
+ uint8_t *l4Skip = _l4Skip._mallocStart;
+ e.writeBits(reinterpret_cast<const uint64_t *>(l4Skip),
+ 0,
+ l4SkipSize * 8);
+ }
+
+ // Write features
+ e.writeBits(static_cast<const uint64_t *>(_featureWriteContext._comprBuf),
+ 0,
+ _featureOffset);
+
+ _counts._numDocs += numDocs;
+ if (hasMore || !_counts._segments.empty()) {
+ uint64_t writePos = e.getWriteOffset();
+ PostingListCounts::Segment seg;
+ seg._bitLength = writePos - (_writePos + _counts._bitLength);
+ seg._numDocs = numDocs;
+ seg._lastDoc = _docIds.back().first;
+ _counts._segments.push_back(seg);
+ _counts._bitLength += seg._bitLength;
+ }
+ // reset tables in preparation for next word or next chunk
+ clear_skip_info();
+ reset_chunk();
+}
+
+template <bool bigEndian>
+void
+Zc4PostingWriter<bigEndian>::write_docid_and_features(const DocIdAndFeatures &features)
+{
+ if (__builtin_expect(_docIds.size() >= _minChunkDocs, false)) {
+ flush_word_with_skip(true);
+ }
+ if (_encode_features != nullptr) {
+ _encode_features->writeFeatures(features);
+ uint64_t writeOffset = _encode_features->getWriteOffset();
+ uint64_t featureSize = writeOffset - _featureOffset;
+ assert(static_cast<uint32_t>(featureSize) == featureSize);
+ _docIds.push_back(std::make_pair(features._docId,
+ static_cast<uint32_t>(featureSize)));
+ _featureOffset = writeOffset;
+ } else {
+ _docIds.push_back(std::make_pair(features._docId, uint32_t(0)));
+ }
+}
+
+template <bool bigEndian>
+void
+Zc4PostingWriter<bigEndian>::flush_word_no_skip()
+{
+ // Too few document ids for skip info.
+ assert(_docIds.size() < _minSkipDocs && _counts._segments.empty());
+
+ if (_encode_features != nullptr) {
+ _encode_features->flush();
+ }
+ EncodeContext &e = _encode_context;
+ uint32_t numDocs = _docIds.size();
+
+ e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS);
+
+ uint32_t docIdK = _dynamicK ? e.calcDocIdK(numDocs, _docIdLimit) : K_VALUE_ZCPOSTING_DELTA_DOCID;
+
+ uint32_t baseDocId = 1;
+ const uint64_t *features =
+ static_cast<const uint64_t *>(_featureWriteContext._comprBuf);
+ uint64_t featureOffset = 0;
+
+ std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin();
+ std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end();
+
+ for (; dit != dite; ++dit) {
+ uint32_t docId = dit->first;
+ uint32_t featureSize = dit->second;
+ e.encodeExpGolomb(docId - baseDocId, docIdK);
+ baseDocId = docId + 1;
+ if (featureSize != 0) {
+ e.writeBits(features + (featureOffset >> 6),
+ featureOffset & 63,
+ featureSize);
+ featureOffset += featureSize;
+ }
+ }
+ _counts._numDocs += numDocs;
+ reset_chunk();
+}
+
+template <bool bigEndian>
+void
+Zc4PostingWriter<bigEndian>::flush_word()
+{
+ if (__builtin_expect(_docIds.size() >= _minSkipDocs ||
+ !_counts._segments.empty(), false)) {
+ // Use skip information if enough documents or chunking has happened
+ flush_word_with_skip(false);
+ _numWords++;
+ } else if (_docIds.size() > 0) {
+ flush_word_no_skip();
+ _numWords++;
+ }
+
+ EncodeContext &e = _encode_context;
+ uint64_t writePos = e.getWriteOffset();
+
+ _counts._bitLength = writePos - _writePos;
+ _writePos = writePos;
+}
+
+template <bool bigEndian>
+void
+Zc4PostingWriter<bigEndian>::set_encode_features(EncodeContext *encode_features)
+{
+ _encode_features = encode_features;
+ if (_encode_features != nullptr) {
+ _encode_features->setWriteContext(&_featureWriteContext);
+ _encode_features->setupWrite(_featureWriteContext);
+ }
+ _featureWriteContext.setEncodeContext(_encode_features);
+ _featureOffset = 0;
+}
+
+template <bool bigEndian>
+void
+Zc4PostingWriter<bigEndian>::on_open()
+{
+ _numWords = 0;
+ _writePos = _encode_context.getWriteOffset(); // Position after file header
+}
+
+template <bool bigEndian>
+void
+Zc4PostingWriter<bigEndian>::on_close()
+{
+ // Write some pad bits to avoid decompression readahead going past
+ // memory mapped file during search and into SIGSEGV territory.
+
+ // First pad to 64 bits alignment.
+ _encode_context.smallAlign(64);
+ _encode_context.writeComprBufferIfNeeded();
+
+ // Then write 128 more bits. This allows for 64-bit decoding
+ // with a readbits that always leaves a nonzero preRead
+ _encode_context.padBits(128);
+ _encode_context.alignDirectIO();
+ _encode_context.flush();
+ _encode_context.writeComprBuffer(); // Also flushes slack
+}
+
+template class Zc4PostingWriter<false>;
+template class Zc4PostingWriter<true>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h
new file mode 100644
index 00000000000..8dc5e249d52
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h
@@ -0,0 +1,53 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "zc4_posting_writer_base.h"
+
+namespace search::index { class DocIdAndFeatures; }
+
+namespace search::diskindex
+{
+
+/*
+ * Class used to write posting lists of type "Zc.4" and "Zc.5" (dynamic k).
+ *
+ * Common words have docid deltas and skip info separate from
+ * features.
+ *
+ * Rare words do not have skip info, and docid deltas and features are
+ * interleaved.
+ */
+template <bool bigEndian>
+class Zc4PostingWriter : public Zc4PostingWriterBase
+{
+ using EncodeContext = bitcompression::FeatureEncodeContext<bigEndian>;
+
+ EncodeContext _encode_context;
+ // Buffer up features in memory
+ EncodeContext *_encode_features;
+public:
+ Zc4PostingWriter(const Zc4PostingWriter &) = delete;
+ Zc4PostingWriter(Zc4PostingWriter &&) = delete;
+ Zc4PostingWriter &operator=(const Zc4PostingWriter &) = delete;
+ Zc4PostingWriter &operator=(Zc4PostingWriter &&) = delete;
+ Zc4PostingWriter(index::PostingListCounts &counts);
+ ~Zc4PostingWriter();
+
+ void reset_chunk();
+ void flush_word_with_skip(bool hasMore);
+ void flush_word_no_skip();
+ void flush_word();
+ void write_docid_and_features(const index::DocIdAndFeatures &features);
+ void set_encode_features(EncodeContext *encode_features);
+ void on_open();
+ void on_close();
+
+ EncodeContext &get_encode_features() { return *_encode_features; }
+ EncodeContext &get_encode_context() { return _encode_context; }
+};
+
+extern template class Zc4PostingWriter<false>;
+extern template class Zc4PostingWriter<true>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp
new file mode 100644
index 00000000000..485610c2ebd
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp
@@ -0,0 +1,222 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "zc4_posting_writer_base.h"
+#include <vespa/searchlib/index/postinglistcounts.h>
+
+using search::index::PostingListCounts;
+using search::index::PostingListParams;
+
+namespace search::diskindex
+{
+
+Zc4PostingWriterBase::Zc4PostingWriterBase(PostingListCounts &counts)
+ : _minChunkDocs(1 << 30),
+ _minSkipDocs(64),
+ _docIdLimit(10000000),
+ _docIds(),
+ _featureOffset(0),
+ _writePos(0),
+ _dynamicK(false),
+ _zcDocIds(),
+ _l1Skip(),
+ _l2Skip(),
+ _l3Skip(),
+ _l4Skip(),
+ _numWords(0),
+ _counts(counts),
+ _writeContext(sizeof(uint64_t)),
+ _featureWriteContext(sizeof(uint64_t))
+{
+ _featureWriteContext.allocComprBuf(64, 1);
+ // Ensure that some space is initially available in encoding buffers
+ _zcDocIds.maybeExpand();
+ _l1Skip.maybeExpand();
+ _l2Skip.maybeExpand();
+ _l3Skip.maybeExpand();
+ _l4Skip.maybeExpand();
+}
+
+Zc4PostingWriterBase::~Zc4PostingWriterBase()
+{
+}
+
+#define L1SKIPSTRIDE 16
+#define L2SKIPSTRIDE 8
+#define L3SKIPSTRIDE 8
+#define L4SKIPSTRIDE 8
+
+void
+Zc4PostingWriterBase::calc_skip_info(bool encodeFeatures)
+{
+ uint32_t lastDocId = 0u;
+ uint32_t lastL1SkipDocId = 0u;
+ uint32_t lastL1SkipDocIdPos = 0;
+ uint32_t lastL1SkipFeaturePos = 0;
+ uint32_t lastL2SkipDocId = 0u;
+ uint32_t lastL2SkipDocIdPos = 0;
+ uint32_t lastL2SkipFeaturePos = 0;
+ uint32_t lastL2SkipL1SkipPos = 0;
+ uint32_t lastL3SkipDocId = 0u;
+ uint32_t lastL3SkipDocIdPos = 0;
+ uint32_t lastL3SkipFeaturePos = 0;
+ uint32_t lastL3SkipL1SkipPos = 0;
+ uint32_t lastL3SkipL2SkipPos = 0;
+ uint32_t lastL4SkipDocId = 0u;
+ uint32_t lastL4SkipDocIdPos = 0;
+ uint32_t lastL4SkipFeaturePos = 0;
+ uint32_t lastL4SkipL1SkipPos = 0;
+ uint32_t lastL4SkipL2SkipPos = 0;
+ uint32_t lastL4SkipL3SkipPos = 0;
+ unsigned int l1SkipCnt = 0;
+ unsigned int l2SkipCnt = 0;
+ unsigned int l3SkipCnt = 0;
+ unsigned int l4SkipCnt = 0;
+ uint64_t featurePos = 0;
+
+ std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin();
+ std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end();
+
+ if (!_counts._segments.empty()) {
+ lastDocId = _counts._segments.back()._lastDoc;
+ lastL1SkipDocId = lastDocId;
+ lastL2SkipDocId = lastDocId;
+ lastL3SkipDocId = lastDocId;
+ lastL4SkipDocId = lastDocId;
+ }
+
+ for (; dit != dite; ++dit) {
+ if (l1SkipCnt >= L1SKIPSTRIDE) {
+ // L1 docid delta
+ uint32_t docIdDelta = lastDocId - lastL1SkipDocId;
+ assert(static_cast<int32_t>(docIdDelta) > 0);
+ _l1Skip.encode(docIdDelta - 1);
+ lastL1SkipDocId = lastDocId;
+ // L1 docid pos
+ uint64_t docIdPos = _zcDocIds.size();
+ _l1Skip.encode(docIdPos - lastL1SkipDocIdPos - 1);
+ lastL1SkipDocIdPos = docIdPos;
+ if (encodeFeatures) {
+ // L1 features pos
+ _l1Skip.encode(featurePos - lastL1SkipFeaturePos - 1);
+ lastL1SkipFeaturePos = featurePos;
+ }
+ l1SkipCnt = 0;
+ ++l2SkipCnt;
+ if (l2SkipCnt >= L2SKIPSTRIDE) {
+ // L2 docid delta
+ docIdDelta = lastDocId - lastL2SkipDocId;
+ assert(static_cast<int32_t>(docIdDelta) > 0);
+ _l2Skip.encode(docIdDelta - 1);
+ lastL2SkipDocId = lastDocId;
+ // L2 docid pos
+ docIdPos = _zcDocIds.size();
+ _l2Skip.encode(docIdPos - lastL2SkipDocIdPos - 1);
+ lastL2SkipDocIdPos = docIdPos;
+ if (encodeFeatures) {
+ // L2 features pos
+ _l2Skip.encode(featurePos - lastL2SkipFeaturePos - 1);
+ lastL2SkipFeaturePos = featurePos;
+ }
+ // L2 L1Skip pos
+ uint64_t l1SkipPos = _l1Skip.size();
+ _l2Skip.encode(l1SkipPos - lastL2SkipL1SkipPos - 1);
+ lastL2SkipL1SkipPos = l1SkipPos;
+ l2SkipCnt = 0;
+ ++l3SkipCnt;
+ if (l3SkipCnt >= L3SKIPSTRIDE) {
+ // L3 docid delta
+ docIdDelta = lastDocId - lastL3SkipDocId;
+ assert(static_cast<int32_t>(docIdDelta) > 0);
+ _l3Skip.encode(docIdDelta - 1);
+ lastL3SkipDocId = lastDocId;
+ // L3 docid pos
+ docIdPos = _zcDocIds.size();
+ _l3Skip.encode(docIdPos - lastL3SkipDocIdPos - 1);
+ lastL3SkipDocIdPos = docIdPos;
+ if (encodeFeatures) {
+ // L3 features pos
+ _l3Skip.encode(featurePos - lastL3SkipFeaturePos - 1);
+ lastL3SkipFeaturePos = featurePos;
+ }
+ // L3 L1Skip pos
+ l1SkipPos = _l1Skip.size();
+ _l3Skip.encode(l1SkipPos - lastL3SkipL1SkipPos - 1);
+ lastL3SkipL1SkipPos = l1SkipPos;
+ // L3 L2Skip pos
+ uint64_t l2SkipPos = _l2Skip.size();
+ _l3Skip.encode(l2SkipPos - lastL3SkipL2SkipPos - 1);
+ lastL3SkipL2SkipPos = l2SkipPos;
+ l3SkipCnt = 0;
+ ++l4SkipCnt;
+ if (l4SkipCnt >= L4SKIPSTRIDE) {
+ // L4 docid delta
+ docIdDelta = lastDocId - lastL4SkipDocId;
+ assert(static_cast<int32_t>(docIdDelta) > 0);
+ _l4Skip.encode(docIdDelta - 1);
+ lastL4SkipDocId = lastDocId;
+ // L4 docid pos
+ docIdPos = _zcDocIds.size();
+ _l4Skip.encode(docIdPos - lastL4SkipDocIdPos - 1);
+ lastL4SkipDocIdPos = docIdPos;
+ if (encodeFeatures) {
+ // L4 features pos
+ _l4Skip.encode(featurePos - lastL4SkipFeaturePos - 1);
+ lastL4SkipFeaturePos = featurePos;
+ }
+ // L4 L1Skip pos
+ l1SkipPos = _l1Skip.size();
+ _l4Skip.encode(l1SkipPos - lastL4SkipL1SkipPos - 1);
+ lastL4SkipL1SkipPos = l1SkipPos;
+ // L4 L2Skip pos
+ l2SkipPos = _l2Skip.size();
+ _l4Skip.encode(l2SkipPos - lastL4SkipL2SkipPos - 1);
+ lastL4SkipL2SkipPos = l2SkipPos;
+ // L4 L3Skip pos
+ uint64_t l3SkipPos = _l3Skip.size();
+ _l4Skip.encode(l3SkipPos - lastL4SkipL3SkipPos - 1);
+ lastL4SkipL3SkipPos = l3SkipPos;
+ l4SkipCnt = 0;
+ }
+ }
+ }
+ }
+ uint32_t docId = dit->first;
+ featurePos += dit->second;
+ _zcDocIds.encode(docId - lastDocId - 1);
+ lastDocId = docId;
+ ++l1SkipCnt;
+ }
+ // Extra partial entries for skip tables to simplify iterator during search
+ if (_l1Skip.size() > 0) {
+ _l1Skip.encode(lastDocId - lastL1SkipDocId - 1);
+ }
+ if (_l2Skip.size() > 0) {
+ _l2Skip.encode(lastDocId - lastL2SkipDocId - 1);
+ }
+ if (_l3Skip.size() > 0) {
+ _l3Skip.encode(lastDocId - lastL3SkipDocId - 1);
+ }
+ if (_l4Skip.size() > 0) {
+ _l4Skip.encode(lastDocId - lastL4SkipDocId - 1);
+ }
+}
+
+void
+Zc4PostingWriterBase::clear_skip_info()
+{
+ _zcDocIds.clear();
+ _l1Skip.clear();
+ _l2Skip.clear();
+ _l3Skip.clear();
+ _l4Skip.clear();
+}
+
+void
+Zc4PostingWriterBase::set_posting_list_params(const PostingListParams &params)
+{
+ params.get("docIdLimit", _docIdLimit);
+ params.get("minChunkDocs", _minChunkDocs);
+ params.get("minSkipDocs", _minSkipDocs);
+}
+
+}
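
An editorial aside, not part of the diff: the stride macros above nest multiplicatively, so with evenly filled levels calc_skip_info() emits roughly one L1 entry per 16 documents, one L2 entry per 16*8 = 128, one L3 entry per 1024 and one L4 entry per 8192 documents:

// Hypothetical constants for illustration only, derived from the strides above.
constexpr unsigned L1_INTERVAL = 16;               // L1SKIPSTRIDE
constexpr unsigned L2_INTERVAL = L1_INTERVAL * 8;  // 128 documents per L2 entry
constexpr unsigned L3_INTERVAL = L2_INTERVAL * 8;  // 1024 documents per L3 entry
constexpr unsigned L4_INTERVAL = L3_INTERVAL * 8;  // 8192 documents per L4 entry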
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
new file mode 100644
index 00000000000..ba781c11564
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h
@@ -0,0 +1,66 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "zcbuf.h"
+#include <vespa/searchlib/bitcompression/compression.h>
+#include <vector>
+
+namespace search::index {
+class PostingListCounts;
+class PostingListParams;
+}
+
+namespace search::diskindex
+{
+
+/*
+ * Base class for writing posting lists that might have basic skip info.
+ */
+class Zc4PostingWriterBase
+{
+protected:
+ uint32_t _minChunkDocs; // # of documents needed for chunking
+ uint32_t _minSkipDocs; // # of documents needed for skipping
+ uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit)
+
+ // Unpacked document ids for word and feature sizes
+ using DocIdAndFeatureSize = std::pair<uint32_t, uint32_t>;
+ std::vector<DocIdAndFeatureSize> _docIds;
+
+ uint64_t _featureOffset; // Bit offset of next feature
+ uint64_t _writePos; // Bit position for start of current word
+    bool _dynamicK;           // Calculate EG compression parameters ?
+ ZcBuf _zcDocIds; // Document id deltas
+ ZcBuf _l1Skip; // L1 skip info
+ ZcBuf _l2Skip; // L2 skip info
+ ZcBuf _l3Skip; // L3 skip info
+ ZcBuf _l4Skip; // L4 skip info
+
+ uint64_t _numWords; // Number of words in file
+ index::PostingListCounts &_counts;
+ search::ComprFileWriteContext _writeContext;
+ search::ComprFileWriteContext _featureWriteContext;
+
+ Zc4PostingWriterBase(const Zc4PostingWriterBase &) = delete;
+ Zc4PostingWriterBase(Zc4PostingWriterBase &&) = delete;
+ Zc4PostingWriterBase &operator=(const Zc4PostingWriterBase &) = delete;
+ Zc4PostingWriterBase &operator=(Zc4PostingWriterBase &&) = delete;
+ Zc4PostingWriterBase(index::PostingListCounts &counts);
+ ~Zc4PostingWriterBase();
+ void calc_skip_info(bool encodeFeatures);
+ void clear_skip_info();
+
+public:
+ ComprFileWriteContext &get_write_context() { return _writeContext; }
+ ComprFileWriteContext &get_feature_write_context() { return _featureWriteContext; }
+ uint32_t get_min_chunk_docs() const { return _minChunkDocs; }
+ uint32_t get_min_skip_docs() const { return _minSkipDocs; }
+ uint32_t get_docid_limit() const { return _docIdLimit; }
+ uint64_t get_num_words() const { return _numWords; }
+ bool get_dynamic_k() const { return _dynamicK; }
+ void set_dynamic_k(bool dynamicK) { _dynamicK = dynamicK; }
+ void set_posting_list_params(const index::PostingListParams &params);
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
index df06432816f..10c08af92cb 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
@@ -63,9 +63,7 @@ Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema,
_fieldsParams(),
_realEncodeFeatures(&_fieldsParams)
{
- _encodeFeatures = &_realEncodeFeatures;
- _encodeFeatures->setWriteContext(&_featureWriteContext);
- _featureWriteContext.setEncodeContext(_encodeFeatures);
+ _writer.set_encode_features(&_realEncodeFeatures);
_fieldsParams.setSchemaParams(schema, indexId);
}
@@ -118,9 +116,7 @@ ZcPosOccSeqWrite::ZcPosOccSeqWrite(const Schema &schema,
_fieldsParams(),
_realEncodeFeatures(&_fieldsParams)
{
- _encodeFeatures = &_realEncodeFeatures;
- _encodeFeatures->setWriteContext(&_featureWriteContext);
- _featureWriteContext.setEncodeContext(_encodeFeatures);
+ _writer.set_encode_features(&_realEncodeFeatures);
_fieldsParams.setSchemaParams(schema, indexId);
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
index d51a592bf2b..e850f169adc 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
@@ -607,36 +607,16 @@ Zc4PostingSeqRead::setPostingOffset(uint64_t offset,
Zc4PostingSeqWrite::
Zc4PostingSeqWrite(PostingListCountFileSeqWrite *countFile)
: PostingListFileSeqWrite(),
- _encodeContext(),
- _writeContext(_encodeContext),
+ _writer(_counts),
_file(),
- _minChunkDocs(1 << 30),
- _minSkipDocs(64),
- _docIdLimit(10000000),
- _docIds(),
- _encodeFeatures(nullptr),
- _featureOffset(0),
- _featureWriteContext(sizeof(uint64_t)),
- _writePos(0),
- _dynamicK(false),
- _zcDocIds(),
- _l1Skip(),
- _l2Skip(),
- _l3Skip(),
- _l4Skip(),
- _numWords(0),
_fileBitSize(0),
_countFile(countFile)
{
- _encodeContext.setWriteContext(&_writeContext);
-
if (_countFile != nullptr) {
PostingListParams params;
_countFile->getParams(params);
- params.get("docIdLimit", _docIdLimit);
- params.get("minChunkDocs", _minChunkDocs);
+ _writer.set_posting_list_params(params);
}
- _featureWriteContext.allocComprBuf(64, 1);
}
@@ -646,110 +626,27 @@ Zc4PostingSeqWrite::~Zc4PostingSeqWrite()
void
-Zc4PostingSeqWrite::
-writeDocIdAndFeatures(const DocIdAndFeatures &features)
+Zc4PostingSeqWrite::writeDocIdAndFeatures(const DocIdAndFeatures &features)
{
- if (__builtin_expect(_docIds.size() >= _minChunkDocs, false))
- flushChunk();
- _encodeFeatures->writeFeatures(features);
- uint64_t writeOffset = _encodeFeatures->getWriteOffset();
- uint64_t featureSize = writeOffset - _featureOffset;
- assert(static_cast<uint32_t>(featureSize) == featureSize);
- _docIds.push_back(std::make_pair(features._docId,
- static_cast<uint32_t>(featureSize)));
- _featureOffset = writeOffset;
+ _writer.write_docid_and_features(features);
}
void
Zc4PostingSeqWrite::flushWord()
{
- if (__builtin_expect(_docIds.size() >= _minSkipDocs ||
- !_counts._segments.empty(), false)) {
- // Use skip information if enough documents of chunking has happened
- flushWordWithSkip(false);
- _numWords++;
- } else if (_docIds.size() > 0) {
- flushWordNoSkip();
- _numWords++;
- }
-
- EncodeContext &e = _encodeContext;
- uint64_t writePos = e.getWriteOffset();
-
- _counts._bitLength = writePos - _writePos;
- _writePos = writePos;
-}
-
-
-uint32_t
-Zc4PostingSeqWrite::readHeader(const vespalib::string &name)
-{
- EncodeContext &f = *_encodeFeatures;
-
- FeatureDecodeContextBE d;
- ComprFileReadContext drc(d);
- FastOS_File file;
- const vespalib::string &myId = _dynamicK ? myId5 : myId4;
-
- d.setReadContext(&drc);
- bool res = file.OpenReadOnly(name.c_str());
- if (!res) {
- LOG(error, "Could not open %s for reading file header: %s",
- name.c_str(), getLastErrorString().c_str());
- LOG_ABORT("should not be reached");
- }
-
- drc.setFile(&file);
- drc.setFileSize(file.GetSize());
- drc.allocComprBuf(512, 32768u);
- d.emptyBuffer(0);
- drc.readComprBuffer();
-
- vespalib::FileHeader header;
- d.readHeader(header, file.getSize());
- uint32_t headerLen = header.getSize();
- assert(header.hasTag("frozen"));
- assert(header.hasTag("fileBitSize"));
- assert(header.hasTag("format.0"));
- assert(header.hasTag("format.1"));
- assert(!header.hasTag("format.2"));
- assert(header.hasTag("numWords"));
- assert(header.hasTag("minChunkDocs"));
- assert(header.hasTag("docIdLimit"));
- assert(header.hasTag("minSkipDocs"));
- assert(header.hasTag("endian"));
- bool headerCompleted = header.getTag("frozen").asInteger() != 0;
- uint64_t headerFileBitSize = header.getTag("fileBitSize").asInteger();
- headerLen += (-headerLen & 7);
- assert(!headerCompleted || headerFileBitSize >= headerLen * 8);
- (void) headerCompleted;
- (void) headerFileBitSize;
- assert(header.getTag("format.0").asString() == myId);
- (void) myId;
- assert(header.getTag("format.1").asString() == f.getIdentifier());
- _minChunkDocs = header.getTag("minChunkDocs").asInteger();
- _docIdLimit = header.getTag("docIdLimit").asInteger();
- _minSkipDocs = header.getTag("minSkipDocs").asInteger();
- assert(header.getTag("endian").asString() == "big");
- // Read feature decoding specific subheader using helper decode context
- f.readHeader(header, "features.");
- // Align on 64-bit unit
- d.smallAlign(64);
- assert(d.getReadOffset() == headerLen * 8);
- file.Close();
- return headerLen;
+ _writer.flush_word();
}
void
Zc4PostingSeqWrite::makeHeader(const FileHeaderContext &fileHeaderContext)
{
- EncodeContext &f = *_encodeFeatures;
- EncodeContext &e = _encodeContext;
- ComprFileWriteContext &wce = _writeContext;
+ EncodeContext &f = _writer.get_encode_features();
+ EncodeContext &e = _writer.get_encode_context();
+ ComprFileWriteContext &wce = _writer.get_write_context();
- const vespalib::string &myId = _dynamicK ? myId5 : myId4;
+ const vespalib::string &myId = _writer.get_dynamic_k() ? myId5 : myId4;
vespalib::FileHeader header;
typedef vespalib::GenericHeader::Tag Tag;
@@ -759,9 +656,9 @@ Zc4PostingSeqWrite::makeHeader(const FileHeaderContext &fileHeaderContext)
header.putTag(Tag("format.0", myId));
header.putTag(Tag("format.1", f.getIdentifier()));
header.putTag(Tag("numWords", 0));
- header.putTag(Tag("minChunkDocs", _minChunkDocs));
- header.putTag(Tag("docIdLimit", _docIdLimit));
- header.putTag(Tag("minSkipDocs", _minSkipDocs));
+ header.putTag(Tag("minChunkDocs", _writer.get_min_chunk_docs()));
+ header.putTag(Tag("docIdLimit", _writer.get_docid_limit()));
+ header.putTag(Tag("minSkipDocs", _writer.get_min_skip_docs()));
header.putTag(Tag("endian", "big"));
header.putTag(Tag("desc", "Posting list file"));
@@ -788,7 +685,7 @@ Zc4PostingSeqWrite::updateHeader()
typedef vespalib::GenericHeader::Tag Tag;
h.putTag(Tag("frozen", 1));
h.putTag(Tag("fileBitSize", _fileBitSize));
- h.putTag(Tag("numWords", _numWords));
+ h.putTag(Tag("numWords", _writer.get_num_words()));
h.rewriteFile(f);
f.Sync();
f.Close();
@@ -813,40 +710,21 @@ Zc4PostingSeqWrite::open(const vespalib::string &name,
// XXX may need to do something more here, I don't know what...
return false;
}
- uint64_t fileSize = _file.GetSize();
- uint64_t bufferStartFilePos = _writeContext.getBufferStartFilePos();
- assert(fileSize >= bufferStartFilePos);
- (void) fileSize;
- _file.SetSize(bufferStartFilePos);
- assert(bufferStartFilePos == static_cast<uint64_t>(_file.GetPosition()));
- _writeContext.setFile(&_file);
- search::ComprBuffer &cb = _writeContext;
- EncodeContext &e = _encodeContext;
- _writeContext.allocComprBuf(65536u, 32768u);
- if (bufferStartFilePos == 0) {
- e.setupWrite(cb);
- // Reset accumulated stats
- _fileBitSize = 0;
- _numWords = 0;
- // Start write initial header
- makeHeader(fileHeaderContext);
- _encodeFeatures->setupWrite(_featureWriteContext);
- // end write initial header
- _writePos = e.getWriteOffset();
- } else {
- assert(bufferStartFilePos >= 8u);
- uint32_t headerSize = readHeader(name); // Read existing header
- assert(bufferStartFilePos >= headerSize);
- (void) headerSize;
- e.afterWrite(_writeContext, 0, bufferStartFilePos);
- }
-
- // Ensure that some space is initially available in encoding buffers
- _zcDocIds.maybeExpand();
- _l1Skip.maybeExpand();
- _l2Skip.maybeExpand();
- _l3Skip.maybeExpand();
- _l4Skip.maybeExpand();
+ auto &writeContext = _writer.get_write_context();
+ uint64_t bufferStartFilePos = writeContext.getBufferStartFilePos();
+ assert(bufferStartFilePos == 0);
+ _file.SetSize(0);
+ writeContext.setFile(&_file);
+ search::ComprBuffer &cb = writeContext;
+ EncodeContext &e = _writer.get_encode_context();
+ writeContext.allocComprBuf(65536u, 32768u);
+ e.setupWrite(cb);
+ // Reset accumulated stats
+ _fileBitSize = 0;
+ // Start write initial header
+ makeHeader(fileHeaderContext);
+ // end write initial header
+ _writer.on_open();
return true; // Assume success
}
@@ -854,42 +732,24 @@ Zc4PostingSeqWrite::open(const vespalib::string &name,
bool
Zc4PostingSeqWrite::close()
{
- EncodeContext &e = _encodeContext;
-
- _fileBitSize = e.getWriteOffset();
- // Write some pad bits to avoid decompression readahead going past
- // memory mapped file during search and into SIGSEGV territory.
-
- // First pad to 64 bits alignment.
- e.smallAlign(64);
- e.writeComprBufferIfNeeded();
-
- // Then write 128 more bits. This allows for 64-bit decoding
- // with a readbits that always leaves a nonzero preRead
- e.padBits(128);
- e.alignDirectIO();
- e.flush();
- e.writeComprBuffer(); // Also flushes slack
-
- _writeContext.dropComprBuf();
+ _fileBitSize = _writer.get_encode_context().getWriteOffset();
+ _writer.on_close(); // flush and pad
+ auto &writeContext = _writer.get_write_context();
+ writeContext.dropComprBuf();
_file.Sync();
_file.Close();
- _writeContext.setFile(nullptr);
+ writeContext.setFile(nullptr);
updateHeader();
return true;
}
-
-
void
Zc4PostingSeqWrite::
setParams(const PostingListParams &params)
{
if (_countFile != nullptr)
_countFile->setParams(params);
- params.get("docIdLimit", _docIdLimit);
- params.get("minChunkDocs", _minChunkDocs);
- params.get("minSkipDocs", _minSkipDocs);
+ _writer.set_posting_list_params(params);
}
@@ -905,14 +765,14 @@ getParams(PostingListParams &params)
uint32_t countMinChunkDocs = 0;
countParams.get("docIdLimit", countDocIdLimit);
countParams.get("minChunkDocs", countMinChunkDocs);
- assert(_docIdLimit == countDocIdLimit);
- assert(_minChunkDocs == countMinChunkDocs);
+ assert(_writer.get_docid_limit() == countDocIdLimit);
+ assert(_writer.get_min_chunk_docs() == countMinChunkDocs);
} else {
params.clear();
- params.set("docIdLimit", _docIdLimit);
- params.set("minChunkDocs", _minChunkDocs);
+ params.set("docIdLimit", _writer.get_docid_limit());
+ params.set("minChunkDocs", _writer.get_min_chunk_docs());
}
- params.set("minSkipDocs", _minSkipDocs);
+ params.set("minSkipDocs", _writer.get_min_skip_docs());
}
@@ -920,7 +780,7 @@ void
Zc4PostingSeqWrite::
setFeatureParams(const PostingListParams &params)
{
- _encodeFeatures->setParams(params);
+ _writer.get_encode_features().setParams(params);
}
@@ -928,314 +788,7 @@ void
Zc4PostingSeqWrite::
getFeatureParams(PostingListParams &params)
{
- _encodeFeatures->getParams(params);
-}
-
-
-void
-Zc4PostingSeqWrite::flushChunk()
-{
- /* TODO: Flush chunk and prepare for new (possible short) chunk */
- flushWordWithSkip(true);
-}
-
-#define L1SKIPSTRIDE 16
-#define L2SKIPSTRIDE 8
-#define L3SKIPSTRIDE 8
-#define L4SKIPSTRIDE 8
-
-
-void
-Zc4PostingSeqWrite::calcSkipInfo()
-{
- uint32_t lastDocId = 0u;
- uint32_t lastL1SkipDocId = 0u;
- uint32_t lastL1SkipDocIdPos = 0;
- uint32_t lastL1SkipFeaturePos = 0;
- uint32_t lastL2SkipDocId = 0u;
- uint32_t lastL2SkipDocIdPos = 0;
- uint32_t lastL2SkipFeaturePos = 0;
- uint32_t lastL2SkipL1SkipPos = 0;
- uint32_t lastL3SkipDocId = 0u;
- uint32_t lastL3SkipDocIdPos = 0;
- uint32_t lastL3SkipFeaturePos = 0;
- uint32_t lastL3SkipL1SkipPos = 0;
- uint32_t lastL3SkipL2SkipPos = 0;
- uint32_t lastL4SkipDocId = 0u;
- uint32_t lastL4SkipDocIdPos = 0;
- uint32_t lastL4SkipFeaturePos = 0;
- uint32_t lastL4SkipL1SkipPos = 0;
- uint32_t lastL4SkipL2SkipPos = 0;
- uint32_t lastL4SkipL3SkipPos = 0;
- unsigned int l1SkipCnt = 0;
- unsigned int l2SkipCnt = 0;
- unsigned int l3SkipCnt = 0;
- unsigned int l4SkipCnt = 0;
- uint64_t featurePos = 0;
-
- std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin();
- std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end();
-
- if (!_counts._segments.empty()) {
- lastDocId = _counts._segments.back()._lastDoc;
- lastL1SkipDocId = lastDocId;
- lastL2SkipDocId = lastDocId;
- lastL3SkipDocId = lastDocId;
- lastL4SkipDocId = lastDocId;
- }
-
- for (; dit != dite; ++dit) {
- if (l1SkipCnt >= L1SKIPSTRIDE) {
- // L1 docid delta
- uint32_t docIdDelta = lastDocId - lastL1SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- _l1Skip.encode(docIdDelta - 1);
- lastL1SkipDocId = lastDocId;
- // L1 docid pos
- uint64_t docIdPos = _zcDocIds.size();
- _l1Skip.encode(docIdPos - lastL1SkipDocIdPos - 1);
- lastL1SkipDocIdPos = docIdPos;
- // L1 features pos
- _l1Skip.encode(featurePos - lastL1SkipFeaturePos - 1);
- lastL1SkipFeaturePos = featurePos;
- l1SkipCnt = 0;
- ++l2SkipCnt;
- if (l2SkipCnt >= L2SKIPSTRIDE) {
- // L2 docid delta
- docIdDelta = lastDocId - lastL2SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- _l2Skip.encode(docIdDelta - 1);
- lastL2SkipDocId = lastDocId;
- // L2 docid pos
- docIdPos = _zcDocIds.size();
- _l2Skip.encode(docIdPos - lastL2SkipDocIdPos - 1);
- lastL2SkipDocIdPos = docIdPos;
- // L2 features pos
- _l2Skip.encode(featurePos - lastL2SkipFeaturePos - 1);
- lastL2SkipFeaturePos = featurePos;
- // L2 L1Skip pos
- uint64_t l1SkipPos = _l1Skip.size();
- _l2Skip.encode(l1SkipPos - lastL2SkipL1SkipPos - 1);
- lastL2SkipL1SkipPos = l1SkipPos;
- l2SkipCnt = 0;
- ++l3SkipCnt;
- if (l3SkipCnt >= L3SKIPSTRIDE) {
- // L3 docid delta
- docIdDelta = lastDocId - lastL3SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- _l3Skip.encode(docIdDelta - 1);
- lastL3SkipDocId = lastDocId;
- // L3 docid pos
- docIdPos = _zcDocIds.size();
- _l3Skip.encode(docIdPos - lastL3SkipDocIdPos - 1);
- lastL3SkipDocIdPos = docIdPos;
- // L3 features pos
- _l3Skip.encode(featurePos - lastL3SkipFeaturePos - 1);
- lastL3SkipFeaturePos = featurePos;
- // L3 L1Skip pos
- l1SkipPos = _l1Skip.size();
- _l3Skip.encode(l1SkipPos - lastL3SkipL1SkipPos - 1);
- lastL3SkipL1SkipPos = l1SkipPos;
- // L3 L2Skip pos
- uint64_t l2SkipPos = _l2Skip.size();
- _l3Skip.encode(l2SkipPos - lastL3SkipL2SkipPos - 1);
- lastL3SkipL2SkipPos = l2SkipPos;
- l3SkipCnt = 0;
- ++l4SkipCnt;
- if (l4SkipCnt >= L4SKIPSTRIDE) {
- // L4 docid delta
- docIdDelta = lastDocId - lastL4SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- _l4Skip.encode(docIdDelta - 1);
- lastL4SkipDocId = lastDocId;
- // L4 docid pos
- docIdPos = _zcDocIds.size();
- _l4Skip.encode(docIdPos - lastL4SkipDocIdPos - 1);
- lastL4SkipDocIdPos = docIdPos;
- // L4 features pos
- _l4Skip.encode(featurePos - lastL4SkipFeaturePos - 1);
- lastL4SkipFeaturePos = featurePos;
- // L4 L1Skip pos
- l1SkipPos = _l1Skip.size();
- _l4Skip.encode(l1SkipPos - lastL4SkipL1SkipPos - 1);
- lastL4SkipL1SkipPos = l1SkipPos;
- // L4 L2Skip pos
- l2SkipPos = _l2Skip.size();
- _l4Skip.encode(l2SkipPos - lastL4SkipL2SkipPos - 1);
- lastL4SkipL2SkipPos = l2SkipPos;
- // L4 L3Skip pos
- uint64_t l3SkipPos = _l3Skip.size();
- _l4Skip.encode(l3SkipPos - lastL4SkipL3SkipPos - 1);
- lastL4SkipL3SkipPos = l3SkipPos;
- l4SkipCnt = 0;
- }
- }
- }
- }
- uint32_t docId = dit->first;
- featurePos += dit->second;
- _zcDocIds.encode(docId - lastDocId - 1);
- lastDocId = docId;
- ++l1SkipCnt;
- }
- // Extra partial entries for skip tables to simplify iterator during search
- if (_l1Skip.size() > 0)
- _l1Skip.encode(lastDocId - lastL1SkipDocId - 1);
- if (_l2Skip.size() > 0)
- _l2Skip.encode(lastDocId - lastL2SkipDocId - 1);
- if (_l3Skip.size() > 0)
- _l3Skip.encode(lastDocId - lastL3SkipDocId - 1);
- if (_l4Skip.size() > 0)
- _l4Skip.encode(lastDocId - lastL4SkipDocId - 1);
-}
-
-
-void
-Zc4PostingSeqWrite::flushWordWithSkip(bool hasMore)
-{
- assert(_docIds.size() >= _minSkipDocs || !_counts._segments.empty());
-
- _encodeFeatures->flush();
- EncodeContext &e = _encodeContext;
-
- uint32_t numDocs = _docIds.size();
-
- e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS);
- if (numDocs >= _minChunkDocs)
- e.writeBits((hasMore ? 1 : 0), 1);
-
- // TODO: Calculate docids size, possible also k parameter */
- calcSkipInfo();
-
- uint32_t docIdsSize = _zcDocIds.size();
- uint32_t l1SkipSize = _l1Skip.size();
- uint32_t l2SkipSize = _l2Skip.size();
- uint32_t l3SkipSize = _l3Skip.size();
- uint32_t l4SkipSize = _l4Skip.size();
-
- e.encodeExpGolomb(docIdsSize - 1, K_VALUE_ZCPOSTING_DOCIDSSIZE);
- e.encodeExpGolomb(l1SkipSize, K_VALUE_ZCPOSTING_L1SKIPSIZE);
- if (l1SkipSize != 0) {
- e.encodeExpGolomb(l2SkipSize, K_VALUE_ZCPOSTING_L2SKIPSIZE);
- if (l2SkipSize != 0) {
- e.encodeExpGolomb(l3SkipSize, K_VALUE_ZCPOSTING_L3SKIPSIZE);
- if (l3SkipSize != 0) {
- e.encodeExpGolomb(l4SkipSize, K_VALUE_ZCPOSTING_L4SKIPSIZE);
- }
- }
- }
- e.encodeExpGolomb(_featureOffset, K_VALUE_ZCPOSTING_FEATURESSIZE);
-
- // Encode last document id in chunk or word.
- if (_dynamicK) {
- uint32_t docIdK = e.calcDocIdK((_counts._segments.empty() &&
- !hasMore) ?
- numDocs : 1,
- _docIdLimit);
- e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first,
- docIdK);
- } else {
- e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first,
- K_VALUE_ZCPOSTING_LASTDOCID);
- }
-
- e.smallAlign(8); // Byte align
-
- uint8_t *docIds = _zcDocIds._mallocStart;
- e.writeBits(reinterpret_cast<const uint64_t *>(docIds),
- 0,
- docIdsSize * 8);
- if (l1SkipSize > 0) {
- uint8_t *l1Skip = _l1Skip._mallocStart;
- e.writeBits(reinterpret_cast<const uint64_t *>(l1Skip),
- 0,
- l1SkipSize * 8);
- }
- if (l2SkipSize > 0) {
- uint8_t *l2Skip = _l2Skip._mallocStart;
- e.writeBits(reinterpret_cast<const uint64_t *>(l2Skip),
- 0,
- l2SkipSize * 8);
- }
- if (l3SkipSize > 0) {
- uint8_t *l3Skip = _l3Skip._mallocStart;
- e.writeBits(reinterpret_cast<const uint64_t *>(l3Skip),
- 0,
- l3SkipSize * 8);
- }
- if (l4SkipSize > 0) {
- uint8_t *l4Skip = _l4Skip._mallocStart;
- e.writeBits(reinterpret_cast<const uint64_t *>(l4Skip),
- 0,
- l4SkipSize * 8);
- }
-
- // Write features
- e.writeBits(static_cast<const uint64_t *>(_featureWriteContext._comprBuf),
- 0,
- _featureOffset);
-
- _counts._numDocs += numDocs;
- if (hasMore || !_counts._segments.empty()) {
- uint64_t writePos = e.getWriteOffset();
- PostingListCounts::Segment seg;
- seg._bitLength = writePos - (_writePos + _counts._bitLength);
- seg._numDocs = numDocs;
- seg._lastDoc = _docIds.back().first;
- _counts._segments.push_back(seg);
- _counts._bitLength += seg._bitLength;
- }
- // reset tables in preparation for next word or next chunk
- _zcDocIds.clear();
- _l1Skip.clear();
- _l2Skip.clear();
- _l3Skip.clear();
- _l4Skip.clear();
- resetWord();
-}
-
-
-void
-Zc4PostingSeqWrite::flushWordNoSkip()
-{
- // Too few document ids for skip info.
- assert(_docIds.size() < _minSkipDocs && _counts._segments.empty());
-
- _encodeFeatures->flush();
- EncodeContext &e = _encodeContext;
- uint32_t numDocs = _docIds.size();
-
- e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS);
-
- uint32_t baseDocId = 1;
- const uint64_t *features =
- static_cast<const uint64_t *>(_featureWriteContext._comprBuf);
- uint64_t featureOffset = 0;
-
- std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin();
- std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end();
-
- for (; dit != dite; ++dit) {
- uint32_t docId = dit->first;
- uint32_t featureSize = dit->second;
- e.encodeExpGolomb(docId - baseDocId, K_VALUE_ZCPOSTING_DELTA_DOCID);
- baseDocId = docId + 1;
- e.writeBits(features + (featureOffset >> 6),
- featureOffset & 63,
- featureSize);
- featureOffset += featureSize;
- }
- _counts._numDocs += numDocs;
- resetWord();
-}
-
-
-void
-Zc4PostingSeqWrite::resetWord()
-{
- _docIds.clear();
- _encodeFeatures->setupWrite(_featureWriteContext);
- _featureOffset = 0;
+ _writer.get_encode_features().getParams(params);
}
@@ -1300,44 +853,7 @@ ZcPostingSeqRead::getIdentifier()
ZcPostingSeqWrite::ZcPostingSeqWrite(PostingListCountFileSeqWrite *countFile)
: Zc4PostingSeqWrite(countFile)
{
- _dynamicK = true;
-}
-
-
-void
-ZcPostingSeqWrite::flushWordNoSkip()
-{
- // Too few document ids for skip info.
- assert(_docIds.size() < _minSkipDocs && _counts._segments.empty());
-
- _encodeFeatures->flush();
- EncodeContext &e = _encodeContext;
- uint32_t numDocs = _docIds.size();
-
- e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS);
-
- uint32_t docIdK = e.calcDocIdK(numDocs, _docIdLimit);
-
- uint32_t baseDocId = 1;
- const uint64_t *features =
- static_cast<const uint64_t *>(_featureWriteContext._comprBuf);
- uint64_t featureOffset = 0;
-
- std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin();
- std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end();
-
- for (; dit != dite; ++dit) {
- uint32_t docId = dit->first;
- uint32_t featureSize = dit->second;
- e.encodeExpGolomb(docId - baseDocId, docIdK);
- baseDocId = docId + 1;
- e.writeBits(features + (featureOffset >> 6),
- featureOffset & 63,
- featureSize);
- featureOffset += featureSize;
- }
- _counts._numDocs += numDocs;
- resetWord();
+ _writer.set_dynamic_k(true);
}
} // namespace search::diskindex
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.h b/searchlib/src/vespa/searchlib/diskindex/zcposting.h
index 8c69a051e83..96cc306cea8 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.h
@@ -2,9 +2,8 @@
#pragma once
-#include "zcbuf.h"
+#include "zc4_posting_writer.h"
#include <vespa/searchlib/index/postinglistfile.h>
-#include <vespa/searchlib/bitcompression/compression.h>
#include <vespa/fastos/file.h>
namespace search::index {
@@ -131,29 +130,8 @@ class Zc4PostingSeqWrite : public index::PostingListFileSeqWrite
protected:
typedef bitcompression::FeatureEncodeContextBE EncodeContext;
- EncodeContext _encodeContext;
- search::ComprFileWriteContext _writeContext;
- FastOS_File _file;
- uint32_t _minChunkDocs; // # of documents needed for chunking
- uint32_t _minSkipDocs; // # of documents needed for skipping
- uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit)
- // Unpacked document ids for word and feature sizes
- typedef std::pair<uint32_t, uint32_t> DocIdAndFeatureSize;
- std::vector<DocIdAndFeatureSize> _docIds;
-
- // Buffer up features in memory
- EncodeContext *_encodeFeatures;
- uint64_t _featureOffset; // Bit offset of next feature
- search::ComprFileWriteContext _featureWriteContext;
- uint64_t _writePos; // Bit position for start of current word
- bool _dynamicK; // Caclulate EG compression parameters ?
- ZcBuf _zcDocIds; // Document id deltas
- ZcBuf _l1Skip; // L1 skip info
- ZcBuf _l2Skip; // L2 skip info
- ZcBuf _l3Skip; // L3 skip info
- ZcBuf _l4Skip; // L4 skip info
-
- uint64_t _numWords; // Number of words in file
+ Zc4PostingWriter<true> _writer;
+ FastOS_File _file;
uint64_t _fileBitSize;
index::PostingListCountFileSeqWrite *const _countFile;
public:
@@ -177,37 +155,10 @@ public:
void getFeatureParams(PostingListParams &params) override;
/**
- * Flush chunk to file.
- */
- void flushChunk();
- void calcSkipInfo();
-
- /**
- * Flush word with skip info to disk
- */
- void flushWordWithSkip(bool hasMore);
-
-
- /**
- * Flush word without skip info to disk.
- */
- virtual void flushWordNoSkip();
-
- /**
- * Prepare for next word or next chunk.
- */
- void resetWord();
-
- /**
* Make header using feature encode write context.
*/
void makeHeader(const search::common::FileHeaderContext &fileHeaderContext);
void updateHeader();
-
- /**
- * Read header, using temporary feature decode context.
- */
- uint32_t readHeader(const vespalib::string &name);
};
@@ -223,7 +174,6 @@ class ZcPostingSeqWrite : public Zc4PostingSeqWrite
{
public:
ZcPostingSeqWrite(index::PostingListCountFileSeqWrite *countFile);
- void flushWordNoSkip() override;
};
}
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
index 3c16fc8e9a8..33819d4f7cb 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp
@@ -3,12 +3,13 @@
#include "fakezcfilterocc.h"
#include "fpfactory.h"
#include <vespa/searchlib/diskindex/zcposocciterators.h>
-#include <vespa/searchlib/diskindex/zcbuf.h>
+#include <vespa/searchlib/diskindex/zc4_posting_writer.h>
using search::fef::TermFieldMatchData;
using search::fef::TermFieldMatchDataArray;
using search::fef::TermFieldMatchDataPosition;
using search::queryeval::SearchIterator;
+using search::index::PostingListCounts;
using search::index::PostingListParams;
using search::index::DocIdAndFeatures;
using search::index::DocIdAndPosOccFeatures;
@@ -24,11 +25,6 @@ namespace search {
namespace fakedata {
-#define L1SKIPSTRIDE 16
-#define L2SKIPSTRIDE 8
-#define L3SKIPSTRIDE 8
-#define L4SKIPSTRIDE 8
-
#define DEBUG_ZCFILTEROCC_PRINTF 0
#define DEBUG_ZCFILTEROCC_ASSERT 0
@@ -137,35 +133,8 @@ void
FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures,
bool dynamicK)
{
- ZcBuf bytes;
- ZcBuf l1SkipBytes;
- ZcBuf l2SkipBytes;
- ZcBuf l3SkipBytes;
- ZcBuf l4SkipBytes;
- uint32_t lastDocId = 0u;
- uint32_t lastL1SkipDocId = 0u;
- uint64_t lastL1SkipDocIdPos = 0;
- uint64_t lastL1SkipFeaturePos = 0;
- unsigned int l1SkipCnt = 0;
- uint32_t lastL2SkipDocId = 0u;
- uint64_t lastL2SkipDocIdPos = 0;
- uint64_t lastL2SkipFeaturePos = 0;
- uint64_t lastL2SkipL1SkipPos = 0;
- unsigned int l2SkipCnt = 0;
- uint32_t lastL3SkipDocId = 0u;
- uint64_t lastL3SkipDocIdPos = 0;
- uint64_t lastL3SkipFeaturePos = 0;
- uint64_t lastL3SkipL1SkipPos = 0;
- uint64_t lastL3SkipL2SkipPos = 0;
- unsigned int l3SkipCnt = 0;
- uint32_t lastL4SkipDocId = 0u;
- uint64_t lastL4SkipDocIdPos = 0;
- uint64_t lastL4SkipFeaturePos = 0;
- uint64_t lastL4SkipL1SkipPos = 0;
- uint64_t lastL4SkipL2SkipPos = 0;
- uint64_t lastL4SkipL3SkipPos = 0;
- unsigned int l4SkipCnt = 0;
- uint64_t featurePos = 0;
+ PostingListCounts counts;
+ Zc4PostingWriter<bigEndian> writer(counts);
typedef FakeWord FW;
typedef FW::DocWordFeatureList DWFL;
@@ -181,288 +150,88 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures,
FeatureEncodeContext<bigEndian> &f = (dynamicK ?
static_cast<FeatureEncodeContext<bigEndian> &>(f1) :
static_cast<FeatureEncodeContext<bigEndian> &>(f0));
- search::ComprFileWriteContext fctx(f);
- f.setWriteContext(&fctx);
- fctx.allocComprBuf(64, 1);
- f.afterWrite(fctx, 0, 0);
+ writer.set_dynamic_k(dynamicK);
+ if (doFeatures) {
+ writer.set_encode_features(&f);
+ }
+ PostingListParams params;
+ params.set("docIdLimit", fw._docIdLimit);
+ params.set("minChunkDocs", 1000000000); // Disable chunking
+ params.set("minSkipDocs", 1u); // Force skip info
+ writer.set_posting_list_params(params);
+ auto &writeContext = writer.get_write_context();
+ search::ComprBuffer &cb = writeContext;
+ auto &e = writer.get_encode_context();
+ writeContext.allocComprBuf(65536u, 32768u);
+ e.setupWrite(cb);
// Ensure that some space is initially available in encoding buffers
- bytes.maybeExpand();
- l1SkipBytes.maybeExpand();
- l2SkipBytes.maybeExpand();
- l3SkipBytes.maybeExpand();
- l4SkipBytes.maybeExpand();
while (d != de) {
- if (l1SkipCnt >= L1SKIPSTRIDE) {
- uint32_t docIdDelta = lastDocId - lastL1SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- l1SkipBytes.encode(docIdDelta - 1);
- uint64_t lastDocIdPos = bytes.size();
- uint32_t docIdPosDelta = lastDocIdPos - lastL1SkipDocIdPos;
- l1SkipBytes.encode(docIdPosDelta - 1);
- if (doFeatures) {
- featurePos = f.getWriteOffset();
- l1SkipBytes.encode(featurePos - lastL1SkipFeaturePos - 1);
- lastL1SkipFeaturePos = featurePos;
- }
-#if DEBUG_ZCFILTEROCC_PRINTF
- printf("L1Encode docId=%d (+%d), docIdPos=%d (+%u)\n",
- lastDocId, docIdDelta,
- (int) lastDocIdPos, docIdPosDelta);
-#endif
- lastL1SkipDocId = lastDocId;
- lastL1SkipDocIdPos = lastDocIdPos;
- l1SkipCnt = 0;
- ++l2SkipCnt;
- if (l2SkipCnt >= L2SKIPSTRIDE) {
- docIdDelta = lastDocId - lastL2SkipDocId;
- docIdPosDelta = lastDocIdPos - lastL2SkipDocIdPos;
- uint64_t lastL1SkipPos = l1SkipBytes.size();
- uint32_t l1SkipPosDelta = lastL1SkipPos - lastL2SkipL1SkipPos;
- l2SkipBytes.encode(docIdDelta - 1);
- l2SkipBytes.encode(docIdPosDelta - 1);
- if (doFeatures) {
- l2SkipBytes.encode(featurePos - lastL2SkipFeaturePos - 1);
- lastL2SkipFeaturePos = featurePos;
- }
- l2SkipBytes.encode(l1SkipPosDelta - 1);
-#if DEBUG_ZCFILTEROCC_PRINTF
- printf("L2Encode docId=%d (+%d), docIdPos=%d (+%u),"
- " l1SkipPos=%d (+%u)\n",
- lastDocId, docIdDelta,
- (int) lastDocIdPos, docIdPosDelta,
- (int) lastL1SkipPos, l1SkipPosDelta);
-#endif
- lastL2SkipDocId = lastDocId;
- lastL2SkipDocIdPos = lastDocIdPos;
- lastL2SkipL1SkipPos = lastL1SkipPos;
- l2SkipCnt = 0;
- ++l3SkipCnt;
- if (l3SkipCnt >= L3SKIPSTRIDE) {
- docIdDelta = lastDocId - lastL3SkipDocId;
- docIdPosDelta = lastDocIdPos - lastL3SkipDocIdPos;
- l1SkipPosDelta = lastL1SkipPos - lastL3SkipL1SkipPos;
- uint64_t lastL2SkipPos = l2SkipBytes.size();
- uint32_t l2SkipPosDelta = lastL2SkipPos -
- lastL3SkipL2SkipPos;
- l3SkipBytes.encode(docIdDelta - 1);
- l3SkipBytes.encode(docIdPosDelta - 1);
- if (doFeatures) {
- l3SkipBytes.encode(featurePos - lastL3SkipFeaturePos - 1);
- lastL3SkipFeaturePos = featurePos;
- }
- l3SkipBytes.encode(l1SkipPosDelta - 1);
- l3SkipBytes.encode(l2SkipPosDelta - 1);
-#if DEBUG_ZCFILTEROCC_PRINTF
- printf("L3Encode docId=%d (+%d), docIdPos=%d (+%u),"
- " l1SkipPos=%d (+%u) l2SkipPos %d (+%u)\n",
- lastDocId, docIdDelta,
- (int) lastDocIdPos, docIdPosDelta,
- (int) lastL1SkipPos, l1SkipPosDelta,
- (int) lastL2SkipPos, l2SkipPosDelta);
-#endif
- lastL3SkipDocId = lastDocId;
- lastL3SkipDocIdPos = lastDocIdPos;
- lastL3SkipL1SkipPos = lastL1SkipPos;
- lastL3SkipL2SkipPos = lastL2SkipPos;
- l3SkipCnt = 0;
- ++l4SkipCnt;
- if (l4SkipCnt >= L4SKIPSTRIDE) {
- docIdDelta = lastDocId - lastL4SkipDocId;
- docIdPosDelta = lastDocIdPos - lastL4SkipDocIdPos;
- l1SkipPosDelta = lastL1SkipPos - lastL4SkipL1SkipPos;
- l2SkipPosDelta = lastL2SkipPos - lastL4SkipL2SkipPos;
- uint64_t lastL3SkipPos = l3SkipBytes.size();
- uint32_t l3SkipPosDelta = lastL3SkipPos -
- lastL4SkipL3SkipPos;
- l4SkipBytes.encode(docIdDelta - 1);
- l4SkipBytes.encode(docIdPosDelta - 1);
- if (doFeatures) {
- l4SkipBytes.encode(featurePos - lastL4SkipFeaturePos - 1);
- lastL4SkipFeaturePos = featurePos;
- }
- l4SkipBytes.encode(l1SkipPosDelta - 1);
- l4SkipBytes.encode(l2SkipPosDelta - 1);
- l4SkipBytes.encode(l3SkipPosDelta - 1);
-#if DEBUG_ZCFILTEROCC_PRINTF
- printf("L4Encode docId=%d (+%d), docIdPos=%d (+%u),"
- " l1SkipPos=%d (+%u) l2SkipPos %d (+%u)"
- " l3SkipPos=%d (+%u)\n",
- lastDocId, docIdDelta,
- (int) lastDocIdPos, docIdPosDelta,
- (int) lastL1SkipPos, l1SkipPosDelta,
- (int) lastL2SkipPos, l2SkipPosDelta,
- (int) lastL3SkipPos, l3SkipPosDelta);
-#endif
- lastL4SkipDocId = lastDocId;
- lastL4SkipDocIdPos = lastDocIdPos;
- lastL4SkipL1SkipPos = lastL1SkipPos;
- lastL4SkipL2SkipPos = lastL2SkipPos;
- lastL4SkipL3SkipPos = lastL3SkipPos;
- l4SkipCnt = 0;
- }
- }
- }
- }
- if (lastDocId == 0u) {
- bytes.encode(d->_docId - 1);
-#if DEBUG_ZCFILTEROCC_PRINTF
- printf("Encode docId=%d\n",
- d->_docId);
-#endif
- } else {
- uint32_t docIdDelta = d->_docId - lastDocId;
- bytes.encode(docIdDelta - 1);
-#if DEBUG_ZCFILTEROCC_PRINTF
- printf("Encode docId=%d (+%d)\n",
- d->_docId, docIdDelta);
-#endif
- }
if (doFeatures) {
fw.setupFeatures(*d, &*p, features);
p += d->_positions;
- f.writeFeatures(features);
+ } else {
+ features.clear(d->_docId);
}
- lastDocId = d->_docId;
- ++l1SkipCnt;
+ writer.write_docid_and_features(features);
++d;
}
if (doFeatures) {
assert(p == pe);
- _featuresSize = f.getWriteOffset();
- // First pad to 64 bits.
- uint32_t pad = (64 - f.getWriteOffset()) & 63;
- while (pad > 0) {
- uint32_t now = std::min(32u, pad);
- f.writeBits(0, now);
- f.writeComprBufferIfNeeded();
- pad -= now;
- }
-
- // Then write 128 more bits. This allows for 64-bit decoding
- // with a readbits that always leaves a nonzero preRead
- for (unsigned int i = 0; i < 4; i++) {
- f.writeBits(0, 32);
- f.writeComprBufferIfNeeded();
- }
- f.writeComprBufferIfNeeded();
- f.flush();
- f.writeComprBuffer();
- } else {
- _featuresSize = 0;
- }
- // Extra partial entries for skip tables to simplify iterator during search
- if (l1SkipBytes.size() > 0) {
- uint32_t docIdDelta = lastDocId - lastL1SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- l1SkipBytes.encode(docIdDelta - 1);
- }
- if (l2SkipBytes.size() > 0) {
- uint32_t docIdDelta = lastDocId - lastL2SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- l2SkipBytes.encode(docIdDelta - 1);
- }
- if (l3SkipBytes.size() > 0) {
- uint32_t docIdDelta = lastDocId - lastL3SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- l3SkipBytes.encode(docIdDelta - 1);
- }
- if (l4SkipBytes.size() > 0) {
- uint32_t docIdDelta = lastDocId - lastL4SkipDocId;
- assert(static_cast<int32_t>(docIdDelta) > 0);
- l4SkipBytes.encode(docIdDelta - 1);
}
+ writer.flush_word();
+ _featuresSize = 0;
_hitDocs = fw._postings.size();
_docIdLimit = fw._docIdLimit;
- _lastDocId = lastDocId;
- FeatureEncodeContext<bigEndian> e;
- ComprFileWriteContext ectx(e);
- e.setWriteContext(&ectx);
- ectx.allocComprBuf(64, 1);
- e.afterWrite(ectx, 0, 0);
+ _compressedBits = e.getWriteOffset();
+ assert(_compressedBits == counts._bitLength);
+ assert(_hitDocs == counts._numDocs);
+ _lastDocId = fw._postings.back()._docId;
+ writer.on_close();
- // Encode word header
- e.encodeExpGolomb(_hitDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS);
- _docIdsSize = bytes.size() * 8;
- _l1SkipSize = l1SkipBytes.size();
- _l2SkipSize = _l3SkipSize = _l4SkipSize = 0;
- if (_l1SkipSize != 0)
- _l2SkipSize = l2SkipBytes.size();
- if (_l2SkipSize != 0)
- _l3SkipSize = l3SkipBytes.size();
- if (_l3SkipSize != 0)
- _l4SkipSize = l4SkipBytes.size();
-
- e.encodeExpGolomb(bytes.size() - 1, K_VALUE_ZCPOSTING_DOCIDSSIZE);
- e.encodeExpGolomb(_l1SkipSize, K_VALUE_ZCPOSTING_L1SKIPSIZE);
- e.writeComprBufferIfNeeded();
+ std::pair<void *, size_t> ectxData = writeContext.grabComprBuffer(_compressedMalloc);
+ _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first),
+ ectxData.second);
+ read_header<bigEndian>(doFeatures, dynamicK, writer.get_min_skip_docs(), writer.get_min_chunk_docs());
+}
+
+template <bool bigEndian>
+void
+FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_docs, uint32_t min_chunk_docs)
+{
+ // read back word header to get skip sizes
+ using EC = FeatureEncodeContext<bigEndian>;
+ UC64_DECODECONTEXT(o);
+ uint32_t length;
+ uint64_t val64;
+ UC64_SETUPBITS_NS(o, _compressed.first, 0, EC);
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
+ assert(static_cast<uint32_t>(val64) + 1 == _hitDocs);
+ assert(_hitDocs >= min_skip_docs);
+ assert(_hitDocs < min_chunk_docs);
+ uint32_t docIdK = dynamicK ? EC::calcDocIdK(_hitDocs, _docIdLimit) : K_VALUE_ZCPOSTING_LASTDOCID;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
+ _docIdsSize = val64 + 1;
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC);
+ _l1SkipSize = val64;
if (_l1SkipSize != 0) {
- e.encodeExpGolomb(_l2SkipSize, K_VALUE_ZCPOSTING_L2SKIPSIZE);
- if (_l2SkipSize != 0) {
- e.writeComprBufferIfNeeded();
- e.encodeExpGolomb(_l3SkipSize, K_VALUE_ZCPOSTING_L3SKIPSIZE);
- if (_l3SkipSize != 0) {
- e.encodeExpGolomb(_l4SkipSize, K_VALUE_ZCPOSTING_L4SKIPSIZE);
- }
- }
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
+ _l2SkipSize = val64;
}
- e.writeComprBufferIfNeeded();
- if (doFeatures) {
- e.encodeExpGolomb(_featuresSize, K_VALUE_ZCPOSTING_FEATURESSIZE);
- }
- uint32_t docIdK = e.calcDocIdK(_hitDocs, _docIdLimit);
- if (dynamicK)
- e.encodeExpGolomb(_docIdLimit - 1 - _lastDocId, docIdK);
- else
- e.encodeExpGolomb(_docIdLimit - 1 - _lastDocId,
- K_VALUE_ZCPOSTING_LASTDOCID);
- uint64_t bytePad = (- e.getWriteOffset()) & 7;
- if (bytePad > 0)
- e.writeBits(0, bytePad);
- size_t docIdSize = bytes.size();
- if (docIdSize > 0) {
- writeZcBuf(e, bytes);
+ if (_l2SkipSize != 0) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
+ _l3SkipSize = val64;
}
- if (_l1SkipSize > 0) {
- writeZcBuf(e, l1SkipBytes);
- if (_l2SkipSize > 0) {
- writeZcBuf(e, l2SkipBytes);
- if (_l3SkipSize > 0) {
- writeZcBuf(e, l3SkipBytes);
- if (_l4SkipSize > 0) {
- writeZcBuf(e, l4SkipBytes);
- }
- }
- }
+ if (_l3SkipSize != 0) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
+ _l4SkipSize = val64;
}
if (doFeatures) {
- e.writeBits(static_cast<const uint64_t *>(fctx._comprBuf),
- 0,
- _featuresSize);
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC);
+ _featuresSize = val64;
}
- _compressedBits = e.getWriteOffset();
- // First pad to 64 bits.
- uint32_t pad = (64 - e.getWriteOffset()) & 63;
- while (pad > 0) {
- uint32_t now = std::min(32u, pad);
- e.writeBits(0, now);
- e.writeComprBufferIfNeeded();
- pad -= now;
- }
-
- // Then write 128 more bits. This allows for 64-bit decoding
- // with a readbits that always leaves a nonzero preRead
- for (unsigned int i = 0; i < 4; i++) {
- e.writeBits(0, 32);
- e.writeComprBufferIfNeeded();
- }
- e.writeComprBufferIfNeeded();
- e.flush();
- e.writeComprBuffer();
-
- std::pair<void *, size_t> ectxData = ectx.grabComprBuffer(_compressedMalloc);
- _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first),
- ectxData.second);
+ UC64_DECODEEXPGOLOMB_NS(o, docIdK, EC);
+ assert(_lastDocId == _docIdLimit - 1 - val64);
}
diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
index d5df198acdc..b68e3866461 100644
--- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
+++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h
@@ -37,6 +37,9 @@ protected:
template <bool bigEndian>
void setupT(const FakeWord &fw, bool doFeatures, bool dynamicK);
+ template <bool bigEndian>
+    void read_header(bool do_features, bool dynamic_k, uint32_t min_skip_docs, uint32_t min_chunk_docs);
+
public:
FakeZcFilterOcc(const FakeWord &fw);
FakeZcFilterOcc(const FakeWord &fw, bool bigEndian, const char *nameSuffix);