summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@broadpark.no> 2019-04-26 11:08:17 +0200
committer Tor Egge <Tor.Egge@broadpark.no> 2019-04-28 23:49:16 +0200
commit 7553e0390c1ceb3834cba62774b3ddc77a6944d1 (patch)
tree 0f524636b34a18fa5948889d2f1b3f01a78c9881 /searchlib
parent e287c58dce2df5eb5451a61000aab34553698a55 (diff)
Factor out Zc4PostingReader from Zc4PostingSeqRead.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/extposocc.cpp8
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp424
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h96
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp26
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposocc.h2
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposting.cpp561
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/zcposting.h92
-rw-r--r--searchlib/src/vespa/searchlib/index/postinglistfile.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/index/postinglistfile.h31
10 files changed, 573 insertions, 670 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
index 104994ad038..2fea4f2bab7 100644
--- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
@@ -19,6 +19,7 @@ vespa_add_library(searchlib_diskindex OBJECT
pagedict4randread.cpp
wordnummapper.cpp
zc4_posting_header.cpp
+ zc4_posting_reader.cpp
zc4_posting_writer.cpp
zc4_posting_writer_base.cpp
zcbuf.cpp
diff --git a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
index f6e4da945e0..34e64a9b558 100644
--- a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
@@ -69,7 +69,7 @@ makePosOccWrite(const vespalib::string &name,
fileHeader.getBigEndian() &&
fileHeader.getFormats().size() == 2 &&
fileHeader.getFormats()[0] ==
- ZcPosOccSeqRead::getIdentifier() &&
+ Zc4PosOccSeqRead::getIdentifier(true) &&
fileHeader.getFormats()[1] ==
ZcPosOccSeqRead::getSubIdentifier()) {
dynamicK = true;
@@ -77,7 +77,7 @@ makePosOccWrite(const vespalib::string &name,
fileHeader.getBigEndian() &&
fileHeader.getFormats().size() == 2 &&
fileHeader.getFormats()[0] ==
- Zc4PosOccSeqRead::getIdentifier() &&
+ Zc4PosOccSeqRead::getIdentifier(false) &&
fileHeader.getFormats()[1] ==
Zc4PosOccSeqRead::getSubIdentifier()) {
dynamicK = false;
@@ -115,7 +115,7 @@ makePosOccRead(const vespalib::string &name,
fileHeader.getBigEndian() &&
fileHeader.getFormats().size() == 2 &&
fileHeader.getFormats()[0] ==
- ZcPosOccSeqRead::getIdentifier() &&
+ Zc4PosOccSeqRead::getIdentifier(true) &&
fileHeader.getFormats()[1] ==
ZcPosOccSeqRead::getSubIdentifier()) {
dynamicK = true;
@@ -123,7 +123,7 @@ makePosOccRead(const vespalib::string &name,
fileHeader.getBigEndian() &&
fileHeader.getFormats().size() == 2 &&
fileHeader.getFormats()[0] ==
- Zc4PosOccSeqRead::getIdentifier() &&
+ Zc4PosOccSeqRead::getIdentifier(false) &&
fileHeader.getFormats()[1] ==
Zc4PosOccSeqRead::getSubIdentifier()) {
dynamicK = false;
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
new file mode 100644
index 00000000000..c9b8cf0b017
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
@@ -0,0 +1,424 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "zc4_posting_reader.h"
+#include <vespa/searchlib/index/docidandfeatures.h>
+
+namespace search::diskindex {
+
+using index::PostingListCounts;
+using index::DocIdAndFeatures;
+using bitcompression::FeatureEncodeContext;
+
+
+/*
+ * Construct a reader with no decode context attached and all reading
+ * state zeroed. set_decode_features() must be called before any word
+ * is read, since the read functions dereference _decodeContext.
+ *
+ * dynamic_k is forwarded to the posting parameters. The numeric
+ * arguments (64, 1 << 30, 10000000) are the same values the former
+ * Zc4PostingSeqRead constructor used for min skip docs, min chunk docs
+ * and docid limit; presumably the parameter order of Zc4PostingParams
+ * matches that - confirm against zc4_posting_params.h.
+ */
+template <bool bigEndian>
+Zc4PostingReader<bigEndian>::Zc4PostingReader(bool dynamic_k)
+    : _decodeContext(nullptr),
+      _docIdK(K_VALUE_ZCPOSTING_DELTA_DOCID),
+      _prevDocId(0),
+      _numDocs(0),
+      _readContext(sizeof(uint64_t)),
+      _has_more(false),
+      _posting_params(64, 1 << 30, 10000000, dynamic_k, true),
+      _lastDocId(0),
+      _zcDocIds(),
+      _l1Skip(),
+      _l2Skip(),
+      _l3Skip(),
+      _l4Skip(),
+      _numWords(0), // declared between _l4Skip and _chunkNo; must not be left indeterminate
+      _chunkNo(0),
+      _l1SkipDocId(0),
+      _l1SkipDocIdPos(0),
+      _l1SkipFeaturesPos(0),
+      _l2SkipDocId(0),
+      _l2SkipDocIdPos(0),
+      _l2SkipL1SkipPos(0),
+      _l2SkipFeaturesPos(0),
+      _l3SkipDocId(0),
+      _l3SkipDocIdPos(0),
+      _l3SkipL1SkipPos(0),
+      _l3SkipL2SkipPos(0),
+      _l3SkipFeaturesPos(0),
+      _l4SkipDocId(0),
+      _l4SkipDocIdPos(0),
+      _l4SkipL1SkipPos(0),
+      _l4SkipL2SkipPos(0),
+      _l4SkipL3SkipPos(0),
+      _l4SkipFeaturesPos(0),
+      _featuresSize(0),
+      _counts(),
+      _residue(0)
+{
+}
+
+// No explicit cleanup is performed here; in particular the decode
+// context pointer is not deleted by the reader.
+template <bool bigEndian>
+Zc4PostingReader<bigEndian>::~Zc4PostingReader() = default;
+
+// Read the next docid and its features for a word that has skip info.
+//
+// The docid delta is taken from the _zcDocIds buffer while the features
+// are read from the decode context. Whenever the docid passes the current
+// skip docid of a level (L1..L4), the next entry of that skip buffer is
+// decoded and every position it carries is checked by assert against the
+// real buffer positions. Nothing is actually skipped; the skip info is
+// only validated.
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::read_common_word_doc_id_and_features(DocIdAndFeatures &features)
+{
+ if ((_zcDocIds._valI >= _zcDocIds._valE) && _has_more) {
+ read_word_start(); // Read start of next chunk
+ }
+ // Split docid & features.
+ assert(_zcDocIds._valI < _zcDocIds._valE);
+ // Position in the docid buffer before decoding, compared with skip info below.
+ uint32_t docIdPos = _zcDocIds.pos();
+ uint32_t docId = _prevDocId + 1 + _zcDocIds.decode();
+ features._docId = docId;
+ _prevDocId = docId;
+ assert(docId <= _lastDocId);
+ // Passed the current L1 skip docid: consume the next L1 skip entry.
+ // Each entry is decoded as position deltas first; its docid delta is
+ // decoded last, after any nested higher-level entries were handled.
+ if (docId > _l1SkipDocId) {
+ _l1SkipDocIdPos += _l1Skip.decode() + 1;
+ assert(docIdPos == _l1SkipDocIdPos);
+ _l1SkipFeaturesPos += _l1Skip.decode() + 1;
+ uint64_t featuresPos = _decodeContext->getReadOffset();
+ assert(featuresPos == _l1SkipFeaturesPos);
+ (void) featuresPos; // only used in asserts
+ if (docId > _l2SkipDocId) {
+ _l2SkipDocIdPos += _l2Skip.decode() + 1;
+ assert(docIdPos == _l2SkipDocIdPos);
+ _l2SkipFeaturesPos += _l2Skip.decode() + 1;
+ assert(featuresPos == _l2SkipFeaturesPos);
+ _l2SkipL1SkipPos += _l2Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l2SkipL1SkipPos);
+ if (docId > _l3SkipDocId) {
+ _l3SkipDocIdPos += _l3Skip.decode() + 1;
+ assert(docIdPos == _l3SkipDocIdPos);
+ _l3SkipFeaturesPos += _l3Skip.decode() + 1;
+ assert(featuresPos == _l3SkipFeaturesPos);
+ _l3SkipL1SkipPos += _l3Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l3SkipL1SkipPos);
+ _l3SkipL2SkipPos += _l3Skip.decode() + 1;
+ assert(_l2Skip.pos() == _l3SkipL2SkipPos);
+ if (docId > _l4SkipDocId) {
+ _l4SkipDocIdPos += _l4Skip.decode() + 1;
+ assert(docIdPos == _l4SkipDocIdPos);
+ (void) docIdPos; // only used in asserts
+ _l4SkipFeaturesPos += _l4Skip.decode() + 1;
+ assert(featuresPos == _l4SkipFeaturesPos);
+ _l4SkipL1SkipPos += _l4Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l4SkipL1SkipPos);
+ _l4SkipL2SkipPos += _l4Skip.decode() + 1;
+ assert(_l2Skip.pos() == _l4SkipL2SkipPos);
+ _l4SkipL3SkipPos += _l4Skip.decode() + 1;
+ assert(_l3Skip.pos() == _l4SkipL3SkipPos);
+ _l4SkipDocId += _l4Skip.decode() + 1;
+ assert(_l4SkipDocId <= _lastDocId);
+ assert(_l4SkipDocId >= docId);
+ }
+ _l3SkipDocId += _l3Skip.decode() + 1;
+ assert(_l3SkipDocId <= _lastDocId);
+ assert(_l3SkipDocId <= _l4SkipDocId);
+ assert(_l3SkipDocId >= docId);
+ }
+ _l2SkipDocId += _l2Skip.decode() + 1;
+ assert(_l2SkipDocId <= _lastDocId);
+ assert(_l2SkipDocId <= _l4SkipDocId);
+ assert(_l2SkipDocId <= _l3SkipDocId);
+ assert(_l2SkipDocId >= docId);
+ }
+ _l1SkipDocId += _l1Skip.decode() + 1;
+ assert(_l1SkipDocId <= _lastDocId);
+ assert(_l1SkipDocId <= _l4SkipDocId);
+ assert(_l1SkipDocId <= _l3SkipDocId);
+ assert(_l1SkipDocId <= _l2SkipDocId);
+ assert(_l1SkipDocId >= docId);
+ }
+ if (docId < _lastDocId) {
+ // Assert more space available when not yet at last docid
+ assert(_zcDocIds._valI < _zcDocIds._valE);
+ } else {
+ // Assert that space has been used when at last docid
+ assert(_zcDocIds._valI == _zcDocIds._valE);
+ // Assert that we've read to end of skip info
+ assert(_l1SkipDocId == _lastDocId);
+ assert(_l2SkipDocId == _lastDocId);
+ assert(_l3SkipDocId == _lastDocId);
+ assert(_l4SkipDocId == _lastDocId);
+ if (!_has_more) {
+ // Last docid of the last chunk: reset chunk numbering.
+ _chunkNo = 0;
+ }
+ }
+ _decodeContext->readFeatures(features);
+ --_residue;
+}
+
+/*
+ * Read the next docid and its features into 'features'.
+ *
+ * When nothing is left to read (_residue is 0 and no further chunk
+ * follows), 'features' is cleared with docid static_cast<uint32_t>(-1)
+ * and nothing is consumed from the decode context.
+ *
+ * Words with skip info are delegated to
+ * read_common_word_doc_id_and_features(). They are recognized by
+ * _lastDocId > 0, since read_word_start() sets _lastDocId to 0 for
+ * words without skip info. For the remaining words the docid delta is
+ * decoded from the same bit stream as the features.
+ */
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features)
+{
+    if (_residue == 0 && !_has_more) {
+        // Don't read past end of posting list.
+        features.clear(static_cast<uint32_t>(-1));
+        return;
+    }
+    if (_lastDocId > 0) {
+        read_common_word_doc_id_and_features(features);
+        return;
+    }
+    // Interleaves docid & features
+    using EC = FeatureEncodeContext<bigEndian>;
+    DecodeContext &d = *_decodeContext;
+    uint32_t length;  // referenced by the UC64_* decode macros
+    uint64_t val64;   // receives the decoded value from the UC64_* macros
+    UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
+
+    UC64_DECODEEXPGOLOMB_SMALL_NS(o, _docIdK, EC);
+    uint32_t docId = _prevDocId + 1 + val64;
+    features._docId = docId;
+    _prevDocId = docId;
+    UC64_DECODECONTEXT_STORE(o, d._);
+    if (__builtin_expect(oCompr >= d._valE, false)) {
+        // Read position reached the end of the current compressed buffer.
+        _readContext.readComprBuffer();
+    }
+    _decodeContext->readFeatures(features);
+    --_residue;
+}
+
+// Read the rest of a word/chunk header for a word that has skip info.
+//
+// Called by read_word_start() after the document count was decoded.
+// Decodes, in order: the optional has-more flag, the sizes of the docid
+// buffer and of the L1..L4 skip buffers, the features size and the last
+// docid. It then skips the pad bits, reads the byte buffers themselves
+// and resets the state used for validating skip info while reading.
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::read_word_start_with_skip()
+{
+ using EC = FeatureEncodeContext<bigEndian>;
+ DecodeContext &d = *_decodeContext;
+ UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
+ uint32_t length;
+ uint64_t val64;
+ const uint64_t *valE = d._valE;
+
+ // _has_more still describes the previous chunk here: if it was set,
+ // this header belongs to the next chunk of the same word.
+ if (_has_more) {
+ ++_chunkNo;
+ } else {
+ _chunkNo = 0;
+ }
+ assert(_numDocs >= _posting_params._min_skip_docs || _has_more);
+ bool has_more = false;
+ // The has-more flag is only present when the document count reaches
+ // the chunking threshold. It is a single bit taken from the top of
+ // oVal for big endian and from the bottom for little endian.
+ if (__builtin_expect(_numDocs >= _posting_params._min_chunk_docs, false)) {
+ if (bigEndian) {
+ has_more = static_cast<int64_t>(oVal) < 0;
+ oVal <<= 1;
+ } else {
+ has_more = (oVal & 1) != 0;
+ oVal >>= 1;
+ }
+ length = 1;
+ UC64_READBITS_NS(o, EC);
+ }
+ if (_posting_params._dynamic_k) {
+ // Chunked words use a document count of 1 when calculating k.
+ _docIdK = EC::calcDocIdK((_has_more || has_more) ? 1 : _numDocs,
+ _posting_params._doc_id_limit);
+ }
+ // Cross-check the header against the counts given to set_counts().
+ if (_has_more || has_more) {
+ assert(has_more == (_chunkNo + 1 < _counts._segments.size()));
+ assert(_numDocs == _counts._segments[_chunkNo]._numDocs);
+ if (has_more) {
+ assert(_numDocs >= _posting_params._min_skip_docs);
+ assert(_numDocs >= _posting_params._min_chunk_docs);
+ }
+ } else {
+ assert(_numDocs >= _posting_params._min_skip_docs);
+ assert(_numDocs == _counts._numDocs);
+ }
+ // Read position reached the end of the current compressed buffer:
+ // store the local decode state, read more data and reload the state.
+ // The same pattern is repeated between the decode steps below.
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
+ uint32_t docIdsSize = val64 + 1;
+ UC64_DECODEEXPGOLOMB_NS(o,
+ K_VALUE_ZCPOSTING_L1SKIPSIZE,
+ EC);
+ uint32_t l1SkipSize = val64;
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ // The size of a skip level is only present in the header when the
+ // level below it is non-empty.
+ uint32_t l2SkipSize = 0;
+ if (l1SkipSize != 0) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
+ l2SkipSize = val64;
+ }
+ uint32_t l3SkipSize = 0;
+ if (l2SkipSize != 0) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
+ l3SkipSize = val64;
+ }
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ uint32_t l4SkipSize = 0;
+ if (l3SkipSize != 0) {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
+ l4SkipSize = val64;
+ }
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC);
+ _featuresSize = val64;
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ // The last docid is stored as its distance below (docid limit - 1).
+ if (_posting_params._dynamic_k) {
+ UC64_DECODEEXPGOLOMB_NS(o, _docIdK, EC);
+ } else {
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_LASTDOCID, EC);
+ }
+ _lastDocId = _posting_params._doc_id_limit - 1 - val64;
+ if (_has_more || has_more) {
+ assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc);
+ }
+
+ if (__builtin_expect(oCompr >= valE, false)) {
+ UC64_DECODECONTEXT_STORE(o, d._);
+ _readContext.readComprBuffer();
+ valE = d._valE;
+ UC64_DECODECONTEXT_LOAD(o, d._);
+ }
+ // Drop (oPreRead & 7) pad bits; presumably this aligns the read
+ // position to a byte boundary for the readBytes() calls below -
+ // confirm against the writer.
+ uint64_t bytePad = oPreRead & 7;
+ if (bytePad > 0) {
+ length = bytePad;
+ if (bigEndian) {
+ oVal <<= length;
+ } else {
+ oVal >>= length;
+ }
+ UC64_READBITS_NS(o, EC);
+ }
+ UC64_DECODECONTEXT_STORE(o, d._);
+ if (__builtin_expect(oCompr >= valE, false)) {
+ _readContext.readComprBuffer();
+ }
+ // Read the docid buffer and the non-empty skip buffers as raw bytes.
+ _zcDocIds.clearReserve(docIdsSize);
+ _l1Skip.clearReserve(l1SkipSize);
+ _l2Skip.clearReserve(l2SkipSize);
+ _l3Skip.clearReserve(l3SkipSize);
+ _l4Skip.clearReserve(l4SkipSize);
+ _decodeContext->readBytes(_zcDocIds._valI, docIdsSize);
+ _zcDocIds._valE = _zcDocIds._valI + docIdsSize;
+ if (l1SkipSize > 0) {
+ _decodeContext->readBytes(_l1Skip._valI, l1SkipSize);
+ }
+ _l1Skip._valE = _l1Skip._valI + l1SkipSize;
+ if (l2SkipSize > 0) {
+ _decodeContext->readBytes(_l2Skip._valI, l2SkipSize);
+ }
+ _l2Skip._valE = _l2Skip._valI + l2SkipSize;
+ if (l3SkipSize > 0) {
+ _decodeContext->readBytes(_l3Skip._valI, l3SkipSize);
+ }
+ _l3Skip._valE = _l3Skip._valI + l3SkipSize;
+ if (l4SkipSize > 0) {
+ _decodeContext->readBytes(_l4Skip._valI, l4SkipSize);
+ }
+ _l4Skip._valE = _l4Skip._valI + l4SkipSize;
+
+ // First skip docid per level: decoded from the skip buffer when the
+ // level exists, otherwise the last docid of the chunk so the level is
+ // never passed while reading.
+ if (l1SkipSize > 0) {
+ _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId;
+ } else {
+ _l1SkipDocId = _lastDocId;
+ }
+ if (l2SkipSize > 0) {
+ _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId;
+ } else {
+ _l2SkipDocId = _lastDocId;
+ }
+ if (l3SkipSize > 0) {
+ _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId;
+ } else {
+ _l3SkipDocId = _lastDocId;
+ }
+ if (l4SkipSize > 0) {
+ _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId;
+ } else {
+ _l4SkipDocId = _lastDocId;
+ }
+ // Reset expected positions: buffer positions start at 0, feature
+ // positions start at the current read offset of the decode context.
+ _l1SkipDocIdPos = 0;
+ _l1SkipFeaturesPos = _decodeContext->getReadOffset();
+ _l2SkipDocIdPos = 0;
+ _l2SkipL1SkipPos = 0;
+ _l2SkipFeaturesPos = _decodeContext->getReadOffset();
+ _l3SkipDocIdPos = 0;
+ _l3SkipL1SkipPos = 0;
+ _l3SkipL2SkipPos = 0;
+ _l3SkipFeaturesPos = _decodeContext->getReadOffset();
+ _l4SkipDocIdPos = 0;
+ _l4SkipL1SkipPos = 0;
+ _l4SkipL2SkipPos = 0;
+ _l4SkipL3SkipPos = 0;
+ _l4SkipFeaturesPos = _decodeContext->getReadOffset();
+ _has_more = has_more;
+ // Decode context is now positioned at start of features
+}
+
+// Read the header at the start of a word, or of the next chunk of a word.
+//
+// Decodes the document count and sets _numDocs and _residue from it.
+// _prevDocId continues from _lastDocId when a previous chunk announced
+// more data, otherwise it restarts at 0. Words with at least
+// _min_skip_docs documents (and all continuation chunks) have the rest
+// of their header read by read_word_start_with_skip(); for other words
+// _lastDocId is set to 0.
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::read_word_start()
+{
+ using EC = FeatureEncodeContext<bigEndian>;
+ UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_);
+ uint32_t length;
+ uint64_t val64;
+ const uint64_t *valE = _decodeContext->_valE;
+
+ UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
+ UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
+ if (oCompr >= valE) {
+ _readContext.readComprBuffer();
+ }
+ // The document count is stored minus one.
+ _numDocs = static_cast<uint32_t>(val64) + 1;
+ _residue = _numDocs;
+ _prevDocId = _has_more ? _lastDocId : 0u;
+ assert(_numDocs <= _counts._numDocs);
+ assert(_numDocs == _counts._numDocs ||
+ _numDocs >= _posting_params._min_chunk_docs ||
+ _has_more);
+
+ if (_numDocs >= _posting_params._min_skip_docs || _has_more) {
+ read_word_start_with_skip();
+ // Decode context is now positioned at start of features
+ } else {
+ if (_posting_params._dynamic_k) {
+ _docIdK = EC::calcDocIdK(_numDocs, _posting_params._doc_id_limit);
+ }
+ _lastDocId = 0u;
+ // Decode context is now positioned at start of docids & features
+ }
+}
+
+/*
+ * Install the counts for the next word. If the word has documents,
+ * its header is read right away. All documents of the previous word
+ * must have been read before this is called.
+ */
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::set_counts(const PostingListCounts &counts)
+{
+    // Previous words must have been read.
+    assert(!_has_more && _residue == 0);
+    _counts = counts;
+    // Zero documents and zero bit length must go together.
+    assert((_counts._numDocs == 0) == (_counts._bitLength == 0));
+    if (!(_counts._numDocs > 0)) {
+        return; // Empty word: no header to read.
+    }
+    read_word_start();
+}
+
+/*
+ * Select the decode context used for reading and link it with the
+ * reader's read context in both directions.
+ */
+template <bool bigEndian>
+void
+Zc4PostingReader<bigEndian>::set_decode_features(DecodeContext *decode_features)
+{
+    DecodeContext *const ctx = decode_features;
+    _decodeContext = ctx;
+    ctx->setReadContext(&_readContext);
+    _readContext.setDecodeContext(ctx);
+}
+
+// Explicit instantiations for both values of the bigEndian parameter.
+template class Zc4PostingReader<false>;
+template class Zc4PostingReader<true>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
new file mode 100644
index 00000000000..d8161da15d5
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
@@ -0,0 +1,96 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "zc4_posting_writer.h"
+#include <vespa/searchlib/index/postinglistfile.h>
+#include <vespa/fastos/file.h>
+#include "zc4_posting_params.h"
+
+namespace search::index {
+ class PostingListCountFileSeqRead;
+}
+
+namespace search::diskindex {
+
+/*
+ * Class used to read posting lists of type "Zc.4" and "Zc.5" (dynamic k).
+ *
+ * Common words have docid deltas and skip info separate from
+ * features.
+ *
+ * Rare words do not have skip info, and docid deltas and features are
+ * interleaved.
+ *
+ * Usage: attach a decode context with set_decode_features(), then for
+ * each word call set_counts() followed by read_doc_id_and_features()
+ * until all documents of the word have been read.
+ */
+template <bool bigEndian>
+class Zc4PostingReader
+{
+
+protected:
+    using DecodeContext = bitcompression::FeatureDecodeContext<bigEndian>;
+
+    DecodeContext *_decodeContext; // Set by set_decode_features(), not deleted by the reader
+    uint32_t _docIdK;              // k value used when decoding docid deltas
+    uint32_t _prevDocId;           // Previous document id
+    uint32_t _numDocs;             // Documents in chunk or word
+    search::ComprFileReadContext _readContext;
+    bool _has_more;                // More chunks follow the current one
+    Zc4PostingParams _posting_params;
+    uint32_t _lastDocId;           // last document in chunk or word
+
+    ZcBuf _zcDocIds;               // Document id deltas
+    ZcBuf _l1Skip;                 // L1 skip info
+    ZcBuf _l2Skip;                 // L2 skip info
+    ZcBuf _l3Skip;                 // L3 skip info
+    ZcBuf _l4Skip;                 // L4 skip info
+
+    uint64_t _numWords;            // Number of words in file
+    uint32_t _chunkNo;             // Chunk number
+
+    // Variables for validating skip information while reading
+    uint32_t _l1SkipDocId;
+    uint32_t _l1SkipDocIdPos;
+    uint64_t _l1SkipFeaturesPos;
+    uint32_t _l2SkipDocId;
+    uint32_t _l2SkipDocIdPos;
+    uint32_t _l2SkipL1SkipPos;
+    uint64_t _l2SkipFeaturesPos;
+    uint32_t _l3SkipDocId;
+    uint32_t _l3SkipDocIdPos;
+    uint32_t _l3SkipL1SkipPos;
+    uint32_t _l3SkipL2SkipPos;
+    uint64_t _l3SkipFeaturesPos;
+    uint32_t _l4SkipDocId;
+    uint32_t _l4SkipDocIdPos;
+    uint32_t _l4SkipL1SkipPos;
+    uint32_t _l4SkipL2SkipPos;
+    uint32_t _l4SkipL3SkipPos;
+    uint64_t _l4SkipFeaturesPos;
+
+    // Variable for validating chunk information while reading
+    uint64_t _featuresSize;
+    index::PostingListCounts _counts;
+
+    uint32_t _residue;             // Number of unread documents after word header
+
+    // Read next docid and features for a word that has skip info.
+    void read_common_word_doc_id_and_features(index::DocIdAndFeatures &features);
+    // Read the part of a word/chunk header that is only present with skip info.
+    void read_word_start_with_skip();
+    // Read the header at the start of a word or chunk.
+    void read_word_start();
+public:
+    // explicit: a bool must not convert silently into a reader.
+    explicit Zc4PostingReader(bool dynamic_k);
+    Zc4PostingReader(const Zc4PostingReader &) = delete;
+    Zc4PostingReader(Zc4PostingReader &&) = delete;
+    Zc4PostingReader &operator=(const Zc4PostingReader &) = delete;
+    Zc4PostingReader &operator=(Zc4PostingReader &&) = delete;
+    ~Zc4PostingReader();
+    // Read next docid and features; see the definition for end-of-list handling.
+    void read_doc_id_and_features(index::DocIdAndFeatures &features);
+    // Install counts for the next word and read its header if it has documents.
+    void set_counts(const index::PostingListCounts &counts);
+    // Attach the decode context to read from.
+    void set_decode_features(DecodeContext *decode_features);
+    // Requires that set_decode_features() has been called (dereferences the pointer).
+    DecodeContext &get_decode_features() const { return *_decodeContext; }
+    ComprFileReadContext &get_read_context() { return _readContext; }
+    Zc4PostingParams &get_posting_params() { return _posting_params; }
+};
+
+extern template class Zc4PostingReader<false>;
+extern template class Zc4PostingReader<true>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
index 10c08af92cb..3ae2a631cb1 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
@@ -16,14 +16,12 @@ using search::index::PostingListCountFileSeqRead;
using search::index::PostingListCountFileSeqWrite;
Zc4PosOccSeqRead::Zc4PosOccSeqRead(PostingListCountFileSeqRead *countFile)
- : Zc4PostingSeqRead(countFile),
+ : Zc4PostingSeqRead(countFile, false),
_fieldsParams(),
_cookedDecodeContext(&_fieldsParams),
_rawDecodeContext(&_fieldsParams)
{
- _decodeContext = &_cookedDecodeContext;
- _decodeContext->setReadContext(&_readContext);
- _readContext.setDecodeContext(_decodeContext);
+ _reader.set_decode_features(&_cookedDecodeContext);
}
@@ -31,18 +29,17 @@ void
Zc4PosOccSeqRead::
setFeatureParams(const PostingListParams &params)
{
- bool oldCooked = _decodeContext == &_cookedDecodeContext;
+ bool oldCooked = &_reader.get_decode_features() == &_cookedDecodeContext;
bool newCooked = oldCooked;
params.get("cooked", newCooked);
if (oldCooked != newCooked) {
if (newCooked) {
_cookedDecodeContext = _rawDecodeContext;
- _decodeContext = &_cookedDecodeContext;
+ _reader.set_decode_features(&_cookedDecodeContext);
} else {
_rawDecodeContext = _cookedDecodeContext;
- _decodeContext = &_rawDecodeContext;
+ _reader.set_decode_features(&_rawDecodeContext);
}
- _readContext.setDecodeContext(_decodeContext);
}
}
@@ -69,14 +66,12 @@ Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema,
ZcPosOccSeqRead::ZcPosOccSeqRead(PostingListCountFileSeqRead *countFile)
- : ZcPostingSeqRead(countFile),
+ : Zc4PostingSeqRead(countFile, true),
_fieldsParams(),
_cookedDecodeContext(&_fieldsParams),
_rawDecodeContext(&_fieldsParams)
{
- _decodeContext = &_cookedDecodeContext;
- _decodeContext->setReadContext(&_readContext);
- _readContext.setDecodeContext(_decodeContext);
+ _reader.set_decode_features(&_cookedDecodeContext);
}
@@ -84,18 +79,17 @@ void
ZcPosOccSeqRead::
setFeatureParams(const PostingListParams &params)
{
- bool oldCooked = _decodeContext == &_cookedDecodeContext;
+ bool oldCooked = &_reader.get_decode_features() == &_cookedDecodeContext;
bool newCooked = oldCooked;
params.get("cooked", newCooked);
if (oldCooked != newCooked) {
if (newCooked) {
_cookedDecodeContext = _rawDecodeContext;
- _decodeContext = &_cookedDecodeContext;
+ _reader.set_decode_features(&_cookedDecodeContext);
} else {
_rawDecodeContext = _cookedDecodeContext;
- _decodeContext = &_rawDecodeContext;
+ _reader.set_decode_features(&_rawDecodeContext);
}
- _readContext.setDecodeContext(_decodeContext);
}
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
index cd21fb02f33..1e0555116ce 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
@@ -34,7 +34,7 @@ public:
};
-class ZcPosOccSeqRead : public ZcPostingSeqRead
+class ZcPosOccSeqRead : public Zc4PostingSeqRead
{
private:
bitcompression::PosOccFieldsParams _fieldsParams;
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
index e40842737c9..a0203b64197 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp
@@ -29,60 +29,19 @@ using bitcompression::FeatureEncodeContextBE;
using vespalib::getLastErrorString;
-Zc4PostingSeqRead::
-Zc4PostingSeqRead(PostingListCountFileSeqRead *countFile)
+Zc4PostingSeqRead::Zc4PostingSeqRead(PostingListCountFileSeqRead *countFile, bool dynamic_k)
: PostingListFileSeqRead(),
- _decodeContext(),
- _docIdK(0),
- _prevDocId(0),
- _numDocs(0),
- _readContext(sizeof(uint64_t)),
+ _reader(dynamic_k),
_file(),
- _hasMore(false),
- _dynamicK(false),
- _lastDocId(0),
- _minChunkDocs(1 << 30),
- _minSkipDocs(64),
- _docIdLimit(10000000),
- _zcDocIds(),
- _l1Skip(),
- _l2Skip(),
- _l3Skip(),
- _l4Skip(),
_numWords(0),
_fileBitSize(0),
- _chunkNo(0),
- _l1SkipDocId(0),
- _l1SkipDocIdPos(0),
- _l1SkipFeaturesPos(0),
- _l2SkipDocId(0),
- _l2SkipDocIdPos(0),
- _l2SkipL1SkipPos(0),
- _l2SkipFeaturesPos(0),
- _l3SkipDocId(0),
- _l3SkipDocIdPos(0),
- _l3SkipL1SkipPos(0),
- _l3SkipL2SkipPos(0),
- _l3SkipFeaturesPos(0),
- _l4SkipDocId(0),
- _l4SkipDocIdPos(0),
- _l4SkipL1SkipPos(0),
- _l4SkipL2SkipPos(0),
- _l4SkipL3SkipPos(0),
- _l4SkipFeaturesPos(0),
- _featuresSize(0),
- _countFile(countFile),
- _headerBitLen(0),
- _rangeEndOffset(0),
- _readAheadEndOffset(0),
- _wordStart(0),
- _residue(0)
+ _countFile(countFile)
{
if (_countFile != nullptr) {
PostingListParams params;
_countFile->getParams(params);
- params.get("docIdLimit", _docIdLimit);
- params.get("minChunkDocs", _minChunkDocs);
+ params.get("docIdLimit", _reader.get_posting_params()._doc_id_limit);
+ params.get("minChunkDocs", _reader.get_posting_params()._min_chunk_docs);
}
}
@@ -91,387 +50,16 @@ Zc4PostingSeqRead::~Zc4PostingSeqRead()
{
}
-
-void
-Zc4PostingSeqRead::
-readCommonWordDocIdAndFeatures(DocIdAndFeatures &features)
-{
- if ((_zcDocIds._valI >= _zcDocIds._valE) && _hasMore) {
- readWordStart(); // Read start of next chunk
- }
- // Split docid & features.
- assert(_zcDocIds._valI < _zcDocIds._valE);
- uint32_t docIdPos = _zcDocIds.pos();
- uint32_t docId = _prevDocId + 1 + _zcDocIds.decode();
- features._docId = docId;
- _prevDocId = docId;
- assert(docId <= _lastDocId);
- if (docId > _l1SkipDocId) {
- _l1SkipDocIdPos += _l1Skip.decode() + 1;
- assert(docIdPos == _l1SkipDocIdPos);
- _l1SkipFeaturesPos += _l1Skip.decode() + 1;
- uint64_t featuresPos = _decodeContext->getReadOffset();
- assert(featuresPos == _l1SkipFeaturesPos);
- (void) featuresPos;
- if (docId > _l2SkipDocId) {
- _l2SkipDocIdPos += _l2Skip.decode() + 1;
- assert(docIdPos == _l2SkipDocIdPos);
- _l2SkipFeaturesPos += _l2Skip.decode() + 1;
- assert(featuresPos == _l2SkipFeaturesPos);
- _l2SkipL1SkipPos += _l2Skip.decode() + 1;
- assert(_l1Skip.pos() == _l2SkipL1SkipPos);
- if (docId > _l3SkipDocId) {
- _l3SkipDocIdPos += _l3Skip.decode() + 1;
- assert(docIdPos == _l3SkipDocIdPos);
- _l3SkipFeaturesPos += _l3Skip.decode() + 1;
- assert(featuresPos == _l3SkipFeaturesPos);
- _l3SkipL1SkipPos += _l3Skip.decode() + 1;
- assert(_l1Skip.pos() == _l3SkipL1SkipPos);
- _l3SkipL2SkipPos += _l3Skip.decode() + 1;
- assert(_l2Skip.pos() == _l3SkipL2SkipPos);
- if (docId > _l4SkipDocId) {
- _l4SkipDocIdPos += _l4Skip.decode() + 1;
- assert(docIdPos == _l4SkipDocIdPos);
- (void) docIdPos;
- _l4SkipFeaturesPos += _l4Skip.decode() + 1;
- assert(featuresPos == _l4SkipFeaturesPos);
- _l4SkipL1SkipPos += _l4Skip.decode() + 1;
- assert(_l1Skip.pos() == _l4SkipL1SkipPos);
- _l4SkipL2SkipPos += _l4Skip.decode() + 1;
- assert(_l2Skip.pos() == _l4SkipL2SkipPos);
- _l4SkipL3SkipPos += _l4Skip.decode() + 1;
- assert(_l3Skip.pos() == _l4SkipL3SkipPos);
- _l4SkipDocId += _l4Skip.decode() + 1;
- assert(_l4SkipDocId <= _lastDocId);
- assert(_l4SkipDocId >= docId);
- }
- _l3SkipDocId += _l3Skip.decode() + 1;
- assert(_l3SkipDocId <= _lastDocId);
- assert(_l3SkipDocId <= _l4SkipDocId);
- assert(_l3SkipDocId >= docId);
- }
- _l2SkipDocId += _l2Skip.decode() + 1;
- assert(_l2SkipDocId <= _lastDocId);
- assert(_l2SkipDocId <= _l4SkipDocId);
- assert(_l2SkipDocId <= _l3SkipDocId);
- assert(_l2SkipDocId >= docId);
- }
- _l1SkipDocId += _l1Skip.decode() + 1;
- assert(_l1SkipDocId <= _lastDocId);
- assert(_l1SkipDocId <= _l4SkipDocId);
- assert(_l1SkipDocId <= _l3SkipDocId);
- assert(_l1SkipDocId <= _l2SkipDocId);
- assert(_l1SkipDocId >= docId);
- }
- if (docId < _lastDocId) {
- // Assert more space available when not yet at last docid
- assert(_zcDocIds._valI < _zcDocIds._valE);
- } else {
- // Assert that space has been used when at last docid
- assert(_zcDocIds._valI == _zcDocIds._valE);
- // Assert that we've read to end of skip info
- assert(_l1SkipDocId == _lastDocId);
- assert(_l2SkipDocId == _lastDocId);
- assert(_l3SkipDocId == _lastDocId);
- assert(_l4SkipDocId == _lastDocId);
- if (!_hasMore) {
- _chunkNo = 0;
- }
- }
- _decodeContext->readFeatures(features);
- --_residue;
-}
-
-
-void
-Zc4PostingSeqRead::
-readDocIdAndFeatures(DocIdAndFeatures &features)
-{
- if (_residue == 0 && !_hasMore) {
- if (_rangeEndOffset != 0) {
- DecodeContext &d = *_decodeContext;
- uint64_t curOffset = d.getReadOffset();
- assert(curOffset <= _rangeEndOffset);
- if (curOffset < _rangeEndOffset) {
- readWordStart();
- }
- }
- if (_residue == 0) {
- // Don't read past end of posting list.
- features.clear(static_cast<uint32_t>(-1));
- return;
- }
- }
- if (_lastDocId > 0) {
- return readCommonWordDocIdAndFeatures(features);
- }
- // Interleaves docid & features
- typedef FeatureEncodeContextBE EC;
- DecodeContext &d = *_decodeContext;
- uint32_t length;
- uint64_t val64;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
-
- UC64BE_DECODEEXPGOLOMB_SMALL_NS(o,
- K_VALUE_ZCPOSTING_DELTA_DOCID,
- EC);
- uint32_t docId = _prevDocId + 1 + val64;
- features._docId = docId;
- _prevDocId = docId;
- UC64_DECODECONTEXT_STORE(o, d._);
- if (__builtin_expect(oCompr >= d._valE, false)) {
- _readContext.readComprBuffer();
- }
- _decodeContext->readFeatures(features);
- --_residue;
-}
-
-
-void
-Zc4PostingSeqRead::readWordStartWithSkip()
-{
- typedef FeatureEncodeContextBE EC;
- DecodeContext &d = *_decodeContext;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
- uint32_t length;
- uint64_t val64;
- const uint64_t *valE = d._valE;
-
- if (_hasMore) {
- ++_chunkNo;
- } else {
- _chunkNo = 0;
- }
- assert(_numDocs >= _minSkipDocs || _hasMore);
- bool hasMore = false;
- if (__builtin_expect(_numDocs >= _minChunkDocs, false)) {
- hasMore = static_cast<int64_t>(oVal) < 0;
- oVal <<= 1;
- length = 1;
- UC64BE_READBITS_NS(o, EC);
- }
- if (_dynamicK) {
- _docIdK = EC::calcDocIdK((_hasMore || hasMore) ? 1 : _numDocs,
- _docIdLimit);
- }
- if (_hasMore || hasMore) {
- if (_rangeEndOffset == 0) {
- assert(hasMore == (_chunkNo + 1 < _counts._segments.size()));
- assert(_numDocs == _counts._segments[_chunkNo]._numDocs);
- }
- if (hasMore) {
- assert(_numDocs >= _minSkipDocs);
- assert(_numDocs >= _minChunkDocs);
- }
- } else {
- assert(_numDocs >= _minSkipDocs);
- if (_rangeEndOffset == 0) {
- assert(_numDocs == _counts._numDocs);
- }
- }
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_DOCIDSSIZE,
- EC);
- uint32_t docIdsSize = val64 + 1;
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L1SKIPSIZE,
- EC);
- uint32_t l1SkipSize = val64;
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint32_t l2SkipSize = 0;
- if (l1SkipSize != 0) {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L2SKIPSIZE,
- EC);
- l2SkipSize = val64;
- }
- uint32_t l3SkipSize = 0;
- if (l2SkipSize != 0) {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L3SKIPSIZE,
- EC);
- l3SkipSize = val64;
- }
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint32_t l4SkipSize = 0;
- if (l3SkipSize != 0) {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L4SKIPSIZE,
- EC);
- l4SkipSize = val64;
- }
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_FEATURESSIZE,
- EC);
- _featuresSize = val64;
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- if (_dynamicK) {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- _docIdK,
- EC);
- } else {
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_LASTDOCID,
- EC);
- }
- _lastDocId = _docIdLimit - 1 - val64;
- if (_hasMore || hasMore) {
- if (_rangeEndOffset == 0) {
- assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc);
- }
- }
-
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint64_t bytePad = oPreRead & 7;
- if (bytePad > 0) {
- length = bytePad;
- oVal <<= length;
- UC64BE_READBITS_NS(o, EC);
- }
- UC64_DECODECONTEXT_STORE(o, d._);
- if (__builtin_expect(oCompr >= valE, false)) {
- _readContext.readComprBuffer();
- }
- _zcDocIds.clearReserve(docIdsSize);
- _l1Skip.clearReserve(l1SkipSize);
- _l2Skip.clearReserve(l2SkipSize);
- _l3Skip.clearReserve(l3SkipSize);
- _l4Skip.clearReserve(l4SkipSize);
- _decodeContext->readBytes(_zcDocIds._valI, docIdsSize);
- _zcDocIds._valE = _zcDocIds._valI + docIdsSize;
- if (l1SkipSize > 0) {
- _decodeContext->readBytes(_l1Skip._valI, l1SkipSize);
- }
- _l1Skip._valE = _l1Skip._valI + l1SkipSize;
- if (l2SkipSize > 0) {
- _decodeContext->readBytes(_l2Skip._valI, l2SkipSize);
- }
- _l2Skip._valE = _l2Skip._valI + l2SkipSize;
- if (l3SkipSize > 0) {
- _decodeContext->readBytes(_l3Skip._valI, l3SkipSize);
- }
- _l3Skip._valE = _l3Skip._valI + l3SkipSize;
- if (l4SkipSize > 0) {
- _decodeContext->readBytes(_l4Skip._valI, l4SkipSize);
- }
- _l4Skip._valE = _l4Skip._valI + l4SkipSize;
-
- if (l1SkipSize > 0) {
- _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId;
- } else {
- _l1SkipDocId = _lastDocId;
- }
- if (l2SkipSize > 0) {
- _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId;
- } else {
- _l2SkipDocId = _lastDocId;
- }
- if (l3SkipSize > 0) {
- _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId;
- } else {
- _l3SkipDocId = _lastDocId;
- }
- if (l4SkipSize > 0) {
- _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId;
- } else {
- _l4SkipDocId = _lastDocId;
- }
- _l1SkipDocIdPos = 0;
- _l1SkipFeaturesPos = _decodeContext->getReadOffset();
- _l2SkipDocIdPos = 0;
- _l2SkipL1SkipPos = 0;
- _l2SkipFeaturesPos = _decodeContext->getReadOffset();
- _l3SkipDocIdPos = 0;
- _l3SkipL1SkipPos = 0;
- _l3SkipL2SkipPos = 0;
- _l3SkipFeaturesPos = _decodeContext->getReadOffset();
- _l4SkipDocIdPos = 0;
- _l4SkipL1SkipPos = 0;
- _l4SkipL2SkipPos = 0;
- _l4SkipL3SkipPos = 0;
- _l4SkipFeaturesPos = _decodeContext->getReadOffset();
- _hasMore = hasMore;
- // Decode context is now positioned at start of features
-}
-
-
void
-Zc4PostingSeqRead::readWordStart()
+Zc4PostingSeqRead::readDocIdAndFeatures(DocIdAndFeatures &features)
{
- typedef FeatureEncodeContextBE EC;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_);
- uint32_t length;
- uint64_t val64;
- const uint64_t *valE = _decodeContext->_valE;
-
- UC64BE_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_NUMDOCS,
- EC);
- UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
- if (oCompr >= valE) {
- _readContext.readComprBuffer();
- }
- _numDocs = static_cast<uint32_t>(val64) + 1;
- _residue = _numDocs;
- _prevDocId = _hasMore ? _lastDocId : 0u;
- if (_rangeEndOffset == 0) {
- assert(_numDocs <= _counts._numDocs);
- assert(_numDocs == _counts._numDocs ||
- _numDocs >= _minChunkDocs ||
- _hasMore);
- }
-
- if (_numDocs >= _minSkipDocs || _hasMore) {
- readWordStartWithSkip();
- // Decode context is not positioned at start of features
- } else {
- if (_dynamicK) {
- _docIdK = EC::calcDocIdK(_numDocs, _docIdLimit);
- }
- _lastDocId = 0u;
- // Decode context is not positioned at start of docids & features
- }
+ _reader.read_doc_id_and_features(features);
}
-
void
Zc4PostingSeqRead::readCounts(const PostingListCounts &counts)
{
- assert(!_hasMore); // Previous words must have been read.
-
- _counts = counts;
-
- assert((_counts._numDocs == 0) == (_counts._bitLength == 0));
- if (_counts._numDocs > 0) {
- _wordStart = _decodeContext->getReadOffset();
- readWordStart();
- }
+ _reader.set_counts(counts);
}
@@ -484,16 +72,17 @@ Zc4PostingSeqRead::open(const vespalib::string &name,
}
bool res = _file.OpenReadOnly(name.c_str());
if (res) {
- _readContext.setFile(&_file);
- _readContext.setFileSize(_file.GetSize());
- DecodeContext &d = *_decodeContext;
- _readContext.allocComprBuf(65536u, 32768u);
+ auto &readContext = _reader.get_read_context();
+ readContext.setFile(&_file);
+ readContext.setFileSize(_file.GetSize());
+ auto &d = _reader.get_decode_features();
+ readContext.allocComprBuf(65536u, 32768u);
d.emptyBuffer(0);
- _readContext.readComprBuffer();
+ readContext.readComprBuffer();
readHeader();
if (d._valI >= d._valE) {
- _readContext.readComprBuffer();
+ readContext.readComprBuffer();
}
} else {
LOG(error, "could not open %s: %s",
@@ -506,9 +95,10 @@ Zc4PostingSeqRead::open(const vespalib::string &name,
bool
Zc4PostingSeqRead::close()
{
- _readContext.dropComprBuf();
+ auto &readContext = _reader.get_read_context();
+ readContext.dropComprBuf();
_file.Close();
- _readContext.setFile(nullptr);
+ readContext.setFile(nullptr);
return true;
}
@@ -524,29 +114,30 @@ Zc4PostingSeqRead::getParams(PostingListParams &params)
uint32_t countMinChunkDocs = 0;
countParams.get("docIdLimit", countDocIdLimit);
countParams.get("minChunkDocs", countMinChunkDocs);
- assert(_docIdLimit == countDocIdLimit);
- assert(_minChunkDocs == countMinChunkDocs);
+ assert(_reader.get_posting_params()._doc_id_limit == countDocIdLimit);
+ assert(_reader.get_posting_params()._min_chunk_docs == countMinChunkDocs);
} else {
params.clear();
- params.set("docIdLimit", _docIdLimit);
- params.set("minChunkDocs", _minChunkDocs);
+ params.set("docIdLimit", _reader.get_posting_params()._doc_id_limit);
+ params.set("minChunkDocs", _reader.get_posting_params()._min_chunk_docs);
}
- params.set("minSkipDocs", _minSkipDocs);
+ params.set("minSkipDocs", _reader.get_posting_params()._min_skip_docs);
}
void
Zc4PostingSeqRead::getFeatureParams(PostingListParams &params)
{
- _decodeContext->getParams(params);
+ _reader.get_decode_features().getParams(params);
}
void
Zc4PostingSeqRead::readHeader()
{
- FeatureDecodeContextBE &d = *_decodeContext;
- const vespalib::string &myId = _dynamicK ? myId5 : myId4;
+ FeatureDecodeContextBE &d = _reader.get_decode_features();
+ auto &posting_params = _reader.get_posting_params();
+ const vespalib::string &myId = posting_params._dynamic_k ? myId5 : myId4;
vespalib::FileHeader header;
d.readHeader(header, _file.getSize());
@@ -571,9 +162,9 @@ Zc4PostingSeqRead::readHeader()
(void) myId;
assert(header.getTag("format.1").asString() == d.getIdentifier());
_numWords = header.getTag("numWords").asInteger();
- _minChunkDocs = header.getTag("minChunkDocs").asInteger();
- _docIdLimit = header.getTag("docIdLimit").asInteger();
- _minSkipDocs = header.getTag("minSkipDocs").asInteger();
+ posting_params._min_chunk_docs = header.getTag("minChunkDocs").asInteger();
+ posting_params._doc_id_limit = header.getTag("docIdLimit").asInteger();
+ posting_params._min_skip_docs = header.getTag("minSkipDocs").asInteger();
assert(header.getTag("endian").asString() == "big");
// Read feature decoding specific subheader
d.readHeader(header, "features.");
@@ -585,38 +176,9 @@ Zc4PostingSeqRead::readHeader()
const vespalib::string &
-Zc4PostingSeqRead::getIdentifier()
-{
- return myId4;
-}
-
-
-uint64_t
-Zc4PostingSeqRead::getCurrentPostingOffset() const
+Zc4PostingSeqRead::getIdentifier(bool dynamic_k)
{
- FeatureDecodeContextBE &d = *_decodeContext;
- return d.getReadOffset() - _headerBitLen;
-}
-
-
-void
-Zc4PostingSeqRead::setPostingOffset(uint64_t offset,
- uint64_t endOffset,
- uint64_t readAheadOffset)
-{
- assert(_residue == 0); // Only to be called between posting lists
-
- FeatureDecodeContextBE &d = *_decodeContext;
-
- _rangeEndOffset = endOffset + _headerBitLen;
- _readAheadEndOffset = readAheadOffset + _headerBitLen;
- _readContext.setStopOffset(_readAheadEndOffset, false);
- uint64_t newOffset = offset + _headerBitLen;
- if (newOffset != d.getReadOffset()) {
- _readContext.setPosition(newOffset);
- assert(newOffset == d.getReadOffset());
- _readContext.readComprBuffer();
- }
+ return (dynamic_k ? myId5 : myId4);
}
@@ -809,65 +371,6 @@ getFeatureParams(PostingListParams &params)
}
-ZcPostingSeqRead::ZcPostingSeqRead(PostingListCountFileSeqRead *countFile)
- : Zc4PostingSeqRead(countFile)
-{
- _dynamicK = true;
-}
-
-
-void
-ZcPostingSeqRead::
-readDocIdAndFeatures(DocIdAndFeatures &features)
-{
- if (_residue == 0 && !_hasMore) {
- if (_rangeEndOffset != 0) {
- DecodeContext &d = *_decodeContext;
- uint64_t curOffset = d.getReadOffset();
- assert(curOffset <= _rangeEndOffset);
- if (curOffset < _rangeEndOffset) {
- readWordStart();
- }
- }
- if (_residue == 0) {
- // Don't read past end of posting list.
- features.clear(static_cast<uint32_t>(-1));
- return;
- }
- }
- if (_lastDocId > 0) {
- readCommonWordDocIdAndFeatures(features);
- return;
- }
- // Interleaves docid & features
- typedef FeatureEncodeContextBE EC;
- DecodeContext &d = *_decodeContext;
- uint32_t length;
- uint64_t val64;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
-
- UC64BE_DECODEEXPGOLOMB_SMALL_NS(o,
- _docIdK,
- EC);
- uint32_t docId = _prevDocId + 1 + val64;
- features._docId = docId;
- _prevDocId = docId;
- UC64_DECODECONTEXT_STORE(o, d._);
- if (__builtin_expect(oCompr >= d._valE, false)) {
- _readContext.readComprBuffer();
- }
- _decodeContext->readFeatures(features);
- --_residue;
-}
-
-
-const vespalib::string &
-ZcPostingSeqRead::getIdentifier()
-{
- return myId5;
-}
-
-
ZcPostingSeqWrite::ZcPostingSeqWrite(PostingListCountFileSeqWrite *countFile)
: Zc4PostingSeqWrite(countFile)
{
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.h b/searchlib/src/vespa/searchlib/diskindex/zcposting.h
index 96cc306cea8..01049e720a9 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.h
@@ -3,8 +3,10 @@
#pragma once
#include "zc4_posting_writer.h"
+#include "zc4_posting_reader.h"
#include <vespa/searchlib/index/postinglistfile.h>
#include <vespa/fastos/file.h>
+#include "zc4_posting_params.h"
namespace search::index {
class PostingListCountFileSeqRead;
@@ -19,63 +21,14 @@ class Zc4PostingSeqRead : public index::PostingListFileSeqRead
Zc4PostingSeqRead &operator=(const Zc4PostingSeqRead &);
protected:
- typedef bitcompression::FeatureDecodeContextBE DecodeContext;
- typedef bitcompression::FeatureEncodeContextBE EncodeContext;
-
- DecodeContext *_decodeContext;
- uint32_t _docIdK;
- uint32_t _prevDocId; // Previous document id
- uint32_t _numDocs; // Documents in chunk or word
- search::ComprFileReadContext _readContext;
+ Zc4PostingReader<true> _reader;
FastOS_File _file;
- bool _hasMore;
- bool _dynamicK; // Caclulate EG compression parameters ?
- uint32_t _lastDocId; // last document in chunk or word
- uint32_t _minChunkDocs; // # of documents needed for chunking
- uint32_t _minSkipDocs; // # of documents needed for skipping
- uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit)
-
- ZcBuf _zcDocIds; // Document id deltas
- ZcBuf _l1Skip; // L1 skip info
- ZcBuf _l2Skip; // L2 skip info
- ZcBuf _l3Skip; // L3 skip info
- ZcBuf _l4Skip; // L4 skip info
-
uint64_t _numWords; // Number of words in file
uint64_t _fileBitSize;
- uint32_t _chunkNo; // Chunk number
-
- // Variables for validating skip information while reading
- uint32_t _l1SkipDocId;
- uint32_t _l1SkipDocIdPos;
- uint64_t _l1SkipFeaturesPos;
- uint32_t _l2SkipDocId;
- uint32_t _l2SkipDocIdPos;
- uint32_t _l2SkipL1SkipPos;
- uint64_t _l2SkipFeaturesPos;
- uint32_t _l3SkipDocId;
- uint32_t _l3SkipDocIdPos;
- uint32_t _l3SkipL1SkipPos;
- uint32_t _l3SkipL2SkipPos;
- uint64_t _l3SkipFeaturesPos;
- uint32_t _l4SkipDocId;
- uint32_t _l4SkipDocIdPos;
- uint32_t _l4SkipL1SkipPos;
- uint32_t _l4SkipL2SkipPos;
- uint32_t _l4SkipL3SkipPos;
- uint64_t _l4SkipFeaturesPos;
-
- // Variable for validating chunk information while reading
- uint64_t _featuresSize;
index::PostingListCountFileSeqRead *const _countFile;
-
uint64_t _headerBitLen; // Size of file header in bits
- uint64_t _rangeEndOffset; // End offset for word pair
- uint64_t _readAheadEndOffset; // Readahead end offset for word pair
- uint64_t _wordStart; // last word header position
- uint32_t _residue; // Number of unread documents after word header
public:
- Zc4PostingSeqRead(index::PostingListCountFileSeqRead *countFile);
+ Zc4PostingSeqRead(index::PostingListCountFileSeqRead *countFile, bool dynamic_k);
~Zc4PostingSeqRead();
@@ -83,11 +36,6 @@ public:
typedef index::PostingListCounts PostingListCounts;
typedef index::PostingListParams PostingListParams;
- /**
- * Read document id and features for common word.
- */
- virtual void readCommonWordDocIdAndFeatures(DocIdAndFeatures &features);
-
void readDocIdAndFeatures(DocIdAndFeatures &features) override;
void readCounts(const PostingListCounts &counts) override; // Fill in for next word
bool open(const vespalib::string &name, const TuneFileSeqRead &tuneFileRead) override;
@@ -97,28 +45,7 @@ public:
void readWordStartWithSkip();
void readWordStart();
void readHeader();
- static const vespalib::string &getIdentifier();
-
- // Methods used when generating posting list for common word pairs.
-
- /*
- * Get current posting offset, measured in bits. First posting list
- * starts at 0, i.e. file header is not accounted for here.
- *
- * @return current posting offset, measured in bits.
- */
- uint64_t getCurrentPostingOffset() const override;
-
- /**
- * Set current posting offset, measured in bits. First posting
- * list starts at 0, i.e. file header is not accounted for here.
- *
- * @param Offset start of posting lists for word pair.
- * @param endOffset end of posting lists for word pair.
- * @param readAheadOffset end of posting list for either this or a
- * later word pair, depending on disk seek cost.
- */
- void setPostingOffset(uint64_t offset, uint64_t endOffset, uint64_t readAheadOffset) override;
+ static const vespalib::string &getIdentifier(bool dynamic_k);
};
@@ -161,15 +88,6 @@ public:
void updateHeader();
};
-
-class ZcPostingSeqRead : public Zc4PostingSeqRead
-{
-public:
- ZcPostingSeqRead(index::PostingListCountFileSeqRead *countFile);
- void readDocIdAndFeatures(DocIdAndFeatures &features) override;
- static const vespalib::string &getIdentifier();
-};
-
class ZcPostingSeqWrite : public Zc4PostingSeqWrite
{
public:
diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.cpp b/searchlib/src/vespa/searchlib/index/postinglistfile.cpp
index 0f0860f9145..52c6b85a0b8 100644
--- a/searchlib/src/vespa/searchlib/index/postinglistfile.cpp
+++ b/searchlib/src/vespa/searchlib/index/postinglistfile.cpp
@@ -6,8 +6,6 @@
namespace search::index {
PostingListFileSeqRead::PostingListFileSeqRead()
- : _counts(),
- _residueDocs(0)
{
}
diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.h b/searchlib/src/vespa/searchlib/index/postinglistfile.h
index 194ac519a19..1e7dde7f139 100644
--- a/searchlib/src/vespa/searchlib/index/postinglistfile.h
+++ b/searchlib/src/vespa/searchlib/index/postinglistfile.h
@@ -19,9 +19,6 @@ class DocIdAndFeatures;
* for words.
*/
class PostingListFileSeqRead {
-protected:
- PostingListCounts _counts;
- unsigned int _residueDocs; // Docids left to read for word
public:
PostingListFileSeqRead();
@@ -63,34 +60,6 @@ public:
* Get current (word, docid) feature parameters.
*/
virtual void getFeatureParams(PostingListParams &params);
-
- // Methods used when generating posting list for common word pairs.
-
- /*
- * Get current posting offset, measured in bits. First posting list
- * starts at 0, i.e. file header is not accounted for here.
- *
- * @return current posting offset, measured in bits.
- */
- virtual uint64_t getCurrentPostingOffset() const = 0;
-
- /**
- * Set current posting offset, measured in bits. First posting
- * list starts at 0, i.e. file header is not accounted for here.
- *
- * @param Offset start of posting lists for word pair.
- * @param endOffset end of posting lists for word pair.
- * @param readAheadOffset end of posting list for either this or a
- * later word pair, depending on disk seek cost.
- */
- virtual void setPostingOffset(uint64_t offset, uint64_t endOffset, uint64_t readAheadOffset) = 0;
-
- /**
- * Get counts read by last readCounts().
- */
- const PostingListCounts &getCounts() const { return _counts; }
-
- PostingListCounts &getCounts() { return _counts; }
};
/**