summary refs log tree commit diff stats
path: root/searchlib
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@broadpark.no> 2019-05-02 11:10:17 +0200
committer Tor Egge <Tor.Egge@broadpark.no> 2019-05-02 11:10:17 +0200
commit f036f3fd0ef94689676a6da541c8c1b00efb276d (patch)
tree c2701a9f1edd635abb5f437b5cf52ae4dd060165 /searchlib
parent 05fb9663a0e78ee98f50e8ec57229654b5a4bb3c (diff)
Move endian independent portion of Zc4PostingReader to Zc4PostingReaderBase.
Use common code for reading posting list header.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt 1
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp 402
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h 57
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp 275
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h 79
5 files changed, 382 insertions, 432 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
index 2fea4f2bab7..ba608467c8a 100644
--- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
@@ -20,6 +20,7 @@ vespa_add_library(searchlib_diskindex OBJECT
wordnummapper.cpp
zc4_posting_header.cpp
zc4_posting_reader.cpp
+ zc4_posting_reader_base.cpp
zc4_posting_writer.cpp
zc4_posting_writer_base.cpp
zcbuf.cpp
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
index c0e1115521c..a09c26d7985 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp
@@ -1,6 +1,7 @@
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "zc4_posting_reader.h"
+#include "zc4_posting_header.h"
#include <vespa/searchlib/index/docidandfeatures.h>
namespace search::diskindex {
@@ -12,41 +13,8 @@ using bitcompression::FeatureEncodeContext;
template <bool bigEndian>
Zc4PostingReader<bigEndian>::Zc4PostingReader(bool dynamic_k)
- : _decodeContext(nullptr),
- _docIdK(K_VALUE_ZCPOSTING_DELTA_DOCID),
- _prevDocId(0),
- _numDocs(0),
- _readContext(sizeof(uint64_t)),
- _has_more(false),
- _posting_params(64, 1 << 30, 10000000, dynamic_k, true),
- _lastDocId(0),
- _zcDocIds(),
- _l1Skip(),
- _l2Skip(),
- _l3Skip(),
- _l4Skip(),
- _chunkNo(0),
- _l1SkipDocId(0),
- _l1SkipDocIdPos(0),
- _l1SkipFeaturesPos(0),
- _l2SkipDocId(0),
- _l2SkipDocIdPos(0),
- _l2SkipL1SkipPos(0),
- _l2SkipFeaturesPos(0),
- _l3SkipDocId(0),
- _l3SkipDocIdPos(0),
- _l3SkipL1SkipPos(0),
- _l3SkipL2SkipPos(0),
- _l3SkipFeaturesPos(0),
- _l4SkipDocId(0),
- _l4SkipDocIdPos(0),
- _l4SkipL1SkipPos(0),
- _l4SkipL2SkipPos(0),
- _l4SkipL3SkipPos(0),
- _l4SkipFeaturesPos(0),
- _featuresSize(0),
- _counts(),
- _residue(0)
+ : Zc4PostingReaderBase(dynamic_k),
+ _decodeContext(nullptr)
{
}
@@ -57,370 +25,50 @@ Zc4PostingReader<bigEndian>::~Zc4PostingReader()
template <bool bigEndian>
void
-Zc4PostingReader<bigEndian>::read_common_word_doc_id_and_features(DocIdAndFeatures &features)
-{
- if ((_zcDocIds._valI >= _zcDocIds._valE) && _has_more) {
- read_word_start(); // Read start of next chunk
- }
- // Split docid & features.
- assert(_zcDocIds._valI < _zcDocIds._valE);
- uint32_t docIdPos = _zcDocIds.pos();
- uint32_t docId = _prevDocId + 1 + _zcDocIds.decode();
- features.set_doc_id(docId);
- _prevDocId = docId;
- assert(docId <= _lastDocId);
- if (docId > _l1SkipDocId) {
- _l1SkipDocIdPos += _l1Skip.decode() + 1;
- assert(docIdPos == _l1SkipDocIdPos);
- uint64_t featuresPos = _decodeContext->getReadOffset();
- if (_posting_params._encode_features) {
- _l1SkipFeaturesPos += _l1Skip.decode() + 1;
- assert(featuresPos == _l1SkipFeaturesPos);
- }
- (void) featuresPos;
- if (docId > _l2SkipDocId) {
- _l2SkipDocIdPos += _l2Skip.decode() + 1;
- assert(docIdPos == _l2SkipDocIdPos);
- if (_posting_params._encode_features) {
- _l2SkipFeaturesPos += _l2Skip.decode() + 1;
- assert(featuresPos == _l2SkipFeaturesPos);
- }
- _l2SkipL1SkipPos += _l2Skip.decode() + 1;
- assert(_l1Skip.pos() == _l2SkipL1SkipPos);
- if (docId > _l3SkipDocId) {
- _l3SkipDocIdPos += _l3Skip.decode() + 1;
- assert(docIdPos == _l3SkipDocIdPos);
- if (_posting_params._encode_features) {
- _l3SkipFeaturesPos += _l3Skip.decode() + 1;
- assert(featuresPos == _l3SkipFeaturesPos);
- }
- _l3SkipL1SkipPos += _l3Skip.decode() + 1;
- assert(_l1Skip.pos() == _l3SkipL1SkipPos);
- _l3SkipL2SkipPos += _l3Skip.decode() + 1;
- assert(_l2Skip.pos() == _l3SkipL2SkipPos);
- if (docId > _l4SkipDocId) {
- _l4SkipDocIdPos += _l4Skip.decode() + 1;
- assert(docIdPos == _l4SkipDocIdPos);
- (void) docIdPos;
- if (_posting_params._encode_features) {
- _l4SkipFeaturesPos += _l4Skip.decode() + 1;
- assert(featuresPos == _l4SkipFeaturesPos);
- }
- _l4SkipL1SkipPos += _l4Skip.decode() + 1;
- assert(_l1Skip.pos() == _l4SkipL1SkipPos);
- _l4SkipL2SkipPos += _l4Skip.decode() + 1;
- assert(_l2Skip.pos() == _l4SkipL2SkipPos);
- _l4SkipL3SkipPos += _l4Skip.decode() + 1;
- assert(_l3Skip.pos() == _l4SkipL3SkipPos);
- _l4SkipDocId += _l4Skip.decode() + 1;
- assert(_l4SkipDocId <= _lastDocId);
- assert(_l4SkipDocId >= docId);
- }
- _l3SkipDocId += _l3Skip.decode() + 1;
- assert(_l3SkipDocId <= _lastDocId);
- assert(_l3SkipDocId <= _l4SkipDocId);
- assert(_l3SkipDocId >= docId);
- }
- _l2SkipDocId += _l2Skip.decode() + 1;
- assert(_l2SkipDocId <= _lastDocId);
- assert(_l2SkipDocId <= _l4SkipDocId);
- assert(_l2SkipDocId <= _l3SkipDocId);
- assert(_l2SkipDocId >= docId);
- }
- _l1SkipDocId += _l1Skip.decode() + 1;
- assert(_l1SkipDocId <= _lastDocId);
- assert(_l1SkipDocId <= _l4SkipDocId);
- assert(_l1SkipDocId <= _l3SkipDocId);
- assert(_l1SkipDocId <= _l2SkipDocId);
- assert(_l1SkipDocId >= docId);
- }
- if (docId < _lastDocId) {
- // Assert more space available when not yet at last docid
- assert(_zcDocIds._valI < _zcDocIds._valE);
- } else {
- // Assert that space has been used when at last docid
- assert(_zcDocIds._valI == _zcDocIds._valE);
- // Assert that we've read to end of skip info
- assert(_l1SkipDocId == _lastDocId);
- assert(_l2SkipDocId == _lastDocId);
- assert(_l3SkipDocId == _lastDocId);
- assert(_l4SkipDocId == _lastDocId);
- if (!_has_more) {
- _chunkNo = 0;
- }
- }
- if (_posting_params._encode_features) {
- _decodeContext->readFeatures(features);
- }
- --_residue;
-}
-
-template <bool bigEndian>
-void
Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features)
{
if (_residue == 0 && !_has_more) {
- if (_residue == 0) {
- // Don't read past end of posting list.
- features.clear(static_cast<uint32_t>(-1));
- return;
- }
- }
- if (_lastDocId > 0) {
- read_common_word_doc_id_and_features(features);
+ // Don't read past end of posting list.
+ features.clear(static_cast<uint32_t>(-1));
return;
}
- // Interleaves docid & features
- using EC = FeatureEncodeContext<bigEndian>;
- DecodeContext &d = *_decodeContext;
- uint32_t length;
- uint64_t val64;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
-
- UC64_DECODEEXPGOLOMB_SMALL_NS(o, _docIdK, EC);
- uint32_t docId = _prevDocId + 1 + val64;
- features.set_doc_id(docId);
- _prevDocId = docId;
- UC64_DECODECONTEXT_STORE(o, d._);
- if (__builtin_expect(oCompr >= d._valE, false)) {
- _readContext.readComprBuffer();
- }
- if (_posting_params._encode_features) {
- _decodeContext->readFeatures(features);
- }
- --_residue;
-}
-
-template <bool bigEndian>
-void
-Zc4PostingReader<bigEndian>::read_word_start_with_skip()
-{
- using EC = FeatureEncodeContext<bigEndian>;
- DecodeContext &d = *_decodeContext;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
- uint32_t length;
- uint64_t val64;
- const uint64_t *valE = d._valE;
-
- if (_has_more) {
- ++_chunkNo;
- } else {
- _chunkNo = 0;
- }
- assert(_numDocs >= _posting_params._min_skip_docs || _has_more);
- bool has_more = false;
- if (__builtin_expect(_numDocs >= _posting_params._min_chunk_docs, false)) {
- if (bigEndian) {
- has_more = static_cast<int64_t>(oVal) < 0;
- oVal <<= 1;
- } else {
- has_more = (oVal & 1) != 0;
- oVal >>= 1;
- }
- length = 1;
- UC64_READBITS_NS(o, EC);
- }
- if (_posting_params._dynamic_k) {
- _docIdK = EC::calcDocIdK((_has_more || has_more) ? 1 : _numDocs,
- _posting_params._doc_id_limit);
- }
- if (_has_more || has_more) {
- assert(has_more == (_chunkNo + 1 < _counts._segments.size()));
- assert(_numDocs == _counts._segments[_chunkNo]._numDocs);
- if (has_more) {
- assert(_numDocs >= _posting_params._min_skip_docs);
- assert(_numDocs >= _posting_params._min_chunk_docs);
- }
- } else {
- assert(_numDocs >= _posting_params._min_skip_docs);
- assert(_numDocs == _counts._numDocs);
- }
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC);
- uint32_t docIdsSize = val64 + 1;
- UC64_DECODEEXPGOLOMB_NS(o,
- K_VALUE_ZCPOSTING_L1SKIPSIZE,
- EC);
- uint32_t l1SkipSize = val64;
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint32_t l2SkipSize = 0;
- if (l1SkipSize != 0) {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC);
- l2SkipSize = val64;
- }
- uint32_t l3SkipSize = 0;
- if (l2SkipSize != 0) {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC);
- l3SkipSize = val64;
- }
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint32_t l4SkipSize = 0;
- if (l3SkipSize != 0) {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC);
- l4SkipSize = val64;
- }
- if (_posting_params._encode_features) {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC);
- _featuresSize = val64;
- }
- if (__builtin_expect(oCompr >= valE, false)) {
- UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- if (_posting_params._dynamic_k) {
- UC64_DECODEEXPGOLOMB_NS(o, _docIdK, EC);
+ if (_last_doc_id > 0) {
+ read_common_word_doc_id(*_decodeContext);
} else {
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_LASTDOCID, EC);
- }
- _lastDocId = _posting_params._doc_id_limit - 1 - val64;
- if (_has_more || has_more) {
- assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc);
- }
-
- if (__builtin_expect(oCompr >= valE, false)) {
+ // Interleaves docid & features
+ using EC = FeatureEncodeContext<bigEndian>;
+ DecodeContext &d = *_decodeContext;
+ uint32_t length;
+ uint64_t val64;
+ UC64_DECODECONTEXT_CONSTRUCTOR(o, d._);
+
+ UC64_DECODEEXPGOLOMB_SMALL_NS(o, _doc_id_k, EC);
+ uint32_t docId = _prev_doc_id + 1 + val64;
+ _prev_doc_id = docId;
UC64_DECODECONTEXT_STORE(o, d._);
- _readContext.readComprBuffer();
- valE = d._valE;
- UC64_DECODECONTEXT_LOAD(o, d._);
- }
- uint64_t bytePad = oPreRead & 7;
- if (bytePad > 0) {
- length = bytePad;
- if (bigEndian) {
- oVal <<= length;
- } else {
- oVal >>= length;
+ if (__builtin_expect(oCompr >= d._valE, false)) {
+ _readContext.readComprBuffer();
}
- UC64_READBITS_NS(o, EC);
- }
- UC64_DECODECONTEXT_STORE(o, d._);
- if (__builtin_expect(oCompr >= valE, false)) {
- _readContext.readComprBuffer();
- }
- _zcDocIds.clearReserve(docIdsSize);
- _l1Skip.clearReserve(l1SkipSize);
- _l2Skip.clearReserve(l2SkipSize);
- _l3Skip.clearReserve(l3SkipSize);
- _l4Skip.clearReserve(l4SkipSize);
- _decodeContext->readBytes(_zcDocIds._valI, docIdsSize);
- _zcDocIds._valE = _zcDocIds._valI + docIdsSize;
- if (l1SkipSize > 0) {
- _decodeContext->readBytes(_l1Skip._valI, l1SkipSize);
- }
- _l1Skip._valE = _l1Skip._valI + l1SkipSize;
- if (l2SkipSize > 0) {
- _decodeContext->readBytes(_l2Skip._valI, l2SkipSize);
- }
- _l2Skip._valE = _l2Skip._valI + l2SkipSize;
- if (l3SkipSize > 0) {
- _decodeContext->readBytes(_l3Skip._valI, l3SkipSize);
}
- _l3Skip._valE = _l3Skip._valI + l3SkipSize;
- if (l4SkipSize > 0) {
- _decodeContext->readBytes(_l4Skip._valI, l4SkipSize);
- }
- _l4Skip._valE = _l4Skip._valI + l4SkipSize;
-
- if (l1SkipSize > 0) {
- _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId;
- } else {
- _l1SkipDocId = _lastDocId;
- }
- if (l2SkipSize > 0) {
- _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId;
- } else {
- _l2SkipDocId = _lastDocId;
- }
- if (l3SkipSize > 0) {
- _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId;
- } else {
- _l3SkipDocId = _lastDocId;
- }
- if (l4SkipSize > 0) {
- _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId;
- } else {
- _l4SkipDocId = _lastDocId;
+ features.set_doc_id(_prev_doc_id);
+ if (_posting_params._encode_features) {
+ _decodeContext->readFeatures(features);
}
- _l1SkipDocIdPos = 0;
- _l1SkipFeaturesPos = _decodeContext->getReadOffset();
- _l2SkipDocIdPos = 0;
- _l2SkipL1SkipPos = 0;
- _l2SkipFeaturesPos = _decodeContext->getReadOffset();
- _l3SkipDocIdPos = 0;
- _l3SkipL1SkipPos = 0;
- _l3SkipL2SkipPos = 0;
- _l3SkipFeaturesPos = _decodeContext->getReadOffset();
- _l4SkipDocIdPos = 0;
- _l4SkipL1SkipPos = 0;
- _l4SkipL2SkipPos = 0;
- _l4SkipL3SkipPos = 0;
- _l4SkipFeaturesPos = _decodeContext->getReadOffset();
- _has_more = has_more;
- // Decode context is now positioned at start of features
+ --_residue;
}
template <bool bigEndian>
void
Zc4PostingReader<bigEndian>::read_word_start()
{
- using EC = FeatureEncodeContext<bigEndian>;
- UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_);
- uint32_t length;
- uint64_t val64;
- const uint64_t *valE = _decodeContext->_valE;
-
- UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC);
- UC64_DECODECONTEXT_STORE(o, _decodeContext->_);
- if (oCompr >= valE) {
- _readContext.readComprBuffer();
- }
- _numDocs = static_cast<uint32_t>(val64) + 1;
- _residue = _numDocs;
- _prevDocId = _has_more ? _lastDocId : 0u;
- assert(_numDocs <= _counts._numDocs);
- assert(_numDocs == _counts._numDocs ||
- _numDocs >= _posting_params._min_chunk_docs ||
- _has_more);
-
- if (_numDocs >= _posting_params._min_skip_docs || _has_more) {
- read_word_start_with_skip();
- // Decode context is not positioned at start of features
- } else {
- if (_posting_params._dynamic_k) {
- _docIdK = EC::calcDocIdK(_numDocs, _posting_params._doc_id_limit);
- }
- _lastDocId = 0u;
- // Decode context is not positioned at start of docids & features
- }
+ Zc4PostingReaderBase::read_word_start(*_decodeContext);
}
template <bool bigEndian>
void
Zc4PostingReader<bigEndian>::set_counts(const PostingListCounts &counts)
{
- assert(!_has_more && _residue == 0); // Previous words must have been read.
- _counts = counts;
- assert((_counts._numDocs == 0) == (_counts._bitLength == 0));
- if (_counts._numDocs > 0) {
- read_word_start();
- }
+ Zc4PostingReaderBase::set_counts(*_decodeContext, counts);
}
template <bool bigEndian>
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
index d8161da15d5..59a660407b4 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h
@@ -2,14 +2,7 @@
#pragma once
-#include "zc4_posting_writer.h"
-#include <vespa/searchlib/index/postinglistfile.h>
-#include <vespa/fastos/file.h>
-#include "zc4_posting_params.h"
-
-namespace search::index {
- class PostingListCountFileSeqRead;
-}
+#include "zc4_posting_reader_base.h"
namespace search::diskindex {
@@ -23,57 +16,13 @@ namespace search::diskindex {
* interleaved.
*/
template <bool bigEndian>
-class Zc4PostingReader
+class Zc4PostingReader : public Zc4PostingReaderBase
{
protected:
using DecodeContext = bitcompression::FeatureDecodeContext<bigEndian>;
DecodeContext *_decodeContext;
- uint32_t _docIdK;
- uint32_t _prevDocId; // Previous document id
- uint32_t _numDocs; // Documents in chunk or word
- search::ComprFileReadContext _readContext;
- bool _has_more;
- Zc4PostingParams _posting_params;
- uint32_t _lastDocId; // last document in chunk or word
-
- ZcBuf _zcDocIds; // Document id deltas
- ZcBuf _l1Skip; // L1 skip info
- ZcBuf _l2Skip; // L2 skip info
- ZcBuf _l3Skip; // L3 skip info
- ZcBuf _l4Skip; // L4 skip info
-
- uint64_t _numWords; // Number of words in file
- uint32_t _chunkNo; // Chunk number
-
- // Variables for validating skip information while reading
- uint32_t _l1SkipDocId;
- uint32_t _l1SkipDocIdPos;
- uint64_t _l1SkipFeaturesPos;
- uint32_t _l2SkipDocId;
- uint32_t _l2SkipDocIdPos;
- uint32_t _l2SkipL1SkipPos;
- uint64_t _l2SkipFeaturesPos;
- uint32_t _l3SkipDocId;
- uint32_t _l3SkipDocIdPos;
- uint32_t _l3SkipL1SkipPos;
- uint32_t _l3SkipL2SkipPos;
- uint64_t _l3SkipFeaturesPos;
- uint32_t _l4SkipDocId;
- uint32_t _l4SkipDocIdPos;
- uint32_t _l4SkipL1SkipPos;
- uint32_t _l4SkipL2SkipPos;
- uint32_t _l4SkipL3SkipPos;
- uint64_t _l4SkipFeaturesPos;
-
- // Variable for validating chunk information while reading
- uint64_t _featuresSize;
- index::PostingListCounts _counts;
-
- uint32_t _residue; // Number of unread documents after word header
- void read_common_word_doc_id_and_features(index::DocIdAndFeatures &features);
- void read_word_start_with_skip();
void read_word_start();
public:
Zc4PostingReader(bool dynamic_k);
@@ -86,8 +35,6 @@ public:
void set_counts(const index::PostingListCounts &counts);
void set_decode_features(DecodeContext *decode_features);
DecodeContext &get_decode_features() const { return *_decodeContext; }
- ComprFileReadContext &get_read_context() { return _readContext; }
- Zc4PostingParams &get_posting_params() { return _posting_params; }
};
extern template class Zc4PostingReader<false>;
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp
new file mode 100644
index 00000000000..18963e22404
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp
@@ -0,0 +1,275 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "zc4_posting_reader_base.h"
+#include "zc4_posting_header.h"
+#include <vespa/searchlib/index/docidandfeatures.h>
+
+namespace search::diskindex {
+
+using index::PostingListCounts;
+using index::DocIdAndFeatures;
+using bitcompression::FeatureEncodeContext;
+using bitcompression::DecodeContext64Base;
+
+
+/*
+ * Construct a reader with no word selected.
+ *
+ * All docid state, skip validation state and counters start at zero and
+ * the buffers start empty. dynamic_k is forwarded to the posting
+ * parameters; the remaining posting parameter values are fixed here
+ * (presumably min skip docs, min chunk docs, docid limit and
+ * encode-features, in that order -- confirm against Zc4PostingParams).
+ */
+Zc4PostingReaderBase::Zc4PostingReaderBase(bool dynamic_k)
+    : _doc_id_k(K_VALUE_ZCPOSTING_DELTA_DOCID),
+      _prev_doc_id(0),
+      _num_docs(0),
+      _readContext(sizeof(uint64_t)),
+      _has_more(false),
+      _posting_params(64, 1 << 30, 10000000, dynamic_k, true),
+      _last_doc_id(0),
+      _zcDocIds(),
+      _l1Skip(),
+      _l2Skip(),
+      _l3Skip(),
+      _l4Skip(),
+      _numWords(0),   // zeroed so the member never holds an indeterminate value
+      _chunkNo(0),
+      _l1SkipDocId(0),
+      _l1SkipDocIdPos(0),
+      _l1SkipFeaturesPos(0),
+      _l2SkipDocId(0),
+      _l2SkipDocIdPos(0),
+      _l2SkipL1SkipPos(0),
+      _l2SkipFeaturesPos(0),
+      _l3SkipDocId(0),
+      _l3SkipDocIdPos(0),
+      _l3SkipL1SkipPos(0),
+      _l3SkipL2SkipPos(0),
+      _l3SkipFeaturesPos(0),
+      _l4SkipDocId(0),
+      _l4SkipDocIdPos(0),
+      _l4SkipL1SkipPos(0),
+      _l4SkipL2SkipPos(0),
+      _l4SkipL3SkipPos(0),
+      _l4SkipFeaturesPos(0),
+      _features_size(0),
+      _counts(),
+      _residue(0)
+{
+}
+
+// Out-of-line destructor with no work of its own; only the members are destroyed.
+Zc4PostingReaderBase::~Zc4PostingReaderBase() = default;
+/*
+ * Decode the next document id of a word whose docid deltas are held in
+ * the _zcDocIds buffer (docids split from features).
+ *
+ * If the docid buffer is exhausted and more chunks follow, the header of
+ * the next chunk is read first. The decoded docid is stored in
+ * _prev_doc_id; features are not read here and _residue is not changed.
+ *
+ * Whenever the docid passes the current L1 skip docid, the next L1 skip
+ * entry is decoded, cascading to L2, L3 and L4 when their skip docids
+ * are passed as well. The decoded values are only used to check, with
+ * asserts, that the skip info agrees with the actual buffer positions
+ * and with the read offset of decode_context.
+ */
+void
+Zc4PostingReaderBase::read_common_word_doc_id(DecodeContext64Base &decode_context)
+{
+ if ((_zcDocIds._valI >= _zcDocIds._valE) && _has_more) {
+ read_word_start(decode_context); // Read start of next chunk
+ }
+ // Split docid & features.
+ assert(_zcDocIds._valI < _zcDocIds._valE);
+ uint32_t docIdPos = _zcDocIds.pos(); // position before decoding, compared with skip info below
+ uint32_t docId = _prev_doc_id + 1 + _zcDocIds.decode(); // stored value is the delta minus one
+ _prev_doc_id = docId;
+ assert(docId <= _last_doc_id);
+ if (docId > _l1SkipDocId) { // passed an L1 skip point: consume and validate the L1 entry
+ _l1SkipDocIdPos += _l1Skip.decode() + 1;
+ assert(docIdPos == _l1SkipDocIdPos);
+ uint64_t featuresPos = decode_context.getReadOffset();
+ if (_posting_params._encode_features) {
+ _l1SkipFeaturesPos += _l1Skip.decode() + 1;
+ assert(featuresPos == _l1SkipFeaturesPos);
+ }
+ (void) featuresPos; // only referenced inside asserts in some configurations
+ if (docId > _l2SkipDocId) { // also passed an L2 skip point
+ _l2SkipDocIdPos += _l2Skip.decode() + 1;
+ assert(docIdPos == _l2SkipDocIdPos);
+ if (_posting_params._encode_features) {
+ _l2SkipFeaturesPos += _l2Skip.decode() + 1;
+ assert(featuresPos == _l2SkipFeaturesPos);
+ }
+ _l2SkipL1SkipPos += _l2Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l2SkipL1SkipPos);
+ if (docId > _l3SkipDocId) { // also passed an L3 skip point
+ _l3SkipDocIdPos += _l3Skip.decode() + 1;
+ assert(docIdPos == _l3SkipDocIdPos);
+ if (_posting_params._encode_features) {
+ _l3SkipFeaturesPos += _l3Skip.decode() + 1;
+ assert(featuresPos == _l3SkipFeaturesPos);
+ }
+ _l3SkipL1SkipPos += _l3Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l3SkipL1SkipPos);
+ _l3SkipL2SkipPos += _l3Skip.decode() + 1;
+ assert(_l2Skip.pos() == _l3SkipL2SkipPos);
+ if (docId > _l4SkipDocId) { // also passed an L4 skip point
+ _l4SkipDocIdPos += _l4Skip.decode() + 1;
+ assert(docIdPos == _l4SkipDocIdPos);
+ (void) docIdPos;
+ if (_posting_params._encode_features) {
+ _l4SkipFeaturesPos += _l4Skip.decode() + 1;
+ assert(featuresPos == _l4SkipFeaturesPos);
+ }
+ _l4SkipL1SkipPos += _l4Skip.decode() + 1;
+ assert(_l1Skip.pos() == _l4SkipL1SkipPos);
+ _l4SkipL2SkipPos += _l4Skip.decode() + 1;
+ assert(_l2Skip.pos() == _l4SkipL2SkipPos);
+ _l4SkipL3SkipPos += _l4Skip.decode() + 1;
+ assert(_l3Skip.pos() == _l4SkipL3SkipPos);
+ _l4SkipDocId += _l4Skip.decode() + 1; // docid of the next L4 skip point
+ assert(_l4SkipDocId <= _last_doc_id);
+ assert(_l4SkipDocId >= docId);
+ }
+ _l3SkipDocId += _l3Skip.decode() + 1; // docid of the next L3 skip point
+ assert(_l3SkipDocId <= _last_doc_id);
+ assert(_l3SkipDocId <= _l4SkipDocId);
+ assert(_l3SkipDocId >= docId);
+ }
+ _l2SkipDocId += _l2Skip.decode() + 1; // docid of the next L2 skip point
+ assert(_l2SkipDocId <= _last_doc_id);
+ assert(_l2SkipDocId <= _l4SkipDocId);
+ assert(_l2SkipDocId <= _l3SkipDocId);
+ assert(_l2SkipDocId >= docId);
+ }
+ _l1SkipDocId += _l1Skip.decode() + 1; // docid of the next L1 skip point
+ assert(_l1SkipDocId <= _last_doc_id);
+ assert(_l1SkipDocId <= _l4SkipDocId);
+ assert(_l1SkipDocId <= _l3SkipDocId);
+ assert(_l1SkipDocId <= _l2SkipDocId);
+ assert(_l1SkipDocId >= docId);
+ }
+ if (docId < _last_doc_id) {
+ // Assert more space available when not yet at last docid
+ assert(_zcDocIds._valI < _zcDocIds._valE);
+ } else {
+ // Assert that space has been used when at last docid
+ assert(_zcDocIds._valI == _zcDocIds._valE);
+ // Assert that we've read to end of skip info
+ assert(_l1SkipDocId == _last_doc_id);
+ assert(_l2SkipDocId == _last_doc_id);
+ assert(_l3SkipDocId == _last_doc_id);
+ assert(_l4SkipDocId == _last_doc_id);
+ if (!_has_more) {
+ _chunkNo = 0; // last chunk of the word is finished
+ }
+ }
+}
+/*
+ * Finish the start of a word or chunk whose header has already been
+ * read into header (called from read_word_start).
+ *
+ * Advances or resets the chunk number, checks the header values against
+ * _counts with asserts, loads the docid buffer and the L1-L4 skip
+ * buffers from decode_context using the sizes in header, and primes the
+ * skip validation state. _has_more is updated from header._has_more
+ * only at the end, so the earlier checks still see the value from the
+ * previous chunk.
+ */
+void
+Zc4PostingReaderBase::read_word_start_with_skip(DecodeContext64Base &decode_context, const Zc4PostingHeader &header)
+{
+ if (_has_more) { // continuing the same word: next chunk
+ ++_chunkNo;
+ } else {
+ _chunkNo = 0;
+ }
+ assert(_num_docs >= _posting_params._min_skip_docs || _has_more);
+ bool has_more = header._has_more; // whether yet another chunk follows this one
+ if (_has_more || has_more) { // word is split into chunks: validate against per-chunk counts
+ assert(has_more == (_chunkNo + 1 < _counts._segments.size()));
+ assert(_num_docs == _counts._segments[_chunkNo]._numDocs);
+ if (has_more) {
+ assert(_num_docs >= _posting_params._min_skip_docs);
+ assert(_num_docs >= _posting_params._min_chunk_docs);
+ }
+ } else {
+ assert(_num_docs >= _posting_params._min_skip_docs);
+ assert(_num_docs == _counts._numDocs);
+ }
+ uint32_t docIdsSize = header._doc_ids_size;
+ uint32_t l1SkipSize = header._l1_skip_size;
+ uint32_t l2SkipSize = header._l2_skip_size;
+ uint32_t l3SkipSize = header._l3_skip_size;
+ uint32_t l4SkipSize = header._l4_skip_size;
+ if (_has_more || has_more) {
+ assert(_last_doc_id == _counts._segments[_chunkNo]._lastDoc);
+ }
+ _zcDocIds.clearReserve(docIdsSize);
+ _l1Skip.clearReserve(l1SkipSize);
+ _l2Skip.clearReserve(l2SkipSize);
+ _l3Skip.clearReserve(l3SkipSize);
+ _l4Skip.clearReserve(l4SkipSize);
+ decode_context.readBytes(_zcDocIds._valI, docIdsSize); // docid bytes are read unconditionally
+ _zcDocIds._valE = _zcDocIds._valI + docIdsSize;
+ if (l1SkipSize > 0) {
+ decode_context.readBytes(_l1Skip._valI, l1SkipSize);
+ }
+ _l1Skip._valE = _l1Skip._valI + l1SkipSize;
+ if (l2SkipSize > 0) {
+ decode_context.readBytes(_l2Skip._valI, l2SkipSize);
+ }
+ _l2Skip._valE = _l2Skip._valI + l2SkipSize;
+ if (l3SkipSize > 0) {
+ decode_context.readBytes(_l3Skip._valI, l3SkipSize);
+ }
+ _l3Skip._valE = _l3Skip._valI + l3SkipSize;
+ if (l4SkipSize > 0) {
+ decode_context.readBytes(_l4Skip._valI, l4SkipSize);
+ }
+ _l4Skip._valE = _l4Skip._valI + l4SkipSize;
+ /* First skip docid per level; a level without skip info uses the last docid. */
+ if (l1SkipSize > 0) {
+ _l1SkipDocId = _l1Skip.decode() + 1 + _prev_doc_id;
+ } else {
+ _l1SkipDocId = _last_doc_id;
+ }
+ if (l2SkipSize > 0) {
+ _l2SkipDocId = _l2Skip.decode() + 1 + _prev_doc_id;
+ } else {
+ _l2SkipDocId = _last_doc_id;
+ }
+ if (l3SkipSize > 0) {
+ _l3SkipDocId = _l3Skip.decode() + 1 + _prev_doc_id;
+ } else {
+ _l3SkipDocId = _last_doc_id;
+ }
+ if (l4SkipSize > 0) {
+ _l4SkipDocId = _l4Skip.decode() + 1 + _prev_doc_id;
+ } else {
+ _l4SkipDocId = _last_doc_id;
+ }
+ _l1SkipDocIdPos = 0;
+ _l1SkipFeaturesPos = decode_context.getReadOffset(); // feature positions start at the current read offset
+ _l2SkipDocIdPos = 0;
+ _l2SkipL1SkipPos = 0;
+ _l2SkipFeaturesPos = decode_context.getReadOffset();
+ _l3SkipDocIdPos = 0;
+ _l3SkipL1SkipPos = 0;
+ _l3SkipL2SkipPos = 0;
+ _l3SkipFeaturesPos = decode_context.getReadOffset();
+ _l4SkipDocIdPos = 0;
+ _l4SkipL1SkipPos = 0;
+ _l4SkipL2SkipPos = 0;
+ _l4SkipL3SkipPos = 0;
+ _l4SkipFeaturesPos = decode_context.getReadOffset();
+ _has_more = has_more;
+ // Decode context is now positioned at start of features
+}
+
+/*
+ * Read the posting list header for the next word, or for the next chunk
+ * of the current word, and refresh the reader state from it. When the
+ * word or chunk carries skip info, the docid and skip buffers are loaded
+ * as well.
+ */
+void
+Zc4PostingReaderBase::read_word_start(DecodeContext64Base &decode_context)
+{
+    const bool continuing = _has_more; // true when this header belongs to a follow-up chunk
+    Zc4PostingHeader hdr;
+    hdr._has_more = continuing;
+    hdr.read(decode_context, _posting_params);
+    // A follow-up chunk continues from the last docid of the previous
+    // chunk, so this must be captured before _last_doc_id is overwritten.
+    if (continuing) {
+        _prev_doc_id = _last_doc_id;
+    } else {
+        _prev_doc_id = 0u;
+    }
+    _num_docs = hdr._num_docs;
+    _residue = _num_docs;
+    _doc_id_k = hdr._doc_id_k;
+    _last_doc_id = hdr._last_doc_id;
+    _features_size = hdr._features_size;
+    assert(_num_docs <= _counts._numDocs);
+    assert(_has_more ||
+           _num_docs == _counts._numDocs ||
+           _num_docs >= _posting_params._min_chunk_docs);
+
+    const bool with_skip = _has_more || (_num_docs >= _posting_params._min_skip_docs);
+    if (with_skip) {
+        read_word_start_with_skip(decode_context, hdr);
+    }
+}
+
+/*
+ * Install the counts for the next word and, unless the word is empty,
+ * read its posting list header from decode_context.
+ */
+void
+Zc4PostingReaderBase::set_counts(DecodeContext64Base &decode_context, const PostingListCounts &counts)
+{
+    // The previous word must have been consumed completely.
+    assert(!_has_more && _residue == 0);
+    _counts = counts;
+    const bool empty_word = (_counts._numDocs == 0);
+    // A word has no documents exactly when it occupies no bits.
+    assert(empty_word == (_counts._bitLength == 0));
+    if (empty_word) {
+        return;
+    }
+    read_word_start(decode_context);
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h
new file mode 100644
index 00000000000..f19823936ba
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h
@@ -0,0 +1,79 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "zc4_posting_params.h"
+#include "zcbuf.h"
+#include <vespa/searchlib/bitcompression/compression.h>
+#include <vespa/searchlib/index/postinglistcounts.h>
+
+namespace search::diskindex {
+
+class Zc4PostingHeader;
+
+/*
+ * Base class for reading posting lists that might have basic skip info.
+ *
+ * Holds the reader state that does not depend on the endianness of the
+ * bit stream. Member functions that read from the stream receive the
+ * decode context as a bitcompression::DecodeContext64Base reference.
+ * Copying and moving are disabled.
+ */
+class Zc4PostingReaderBase
+{
+
+protected:
+ uint32_t _doc_id_k; // K parameter for decoding docid deltas, taken from the posting header
+ uint32_t _prev_doc_id; // Previous document id
+ uint32_t _num_docs; // Documents in chunk or word
+ search::ComprFileReadContext _readContext; // Returned by get_read_context()
+ bool _has_more; // True while another chunk follows the current one
+ Zc4PostingParams _posting_params; // Returned by get_posting_params()
+ uint32_t _last_doc_id; // last document in chunk or word
+
+ ZcBuf _zcDocIds; // Document id deltas
+ ZcBuf _l1Skip; // L1 skip info
+ ZcBuf _l2Skip; // L2 skip info
+ ZcBuf _l3Skip; // L3 skip info
+ ZcBuf _l4Skip; // L4 skip info
+
+ uint64_t _numWords; // Number of words in file
+ uint32_t _chunkNo; // Chunk number
+
+ // Variables for validating skip information while reading
+ uint32_t _l1SkipDocId;
+ uint32_t _l1SkipDocIdPos;
+ uint64_t _l1SkipFeaturesPos;
+ uint32_t _l2SkipDocId;
+ uint32_t _l2SkipDocIdPos;
+ uint32_t _l2SkipL1SkipPos;
+ uint64_t _l2SkipFeaturesPos;
+ uint32_t _l3SkipDocId;
+ uint32_t _l3SkipDocIdPos;
+ uint32_t _l3SkipL1SkipPos;
+ uint32_t _l3SkipL2SkipPos;
+ uint64_t _l3SkipFeaturesPos;
+ uint32_t _l4SkipDocId;
+ uint32_t _l4SkipDocIdPos;
+ uint32_t _l4SkipL1SkipPos;
+ uint32_t _l4SkipL2SkipPos;
+ uint32_t _l4SkipL3SkipPos;
+ uint64_t _l4SkipFeaturesPos;
+
+ // Variable for validating chunk information while reading
+ uint64_t _features_size;
+ index::PostingListCounts _counts;
+
+ uint32_t _residue; // Number of unread documents after word header
+ void read_common_word_doc_id(bitcompression::DecodeContext64Base &decode_context); // Decode next docid from the docid buffer and validate skip info
+ void read_word_start_with_skip(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingHeader &header); // Load docid and skip buffers described by header
+ void read_word_start(bitcompression::DecodeContext64Base &decode_context); // Read header of next word or chunk
+public:
+ Zc4PostingReaderBase(bool dynamic_k);
+ Zc4PostingReaderBase(const Zc4PostingReaderBase &) = delete;
+ Zc4PostingReaderBase(Zc4PostingReaderBase &&) = delete;
+ Zc4PostingReaderBase &operator=(const Zc4PostingReaderBase &) = delete;
+ Zc4PostingReaderBase &operator=(Zc4PostingReaderBase &&) = delete;
+ ~Zc4PostingReaderBase();
+ void read_doc_id_and_features(index::DocIdAndFeatures &features); // NOTE(review): no definition in zc4_posting_reader_base.cpp; Zc4PostingReader<bigEndian> defines a member of the same name -- confirm this declaration is needed
+ void set_counts(bitcompression::DecodeContext64Base &decode_context, const index::PostingListCounts &counts); // Install counts for the next word and read its header if it has documents
+ ComprFileReadContext &get_read_context() { return _readContext; }
+ Zc4PostingParams &get_posting_params() { return _posting_params; }
+};
+
+}