From f036f3fd0ef94689676a6da541c8c1b00efb276d Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Thu, 2 May 2019 11:10:17 +0200 Subject: Move endian independent portion of Zc4PostingReader to Zc4PostingReaderBase. Use common code for reading posting list header. --- .../src/vespa/searchlib/diskindex/CMakeLists.txt | 1 + .../searchlib/diskindex/zc4_posting_reader.cpp | 402 ++------------------- .../vespa/searchlib/diskindex/zc4_posting_reader.h | 57 +-- .../diskindex/zc4_posting_reader_base.cpp | 275 ++++++++++++++ .../searchlib/diskindex/zc4_posting_reader_base.h | 79 ++++ 5 files changed, 382 insertions(+), 432 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp create mode 100644 searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h (limited to 'searchlib') diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt index 2fea4f2bab7..ba608467c8a 100644 --- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt @@ -20,6 +20,7 @@ vespa_add_library(searchlib_diskindex OBJECT wordnummapper.cpp zc4_posting_header.cpp zc4_posting_reader.cpp + zc4_posting_reader_base.cpp zc4_posting_writer.cpp zc4_posting_writer_base.cpp zcbuf.cpp diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp index c0e1115521c..a09c26d7985 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp @@ -1,6 +1,7 @@ // Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "zc4_posting_reader.h" +#include "zc4_posting_header.h" #include namespace search::diskindex { @@ -12,41 +13,8 @@ using bitcompression::FeatureEncodeContext; template Zc4PostingReader::Zc4PostingReader(bool dynamic_k) - : _decodeContext(nullptr), - _docIdK(K_VALUE_ZCPOSTING_DELTA_DOCID), - _prevDocId(0), - _numDocs(0), - _readContext(sizeof(uint64_t)), - _has_more(false), - _posting_params(64, 1 << 30, 10000000, dynamic_k, true), - _lastDocId(0), - _zcDocIds(), - _l1Skip(), - _l2Skip(), - _l3Skip(), - _l4Skip(), - _chunkNo(0), - _l1SkipDocId(0), - _l1SkipDocIdPos(0), - _l1SkipFeaturesPos(0), - _l2SkipDocId(0), - _l2SkipDocIdPos(0), - _l2SkipL1SkipPos(0), - _l2SkipFeaturesPos(0), - _l3SkipDocId(0), - _l3SkipDocIdPos(0), - _l3SkipL1SkipPos(0), - _l3SkipL2SkipPos(0), - _l3SkipFeaturesPos(0), - _l4SkipDocId(0), - _l4SkipDocIdPos(0), - _l4SkipL1SkipPos(0), - _l4SkipL2SkipPos(0), - _l4SkipL3SkipPos(0), - _l4SkipFeaturesPos(0), - _featuresSize(0), - _counts(), - _residue(0) + : Zc4PostingReaderBase(dynamic_k), + _decodeContext(nullptr) { } @@ -55,372 +23,52 @@ Zc4PostingReader::~Zc4PostingReader() { } -template -void -Zc4PostingReader::read_common_word_doc_id_and_features(DocIdAndFeatures &features) -{ - if ((_zcDocIds._valI >= _zcDocIds._valE) && _has_more) { - read_word_start(); // Read start of next chunk - } - // Split docid & features. - assert(_zcDocIds._valI < _zcDocIds._valE); - uint32_t docIdPos = _zcDocIds.pos(); - uint32_t docId = _prevDocId + 1 + _zcDocIds.decode(); - features.set_doc_id(docId); - _prevDocId = docId; - assert(docId <= _lastDocId); - if (docId > _l1SkipDocId) { - _l1SkipDocIdPos += _l1Skip.decode() + 1; - assert(docIdPos == _l1SkipDocIdPos); - uint64_t featuresPos = _decodeContext->getReadOffset(); - if (_posting_params._encode_features) { - _l1SkipFeaturesPos += _l1Skip.decode() + 1; - assert(featuresPos == _l1SkipFeaturesPos); - } - (void) featuresPos; - if (docId > _l2SkipDocId) { - _l2SkipDocIdPos += _l2Skip.decode() + 1; - assert(docIdPos == _l2SkipDocIdPos); - if (_posting_params._encode_features) { - _l2SkipFeaturesPos += _l2Skip.decode() + 1; - assert(featuresPos == _l2SkipFeaturesPos); - } - _l2SkipL1SkipPos += _l2Skip.decode() + 1; - assert(_l1Skip.pos() == _l2SkipL1SkipPos); - if (docId > _l3SkipDocId) { - _l3SkipDocIdPos += _l3Skip.decode() + 1; - assert(docIdPos == _l3SkipDocIdPos); - if (_posting_params._encode_features) { - _l3SkipFeaturesPos += _l3Skip.decode() + 1; - assert(featuresPos == _l3SkipFeaturesPos); - } - _l3SkipL1SkipPos += _l3Skip.decode() + 1; - assert(_l1Skip.pos() == _l3SkipL1SkipPos); - _l3SkipL2SkipPos += _l3Skip.decode() + 1; - assert(_l2Skip.pos() == _l3SkipL2SkipPos); - if (docId > _l4SkipDocId) { - _l4SkipDocIdPos += _l4Skip.decode() + 1; - assert(docIdPos == _l4SkipDocIdPos); - (void) docIdPos; - if (_posting_params._encode_features) { - _l4SkipFeaturesPos += _l4Skip.decode() + 1; - assert(featuresPos == _l4SkipFeaturesPos); - } - _l4SkipL1SkipPos += _l4Skip.decode() + 1; - assert(_l1Skip.pos() == _l4SkipL1SkipPos); - _l4SkipL2SkipPos += _l4Skip.decode() + 1; - assert(_l2Skip.pos() == _l4SkipL2SkipPos); - _l4SkipL3SkipPos += _l4Skip.decode() + 1; - assert(_l3Skip.pos() == _l4SkipL3SkipPos); - _l4SkipDocId += _l4Skip.decode() + 1; - assert(_l4SkipDocId <= _lastDocId); - assert(_l4SkipDocId >= docId); - } - _l3SkipDocId += _l3Skip.decode() + 1; - assert(_l3SkipDocId <= _lastDocId); - assert(_l3SkipDocId <= _l4SkipDocId); - assert(_l3SkipDocId >= docId); - } - _l2SkipDocId += _l2Skip.decode() + 1; - assert(_l2SkipDocId <= _lastDocId); - assert(_l2SkipDocId <= _l4SkipDocId); - assert(_l2SkipDocId <= _l3SkipDocId); - assert(_l2SkipDocId >= docId); - } - _l1SkipDocId += _l1Skip.decode() + 1; - assert(_l1SkipDocId <= _lastDocId); - assert(_l1SkipDocId <= _l4SkipDocId); - assert(_l1SkipDocId <= _l3SkipDocId); - assert(_l1SkipDocId <= _l2SkipDocId); - assert(_l1SkipDocId >= docId); - } - if (docId < _lastDocId) { - // Assert more space available when not yet at last docid - assert(_zcDocIds._valI < _zcDocIds._valE); - } else { - // Assert that space has been used when at last docid - assert(_zcDocIds._valI == _zcDocIds._valE); - // Assert that we've read to end of skip info - assert(_l1SkipDocId == _lastDocId); - assert(_l2SkipDocId == _lastDocId); - assert(_l3SkipDocId == _lastDocId); - assert(_l4SkipDocId == _lastDocId); - if (!_has_more) { - _chunkNo = 0; - } - } - if (_posting_params._encode_features) { - _decodeContext->readFeatures(features); - } - --_residue; -} - template void Zc4PostingReader::read_doc_id_and_features(DocIdAndFeatures &features) { if (_residue == 0 && !_has_more) { - if (_residue == 0) { - // Don't read past end of posting list. - features.clear(static_cast(-1)); - return; - } - } - if (_lastDocId > 0) { - read_common_word_doc_id_and_features(features); + // Don't read past end of posting list. + features.clear(static_cast(-1)); return; } - // Interleaves docid & features - using EC = FeatureEncodeContext; - DecodeContext &d = *_decodeContext; - uint32_t length; - uint64_t val64; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - - UC64_DECODEEXPGOLOMB_SMALL_NS(o, _docIdK, EC); - uint32_t docId = _prevDocId + 1 + val64; - features.set_doc_id(docId); - _prevDocId = docId; - UC64_DECODECONTEXT_STORE(o, d._); - if (__builtin_expect(oCompr >= d._valE, false)) { - _readContext.readComprBuffer(); - } - if (_posting_params._encode_features) { - _decodeContext->readFeatures(features); - } - --_residue; -} - -template -void -Zc4PostingReader::read_word_start_with_skip() -{ - using EC = FeatureEncodeContext; - DecodeContext &d = *_decodeContext; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - uint32_t length; - uint64_t val64; - const uint64_t *valE = d._valE; - - if (_has_more) { - ++_chunkNo; - } else { - _chunkNo = 0; - } - assert(_numDocs >= _posting_params._min_skip_docs || _has_more); - bool has_more = false; - if (__builtin_expect(_numDocs >= _posting_params._min_chunk_docs, false)) { - if (bigEndian) { - has_more = static_cast(oVal) < 0; - oVal <<= 1; - } else { - has_more = (oVal & 1) != 0; - oVal >>= 1; - } - length = 1; - UC64_READBITS_NS(o, EC); - } - if (_posting_params._dynamic_k) { - _docIdK = EC::calcDocIdK((_has_more || has_more) ? 1 : _numDocs, - _posting_params._doc_id_limit); - } - if (_has_more || has_more) { - assert(has_more == (_chunkNo + 1 < _counts._segments.size())); - assert(_numDocs == _counts._segments[_chunkNo]._numDocs); - if (has_more) { - assert(_numDocs >= _posting_params._min_skip_docs); - assert(_numDocs >= _posting_params._min_chunk_docs); - } - } else { - assert(_numDocs >= _posting_params._min_skip_docs); - assert(_numDocs == _counts._numDocs); - } - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); - uint32_t docIdsSize = val64 + 1; - UC64_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L1SKIPSIZE, - EC); - uint32_t l1SkipSize = val64; - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint32_t l2SkipSize = 0; - if (l1SkipSize != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); - l2SkipSize = val64; - } - uint32_t l3SkipSize = 0; - if (l2SkipSize != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); - l3SkipSize = val64; - } - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint32_t l4SkipSize = 0; - if (l3SkipSize != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); - l4SkipSize = val64; - } - if (_posting_params._encode_features) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); - _featuresSize = val64; - } - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - if (_posting_params._dynamic_k) { - UC64_DECODEEXPGOLOMB_NS(o, _docIdK, EC); + if (_last_doc_id > 0) { + read_common_word_doc_id(*_decodeContext); } else { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_LASTDOCID, EC); - } - _lastDocId = _posting_params._doc_id_limit - 1 - val64; - if (_has_more || has_more) { - assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc); - } - - if (__builtin_expect(oCompr >= valE, false)) { + // Interleaves docid & features + using EC = FeatureEncodeContext; + DecodeContext &d = *_decodeContext; + uint32_t length; + uint64_t val64; + UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); + + UC64_DECODEEXPGOLOMB_SMALL_NS(o, _doc_id_k, EC); + uint32_t docId = _prev_doc_id + 1 + val64; + _prev_doc_id = docId; UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint64_t bytePad = oPreRead & 7; - if (bytePad > 0) { - length = bytePad; - if (bigEndian) { - oVal <<= length; - } else { - oVal >>= length; + if (__builtin_expect(oCompr >= d._valE, false)) { + _readContext.readComprBuffer(); } - UC64_READBITS_NS(o, EC); - } - UC64_DECODECONTEXT_STORE(o, d._); - if (__builtin_expect(oCompr >= valE, false)) { - _readContext.readComprBuffer(); - } - _zcDocIds.clearReserve(docIdsSize); - _l1Skip.clearReserve(l1SkipSize); - _l2Skip.clearReserve(l2SkipSize); - _l3Skip.clearReserve(l3SkipSize); - _l4Skip.clearReserve(l4SkipSize); - _decodeContext->readBytes(_zcDocIds._valI, docIdsSize); - _zcDocIds._valE = _zcDocIds._valI + docIdsSize; - if (l1SkipSize > 0) { - _decodeContext->readBytes(_l1Skip._valI, l1SkipSize); - } - _l1Skip._valE = _l1Skip._valI + l1SkipSize; - if (l2SkipSize > 0) { - _decodeContext->readBytes(_l2Skip._valI, l2SkipSize); - } - _l2Skip._valE = _l2Skip._valI + l2SkipSize; - if (l3SkipSize > 0) { - _decodeContext->readBytes(_l3Skip._valI, l3SkipSize); } - _l3Skip._valE = _l3Skip._valI + l3SkipSize; - if (l4SkipSize > 0) { - _decodeContext->readBytes(_l4Skip._valI, l4SkipSize); - } - _l4Skip._valE = _l4Skip._valI + l4SkipSize; - - if (l1SkipSize > 0) { - _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId; - } else { - _l1SkipDocId = _lastDocId; - } - if (l2SkipSize > 0) { - _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId; - } else { - _l2SkipDocId = _lastDocId; - } - if (l3SkipSize > 0) { - _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId; - } else { - _l3SkipDocId = _lastDocId; - } - if (l4SkipSize > 0) { - _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId; - } else { - _l4SkipDocId = _lastDocId; + features.set_doc_id(_prev_doc_id); + if (_posting_params._encode_features) { + _decodeContext->readFeatures(features); } - _l1SkipDocIdPos = 0; - _l1SkipFeaturesPos = _decodeContext->getReadOffset(); - _l2SkipDocIdPos = 0; - _l2SkipL1SkipPos = 0; - _l2SkipFeaturesPos = _decodeContext->getReadOffset(); - _l3SkipDocIdPos = 0; - _l3SkipL1SkipPos = 0; - _l3SkipL2SkipPos = 0; - _l3SkipFeaturesPos = _decodeContext->getReadOffset(); - _l4SkipDocIdPos = 0; - _l4SkipL1SkipPos = 0; - _l4SkipL2SkipPos = 0; - _l4SkipL3SkipPos = 0; - _l4SkipFeaturesPos = _decodeContext->getReadOffset(); - _has_more = has_more; - // Decode context is now positioned at start of features + --_residue; } template void Zc4PostingReader::read_word_start() { - using EC = FeatureEncodeContext; - UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_); - uint32_t length; - uint64_t val64; - const uint64_t *valE = _decodeContext->_valE; - - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); - UC64_DECODECONTEXT_STORE(o, _decodeContext->_); - if (oCompr >= valE) { - _readContext.readComprBuffer(); - } - _numDocs = static_cast(val64) + 1; - _residue = _numDocs; - _prevDocId = _has_more ? _lastDocId : 0u; - assert(_numDocs <= _counts._numDocs); - assert(_numDocs == _counts._numDocs || - _numDocs >= _posting_params._min_chunk_docs || - _has_more); - - if (_numDocs >= _posting_params._min_skip_docs || _has_more) { - read_word_start_with_skip(); - // Decode context is not positioned at start of features - } else { - if (_posting_params._dynamic_k) { - _docIdK = EC::calcDocIdK(_numDocs, _posting_params._doc_id_limit); - } - _lastDocId = 0u; - // Decode context is not positioned at start of docids & features - } + Zc4PostingReaderBase::read_word_start(*_decodeContext); } template void Zc4PostingReader::set_counts(const PostingListCounts &counts) { - assert(!_has_more && _residue == 0); // Previous words must have been read. - _counts = counts; - assert((_counts._numDocs == 0) == (_counts._bitLength == 0)); - if (_counts._numDocs > 0) { - read_word_start(); - } + Zc4PostingReaderBase::set_counts(*_decodeContext, counts); } template diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h index d8161da15d5..59a660407b4 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h @@ -2,14 +2,7 @@ #pragma once -#include "zc4_posting_writer.h" -#include -#include -#include "zc4_posting_params.h" - -namespace search::index { - class PostingListCountFileSeqRead; -} +#include "zc4_posting_reader_base.h" namespace search::diskindex { @@ -23,57 +16,13 @@ namespace search::diskindex { * interleaved. */ template -class Zc4PostingReader +class Zc4PostingReader : public Zc4PostingReaderBase { protected: using DecodeContext = bitcompression::FeatureDecodeContext; DecodeContext *_decodeContext; - uint32_t _docIdK; - uint32_t _prevDocId; // Previous document id - uint32_t _numDocs; // Documents in chunk or word - search::ComprFileReadContext _readContext; - bool _has_more; - Zc4PostingParams _posting_params; - uint32_t _lastDocId; // last document in chunk or word - - ZcBuf _zcDocIds; // Document id deltas - ZcBuf _l1Skip; // L1 skip info - ZcBuf _l2Skip; // L2 skip info - ZcBuf _l3Skip; // L3 skip info - ZcBuf _l4Skip; // L4 skip info - - uint64_t _numWords; // Number of words in file - uint32_t _chunkNo; // Chunk number - - // Variables for validating skip information while reading - uint32_t _l1SkipDocId; - uint32_t _l1SkipDocIdPos; - uint64_t _l1SkipFeaturesPos; - uint32_t _l2SkipDocId; - uint32_t _l2SkipDocIdPos; - uint32_t _l2SkipL1SkipPos; - uint64_t _l2SkipFeaturesPos; - uint32_t _l3SkipDocId; - uint32_t _l3SkipDocIdPos; - uint32_t _l3SkipL1SkipPos; - uint32_t _l3SkipL2SkipPos; - uint64_t _l3SkipFeaturesPos; - uint32_t _l4SkipDocId; - uint32_t _l4SkipDocIdPos; - uint32_t _l4SkipL1SkipPos; - uint32_t _l4SkipL2SkipPos; - uint32_t _l4SkipL3SkipPos; - uint64_t _l4SkipFeaturesPos; - - // Variable for validating chunk information while reading - uint64_t _featuresSize; - index::PostingListCounts _counts; - - uint32_t _residue; // Number of unread documents after word header - void read_common_word_doc_id_and_features(index::DocIdAndFeatures &features); - void read_word_start_with_skip(); void read_word_start(); public: Zc4PostingReader(bool dynamic_k); @@ -86,8 +35,6 @@ public: void set_counts(const index::PostingListCounts &counts); void set_decode_features(DecodeContext *decode_features); DecodeContext &get_decode_features() const { return *_decodeContext; } - ComprFileReadContext &get_read_context() { return _readContext; } - Zc4PostingParams &get_posting_params() { return _posting_params; } }; extern template class Zc4PostingReader; diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp new file mode 100644 index 00000000000..18963e22404 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp @@ -0,0 +1,275 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "zc4_posting_reader_base.h" +#include "zc4_posting_header.h" +#include + +namespace search::diskindex { + +using index::PostingListCounts; +using index::DocIdAndFeatures; +using bitcompression::FeatureEncodeContext; +using bitcompression::DecodeContext64Base; + + +Zc4PostingReaderBase::Zc4PostingReaderBase(bool dynamic_k) + : _doc_id_k(K_VALUE_ZCPOSTING_DELTA_DOCID), + _prev_doc_id(0), + _num_docs(0), + _readContext(sizeof(uint64_t)), + _has_more(false), + _posting_params(64, 1 << 30, 10000000, dynamic_k, true), + _last_doc_id(0), + _zcDocIds(), + _l1Skip(), + _l2Skip(), + _l3Skip(), + _l4Skip(), + _chunkNo(0), + _l1SkipDocId(0), + _l1SkipDocIdPos(0), + _l1SkipFeaturesPos(0), + _l2SkipDocId(0), + _l2SkipDocIdPos(0), + _l2SkipL1SkipPos(0), + _l2SkipFeaturesPos(0), + _l3SkipDocId(0), + _l3SkipDocIdPos(0), + _l3SkipL1SkipPos(0), + _l3SkipL2SkipPos(0), + _l3SkipFeaturesPos(0), + _l4SkipDocId(0), + _l4SkipDocIdPos(0), + _l4SkipL1SkipPos(0), + _l4SkipL2SkipPos(0), + _l4SkipL3SkipPos(0), + _l4SkipFeaturesPos(0), + _features_size(0), + _counts(), + _residue(0) +{ +} + +Zc4PostingReaderBase::~Zc4PostingReaderBase() +{ +} + +void +Zc4PostingReaderBase::read_common_word_doc_id(DecodeContext64Base &decode_context) +{ + if ((_zcDocIds._valI >= _zcDocIds._valE) && _has_more) { + read_word_start(decode_context); // Read start of next chunk + } + // Split docid & features. + assert(_zcDocIds._valI < _zcDocIds._valE); + uint32_t docIdPos = _zcDocIds.pos(); + uint32_t docId = _prev_doc_id + 1 + _zcDocIds.decode(); + _prev_doc_id = docId; + assert(docId <= _last_doc_id); + if (docId > _l1SkipDocId) { + _l1SkipDocIdPos += _l1Skip.decode() + 1; + assert(docIdPos == _l1SkipDocIdPos); + uint64_t featuresPos = decode_context.getReadOffset(); + if (_posting_params._encode_features) { + _l1SkipFeaturesPos += _l1Skip.decode() + 1; + assert(featuresPos == _l1SkipFeaturesPos); + } + (void) featuresPos; + if (docId > _l2SkipDocId) { + _l2SkipDocIdPos += _l2Skip.decode() + 1; + assert(docIdPos == _l2SkipDocIdPos); + if (_posting_params._encode_features) { + _l2SkipFeaturesPos += _l2Skip.decode() + 1; + assert(featuresPos == _l2SkipFeaturesPos); + } + _l2SkipL1SkipPos += _l2Skip.decode() + 1; + assert(_l1Skip.pos() == _l2SkipL1SkipPos); + if (docId > _l3SkipDocId) { + _l3SkipDocIdPos += _l3Skip.decode() + 1; + assert(docIdPos == _l3SkipDocIdPos); + if (_posting_params._encode_features) { + _l3SkipFeaturesPos += _l3Skip.decode() + 1; + assert(featuresPos == _l3SkipFeaturesPos); + } + _l3SkipL1SkipPos += _l3Skip.decode() + 1; + assert(_l1Skip.pos() == _l3SkipL1SkipPos); + _l3SkipL2SkipPos += _l3Skip.decode() + 1; + assert(_l2Skip.pos() == _l3SkipL2SkipPos); + if (docId > _l4SkipDocId) { + _l4SkipDocIdPos += _l4Skip.decode() + 1; + assert(docIdPos == _l4SkipDocIdPos); + (void) docIdPos; + if (_posting_params._encode_features) { + _l4SkipFeaturesPos += _l4Skip.decode() + 1; + assert(featuresPos == _l4SkipFeaturesPos); + } + _l4SkipL1SkipPos += _l4Skip.decode() + 1; + assert(_l1Skip.pos() == _l4SkipL1SkipPos); + _l4SkipL2SkipPos += _l4Skip.decode() + 1; + assert(_l2Skip.pos() == _l4SkipL2SkipPos); + _l4SkipL3SkipPos += _l4Skip.decode() + 1; + assert(_l3Skip.pos() == _l4SkipL3SkipPos); + _l4SkipDocId += _l4Skip.decode() + 1; + assert(_l4SkipDocId <= _last_doc_id); + assert(_l4SkipDocId >= docId); + } + _l3SkipDocId += _l3Skip.decode() + 1; + assert(_l3SkipDocId <= _last_doc_id); + assert(_l3SkipDocId <= _l4SkipDocId); + assert(_l3SkipDocId >= docId); + } + _l2SkipDocId += _l2Skip.decode() + 1; + assert(_l2SkipDocId <= _last_doc_id); + assert(_l2SkipDocId <= _l4SkipDocId); + assert(_l2SkipDocId <= _l3SkipDocId); + assert(_l2SkipDocId >= docId); + } + _l1SkipDocId += _l1Skip.decode() + 1; + assert(_l1SkipDocId <= _last_doc_id); + assert(_l1SkipDocId <= _l4SkipDocId); + assert(_l1SkipDocId <= _l3SkipDocId); + assert(_l1SkipDocId <= _l2SkipDocId); + assert(_l1SkipDocId >= docId); + } + if (docId < _last_doc_id) { + // Assert more space available when not yet at last docid + assert(_zcDocIds._valI < _zcDocIds._valE); + } else { + // Assert that space has been used when at last docid + assert(_zcDocIds._valI == _zcDocIds._valE); + // Assert that we've read to end of skip info + assert(_l1SkipDocId == _last_doc_id); + assert(_l2SkipDocId == _last_doc_id); + assert(_l3SkipDocId == _last_doc_id); + assert(_l4SkipDocId == _last_doc_id); + if (!_has_more) { + _chunkNo = 0; + } + } +} + +void +Zc4PostingReaderBase::read_word_start_with_skip(DecodeContext64Base &decode_context, const Zc4PostingHeader &header) +{ + if (_has_more) { + ++_chunkNo; + } else { + _chunkNo = 0; + } + assert(_num_docs >= _posting_params._min_skip_docs || _has_more); + bool has_more = header._has_more; + if (_has_more || has_more) { + assert(has_more == (_chunkNo + 1 < _counts._segments.size())); + assert(_num_docs == _counts._segments[_chunkNo]._numDocs); + if (has_more) { + assert(_num_docs >= _posting_params._min_skip_docs); + assert(_num_docs >= _posting_params._min_chunk_docs); + } + } else { + assert(_num_docs >= _posting_params._min_skip_docs); + assert(_num_docs == _counts._numDocs); + } + uint32_t docIdsSize = header._doc_ids_size; + uint32_t l1SkipSize = header._l1_skip_size; + uint32_t l2SkipSize = header._l2_skip_size; + uint32_t l3SkipSize = header._l3_skip_size; + uint32_t l4SkipSize = header._l4_skip_size; + if (_has_more || has_more) { + assert(_last_doc_id == _counts._segments[_chunkNo]._lastDoc); + } + _zcDocIds.clearReserve(docIdsSize); + _l1Skip.clearReserve(l1SkipSize); + _l2Skip.clearReserve(l2SkipSize); + _l3Skip.clearReserve(l3SkipSize); + _l4Skip.clearReserve(l4SkipSize); + decode_context.readBytes(_zcDocIds._valI, docIdsSize); + _zcDocIds._valE = _zcDocIds._valI + docIdsSize; + if (l1SkipSize > 0) { + decode_context.readBytes(_l1Skip._valI, l1SkipSize); + } + _l1Skip._valE = _l1Skip._valI + l1SkipSize; + if (l2SkipSize > 0) { + decode_context.readBytes(_l2Skip._valI, l2SkipSize); + } + _l2Skip._valE = _l2Skip._valI + l2SkipSize; + if (l3SkipSize > 0) { + decode_context.readBytes(_l3Skip._valI, l3SkipSize); + } + _l3Skip._valE = _l3Skip._valI + l3SkipSize; + if (l4SkipSize > 0) { + decode_context.readBytes(_l4Skip._valI, l4SkipSize); + } + _l4Skip._valE = _l4Skip._valI + l4SkipSize; + + if (l1SkipSize > 0) { + _l1SkipDocId = _l1Skip.decode() + 1 + _prev_doc_id; + } else { + _l1SkipDocId = _last_doc_id; + } + if (l2SkipSize > 0) { + _l2SkipDocId = _l2Skip.decode() + 1 + _prev_doc_id; + } else { + _l2SkipDocId = _last_doc_id; + } + if (l3SkipSize > 0) { + _l3SkipDocId = _l3Skip.decode() + 1 + _prev_doc_id; + } else { + _l3SkipDocId = _last_doc_id; + } + if (l4SkipSize > 0) { + _l4SkipDocId = _l4Skip.decode() + 1 + _prev_doc_id; + } else { + _l4SkipDocId = _last_doc_id; + } + _l1SkipDocIdPos = 0; + _l1SkipFeaturesPos = decode_context.getReadOffset(); + _l2SkipDocIdPos = 0; + _l2SkipL1SkipPos = 0; + _l2SkipFeaturesPos = decode_context.getReadOffset(); + _l3SkipDocIdPos = 0; + _l3SkipL1SkipPos = 0; + _l3SkipL2SkipPos = 0; + _l3SkipFeaturesPos = decode_context.getReadOffset(); + _l4SkipDocIdPos = 0; + _l4SkipL1SkipPos = 0; + _l4SkipL2SkipPos = 0; + _l4SkipL3SkipPos = 0; + _l4SkipFeaturesPos = decode_context.getReadOffset(); + _has_more = has_more; + // Decode context is now positioned at start of features +} + +void +Zc4PostingReaderBase::read_word_start(DecodeContext64Base &decode_context) +{ + Zc4PostingHeader header; + header._has_more = _has_more; + header.read(decode_context, _posting_params); + _num_docs = header._num_docs; + _residue = _num_docs; + _prev_doc_id = _has_more ? _last_doc_id : 0u; + _doc_id_k = header._doc_id_k; + _last_doc_id = header._last_doc_id; + _features_size = header._features_size; + assert(_num_docs <= _counts._numDocs); + assert(_num_docs == _counts._numDocs || + _num_docs >= _posting_params._min_chunk_docs || + _has_more); + + if (_num_docs >= _posting_params._min_skip_docs || _has_more) { + read_word_start_with_skip(decode_context, header); + } +} + +void +Zc4PostingReaderBase::set_counts(DecodeContext64Base &decode_context, const PostingListCounts &counts) +{ + assert(!_has_more && _residue == 0); // Previous words must have been read. + _counts = counts; + assert((_counts._numDocs == 0) == (_counts._bitLength == 0)); + if (_counts._numDocs > 0) { + read_word_start(decode_context); + } +} + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h new file mode 100644 index 00000000000..f19823936ba --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h @@ -0,0 +1,79 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "zc4_posting_params.h" +#include "zcbuf.h" +#include +#include + +namespace search::diskindex { + +class Zc4PostingHeader; + +/* + * Base class for reading posting lists that might have basic skip info. + */ +class Zc4PostingReaderBase +{ + +protected: + uint32_t _doc_id_k; + uint32_t _prev_doc_id; // Previous document id + uint32_t _num_docs; // Documents in chunk or word + search::ComprFileReadContext _readContext; + bool _has_more; + Zc4PostingParams _posting_params; + uint32_t _last_doc_id; // last document in chunk or word + + ZcBuf _zcDocIds; // Document id deltas + ZcBuf _l1Skip; // L1 skip info + ZcBuf _l2Skip; // L2 skip info + ZcBuf _l3Skip; // L3 skip info + ZcBuf _l4Skip; // L4 skip info + + uint64_t _numWords; // Number of words in file + uint32_t _chunkNo; // Chunk number + + // Variables for validating skip information while reading + uint32_t _l1SkipDocId; + uint32_t _l1SkipDocIdPos; + uint64_t _l1SkipFeaturesPos; + uint32_t _l2SkipDocId; + uint32_t _l2SkipDocIdPos; + uint32_t _l2SkipL1SkipPos; + uint64_t _l2SkipFeaturesPos; + uint32_t _l3SkipDocId; + uint32_t _l3SkipDocIdPos; + uint32_t _l3SkipL1SkipPos; + uint32_t _l3SkipL2SkipPos; + uint64_t _l3SkipFeaturesPos; + uint32_t _l4SkipDocId; + uint32_t _l4SkipDocIdPos; + uint32_t _l4SkipL1SkipPos; + uint32_t _l4SkipL2SkipPos; + uint32_t _l4SkipL3SkipPos; + uint64_t _l4SkipFeaturesPos; + + // Variable for validating chunk information while reading + uint64_t _features_size; + index::PostingListCounts _counts; + + uint32_t _residue; // Number of unread documents after word header + void read_common_word_doc_id(bitcompression::DecodeContext64Base &decode_context); + void read_word_start_with_skip(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingHeader &header); + void read_word_start(bitcompression::DecodeContext64Base &decode_context); +public: + Zc4PostingReaderBase(bool dynamic_k); + Zc4PostingReaderBase(const Zc4PostingReaderBase &) = delete; + Zc4PostingReaderBase(Zc4PostingReaderBase &&) = delete; + Zc4PostingReaderBase &operator=(const Zc4PostingReaderBase &) = delete; + Zc4PostingReaderBase &operator=(Zc4PostingReaderBase &&) = delete; + ~Zc4PostingReaderBase(); + void read_doc_id_and_features(index::DocIdAndFeatures &features); + void set_counts(bitcompression::DecodeContext64Base &decode_context, const index::PostingListCounts &counts); + ComprFileReadContext &get_read_context() { return _readContext; } + Zc4PostingParams &get_posting_params() { return _posting_params; } +}; + +} -- cgit v1.2.3