From 7553e0390c1ceb3834cba62774b3ddc77a6944d1 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Fri, 26 Apr 2019 11:08:17 +0200 Subject: Factor out Zc4PostingReader from Zc4PostingSeqRead. --- .../src/vespa/searchlib/diskindex/CMakeLists.txt | 1 + .../src/vespa/searchlib/diskindex/extposocc.cpp | 8 +- .../searchlib/diskindex/zc4_posting_reader.cpp | 424 ++++++++++++++++ .../vespa/searchlib/diskindex/zc4_posting_reader.h | 96 ++++ .../src/vespa/searchlib/diskindex/zcposocc.cpp | 26 +- searchlib/src/vespa/searchlib/diskindex/zcposocc.h | 2 +- .../src/vespa/searchlib/diskindex/zcposting.cpp | 561 ++------------------- .../src/vespa/searchlib/diskindex/zcposting.h | 92 +--- .../src/vespa/searchlib/index/postinglistfile.cpp | 2 - .../src/vespa/searchlib/index/postinglistfile.h | 31 -- 10 files changed, 573 insertions(+), 670 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp create mode 100644 searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h (limited to 'searchlib') diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt index 104994ad038..2fea4f2bab7 100644 --- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt @@ -19,6 +19,7 @@ vespa_add_library(searchlib_diskindex OBJECT pagedict4randread.cpp wordnummapper.cpp zc4_posting_header.cpp + zc4_posting_reader.cpp zc4_posting_writer.cpp zc4_posting_writer_base.cpp zcbuf.cpp diff --git a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp index f6e4da945e0..34e64a9b558 100644 --- a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp @@ -69,7 +69,7 @@ makePosOccWrite(const vespalib::string &name, fileHeader.getBigEndian() && fileHeader.getFormats().size() == 2 && fileHeader.getFormats()[0] == - 
ZcPosOccSeqRead::getIdentifier() && + Zc4PosOccSeqRead::getIdentifier(true) && fileHeader.getFormats()[1] == ZcPosOccSeqRead::getSubIdentifier()) { dynamicK = true; @@ -77,7 +77,7 @@ makePosOccWrite(const vespalib::string &name, fileHeader.getBigEndian() && fileHeader.getFormats().size() == 2 && fileHeader.getFormats()[0] == - Zc4PosOccSeqRead::getIdentifier() && + Zc4PosOccSeqRead::getIdentifier(false) && fileHeader.getFormats()[1] == Zc4PosOccSeqRead::getSubIdentifier()) { dynamicK = false; @@ -115,7 +115,7 @@ makePosOccRead(const vespalib::string &name, fileHeader.getBigEndian() && fileHeader.getFormats().size() == 2 && fileHeader.getFormats()[0] == - ZcPosOccSeqRead::getIdentifier() && + Zc4PosOccSeqRead::getIdentifier(true) && fileHeader.getFormats()[1] == ZcPosOccSeqRead::getSubIdentifier()) { dynamicK = true; @@ -123,7 +123,7 @@ makePosOccRead(const vespalib::string &name, fileHeader.getBigEndian() && fileHeader.getFormats().size() == 2 && fileHeader.getFormats()[0] == - Zc4PosOccSeqRead::getIdentifier() && + Zc4PosOccSeqRead::getIdentifier(false) && fileHeader.getFormats()[1] == Zc4PosOccSeqRead::getSubIdentifier()) { dynamicK = false; diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp new file mode 100644 index 00000000000..c9b8cf0b017 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp @@ -0,0 +1,424 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "zc4_posting_reader.h" +#include + +namespace search::diskindex { + +using index::PostingListCounts; +using index::DocIdAndFeatures; +using bitcompression::FeatureEncodeContext; + + +template +Zc4PostingReader::Zc4PostingReader(bool dynamic_k) + : _decodeContext(nullptr), + _docIdK(K_VALUE_ZCPOSTING_DELTA_DOCID), + _prevDocId(0), + _numDocs(0), + _readContext(sizeof(uint64_t)), + _has_more(false), + _posting_params(64, 1 << 30, 10000000, dynamic_k, true), + _lastDocId(0), + _zcDocIds(), + _l1Skip(), + _l2Skip(), + _l3Skip(), + _l4Skip(), + _chunkNo(0), + _l1SkipDocId(0), + _l1SkipDocIdPos(0), + _l1SkipFeaturesPos(0), + _l2SkipDocId(0), + _l2SkipDocIdPos(0), + _l2SkipL1SkipPos(0), + _l2SkipFeaturesPos(0), + _l3SkipDocId(0), + _l3SkipDocIdPos(0), + _l3SkipL1SkipPos(0), + _l3SkipL2SkipPos(0), + _l3SkipFeaturesPos(0), + _l4SkipDocId(0), + _l4SkipDocIdPos(0), + _l4SkipL1SkipPos(0), + _l4SkipL2SkipPos(0), + _l4SkipL3SkipPos(0), + _l4SkipFeaturesPos(0), + _featuresSize(0), + _counts(), + _residue(0) +{ +} + +template +Zc4PostingReader::~Zc4PostingReader() +{ +} + +template +void +Zc4PostingReader::read_common_word_doc_id_and_features(DocIdAndFeatures &features) +{ + if ((_zcDocIds._valI >= _zcDocIds._valE) && _has_more) { + read_word_start(); // Read start of next chunk + } + // Split docid & features. 
+ assert(_zcDocIds._valI < _zcDocIds._valE); + uint32_t docIdPos = _zcDocIds.pos(); + uint32_t docId = _prevDocId + 1 + _zcDocIds.decode(); + features._docId = docId; + _prevDocId = docId; + assert(docId <= _lastDocId); + if (docId > _l1SkipDocId) { + _l1SkipDocIdPos += _l1Skip.decode() + 1; + assert(docIdPos == _l1SkipDocIdPos); + _l1SkipFeaturesPos += _l1Skip.decode() + 1; + uint64_t featuresPos = _decodeContext->getReadOffset(); + assert(featuresPos == _l1SkipFeaturesPos); + (void) featuresPos; + if (docId > _l2SkipDocId) { + _l2SkipDocIdPos += _l2Skip.decode() + 1; + assert(docIdPos == _l2SkipDocIdPos); + _l2SkipFeaturesPos += _l2Skip.decode() + 1; + assert(featuresPos == _l2SkipFeaturesPos); + _l2SkipL1SkipPos += _l2Skip.decode() + 1; + assert(_l1Skip.pos() == _l2SkipL1SkipPos); + if (docId > _l3SkipDocId) { + _l3SkipDocIdPos += _l3Skip.decode() + 1; + assert(docIdPos == _l3SkipDocIdPos); + _l3SkipFeaturesPos += _l3Skip.decode() + 1; + assert(featuresPos == _l3SkipFeaturesPos); + _l3SkipL1SkipPos += _l3Skip.decode() + 1; + assert(_l1Skip.pos() == _l3SkipL1SkipPos); + _l3SkipL2SkipPos += _l3Skip.decode() + 1; + assert(_l2Skip.pos() == _l3SkipL2SkipPos); + if (docId > _l4SkipDocId) { + _l4SkipDocIdPos += _l4Skip.decode() + 1; + assert(docIdPos == _l4SkipDocIdPos); + (void) docIdPos; + _l4SkipFeaturesPos += _l4Skip.decode() + 1; + assert(featuresPos == _l4SkipFeaturesPos); + _l4SkipL1SkipPos += _l4Skip.decode() + 1; + assert(_l1Skip.pos() == _l4SkipL1SkipPos); + _l4SkipL2SkipPos += _l4Skip.decode() + 1; + assert(_l2Skip.pos() == _l4SkipL2SkipPos); + _l4SkipL3SkipPos += _l4Skip.decode() + 1; + assert(_l3Skip.pos() == _l4SkipL3SkipPos); + _l4SkipDocId += _l4Skip.decode() + 1; + assert(_l4SkipDocId <= _lastDocId); + assert(_l4SkipDocId >= docId); + } + _l3SkipDocId += _l3Skip.decode() + 1; + assert(_l3SkipDocId <= _lastDocId); + assert(_l3SkipDocId <= _l4SkipDocId); + assert(_l3SkipDocId >= docId); + } + _l2SkipDocId += _l2Skip.decode() + 1; + assert(_l2SkipDocId <= 
_lastDocId); + assert(_l2SkipDocId <= _l4SkipDocId); + assert(_l2SkipDocId <= _l3SkipDocId); + assert(_l2SkipDocId >= docId); + } + _l1SkipDocId += _l1Skip.decode() + 1; + assert(_l1SkipDocId <= _lastDocId); + assert(_l1SkipDocId <= _l4SkipDocId); + assert(_l1SkipDocId <= _l3SkipDocId); + assert(_l1SkipDocId <= _l2SkipDocId); + assert(_l1SkipDocId >= docId); + } + if (docId < _lastDocId) { + // Assert more space available when not yet at last docid + assert(_zcDocIds._valI < _zcDocIds._valE); + } else { + // Assert that space has been used when at last docid + assert(_zcDocIds._valI == _zcDocIds._valE); + // Assert that we've read to end of skip info + assert(_l1SkipDocId == _lastDocId); + assert(_l2SkipDocId == _lastDocId); + assert(_l3SkipDocId == _lastDocId); + assert(_l4SkipDocId == _lastDocId); + if (!_has_more) { + _chunkNo = 0; + } + } + _decodeContext->readFeatures(features); + --_residue; +} + +template +void +Zc4PostingReader::read_doc_id_and_features(DocIdAndFeatures &features) +{ + if (_residue == 0 && !_has_more) { + if (_residue == 0) { + // Don't read past end of posting list. 
+ features.clear(static_cast(-1)); + return; + } + } + if (_lastDocId > 0) { + read_common_word_doc_id_and_features(features); + return; + } + // Interleaves docid & features + using EC = FeatureEncodeContext; + DecodeContext &d = *_decodeContext; + uint32_t length; + uint64_t val64; + UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); + + UC64_DECODEEXPGOLOMB_SMALL_NS(o, _docIdK, EC); + uint32_t docId = _prevDocId + 1 + val64; + features._docId = docId; + _prevDocId = docId; + UC64_DECODECONTEXT_STORE(o, d._); + if (__builtin_expect(oCompr >= d._valE, false)) { + _readContext.readComprBuffer(); + } + _decodeContext->readFeatures(features); + --_residue; +} + +template +void +Zc4PostingReader::read_word_start_with_skip() +{ + using EC = FeatureEncodeContext; + DecodeContext &d = *_decodeContext; + UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); + uint32_t length; + uint64_t val64; + const uint64_t *valE = d._valE; + + if (_has_more) { + ++_chunkNo; + } else { + _chunkNo = 0; + } + assert(_numDocs >= _posting_params._min_skip_docs || _has_more); + bool has_more = false; + if (__builtin_expect(_numDocs >= _posting_params._min_chunk_docs, false)) { + if (bigEndian) { + has_more = static_cast(oVal) < 0; + oVal <<= 1; + } else { + has_more = (oVal & 1) != 0; + oVal >>= 1; + } + length = 1; + UC64_READBITS_NS(o, EC); + } + if (_posting_params._dynamic_k) { + _docIdK = EC::calcDocIdK((_has_more || has_more) ? 
1 : _numDocs, + _posting_params._doc_id_limit); + } + if (_has_more || has_more) { + assert(has_more == (_chunkNo + 1 < _counts._segments.size())); + assert(_numDocs == _counts._segments[_chunkNo]._numDocs); + if (has_more) { + assert(_numDocs >= _posting_params._min_skip_docs); + assert(_numDocs >= _posting_params._min_chunk_docs); + } + } else { + assert(_numDocs >= _posting_params._min_skip_docs); + assert(_numDocs == _counts._numDocs); + } + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); + uint32_t docIdsSize = val64 + 1; + UC64_DECODEEXPGOLOMB_NS(o, + K_VALUE_ZCPOSTING_L1SKIPSIZE, + EC); + uint32_t l1SkipSize = val64; + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + uint32_t l2SkipSize = 0; + if (l1SkipSize != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); + l2SkipSize = val64; + } + uint32_t l3SkipSize = 0; + if (l2SkipSize != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); + l3SkipSize = val64; + } + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + uint32_t l4SkipSize = 0; + if (l3SkipSize != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); + l4SkipSize = val64; + } + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); + _featuresSize = val64; + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + if (_posting_params._dynamic_k) { + UC64_DECODEEXPGOLOMB_NS(o, _docIdK, EC); + } else { + UC64_DECODEEXPGOLOMB_NS(o, 
K_VALUE_ZCPOSTING_LASTDOCID, EC); + } + _lastDocId = _posting_params._doc_id_limit - 1 - val64; + if (_has_more || has_more) { + assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc); + } + + if (__builtin_expect(oCompr >= valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + valE = d._valE; + UC64_DECODECONTEXT_LOAD(o, d._); + } + uint64_t bytePad = oPreRead & 7; + if (bytePad > 0) { + length = bytePad; + if (bigEndian) { + oVal <<= length; + } else { + oVal >>= length; + } + UC64_READBITS_NS(o, EC); + } + UC64_DECODECONTEXT_STORE(o, d._); + if (__builtin_expect(oCompr >= valE, false)) { + _readContext.readComprBuffer(); + } + _zcDocIds.clearReserve(docIdsSize); + _l1Skip.clearReserve(l1SkipSize); + _l2Skip.clearReserve(l2SkipSize); + _l3Skip.clearReserve(l3SkipSize); + _l4Skip.clearReserve(l4SkipSize); + _decodeContext->readBytes(_zcDocIds._valI, docIdsSize); + _zcDocIds._valE = _zcDocIds._valI + docIdsSize; + if (l1SkipSize > 0) { + _decodeContext->readBytes(_l1Skip._valI, l1SkipSize); + } + _l1Skip._valE = _l1Skip._valI + l1SkipSize; + if (l2SkipSize > 0) { + _decodeContext->readBytes(_l2Skip._valI, l2SkipSize); + } + _l2Skip._valE = _l2Skip._valI + l2SkipSize; + if (l3SkipSize > 0) { + _decodeContext->readBytes(_l3Skip._valI, l3SkipSize); + } + _l3Skip._valE = _l3Skip._valI + l3SkipSize; + if (l4SkipSize > 0) { + _decodeContext->readBytes(_l4Skip._valI, l4SkipSize); + } + _l4Skip._valE = _l4Skip._valI + l4SkipSize; + + if (l1SkipSize > 0) { + _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId; + } else { + _l1SkipDocId = _lastDocId; + } + if (l2SkipSize > 0) { + _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId; + } else { + _l2SkipDocId = _lastDocId; + } + if (l3SkipSize > 0) { + _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId; + } else { + _l3SkipDocId = _lastDocId; + } + if (l4SkipSize > 0) { + _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId; + } else { + _l4SkipDocId = _lastDocId; + } + _l1SkipDocIdPos = 0; + 
_l1SkipFeaturesPos = _decodeContext->getReadOffset(); + _l2SkipDocIdPos = 0; + _l2SkipL1SkipPos = 0; + _l2SkipFeaturesPos = _decodeContext->getReadOffset(); + _l3SkipDocIdPos = 0; + _l3SkipL1SkipPos = 0; + _l3SkipL2SkipPos = 0; + _l3SkipFeaturesPos = _decodeContext->getReadOffset(); + _l4SkipDocIdPos = 0; + _l4SkipL1SkipPos = 0; + _l4SkipL2SkipPos = 0; + _l4SkipL3SkipPos = 0; + _l4SkipFeaturesPos = _decodeContext->getReadOffset(); + _has_more = has_more; + // Decode context is now positioned at start of features +} + +template +void +Zc4PostingReader::read_word_start() +{ + using EC = FeatureEncodeContext; + UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_); + uint32_t length; + uint64_t val64; + const uint64_t *valE = _decodeContext->_valE; + + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); + UC64_DECODECONTEXT_STORE(o, _decodeContext->_); + if (oCompr >= valE) { + _readContext.readComprBuffer(); + } + _numDocs = static_cast(val64) + 1; + _residue = _numDocs; + _prevDocId = _has_more ? _lastDocId : 0u; + assert(_numDocs <= _counts._numDocs); + assert(_numDocs == _counts._numDocs || + _numDocs >= _posting_params._min_chunk_docs || + _has_more); + + if (_numDocs >= _posting_params._min_skip_docs || _has_more) { + read_word_start_with_skip(); + // Decode context is not positioned at start of features + } else { + if (_posting_params._dynamic_k) { + _docIdK = EC::calcDocIdK(_numDocs, _posting_params._doc_id_limit); + } + _lastDocId = 0u; + // Decode context is not positioned at start of docids & features + } +} + +template +void +Zc4PostingReader::set_counts(const PostingListCounts &counts) +{ + assert(!_has_more && _residue == 0); // Previous words must have been read. 
+ _counts = counts; + assert((_counts._numDocs == 0) == (_counts._bitLength == 0)); + if (_counts._numDocs > 0) { + read_word_start(); + } +} + +template +void +Zc4PostingReader::set_decode_features(DecodeContext *decode_features) +{ + _decodeContext = decode_features; + _decodeContext->setReadContext(&_readContext); + _readContext.setDecodeContext(_decodeContext); +} + +template class Zc4PostingReader; +template class Zc4PostingReader; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h new file mode 100644 index 00000000000..d8161da15d5 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h @@ -0,0 +1,96 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "zc4_posting_writer.h" +#include +#include +#include "zc4_posting_params.h" + +namespace search::index { + class PostingListCountFileSeqRead; +} + +namespace search::diskindex { + +/* + * Class used to read posting lists of type "Zc.4" and "Zc.5" (dynamic k). + * + * Common words have docid deltas and skip info separate from + * features. + * + * Rare words do not have skip info, and docid deltas and features are + * interleaved. 
+ */ +template +class Zc4PostingReader +{ + +protected: + using DecodeContext = bitcompression::FeatureDecodeContext; + + DecodeContext *_decodeContext; + uint32_t _docIdK; + uint32_t _prevDocId; // Previous document id + uint32_t _numDocs; // Documents in chunk or word + search::ComprFileReadContext _readContext; + bool _has_more; + Zc4PostingParams _posting_params; + uint32_t _lastDocId; // last document in chunk or word + + ZcBuf _zcDocIds; // Document id deltas + ZcBuf _l1Skip; // L1 skip info + ZcBuf _l2Skip; // L2 skip info + ZcBuf _l3Skip; // L3 skip info + ZcBuf _l4Skip; // L4 skip info + + uint64_t _numWords; // Number of words in file + uint32_t _chunkNo; // Chunk number + + // Variables for validating skip information while reading + uint32_t _l1SkipDocId; + uint32_t _l1SkipDocIdPos; + uint64_t _l1SkipFeaturesPos; + uint32_t _l2SkipDocId; + uint32_t _l2SkipDocIdPos; + uint32_t _l2SkipL1SkipPos; + uint64_t _l2SkipFeaturesPos; + uint32_t _l3SkipDocId; + uint32_t _l3SkipDocIdPos; + uint32_t _l3SkipL1SkipPos; + uint32_t _l3SkipL2SkipPos; + uint64_t _l3SkipFeaturesPos; + uint32_t _l4SkipDocId; + uint32_t _l4SkipDocIdPos; + uint32_t _l4SkipL1SkipPos; + uint32_t _l4SkipL2SkipPos; + uint32_t _l4SkipL3SkipPos; + uint64_t _l4SkipFeaturesPos; + + // Variable for validating chunk information while reading + uint64_t _featuresSize; + index::PostingListCounts _counts; + + uint32_t _residue; // Number of unread documents after word header + void read_common_word_doc_id_and_features(index::DocIdAndFeatures &features); + void read_word_start_with_skip(); + void read_word_start(); +public: + Zc4PostingReader(bool dynamic_k); + Zc4PostingReader(const Zc4PostingReader &) = delete; + Zc4PostingReader(Zc4PostingReader &&) = delete; + Zc4PostingReader &operator=(const Zc4PostingReader &) = delete; + Zc4PostingReader &operator=(Zc4PostingReader &&) = delete; + ~Zc4PostingReader(); + void read_doc_id_and_features(index::DocIdAndFeatures &features); + void set_counts(const 
index::PostingListCounts &counts); + void set_decode_features(DecodeContext *decode_features); + DecodeContext &get_decode_features() const { return *_decodeContext; } + ComprFileReadContext &get_read_context() { return _readContext; } + Zc4PostingParams &get_posting_params() { return _posting_params; } +}; + +extern template class Zc4PostingReader; +extern template class Zc4PostingReader; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp index 10c08af92cb..3ae2a631cb1 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp @@ -16,14 +16,12 @@ using search::index::PostingListCountFileSeqRead; using search::index::PostingListCountFileSeqWrite; Zc4PosOccSeqRead::Zc4PosOccSeqRead(PostingListCountFileSeqRead *countFile) - : Zc4PostingSeqRead(countFile), + : Zc4PostingSeqRead(countFile, false), _fieldsParams(), _cookedDecodeContext(&_fieldsParams), _rawDecodeContext(&_fieldsParams) { - _decodeContext = &_cookedDecodeContext; - _decodeContext->setReadContext(&_readContext); - _readContext.setDecodeContext(_decodeContext); + _reader.set_decode_features(&_cookedDecodeContext); } @@ -31,18 +29,17 @@ void Zc4PosOccSeqRead:: setFeatureParams(const PostingListParams &params) { - bool oldCooked = _decodeContext == &_cookedDecodeContext; + bool oldCooked = &_reader.get_decode_features() == &_cookedDecodeContext; bool newCooked = oldCooked; params.get("cooked", newCooked); if (oldCooked != newCooked) { if (newCooked) { _cookedDecodeContext = _rawDecodeContext; - _decodeContext = &_cookedDecodeContext; + _reader.set_decode_features(&_cookedDecodeContext); } else { _rawDecodeContext = _cookedDecodeContext; - _decodeContext = &_rawDecodeContext; + _reader.set_decode_features(&_rawDecodeContext); } - _readContext.setDecodeContext(_decodeContext); } } @@ -69,14 +66,12 @@ Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema, 
ZcPosOccSeqRead::ZcPosOccSeqRead(PostingListCountFileSeqRead *countFile) - : ZcPostingSeqRead(countFile), + : Zc4PostingSeqRead(countFile, true), _fieldsParams(), _cookedDecodeContext(&_fieldsParams), _rawDecodeContext(&_fieldsParams) { - _decodeContext = &_cookedDecodeContext; - _decodeContext->setReadContext(&_readContext); - _readContext.setDecodeContext(_decodeContext); + _reader.set_decode_features(&_cookedDecodeContext); } @@ -84,18 +79,17 @@ void ZcPosOccSeqRead:: setFeatureParams(const PostingListParams &params) { - bool oldCooked = _decodeContext == &_cookedDecodeContext; + bool oldCooked = &_reader.get_decode_features() == &_cookedDecodeContext; bool newCooked = oldCooked; params.get("cooked", newCooked); if (oldCooked != newCooked) { if (newCooked) { _cookedDecodeContext = _rawDecodeContext; - _decodeContext = &_cookedDecodeContext; + _reader.set_decode_features(&_cookedDecodeContext); } else { _rawDecodeContext = _cookedDecodeContext; - _decodeContext = &_rawDecodeContext; + _reader.set_decode_features(&_rawDecodeContext); } - _readContext.setDecodeContext(_decodeContext); } } diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h index cd21fb02f33..1e0555116ce 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h @@ -34,7 +34,7 @@ public: }; -class ZcPosOccSeqRead : public ZcPostingSeqRead +class ZcPosOccSeqRead : public Zc4PostingSeqRead { private: bitcompression::PosOccFieldsParams _fieldsParams; diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp index e40842737c9..a0203b64197 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp @@ -29,60 +29,19 @@ using bitcompression::FeatureEncodeContextBE; using vespalib::getLastErrorString; -Zc4PostingSeqRead:: 
-Zc4PostingSeqRead(PostingListCountFileSeqRead *countFile) +Zc4PostingSeqRead::Zc4PostingSeqRead(PostingListCountFileSeqRead *countFile, bool dynamic_k) : PostingListFileSeqRead(), - _decodeContext(), - _docIdK(0), - _prevDocId(0), - _numDocs(0), - _readContext(sizeof(uint64_t)), + _reader(dynamic_k), _file(), - _hasMore(false), - _dynamicK(false), - _lastDocId(0), - _minChunkDocs(1 << 30), - _minSkipDocs(64), - _docIdLimit(10000000), - _zcDocIds(), - _l1Skip(), - _l2Skip(), - _l3Skip(), - _l4Skip(), _numWords(0), _fileBitSize(0), - _chunkNo(0), - _l1SkipDocId(0), - _l1SkipDocIdPos(0), - _l1SkipFeaturesPos(0), - _l2SkipDocId(0), - _l2SkipDocIdPos(0), - _l2SkipL1SkipPos(0), - _l2SkipFeaturesPos(0), - _l3SkipDocId(0), - _l3SkipDocIdPos(0), - _l3SkipL1SkipPos(0), - _l3SkipL2SkipPos(0), - _l3SkipFeaturesPos(0), - _l4SkipDocId(0), - _l4SkipDocIdPos(0), - _l4SkipL1SkipPos(0), - _l4SkipL2SkipPos(0), - _l4SkipL3SkipPos(0), - _l4SkipFeaturesPos(0), - _featuresSize(0), - _countFile(countFile), - _headerBitLen(0), - _rangeEndOffset(0), - _readAheadEndOffset(0), - _wordStart(0), - _residue(0) + _countFile(countFile) { if (_countFile != nullptr) { PostingListParams params; _countFile->getParams(params); - params.get("docIdLimit", _docIdLimit); - params.get("minChunkDocs", _minChunkDocs); + params.get("docIdLimit", _reader.get_posting_params()._doc_id_limit); + params.get("minChunkDocs", _reader.get_posting_params()._min_chunk_docs); } } @@ -91,387 +50,16 @@ Zc4PostingSeqRead::~Zc4PostingSeqRead() { } - -void -Zc4PostingSeqRead:: -readCommonWordDocIdAndFeatures(DocIdAndFeatures &features) -{ - if ((_zcDocIds._valI >= _zcDocIds._valE) && _hasMore) { - readWordStart(); // Read start of next chunk - } - // Split docid & features. 
- assert(_zcDocIds._valI < _zcDocIds._valE); - uint32_t docIdPos = _zcDocIds.pos(); - uint32_t docId = _prevDocId + 1 + _zcDocIds.decode(); - features._docId = docId; - _prevDocId = docId; - assert(docId <= _lastDocId); - if (docId > _l1SkipDocId) { - _l1SkipDocIdPos += _l1Skip.decode() + 1; - assert(docIdPos == _l1SkipDocIdPos); - _l1SkipFeaturesPos += _l1Skip.decode() + 1; - uint64_t featuresPos = _decodeContext->getReadOffset(); - assert(featuresPos == _l1SkipFeaturesPos); - (void) featuresPos; - if (docId > _l2SkipDocId) { - _l2SkipDocIdPos += _l2Skip.decode() + 1; - assert(docIdPos == _l2SkipDocIdPos); - _l2SkipFeaturesPos += _l2Skip.decode() + 1; - assert(featuresPos == _l2SkipFeaturesPos); - _l2SkipL1SkipPos += _l2Skip.decode() + 1; - assert(_l1Skip.pos() == _l2SkipL1SkipPos); - if (docId > _l3SkipDocId) { - _l3SkipDocIdPos += _l3Skip.decode() + 1; - assert(docIdPos == _l3SkipDocIdPos); - _l3SkipFeaturesPos += _l3Skip.decode() + 1; - assert(featuresPos == _l3SkipFeaturesPos); - _l3SkipL1SkipPos += _l3Skip.decode() + 1; - assert(_l1Skip.pos() == _l3SkipL1SkipPos); - _l3SkipL2SkipPos += _l3Skip.decode() + 1; - assert(_l2Skip.pos() == _l3SkipL2SkipPos); - if (docId > _l4SkipDocId) { - _l4SkipDocIdPos += _l4Skip.decode() + 1; - assert(docIdPos == _l4SkipDocIdPos); - (void) docIdPos; - _l4SkipFeaturesPos += _l4Skip.decode() + 1; - assert(featuresPos == _l4SkipFeaturesPos); - _l4SkipL1SkipPos += _l4Skip.decode() + 1; - assert(_l1Skip.pos() == _l4SkipL1SkipPos); - _l4SkipL2SkipPos += _l4Skip.decode() + 1; - assert(_l2Skip.pos() == _l4SkipL2SkipPos); - _l4SkipL3SkipPos += _l4Skip.decode() + 1; - assert(_l3Skip.pos() == _l4SkipL3SkipPos); - _l4SkipDocId += _l4Skip.decode() + 1; - assert(_l4SkipDocId <= _lastDocId); - assert(_l4SkipDocId >= docId); - } - _l3SkipDocId += _l3Skip.decode() + 1; - assert(_l3SkipDocId <= _lastDocId); - assert(_l3SkipDocId <= _l4SkipDocId); - assert(_l3SkipDocId >= docId); - } - _l2SkipDocId += _l2Skip.decode() + 1; - assert(_l2SkipDocId <= 
_lastDocId); - assert(_l2SkipDocId <= _l4SkipDocId); - assert(_l2SkipDocId <= _l3SkipDocId); - assert(_l2SkipDocId >= docId); - } - _l1SkipDocId += _l1Skip.decode() + 1; - assert(_l1SkipDocId <= _lastDocId); - assert(_l1SkipDocId <= _l4SkipDocId); - assert(_l1SkipDocId <= _l3SkipDocId); - assert(_l1SkipDocId <= _l2SkipDocId); - assert(_l1SkipDocId >= docId); - } - if (docId < _lastDocId) { - // Assert more space available when not yet at last docid - assert(_zcDocIds._valI < _zcDocIds._valE); - } else { - // Assert that space has been used when at last docid - assert(_zcDocIds._valI == _zcDocIds._valE); - // Assert that we've read to end of skip info - assert(_l1SkipDocId == _lastDocId); - assert(_l2SkipDocId == _lastDocId); - assert(_l3SkipDocId == _lastDocId); - assert(_l4SkipDocId == _lastDocId); - if (!_hasMore) { - _chunkNo = 0; - } - } - _decodeContext->readFeatures(features); - --_residue; -} - - -void -Zc4PostingSeqRead:: -readDocIdAndFeatures(DocIdAndFeatures &features) -{ - if (_residue == 0 && !_hasMore) { - if (_rangeEndOffset != 0) { - DecodeContext &d = *_decodeContext; - uint64_t curOffset = d.getReadOffset(); - assert(curOffset <= _rangeEndOffset); - if (curOffset < _rangeEndOffset) { - readWordStart(); - } - } - if (_residue == 0) { - // Don't read past end of posting list. 
- features.clear(static_cast(-1)); - return; - } - } - if (_lastDocId > 0) { - return readCommonWordDocIdAndFeatures(features); - } - // Interleaves docid & features - typedef FeatureEncodeContextBE EC; - DecodeContext &d = *_decodeContext; - uint32_t length; - uint64_t val64; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - - UC64BE_DECODEEXPGOLOMB_SMALL_NS(o, - K_VALUE_ZCPOSTING_DELTA_DOCID, - EC); - uint32_t docId = _prevDocId + 1 + val64; - features._docId = docId; - _prevDocId = docId; - UC64_DECODECONTEXT_STORE(o, d._); - if (__builtin_expect(oCompr >= d._valE, false)) { - _readContext.readComprBuffer(); - } - _decodeContext->readFeatures(features); - --_residue; -} - - -void -Zc4PostingSeqRead::readWordStartWithSkip() -{ - typedef FeatureEncodeContextBE EC; - DecodeContext &d = *_decodeContext; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - uint32_t length; - uint64_t val64; - const uint64_t *valE = d._valE; - - if (_hasMore) { - ++_chunkNo; - } else { - _chunkNo = 0; - } - assert(_numDocs >= _minSkipDocs || _hasMore); - bool hasMore = false; - if (__builtin_expect(_numDocs >= _minChunkDocs, false)) { - hasMore = static_cast(oVal) < 0; - oVal <<= 1; - length = 1; - UC64BE_READBITS_NS(o, EC); - } - if (_dynamicK) { - _docIdK = EC::calcDocIdK((_hasMore || hasMore) ? 
1 : _numDocs, - _docIdLimit); - } - if (_hasMore || hasMore) { - if (_rangeEndOffset == 0) { - assert(hasMore == (_chunkNo + 1 < _counts._segments.size())); - assert(_numDocs == _counts._segments[_chunkNo]._numDocs); - } - if (hasMore) { - assert(_numDocs >= _minSkipDocs); - assert(_numDocs >= _minChunkDocs); - } - } else { - assert(_numDocs >= _minSkipDocs); - if (_rangeEndOffset == 0) { - assert(_numDocs == _counts._numDocs); - } - } - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_DOCIDSSIZE, - EC); - uint32_t docIdsSize = val64 + 1; - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L1SKIPSIZE, - EC); - uint32_t l1SkipSize = val64; - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint32_t l2SkipSize = 0; - if (l1SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L2SKIPSIZE, - EC); - l2SkipSize = val64; - } - uint32_t l3SkipSize = 0; - if (l2SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L3SKIPSIZE, - EC); - l3SkipSize = val64; - } - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint32_t l4SkipSize = 0; - if (l3SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_L4SKIPSIZE, - EC); - l4SkipSize = val64; - } - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_FEATURESSIZE, - EC); - _featuresSize = val64; - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - if (_dynamicK) { - UC64BE_DECODEEXPGOLOMB_NS(o, - _docIdK, - EC); - } else { - 
UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_LASTDOCID, - EC); - } - _lastDocId = _docIdLimit - 1 - val64; - if (_hasMore || hasMore) { - if (_rangeEndOffset == 0) { - assert(_lastDocId == _counts._segments[_chunkNo]._lastDoc); - } - } - - if (__builtin_expect(oCompr >= valE, false)) { - UC64_DECODECONTEXT_STORE(o, d._); - _readContext.readComprBuffer(); - valE = d._valE; - UC64_DECODECONTEXT_LOAD(o, d._); - } - uint64_t bytePad = oPreRead & 7; - if (bytePad > 0) { - length = bytePad; - oVal <<= length; - UC64BE_READBITS_NS(o, EC); - } - UC64_DECODECONTEXT_STORE(o, d._); - if (__builtin_expect(oCompr >= valE, false)) { - _readContext.readComprBuffer(); - } - _zcDocIds.clearReserve(docIdsSize); - _l1Skip.clearReserve(l1SkipSize); - _l2Skip.clearReserve(l2SkipSize); - _l3Skip.clearReserve(l3SkipSize); - _l4Skip.clearReserve(l4SkipSize); - _decodeContext->readBytes(_zcDocIds._valI, docIdsSize); - _zcDocIds._valE = _zcDocIds._valI + docIdsSize; - if (l1SkipSize > 0) { - _decodeContext->readBytes(_l1Skip._valI, l1SkipSize); - } - _l1Skip._valE = _l1Skip._valI + l1SkipSize; - if (l2SkipSize > 0) { - _decodeContext->readBytes(_l2Skip._valI, l2SkipSize); - } - _l2Skip._valE = _l2Skip._valI + l2SkipSize; - if (l3SkipSize > 0) { - _decodeContext->readBytes(_l3Skip._valI, l3SkipSize); - } - _l3Skip._valE = _l3Skip._valI + l3SkipSize; - if (l4SkipSize > 0) { - _decodeContext->readBytes(_l4Skip._valI, l4SkipSize); - } - _l4Skip._valE = _l4Skip._valI + l4SkipSize; - - if (l1SkipSize > 0) { - _l1SkipDocId = _l1Skip.decode() + 1 + _prevDocId; - } else { - _l1SkipDocId = _lastDocId; - } - if (l2SkipSize > 0) { - _l2SkipDocId = _l2Skip.decode() + 1 + _prevDocId; - } else { - _l2SkipDocId = _lastDocId; - } - if (l3SkipSize > 0) { - _l3SkipDocId = _l3Skip.decode() + 1 + _prevDocId; - } else { - _l3SkipDocId = _lastDocId; - } - if (l4SkipSize > 0) { - _l4SkipDocId = _l4Skip.decode() + 1 + _prevDocId; - } else { - _l4SkipDocId = _lastDocId; - } - _l1SkipDocIdPos = 0; - 
_l1SkipFeaturesPos = _decodeContext->getReadOffset(); - _l2SkipDocIdPos = 0; - _l2SkipL1SkipPos = 0; - _l2SkipFeaturesPos = _decodeContext->getReadOffset(); - _l3SkipDocIdPos = 0; - _l3SkipL1SkipPos = 0; - _l3SkipL2SkipPos = 0; - _l3SkipFeaturesPos = _decodeContext->getReadOffset(); - _l4SkipDocIdPos = 0; - _l4SkipL1SkipPos = 0; - _l4SkipL2SkipPos = 0; - _l4SkipL3SkipPos = 0; - _l4SkipFeaturesPos = _decodeContext->getReadOffset(); - _hasMore = hasMore; - // Decode context is now positioned at start of features -} - - void -Zc4PostingSeqRead::readWordStart() +Zc4PostingSeqRead::readDocIdAndFeatures(DocIdAndFeatures &features) { - typedef FeatureEncodeContextBE EC; - UC64_DECODECONTEXT_CONSTRUCTOR(o, _decodeContext->_); - uint32_t length; - uint64_t val64; - const uint64_t *valE = _decodeContext->_valE; - - UC64BE_DECODEEXPGOLOMB_NS(o, - K_VALUE_ZCPOSTING_NUMDOCS, - EC); - UC64_DECODECONTEXT_STORE(o, _decodeContext->_); - if (oCompr >= valE) { - _readContext.readComprBuffer(); - } - _numDocs = static_cast(val64) + 1; - _residue = _numDocs; - _prevDocId = _hasMore ? _lastDocId : 0u; - if (_rangeEndOffset == 0) { - assert(_numDocs <= _counts._numDocs); - assert(_numDocs == _counts._numDocs || - _numDocs >= _minChunkDocs || - _hasMore); - } - - if (_numDocs >= _minSkipDocs || _hasMore) { - readWordStartWithSkip(); - // Decode context is not positioned at start of features - } else { - if (_dynamicK) { - _docIdK = EC::calcDocIdK(_numDocs, _docIdLimit); - } - _lastDocId = 0u; - // Decode context is not positioned at start of docids & features - } + _reader.read_doc_id_and_features(features); } - void Zc4PostingSeqRead::readCounts(const PostingListCounts &counts) { - assert(!_hasMore); // Previous words must have been read. 
- - _counts = counts; - - assert((_counts._numDocs == 0) == (_counts._bitLength == 0)); - if (_counts._numDocs > 0) { - _wordStart = _decodeContext->getReadOffset(); - readWordStart(); - } + _reader.set_counts(counts); } @@ -484,16 +72,17 @@ Zc4PostingSeqRead::open(const vespalib::string &name, } bool res = _file.OpenReadOnly(name.c_str()); if (res) { - _readContext.setFile(&_file); - _readContext.setFileSize(_file.GetSize()); - DecodeContext &d = *_decodeContext; - _readContext.allocComprBuf(65536u, 32768u); + auto &readContext = _reader.get_read_context(); + readContext.setFile(&_file); + readContext.setFileSize(_file.GetSize()); + auto &d = _reader.get_decode_features(); + readContext.allocComprBuf(65536u, 32768u); d.emptyBuffer(0); - _readContext.readComprBuffer(); + readContext.readComprBuffer(); readHeader(); if (d._valI >= d._valE) { - _readContext.readComprBuffer(); + readContext.readComprBuffer(); } } else { LOG(error, "could not open %s: %s", @@ -506,9 +95,10 @@ Zc4PostingSeqRead::open(const vespalib::string &name, bool Zc4PostingSeqRead::close() { - _readContext.dropComprBuf(); + auto &readContext = _reader.get_read_context(); + readContext.dropComprBuf(); _file.Close(); - _readContext.setFile(nullptr); + readContext.setFile(nullptr); return true; } @@ -524,29 +114,30 @@ Zc4PostingSeqRead::getParams(PostingListParams &params) uint32_t countMinChunkDocs = 0; countParams.get("docIdLimit", countDocIdLimit); countParams.get("minChunkDocs", countMinChunkDocs); - assert(_docIdLimit == countDocIdLimit); - assert(_minChunkDocs == countMinChunkDocs); + assert(_reader.get_posting_params()._doc_id_limit == countDocIdLimit); + assert(_reader.get_posting_params()._min_chunk_docs == countMinChunkDocs); } else { params.clear(); - params.set("docIdLimit", _docIdLimit); - params.set("minChunkDocs", _minChunkDocs); + params.set("docIdLimit", _reader.get_posting_params()._doc_id_limit); + params.set("minChunkDocs", _reader.get_posting_params()._min_chunk_docs); } - 
params.set("minSkipDocs", _minSkipDocs); + params.set("minSkipDocs", _reader.get_posting_params()._min_skip_docs); } void Zc4PostingSeqRead::getFeatureParams(PostingListParams &params) { - _decodeContext->getParams(params); + _reader.get_decode_features().getParams(params); } void Zc4PostingSeqRead::readHeader() { - FeatureDecodeContextBE &d = *_decodeContext; - const vespalib::string &myId = _dynamicK ? myId5 : myId4; + FeatureDecodeContextBE &d = _reader.get_decode_features(); + auto &posting_params = _reader.get_posting_params(); + const vespalib::string &myId = posting_params._dynamic_k ? myId5 : myId4; vespalib::FileHeader header; d.readHeader(header, _file.getSize()); @@ -571,9 +162,9 @@ Zc4PostingSeqRead::readHeader() (void) myId; assert(header.getTag("format.1").asString() == d.getIdentifier()); _numWords = header.getTag("numWords").asInteger(); - _minChunkDocs = header.getTag("minChunkDocs").asInteger(); - _docIdLimit = header.getTag("docIdLimit").asInteger(); - _minSkipDocs = header.getTag("minSkipDocs").asInteger(); + posting_params._min_chunk_docs = header.getTag("minChunkDocs").asInteger(); + posting_params._doc_id_limit = header.getTag("docIdLimit").asInteger(); + posting_params._min_skip_docs = header.getTag("minSkipDocs").asInteger(); assert(header.getTag("endian").asString() == "big"); // Read feature decoding specific subheader d.readHeader(header, "features."); @@ -585,38 +176,9 @@ Zc4PostingSeqRead::readHeader() const vespalib::string & -Zc4PostingSeqRead::getIdentifier() -{ - return myId4; -} - - -uint64_t -Zc4PostingSeqRead::getCurrentPostingOffset() const +Zc4PostingSeqRead::getIdentifier(bool dynamic_k) { - FeatureDecodeContextBE &d = *_decodeContext; - return d.getReadOffset() - _headerBitLen; -} - - -void -Zc4PostingSeqRead::setPostingOffset(uint64_t offset, - uint64_t endOffset, - uint64_t readAheadOffset) -{ - assert(_residue == 0); // Only to be called between posting lists - - FeatureDecodeContextBE &d = *_decodeContext; - - 
_rangeEndOffset = endOffset + _headerBitLen; - _readAheadEndOffset = readAheadOffset + _headerBitLen; - _readContext.setStopOffset(_readAheadEndOffset, false); - uint64_t newOffset = offset + _headerBitLen; - if (newOffset != d.getReadOffset()) { - _readContext.setPosition(newOffset); - assert(newOffset == d.getReadOffset()); - _readContext.readComprBuffer(); - } + return (dynamic_k ? myId5 : myId4); } @@ -809,65 +371,6 @@ getFeatureParams(PostingListParams &params) } -ZcPostingSeqRead::ZcPostingSeqRead(PostingListCountFileSeqRead *countFile) - : Zc4PostingSeqRead(countFile) -{ - _dynamicK = true; -} - - -void -ZcPostingSeqRead:: -readDocIdAndFeatures(DocIdAndFeatures &features) -{ - if (_residue == 0 && !_hasMore) { - if (_rangeEndOffset != 0) { - DecodeContext &d = *_decodeContext; - uint64_t curOffset = d.getReadOffset(); - assert(curOffset <= _rangeEndOffset); - if (curOffset < _rangeEndOffset) { - readWordStart(); - } - } - if (_residue == 0) { - // Don't read past end of posting list. 
- features.clear(static_cast(-1)); - return; - } - } - if (_lastDocId > 0) { - readCommonWordDocIdAndFeatures(features); - return; - } - // Interleaves docid & features - typedef FeatureEncodeContextBE EC; - DecodeContext &d = *_decodeContext; - uint32_t length; - uint64_t val64; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - - UC64BE_DECODEEXPGOLOMB_SMALL_NS(o, - _docIdK, - EC); - uint32_t docId = _prevDocId + 1 + val64; - features._docId = docId; - _prevDocId = docId; - UC64_DECODECONTEXT_STORE(o, d._); - if (__builtin_expect(oCompr >= d._valE, false)) { - _readContext.readComprBuffer(); - } - _decodeContext->readFeatures(features); - --_residue; -} - - -const vespalib::string & -ZcPostingSeqRead::getIdentifier() -{ - return myId5; -} - - ZcPostingSeqWrite::ZcPostingSeqWrite(PostingListCountFileSeqWrite *countFile) : Zc4PostingSeqWrite(countFile) { diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.h b/searchlib/src/vespa/searchlib/diskindex/zcposting.h index 96cc306cea8..01049e720a9 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.h @@ -3,8 +3,10 @@ #pragma once #include "zc4_posting_writer.h" +#include "zc4_posting_reader.h" #include #include +#include "zc4_posting_params.h" namespace search::index { class PostingListCountFileSeqRead; @@ -19,63 +21,14 @@ class Zc4PostingSeqRead : public index::PostingListFileSeqRead Zc4PostingSeqRead &operator=(const Zc4PostingSeqRead &); protected: - typedef bitcompression::FeatureDecodeContextBE DecodeContext; - typedef bitcompression::FeatureEncodeContextBE EncodeContext; - - DecodeContext *_decodeContext; - uint32_t _docIdK; - uint32_t _prevDocId; // Previous document id - uint32_t _numDocs; // Documents in chunk or word - search::ComprFileReadContext _readContext; + Zc4PostingReader _reader; FastOS_File _file; - bool _hasMore; - bool _dynamicK; // Caclulate EG compression parameters ? 
- uint32_t _lastDocId; // last document in chunk or word - uint32_t _minChunkDocs; // # of documents needed for chunking - uint32_t _minSkipDocs; // # of documents needed for skipping - uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit) - - ZcBuf _zcDocIds; // Document id deltas - ZcBuf _l1Skip; // L1 skip info - ZcBuf _l2Skip; // L2 skip info - ZcBuf _l3Skip; // L3 skip info - ZcBuf _l4Skip; // L4 skip info - uint64_t _numWords; // Number of words in file uint64_t _fileBitSize; - uint32_t _chunkNo; // Chunk number - - // Variables for validating skip information while reading - uint32_t _l1SkipDocId; - uint32_t _l1SkipDocIdPos; - uint64_t _l1SkipFeaturesPos; - uint32_t _l2SkipDocId; - uint32_t _l2SkipDocIdPos; - uint32_t _l2SkipL1SkipPos; - uint64_t _l2SkipFeaturesPos; - uint32_t _l3SkipDocId; - uint32_t _l3SkipDocIdPos; - uint32_t _l3SkipL1SkipPos; - uint32_t _l3SkipL2SkipPos; - uint64_t _l3SkipFeaturesPos; - uint32_t _l4SkipDocId; - uint32_t _l4SkipDocIdPos; - uint32_t _l4SkipL1SkipPos; - uint32_t _l4SkipL2SkipPos; - uint32_t _l4SkipL3SkipPos; - uint64_t _l4SkipFeaturesPos; - - // Variable for validating chunk information while reading - uint64_t _featuresSize; index::PostingListCountFileSeqRead *const _countFile; - uint64_t _headerBitLen; // Size of file header in bits - uint64_t _rangeEndOffset; // End offset for word pair - uint64_t _readAheadEndOffset; // Readahead end offset for word pair - uint64_t _wordStart; // last word header position - uint32_t _residue; // Number of unread documents after word header public: - Zc4PostingSeqRead(index::PostingListCountFileSeqRead *countFile); + Zc4PostingSeqRead(index::PostingListCountFileSeqRead *countFile, bool dynamic_k); ~Zc4PostingSeqRead(); @@ -83,11 +36,6 @@ public: typedef index::PostingListCounts PostingListCounts; typedef index::PostingListParams PostingListParams; - /** - * Read document id and features for common word. 
- */ - virtual void readCommonWordDocIdAndFeatures(DocIdAndFeatures &features); - void readDocIdAndFeatures(DocIdAndFeatures &features) override; void readCounts(const PostingListCounts &counts) override; // Fill in for next word bool open(const vespalib::string &name, const TuneFileSeqRead &tuneFileRead) override; @@ -97,28 +45,7 @@ public: void readWordStartWithSkip(); void readWordStart(); void readHeader(); - static const vespalib::string &getIdentifier(); - - // Methods used when generating posting list for common word pairs. - - /* - * Get current posting offset, measured in bits. First posting list - * starts at 0, i.e. file header is not accounted for here. - * - * @return current posting offset, measured in bits. - */ - uint64_t getCurrentPostingOffset() const override; - - /** - * Set current posting offset, measured in bits. First posting - * list starts at 0, i.e. file header is not accounted for here. - * - * @param Offset start of posting lists for word pair. - * @param endOffset end of posting lists for word pair. - * @param readAheadOffset end of posting list for either this or a - * later word pair, depending on disk seek cost. 
- */ - void setPostingOffset(uint64_t offset, uint64_t endOffset, uint64_t readAheadOffset) override; + static const vespalib::string &getIdentifier(bool dynamic_k); }; @@ -161,15 +88,6 @@ public: void updateHeader(); }; - -class ZcPostingSeqRead : public Zc4PostingSeqRead -{ -public: - ZcPostingSeqRead(index::PostingListCountFileSeqRead *countFile); - void readDocIdAndFeatures(DocIdAndFeatures &features) override; - static const vespalib::string &getIdentifier(); -}; - class ZcPostingSeqWrite : public Zc4PostingSeqWrite { public: diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.cpp b/searchlib/src/vespa/searchlib/index/postinglistfile.cpp index 0f0860f9145..52c6b85a0b8 100644 --- a/searchlib/src/vespa/searchlib/index/postinglistfile.cpp +++ b/searchlib/src/vespa/searchlib/index/postinglistfile.cpp @@ -6,8 +6,6 @@ namespace search::index { PostingListFileSeqRead::PostingListFileSeqRead() - : _counts(), - _residueDocs(0) { } diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.h b/searchlib/src/vespa/searchlib/index/postinglistfile.h index 194ac519a19..1e7dde7f139 100644 --- a/searchlib/src/vespa/searchlib/index/postinglistfile.h +++ b/searchlib/src/vespa/searchlib/index/postinglistfile.h @@ -19,9 +19,6 @@ class DocIdAndFeatures; * for words. */ class PostingListFileSeqRead { -protected: - PostingListCounts _counts; - unsigned int _residueDocs; // Docids left to read for word public: PostingListFileSeqRead(); @@ -63,34 +60,6 @@ public: * Get current (word, docid) feature parameters. */ virtual void getFeatureParams(PostingListParams &params); - - // Methods used when generating posting list for common word pairs. - - /* - * Get current posting offset, measured in bits. First posting list - * starts at 0, i.e. file header is not accounted for here. - * - * @return current posting offset, measured in bits. - */ - virtual uint64_t getCurrentPostingOffset() const = 0; - - /** - * Set current posting offset, measured in bits. 
First posting - * list starts at 0, i.e. file header is not accounted for here. - * - * @param Offset start of posting lists for word pair. - * @param endOffset end of posting lists for word pair. - * @param readAheadOffset end of posting list for either this or a - * later word pair, depending on disk seek cost. - */ - virtual void setPostingOffset(uint64_t offset, uint64_t endOffset, uint64_t readAheadOffset) = 0; - - /** - * Get counts read by last readCounts(). - */ - const PostingListCounts &getCounts() const { return _counts; } - - PostingListCounts &getCounts() { return _counts; } }; /** -- cgit v1.2.3