diff options
author | Tor Egge <Tor.Egge@broadpark.no> | 2019-05-10 14:17:16 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@broadpark.no> | 2019-05-10 15:31:06 +0200 |
commit | b6f24cf5f949bd9dbb16c4e36b27eec84bcd217a (patch) | |
tree | 0899ade2682a11f4c3adbb559fc2be14d5b23bb2 | |
parent | 1e98247ac92f391bf8af18627354f2374255f32b (diff) |
Consolidate factory for posocc iterators.
5 files changed, 68 insertions, 150 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp index 7678b11ba41..bea92e5c009 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "zcposocciterators.h" +#include "zc4_posting_params.h" namespace search::diskindex { @@ -64,6 +65,42 @@ ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool d _decodeContext = &_decodeContextReal; } +template <bool bigEndian> +std::unique_ptr<search::queryeval::SearchIterator> +create_zc_posocc_iterator(const PostingListCounts &counts, bitcompression::Position start, uint64_t bit_length, const Zc4PostingParams &posting_params, const bitcompression::PosOccFieldsParams &fields_params, const fef::TermFieldMatchDataArray &match_data) +{ + using EC = bitcompression::EncodeContext64<bigEndian>; + bitcompression::DecodeContext64<bigEndian> d(start.getOccurences(), start.getBitOffset()); + UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); + uint32_t length; + uint64_t val64; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); + uint32_t num_docs = static_cast<uint32_t>(val64) + 1; + assert((num_docs == counts._numDocs) || ((num_docs == posting_params._min_chunk_docs) && (num_docs < counts._numDocs))); + if (num_docs < posting_params._min_skip_docs) { + if (posting_params._dynamic_k) { + return std::make_unique<ZcRareWordPosOccIterator<bigEndian>>(start, bit_length, posting_params._doc_id_limit, posting_params._encode_cheap_features, &fields_params, match_data); + } else { + return std::make_unique<Zc4RareWordPosOccIterator<bigEndian>>(start, bit_length, posting_params._doc_id_limit, posting_params._encode_cheap_features, &fields_params, match_data); + } + } else { + if (posting_params._dynamic_k) { + return std::make_unique<ZcPosOccIterator<bigEndian>>(start, bit_length, posting_params._doc_id_limit, posting_params._encode_cheap_features, posting_params._min_chunk_docs, counts, &fields_params, match_data); + } else { + return std::make_unique<Zc4PosOccIterator<bigEndian>>(start, bit_length, posting_params._doc_id_limit, posting_params._encode_cheap_features, posting_params._min_chunk_docs, counts, &fields_params, match_data); + } + } +} + +std::unique_ptr<search::queryeval::SearchIterator> +create_zc_posocc_iterator(bool bigEndian, const PostingListCounts &counts, bitcompression::Position start, uint64_t bit_length, const Zc4PostingParams &posting_params, const bitcompression::PosOccFieldsParams &fields_params, const fef::TermFieldMatchDataArray &match_data) +{ + if (bigEndian) { + return create_zc_posocc_iterator<true>(counts, start, bit_length, posting_params, fields_params, match_data); + } else { + return create_zc_posocc_iterator<false>(counts, start, bit_length, posting_params, fields_params, match_data); + } +} template class Zc4RareWordPosOccIterator<true>; template class Zc4RareWordPosOccIterator<false>; diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h index 3b58203aab4..76e7b384c11 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h @@ -7,6 +7,8 @@ namespace search::diskindex { +struct Zc4PostingParams; + template <bool bigEndian> class Zc4RareWordPosOccIterator : public Zc4RareWordPostingIterator<bigEndian> { @@ -73,6 +75,9 @@ public: }; +std::unique_ptr<search::queryeval::SearchIterator> +create_zc_posocc_iterator(bool bigEndian, const index::PostingListCounts &counts, bitcompression::Position start, uint64_t bit_length, const Zc4PostingParams &posting_params, const bitcompression::PosOccFieldsParams &fields_params, const fef::TermFieldMatchDataArray &match_data); + extern template class Zc4RareWordPosOccIterator<true>; extern template class Zc4RareWordPosOccIterator<false>; diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp index 9d7df382325..edbd78b9b01 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp @@ -36,15 +36,11 @@ using vespalib::getLastErrorString; ZcPosOccRandRead::ZcPosOccRandRead() : _file(std::make_unique<FastOS_File>()), _fileSize(0), - _minChunkDocs(1 << 30), - _minSkipDocs(64), - _docIdLimit(10000000), + _posting_params(64, 1 << 30, 10000000, true, true, false), _numWords(0), _fileBitSize(0), _headerBitSize(0), - _fieldsParams(), - _dynamicK(true), - _decode_cheap_features(false) + _fieldsParams() { } @@ -65,8 +61,6 @@ createIterator(const PostingListCounts &counts, { (void) usebitVector; - typedef EGPosOccEncodeContext<true> EC; - assert((handle._bitLength != 0) == (counts._bitLength != 0)); assert((counts._numDocs != 0) == (counts._bitLength != 0)); assert(handle._bitOffsetMem <= handle._bitOffset); @@ -85,22 +79,7 @@ createIterator(const PostingListCounts &counts, handle._bitOffsetMem) & 63; Position start(mem, bitOffset); - - EGPosOccDecodeContext<true> d(mem, bitOffset, &_fieldsParams); - - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - uint32_t length; - uint64_t val64; - - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); - - uint32_t numDocs = static_cast<uint32_t>(val64) + 1; - - if (numDocs < _minSkipDocs) { - return new ZcRareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, &_fieldsParams, matchData); - } else { - return new ZcPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, _minChunkDocs, counts, &_fieldsParams, matchData); - } + return create_zc_posocc_iterator(true, counts, start, handle._bitLength, _posting_params, _fieldsParams, matchData).release(); } @@ -200,10 +179,11 @@ ZcPosOccRandRead::close() } +template <typename DecodeContext> void -ZcPosOccRandRead::readHeader() +ZcPosOccRandRead::readHeader(const vespalib::string &identifier) { - EGPosOccDecodeContext<true> d(&_fieldsParams); + DecodeContext d(&_fieldsParams); ComprFileReadContext drc(d); drc.setFile(_file.get()); @@ -227,14 +207,14 @@ ZcPosOccRandRead::readHeader() assert(header.hasTag("minSkipDocs")); assert(header.getTag("frozen").asInteger() != 0); _fileBitSize = header.getTag("fileBitSize").asInteger(); - assert(header.getTag("format.0").asString() == myId5); + assert(header.getTag("format.0").asString() == identifier); assert(header.getTag("format.1").asString() == d.getIdentifier()); _numWords = header.getTag("numWords").asInteger(); - _minChunkDocs = header.getTag("minChunkDocs").asInteger(); - _docIdLimit = header.getTag("docIdLimit").asInteger(); - _minSkipDocs = header.getTag("minSkipDocs").asInteger(); + _posting_params._min_chunk_docs = header.getTag("minChunkDocs").asInteger(); + _posting_params._doc_id_limit = header.getTag("docIdLimit").asInteger(); + _posting_params._min_skip_docs = header.getTag("minSkipDocs").asInteger(); if (header.hasTag(cheap_features) && (header.getTag(cheap_features).asInteger() != 0)) { - _decode_cheap_features = true; + _posting_params._encode_cheap_features = true; } // Read feature decoding specific subheader d.readHeader(header, "features."); @@ -245,6 +225,11 @@ ZcPosOccRandRead::readHeader() _headerBitSize = d.getReadOffset(); } +void +ZcPosOccRandRead::readHeader() +{ + readHeader<EGPosOccDecodeContext<true>>(myId5); +} const vespalib::string & ZcPosOccRandRead::getIdentifier() @@ -266,95 +251,14 @@ Zc4PosOccRandRead:: Zc4PosOccRandRead() : ZcPosOccRandRead() { - _dynamicK = false; + _posting_params._dynamic_k = false; } -search::queryeval::SearchIterator * -Zc4PosOccRandRead:: -createIterator(const PostingListCounts &counts, - const PostingListHandle &handle, - const search::fef::TermFieldMatchDataArray &matchData, - bool usebitVector) const -{ - (void) usebitVector; - typedef EGPosOccEncodeContext<true> EC; - - assert((handle._bitLength != 0) == (counts._bitLength != 0)); - assert((counts._numDocs != 0) == (counts._bitLength != 0)); - assert(handle._bitOffsetMem <= handle._bitOffset); - - if (handle._bitLength == 0) { - return new search::queryeval::EmptySearch; - } - - const char *cmem = static_cast<const char *>(handle._mem); - uint64_t memOffset = reinterpret_cast<unsigned long>(cmem) & 7; - const uint64_t *mem = reinterpret_cast<const uint64_t *> - (cmem - memOffset) + - (memOffset * 8 + handle._bitOffset - - handle._bitOffsetMem) / 64; - int bitOffset = (memOffset * 8 + handle._bitOffset - - handle._bitOffsetMem) & 63; - - Position start(mem, bitOffset); - EG2PosOccDecodeContext<true> d(mem, bitOffset, &_fieldsParams); - - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - uint32_t length; - uint64_t val64; - - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); - - uint32_t numDocs = static_cast<uint32_t>(val64) + 1; - - if (numDocs < _minSkipDocs) { - return new Zc4RareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, &_fieldsParams, matchData); - } else { - return new Zc4PosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, _minChunkDocs, counts, &_fieldsParams, matchData); - } -} - void Zc4PosOccRandRead::readHeader() { - EG2PosOccDecodeContext<true> d(&_fieldsParams); - ComprFileReadContext drc(d); - - drc.setFile(_file.get()); - drc.setFileSize(_file->GetSize()); - drc.allocComprBuf(512, 32768u); - d.emptyBuffer(0); - drc.readComprBuffer(); - d.setReadContext(&drc); - - vespalib::FileHeader header; - d.readHeader(header, _file->getSize()); - uint32_t headerLen = header.getSize(); - assert(header.hasTag("frozen")); - assert(header.hasTag("fileBitSize")); - assert(header.hasTag("format.0")); - assert(header.hasTag("format.1")); - assert(!header.hasTag("format.2")); - assert(header.hasTag("numWords")); - assert(header.hasTag("minChunkDocs")); - assert(header.hasTag("docIdLimit")); - assert(header.hasTag("minSkipDocs")); - assert(header.getTag("frozen").asInteger() != 0); - _fileBitSize = header.getTag("fileBitSize").asInteger(); - assert(header.getTag("format.0").asString() == myId4); - assert(header.getTag("format.1").asString() == d.getIdentifier()); - _numWords = header.getTag("numWords").asInteger(); - _minChunkDocs = header.getTag("minChunkDocs").asInteger(); - _docIdLimit = header.getTag("docIdLimit").asInteger(); - _minSkipDocs = header.getTag("minSkipDocs").asInteger(); - // Read feature decoding specific subheader - d.readHeader(header, "features."); - // Align on 64-bit unit - d.smallAlign(64); - headerLen += (-headerLen & 7); - assert(d.getReadOffset() == headerLen * 8); - _headerBitSize = d.getReadOffset(); + readHeader<EG2PosOccDecodeContext<true> >(myId4); } const vespalib::string & diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h index a78ae6f14f3..dcaca5d1f81 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h @@ -6,6 +6,7 @@ #include <vespa/searchlib/bitcompression/compression.h> #include <vespa/searchlib/bitcompression/posocccompression.h> #include <vespa/searchlib/fef/termfieldmatchdataarray.h> +#include "zc4_posting_params.h" namespace search::diskindex { @@ -14,18 +15,11 @@ class ZcPosOccRandRead : public index::PostingListFileRandRead protected: std::unique_ptr<FastOS_FileInterface> _file; uint64_t _fileSize; - - uint32_t _minChunkDocs; // # of documents needed for chunking - uint32_t _minSkipDocs; // # of documents needed for skipping - uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit) - + Zc4PostingParams _posting_params; uint64_t _numWords; // Number of words in file uint64_t _fileBitSize; uint64_t _headerBitSize; bitcompression::PosOccFieldsParams _fieldsParams; - bool _dynamicK; - bool _decode_cheap_features; - public: ZcPosOccRandRead(); @@ -50,6 +44,8 @@ public: bool open(const vespalib::string &name, const TuneFileRandRead &tuneFileRead) override; bool close() override; + template <typename DecodeContext> + void readHeader(const vespalib::string &identifier); virtual void readHeader(); static const vespalib::string &getIdentifier(); static const vespalib::string &getSubIdentifier(); @@ -57,17 +53,10 @@ public: class Zc4PosOccRandRead : public ZcPosOccRandRead { + using ZcPosOccRandRead::readHeader; public: Zc4PosOccRandRead(); - /** - * Create iterator for single word. Semantic lifetime of counts and - * handle must exceed lifetime of iterator. - */ - queryeval::SearchIterator * - createIterator(const PostingListCounts &counts, const PostingListHandle &handle, - const fef::TermFieldMatchDataArray &matchData, bool usebitVector) const override; - void readHeader() override; static const vespalib::string &getIdentifier(); diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp index c79574a61ff..0f914e2e3b1 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp @@ -1223,6 +1223,7 @@ FakeZcSkipPosOcc<bigEndian>::FakeZcSkipPosOcc(const FakeWord &fw) { setup(fw); _counts._bitLength = _compressedBits; + _counts._numDocs = _hitDocs; } @@ -1284,6 +1285,7 @@ FakeZc4SkipPosOcc<bigEndian>::FakeZc4SkipPosOcc(const FakeWord &fw, const Zc4Pos { setup(fw); _counts._bitLength = _compressedBits; + _counts._numDocs = _hitDocs; } template <bool bigEndian> @@ -1318,26 +1320,7 @@ SearchIterator * FakeZc4SkipPosOcc<bigEndian>:: createIterator(const TermFieldMatchDataArray &matchData) const { - if (_hitDocs >= _posting_params._min_skip_docs) { - if (_posting_params._dynamic_k) { - return new ZcPosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, - static_cast<uint32_t>(-1), - _counts, - &_fieldsParams, - matchData); - } else { - return new Zc4PosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, - static_cast<uint32_t>(-1), _counts, &_fieldsParams, matchData); - } - } else { - if (_posting_params._dynamic_k) { - return new ZcRareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0), - _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, &_fieldsParams, matchData); - } else { - return new Zc4RareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0), - _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, &_fieldsParams, matchData); - } - } + return create_zc_posocc_iterator(bigEndian, _counts, Position(_compressed.first, 0), _compressedBits, _posting_params, _fieldsParams, matchData).release(); } template <bool bigEndian> |