diff options
author | Geir Storli <geirst@verizonmedia.com> | 2019-05-08 21:04:19 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-05-08 21:04:19 +0200 |
commit | f69f960d5ae48d246f56a60e6e46c90a58f836ba (patch) | |
tree | 4fbba4b13578dfd08f708a7b87784ffb713d38f2 | |
parent | 26842a42a1521ad7c6ec333d723c358bb325850f (diff) | |
parent | b8c3247c0813a91b5f064272f750f0e5c49e3352 (diff) |
Merge pull request #9317 from vespa-engine/toregge/extend-zc4-postings-with-cheap-features
Extend Zc4/Zc5 postings with "cheap" features.
23 files changed, 423 insertions, 301 deletions
diff --git a/searchlib/src/vespa/searchlib/bitcompression/compression.h b/searchlib/src/vespa/searchlib/bitcompression/compression.h index b9166f675aa..de206d33b8d 100644 --- a/searchlib/src/vespa/searchlib/bitcompression/compression.h +++ b/searchlib/src/vespa/searchlib/bitcompression/compression.h @@ -68,6 +68,8 @@ private: #define K_VALUE_ZCPOSTING_L4SKIPSIZE 6 #define K_VALUE_ZCPOSTING_FEATURESSIZE 25 #define K_VALUE_ZCPOSTING_DELTA_DOCID 22 +#define K_VALUE_ZCPOSTING_FIELD_LENGTH 9 +#define K_VALUE_ZCPOSTING_NUM_OCCS 0 /** * Lookup tables used for compression / decompression. diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h index ea4cc6f58a6..8a5564d1cf7 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h @@ -15,13 +15,15 @@ struct Zc4PostingParams { uint32_t _doc_id_limit; bool _dynamic_k; bool _encode_features; + bool _encode_cheap_features; - Zc4PostingParams(uint32_t min_skip_docs, uint32_t min_chunk_docs, uint32_t doc_id_limit, bool dynamic_k, bool encode_features) + Zc4PostingParams(uint32_t min_skip_docs, uint32_t min_chunk_docs, uint32_t doc_id_limit, bool dynamic_k, bool encode_features, bool encode_cheap_features) : _min_skip_docs(min_skip_docs), _min_chunk_docs(min_chunk_docs), _doc_id_limit(doc_id_limit), _dynamic_k(dynamic_k), - _encode_features(encode_features) + _encode_features(encode_features), + _encode_cheap_features(encode_cheap_features) { } }; diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp index 961940d318f..ab2598211bb 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp @@ -50,6 +50,22 @@ Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features 
UC64_DECODEEXPGOLOMB_SMALL_NS(o, _doc_id_k, EC); _no_skip.set_doc_id(_no_skip.get_doc_id() + 1 + val64); + if (_posting_params._encode_cheap_features) { + if (__builtin_expect(oCompr >= d._valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + UC64_DECODECONTEXT_LOAD(o, d._); + } + UC64_DECODEEXPGOLOMB_SMALL_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC); + _no_skip.set_field_length(val64 + 1); + if (__builtin_expect(oCompr >= d._valE, false)) { + UC64_DECODECONTEXT_STORE(o, d._); + _readContext.readComprBuffer(); + UC64_DECODECONTEXT_LOAD(o, d._); + } + UC64_DECODEEXPGOLOMB_SMALL_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC); + _no_skip.set_num_occs(val64 + 1); + } UC64_DECODECONTEXT_STORE(o, d._); if (__builtin_expect(oCompr >= d._valE, false)) { _readContext.readComprBuffer(); @@ -57,6 +73,10 @@ Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features } features.set_doc_id(_no_skip.get_doc_id()); if (_posting_params._encode_features) { + if (_posting_params._encode_cheap_features) { + features.set_field_length(_no_skip.get_field_length()); + features.set_num_occs(_no_skip.get_num_occs()); + } _decodeContext->readFeatures(features); } --_residue; diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h index 59a660407b4..1be71450d09 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.h @@ -10,7 +10,9 @@ namespace search::diskindex { * Class used to read posting lists of type "Zc.4" and "Zc.5" (dynamic k). * * Common words have docid deltas and skip info separate from - * features. + * features. If "cheap" features are enabled then they are interleaved + * with docid deltas for quick access during sequential scan while the + * full features still remains separate. * * Rare words do not have skip info, and docid deltas and features are * interleaved. 
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp index 68cf9489475..411b1cae8b9 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.cpp @@ -14,6 +14,8 @@ using bitcompression::DecodeContext64Base; Zc4PostingReaderBase::NoSkip::NoSkip() : _zc_buf(), _doc_id(0), + _field_length(1), + _num_occs(1), _doc_id_pos(0), _features_pos(0) { @@ -35,10 +37,14 @@ Zc4PostingReaderBase::NoSkip::setup(DecodeContext &decode_context, uint32_t size } void -Zc4PostingReaderBase::NoSkip::read() +Zc4PostingReaderBase::NoSkip::read(bool decode_cheap_features) { assert(_zc_buf._valI < _zc_buf._valE); _doc_id += (_zc_buf.decode()+ 1); + if (decode_cheap_features) { + _field_length = _zc_buf.decode() + 1; + _num_occs = _zc_buf.decode() + 1; + } _doc_id_pos = _zc_buf.pos(); } @@ -167,7 +173,7 @@ Zc4PostingReaderBase::Zc4PostingReaderBase(bool dynamic_k) _num_docs(0), _readContext(sizeof(uint64_t)), _has_more(false), - _posting_params(64, 1 << 30, 10000000, dynamic_k, true), + _posting_params(64, 1 << 30, 10000000, dynamic_k, true, false), _last_doc_id(0), _no_skip(), _l1_skip(), @@ -206,7 +212,7 @@ Zc4PostingReaderBase::read_common_word_doc_id(DecodeContext64Base &decode_contex } _l1_skip.next_skip_entry(); } - _no_skip.read(); + _no_skip.read(_posting_params._encode_cheap_features); if (_residue == 1) { _no_skip.check_end(_last_doc_id); _l1_skip.check_end(_last_doc_id); diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h index bc72a61942b..7ca87445633 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader_base.h @@ -24,6 +24,8 @@ protected: protected: ZcBuf _zc_buf; uint32_t _doc_id; + uint32_t _field_length; + 
uint32_t _num_occs; uint32_t _doc_id_pos; uint64_t _features_pos; public: @@ -31,13 +33,17 @@ protected: ~NoSkip(); void setup(DecodeContext &decode_context, uint32_t size, uint32_t doc_id); void set_features_pos(uint64_t features_pos) { _features_pos = features_pos; } - void read(); + void read(bool decode_cheap_features); void check_end(uint32_t last_doc_id); void check_not_end(uint32_t last_doc_id); uint32_t get_doc_id() const { return _doc_id; } + uint32_t get_field_length() const { return _field_length; } + uint32_t get_num_occs() const { return _num_occs; } uint32_t get_doc_id_pos() const { return _doc_id_pos; } uint64_t get_features_pos() const { return _features_pos; } - void set_doc_id(uint32_t doc_id) { _doc_id = doc_id; } + void set_doc_id(uint32_t doc_id) { _doc_id = doc_id; } + void set_field_length(uint32_t field_length) { _field_length = field_length; } + void set_num_occs(uint32_t num_occs) { _num_occs = num_occs; } }; // Helper class for L1 skip info class L1Skip : public NoSkip { diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp index 78d18cb5550..ad8f7440bdc 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp @@ -83,10 +83,10 @@ Zc4PostingWriter<bigEndian>::flush_word_with_skip(bool hasMore) !hasMore) ? 
numDocs : 1, _docIdLimit); - e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first, + e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back()._doc_id, docIdK); } else { - e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first, + e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back()._doc_id, K_VALUE_ZCPOSTING_LASTDOCID); } @@ -132,7 +132,7 @@ Zc4PostingWriter<bigEndian>::flush_word_with_skip(bool hasMore) PostingListCounts::Segment seg; seg._bitLength = writePos - (_writePos + _counts._bitLength); seg._numDocs = numDocs; - seg._lastDoc = _docIds.back().first; + seg._lastDoc = _docIds.back()._doc_id; _counts._segments.push_back(seg); _counts._bitLength += seg._bitLength; } @@ -153,11 +153,11 @@ Zc4PostingWriter<bigEndian>::write_docid_and_features(const DocIdAndFeatures &fe uint64_t writeOffset = _encode_features->getWriteOffset(); uint64_t featureSize = writeOffset - _featureOffset; assert(static_cast<uint32_t>(featureSize) == featureSize); - _docIds.push_back(std::make_pair(features.doc_id(), - static_cast<uint32_t>(featureSize))); + _docIds.emplace_back(features.doc_id(), features.field_length(), features.num_occs(), + static_cast<uint32_t>(featureSize)); _featureOffset = writeOffset; } else { - _docIds.push_back(std::make_pair(features.doc_id(), uint32_t(0))); + _docIds.emplace_back(features.doc_id(), features.field_length(), features.num_occs(), 0); } } @@ -187,10 +187,16 @@ Zc4PostingWriter<bigEndian>::flush_word_no_skip() std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end(); for (; dit != dite; ++dit) { - uint32_t docId = dit->first; - uint32_t featureSize = dit->second; + uint32_t docId = dit->_doc_id; + uint32_t featureSize = dit->_features_size; e.encodeExpGolomb(docId - baseDocId, docIdK); baseDocId = docId + 1; + if (_encode_cheap_features) { + assert(dit->_field_length > 0); + e.encodeExpGolomb(dit->_field_length - 1, K_VALUE_ZCPOSTING_FIELD_LENGTH); + assert(dit->_num_occs > 0); + e.encodeExpGolomb(dit->_num_occs - 1, 
K_VALUE_ZCPOSTING_NUM_OCCS); + } if (featureSize != 0) { e.writeBits(features + (featureOffset >> 6), featureOffset & 63, diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h index f2c96af8901..ce1a73db7a5 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h @@ -12,7 +12,9 @@ namespace search::diskindex { * Class used to write posting lists of type "Zc.4" and "Zc.5" (dynamic k). * * Common words have docid deltas and skip info separate from - * features. + * features. If "cheap" features are enabled then they are interleaved + * with docid deltas for quick access during sequential scan while the + * full features still remains separate. * * Rare words do not have skip info, and docid deltas and features are * interleaved. diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp index 5ab37cecc3d..6e0cf6ed881 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp @@ -15,7 +15,7 @@ protected: uint32_t _doc_id; uint32_t _doc_id_pos; uint32_t _feature_pos; - using DocIdAndFeatureSize = std::pair<uint32_t, uint32_t>; + using DocIdAndFeatureSize = Zc4PostingWriterBase::DocIdAndFeatureSize; public: DocIdEncoder() @@ -25,7 +25,7 @@ public: { } - void write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size); + void write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size, bool encode_cheap_features); void set_doc_id(uint32_t doc_id) { _doc_id = doc_id; } uint32_t get_doc_id() const { return _doc_id; } uint32_t get_doc_id_pos() const { return _doc_id_pos; } @@ -100,11 +100,17 @@ public: }; void -DocIdEncoder::write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size) 
+DocIdEncoder::write(ZcBuf &zc_buf, const DocIdAndFeatureSize &doc_id_and_feature_size, bool encode_cheap_features) { - _feature_pos += doc_id_and_feature_size.second; - zc_buf.encode(doc_id_and_feature_size.first - _doc_id - 1); - _doc_id = doc_id_and_feature_size.first; + _feature_pos += doc_id_and_feature_size._features_size; + zc_buf.encode(doc_id_and_feature_size._doc_id - _doc_id - 1); + _doc_id = doc_id_and_feature_size._doc_id; + if (encode_cheap_features) { + assert(doc_id_and_feature_size._field_length > 0); + zc_buf.encode(doc_id_and_feature_size._field_length - 1); + assert(doc_id_and_feature_size._num_occs > 0); + zc_buf.encode(doc_id_and_feature_size._num_occs - 1); + } _doc_id_pos = zc_buf.size(); } @@ -199,6 +205,7 @@ Zc4PostingWriterBase::Zc4PostingWriterBase(PostingListCounts &counts) _featureOffset(0), _writePos(0), _dynamicK(false), + _encode_cheap_features(false), _zcDocIds(), _l1Skip(), _l2Skip(), @@ -257,7 +264,7 @@ Zc4PostingWriterBase::calc_skip_info(bool encode_features) } } } - doc_id_encoder.write(_zcDocIds, doc_id_and_feature_size); + doc_id_encoder.write(_zcDocIds, doc_id_and_feature_size, _encode_cheap_features); } // Extra partial entries for skip tables to simplify iterator during search l1_skip_encoder.write_partial_skip(_l1Skip, doc_id_encoder.get_doc_id()); @@ -282,6 +289,7 @@ Zc4PostingWriterBase::set_posting_list_params(const PostingListParams ¶ms) params.get("docIdLimit", _docIdLimit); params.get("minChunkDocs", _minChunkDocs); params.get("minSkipDocs", _minSkipDocs); + params.get("cheap_features", _encode_cheap_features); } } diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h index 6da59028803..bb94e379c38 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h @@ -18,18 +18,32 @@ namespace search::diskindex { */ class 
Zc4PostingWriterBase { +public: + struct DocIdAndFeatureSize { + uint32_t _doc_id; + uint32_t _field_length; + uint32_t _num_occs; + uint32_t _features_size; + DocIdAndFeatureSize(uint32_t doc_id, uint32_t field_length, uint32_t num_occs, uint32_t features_size) + : _doc_id(doc_id), + _field_length(field_length), + _num_occs(num_occs), + _features_size(features_size) + { + } + }; protected: uint32_t _minChunkDocs; // # of documents needed for chunking uint32_t _minSkipDocs; // # of documents needed for skipping uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit) // Unpacked document ids for word and feature sizes - using DocIdAndFeatureSize = std::pair<uint32_t, uint32_t>; std::vector<DocIdAndFeatureSize> _docIds; uint64_t _featureOffset; // Bit offset of next feature uint64_t _writePos; // Bit position for start of current word bool _dynamicK; // Caclulate EG compression parameters ? + bool _encode_cheap_features; ZcBuf _zcDocIds; // Document id deltas ZcBuf _l1Skip; // L1 skip info ZcBuf _l2Skip; // L2 skip info @@ -58,7 +72,9 @@ public: uint32_t get_docid_limit() const { return _docIdLimit; } uint64_t get_num_words() const { return _numWords; } bool get_dynamic_k() const { return _dynamicK; } + bool get_encode_cheap_features() const { return _encode_cheap_features; } void set_dynamic_k(bool dynamicK) { _dynamicK = dynamicK; } + void set_encode_cheap_features(bool encode_cheap_features) { _encode_cheap_features = encode_cheap_features; } void set_posting_list_params(const index::PostingListParams ¶ms); }; diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp index 64160c848fb..7678b11ba41 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.cpp @@ -13,10 +13,10 @@ using search::index::PostingListCounts; template <bool bigEndian> Zc4RareWordPosOccIterator<bigEndian>:: 
-Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, +Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features, const PosOccFieldsParams *fieldsParams, const TermFieldMatchDataArray &matchData) - : Zc4RareWordPostingIterator<bigEndian>(matchData, start, docIdLimit), + : Zc4RareWordPostingIterator<bigEndian>(matchData, start, docIdLimit, decode_cheap_features), _decodeContextReal(start.getOccurences(), start.getBitOffset(), bitLength, fieldsParams) { assert(!matchData.valid() || (fieldsParams->getNumFields() == matchData.size())); @@ -26,11 +26,11 @@ Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimi template <bool bigEndian> Zc4PosOccIterator<bigEndian>:: -Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, +Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features, uint32_t minChunkDocs, const PostingListCounts &counts, const PosOccFieldsParams *fieldsParams, const TermFieldMatchDataArray &matchData) - : ZcPostingIterator<bigEndian>(minChunkDocs, false, counts, matchData, start, docIdLimit), + : ZcPostingIterator<bigEndian>(minChunkDocs, false, counts, matchData, start, docIdLimit, decode_cheap_features), _decodeContextReal(start.getOccurences(), start.getBitOffset(), bitLength, fieldsParams) { assert(!matchData.valid() || (fieldsParams->getNumFields() == matchData.size())); @@ -40,10 +40,10 @@ Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, template <bool bigEndian> ZcRareWordPosOccIterator<bigEndian>:: -ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, +ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features, const PosOccFieldsParams *fieldsParams, const TermFieldMatchDataArray &matchData) - : ZcRareWordPostingIterator<bigEndian>(matchData, start, docIdLimit), + : 
ZcRareWordPostingIterator<bigEndian>(matchData, start, docIdLimit, decode_cheap_features), _decodeContextReal(start.getOccurences(), start.getBitOffset(), bitLength, fieldsParams) { assert(!matchData.valid() || (fieldsParams->getNumFields() == matchData.size())); @@ -53,11 +53,11 @@ ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit template <bool bigEndian> ZcPosOccIterator<bigEndian>:: -ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, +ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features, uint32_t minChunkDocs, const PostingListCounts &counts, const PosOccFieldsParams *fieldsParams, const TermFieldMatchDataArray &matchData) - : ZcPostingIterator<bigEndian>(minChunkDocs, true, counts, matchData, start, docIdLimit), + : ZcPostingIterator<bigEndian>(minChunkDocs, true, counts, matchData, start, docIdLimit, decode_cheap_features), _decodeContextReal(start.getOccurences(), start.getBitOffset(), bitLength, fieldsParams) { assert(!matchData.valid() || (fieldsParams->getNumFields() == matchData.size())); diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h index 41f2b747916..3b58203aab4 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocciterators.h @@ -17,7 +17,7 @@ private: typedef bitcompression::EG2PosOccDecodeContextCooked<bigEndian> DecodeContextReal; DecodeContextReal _decodeContextReal; public: - Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, + Zc4RareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features, const bitcompression::PosOccFieldsParams *fieldsParams, const fef::TermFieldMatchDataArray &matchData); }; @@ -33,7 +33,7 @@ private: typedef bitcompression::EG2PosOccDecodeContextCooked<bigEndian> DecodeContext; 
DecodeContext _decodeContextReal; public: - Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, + Zc4PosOccIterator(Position start, uint64_t bitLength, uint32_t docIdLimit, bool decode_cheap_features, uint32_t minChunkDocs, const index::PostingListCounts &counts, const bitcompression::PosOccFieldsParams *fieldsParams, const fef::TermFieldMatchDataArray &matchData); @@ -50,7 +50,7 @@ private: typedef bitcompression::EGPosOccDecodeContextCooked<bigEndian> DecodeContextReal; DecodeContextReal _decodeContextReal; public: - ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docidLimit, + ZcRareWordPosOccIterator(Position start, uint64_t bitLength, uint32_t docidLimit, bool decode_cheap_features, const bitcompression::PosOccFieldsParams *fieldsParams, const fef::TermFieldMatchDataArray &matchData); }; @@ -66,7 +66,7 @@ private: typedef bitcompression::EGPosOccDecodeContextCooked<bigEndian> DecodeContext; DecodeContext _decodeContextReal; public: - ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docidLimit, + ZcPosOccIterator(Position start, uint64_t bitLength, uint32_t docidLimit, bool decode_cheap_features, uint32_t minChunkDocs, const index::PostingListCounts &counts, const bitcompression::PosOccFieldsParams *fieldsParams, const fef::TermFieldMatchDataArray &matchData); diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp index 90680851859..9d7df382325 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp @@ -25,6 +25,7 @@ namespace { vespalib::string myId4("Zc.4"); vespalib::string myId5("Zc.5"); +vespalib::string cheap_features("cheap_features"); } @@ -42,7 +43,8 @@ ZcPosOccRandRead::ZcPosOccRandRead() _fileBitSize(0), _headerBitSize(0), _fieldsParams(), - _dynamicK(true) + _dynamicK(true), + _decode_cheap_features(false) { } @@ -95,9 +97,9 @@ 
createIterator(const PostingListCounts &counts, uint32_t numDocs = static_cast<uint32_t>(val64) + 1; if (numDocs < _minSkipDocs) { - return new ZcRareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, &_fieldsParams, matchData); + return new ZcRareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, &_fieldsParams, matchData); } else { - return new ZcPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _minChunkDocs, counts, &_fieldsParams, matchData); + return new ZcPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, _minChunkDocs, counts, &_fieldsParams, matchData); } } @@ -231,6 +233,9 @@ ZcPosOccRandRead::readHeader() _minChunkDocs = header.getTag("minChunkDocs").asInteger(); _docIdLimit = header.getTag("docIdLimit").asInteger(); _minSkipDocs = header.getTag("minSkipDocs").asInteger(); + if (header.hasTag(cheap_features) && (header.getTag(cheap_features).asInteger() != 0)) { + _decode_cheap_features = true; + } // Read feature decoding specific subheader d.readHeader(header, "features."); // Align on 64-bit unit @@ -304,9 +309,9 @@ createIterator(const PostingListCounts &counts, uint32_t numDocs = static_cast<uint32_t>(val64) + 1; if (numDocs < _minSkipDocs) { - return new Zc4RareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, &_fieldsParams, matchData); + return new Zc4RareWordPosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, &_fieldsParams, matchData); } else { - return new Zc4PosOccIterator<true>(start, handle._bitLength, _docIdLimit, _minChunkDocs, counts, &_fieldsParams, matchData); + return new Zc4PosOccIterator<true>(start, handle._bitLength, _docIdLimit, _decode_cheap_features, _minChunkDocs, counts, &_fieldsParams, matchData); } } diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h index 3741f011c30..a78ae6f14f3 100644 --- 
a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h @@ -24,6 +24,7 @@ protected: uint64_t _headerBitSize; bitcompression::PosOccFieldsParams _fieldsParams; bool _dynamicK; + bool _decode_cheap_features; public: diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp index a0203b64197..b03085b0b55 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp @@ -16,6 +16,7 @@ namespace { vespalib::string myId5("Zc.5"); vespalib::string myId4("Zc.4"); vespalib::string emptyId; +vespalib::string cheap_features("cheap_features"); } @@ -165,6 +166,9 @@ Zc4PostingSeqRead::readHeader() posting_params._min_chunk_docs = header.getTag("minChunkDocs").asInteger(); posting_params._doc_id_limit = header.getTag("docIdLimit").asInteger(); posting_params._min_skip_docs = header.getTag("minSkipDocs").asInteger(); + if (header.hasTag(cheap_features) && (header.getTag(cheap_features).asInteger() != 0)) { + posting_params._encode_cheap_features = true; + } assert(header.getTag("endian").asString() == "big"); // Read feature decoding specific subheader d.readHeader(header, "features."); @@ -233,6 +237,7 @@ Zc4PostingSeqWrite::makeHeader(const FileHeaderContext &fileHeaderContext) header.putTag(Tag("fileBitSize", 0)); header.putTag(Tag("format.0", myId)); header.putTag(Tag("format.1", f.getIdentifier())); + header.putTag(Tag("cheap_features", _writer.get_encode_cheap_features() ? 
1 : 0)); header.putTag(Tag("numWords", 0)); header.putTag(Tag("minChunkDocs", _writer.get_min_chunk_docs())); header.putTag(Tag("docIdLimit", _writer.get_docid_limit())); diff --git a/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp b/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp index 95679bb0af2..8ea576b06b0 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.cpp @@ -35,12 +35,15 @@ ZcIteratorBase::initRange(uint32_t beginid, uint32_t endid) template <bool bigEndian> Zc4RareWordPostingIterator<bigEndian>:: -Zc4RareWordPostingIterator(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit) +Zc4RareWordPostingIterator(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features) : ZcIteratorBase(matchData, start, docIdLimit), _decodeContext(nullptr), _residue(0), _prevDocId(0), - _numDocs(0) + _numDocs(0), + _decode_cheap_features(decode_cheap_features), + _field_length(0), + _num_occs(0) { } @@ -66,6 +69,12 @@ Zc4RareWordPostingIterator<bigEndian>::doSeek(uint32_t docId) printf("Decode docId=%d\n", oDocId); #endif + if (_decode_cheap_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC); + _field_length = static_cast<uint32_t>(val64) + 1; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC); + _num_occs = static_cast<uint32_t>(val64) + 1; + } } while (__builtin_expect(oDocId < docId, true)) { UC64_DECODECONTEXT_STORE(o, _decodeContext->_); @@ -80,6 +89,12 @@ Zc4RareWordPostingIterator<bigEndian>::doSeek(uint32_t docId) printf("Decode docId=%d\n", oDocId); #endif + if (_decode_cheap_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC); + _field_length = static_cast<uint32_t>(val64) + 1; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC); + _num_occs = static_cast<uint32_t>(val64) + 1; + } } 
UC64_DECODECONTEXT_STORE(o, _decodeContext->_); setDocId(oDocId); @@ -123,6 +138,12 @@ Zc4RareWordPostingIterator<bigEndian>::readWordStart(uint32_t docIdLimit) _numDocs = static_cast<uint32_t>(val64) + 1; UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DELTA_DOCID, EC); uint32_t docId = static_cast<uint32_t>(val64) + 1; + if (_decode_cheap_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC); + _field_length = static_cast<uint32_t>(val64) + 1; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC); + _num_occs = static_cast<uint32_t>(val64) + 1; + } UC64_DECODECONTEXT_STORE(o, _decodeContext->_); setDocId(docId); @@ -133,8 +154,8 @@ Zc4RareWordPostingIterator<bigEndian>::readWordStart(uint32_t docIdLimit) template <bool bigEndian> ZcRareWordPostingIterator<bigEndian>:: -ZcRareWordPostingIterator(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit) - : Zc4RareWordPostingIterator<bigEndian>(matchData, start, docIdLimit), +ZcRareWordPostingIterator(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features) + : Zc4RareWordPostingIterator<bigEndian>(matchData, start, docIdLimit, decode_cheap_features), _docIdK(0) { } @@ -162,6 +183,12 @@ ZcRareWordPostingIterator<bigEndian>::doSeek(uint32_t docId) printf("Decode docId=%d\n", oDocId); #endif + if (_decode_cheap_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC); + _field_length = static_cast<uint32_t>(val64) + 1; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC); + _num_occs = static_cast<uint32_t>(val64) + 1; + } } while (__builtin_expect(oDocId < docId, true)) { UC64_DECODECONTEXT_STORE(o, _decodeContext->_); @@ -176,6 +203,12 @@ ZcRareWordPostingIterator<bigEndian>::doSeek(uint32_t docId) printf("Decode docId=%d\n", oDocId); #endif + if (_decode_cheap_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC); + _field_length = static_cast<uint32_t>(val64) + 1; + 
UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC); + _num_occs = static_cast<uint32_t>(val64) + 1; + } } UC64_DECODECONTEXT_STORE(o, _decodeContext->_); setDocId(oDocId); @@ -200,6 +233,12 @@ ZcRareWordPostingIterator<bigEndian>::readWordStart(uint32_t docIdLimit) _docIdK = EC::calcDocIdK(_numDocs, docIdLimit); UC64_DECODEEXPGOLOMB_NS(o, _docIdK, EC); uint32_t docId = static_cast<uint32_t>(val64) + 1; + if (_decode_cheap_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FIELD_LENGTH, EC); + _field_length = static_cast<uint32_t>(val64) + 1; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUM_OCCS, EC); + _num_occs = static_cast<uint32_t>(val64) + 1; + } UC64_DECODECONTEXT_STORE(o, _decodeContext->_); setDocId(docId); @@ -207,7 +246,7 @@ ZcRareWordPostingIterator<bigEndian>::readWordStart(uint32_t docIdLimit) clearUnpacked(); } -ZcPostingIteratorBase::ZcPostingIteratorBase(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit) +ZcPostingIteratorBase::ZcPostingIteratorBase(const TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features) : ZcIteratorBase(matchData, start, docIdLimit), _valI(nullptr), _valIBase(nullptr), @@ -219,7 +258,10 @@ ZcPostingIteratorBase::ZcPostingIteratorBase(const TermFieldMatchDataArray &matc _chunk(), _featuresSize(0), _hasMore(false), - _chunkNo(0) + _decode_cheap_features(decode_cheap_features), + _chunkNo(0), + _field_length(0), + _num_occs(0) { } @@ -229,8 +271,8 @@ ZcPostingIterator(uint32_t minChunkDocs, bool dynamicK, const PostingListCounts &counts, const search::fef::TermFieldMatchDataArray &matchData, - Position start, uint32_t docIdLimit) - : ZcPostingIteratorBase(matchData, start, docIdLimit), + Position start, uint32_t docIdLimit, bool decode_cheap_features) + : ZcPostingIteratorBase(matchData, start, docIdLimit, decode_cheap_features), _decodeContext(nullptr), _minChunkDocs(minChunkDocs), _docIdK(0), @@ -550,6 +592,8 @@ 
ZcPostingIteratorBase::doSeek(uint32_t docId) assert(docId <= _l4._skipDocId); #endif const uint8_t *oCompr = _valI; + uint32_t field_length = _field_length; + uint32_t num_occs = _num_occs; while (__builtin_expect(oDocId < docId, true)) { #if DEBUG_ZCPOSTING_ASSERT assert(oDocId <= _l1._skipDocId); @@ -562,10 +606,18 @@ ZcPostingIteratorBase::doSeek(uint32_t docId) printf("Decode docId=%d\n", oDocId); #endif + if (_decode_cheap_features) { + ZCDECODE(oCompr, field_length = 1 +); + ZCDECODE(oCompr, num_occs = 1 +); + } incNeedUnpack(); } _valI = oCompr; setDocId(oDocId); + if (_decode_cheap_features) { + _field_length = field_length; + _num_occs = num_occs; + } return; } diff --git a/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h b/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h index 97b7e2dc0cc..222fb404a7d 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcpostingiterators.h @@ -68,8 +68,11 @@ public: unsigned int _residue; uint32_t _prevDocId; // Previous document id uint32_t _numDocs; // Documents in chunk or word + bool _decode_cheap_features; + uint32_t _field_length; + uint32_t _num_occs; - Zc4RareWordPostingIterator(const fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit); + Zc4RareWordPostingIterator(const fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features); void doUnpack(uint32_t docId) override; void doSeek(uint32_t docId) override; @@ -89,12 +92,15 @@ private: using ParentClass::setDocId; using ParentClass::setAtEnd; using ParentClass::_numDocs; + using ParentClass::_decode_cheap_features; + using ParentClass::_field_length; + using ParentClass::_num_occs; uint32_t _docIdK; public: using ParentClass::_decodeContext; - ZcRareWordPostingIterator(const search::fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit); + ZcRareWordPostingIterator(const 
search::fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features); void doSeek(uint32_t docId) override; void readWordStart(uint32_t docIdLimit) override; @@ -239,12 +245,19 @@ protected: ChunkSkip _chunk; uint64_t _featuresSize; bool _hasMore; + bool _decode_cheap_features; uint32_t _chunkNo; + uint32_t _field_length; + uint32_t _num_occs; void nextDocId(uint32_t prevDocId) { uint32_t docId = prevDocId + 1; ZCDECODE(_valI, docId +=); setDocId(docId); + if (_decode_cheap_features) { + ZCDECODE(_valI, _field_length =); + ZCDECODE(_valI, _num_occs =); + } } virtual void featureSeek(uint64_t offset) = 0; VESPA_DLL_LOCAL void doChunkSkipSeek(uint32_t docId); @@ -254,7 +267,7 @@ protected: VESPA_DLL_LOCAL void doL1SkipSeek(uint32_t docId); void doSeek(uint32_t docId) override; public: - ZcPostingIteratorBase(const fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit); + ZcPostingIteratorBase(const fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features); }; template <bool bigEndian> @@ -281,7 +294,7 @@ public: const PostingListCounts &_counts; ZcPostingIterator(uint32_t minChunkDocs, bool dynamicK, const PostingListCounts &counts, - const search::fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit); + const search::fef::TermFieldMatchDataArray &matchData, Position start, uint32_t docIdLimit, bool decode_cheap_features); void doUnpack(uint32_t docId) override; diff --git a/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp b/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp index 07b4da8a85f..ac3c4f4d3a5 100644 --- a/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp +++ b/searchlib/src/vespa/searchlib/index/docidandfeatures.cpp @@ -8,6 +8,8 @@ namespace search::index { DocIdAndFeatures::DocIdAndFeatures() : _doc_id(0), + _field_length(1), + _num_occs(1), _elements(), _word_positions(), _blob(), @@ -19,6 
+21,6 @@ DocIdAndFeatures::DocIdAndFeatures() DocIdAndFeatures::DocIdAndFeatures(const DocIdAndFeatures &) = default; DocIdAndFeatures & DocIdAndFeatures::operator = (const DocIdAndFeatures &) = default; -DocIdAndFeatures::~DocIdAndFeatures() { } +DocIdAndFeatures::~DocIdAndFeatures() = default; } diff --git a/searchlib/src/vespa/searchlib/index/docidandfeatures.h b/searchlib/src/vespa/searchlib/index/docidandfeatures.h index a063712a79e..5372d5ef3aa 100644 --- a/searchlib/src/vespa/searchlib/index/docidandfeatures.h +++ b/searchlib/src/vespa/searchlib/index/docidandfeatures.h @@ -95,6 +95,8 @@ public: protected: uint32_t _doc_id; // Current document id + uint32_t _field_length; + uint32_t _num_occs; std::vector<WordDocElementFeatures> _elements; std::vector<WordDocElementWordPosFeatures> _word_positions; @@ -140,7 +142,11 @@ public: } uint32_t doc_id() const { return _doc_id; } + uint32_t field_length() const { return _field_length; } + uint32_t num_occs() const { return _num_occs; } void set_doc_id(uint32_t val) { _doc_id = val; } + void set_field_length(uint32_t val) { _field_length = val; } + void set_num_occs(uint32_t val) { _num_occs = val; } const std::vector<WordDocElementFeatures>& elements() const { return _elements; } std::vector<WordDocElementFeatures>& elements() { return _elements; } diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp index 8f6c16658c9..601451dc6c4 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.cpp @@ -107,14 +107,12 @@ FakeWord::DocWordPosFeature::~DocWordPosFeature() FakeWord::DocWordCollapsedFeature::DocWordCollapsedFeature() + : _field_len(0), + _num_occs(0) { } - -FakeWord::DocWordCollapsedFeature::~DocWordCollapsedFeature() -{ -} - +FakeWord::DocWordCollapsedFeature::~DocWordCollapsedFeature() = default; FakeWord::DocWordFeature::DocWordFeature() : _docId(0), @@ -235,14 
+233,16 @@ FakeWord::fakeup(search::BitVector &bitmap, DocWordPosFeature dwpf; dwpf._wordPos = rnd.lrand48() % 8192; dwpf._elementId = 0; - if (_fieldsParams.getFieldParams()[0]._hasElements) + if (_fieldsParams.getFieldParams()[0]._hasElements) { dwpf._elementId = rnd.lrand48() % 4; + } wpf.push_back(dwpf); } if (positions > 1) { /* Sort wordpos list and "avoid" duplicate positions */ std::sort(wpf.begin(), wpf.end()); } + uint32_t field_len = 0; do { DocWordPosFeatureList::iterator ie(wpf.end()); DocWordPosFeatureList::iterator i(wpf.begin()); @@ -274,8 +274,14 @@ FakeWord::fakeup(search::BitVector &bitmap, pi->_elementWeight = elementWeight; ++pi; } + field_len += elementLen; + } + if (_fieldsParams.getFieldParams()[0]._hasElements) { + field_len += ((rnd.lrand48() % 10) + 10); } } while (0); + dwf._collapsedDocWordFeatures._field_len = field_len; + dwf._collapsedDocWordFeatures._num_occs = dwf._positions; dwf._accPositions = wordPosFeatures.size(); assert(dwf._positions == wpf.size()); postings.push_back(dwf); diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.h b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.h index 345d69c29f6..106c0c0d9ab 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakeword.h +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakeword.h @@ -49,6 +49,9 @@ public: class DocWordCollapsedFeature { public: + uint32_t _field_len; + uint32_t _num_occs; + DocWordCollapsedFeature(); ~DocWordCollapsedFeature(); }; @@ -201,6 +204,8 @@ public: p->_elementWeight, p->_elementLen); ++p; } + features.set_field_length(d._collapsedDocWordFeatures._field_len); + features.set_num_occs(d._collapsedDocWordFeatures._num_occs); } public: diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp index f6c6e5a64f3..c79574a61ff 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp +++ 
b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp @@ -29,6 +29,13 @@ namespace search { namespace fakedata { +namespace { + +constexpr uint32_t disable_chunking = 1000000000; +constexpr uint32_t disable_skip = 1000000000; +constexpr uint32_t force_skip = 1; + +} #define DEBUG_ZCFILTEROCC_PRINTF 0 #define DEBUG_ZCFILTEROCC_ASSERT 0 @@ -86,7 +93,6 @@ FakeZcFilterOcc::FakeZcFilterOcc(const FakeWord &fw) _l2SkipSize(0), _l3SkipSize(0), _l4SkipSize(0), - _docIdLimit(0), _hitDocs(0), _lastDocId(0u), _compressedBits(0), @@ -94,14 +100,16 @@ FakeZcFilterOcc::FakeZcFilterOcc(const FakeWord &fw) _compressedMalloc(NULL), _featuresSize(0), _fieldsParams(fw.getFieldsParams()), - _bigEndian(true) + _bigEndian(true), + _posting_params(force_skip, disable_chunking, fw._docIdLimit, true, false, false) { - setup(fw, false, true); + setup(fw); } FakeZcFilterOcc::FakeZcFilterOcc(const FakeWord &fw, bool bigEndian, + const diskindex::Zc4PostingParams &posting_params, const char *nameSuffix) : FakePosting(fw.getName() + nameSuffix), _docIdsSize(0), @@ -109,36 +117,34 @@ FakeZcFilterOcc::FakeZcFilterOcc(const FakeWord &fw, _l2SkipSize(0), _l3SkipSize(0), _l4SkipSize(0), - _docIdLimit(0), _hitDocs(0), _lastDocId(0u), _compressedBits(0), _compressed(std::make_pair(static_cast<uint64_t *>(NULL), 0)), _featuresSize(0), _fieldsParams(fw.getFieldsParams()), - _bigEndian(bigEndian) + _bigEndian(bigEndian), + _posting_params(posting_params) { - // subclass responsible for calling setup(fw, false/true); + // subclass responsible for calling setup(fw); } void -FakeZcFilterOcc::setup(const FakeWord &fw, bool doFeatures, - bool dynamicK) +FakeZcFilterOcc::setup(const FakeWord &fw) { if (_bigEndian) { - setupT<true>(fw, doFeatures, dynamicK); + setupT<true>(fw); } else { - setupT<false>(fw, doFeatures, dynamicK); + setupT<false>(fw); } - validate_read(fw, doFeatures, dynamicK); + validate_read(fw); } template <bool bigEndian> void -FakeZcFilterOcc::setupT(const FakeWord &fw, bool 
doFeatures, - bool dynamicK) +FakeZcFilterOcc::setupT(const FakeWord &fw) { PostingListCounts counts; Zc4PostingWriter<bigEndian> writer(counts); @@ -154,18 +160,19 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures, DocIdAndPosOccFeatures features; EGPosOccEncodeContext<bigEndian> f1(&_fieldsParams); EG2PosOccEncodeContext<bigEndian> f0(&_fieldsParams); - FeatureEncodeContext<bigEndian> &f = (dynamicK ? + FeatureEncodeContext<bigEndian> &f = (_posting_params._dynamic_k ? static_cast<FeatureEncodeContext<bigEndian> &>(f1) : static_cast<FeatureEncodeContext<bigEndian> &>(f0)); - writer.set_dynamic_k(dynamicK); - if (doFeatures) { + writer.set_dynamic_k(_posting_params._dynamic_k); + if (_posting_params._encode_features) { writer.set_encode_features(&f); } PostingListParams params; params.set("docIdLimit", fw._docIdLimit); - params.set("minChunkDocs", 1000000000); // Disable chunking - params.set("minSkipDocs", 1u); // Force skip info + params.set("minChunkDocs", _posting_params._min_chunk_docs); // Control chunking + params.set("minSkipDocs", _posting_params._min_skip_docs); // Control skip info + params.set("cheap_features", _posting_params._encode_cheap_features); writer.set_posting_list_params(params); auto &writeContext = writer.get_write_context(); search::ComprBuffer &cb = writeContext; @@ -174,7 +181,7 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures, e.setupWrite(cb); // Ensure that some space is initially available in encoding buffers while (d != de) { - if (doFeatures) { + if (_posting_params._encode_features) { fw.setupFeatures(*d, &*p, features); p += d->_positions; } else { @@ -183,13 +190,12 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures, writer.write_docid_and_features(features); ++d; } - if (doFeatures) { + if (_posting_params._encode_features) { assert(p == pe); } writer.flush_word(); _featuresSize = 0; _hitDocs = fw._postings.size(); - _docIdLimit = fw._docIdLimit; _compressedBits = e.getWriteOffset(); 
assert(_compressedBits == counts._bitLength); assert(_hitDocs == counts._numDocs); @@ -199,55 +205,56 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures, std::pair<void *, size_t> ectxData = writeContext.grabComprBuffer(_compressedMalloc); _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first), ectxData.second); - read_header<bigEndian>(doFeatures, dynamicK, writer.get_min_skip_docs(), writer.get_min_chunk_docs()); + read_header<bigEndian>(); } template <bool bigEndian> void -FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_docs, uint32_t min_chunk_docs) +FakeZcFilterOcc::read_header() { // read back word header to get skip sizes DecodeContext64<bigEndian> decode_context; decode_context.setPosition({ _compressed.first, 0 }); - Zc4PostingParams params(min_skip_docs, min_chunk_docs, _docIdLimit, dynamicK, doFeatures); Zc4PostingHeader header; - header.read(decode_context, params); + header.read(decode_context, _posting_params); _docIdsSize = header._doc_ids_size; _l1SkipSize = header._l1_skip_size; _l2SkipSize = header._l2_skip_size; _l3SkipSize = header._l3_skip_size; _l4SkipSize = header._l4_skip_size; _featuresSize = header._features_size; - assert(_lastDocId == header._last_doc_id); + assert(header._num_docs == _hitDocs); + if (header._num_docs >= _posting_params._min_skip_docs) { + assert(_lastDocId == header._last_doc_id); + } else { + assert(header._last_doc_id == 0); + } } void -FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const +FakeZcFilterOcc::validate_read(const FakeWord &fw) const { if (_bigEndian) { - validate_read<true>(fw, encode_features, dynamic_k); + validate_read<true>(fw); } else { - validate_read<false>(fw, encode_features, dynamic_k); + validate_read<false>(fw); } } template <bool bigEndian> void -FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const +FakeZcFilterOcc::validate_read(const FakeWord &fw) 
const { bitcompression::EGPosOccDecodeContextCooked<bigEndian> decode_context_dynamic_k(&_fieldsParams); bitcompression::EG2PosOccDecodeContextCooked<bigEndian> decode_context_static_k(&_fieldsParams); bitcompression::FeatureDecodeContext<bigEndian> &decode_context_dynamic_k_upcast = decode_context_dynamic_k; bitcompression::FeatureDecodeContext<bigEndian> &decode_context_static_k_upcast = decode_context_static_k; - bitcompression::FeatureDecodeContext<bigEndian> &decode_context = dynamic_k ? decode_context_dynamic_k_upcast : decode_context_static_k_upcast; - Zc4PostingReader<bigEndian> reader(dynamic_k); + bitcompression::FeatureDecodeContext<bigEndian> &decode_context = _posting_params._dynamic_k ? decode_context_dynamic_k_upcast : decode_context_static_k_upcast; + Zc4PostingReader<bigEndian> reader(_posting_params._dynamic_k); reader.set_decode_features(&decode_context); auto ¶ms = reader.get_posting_params(); - params._min_skip_docs = 1; - params._min_chunk_docs = 1000000000; - params._doc_id_limit = _docIdLimit; - params._encode_features = encode_features; + params = _posting_params; reader.get_read_context().reference_compressed_buffer(_compressed.first, _compressed.second); assert(decode_context.getReadOffset() == 0u); PostingListCounts counts; @@ -260,7 +267,7 @@ FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dy DocIdAndFeatures features; uint32_t hits = 0; for (const auto &doc : fw._postings) { - if (encode_features) { + if (_posting_params._encode_features) { fw.setupFeatures(doc, &*word_pos_iterator, check_features); word_pos_iterator += doc._positions; } else { @@ -270,9 +277,13 @@ FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dy assert(features.doc_id() == doc._docId); assert(features.elements().size() == check_features.elements().size()); assert(features.word_positions().size() == check_features.word_positions().size()); + if (_posting_params._encode_cheap_features) { + 
assert(features.field_length() == doc._collapsedDocWordFeatures._field_len); + assert(features.num_occs() == doc._collapsedDocWordFeatures._num_occs); + } ++hits; } - if (encode_features) { + if (_posting_params._encode_features) { assert(word_pos_iterator == word_pos_iterator_end); } reader.read_doc_id_and_features(features); @@ -398,7 +409,7 @@ public: uint32_t docIdLimit, const fef::TermFieldMatchDataArray &matchData); - ~FakeFilterOccZCArrayIterator(); + ~FakeFilterOccZCArrayIterator() override; void doUnpack(uint32_t docId) override; void doSeek(uint32_t docId) override; @@ -427,7 +438,7 @@ FakeFilterOccZCArrayIterator::initRange(uint32_t begin, uint32_t end) { queryeval::RankedSearchIteratorBase::initRange(begin, end); DecodeContext &d = _decodeContext; - Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false); + Zc4PostingParams params(force_skip, disable_chunking, _docIdLimit, true, false, false); Zc4PostingHeader header; header.read(d, params); assert((d.getBitOffset() & 7) == 0); @@ -462,11 +473,13 @@ FakeFilterOccZCArrayIterator::doSeek(uint32_t docId) const uint8_t *oCompr = _valI; uint32_t oDocId = getDocId(); - if (getUnpacked()) + if (getUnpacked()) { clearUnpacked(); + } while (oDocId < docId) { - if (--_residue == 0) + if (--_residue == 0) { goto atbreak; + } ZCDECODE(oCompr, oDocId += 1 +); #if DEBUG_ZCFILTEROCC_PRINTF printf("Decode docId=%d\n", @@ -499,7 +512,7 @@ SearchIterator * FakeZcFilterOcc:: createIterator(const TermFieldMatchDataArray &matchData) const { - return new FakeFilterOccZCArrayIterator(_compressed.first, 0, _docIdLimit, matchData); + return new FakeFilterOccZCArrayIterator(_compressed.first, 0, _posting_params._doc_id_limit, matchData); } template <bool doSkip> @@ -508,7 +521,7 @@ class FakeZcSkipFilterOcc : public FakeZcFilterOcc public: FakeZcSkipFilterOcc(const FakeWord &fw); - ~FakeZcSkipFilterOcc(); + ~FakeZcSkipFilterOcc() override; SearchIterator *createIterator(const TermFieldMatchDataArray &matchData) const 
override; }; @@ -523,17 +536,17 @@ initSkip(std::make_pair("ZcSkipFilterOcc", template<> FakeZcSkipFilterOcc<false>::FakeZcSkipFilterOcc(const FakeWord &fw) - : FakeZcFilterOcc(fw, true, ".zcnoskipfilterocc") + : FakeZcFilterOcc(fw, true, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, true, false, false), ".zc5noskipfilterocc") { - setup(fw, false, true); + setup(fw); } template<> FakeZcSkipFilterOcc<true>::FakeZcSkipFilterOcc(const FakeWord &fw) - : FakeZcFilterOcc(fw, true, ".zcskipfilterocc") + : FakeZcFilterOcc(fw, true, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, true, false, false), ".zc5skipfilterocc") { - setup(fw, false, true); + setup(fw); } @@ -591,7 +604,7 @@ public: uint32_t docIdLimit, const TermFieldMatchDataArray &matchData); - ~FakeFilterOccZCSkipArrayIterator(); + ~FakeFilterOccZCSkipArrayIterator() override; void doL4SkipSeek(uint32_t docId); void doL3SkipSeek(uint32_t docId); @@ -648,7 +661,7 @@ initRange(uint32_t begin, uint32_t end) { queryeval::RankedSearchIteratorBase::initRange(begin, end); DecodeContext &d = _decodeContext; - Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false); + Zc4PostingParams params(force_skip, disable_chunking, _docIdLimit, true, false, false); Zc4PostingHeader header; header.read(d, params); _lastDocId = header._last_doc_id; @@ -800,8 +813,9 @@ FakeFilterOccZCSkipArrayIterator<true>::doL3SkipSeek(uint32_t docId) if (__builtin_expect(docId > _l4SkipDocId, false)) { doL4SkipSeek(docId); - if (docId <= _l3SkipDocId) + if (docId <= _l3SkipDocId) { return; + } } do { lastL3SkipDocId = _l3SkipDocId; @@ -847,8 +861,9 @@ FakeFilterOccZCSkipArrayIterator<true>::doL2SkipSeek(uint32_t docId) if (__builtin_expect(docId > _l3SkipDocId, false)) { doL3SkipSeek(docId); - if (docId <= _l2SkipDocId) + if (docId <= _l2SkipDocId) { return; + } } do { lastL2SkipDocId = _l2SkipDocId; @@ -894,8 +909,9 @@ FakeFilterOccZCSkipArrayIterator<true>::doL1SkipSeek(uint32_t docId) uint32_t 
lastL1SkipDocId; if (__builtin_expect(docId > _l2SkipDocId, false)) { doL2SkipSeek(docId); - if (docId <= _l1SkipDocId) + if (docId <= _l1SkipDocId) { return; + } } do { lastL1SkipDocId = _l1SkipDocId; @@ -925,8 +941,9 @@ template <bool doSkip> void FakeFilterOccZCSkipArrayIterator<doSkip>::doSeek(uint32_t docId) { - if (getUnpacked()) + if (getUnpacked()) { clearUnpacked(); + } if (doSkip && docId > _l1SkipDocId) { doL1SkipSeek(docId); } @@ -1075,7 +1092,7 @@ createIterator(const TermFieldMatchDataArray &matchData) const { return new FakeFilterOccZCSkipArrayIterator<doSkip>(_compressed.first, 0, - _docIdLimit, + _posting_params._doc_id_limit, matchData); } @@ -1085,8 +1102,7 @@ class FakeEGCompr64PosOcc : public FakeZcFilterOcc { public: FakeEGCompr64PosOcc(const FakeWord &fw); - ~FakeEGCompr64PosOcc(); - void setup(const FakeWord &fw); + ~FakeEGCompr64PosOcc() override; size_t bitSize() const override; bool hasWordPositions() const override; SearchIterator *createIterator(const TermFieldMatchDataArray &matchData) const override; @@ -1095,7 +1111,7 @@ public: template <bool bigEndian> FakeEGCompr64PosOcc<bigEndian>::FakeEGCompr64PosOcc(const FakeWord &fw) - : FakeZcFilterOcc(fw, bigEndian, + : FakeZcFilterOcc(fw, bigEndian, Zc4PostingParams(disable_skip, disable_chunking, fw._docIdLimit, true, true, false), bigEndian ? 
".zcposoccbe" : ".zcposoccle") { setup(fw); @@ -1107,80 +1123,6 @@ FakeEGCompr64PosOcc<bigEndian>::~FakeEGCompr64PosOcc() { } - -template <bool bigEndian> -void -FakeEGCompr64PosOcc<bigEndian>::setup(const FakeWord &fw) -{ - uint32_t lastDocId = 0u; - - typedef FakeWord FW; - typedef FW::DocWordFeatureList DWFL; - typedef FW::DocWordPosFeatureList DWPFL; - - DWFL::const_iterator d(fw._postings.begin()); - DWFL::const_iterator de(fw._postings.end()); - DWPFL::const_iterator p(fw._wordPosFeatures.begin()); - DWPFL::const_iterator pe(fw._wordPosFeatures.end()); - DocIdAndPosOccFeatures features; - EGPosOccEncodeContext<bigEndian> e(&_fieldsParams); - ComprFileWriteContext ectx(e); - e.setWriteContext(&ectx); - ectx.allocComprBuf(64, 1); - e.afterWrite(ectx, 0, 0); - - _hitDocs = fw._postings.size(); - _docIdLimit = fw._docIdLimit; - if (_hitDocs > 0) - _lastDocId = fw._postings.back()._docId; - else - _lastDocId = 0u; - e.encodeExpGolomb(_hitDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS); - - uint32_t docIdK = e.calcDocIdK(_hitDocs, _docIdLimit); - - while (d != de) { - e.encodeExpGolomb(d->_docId - lastDocId - 1, docIdK); -#if DEBUG_ZCFILTEROCC_PRINTF - printf("Encode docId=%d (+%u + 1)\n", - d->_docId, d->_docId - lastDocId - 1); -#endif - fw.setupFeatures(*d, &*p, features); - p += d->_positions; - e.writeFeatures(features); - lastDocId = d->_docId; - ++d; - } - assert(p == pe); - - _compressedBits = e.getWriteOffset(); - - // First pad to 64 bits. - uint32_t pad = (64 - e.getWriteOffset()) & 63; - while (pad > 0) { - uint32_t now = std::min(32u, pad); - e.writeBits(0, now); - e.writeComprBufferIfNeeded(); - pad -= now; - } - - // Then write 128 more bits. 
This allows for 64-bit decoding - // with a readbits that always leaves a nonzero preRead - for (unsigned int i = 0; i < 4; i++) { - e.writeBits(0, 32); - e.writeComprBufferIfNeeded(); - } - e.writeComprBufferIfNeeded(); - e.flush(); - e.writeComprBuffer(); - - std::pair<void *, size_t> ectxData = - ectx.grabComprBuffer(_compressedMalloc); - _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first), - ectxData.second); -} - - template <bool bigEndian> size_t FakeEGCompr64PosOcc<bigEndian>::bitSize() const @@ -1203,7 +1145,7 @@ FakeEGCompr64PosOcc<bigEndian>:: createIterator(const TermFieldMatchDataArray &matchData) const { return new ZcRareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0), - _compressedBits, _docIdLimit, &_fieldsParams, matchData); + _compressedBits, _posting_params._doc_id_limit, false, &_fieldsParams, matchData); } @@ -1212,8 +1154,7 @@ class FakeEG2Compr64PosOcc : public FakeZcFilterOcc { public: FakeEG2Compr64PosOcc(const FakeWord &fw); - ~FakeEG2Compr64PosOcc(); - void setup(const FakeWord &fw); + ~FakeEG2Compr64PosOcc() override; size_t bitSize() const override; bool hasWordPositions() const override; SearchIterator *createIterator(const fef::TermFieldMatchDataArray &matchData) const override; @@ -1222,8 +1163,8 @@ public: template <bool bigEndian> FakeEG2Compr64PosOcc<bigEndian>::FakeEG2Compr64PosOcc(const FakeWord &fw) - : FakeZcFilterOcc(fw, bigEndian, - bigEndian ? ".zc2posoccbe" : ".zc2posoccle") + : FakeZcFilterOcc(fw, bigEndian, Zc4PostingParams(disable_skip, disable_chunking, fw._docIdLimit, false, true, false), + bigEndian ? 
".zc4posoccbe" : ".zc4posoccle") { setup(fw); } @@ -1236,78 +1177,6 @@ FakeEG2Compr64PosOcc<bigEndian>::~FakeEG2Compr64PosOcc() template <bool bigEndian> -void -FakeEG2Compr64PosOcc<bigEndian>::setup(const FakeWord &fw) -{ - uint32_t lastDocId = 0u; - - typedef FakeWord FW; - typedef FW::DocWordFeatureList DWFL; - typedef FW::DocWordPosFeatureList DWPFL; - - DWFL::const_iterator d(fw._postings.begin()); - DWFL::const_iterator de(fw._postings.end()); - DWPFL::const_iterator p(fw._wordPosFeatures.begin()); - DWPFL::const_iterator pe(fw._wordPosFeatures.end()); - DocIdAndPosOccFeatures features; - EG2PosOccEncodeContext<bigEndian> e(&_fieldsParams); - ComprFileWriteContext ectx(e); - e.setWriteContext(&ectx); - ectx.allocComprBuf(64, 1); - e.afterWrite(ectx, 0, 0); - - _hitDocs = fw._postings.size(); - _docIdLimit = fw._docIdLimit; - if (_hitDocs > 0) - _lastDocId = fw._postings.back()._docId; - else - _lastDocId = 0u; - e.encodeExpGolomb(_hitDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS); - - while (d != de) { - e.encodeExpGolomb(d->_docId - lastDocId - 1, - K_VALUE_ZCPOSTING_DELTA_DOCID); -#if DEBUG_ZCFILTEROCC_PRINTF - printf("Encode docId=%d (+%u + 1)\n", - d->_docId, d->_docId - lastDocId - 1); -#endif - fw.setupFeatures(*d, &*p, features); - p += d->_positions; - e.writeFeatures(features); - lastDocId = d->_docId; - ++d; - } - assert(p == pe); - - _compressedBits = e.getWriteOffset(); - - // First pad to 64 bits. - uint32_t pad = (64 - e.getWriteOffset()) & 63; - while (pad > 0) { - uint32_t now = std::min(32u, pad); - e.writeBits(0, now); - e.writeComprBufferIfNeeded(); - pad -= now; - } - - // Then write 128 more bits. 
This allows for 64-bit decoding - // with a readbits that always leaves a nonzero preRead - for (unsigned int i = 0; i < 4; i++) { - e.writeBits(0, 32); - e.writeComprBufferIfNeeded(); - } - e.writeComprBufferIfNeeded(); - e.flush(); - e.writeComprBuffer(); - - std::pair<void *, size_t> ectxData = - ectx.grabComprBuffer(_compressedMalloc); - _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first), - ectxData.second); -} - - -template <bool bigEndian> size_t FakeEG2Compr64PosOcc<bigEndian>::bitSize() const { @@ -1329,7 +1198,7 @@ FakeEG2Compr64PosOcc<bigEndian>:: createIterator(const TermFieldMatchDataArray &matchData) const { return new Zc4RareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0), - _compressedBits, _docIdLimit, &_fieldsParams, matchData); + _compressedBits, _posting_params._doc_id_limit, false, &_fieldsParams, matchData); } @@ -1339,7 +1208,7 @@ class FakeZcSkipPosOcc : public FakeZcFilterOcc search::index::PostingListCounts _counts; public: FakeZcSkipPosOcc(const FakeWord &fw); - ~FakeZcSkipPosOcc(); + ~FakeZcSkipPosOcc() override; size_t bitSize() const override; bool hasWordPositions() const override; @@ -1349,10 +1218,10 @@ public: template <bool bigEndian> FakeZcSkipPosOcc<bigEndian>::FakeZcSkipPosOcc(const FakeWord &fw) - : FakeZcFilterOcc(fw, bigEndian, + : FakeZcFilterOcc(fw, bigEndian, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, true, true, false), bigEndian ? 
".zcskipposoccbe" : ".zcskipposoccle") { - setup(fw, true, true); + setup(fw); _counts._bitLength = _compressedBits; } @@ -1385,7 +1254,7 @@ SearchIterator * FakeZcSkipPosOcc<bigEndian>:: createIterator(const TermFieldMatchDataArray &matchData) const { - return new ZcPosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _docIdLimit, + return new ZcPosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, false, static_cast<uint32_t>(-1), _counts, &_fieldsParams, @@ -1394,12 +1263,15 @@ createIterator(const TermFieldMatchDataArray &matchData) const template <bool bigEndian> -class FakeZc2SkipPosOcc : public FakeZcFilterOcc +class FakeZc4SkipPosOcc : public FakeZcFilterOcc { search::index::PostingListCounts _counts; + bool _encode_cheap_features; +protected: + FakeZc4SkipPosOcc(const FakeWord &fw, const Zc4PostingParams &posting_params, const char *name_suffix); public: - FakeZc2SkipPosOcc(const FakeWord &fw); - ~FakeZc2SkipPosOcc(); + FakeZc4SkipPosOcc(const FakeWord &fw); + ~FakeZc4SkipPosOcc() override; size_t bitSize() const override; bool hasWordPositions() const override; SearchIterator *createIterator(const TermFieldMatchDataArray &matchData) const override; @@ -1407,24 +1279,26 @@ public: template <bool bigEndian> -FakeZc2SkipPosOcc<bigEndian>::FakeZc2SkipPosOcc(const FakeWord &fw) - : FakeZcFilterOcc(fw, bigEndian, - bigEndian ? 
".zc2skipposoccbe" : ".zc2skipposoccle") +FakeZc4SkipPosOcc<bigEndian>::FakeZc4SkipPosOcc(const FakeWord &fw, const Zc4PostingParams &posting_params, const char *name_suffix) + : FakeZcFilterOcc(fw, bigEndian, posting_params, name_suffix) { - setup(fw, true, false); + setup(fw); _counts._bitLength = _compressedBits; } - template <bool bigEndian> -FakeZc2SkipPosOcc<bigEndian>::~FakeZc2SkipPosOcc() +FakeZc4SkipPosOcc<bigEndian>::FakeZc4SkipPosOcc(const FakeWord &fw) + : FakeZc4SkipPosOcc<bigEndian>(fw, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, false, true, false), + (bigEndian ? ".zc4skipposoccbe" : ".zc4skipposoccle")) { } +template <bool bigEndian> +FakeZc4SkipPosOcc<bigEndian>::~FakeZc4SkipPosOcc() = default; template <bool bigEndian> size_t -FakeZc2SkipPosOcc<bigEndian>::bitSize() const +FakeZc4SkipPosOcc<bigEndian>::bitSize() const { return _compressedBits - _l1SkipSize - _l2SkipSize - _l3SkipSize - _l4SkipSize; @@ -1433,7 +1307,7 @@ FakeZc2SkipPosOcc<bigEndian>::bitSize() const template <bool bigEndian> bool -FakeZc2SkipPosOcc<bigEndian>::hasWordPositions() const +FakeZc4SkipPosOcc<bigEndian>::hasWordPositions() const { return true; } @@ -1441,13 +1315,63 @@ FakeZc2SkipPosOcc<bigEndian>::hasWordPositions() const template <bool bigEndian> SearchIterator * -FakeZc2SkipPosOcc<bigEndian>:: +FakeZc4SkipPosOcc<bigEndian>:: createIterator(const TermFieldMatchDataArray &matchData) const { - return new Zc4PosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _docIdLimit, - static_cast<uint32_t>(-1), _counts, &_fieldsParams, matchData); + if (_hitDocs >= _posting_params._min_skip_docs) { + if (_posting_params._dynamic_k) { + return new ZcPosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, + static_cast<uint32_t>(-1), + _counts, + &_fieldsParams, + matchData); + } else { + return new 
Zc4PosOccIterator<bigEndian>(Position(_compressed.first, 0), _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, + static_cast<uint32_t>(-1), _counts, &_fieldsParams, matchData); + } + } else { + if (_posting_params._dynamic_k) { + return new ZcRareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0), + _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, &_fieldsParams, matchData); + } else { + return new Zc4RareWordPosOccIterator<bigEndian>(Position(_compressed.first, 0), + _compressedBits, _posting_params._doc_id_limit, _posting_params._encode_cheap_features, &_fieldsParams, matchData); + } + } } +template <bool bigEndian> +class FakeZc4SkipPosOccCf : public FakeZc4SkipPosOcc<bigEndian> +{ +public: + FakeZc4SkipPosOccCf(const FakeWord &fw) + : FakeZc4SkipPosOcc<bigEndian>(fw, Zc4PostingParams(force_skip, disable_chunking, fw._docIdLimit, false, true, true), + (bigEndian ? ".zc4skipposoccbe.cf" : ".zc4skipposoccle.cf")) + { + } +}; + +template <bool bigEndian> +class FakeZc4NoSkipPosOccCf : public FakeZc4SkipPosOcc<bigEndian> +{ +public: + FakeZc4NoSkipPosOccCf(const FakeWord &fw) + : FakeZc4SkipPosOcc<bigEndian>(fw, Zc4PostingParams(disable_skip, disable_chunking, fw._docIdLimit, false, true, true), + (bigEndian ? ".zc4noskipposoccbe.cf" : "zc4noskipposoccle.cf")) + { + } +}; + +template <bool bigEndian> +class FakeZc5NoSkipPosOccCf : public FakeZc4SkipPosOcc<bigEndian> +{ +public: + FakeZc5NoSkipPosOccCf(const FakeWord &fw) + : FakeZc4SkipPosOcc<bigEndian>(fw, Zc4PostingParams(disable_skip, disable_chunking, fw._docIdLimit, true, true, true), + (bigEndian ? 
".zc5noskipposoccbe.cf" : "zc5noskipposoccle.cf")) + { + } +}; static FPFactoryInit initPosbe(std::make_pair("EGCompr64PosOccBE", @@ -1479,13 +1403,42 @@ initSkipPosle(std::make_pair("ZcSkipPosOccLE", static FPFactoryInit -initSkipPos0be(std::make_pair("Zc2SkipPosOccBE", - makeFPFactory<FPFactoryT<FakeZc2SkipPosOcc<true> > >)); +initSkipPos0be(std::make_pair("Zc4SkipPosOccBE", + makeFPFactory<FPFactoryT<FakeZc4SkipPosOcc<true> > >)); + + +static FPFactoryInit +initSkipPos0le(std::make_pair("Zc4SkipPosOccLE", + makeFPFactory<FPFactoryT<FakeZc4SkipPosOcc<false> > >)); + + +static FPFactoryInit +initSkipPos0becf(std::make_pair("Zc4SkipPosOccBE.cf", + makeFPFactory<FPFactoryT<FakeZc4SkipPosOccCf<true> > >)); + + +static FPFactoryInit +initSkipPos0lecf(std::make_pair("Zc4SkipPosOccLE.cf", + makeFPFactory<FPFactoryT<FakeZc4SkipPosOccCf<false> > >)); + +static FPFactoryInit +initNoSkipPos0becf(std::make_pair("Zc4NoSkipPosOccBE.cf", + makeFPFactory<FPFactoryT<FakeZc4NoSkipPosOccCf<true> > >)); + + +static FPFactoryInit +initNoSkipPos0lecf(std::make_pair("Zc4NoSkipPosOccLE.cf", + makeFPFactory<FPFactoryT<FakeZc4NoSkipPosOccCf<false> > >)); + + +static FPFactoryInit +initNoSkipPosbecf(std::make_pair("Zc5NoSkipPosOccBE.cf", + makeFPFactory<FPFactoryT<FakeZc5NoSkipPosOccCf<true> > >)); static FPFactoryInit -initSkipPos0le(std::make_pair("Zc2SkipPosOccLE", - makeFPFactory<FPFactoryT<FakeZc2SkipPosOcc<false> > >)); +initNoSkipPoslecf(std::make_pair("Zc5NoSkipPosOccLE.cf", + makeFPFactory<FPFactoryT<FakeZc5NoSkipPosOccCf<false> > >)); } // namespace fakedata diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h index 36738a0f5a8..3d1673edec7 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h @@ -5,6 +5,7 @@ #include "fakeposting.h" #include <vespa/searchlib/bitcompression/compression.h> #include 
<vespa/searchlib/bitcompression/posocccompression.h> +#include <vespa/searchlib/diskindex/zc4_posting_params.h> namespace search { @@ -21,7 +22,6 @@ protected: size_t _l2SkipSize; size_t _l3SkipSize; size_t _l4SkipSize; - unsigned int _docIdLimit; unsigned int _hitDocs; uint32_t _lastDocId; @@ -31,23 +31,27 @@ protected: uint64_t _featuresSize; const search::bitcompression::PosOccFieldsParams &_fieldsParams; bool _bigEndian; + diskindex::Zc4PostingParams _posting_params; protected: - void setup(const FakeWord &fw, bool doFeatures, bool dynamicK); + void setup(const FakeWord &fw); template <bool bigEndian> - void setupT(const FakeWord &fw, bool doFeatures, bool dynamicK); + void setupT(const FakeWord &fw); template <bool bigEndian> - void read_header(bool do_features, bool dynamic_k, uint32_t min_skip_docs, uint32_t min_cunk_docs); + void read_header(); - void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const; + void validate_read(const FakeWord &fw) const; template <bool bigEndian> - void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const; + void validate_read(const FakeWord &fw) const; public: FakeZcFilterOcc(const FakeWord &fw); - FakeZcFilterOcc(const FakeWord &fw, bool bigEndian, const char *nameSuffix); - ~FakeZcFilterOcc(); + FakeZcFilterOcc(const FakeWord &fw, + bool bigEndian, + const diskindex::Zc4PostingParams &posting_params, + const char *nameSuffix); + ~FakeZcFilterOcc() override; static void forceLink(); |