diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2019-04-11 18:30:51 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-04-11 18:30:51 +0200 |
commit | 99c88e4846b752d80ac3590c0ab11adc3d84dc9c (patch) | |
tree | aff6d10d2e45111987a72c479c824a30d508af72 /searchlib | |
parent | 406df9cbda7f97d0caccdd03a3e1340688d47cbf (diff) | |
parent | 55e53f26ea1379b5ce87685e7b8c04f370b5fca0 (diff) |
Merge pull request #9087 from vespa-engine/toregge/factor-out-zc4-posting-writer
Factor out Zc4PostingWriter from Zc4PostingSeqWrite.
Diffstat (limited to 'searchlib')
10 files changed, 728 insertions, 881 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt index 3619affb54e..b21b799e693 100644 --- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt @@ -18,6 +18,8 @@ vespa_add_library(searchlib_diskindex OBJECT pagedict4file.cpp pagedict4randread.cpp wordnummapper.cpp + zc4_posting_writer.cpp + zc4_posting_writer_base.cpp zcbuf.cpp zcposocc.cpp zcposocciterators.cpp diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp new file mode 100644 index 00000000000..0eb59a383a5 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.cpp @@ -0,0 +1,270 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "zc4_posting_writer.h" +#include <vespa/searchlib/index/docidandfeatures.h> +#include <vespa/searchlib/index/postinglistcounts.h> + +using search::index::DocIdAndFeatures; +using search::index::PostingListCounts; +using search::index::PostingListParams; + +namespace search::diskindex +{ + +template <bool bigEndian> +Zc4PostingWriter<bigEndian>::Zc4PostingWriter(PostingListCounts &counts) + : Zc4PostingWriterBase(counts), + _encode_context(), + _encode_features(nullptr) +{ + _encode_context.setWriteContext(&_writeContext); + _writeContext.setEncodeContext(&_encode_context); +} + +template <bool bigEndian> +Zc4PostingWriter<bigEndian>::~Zc4PostingWriter() +{ +} + +template <bool bigEndian> +void +Zc4PostingWriter<bigEndian>::reset_chunk() +{ + _docIds.clear(); + if (_encode_features != nullptr) { + _encode_features->setupWrite(_featureWriteContext); + _featureOffset = 0; + } +} + +template <bool bigEndian> +void +Zc4PostingWriter<bigEndian>::flush_word_with_skip(bool hasMore) +{ + assert(_docIds.size() >= _minSkipDocs || !_counts._segments.empty()); + + if 
(_encode_features != nullptr) { + _encode_features->flush(); + } + EncodeContext &e = _encode_context; + + uint32_t numDocs = _docIds.size(); + + e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS); + if (numDocs >= _minChunkDocs) { + e.writeBits((hasMore ? 1 : 0), 1); + } + + calc_skip_info(_encode_features != nullptr); + + uint32_t docIdsSize = _zcDocIds.size(); + uint32_t l1SkipSize = _l1Skip.size(); + uint32_t l2SkipSize = _l2Skip.size(); + uint32_t l3SkipSize = _l3Skip.size(); + uint32_t l4SkipSize = _l4Skip.size(); + + e.encodeExpGolomb(docIdsSize - 1, K_VALUE_ZCPOSTING_DOCIDSSIZE); + e.encodeExpGolomb(l1SkipSize, K_VALUE_ZCPOSTING_L1SKIPSIZE); + if (l1SkipSize != 0) { + e.encodeExpGolomb(l2SkipSize, K_VALUE_ZCPOSTING_L2SKIPSIZE); + if (l2SkipSize != 0) { + e.encodeExpGolomb(l3SkipSize, K_VALUE_ZCPOSTING_L3SKIPSIZE); + if (l3SkipSize != 0) { + e.encodeExpGolomb(l4SkipSize, K_VALUE_ZCPOSTING_L4SKIPSIZE); + } + } + } + if (_encode_features != nullptr) { + e.encodeExpGolomb(_featureOffset, K_VALUE_ZCPOSTING_FEATURESSIZE); + } + + // Encode last document id in chunk or word. + if (_dynamicK) { + uint32_t docIdK = e.calcDocIdK((_counts._segments.empty() && + !hasMore) ? 
+ numDocs : 1, + _docIdLimit); + e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first, + docIdK); + } else { + e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first, + K_VALUE_ZCPOSTING_LASTDOCID); + } + + e.smallAlign(8); // Byte align + + uint8_t *docIds = _zcDocIds._mallocStart; + e.writeBits(reinterpret_cast<const uint64_t *>(docIds), + 0, + docIdsSize * 8); + if (l1SkipSize > 0) { + uint8_t *l1Skip = _l1Skip._mallocStart; + e.writeBits(reinterpret_cast<const uint64_t *>(l1Skip), + 0, + l1SkipSize * 8); + } + if (l2SkipSize > 0) { + uint8_t *l2Skip = _l2Skip._mallocStart; + e.writeBits(reinterpret_cast<const uint64_t *>(l2Skip), + 0, + l2SkipSize * 8); + } + if (l3SkipSize > 0) { + uint8_t *l3Skip = _l3Skip._mallocStart; + e.writeBits(reinterpret_cast<const uint64_t *>(l3Skip), + 0, + l3SkipSize * 8); + } + if (l4SkipSize > 0) { + uint8_t *l4Skip = _l4Skip._mallocStart; + e.writeBits(reinterpret_cast<const uint64_t *>(l4Skip), + 0, + l4SkipSize * 8); + } + + // Write features + e.writeBits(static_cast<const uint64_t *>(_featureWriteContext._comprBuf), + 0, + _featureOffset); + + _counts._numDocs += numDocs; + if (hasMore || !_counts._segments.empty()) { + uint64_t writePos = e.getWriteOffset(); + PostingListCounts::Segment seg; + seg._bitLength = writePos - (_writePos + _counts._bitLength); + seg._numDocs = numDocs; + seg._lastDoc = _docIds.back().first; + _counts._segments.push_back(seg); + _counts._bitLength += seg._bitLength; + } + // reset tables in preparation for next word or next chunk + clear_skip_info(); + reset_chunk(); +} + +template <bool bigEndian> +void +Zc4PostingWriter<bigEndian>::write_docid_and_features(const DocIdAndFeatures &features) +{ + if (__builtin_expect(_docIds.size() >= _minChunkDocs, false)) { + flush_word_with_skip(true); + } + if (_encode_features != nullptr) { + _encode_features->writeFeatures(features); + uint64_t writeOffset = _encode_features->getWriteOffset(); + uint64_t featureSize = writeOffset - _featureOffset; + 
assert(static_cast<uint32_t>(featureSize) == featureSize); + _docIds.push_back(std::make_pair(features._docId, + static_cast<uint32_t>(featureSize))); + _featureOffset = writeOffset; + } else { + _docIds.push_back(std::make_pair(features._docId, uint32_t(0))); + } +} + +template <bool bigEndian> +void +Zc4PostingWriter<bigEndian>::flush_word_no_skip() +{ + // Too few document ids for skip info. + assert(_docIds.size() < _minSkipDocs && _counts._segments.empty()); + + if (_encode_features != nullptr) { + _encode_features->flush(); + } + EncodeContext &e = _encode_context; + uint32_t numDocs = _docIds.size(); + + e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS); + + uint32_t docIdK = _dynamicK ? e.calcDocIdK(numDocs, _docIdLimit) : K_VALUE_ZCPOSTING_DELTA_DOCID; + + uint32_t baseDocId = 1; + const uint64_t *features = + static_cast<const uint64_t *>(_featureWriteContext._comprBuf); + uint64_t featureOffset = 0; + + std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin(); + std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end(); + + for (; dit != dite; ++dit) { + uint32_t docId = dit->first; + uint32_t featureSize = dit->second; + e.encodeExpGolomb(docId - baseDocId, docIdK); + baseDocId = docId + 1; + if (featureSize != 0) { + e.writeBits(features + (featureOffset >> 6), + featureOffset & 63, + featureSize); + featureOffset += featureSize; + } + } + _counts._numDocs += numDocs; + reset_chunk(); +} + +template <bool bigEndian> +void +Zc4PostingWriter<bigEndian>::flush_word() +{ + if (__builtin_expect(_docIds.size() >= _minSkipDocs || + !_counts._segments.empty(), false)) { + // Use skip information if enough documents or chunking has happened + flush_word_with_skip(false); + _numWords++; + } else if (_docIds.size() > 0) { + flush_word_no_skip(); + _numWords++; + } + + EncodeContext &e = _encode_context; + uint64_t writePos = e.getWriteOffset(); + + _counts._bitLength = writePos - _writePos; + _writePos = writePos; +} + +template 
<bool bigEndian> +void +Zc4PostingWriter<bigEndian>::set_encode_features(EncodeContext *encode_features) +{ + _encode_features = encode_features; + if (_encode_features != nullptr) { + _encode_features->setWriteContext(&_featureWriteContext); + _encode_features->setupWrite(_featureWriteContext); + } + _featureWriteContext.setEncodeContext(_encode_features); + _featureOffset = 0; +} + +template <bool bigEndian> +void +Zc4PostingWriter<bigEndian>::on_open() +{ + _numWords = 0; + _writePos = _encode_context.getWriteOffset(); // Position after file header +} + +template <bool bigEndian> +void +Zc4PostingWriter<bigEndian>::on_close() +{ + // Write some pad bits to avoid decompression readahead going past + // memory mapped file during search and into SIGSEGV territory. + + // First pad to 64 bits alignment. + _encode_context.smallAlign(64); + _encode_context.writeComprBufferIfNeeded(); + + // Then write 128 more bits. This allows for 64-bit decoding + // with a readbits that always leaves a nonzero preRead + _encode_context.padBits(128); + _encode_context.alignDirectIO(); + _encode_context.flush(); + _encode_context.writeComprBuffer(); // Also flushes slack +} + +template class Zc4PostingWriter<false>; +template class Zc4PostingWriter<true>; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h new file mode 100644 index 00000000000..8dc5e249d52 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer.h @@ -0,0 +1,53 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "zc4_posting_writer_base.h" + +namespace search::index { class DocIdAndFeatures; } + +namespace search::diskindex +{ + +/* + * Class used to write posting lists of type "Zc.4" and "Zc.5" (dynamic k). + * + * Common words have docid deltas and skip info separate from + * features. 
+ * + * Rare words do not have skip info, and docid deltas and features are + * interleaved. + */ +template <bool bigEndian> +class Zc4PostingWriter : public Zc4PostingWriterBase +{ + using EncodeContext = bitcompression::FeatureEncodeContext<bigEndian>; + + EncodeContext _encode_context; + // Buffer up features in memory + EncodeContext *_encode_features; +public: + Zc4PostingWriter(const Zc4PostingWriter &) = delete; + Zc4PostingWriter(Zc4PostingWriter &&) = delete; + Zc4PostingWriter &operator=(const Zc4PostingWriter &) = delete; + Zc4PostingWriter &operator=(Zc4PostingWriter &&) = delete; + Zc4PostingWriter(index::PostingListCounts &counts); + ~Zc4PostingWriter(); + + void reset_chunk(); + void flush_word_with_skip(bool hasMore); + void flush_word_no_skip(); + void flush_word(); + void write_docid_and_features(const index::DocIdAndFeatures &features); + void set_encode_features(EncodeContext *encode_features); + void on_open(); + void on_close(); + + EncodeContext &get_encode_features() { return *_encode_features; } + EncodeContext &get_encode_context() { return _encode_context; } +}; + +extern template class Zc4PostingWriter<false>; +extern template class Zc4PostingWriter<true>; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp new file mode 100644 index 00000000000..485610c2ebd --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.cpp @@ -0,0 +1,222 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "zc4_posting_writer_base.h" +#include <vespa/searchlib/index/postinglistcounts.h> + +using search::index::PostingListCounts; +using search::index::PostingListParams; + +namespace search::diskindex +{ + +Zc4PostingWriterBase::Zc4PostingWriterBase(PostingListCounts &counts) + : _minChunkDocs(1 << 30), + _minSkipDocs(64), + _docIdLimit(10000000), + _docIds(), + _featureOffset(0), + _writePos(0), + _dynamicK(false), + _zcDocIds(), + _l1Skip(), + _l2Skip(), + _l3Skip(), + _l4Skip(), + _numWords(0), + _counts(counts), + _writeContext(sizeof(uint64_t)), + _featureWriteContext(sizeof(uint64_t)) +{ + _featureWriteContext.allocComprBuf(64, 1); + // Ensure that some space is initially available in encoding buffers + _zcDocIds.maybeExpand(); + _l1Skip.maybeExpand(); + _l2Skip.maybeExpand(); + _l3Skip.maybeExpand(); + _l4Skip.maybeExpand(); +} + +Zc4PostingWriterBase::~Zc4PostingWriterBase() +{ +} + +#define L1SKIPSTRIDE 16 +#define L2SKIPSTRIDE 8 +#define L3SKIPSTRIDE 8 +#define L4SKIPSTRIDE 8 + +void +Zc4PostingWriterBase::calc_skip_info(bool encodeFeatures) +{ + uint32_t lastDocId = 0u; + uint32_t lastL1SkipDocId = 0u; + uint32_t lastL1SkipDocIdPos = 0; + uint32_t lastL1SkipFeaturePos = 0; + uint32_t lastL2SkipDocId = 0u; + uint32_t lastL2SkipDocIdPos = 0; + uint32_t lastL2SkipFeaturePos = 0; + uint32_t lastL2SkipL1SkipPos = 0; + uint32_t lastL3SkipDocId = 0u; + uint32_t lastL3SkipDocIdPos = 0; + uint32_t lastL3SkipFeaturePos = 0; + uint32_t lastL3SkipL1SkipPos = 0; + uint32_t lastL3SkipL2SkipPos = 0; + uint32_t lastL4SkipDocId = 0u; + uint32_t lastL4SkipDocIdPos = 0; + uint32_t lastL4SkipFeaturePos = 0; + uint32_t lastL4SkipL1SkipPos = 0; + uint32_t lastL4SkipL2SkipPos = 0; + uint32_t lastL4SkipL3SkipPos = 0; + unsigned int l1SkipCnt = 0; + unsigned int l2SkipCnt = 0; + unsigned int l3SkipCnt = 0; + unsigned int l4SkipCnt = 0; + uint64_t featurePos = 0; + + std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin(); + 
std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end(); + + if (!_counts._segments.empty()) { + lastDocId = _counts._segments.back()._lastDoc; + lastL1SkipDocId = lastDocId; + lastL2SkipDocId = lastDocId; + lastL3SkipDocId = lastDocId; + lastL4SkipDocId = lastDocId; + } + + for (; dit != dite; ++dit) { + if (l1SkipCnt >= L1SKIPSTRIDE) { + // L1 docid delta + uint32_t docIdDelta = lastDocId - lastL1SkipDocId; + assert(static_cast<int32_t>(docIdDelta) > 0); + _l1Skip.encode(docIdDelta - 1); + lastL1SkipDocId = lastDocId; + // L1 docid pos + uint64_t docIdPos = _zcDocIds.size(); + _l1Skip.encode(docIdPos - lastL1SkipDocIdPos - 1); + lastL1SkipDocIdPos = docIdPos; + if (encodeFeatures) { + // L1 features pos + _l1Skip.encode(featurePos - lastL1SkipFeaturePos - 1); + lastL1SkipFeaturePos = featurePos; + } + l1SkipCnt = 0; + ++l2SkipCnt; + if (l2SkipCnt >= L2SKIPSTRIDE) { + // L2 docid delta + docIdDelta = lastDocId - lastL2SkipDocId; + assert(static_cast<int32_t>(docIdDelta) > 0); + _l2Skip.encode(docIdDelta - 1); + lastL2SkipDocId = lastDocId; + // L2 docid pos + docIdPos = _zcDocIds.size(); + _l2Skip.encode(docIdPos - lastL2SkipDocIdPos - 1); + lastL2SkipDocIdPos = docIdPos; + if (encodeFeatures) { + // L2 features pos + _l2Skip.encode(featurePos - lastL2SkipFeaturePos - 1); + lastL2SkipFeaturePos = featurePos; + } + // L2 L1Skip pos + uint64_t l1SkipPos = _l1Skip.size(); + _l2Skip.encode(l1SkipPos - lastL2SkipL1SkipPos - 1); + lastL2SkipL1SkipPos = l1SkipPos; + l2SkipCnt = 0; + ++l3SkipCnt; + if (l3SkipCnt >= L3SKIPSTRIDE) { + // L3 docid delta + docIdDelta = lastDocId - lastL3SkipDocId; + assert(static_cast<int32_t>(docIdDelta) > 0); + _l3Skip.encode(docIdDelta - 1); + lastL3SkipDocId = lastDocId; + // L3 docid pos + docIdPos = _zcDocIds.size(); + _l3Skip.encode(docIdPos - lastL3SkipDocIdPos - 1); + lastL3SkipDocIdPos = docIdPos; + if (encodeFeatures) { + // L3 features pos + _l3Skip.encode(featurePos - lastL3SkipFeaturePos - 1); + 
lastL3SkipFeaturePos = featurePos; + } + // L3 L1Skip pos + l1SkipPos = _l1Skip.size(); + _l3Skip.encode(l1SkipPos - lastL3SkipL1SkipPos - 1); + lastL3SkipL1SkipPos = l1SkipPos; + // L3 L2Skip pos + uint64_t l2SkipPos = _l2Skip.size(); + _l3Skip.encode(l2SkipPos - lastL3SkipL2SkipPos - 1); + lastL3SkipL2SkipPos = l2SkipPos; + l3SkipCnt = 0; + ++l4SkipCnt; + if (l4SkipCnt >= L4SKIPSTRIDE) { + // L4 docid delta + docIdDelta = lastDocId - lastL4SkipDocId; + assert(static_cast<int32_t>(docIdDelta) > 0); + _l4Skip.encode(docIdDelta - 1); + lastL4SkipDocId = lastDocId; + // L4 docid pos + docIdPos = _zcDocIds.size(); + _l4Skip.encode(docIdPos - lastL4SkipDocIdPos - 1); + lastL4SkipDocIdPos = docIdPos; + if (encodeFeatures) { + // L4 features pos + _l4Skip.encode(featurePos - lastL4SkipFeaturePos - 1); + lastL4SkipFeaturePos = featurePos; + } + // L4 L1Skip pos + l1SkipPos = _l1Skip.size(); + _l4Skip.encode(l1SkipPos - lastL4SkipL1SkipPos - 1); + lastL4SkipL1SkipPos = l1SkipPos; + // L4 L2Skip pos + l2SkipPos = _l2Skip.size(); + _l4Skip.encode(l2SkipPos - lastL4SkipL2SkipPos - 1); + lastL4SkipL2SkipPos = l2SkipPos; + // L4 L3Skip pos + uint64_t l3SkipPos = _l3Skip.size(); + _l4Skip.encode(l3SkipPos - lastL4SkipL3SkipPos - 1); + lastL4SkipL3SkipPos = l3SkipPos; + l4SkipCnt = 0; + } + } + } + } + uint32_t docId = dit->first; + featurePos += dit->second; + _zcDocIds.encode(docId - lastDocId - 1); + lastDocId = docId; + ++l1SkipCnt; + } + // Extra partial entries for skip tables to simplify iterator during search + if (_l1Skip.size() > 0) { + _l1Skip.encode(lastDocId - lastL1SkipDocId - 1); + } + if (_l2Skip.size() > 0) { + _l2Skip.encode(lastDocId - lastL2SkipDocId - 1); + } + if (_l3Skip.size() > 0) { + _l3Skip.encode(lastDocId - lastL3SkipDocId - 1); + } + if (_l4Skip.size() > 0) { + _l4Skip.encode(lastDocId - lastL4SkipDocId - 1); + } +} + +void +Zc4PostingWriterBase::clear_skip_info() +{ + _zcDocIds.clear(); + _l1Skip.clear(); + _l2Skip.clear(); + _l3Skip.clear(); + 
_l4Skip.clear(); +} + +void +Zc4PostingWriterBase::set_posting_list_params(const PostingListParams &params) +{ + params.get("docIdLimit", _docIdLimit); + params.get("minChunkDocs", _minChunkDocs); + params.get("minSkipDocs", _minSkipDocs); +} + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h new file mode 100644 index 00000000000..ba781c11564 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_writer_base.h @@ -0,0 +1,66 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "zcbuf.h" +#include <vespa/searchlib/bitcompression/compression.h> +#include <vector> + +namespace search::index { +class PostingListCounts; +class PostingListParams; +} + +namespace search::diskindex +{ + +/* + * Base class for writing posting lists that might have basic skip info. + */ +class Zc4PostingWriterBase +{ +protected: + uint32_t _minChunkDocs; // # of documents needed for chunking + uint32_t _minSkipDocs; // # of documents needed for skipping + uint32_t _docIdLimit; // Limit for document ids (docId < docIdLimit) + + // Unpacked document ids for word and feature sizes + using DocIdAndFeatureSize = std::pair<uint32_t, uint32_t>; + std::vector<DocIdAndFeatureSize> _docIds; + + uint64_t _featureOffset; // Bit offset of next feature + uint64_t _writePos; // Bit position for start of current word + bool _dynamicK; // Calculate EG compression parameters ?
+ ZcBuf _zcDocIds; // Document id deltas + ZcBuf _l1Skip; // L1 skip info + ZcBuf _l2Skip; // L2 skip info + ZcBuf _l3Skip; // L3 skip info + ZcBuf _l4Skip; // L4 skip info + + uint64_t _numWords; // Number of words in file + index::PostingListCounts &_counts; + search::ComprFileWriteContext _writeContext; + search::ComprFileWriteContext _featureWriteContext; + + Zc4PostingWriterBase(const Zc4PostingWriterBase &) = delete; + Zc4PostingWriterBase(Zc4PostingWriterBase &&) = delete; + Zc4PostingWriterBase &operator=(const Zc4PostingWriterBase &) = delete; + Zc4PostingWriterBase &operator=(Zc4PostingWriterBase &&) = delete; + Zc4PostingWriterBase(index::PostingListCounts &counts); + ~Zc4PostingWriterBase(); + void calc_skip_info(bool encodeFeatures); + void clear_skip_info(); + +public: + ComprFileWriteContext &get_write_context() { return _writeContext; } + ComprFileWriteContext &get_feature_write_context() { return _featureWriteContext; } + uint32_t get_min_chunk_docs() const { return _minChunkDocs; } + uint32_t get_min_skip_docs() const { return _minSkipDocs; } + uint32_t get_docid_limit() const { return _docIdLimit; } + uint64_t get_num_words() const { return _numWords; } + bool get_dynamic_k() const { return _dynamicK; } + void set_dynamic_k(bool dynamicK) { _dynamicK = dynamicK; } + void set_posting_list_params(const index::PostingListParams &params); +}; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp index df06432816f..10c08af92cb 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp @@ -63,9 +63,7 @@ Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema, _fieldsParams(), _realEncodeFeatures(&_fieldsParams) { - _encodeFeatures = &_realEncodeFeatures; - _encodeFeatures->setWriteContext(&_featureWriteContext); - _featureWriteContext.setEncodeContext(_encodeFeatures); +
_writer.set_encode_features(&_realEncodeFeatures); _fieldsParams.setSchemaParams(schema, indexId); } @@ -118,9 +116,7 @@ ZcPosOccSeqWrite::ZcPosOccSeqWrite(const Schema &schema, _fieldsParams(), _realEncodeFeatures(&_fieldsParams) { - _encodeFeatures = &_realEncodeFeatures; - _encodeFeatures->setWriteContext(&_featureWriteContext); - _featureWriteContext.setEncodeContext(_encodeFeatures); + _writer.set_encode_features(&_realEncodeFeatures); _fieldsParams.setSchemaParams(schema, indexId); } diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp index d51a592bf2b..e850f169adc 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.cpp @@ -607,36 +607,16 @@ Zc4PostingSeqRead::setPostingOffset(uint64_t offset, Zc4PostingSeqWrite:: Zc4PostingSeqWrite(PostingListCountFileSeqWrite *countFile) : PostingListFileSeqWrite(), - _encodeContext(), - _writeContext(_encodeContext), + _writer(_counts), _file(), - _minChunkDocs(1 << 30), - _minSkipDocs(64), - _docIdLimit(10000000), - _docIds(), - _encodeFeatures(nullptr), - _featureOffset(0), - _featureWriteContext(sizeof(uint64_t)), - _writePos(0), - _dynamicK(false), - _zcDocIds(), - _l1Skip(), - _l2Skip(), - _l3Skip(), - _l4Skip(), - _numWords(0), _fileBitSize(0), _countFile(countFile) { - _encodeContext.setWriteContext(&_writeContext); - if (_countFile != nullptr) { PostingListParams params; _countFile->getParams(params); - params.get("docIdLimit", _docIdLimit); - params.get("minChunkDocs", _minChunkDocs); + _writer.set_posting_list_params(params); } - _featureWriteContext.allocComprBuf(64, 1); } @@ -646,110 +626,27 @@ Zc4PostingSeqWrite::~Zc4PostingSeqWrite() void -Zc4PostingSeqWrite:: -writeDocIdAndFeatures(const DocIdAndFeatures &features) +Zc4PostingSeqWrite::writeDocIdAndFeatures(const DocIdAndFeatures &features) { - if (__builtin_expect(_docIds.size() >= _minChunkDocs, false)) - 
flushChunk(); - _encodeFeatures->writeFeatures(features); - uint64_t writeOffset = _encodeFeatures->getWriteOffset(); - uint64_t featureSize = writeOffset - _featureOffset; - assert(static_cast<uint32_t>(featureSize) == featureSize); - _docIds.push_back(std::make_pair(features._docId, - static_cast<uint32_t>(featureSize))); - _featureOffset = writeOffset; + _writer.write_docid_and_features(features); } void Zc4PostingSeqWrite::flushWord() { - if (__builtin_expect(_docIds.size() >= _minSkipDocs || - !_counts._segments.empty(), false)) { - // Use skip information if enough documents of chunking has happened - flushWordWithSkip(false); - _numWords++; - } else if (_docIds.size() > 0) { - flushWordNoSkip(); - _numWords++; - } - - EncodeContext &e = _encodeContext; - uint64_t writePos = e.getWriteOffset(); - - _counts._bitLength = writePos - _writePos; - _writePos = writePos; -} - - -uint32_t -Zc4PostingSeqWrite::readHeader(const vespalib::string &name) -{ - EncodeContext &f = *_encodeFeatures; - - FeatureDecodeContextBE d; - ComprFileReadContext drc(d); - FastOS_File file; - const vespalib::string &myId = _dynamicK ? 
myId5 : myId4; - - d.setReadContext(&drc); - bool res = file.OpenReadOnly(name.c_str()); - if (!res) { - LOG(error, "Could not open %s for reading file header: %s", - name.c_str(), getLastErrorString().c_str()); - LOG_ABORT("should not be reached"); - } - - drc.setFile(&file); - drc.setFileSize(file.GetSize()); - drc.allocComprBuf(512, 32768u); - d.emptyBuffer(0); - drc.readComprBuffer(); - - vespalib::FileHeader header; - d.readHeader(header, file.getSize()); - uint32_t headerLen = header.getSize(); - assert(header.hasTag("frozen")); - assert(header.hasTag("fileBitSize")); - assert(header.hasTag("format.0")); - assert(header.hasTag("format.1")); - assert(!header.hasTag("format.2")); - assert(header.hasTag("numWords")); - assert(header.hasTag("minChunkDocs")); - assert(header.hasTag("docIdLimit")); - assert(header.hasTag("minSkipDocs")); - assert(header.hasTag("endian")); - bool headerCompleted = header.getTag("frozen").asInteger() != 0; - uint64_t headerFileBitSize = header.getTag("fileBitSize").asInteger(); - headerLen += (-headerLen & 7); - assert(!headerCompleted || headerFileBitSize >= headerLen * 8); - (void) headerCompleted; - (void) headerFileBitSize; - assert(header.getTag("format.0").asString() == myId); - (void) myId; - assert(header.getTag("format.1").asString() == f.getIdentifier()); - _minChunkDocs = header.getTag("minChunkDocs").asInteger(); - _docIdLimit = header.getTag("docIdLimit").asInteger(); - _minSkipDocs = header.getTag("minSkipDocs").asInteger(); - assert(header.getTag("endian").asString() == "big"); - // Read feature decoding specific subheader using helper decode context - f.readHeader(header, "features."); - // Align on 64-bit unit - d.smallAlign(64); - assert(d.getReadOffset() == headerLen * 8); - file.Close(); - return headerLen; + _writer.flush_word(); } void Zc4PostingSeqWrite::makeHeader(const FileHeaderContext &fileHeaderContext) { - EncodeContext &f = *_encodeFeatures; - EncodeContext &e = _encodeContext; - ComprFileWriteContext 
&wce = _writeContext; + EncodeContext &f = _writer.get_encode_features(); + EncodeContext &e = _writer.get_encode_context(); + ComprFileWriteContext &wce = _writer.get_write_context(); - const vespalib::string &myId = _dynamicK ? myId5 : myId4; + const vespalib::string &myId = _writer.get_dynamic_k() ? myId5 : myId4; vespalib::FileHeader header; typedef vespalib::GenericHeader::Tag Tag; @@ -759,9 +656,9 @@ Zc4PostingSeqWrite::makeHeader(const FileHeaderContext &fileHeaderContext) header.putTag(Tag("format.0", myId)); header.putTag(Tag("format.1", f.getIdentifier())); header.putTag(Tag("numWords", 0)); - header.putTag(Tag("minChunkDocs", _minChunkDocs)); - header.putTag(Tag("docIdLimit", _docIdLimit)); - header.putTag(Tag("minSkipDocs", _minSkipDocs)); + header.putTag(Tag("minChunkDocs", _writer.get_min_chunk_docs())); + header.putTag(Tag("docIdLimit", _writer.get_docid_limit())); + header.putTag(Tag("minSkipDocs", _writer.get_min_skip_docs())); header.putTag(Tag("endian", "big")); header.putTag(Tag("desc", "Posting list file")); @@ -788,7 +685,7 @@ Zc4PostingSeqWrite::updateHeader() typedef vespalib::GenericHeader::Tag Tag; h.putTag(Tag("frozen", 1)); h.putTag(Tag("fileBitSize", _fileBitSize)); - h.putTag(Tag("numWords", _numWords)); + h.putTag(Tag("numWords", _writer.get_num_words())); h.rewriteFile(f); f.Sync(); f.Close(); @@ -813,40 +710,21 @@ Zc4PostingSeqWrite::open(const vespalib::string &name, // XXX may need to do something more here, I don't know what... 
return false; } - uint64_t fileSize = _file.GetSize(); - uint64_t bufferStartFilePos = _writeContext.getBufferStartFilePos(); - assert(fileSize >= bufferStartFilePos); - (void) fileSize; - _file.SetSize(bufferStartFilePos); - assert(bufferStartFilePos == static_cast<uint64_t>(_file.GetPosition())); - _writeContext.setFile(&_file); - search::ComprBuffer &cb = _writeContext; - EncodeContext &e = _encodeContext; - _writeContext.allocComprBuf(65536u, 32768u); - if (bufferStartFilePos == 0) { - e.setupWrite(cb); - // Reset accumulated stats - _fileBitSize = 0; - _numWords = 0; - // Start write initial header - makeHeader(fileHeaderContext); - _encodeFeatures->setupWrite(_featureWriteContext); - // end write initial header - _writePos = e.getWriteOffset(); - } else { - assert(bufferStartFilePos >= 8u); - uint32_t headerSize = readHeader(name); // Read existing header - assert(bufferStartFilePos >= headerSize); - (void) headerSize; - e.afterWrite(_writeContext, 0, bufferStartFilePos); - } - - // Ensure that some space is initially available in encoding buffers - _zcDocIds.maybeExpand(); - _l1Skip.maybeExpand(); - _l2Skip.maybeExpand(); - _l3Skip.maybeExpand(); - _l4Skip.maybeExpand(); + auto &writeContext = _writer.get_write_context(); + uint64_t bufferStartFilePos = writeContext.getBufferStartFilePos(); + assert(bufferStartFilePos == 0); + _file.SetSize(0); + writeContext.setFile(&_file); + search::ComprBuffer &cb = writeContext; + EncodeContext &e = _writer.get_encode_context(); + writeContext.allocComprBuf(65536u, 32768u); + e.setupWrite(cb); + // Reset accumulated stats + _fileBitSize = 0; + // Start write initial header + makeHeader(fileHeaderContext); + // end write initial header + _writer.on_open(); return true; // Assume success } @@ -854,42 +732,24 @@ Zc4PostingSeqWrite::open(const vespalib::string &name, bool Zc4PostingSeqWrite::close() { - EncodeContext &e = _encodeContext; - - _fileBitSize = e.getWriteOffset(); - // Write some pad bits to avoid decompression 
readahead going past - // memory mapped file during search and into SIGSEGV territory. - - // First pad to 64 bits alignment. - e.smallAlign(64); - e.writeComprBufferIfNeeded(); - - // Then write 128 more bits. This allows for 64-bit decoding - // with a readbits that always leaves a nonzero preRead - e.padBits(128); - e.alignDirectIO(); - e.flush(); - e.writeComprBuffer(); // Also flushes slack - - _writeContext.dropComprBuf(); + _fileBitSize = _writer.get_encode_context().getWriteOffset(); + _writer.on_close(); // flush and pad + auto &writeContext = _writer.get_write_context(); + writeContext.dropComprBuf(); _file.Sync(); _file.Close(); - _writeContext.setFile(nullptr); + writeContext.setFile(nullptr); updateHeader(); return true; } - - void Zc4PostingSeqWrite:: setParams(const PostingListParams ¶ms) { if (_countFile != nullptr) _countFile->setParams(params); - params.get("docIdLimit", _docIdLimit); - params.get("minChunkDocs", _minChunkDocs); - params.get("minSkipDocs", _minSkipDocs); + _writer.set_posting_list_params(params); } @@ -905,14 +765,14 @@ getParams(PostingListParams ¶ms) uint32_t countMinChunkDocs = 0; countParams.get("docIdLimit", countDocIdLimit); countParams.get("minChunkDocs", countMinChunkDocs); - assert(_docIdLimit == countDocIdLimit); - assert(_minChunkDocs == countMinChunkDocs); + assert(_writer.get_docid_limit() == countDocIdLimit); + assert(_writer.get_min_chunk_docs() == countMinChunkDocs); } else { params.clear(); - params.set("docIdLimit", _docIdLimit); - params.set("minChunkDocs", _minChunkDocs); + params.set("docIdLimit", _writer.get_docid_limit()); + params.set("minChunkDocs", _writer.get_min_chunk_docs()); } - params.set("minSkipDocs", _minSkipDocs); + params.set("minSkipDocs", _writer.get_min_skip_docs()); } @@ -920,7 +780,7 @@ void Zc4PostingSeqWrite:: setFeatureParams(const PostingListParams ¶ms) { - _encodeFeatures->setParams(params); + _writer.get_encode_features().setParams(params); } @@ -928,314 +788,7 @@ void 
Zc4PostingSeqWrite:: getFeatureParams(PostingListParams ¶ms) { - _encodeFeatures->getParams(params); -} - - -void -Zc4PostingSeqWrite::flushChunk() -{ - /* TODO: Flush chunk and prepare for new (possible short) chunk */ - flushWordWithSkip(true); -} - -#define L1SKIPSTRIDE 16 -#define L2SKIPSTRIDE 8 -#define L3SKIPSTRIDE 8 -#define L4SKIPSTRIDE 8 - - -void -Zc4PostingSeqWrite::calcSkipInfo() -{ - uint32_t lastDocId = 0u; - uint32_t lastL1SkipDocId = 0u; - uint32_t lastL1SkipDocIdPos = 0; - uint32_t lastL1SkipFeaturePos = 0; - uint32_t lastL2SkipDocId = 0u; - uint32_t lastL2SkipDocIdPos = 0; - uint32_t lastL2SkipFeaturePos = 0; - uint32_t lastL2SkipL1SkipPos = 0; - uint32_t lastL3SkipDocId = 0u; - uint32_t lastL3SkipDocIdPos = 0; - uint32_t lastL3SkipFeaturePos = 0; - uint32_t lastL3SkipL1SkipPos = 0; - uint32_t lastL3SkipL2SkipPos = 0; - uint32_t lastL4SkipDocId = 0u; - uint32_t lastL4SkipDocIdPos = 0; - uint32_t lastL4SkipFeaturePos = 0; - uint32_t lastL4SkipL1SkipPos = 0; - uint32_t lastL4SkipL2SkipPos = 0; - uint32_t lastL4SkipL3SkipPos = 0; - unsigned int l1SkipCnt = 0; - unsigned int l2SkipCnt = 0; - unsigned int l3SkipCnt = 0; - unsigned int l4SkipCnt = 0; - uint64_t featurePos = 0; - - std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin(); - std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end(); - - if (!_counts._segments.empty()) { - lastDocId = _counts._segments.back()._lastDoc; - lastL1SkipDocId = lastDocId; - lastL2SkipDocId = lastDocId; - lastL3SkipDocId = lastDocId; - lastL4SkipDocId = lastDocId; - } - - for (; dit != dite; ++dit) { - if (l1SkipCnt >= L1SKIPSTRIDE) { - // L1 docid delta - uint32_t docIdDelta = lastDocId - lastL1SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - _l1Skip.encode(docIdDelta - 1); - lastL1SkipDocId = lastDocId; - // L1 docid pos - uint64_t docIdPos = _zcDocIds.size(); - _l1Skip.encode(docIdPos - lastL1SkipDocIdPos - 1); - lastL1SkipDocIdPos = docIdPos; - // L1 features pos - 
_l1Skip.encode(featurePos - lastL1SkipFeaturePos - 1); - lastL1SkipFeaturePos = featurePos; - l1SkipCnt = 0; - ++l2SkipCnt; - if (l2SkipCnt >= L2SKIPSTRIDE) { - // L2 docid delta - docIdDelta = lastDocId - lastL2SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - _l2Skip.encode(docIdDelta - 1); - lastL2SkipDocId = lastDocId; - // L2 docid pos - docIdPos = _zcDocIds.size(); - _l2Skip.encode(docIdPos - lastL2SkipDocIdPos - 1); - lastL2SkipDocIdPos = docIdPos; - // L2 features pos - _l2Skip.encode(featurePos - lastL2SkipFeaturePos - 1); - lastL2SkipFeaturePos = featurePos; - // L2 L1Skip pos - uint64_t l1SkipPos = _l1Skip.size(); - _l2Skip.encode(l1SkipPos - lastL2SkipL1SkipPos - 1); - lastL2SkipL1SkipPos = l1SkipPos; - l2SkipCnt = 0; - ++l3SkipCnt; - if (l3SkipCnt >= L3SKIPSTRIDE) { - // L3 docid delta - docIdDelta = lastDocId - lastL3SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - _l3Skip.encode(docIdDelta - 1); - lastL3SkipDocId = lastDocId; - // L3 docid pos - docIdPos = _zcDocIds.size(); - _l3Skip.encode(docIdPos - lastL3SkipDocIdPos - 1); - lastL3SkipDocIdPos = docIdPos; - // L3 features pos - _l3Skip.encode(featurePos - lastL3SkipFeaturePos - 1); - lastL3SkipFeaturePos = featurePos; - // L3 L1Skip pos - l1SkipPos = _l1Skip.size(); - _l3Skip.encode(l1SkipPos - lastL3SkipL1SkipPos - 1); - lastL3SkipL1SkipPos = l1SkipPos; - // L3 L2Skip pos - uint64_t l2SkipPos = _l2Skip.size(); - _l3Skip.encode(l2SkipPos - lastL3SkipL2SkipPos - 1); - lastL3SkipL2SkipPos = l2SkipPos; - l3SkipCnt = 0; - ++l4SkipCnt; - if (l4SkipCnt >= L4SKIPSTRIDE) { - // L4 docid delta - docIdDelta = lastDocId - lastL4SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - _l4Skip.encode(docIdDelta - 1); - lastL4SkipDocId = lastDocId; - // L4 docid pos - docIdPos = _zcDocIds.size(); - _l4Skip.encode(docIdPos - lastL4SkipDocIdPos - 1); - lastL4SkipDocIdPos = docIdPos; - // L4 features pos - _l4Skip.encode(featurePos - lastL4SkipFeaturePos - 1); - lastL4SkipFeaturePos = 
featurePos; - // L4 L1Skip pos - l1SkipPos = _l1Skip.size(); - _l4Skip.encode(l1SkipPos - lastL4SkipL1SkipPos - 1); - lastL4SkipL1SkipPos = l1SkipPos; - // L4 L2Skip pos - l2SkipPos = _l2Skip.size(); - _l4Skip.encode(l2SkipPos - lastL4SkipL2SkipPos - 1); - lastL4SkipL2SkipPos = l2SkipPos; - // L4 L3Skip pos - uint64_t l3SkipPos = _l3Skip.size(); - _l4Skip.encode(l3SkipPos - lastL4SkipL3SkipPos - 1); - lastL4SkipL3SkipPos = l3SkipPos; - l4SkipCnt = 0; - } - } - } - } - uint32_t docId = dit->first; - featurePos += dit->second; - _zcDocIds.encode(docId - lastDocId - 1); - lastDocId = docId; - ++l1SkipCnt; - } - // Extra partial entries for skip tables to simplify iterator during search - if (_l1Skip.size() > 0) - _l1Skip.encode(lastDocId - lastL1SkipDocId - 1); - if (_l2Skip.size() > 0) - _l2Skip.encode(lastDocId - lastL2SkipDocId - 1); - if (_l3Skip.size() > 0) - _l3Skip.encode(lastDocId - lastL3SkipDocId - 1); - if (_l4Skip.size() > 0) - _l4Skip.encode(lastDocId - lastL4SkipDocId - 1); -} - - -void -Zc4PostingSeqWrite::flushWordWithSkip(bool hasMore) -{ - assert(_docIds.size() >= _minSkipDocs || !_counts._segments.empty()); - - _encodeFeatures->flush(); - EncodeContext &e = _encodeContext; - - uint32_t numDocs = _docIds.size(); - - e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS); - if (numDocs >= _minChunkDocs) - e.writeBits((hasMore ? 
1 : 0), 1); - - // TODO: Calculate docids size, possible also k parameter */ - calcSkipInfo(); - - uint32_t docIdsSize = _zcDocIds.size(); - uint32_t l1SkipSize = _l1Skip.size(); - uint32_t l2SkipSize = _l2Skip.size(); - uint32_t l3SkipSize = _l3Skip.size(); - uint32_t l4SkipSize = _l4Skip.size(); - - e.encodeExpGolomb(docIdsSize - 1, K_VALUE_ZCPOSTING_DOCIDSSIZE); - e.encodeExpGolomb(l1SkipSize, K_VALUE_ZCPOSTING_L1SKIPSIZE); - if (l1SkipSize != 0) { - e.encodeExpGolomb(l2SkipSize, K_VALUE_ZCPOSTING_L2SKIPSIZE); - if (l2SkipSize != 0) { - e.encodeExpGolomb(l3SkipSize, K_VALUE_ZCPOSTING_L3SKIPSIZE); - if (l3SkipSize != 0) { - e.encodeExpGolomb(l4SkipSize, K_VALUE_ZCPOSTING_L4SKIPSIZE); - } - } - } - e.encodeExpGolomb(_featureOffset, K_VALUE_ZCPOSTING_FEATURESSIZE); - - // Encode last document id in chunk or word. - if (_dynamicK) { - uint32_t docIdK = e.calcDocIdK((_counts._segments.empty() && - !hasMore) ? - numDocs : 1, - _docIdLimit); - e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first, - docIdK); - } else { - e.encodeExpGolomb(_docIdLimit - 1 - _docIds.back().first, - K_VALUE_ZCPOSTING_LASTDOCID); - } - - e.smallAlign(8); // Byte align - - uint8_t *docIds = _zcDocIds._mallocStart; - e.writeBits(reinterpret_cast<const uint64_t *>(docIds), - 0, - docIdsSize * 8); - if (l1SkipSize > 0) { - uint8_t *l1Skip = _l1Skip._mallocStart; - e.writeBits(reinterpret_cast<const uint64_t *>(l1Skip), - 0, - l1SkipSize * 8); - } - if (l2SkipSize > 0) { - uint8_t *l2Skip = _l2Skip._mallocStart; - e.writeBits(reinterpret_cast<const uint64_t *>(l2Skip), - 0, - l2SkipSize * 8); - } - if (l3SkipSize > 0) { - uint8_t *l3Skip = _l3Skip._mallocStart; - e.writeBits(reinterpret_cast<const uint64_t *>(l3Skip), - 0, - l3SkipSize * 8); - } - if (l4SkipSize > 0) { - uint8_t *l4Skip = _l4Skip._mallocStart; - e.writeBits(reinterpret_cast<const uint64_t *>(l4Skip), - 0, - l4SkipSize * 8); - } - - // Write features - e.writeBits(static_cast<const uint64_t *>(_featureWriteContext._comprBuf), 
- 0, - _featureOffset); - - _counts._numDocs += numDocs; - if (hasMore || !_counts._segments.empty()) { - uint64_t writePos = e.getWriteOffset(); - PostingListCounts::Segment seg; - seg._bitLength = writePos - (_writePos + _counts._bitLength); - seg._numDocs = numDocs; - seg._lastDoc = _docIds.back().first; - _counts._segments.push_back(seg); - _counts._bitLength += seg._bitLength; - } - // reset tables in preparation for next word or next chunk - _zcDocIds.clear(); - _l1Skip.clear(); - _l2Skip.clear(); - _l3Skip.clear(); - _l4Skip.clear(); - resetWord(); -} - - -void -Zc4PostingSeqWrite::flushWordNoSkip() -{ - // Too few document ids for skip info. - assert(_docIds.size() < _minSkipDocs && _counts._segments.empty()); - - _encodeFeatures->flush(); - EncodeContext &e = _encodeContext; - uint32_t numDocs = _docIds.size(); - - e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS); - - uint32_t baseDocId = 1; - const uint64_t *features = - static_cast<const uint64_t *>(_featureWriteContext._comprBuf); - uint64_t featureOffset = 0; - - std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin(); - std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end(); - - for (; dit != dite; ++dit) { - uint32_t docId = dit->first; - uint32_t featureSize = dit->second; - e.encodeExpGolomb(docId - baseDocId, K_VALUE_ZCPOSTING_DELTA_DOCID); - baseDocId = docId + 1; - e.writeBits(features + (featureOffset >> 6), - featureOffset & 63, - featureSize); - featureOffset += featureSize; - } - _counts._numDocs += numDocs; - resetWord(); -} - - -void -Zc4PostingSeqWrite::resetWord() -{ - _docIds.clear(); - _encodeFeatures->setupWrite(_featureWriteContext); - _featureOffset = 0; + _writer.get_encode_features().getParams(params); } @@ -1300,44 +853,7 @@ ZcPostingSeqRead::getIdentifier() ZcPostingSeqWrite::ZcPostingSeqWrite(PostingListCountFileSeqWrite *countFile) : Zc4PostingSeqWrite(countFile) { - _dynamicK = true; -} - - -void -ZcPostingSeqWrite::flushWordNoSkip() -{ 
- // Too few document ids for skip info. - assert(_docIds.size() < _minSkipDocs && _counts._segments.empty()); - - _encodeFeatures->flush(); - EncodeContext &e = _encodeContext; - uint32_t numDocs = _docIds.size(); - - e.encodeExpGolomb(numDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS); - - uint32_t docIdK = e.calcDocIdK(numDocs, _docIdLimit); - - uint32_t baseDocId = 1; - const uint64_t *features = - static_cast<const uint64_t *>(_featureWriteContext._comprBuf); - uint64_t featureOffset = 0; - - std::vector<DocIdAndFeatureSize>::const_iterator dit = _docIds.begin(); - std::vector<DocIdAndFeatureSize>::const_iterator dite = _docIds.end(); - - for (; dit != dite; ++dit) { - uint32_t docId = dit->first; - uint32_t featureSize = dit->second; - e.encodeExpGolomb(docId - baseDocId, docIdK); - baseDocId = docId + 1; - e.writeBits(features + (featureOffset >> 6), - featureOffset & 63, - featureSize); - featureOffset += featureSize; - } - _counts._numDocs += numDocs; - resetWord(); + _writer.set_dynamic_k(true); } } // namespace search::diskindex diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.h b/searchlib/src/vespa/searchlib/diskindex/zcposting.h index 8c69a051e83..96cc306cea8 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zcposting.h +++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.h @@ -2,9 +2,8 @@ #pragma once -#include "zcbuf.h" +#include "zc4_posting_writer.h" #include <vespa/searchlib/index/postinglistfile.h> -#include <vespa/searchlib/bitcompression/compression.h> #include <vespa/fastos/file.h> namespace search::index { @@ -131,29 +130,8 @@ class Zc4PostingSeqWrite : public index::PostingListFileSeqWrite protected: typedef bitcompression::FeatureEncodeContextBE EncodeContext; - EncodeContext _encodeContext; - search::ComprFileWriteContext _writeContext; - FastOS_File _file; - uint32_t _minChunkDocs; // # of documents needed for chunking - uint32_t _minSkipDocs; // # of documents needed for skipping - uint32_t _docIdLimit; // Limit for document 
ids (docId < docIdLimit) - // Unpacked document ids for word and feature sizes - typedef std::pair<uint32_t, uint32_t> DocIdAndFeatureSize; - std::vector<DocIdAndFeatureSize> _docIds; - - // Buffer up features in memory - EncodeContext *_encodeFeatures; - uint64_t _featureOffset; // Bit offset of next feature - search::ComprFileWriteContext _featureWriteContext; - uint64_t _writePos; // Bit position for start of current word - bool _dynamicK; // Caclulate EG compression parameters ? - ZcBuf _zcDocIds; // Document id deltas - ZcBuf _l1Skip; // L1 skip info - ZcBuf _l2Skip; // L2 skip info - ZcBuf _l3Skip; // L3 skip info - ZcBuf _l4Skip; // L4 skip info - - uint64_t _numWords; // Number of words in file + Zc4PostingWriter<true> _writer; + FastOS_File _file; uint64_t _fileBitSize; index::PostingListCountFileSeqWrite *const _countFile; public: @@ -177,37 +155,10 @@ public: void getFeatureParams(PostingListParams ¶ms) override; /** - * Flush chunk to file. - */ - void flushChunk(); - void calcSkipInfo(); - - /** - * Flush word with skip info to disk - */ - void flushWordWithSkip(bool hasMore); - - - /** - * Flush word without skip info to disk. - */ - virtual void flushWordNoSkip(); - - /** - * Prepare for next word or next chunk. - */ - void resetWord(); - - /** * Make header using feature encode write context. */ void makeHeader(const search::common::FileHeaderContext &fileHeaderContext); void updateHeader(); - - /** - * Read header, using temporary feature decode context. 
- */ - uint32_t readHeader(const vespalib::string &name); }; @@ -223,7 +174,6 @@ class ZcPostingSeqWrite : public Zc4PostingSeqWrite { public: ZcPostingSeqWrite(index::PostingListCountFileSeqWrite *countFile); - void flushWordNoSkip() override; }; } diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp index 3c16fc8e9a8..33819d4f7cb 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp @@ -3,12 +3,13 @@ #include "fakezcfilterocc.h" #include "fpfactory.h" #include <vespa/searchlib/diskindex/zcposocciterators.h> -#include <vespa/searchlib/diskindex/zcbuf.h> +#include <vespa/searchlib/diskindex/zc4_posting_writer.h> using search::fef::TermFieldMatchData; using search::fef::TermFieldMatchDataArray; using search::fef::TermFieldMatchDataPosition; using search::queryeval::SearchIterator; +using search::index::PostingListCounts; using search::index::PostingListParams; using search::index::DocIdAndFeatures; using search::index::DocIdAndPosOccFeatures; @@ -24,11 +25,6 @@ namespace search { namespace fakedata { -#define L1SKIPSTRIDE 16 -#define L2SKIPSTRIDE 8 -#define L3SKIPSTRIDE 8 -#define L4SKIPSTRIDE 8 - #define DEBUG_ZCFILTEROCC_PRINTF 0 #define DEBUG_ZCFILTEROCC_ASSERT 0 @@ -137,35 +133,8 @@ void FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures, bool dynamicK) { - ZcBuf bytes; - ZcBuf l1SkipBytes; - ZcBuf l2SkipBytes; - ZcBuf l3SkipBytes; - ZcBuf l4SkipBytes; - uint32_t lastDocId = 0u; - uint32_t lastL1SkipDocId = 0u; - uint64_t lastL1SkipDocIdPos = 0; - uint64_t lastL1SkipFeaturePos = 0; - unsigned int l1SkipCnt = 0; - uint32_t lastL2SkipDocId = 0u; - uint64_t lastL2SkipDocIdPos = 0; - uint64_t lastL2SkipFeaturePos = 0; - uint64_t lastL2SkipL1SkipPos = 0; - unsigned int l2SkipCnt = 0; - uint32_t lastL3SkipDocId = 0u; - uint64_t lastL3SkipDocIdPos = 0; - uint64_t 
lastL3SkipFeaturePos = 0; - uint64_t lastL3SkipL1SkipPos = 0; - uint64_t lastL3SkipL2SkipPos = 0; - unsigned int l3SkipCnt = 0; - uint32_t lastL4SkipDocId = 0u; - uint64_t lastL4SkipDocIdPos = 0; - uint64_t lastL4SkipFeaturePos = 0; - uint64_t lastL4SkipL1SkipPos = 0; - uint64_t lastL4SkipL2SkipPos = 0; - uint64_t lastL4SkipL3SkipPos = 0; - unsigned int l4SkipCnt = 0; - uint64_t featurePos = 0; + PostingListCounts counts; + Zc4PostingWriter<bigEndian> writer(counts); typedef FakeWord FW; typedef FW::DocWordFeatureList DWFL; @@ -181,288 +150,88 @@ FakeZcFilterOcc::setupT(const FakeWord &fw, bool doFeatures, FeatureEncodeContext<bigEndian> &f = (dynamicK ? static_cast<FeatureEncodeContext<bigEndian> &>(f1) : static_cast<FeatureEncodeContext<bigEndian> &>(f0)); - search::ComprFileWriteContext fctx(f); - f.setWriteContext(&fctx); - fctx.allocComprBuf(64, 1); - f.afterWrite(fctx, 0, 0); + writer.set_dynamic_k(dynamicK); + if (doFeatures) { + writer.set_encode_features(&f); + } + PostingListParams params; + params.set("docIdLimit", fw._docIdLimit); + params.set("minChunkDocs", 1000000000); // Disable chunking + params.set("minSkipDocs", 1u); // Force skip info + writer.set_posting_list_params(params); + auto &writeContext = writer.get_write_context(); + search::ComprBuffer &cb = writeContext; + auto &e = writer.get_encode_context(); + writeContext.allocComprBuf(65536u, 32768u); + e.setupWrite(cb); // Ensure that some space is initially available in encoding buffers - bytes.maybeExpand(); - l1SkipBytes.maybeExpand(); - l2SkipBytes.maybeExpand(); - l3SkipBytes.maybeExpand(); - l4SkipBytes.maybeExpand(); while (d != de) { - if (l1SkipCnt >= L1SKIPSTRIDE) { - uint32_t docIdDelta = lastDocId - lastL1SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - l1SkipBytes.encode(docIdDelta - 1); - uint64_t lastDocIdPos = bytes.size(); - uint32_t docIdPosDelta = lastDocIdPos - lastL1SkipDocIdPos; - l1SkipBytes.encode(docIdPosDelta - 1); - if (doFeatures) { - featurePos = 
f.getWriteOffset(); - l1SkipBytes.encode(featurePos - lastL1SkipFeaturePos - 1); - lastL1SkipFeaturePos = featurePos; - } -#if DEBUG_ZCFILTEROCC_PRINTF - printf("L1Encode docId=%d (+%d), docIdPos=%d (+%u)\n", - lastDocId, docIdDelta, - (int) lastDocIdPos, docIdPosDelta); -#endif - lastL1SkipDocId = lastDocId; - lastL1SkipDocIdPos = lastDocIdPos; - l1SkipCnt = 0; - ++l2SkipCnt; - if (l2SkipCnt >= L2SKIPSTRIDE) { - docIdDelta = lastDocId - lastL2SkipDocId; - docIdPosDelta = lastDocIdPos - lastL2SkipDocIdPos; - uint64_t lastL1SkipPos = l1SkipBytes.size(); - uint32_t l1SkipPosDelta = lastL1SkipPos - lastL2SkipL1SkipPos; - l2SkipBytes.encode(docIdDelta - 1); - l2SkipBytes.encode(docIdPosDelta - 1); - if (doFeatures) { - l2SkipBytes.encode(featurePos - lastL2SkipFeaturePos - 1); - lastL2SkipFeaturePos = featurePos; - } - l2SkipBytes.encode(l1SkipPosDelta - 1); -#if DEBUG_ZCFILTEROCC_PRINTF - printf("L2Encode docId=%d (+%d), docIdPos=%d (+%u)," - " l1SkipPos=%d (+%u)\n", - lastDocId, docIdDelta, - (int) lastDocIdPos, docIdPosDelta, - (int) lastL1SkipPos, l1SkipPosDelta); -#endif - lastL2SkipDocId = lastDocId; - lastL2SkipDocIdPos = lastDocIdPos; - lastL2SkipL1SkipPos = lastL1SkipPos; - l2SkipCnt = 0; - ++l3SkipCnt; - if (l3SkipCnt >= L3SKIPSTRIDE) { - docIdDelta = lastDocId - lastL3SkipDocId; - docIdPosDelta = lastDocIdPos - lastL3SkipDocIdPos; - l1SkipPosDelta = lastL1SkipPos - lastL3SkipL1SkipPos; - uint64_t lastL2SkipPos = l2SkipBytes.size(); - uint32_t l2SkipPosDelta = lastL2SkipPos - - lastL3SkipL2SkipPos; - l3SkipBytes.encode(docIdDelta - 1); - l3SkipBytes.encode(docIdPosDelta - 1); - if (doFeatures) { - l3SkipBytes.encode(featurePos - lastL3SkipFeaturePos - 1); - lastL3SkipFeaturePos = featurePos; - } - l3SkipBytes.encode(l1SkipPosDelta - 1); - l3SkipBytes.encode(l2SkipPosDelta - 1); -#if DEBUG_ZCFILTEROCC_PRINTF - printf("L3Encode docId=%d (+%d), docIdPos=%d (+%u)," - " l1SkipPos=%d (+%u) l2SkipPos %d (+%u)\n", - lastDocId, docIdDelta, - (int) lastDocIdPos, 
docIdPosDelta, - (int) lastL1SkipPos, l1SkipPosDelta, - (int) lastL2SkipPos, l2SkipPosDelta); -#endif - lastL3SkipDocId = lastDocId; - lastL3SkipDocIdPos = lastDocIdPos; - lastL3SkipL1SkipPos = lastL1SkipPos; - lastL3SkipL2SkipPos = lastL2SkipPos; - l3SkipCnt = 0; - ++l4SkipCnt; - if (l4SkipCnt >= L4SKIPSTRIDE) { - docIdDelta = lastDocId - lastL4SkipDocId; - docIdPosDelta = lastDocIdPos - lastL4SkipDocIdPos; - l1SkipPosDelta = lastL1SkipPos - lastL4SkipL1SkipPos; - l2SkipPosDelta = lastL2SkipPos - lastL4SkipL2SkipPos; - uint64_t lastL3SkipPos = l3SkipBytes.size(); - uint32_t l3SkipPosDelta = lastL3SkipPos - - lastL4SkipL3SkipPos; - l4SkipBytes.encode(docIdDelta - 1); - l4SkipBytes.encode(docIdPosDelta - 1); - if (doFeatures) { - l4SkipBytes.encode(featurePos - lastL4SkipFeaturePos - 1); - lastL4SkipFeaturePos = featurePos; - } - l4SkipBytes.encode(l1SkipPosDelta - 1); - l4SkipBytes.encode(l2SkipPosDelta - 1); - l4SkipBytes.encode(l3SkipPosDelta - 1); -#if DEBUG_ZCFILTEROCC_PRINTF - printf("L4Encode docId=%d (+%d), docIdPos=%d (+%u)," - " l1SkipPos=%d (+%u) l2SkipPos %d (+%u)" - " l3SkipPos=%d (+%u)\n", - lastDocId, docIdDelta, - (int) lastDocIdPos, docIdPosDelta, - (int) lastL1SkipPos, l1SkipPosDelta, - (int) lastL2SkipPos, l2SkipPosDelta, - (int) lastL3SkipPos, l3SkipPosDelta); -#endif - lastL4SkipDocId = lastDocId; - lastL4SkipDocIdPos = lastDocIdPos; - lastL4SkipL1SkipPos = lastL1SkipPos; - lastL4SkipL2SkipPos = lastL2SkipPos; - lastL4SkipL3SkipPos = lastL3SkipPos; - l4SkipCnt = 0; - } - } - } - } - if (lastDocId == 0u) { - bytes.encode(d->_docId - 1); -#if DEBUG_ZCFILTEROCC_PRINTF - printf("Encode docId=%d\n", - d->_docId); -#endif - } else { - uint32_t docIdDelta = d->_docId - lastDocId; - bytes.encode(docIdDelta - 1); -#if DEBUG_ZCFILTEROCC_PRINTF - printf("Encode docId=%d (+%d)\n", - d->_docId, docIdDelta); -#endif - } if (doFeatures) { fw.setupFeatures(*d, &*p, features); p += d->_positions; - f.writeFeatures(features); + } else { + 
features.clear(d->_docId); } - lastDocId = d->_docId; - ++l1SkipCnt; + writer.write_docid_and_features(features); ++d; } if (doFeatures) { assert(p == pe); - _featuresSize = f.getWriteOffset(); - // First pad to 64 bits. - uint32_t pad = (64 - f.getWriteOffset()) & 63; - while (pad > 0) { - uint32_t now = std::min(32u, pad); - f.writeBits(0, now); - f.writeComprBufferIfNeeded(); - pad -= now; - } - - // Then write 128 more bits. This allows for 64-bit decoding - // with a readbits that always leaves a nonzero preRead - for (unsigned int i = 0; i < 4; i++) { - f.writeBits(0, 32); - f.writeComprBufferIfNeeded(); - } - f.writeComprBufferIfNeeded(); - f.flush(); - f.writeComprBuffer(); - } else { - _featuresSize = 0; - } - // Extra partial entries for skip tables to simplify iterator during search - if (l1SkipBytes.size() > 0) { - uint32_t docIdDelta = lastDocId - lastL1SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - l1SkipBytes.encode(docIdDelta - 1); - } - if (l2SkipBytes.size() > 0) { - uint32_t docIdDelta = lastDocId - lastL2SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - l2SkipBytes.encode(docIdDelta - 1); - } - if (l3SkipBytes.size() > 0) { - uint32_t docIdDelta = lastDocId - lastL3SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - l3SkipBytes.encode(docIdDelta - 1); - } - if (l4SkipBytes.size() > 0) { - uint32_t docIdDelta = lastDocId - lastL4SkipDocId; - assert(static_cast<int32_t>(docIdDelta) > 0); - l4SkipBytes.encode(docIdDelta - 1); } + writer.flush_word(); + _featuresSize = 0; _hitDocs = fw._postings.size(); _docIdLimit = fw._docIdLimit; - _lastDocId = lastDocId; - FeatureEncodeContext<bigEndian> e; - ComprFileWriteContext ectx(e); - e.setWriteContext(&ectx); - ectx.allocComprBuf(64, 1); - e.afterWrite(ectx, 0, 0); + _compressedBits = e.getWriteOffset(); + assert(_compressedBits == counts._bitLength); + assert(_hitDocs == counts._numDocs); + _lastDocId = fw._postings.back()._docId; + writer.on_close(); - // Encode word 
header - e.encodeExpGolomb(_hitDocs - 1, K_VALUE_ZCPOSTING_NUMDOCS); - _docIdsSize = bytes.size() * 8; - _l1SkipSize = l1SkipBytes.size(); - _l2SkipSize = _l3SkipSize = _l4SkipSize = 0; - if (_l1SkipSize != 0) - _l2SkipSize = l2SkipBytes.size(); - if (_l2SkipSize != 0) - _l3SkipSize = l3SkipBytes.size(); - if (_l3SkipSize != 0) - _l4SkipSize = l4SkipBytes.size(); - - e.encodeExpGolomb(bytes.size() - 1, K_VALUE_ZCPOSTING_DOCIDSSIZE); - e.encodeExpGolomb(_l1SkipSize, K_VALUE_ZCPOSTING_L1SKIPSIZE); - e.writeComprBufferIfNeeded(); + std::pair<void *, size_t> ectxData = writeContext.grabComprBuffer(_compressedMalloc); + _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first), + ectxData.second); + read_header<bigEndian>(doFeatures, dynamicK, writer.get_min_skip_docs(), writer.get_min_chunk_docs()); +} + +template <bool bigEndian> +void +FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_docs, uint32_t min_chunk_docs) +{ + // read back word header to get skip sizes + using EC = FeatureEncodeContext<bigEndian>; + UC64_DECODECONTEXT(o); + uint32_t length; + uint64_t val64; + UC64_SETUPBITS_NS(o, _compressed.first, 0, EC); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); + assert(static_cast<uint32_t>(val64) + 1 == _hitDocs); + assert(_hitDocs >= min_skip_docs); + assert(_hitDocs < min_chunk_docs); + uint32_t docIdK = dynamicK ? 
EC::calcDocIdK(_hitDocs, _docIdLimit) : K_VALUE_ZCPOSTING_LASTDOCID; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); + _docIdsSize = val64 + 1; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC); + _l1SkipSize = val64; if (_l1SkipSize != 0) { - e.encodeExpGolomb(_l2SkipSize, K_VALUE_ZCPOSTING_L2SKIPSIZE); - if (_l2SkipSize != 0) { - e.writeComprBufferIfNeeded(); - e.encodeExpGolomb(_l3SkipSize, K_VALUE_ZCPOSTING_L3SKIPSIZE); - if (_l3SkipSize != 0) { - e.encodeExpGolomb(_l4SkipSize, K_VALUE_ZCPOSTING_L4SKIPSIZE); - } - } + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); + _l2SkipSize = val64; } - e.writeComprBufferIfNeeded(); - if (doFeatures) { - e.encodeExpGolomb(_featuresSize, K_VALUE_ZCPOSTING_FEATURESSIZE); - } - uint32_t docIdK = e.calcDocIdK(_hitDocs, _docIdLimit); - if (dynamicK) - e.encodeExpGolomb(_docIdLimit - 1 - _lastDocId, docIdK); - else - e.encodeExpGolomb(_docIdLimit - 1 - _lastDocId, - K_VALUE_ZCPOSTING_LASTDOCID); - uint64_t bytePad = (- e.getWriteOffset()) & 7; - if (bytePad > 0) - e.writeBits(0, bytePad); - size_t docIdSize = bytes.size(); - if (docIdSize > 0) { - writeZcBuf(e, bytes); + if (_l2SkipSize != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); + _l3SkipSize = val64; } - if (_l1SkipSize > 0) { - writeZcBuf(e, l1SkipBytes); - if (_l2SkipSize > 0) { - writeZcBuf(e, l2SkipBytes); - if (_l3SkipSize > 0) { - writeZcBuf(e, l3SkipBytes); - if (_l4SkipSize > 0) { - writeZcBuf(e, l4SkipBytes); - } - } - } + if (_l3SkipSize != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); + _l4SkipSize = val64; } if (doFeatures) { - e.writeBits(static_cast<const uint64_t *>(fctx._comprBuf), - 0, - _featuresSize); + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); + _featuresSize = val64; } - _compressedBits = e.getWriteOffset(); - // First pad to 64 bits. 
- uint32_t pad = (64 - e.getWriteOffset()) & 63; - while (pad > 0) { - uint32_t now = std::min(32u, pad); - e.writeBits(0, now); - e.writeComprBufferIfNeeded(); - pad -= now; - } - - // Then write 128 more bits. This allows for 64-bit decoding - // with a readbits that always leaves a nonzero preRead - for (unsigned int i = 0; i < 4; i++) { - e.writeBits(0, 32); - e.writeComprBufferIfNeeded(); - } - e.writeComprBufferIfNeeded(); - e.flush(); - e.writeComprBuffer(); - - std::pair<void *, size_t> ectxData = ectx.grabComprBuffer(_compressedMalloc); - _compressed = std::make_pair(static_cast<uint64_t *>(ectxData.first), - ectxData.second); + UC64_DECODEEXPGOLOMB_NS(o, docIdK, EC); + assert(_lastDocId == _docIdLimit - 1 - val64); } diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h index d5df198acdc..b68e3866461 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h @@ -37,6 +37,9 @@ protected: template <bool bigEndian> void setupT(const FakeWord &fw, bool doFeatures, bool dynamicK); + template <bool bigEndian> + void read_header(bool do_features, bool dynamic_k, uint32_t min_skip_docs, uint32_t min_cunk_docs); + public: FakeZcFilterOcc(const FakeWord &fw); FakeZcFilterOcc(const FakeWord &fw, bool bigEndian, const char *nameSuffix); |