diff options
author | Tor Egge <Tor.Egge@broadpark.no> | 2019-04-29 15:05:48 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@broadpark.no> | 2019-04-29 15:05:48 +0200 |
commit | 2ee7b657bd8597b5fd27b133782d240d2faf62cd (patch) | |
tree | 1fc6249cde4efae8a5c02214d7dd85d6fa1fc0f2 /searchlib | |
parent | 7553e0390c1ceb3834cba62774b3ddc77a6944d1 (diff) |
Check that posting list for fake word can be read by posting list reader.
Diffstat (limited to 'searchlib')
5 files changed, 116 insertions, 15 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp index c9b8cf0b017..30cef1dc258 100644 --- a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_reader.cpp @@ -72,22 +72,28 @@ Zc4PostingReader<bigEndian>::read_common_word_doc_id_and_features(DocIdAndFeatur if (docId > _l1SkipDocId) { _l1SkipDocIdPos += _l1Skip.decode() + 1; assert(docIdPos == _l1SkipDocIdPos); - _l1SkipFeaturesPos += _l1Skip.decode() + 1; uint64_t featuresPos = _decodeContext->getReadOffset(); - assert(featuresPos == _l1SkipFeaturesPos); + if (_posting_params._encode_features) { + _l1SkipFeaturesPos += _l1Skip.decode() + 1; + assert(featuresPos == _l1SkipFeaturesPos); + } (void) featuresPos; if (docId > _l2SkipDocId) { _l2SkipDocIdPos += _l2Skip.decode() + 1; assert(docIdPos == _l2SkipDocIdPos); - _l2SkipFeaturesPos += _l2Skip.decode() + 1; - assert(featuresPos == _l2SkipFeaturesPos); + if (_posting_params._encode_features) { + _l2SkipFeaturesPos += _l2Skip.decode() + 1; + assert(featuresPos == _l2SkipFeaturesPos); + } _l2SkipL1SkipPos += _l2Skip.decode() + 1; assert(_l1Skip.pos() == _l2SkipL1SkipPos); if (docId > _l3SkipDocId) { _l3SkipDocIdPos += _l3Skip.decode() + 1; assert(docIdPos == _l3SkipDocIdPos); - _l3SkipFeaturesPos += _l3Skip.decode() + 1; - assert(featuresPos == _l3SkipFeaturesPos); + if (_posting_params._encode_features) { + _l3SkipFeaturesPos += _l3Skip.decode() + 1; + assert(featuresPos == _l3SkipFeaturesPos); + } _l3SkipL1SkipPos += _l3Skip.decode() + 1; assert(_l1Skip.pos() == _l3SkipL1SkipPos); _l3SkipL2SkipPos += _l3Skip.decode() + 1; @@ -96,8 +102,10 @@ Zc4PostingReader<bigEndian>::read_common_word_doc_id_and_features(DocIdAndFeatur _l4SkipDocIdPos += _l4Skip.decode() + 1; assert(docIdPos == _l4SkipDocIdPos); (void) docIdPos; - _l4SkipFeaturesPos += _l4Skip.decode() + 1; - assert(featuresPos == _l4SkipFeaturesPos); + if 
(_posting_params._encode_features) { + _l4SkipFeaturesPos += _l4Skip.decode() + 1; + assert(featuresPos == _l4SkipFeaturesPos); + } _l4SkipL1SkipPos += _l4Skip.decode() + 1; assert(_l1Skip.pos() == _l4SkipL1SkipPos); _l4SkipL2SkipPos += _l4Skip.decode() + 1; @@ -141,7 +149,9 @@ Zc4PostingReader<bigEndian>::read_common_word_doc_id_and_features(DocIdAndFeatur _chunkNo = 0; } } - _decodeContext->readFeatures(features); + if (_posting_params._encode_features) { + _decodeContext->readFeatures(features); + } --_residue; } @@ -175,7 +185,9 @@ Zc4PostingReader<bigEndian>::read_doc_id_and_features(DocIdAndFeatures &features if (__builtin_expect(oCompr >= d._valE, false)) { _readContext.readComprBuffer(); } - _decodeContext->readFeatures(features); + if (_posting_params._encode_features) { + _decodeContext->readFeatures(features); + } --_residue; } @@ -262,8 +274,10 @@ Zc4PostingReader<bigEndian>::read_word_start_with_skip() UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); l4SkipSize = val64; } - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); - _featuresSize = val64; + if (_posting_params._encode_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); + _featuresSize = val64; + } if (__builtin_expect(oCompr >= valE, false)) { UC64_DECODECONTEXT_STORE(o, d._); _readContext.readComprBuffer(); diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp index 3d4567ed2ab..f4145c594e3 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp @@ -3,9 +3,10 @@ #include "fakezcfilterocc.h" #include "fpfactory.h" #include <vespa/searchlib/diskindex/zcposocciterators.h> -#include <vespa/searchlib/diskindex/zc4_posting_writer.h> #include <vespa/searchlib/diskindex/zc4_posting_header.h> #include <vespa/searchlib/diskindex/zc4_posting_params.h> +#include 
<vespa/searchlib/diskindex/zc4_posting_reader.h> +#include <vespa/searchlib/diskindex/zc4_posting_writer.h> using search::fef::TermFieldMatchData; using search::fef::TermFieldMatchDataArray; @@ -125,10 +126,12 @@ void FakeZcFilterOcc::setup(const FakeWord &fw, bool doFeatures, bool dynamicK) { - if (_bigEndian) + if (_bigEndian) { setupT<true>(fw, doFeatures, dynamicK); - else + } else { setupT<false>(fw, doFeatures, dynamicK); + } + validate_read(fw, doFeatures, dynamicK); } @@ -219,6 +222,66 @@ FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_d } +void +FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const +{ + if (_bigEndian) { + validate_read<true>(fw, encode_features, dynamic_k); + } else { + validate_read<false>(fw, encode_features, dynamic_k); + } +} + +template <bool bigEndian> +void +FakeZcFilterOcc::validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const +{ + bitcompression::EGPosOccDecodeContextCooked<bigEndian> decode_context_dynamic_k(&_fieldsParams); + bitcompression::EG2PosOccDecodeContextCooked<bigEndian> decode_context_static_k(&_fieldsParams); + bitcompression::FeatureDecodeContext<bigEndian> &decode_context_dynamic_k_upcast = decode_context_dynamic_k; + bitcompression::FeatureDecodeContext<bigEndian> &decode_context_static_k_upcast = decode_context_static_k; + bitcompression::FeatureDecodeContext<bigEndian> &decode_context = dynamic_k ? 
decode_context_dynamic_k_upcast : decode_context_static_k_upcast; + Zc4PostingReader<bigEndian> reader(dynamic_k); + reader.set_decode_features(&decode_context); + auto &params = reader.get_posting_params(); + params._min_skip_docs = 1; + params._min_chunk_docs = 1000000000; + params._doc_id_limit = _docIdLimit; + params._encode_features = encode_features; + reader.get_read_context().reference_compressed_buffer(_compressed.first, _compressed.second); + assert(decode_context.getReadOffset() == 0u); + PostingListCounts counts; + counts._bitLength = _compressedBits; + counts._numDocs = _hitDocs; + reader.set_counts(counts); + auto d(fw._postings.begin()); + auto de(fw._postings.end()); + auto p(fw._wordPosFeatures.begin()); + auto pe(fw._wordPosFeatures.end()); + DocIdAndPosOccFeatures check_features; + DocIdAndFeatures features; + uint32_t hits = 0; + while (d != de) { + if (encode_features) { + fw.setupFeatures(*d, &*p, check_features); + p += d->_positions; + } else { + check_features.clear(d->_docId); + } + reader.read_doc_id_and_features(features); + assert(features._docId == d->_docId); + assert(features._elements.size() == check_features._elements.size()); + assert(features._wordPositions.size() == check_features._wordPositions.size()); + ++d; + ++hits; + } + if (encode_features) { + assert(p == pe); + } + reader.read_doc_id_and_features(features); + assert(static_cast<int32_t>(features._docId) == -1); +} + FakeZcFilterOcc::~FakeZcFilterOcc() { free(_compressedMalloc); diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h index b68e3866461..36738a0f5a8 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.h @@ -40,6 +40,10 @@ protected: template <bool bigEndian> void read_header(bool do_features, bool dynamic_k, uint32_t min_skip_docs, uint32_t min_cunk_docs); + void validate_read(const FakeWord 
&fw, bool encode_features, bool dynamic_k) const; + template <bool bigEndian> + void validate_read(const FakeWord &fw, bool encode_features, bool dynamic_k) const; + public: FakeZcFilterOcc(const FakeWord &fw); FakeZcFilterOcc(const FakeWord &fw, bool bigEndian, const char *nameSuffix); diff --git a/searchlib/src/vespa/searchlib/util/comprfile.cpp b/searchlib/src/vespa/searchlib/util/comprfile.cpp index 155bb194f97..400a93acd26 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.cpp +++ b/searchlib/src/vespa/searchlib/util/comprfile.cpp @@ -408,6 +408,25 @@ ComprFileReadContext::referenceWriteContext(const ComprFileWriteContext &rhs) } } +void +ComprFileReadContext::reference_compressed_buffer(void *buffer, size_t usedUnits) +{ + ComprFileDecodeContext *d = getDecodeContext(); + + _comprBuf = buffer; + _comprBufSize = usedUnits; + setBufferEndFilePos(static_cast<uint64_t>(usedUnits) * _unitSize); + setFileSize(static_cast<uint64_t>(usedUnits) * _unitSize); + if (d != NULL) { + d->afterRead(_comprBuf, + usedUnits, + static_cast<uint64_t>(usedUnits) * _unitSize, + false); + d->setupBits(0); + setBitOffset(-1); + assert(d->getBitPosV() == 0); + } +} ComprFileWriteContext:: ComprFileWriteContext(ComprFileEncodeContext &encodeContext) diff --git a/searchlib/src/vespa/searchlib/util/comprfile.h b/searchlib/src/vespa/searchlib/util/comprfile.h index d4de1d305fa..431126dee47 100644 --- a/searchlib/src/vespa/searchlib/util/comprfile.h +++ b/searchlib/src/vespa/searchlib/util/comprfile.h @@ -137,6 +137,7 @@ public: * long as rhs is live and unchanged. */ void referenceWriteContext(const ComprFileWriteContext &rhs); + void reference_compressed_buffer(void *buffer, size_t usedUnits); }; |