diff options
Diffstat (limited to 'searchlib/src')
5 files changed, 208 insertions, 127 deletions
diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt index b21b799e693..104994ad038 100644 --- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt @@ -18,6 +18,7 @@ vespa_add_library(searchlib_diskindex OBJECT pagedict4file.cpp pagedict4randread.cpp wordnummapper.cpp + zc4_posting_header.cpp zc4_posting_writer.cpp zc4_posting_writer_base.cpp zcbuf.cpp diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp new file mode 100644 index 00000000000..3adb32f8681 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.cpp @@ -0,0 +1,105 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "zc4_posting_header.h" +#include "zc4_posting_params.h" +#include <vespa/searchlib/bitcompression/compression.h> + +namespace search::diskindex +{ + +Zc4PostingHeader::Zc4PostingHeader() + : _has_more(false), + _doc_id_k(K_VALUE_ZCPOSTING_LASTDOCID), + _num_docs(0u), + _doc_ids_size(0u), + _l1_skip_size(0u), + _l2_skip_size(0u), + _l3_skip_size(0u), + _l4_skip_size(0u), + _features_size(0u), + _last_doc_id(0) +{ +} + +template <bool bigEndian> +void +Zc4PostingHeader::read(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params) +{ + using EC = bitcompression::FeatureEncodeContext<bigEndian>; + UC64_DECODECONTEXT_CONSTRUCTOR(o, decode_context._); + uint32_t length; + uint64_t val64; + + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); + _num_docs = static_cast<uint32_t>(val64) + 1; + bool has_more = false; + if (__builtin_expect(_num_docs >= params._min_chunk_docs, false)) { + if (bigEndian) { + has_more = static_cast<int64_t>(oVal) < 0; + oVal <<= 1; + length = 1; + } else { + has_more = (oVal & 1) != 0; + oVal >>= 1; + length = 1; + } 
+ UC64_READBITS_NS(o, EC); + } + if (params._dynamic_k) { + _doc_id_k = EC::calcDocIdK((_has_more || has_more) ? 1 : _num_docs, params._doc_id_limit); + } else { + _doc_id_k = K_VALUE_ZCPOSTING_LASTDOCID; + } + if (_num_docs < params._min_skip_docs && !_has_more) { + _doc_ids_size = 0; + _l1_skip_size = 0; + _l2_skip_size = 0; + _l3_skip_size = 0; + _l4_skip_size = 0; + _features_size = 0; + _last_doc_id = 0; + } else { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); + _doc_ids_size = val64 + 1; + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC); + _l1_skip_size = val64; + if (_l1_skip_size != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); + _l2_skip_size = val64; + } + if (_l2_skip_size != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); + _l3_skip_size = val64; + } + if (_l3_skip_size != 0) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); + _l4_skip_size = val64; + } + if (params._encode_features) { + UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); + _features_size = val64; + } else { + _features_size = 0; + } + UC64_DECODEEXPGOLOMB_NS(o, _doc_id_k, EC); + _last_doc_id = params._doc_id_limit - 1 - val64; + uint64_t bytePad = oPreRead & 7; + if (bytePad > 0) { + length = bytePad; + UC64_READBITS_NS(o, EC); + } + } + UC64_DECODECONTEXT_STORE(o, decode_context._); + _has_more = has_more; +} + +template +void +Zc4PostingHeader::read<false>(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params); + +template +void +Zc4PostingHeader::read<true>(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params); + + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h new file mode 100644 index 00000000000..7382f59d176 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_header.h @@ -0,0 +1,35 @@ +// Copyright 2019 Oath Inc. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <cstdint> + +namespace search::bitcompression { class DecodeContext64Base; } + +namespace search::diskindex { + +struct Zc4PostingParams; + +/* + * Struct containing the decoded header for a word. + */ +struct Zc4PostingHeader { + bool _has_more; + uint32_t _doc_id_k; + uint32_t _num_docs; + uint32_t _doc_ids_size; + uint32_t _l1_skip_size; + uint32_t _l2_skip_size; + uint32_t _l3_skip_size; + uint32_t _l4_skip_size; + uint64_t _features_size; + uint32_t _last_doc_id; + + Zc4PostingHeader(); + + template <bool bigEndian> + void + read(bitcompression::DecodeContext64Base &decode_context, const Zc4PostingParams &params); +}; + +} diff --git a/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h new file mode 100644 index 00000000000..ea4cc6f58a6 --- /dev/null +++ b/searchlib/src/vespa/searchlib/diskindex/zc4_posting_params.h @@ -0,0 +1,29 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <cstdint> + +namespace search::diskindex { + +/* + * Struct containing parameters for posting list. 
+ */ +struct Zc4PostingParams { + uint32_t _min_skip_docs; + uint32_t _min_chunk_docs; + uint32_t _doc_id_limit; + bool _dynamic_k; + bool _encode_features; + + Zc4PostingParams(uint32_t min_skip_docs, uint32_t min_chunk_docs, uint32_t doc_id_limit, bool dynamic_k, bool encode_features) + : _min_skip_docs(min_skip_docs), + _min_chunk_docs(min_chunk_docs), + _doc_id_limit(doc_id_limit), + _dynamic_k(dynamic_k), + _encode_features(encode_features) + { + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp index 33819d4f7cb..3d4567ed2ab 100644 --- a/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp +++ b/searchlib/src/vespa/searchlib/test/fakedata/fakezcfilterocc.cpp @@ -4,6 +4,8 @@ #include "fpfactory.h" #include <vespa/searchlib/diskindex/zcposocciterators.h> #include <vespa/searchlib/diskindex/zc4_posting_writer.h> +#include <vespa/searchlib/diskindex/zc4_posting_header.h> +#include <vespa/searchlib/diskindex/zc4_posting_params.h> using search::fef::TermFieldMatchData; using search::fef::TermFieldMatchDataArray; @@ -13,6 +15,8 @@ using search::index::PostingListCounts; using search::index::PostingListParams; using search::index::DocIdAndFeatures; using search::index::DocIdAndPosOccFeatures; +using search::bitcompression::DecodeContext64; +using search::bitcompression::DecodeContext64Base; using search::bitcompression::PosOccFieldParams; using search::bitcompression::EGPosOccEncodeContext; using search::bitcompression::EG2PosOccEncodeContext; @@ -200,38 +204,18 @@ void FakeZcFilterOcc::read_header(bool doFeatures, bool dynamicK, uint32_t min_skip_docs, uint32_t min_chunk_docs) { // read back word header to get skip sizes - using EC = FeatureEncodeContext<bigEndian>; - UC64_DECODECONTEXT(o); - uint32_t length; - uint64_t val64; - UC64_SETUPBITS_NS(o, _compressed.first, 0, EC); - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); - 
assert(static_cast<uint32_t>(val64) + 1 == _hitDocs); - assert(_hitDocs >= min_skip_docs); - assert(_hitDocs < min_chunk_docs); - uint32_t docIdK = dynamicK ? EC::calcDocIdK(_hitDocs, _docIdLimit) : K_VALUE_ZCPOSTING_LASTDOCID; - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); - _docIdsSize = val64 + 1; - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC); - _l1SkipSize = val64; - if (_l1SkipSize != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); - _l2SkipSize = val64; - } - if (_l2SkipSize != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); - _l3SkipSize = val64; - } - if (_l3SkipSize != 0) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); - _l4SkipSize = val64; - } - if (doFeatures) { - UC64_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_FEATURESSIZE, EC); - _featuresSize = val64; - } - UC64_DECODEEXPGOLOMB_NS(o, docIdK, EC); - assert(_lastDocId == _docIdLimit - 1 - val64); + DecodeContext64<bigEndian> decode_context; + decode_context.setPosition({ _compressed.first, 0 }); + Zc4PostingParams params(min_skip_docs, min_chunk_docs, _docIdLimit, dynamicK, doFeatures); + Zc4PostingHeader header; + header.read<bigEndian>(decode_context, params); + _docIdsSize = header._doc_ids_size; + _l1SkipSize = header._l1_skip_size; + _l2SkipSize = header._l2_skip_size; + _l3SkipSize = header._l3_skip_size; + _l4SkipSize = header._l4_skip_size; + _featuresSize = header._features_size; + assert(_lastDocId == header._last_doc_id); } @@ -383,54 +367,17 @@ FakeFilterOccZCArrayIterator::initRange(uint32_t begin, uint32_t end) { queryeval::RankedSearchIteratorBase::initRange(begin, end); DecodeContext &d = _decodeContext; - typedef EncodeContext EC; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - uint32_t length; - uint64_t val64; - - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); - uint32_t numDocs = static_cast<uint32_t>(val64) + 1; - - uint32_t docIdK = EC::calcDocIdK(numDocs, _docIdLimit); - - 
UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); - uint32_t docIdsSize = val64 + 1; - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC); - uint32_t l1SkipSize = val64; - uint32_t l2SkipSize = 0; - if (l1SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); - l2SkipSize = val64; - } - uint32_t l3SkipSize = 0; - if (l2SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); - l3SkipSize = val64; - } - uint32_t l4SkipSize = 0; - if (l3SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); - l4SkipSize = val64; - } - // Feature size would be here - UC64BE_DECODEEXPGOLOMB_NS(o, docIdK, EC); - _lastDocId = _docIdLimit - 1 - val64; - UC64_DECODECONTEXT_STORE(o, d._); - uint64_t bytePad = oPreRead & 7; - if (bytePad > 0) { - length = bytePad; - oVal <<= length; - UC64BE_READBITS_NS(o, EC); - } - UC64_DECODECONTEXT_STORE(o, d._); + Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false); + Zc4PostingHeader header; + header.read<true>(d, params); assert((d.getBitOffset() & 7) == 0); const uint8_t *bcompr = d.getByteCompr(); _valI = bcompr; - bcompr += docIdsSize; - bcompr += l1SkipSize; - bcompr += l2SkipSize; - bcompr += l3SkipSize; - bcompr += l4SkipSize; + bcompr += header._doc_ids_size; + bcompr += header._l1_skip_size; + bcompr += header._l2_skip_size; + bcompr += header._l3_skip_size; + bcompr += header._l4_skip_size, d.setByteCompr(bcompr); uint32_t oDocId; ZCDECODE(_valI, oDocId = 1 +); @@ -439,7 +386,7 @@ FakeFilterOccZCArrayIterator::initRange(uint32_t begin, uint32_t end) oDocId); #endif setDocId(oDocId); - _residue = numDocs; + _residue = header._num_docs; } @@ -641,79 +588,43 @@ initRange(uint32_t begin, uint32_t end) { queryeval::RankedSearchIteratorBase::initRange(begin, end); DecodeContext &d = _decodeContext; - typedef EncodeContext EC; - UC64_DECODECONTEXT_CONSTRUCTOR(o, d._); - uint32_t length; - uint64_t val64; - - 
UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_NUMDOCS, EC); - uint32_t numDocs = static_cast<uint32_t>(val64) + 1; - - uint32_t docIdK = EC::calcDocIdK(numDocs, _docIdLimit); - - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_DOCIDSSIZE, EC); - uint32_t docIdsSize = val64 + 1; - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L1SKIPSIZE, EC); - uint32_t l1SkipSize = val64; - uint32_t l2SkipSize = 0; - if (l1SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L2SKIPSIZE, EC); - l2SkipSize = val64; - } - uint32_t l3SkipSize = 0; - if (l2SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L3SKIPSIZE, EC); - l3SkipSize = val64; - } - uint32_t l4SkipSize = 0; - if (l3SkipSize != 0) { - UC64BE_DECODEEXPGOLOMB_NS(o, K_VALUE_ZCPOSTING_L4SKIPSIZE, EC); - l4SkipSize = val64; - } - // Feature size would be here - UC64BE_DECODEEXPGOLOMB_NS(o, docIdK, EC); - _lastDocId = _docIdLimit - 1 - val64; - UC64_DECODECONTEXT_STORE(o, d._); - uint64_t bytePad = oPreRead & 7; - if (bytePad > 0) { - length = bytePad; - oVal <<= length; - UC64BE_READBITS_NS(o, EC); - } - UC64_DECODECONTEXT_STORE(o, d._); + Zc4PostingParams params(1, 1000000000, _docIdLimit, true, false); + Zc4PostingHeader header; + header.read<true>(d, params); + _lastDocId = header._last_doc_id; assert((d.getBitOffset() & 7) == 0); const uint8_t *bcompr = d.getByteCompr(); _valIBase = _valI = bcompr; _l1SkipDocIdPos = _l2SkipDocIdPos = bcompr; _l3SkipDocIdPos = _l4SkipDocIdPos = bcompr; - bcompr += docIdsSize; - if (l1SkipSize != 0) { + bcompr += header._doc_ids_size; + if (header._l1_skip_size != 0) { _l1SkipValIBase = _l1SkipValI = bcompr; _l2SkipL1SkipPos = _l3SkipL1SkipPos = _l4SkipL1SkipPos = bcompr; - bcompr += l1SkipSize; + bcompr += header._l1_skip_size; } else { _l1SkipValIBase = _l1SkipValI = NULL; _l2SkipL1SkipPos = _l3SkipL1SkipPos = _l4SkipL1SkipPos = NULL; } - if (l2SkipSize != 0) { + if (header._l2_skip_size != 0) { _l2SkipValIBase = _l2SkipValI = bcompr; _l3SkipL2SkipPos = 
_l4SkipL2SkipPos = bcompr; - bcompr += l2SkipSize; + bcompr += header._l2_skip_size; } else { _l2SkipValIBase = _l2SkipValI = NULL; _l3SkipL2SkipPos = _l4SkipL2SkipPos = NULL; } - if (l3SkipSize != 0) { + if (header._l3_skip_size != 0) { _l3SkipValIBase = _l3SkipValI = bcompr; _l4SkipL3SkipPos = bcompr; - bcompr += l3SkipSize; + bcompr += header._l3_skip_size; } else { _l3SkipValIBase = _l3SkipValI = NULL; _l4SkipL3SkipPos = NULL; } - if (l4SkipSize != 0) { + if (header._l4_skip_size != 0) { _l4SkipValI = bcompr; - bcompr += l4SkipSize; + bcompr += header._l4_skip_size; } else { _l4SkipValI = NULL; } |