diff options
author | Geir Storli <geirst@yahooinc.com> | 2023-09-28 16:06:07 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-09-28 16:06:07 +0200 |
commit | 168f8847334518c8efe73478805a580b5f5e23a6 (patch) | |
tree | cf736b7198d211dc3a00fc2496de5a7243778fad /searchlib/src | |
parent | baec2cce6cd62f6bd42f09c2ebe4a9dcdd921cb3 (diff) | |
parent | 94ab70c2213512dda2fe4b1824dd238ad33530bb (diff) |
Merge pull request #28687 from vespa-engine/toregge/avoid-unneeded-counting-of-hits
Avoid counting hits in range multiple times.
Diffstat (limited to 'searchlib/src')
4 files changed, 153 insertions, 45 deletions
diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp index 40d4b20aaf2..ac1042dda6c 100644 --- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp +++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp @@ -4,6 +4,7 @@ #include <vespa/searchlib/attribute/attributefactory.h> #include <vespa/searchlib/attribute/attributeiterators.h> #include <vespa/searchlib/attribute/flagattribute.h> +#include <vespa/searchlib/attribute/postinglistsearchcontext.h> #include <vespa/searchlib/attribute/searchcontextelementiterator.h> #include <vespa/searchlib/attribute/singleboolattribute.h> #include <vespa/searchlib/attribute/stringbase.h> @@ -1424,6 +1425,25 @@ SearchContextTest::testPrefixSearch(const vespalib::string& name, const Config& } } } + + // Long range of prefixes with unique strings that causes + // PostingListFoldedSearchContextT<DataT>::countHits() to populate + // partial vector of posting indexes, with scan resumed by + // fillArray or fillBitVector. + auto& vec = dynamic_cast<StringAttribute &>(*attr.get()); + uint32_t old_size = attr->getNumDocs(); + constexpr uint32_t longrange_values = search::attribute::PostingListFoldedSearchContextT<int32_t>::MAX_POSTING_INDEXES_SIZE + 100; + attr->addDocs(longrange_values); + DocSet exp_longrange; + for (uint32_t i = 0; i < longrange_values; ++i) { + vespalib::asciistream ss; + ss << "lpref" << i; + vespalib::string sss(ss.str()); + exp_longrange.put(old_size + i); + vec.update(old_size + i, vespalib::string(ss.str()).c_str()); + } + attr->commit(); + performSearch(*attr, "lpref", exp_longrange, TermType::PREFIXTERM); } diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp index 12c887eb407..2b1d4fa3286 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp @@ -27,7 +27,8 @@ PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_ _FSTC(0.0), _PLSTC(0.0), _hasWeight(hasWeight), - _useBitVector(useBitVector) + _useBitVector(useBitVector), + _counted_hits() { } diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 1baf75d1bdf..9a4dbdd9c61 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -13,6 +13,7 @@ #include <vespa/vespalib/util/regexp.h> #include <vespa/vespalib/fuzzy/fuzzy_matcher.h> #include <regex> +#include <optional> namespace search::attribute { @@ -30,28 +31,30 @@ protected: using Dictionary = EnumPostingTree; using DictionaryConstIterator = Dictionary::ConstIterator; using FrozenDictionary = Dictionary::FrozenView; + using EntryRef = vespalib::datastore::EntryRef; using EnumIndex = IEnumStore::Index; static constexpr long MIN_UNIQUE_VALUES_BEFORE_APPROXIMATION = 100; static constexpr long MIN_UNIQUE_VALUES_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION = 20; static constexpr long MIN_APPROXHITS_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION = 10; - const IEnumStoreDictionary & _dictionary; - const ISearchContext &_baseSearchCtx; - const BitVector *_bv; // bitvector if _useBitVector has been set - const FrozenDictionary _frozenDictionary; - DictionaryConstIterator _lowerDictItr; - DictionaryConstIterator _upperDictItr; - uint64_t _numValues; // attr.getStatus().getNumValues(); - uint32_t _uniqueValues; - uint32_t _docIdLimit; - uint32_t _dictSize; - vespalib::datastore::EntryRef _pidx; - vespalib::datastore::EntryRef _frozenRoot; // Posting list in tree form - float _FSTC; // Filtering Search Time Constant - float _PLSTC; // Posting List Search Time Constant - bool _hasWeight; - bool _useBitVector; + const IEnumStoreDictionary& _dictionary; + const ISearchContext& _baseSearchCtx; + const BitVector* _bv; // bitvector if _useBitVector has been set + const FrozenDictionary _frozenDictionary; + DictionaryConstIterator _lowerDictItr; + DictionaryConstIterator _upperDictItr; + uint64_t _numValues; // attr.getStatus().getNumValues(); + uint32_t _uniqueValues; + uint32_t _docIdLimit; + uint32_t _dictSize; + EntryRef _pidx; + EntryRef _frozenRoot; // Posting list in tree form + float _FSTC; // Filtering Search Time Constant + float _PLSTC; // Posting List Search Time Constant + bool _hasWeight; + bool _useBitVector; + mutable std::optional<size_t> _counted_hits; // Snapshot of size of posting lists in range PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_dictionary, uint32_t docIdLimit, uint64_t numValues, bool hasWeight, bool useBitVector, const ISearchContext &baseSearchCtx); @@ -99,6 +102,9 @@ protected: (calculateApproxNumHits() * MIN_APPROXHITS_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION > _docIdLimit) || (_uniqueValues > MIN_UNIQUE_VALUES_BEFORE_APPROXIMATION*10))); } + virtual size_t countHits() const = 0; + virtual void fillArray() = 0; + virtual void fillBitVector() = 0; }; @@ -126,9 +132,9 @@ protected: ~PostingListSearchContextT() override; void lookupSingle(); - size_t countHits() const; - void fillArray(); - void fillBitVector(); + size_t countHits() const override; + void fillArray() override; + void fillBitVector() override; void fetchPostings(const queryeval::ExecuteInfo & strict) override; // this will be called instead of the fetchPostings function in some cases @@ -147,22 +153,37 @@ protected: template <class DataT> class PostingListFoldedSearchContextT : public PostingListSearchContextT<DataT> { +public: + static constexpr uint32_t MAX_POSTING_INDEXES_SIZE = 10000; + protected: using Parent = PostingListSearchContextT<DataT>; using Dictionary = typename Parent::Dictionary; + using DictionaryConstIterator = Dictionary::ConstIterator; + using EntryRef = vespalib::datastore::EntryRef; using PostingList = typename Parent::PostingList; + using Parent::_counted_hits; + using Parent::_docIdLimit; using Parent::_lowerDictItr; - using Parent::_uniqueValues; + using Parent::_merger; using Parent::_postingList; - using Parent::_docIdLimit; - using Parent::countHits; + using Parent::_uniqueValues; + using Parent::_upperDictItr; using Parent::singleHits; + using Parent::use_dictionary_entry; + + mutable DictionaryConstIterator _resume_scan_itr; + mutable std::vector<EntryRef> _posting_indexes; PostingListFoldedSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues, bool hasWeight, const PostingList &postingList, bool useBitVector, const ISearchContext &baseSearchCtx); + ~PostingListFoldedSearchContextT() override; bool fallback_to_approx_num_hits() const override; + size_t countHits() const override; + void fillArray() override; + void fillBitVector() override; }; diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp index 91054b02d1e..c479bbfcd06 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp @@ -58,51 +58,42 @@ PostingListSearchContextT<DataT>::lookupSingle() } } - template <typename DataT> size_t PostingListSearchContextT<DataT>::countHits() const { + if (_counted_hits.has_value()) { + return _counted_hits.value(); + } size_t sum(0); - for (auto it(_lowerDictItr); it != _upperDictItr;) { - if (use_dictionary_entry(it)) { - sum += _postingList.frozenSize(it.getData().load_acquire()); - ++it; - } + for (auto it(_lowerDictItr); it != _upperDictItr; ++it) { + sum += _postingList.frozenSize(it.getData().load_acquire()); } + _counted_hits = sum; return sum; } - template <typename DataT> void PostingListSearchContextT<DataT>::fillArray() { - for (auto it(_lowerDictItr); it != _upperDictItr;) { - if (use_dictionary_entry(it)) { - _merger.addToArray(PostingListTraverser<PostingList>(_postingList, - it.getData().load_acquire())); - ++it; - } + for (auto it(_lowerDictItr); it != _upperDictItr; ++it) { + _merger.addToArray(PostingListTraverser<PostingList>(_postingList, + it.getData().load_acquire())); } _merger.merge(); } - template <typename DataT> void PostingListSearchContextT<DataT>::fillBitVector() { - for (auto it(_lowerDictItr); it != _upperDictItr;) { - if (use_dictionary_entry(it)) { - _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, - it.getData().load_acquire())); - ++it; - } + for (auto it(_lowerDictItr); it != _upperDictItr; ++it) { + _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, + it.getData().load_acquire())); } } - template <typename DataT> void PostingListSearchContextT<DataT>::fetchPostings(const queryeval::ExecuteInfo & execInfo) @@ -273,15 +264,90 @@ PostingListFoldedSearchContextT<DataT>:: PostingListFoldedSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues, bool hasWeight, const PostingList &postingList, bool useBitVector, const ISearchContext &searchContext) - : Parent(dictionary, docIdLimit, numValues, hasWeight, postingList, useBitVector, searchContext) + : Parent(dictionary, docIdLimit, numValues, hasWeight, postingList, useBitVector, searchContext), + _resume_scan_itr(), + _posting_indexes() { } template <typename DataT> +PostingListFoldedSearchContextT<DataT>::~PostingListFoldedSearchContextT() = default; + +template <typename DataT> bool PostingListFoldedSearchContextT<DataT>::fallback_to_approx_num_hits() const { return false; } +template <typename DataT> +size_t +PostingListFoldedSearchContextT<DataT>::countHits() const +{ + if (_counted_hits.has_value()) { + return _counted_hits.value(); + } + size_t sum(0); + bool overflow = false; + for (auto it(_lowerDictItr); it != _upperDictItr;) { + if (use_dictionary_entry(it)) { + auto pidx = it.getData().load_acquire(); + if (pidx.valid()) { + sum += _postingList.frozenSize(pidx); + if (!overflow) { + if (_posting_indexes.size() < MAX_POSTING_INDEXES_SIZE) { + _posting_indexes.emplace_back(pidx); + } else { + overflow = true; + _resume_scan_itr = it; + } + } + } + ++it; + } + } + if (!overflow) { + _resume_scan_itr = _upperDictItr; + } + _counted_hits = sum; + return sum; +} + +template <typename DataT> +void +PostingListFoldedSearchContextT<DataT>::fillArray() +{ + for (auto pidx : _posting_indexes) { + _merger.addToArray(PostingListTraverser<PostingList>(_postingList, pidx)); + } + for (auto it(_resume_scan_itr); it != _upperDictItr;) { + if (use_dictionary_entry(it)) { + auto pidx = it.getData().load_acquire(); + if (pidx.valid()) { + _merger.addToArray(PostingListTraverser<PostingList>(_postingList, pidx)); + } + ++it; + } + } + _merger.merge(); +} + +template <typename DataT> +void +PostingListFoldedSearchContextT<DataT>::fillBitVector() +{ + for (auto pidx : _posting_indexes) { + _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, pidx)); + } + for (auto it(_resume_scan_itr); it != _upperDictItr;) { + if (use_dictionary_entry(it)) { + auto pidx = it.getData().load_acquire(); + if (pidx.valid()) { + _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, pidx)); + } + ++it; + } + } +} + } |