diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-09-27 16:33:09 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2023-09-27 16:33:09 +0200 |
commit | 94ab70c2213512dda2fe4b1824dd238ad33530bb (patch) | |
tree | cb56890f878dc5a98501d1a1e29e27e830219104 /searchlib/src | |
parent | 03574a1f04b8f8b54401135ee077a86008123573 (diff) |
Store a limited number of posting list indexes in countHits() to
reduce amount of dictionary entry filtering in fillArray()
and fillBitVector() for regexp search and fuzzy search.
Diffstat (limited to 'searchlib/src')
4 files changed, 70 insertions, 10 deletions
diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp index 40d4b20aaf2..ac1042dda6c 100644 --- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp +++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp @@ -4,6 +4,7 @@ #include <vespa/searchlib/attribute/attributefactory.h> #include <vespa/searchlib/attribute/attributeiterators.h> #include <vespa/searchlib/attribute/flagattribute.h> +#include <vespa/searchlib/attribute/postinglistsearchcontext.h> #include <vespa/searchlib/attribute/searchcontextelementiterator.h> #include <vespa/searchlib/attribute/singleboolattribute.h> #include <vespa/searchlib/attribute/stringbase.h> @@ -1424,6 +1425,25 @@ SearchContextTest::testPrefixSearch(const vespalib::string& name, const Config& } } } + + // Long range of prefixes with unique strings that causes + // PostingListFoldedSearchContextT<DataT>::countHits() to populate + // partial vector of posting indexes, with scan resumed by + // fillArray or fillBitVector. + auto& vec = dynamic_cast<StringAttribute &>(*attr.get()); + uint32_t old_size = attr->getNumDocs(); + constexpr uint32_t longrange_values = search::attribute::PostingListFoldedSearchContextT<int32_t>::MAX_POSTING_INDEXES_SIZE + 100; + attr->addDocs(longrange_values); + DocSet exp_longrange; + for (uint32_t i = 0; i < longrange_values; ++i) { + vespalib::asciistream ss; + ss << "lpref" << i; + vespalib::string sss(ss.str()); + exp_longrange.put(old_size + i); + vec.update(old_size + i, vespalib::string(ss.str()).c_str()); + } + attr->commit(); + performSearch(*attr, "lpref", exp_longrange, TermType::PREFIXTERM); } diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp index 12c887eb407..2b1d4fa3286 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp @@ -27,7 +27,8 @@ PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_ _FSTC(0.0), _PLSTC(0.0), _hasWeight(hasWeight), - _useBitVector(useBitVector) + _useBitVector(useBitVector), + _counted_hits() { } diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 7e2002f95fd..9a4dbdd9c61 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -153,9 +153,14 @@ protected: template <class DataT> class PostingListFoldedSearchContextT : public PostingListSearchContextT<DataT> { +public: + static constexpr uint32_t MAX_POSTING_INDEXES_SIZE = 10000; + protected: using Parent = PostingListSearchContextT<DataT>; using Dictionary = typename Parent::Dictionary; + using DictionaryConstIterator = Dictionary::ConstIterator; + using EntryRef = vespalib::datastore::EntryRef; using PostingList = typename Parent::PostingList; using Parent::_counted_hits; using Parent::_docIdLimit; @@ -167,9 +172,13 @@ protected: using Parent::singleHits; using Parent::use_dictionary_entry; + mutable DictionaryConstIterator _resume_scan_itr; + mutable std::vector<EntryRef> _posting_indexes; + PostingListFoldedSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues, bool hasWeight, const PostingList &postingList, bool useBitVector, const ISearchContext &baseSearchCtx); + ~PostingListFoldedSearchContextT() override; bool fallback_to_approx_num_hits() const override; size_t countHits() const override; diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp index 714da9997bb..c479bbfcd06 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp @@ -264,11 +264,16 @@ PostingListFoldedSearchContextT<DataT>:: PostingListFoldedSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues, bool hasWeight, const PostingList &postingList, bool useBitVector, const ISearchContext &searchContext) - : Parent(dictionary, docIdLimit, numValues, hasWeight, postingList, useBitVector, searchContext) + : Parent(dictionary, docIdLimit, numValues, hasWeight, postingList, useBitVector, searchContext), + _resume_scan_itr(), + _posting_indexes() { } template <typename DataT> +PostingListFoldedSearchContextT<DataT>::~PostingListFoldedSearchContextT() = default; + +template <typename DataT> bool PostingListFoldedSearchContextT<DataT>::fallback_to_approx_num_hits() const { @@ -283,12 +288,27 @@ PostingListFoldedSearchContextT<DataT>::countHits() const return _counted_hits.value(); } size_t sum(0); - for (auto it(_lowerDictItr); it != this->_upperDictItr;) { + bool overflow = false; + for (auto it(_lowerDictItr); it != _upperDictItr;) { if (use_dictionary_entry(it)) { - sum += _postingList.frozenSize(it.getData().load_acquire()); + auto pidx = it.getData().load_acquire(); + if (pidx.valid()) { + sum += _postingList.frozenSize(pidx); + if (!overflow) { + if (_posting_indexes.size() < MAX_POSTING_INDEXES_SIZE) { + _posting_indexes.emplace_back(pidx); + } else { + overflow = true; + _resume_scan_itr = it; + } + } + } ++it; } } + if (!overflow) { + _resume_scan_itr = _upperDictItr; + } _counted_hits = sum; return sum; } @@ -297,10 +317,15 @@ template <typename DataT> void PostingListFoldedSearchContextT<DataT>::fillArray() { - for (auto it(_lowerDictItr); it != _upperDictItr;) { + for (auto pidx : _posting_indexes) { + _merger.addToArray(PostingListTraverser<PostingList>(_postingList, pidx)); + } + for (auto it(_resume_scan_itr); it != _upperDictItr;) { if (use_dictionary_entry(it)) { - _merger.addToArray(PostingListTraverser<PostingList>(_postingList, - it.getData().load_acquire())); + auto pidx = it.getData().load_acquire(); + if (pidx.valid()) { + _merger.addToArray(PostingListTraverser<PostingList>(_postingList, pidx)); + } ++it; } } @@ -311,10 +336,15 @@ template <typename DataT> void PostingListFoldedSearchContextT<DataT>::fillBitVector() { - for (auto it(_lowerDictItr); it != _upperDictItr;) { + for (auto pidx : _posting_indexes) { + _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, pidx)); + } + for (auto it(_resume_scan_itr); it != _upperDictItr;) { if (use_dictionary_entry(it)) { - _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, - it.getData().load_acquire())); + auto pidx = it.getData().load_acquire(); + if (pidx.valid()) { + _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, pidx)); + } ++it; } } |