diff options
author | Tor Egge <Tor.Egge@online.no> | 2023-10-19 22:12:47 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-19 22:12:47 +0200 |
commit | 211bd0c3617e9d51843d9c1e33398622f8cdd950 (patch) | |
tree | db6927f805204ecd6919da0c00158a5c0287cf1e /searchlib | |
parent | b87b0db14a2078a3c60da99aad498ed62b2bf2db (diff) |
Revert "Improve modelling of match strategies to use in numeric range search."
Diffstat (limited to 'searchlib')
3 files changed, 113 insertions, 96 deletions
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp index c0d7cb207bf..5f9a44f691c 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp @@ -24,9 +24,11 @@ PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_ _dictSize(_frozenDictionary.size()), _pidx(), _frozenRoot(), + _FSTC(0.0), + _PLSTC(0.0), _hasWeight(hasWeight), _useBitVector(useBitVector), - _estimated_hits() + _counted_hits() { } @@ -70,17 +72,6 @@ PostingListSearchContext::lookupSingle() } } -size_t -PostingListSearchContext::estimated_hits_in_range() const -{ - if (_estimated_hits.has_value()) { - return _estimated_hits.value(); - } - size_t result = calc_estimated_hits_in_range(); - _estimated_hits = result; - return result; -} - template class PostingListSearchContextT<vespalib::btree::BTreeNoLeafData>; template class PostingListSearchContextT<int32_t>; template class PostingListFoldedSearchContextT<vespalib::btree::BTreeNoLeafData>; diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 562c15e94d5..91383bfe5f9 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -35,6 +35,10 @@ protected: using EntryRef = vespalib::datastore::EntryRef; using EnumIndex = IEnumStore::Index; + static constexpr long MIN_UNIQUE_VALUES_BEFORE_APPROXIMATION = 100; + static constexpr long MIN_UNIQUE_VALUES_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION = 20; + static constexpr long MIN_APPROXHITS_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION = 10; + const IEnumStoreDictionary& _dictionary; const ISearchContext& _baseSearchCtx; const BitVector* _bv; // bitvector if _useBitVector has been set @@ -47,9 +51,11 @@ protected: uint32_t _dictSize; EntryRef _pidx; EntryRef _frozenRoot; // Posting list in tree form + float _FSTC; // Filtering Search Time Constant + float _PLSTC; // Posting List Search Time Constant bool _hasWeight; bool _useBitVector; - mutable std::optional<size_t> _estimated_hits; // Snapshot of size of posting lists in range + mutable std::optional<size_t> _counted_hits; // Snapshot of size of posting lists in range PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_dictionary, uint32_t docIdLimit, uint64_t numValues, bool hasWeight, bool useBitVector, const ISearchContext &baseSearchCtx); @@ -59,18 +65,48 @@ protected: void lookupTerm(const vespalib::datastore::EntryComparator &comp); void lookupRange(const vespalib::datastore::EntryComparator &low, const vespalib::datastore::EntryComparator &high); void lookupSingle(); - size_t estimated_hits_in_range() const; virtual bool use_dictionary_entry(DictionaryConstIterator& it) const { (void) it; return true; } - virtual bool use_posting_lists_when_non_strict(const queryeval::ExecuteInfo& info) const = 0; - /** - * Calculates the estimated number of hits when _uniqueValues >= 2, - * by looking at the posting lists in the range [lower, upper>. - */ - virtual size_t calc_estimated_hits_in_range() const = 0; + float calculateFilteringCost() const { + // filtering search time (ms) ~ FSTC * numValues; (FSTC = + // Filtering Search Time Constant) + return _FSTC * _numValues; + } + + float calculatePostingListCost(uint32_t approxNumHits) const { + // search time (ms) ~ PLSTC * numHits * log(numHits); (PLSTC = + // Posting List Search Time Constant) + return _PLSTC * approxNumHits; + } + + uint32_t calculateApproxNumHits() const { + float docsPerUniqueValue = static_cast<float>(_docIdLimit) / + static_cast<float>(_dictSize); + return static_cast<uint32_t>(docsPerUniqueValue * _uniqueValues); + } + + virtual bool fallbackToFiltering() const { + if (_uniqueValues >= 2 && !_dictionary.get_has_btree_dictionary()) { + return true; // force filtering for range search + } + uint32_t numHits = calculateApproxNumHits(); + // numHits > 1000: make sure that posting lists are unit tested. + return (numHits > 1000) && + (calculateFilteringCost() < calculatePostingListCost(numHits)); + } + virtual bool use_posting_list_when_non_strict(const queryeval::ExecuteInfo&) const { + return false; + } + virtual bool fallback_to_approx_num_hits() const { + return ((_uniqueValues > MIN_UNIQUE_VALUES_BEFORE_APPROXIMATION) && + ((_uniqueValues * MIN_UNIQUE_VALUES_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION > static_cast<int>(_docIdLimit)) || + (calculateApproxNumHits() * MIN_APPROXHITS_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION > _docIdLimit) || + (_uniqueValues > MIN_UNIQUE_VALUES_BEFORE_APPROXIMATION*10))); + } + virtual size_t countHits() const = 0; virtual void fillArray() = 0; virtual void fillBitVector() = 0; }; @@ -100,6 +136,7 @@ protected: ~PostingListSearchContextT() override; void lookupSingle(); + size_t countHits() const override; void fillArray() override; void fillBitVector() override; @@ -129,6 +166,7 @@ protected: using DictionaryConstIterator = Dictionary::ConstIterator; using EntryRef = vespalib::datastore::EntryRef; using PostingList = typename Parent::PostingList; + using Parent::_counted_hits; using Parent::_docIdLimit; using Parent::_lowerDictItr; using Parent::_merger; @@ -146,7 +184,8 @@ protected: bool useBitVector, const ISearchContext &baseSearchCtx); ~PostingListFoldedSearchContextT() override; - size_t calc_estimated_hits_in_range() const override; + bool fallback_to_approx_num_hits() const override; + size_t countHits() const override; template <bool fill_array> void fill_array_or_bitvector_helper(EntryRef pidx); template <bool fill_array> @@ -178,13 +217,13 @@ private: using Parent = PostingSearchContext<BaseSC, PostingListFoldedSearchContextT<DataT>, AttrT>; using RegexpUtil = vespalib::RegexpUtil; using Parent::_enumStore; - // Note: Steps iterator one or more steps when not using dictionary entry + // Note: steps iterator one ore more steps when not using dictionary entry bool use_dictionary_entry(PostingListSearchContext::DictionaryConstIterator& it) const override; // Note: Uses copy of dictionary iterator to avoid stepping original. bool use_single_dictionary_entry(PostingListSearchContext::DictionaryConstIterator it) const { return use_dictionary_entry(it); } - bool use_posting_lists_when_non_strict(const queryeval::ExecuteInfo& info) const override; + bool use_posting_list_when_non_strict(const queryeval::ExecuteInfo&) const override; public: StringPostingSearchContext(BaseSC&& base_sc, bool useBitVector, const AttrT &toBeSearched); }; @@ -206,6 +245,11 @@ private: void getIterators(bool shouldApplyRangeLimit); bool valid() const override { return this->isValid(); } + bool fallbackToFiltering() const override { + return (this->getRangeLimit() != 0) + ? (this->_uniqueValues >= 2 && !this->_dictionary.get_has_btree_dictionary()) + : Parent::fallbackToFiltering(); + } unsigned int approximateHits() const override { const unsigned int estimate = PostingListSearchContextT<DataT>::approximateHits(); const unsigned int limit = std::abs(this->getRangeLimit()); @@ -225,9 +269,6 @@ private: } } - bool use_posting_lists_when_non_strict(const queryeval::ExecuteInfo& info) const override; - size_t calc_estimated_hits_in_range() const override; - public: NumericPostingSearchContext(BaseSC&& base_sc, const Params & params, const AttrT &toBeSearched); const Params ¶ms() const { return _params; } @@ -260,6 +301,14 @@ StringPostingSearchContext<BaseSC, AttrT, DataT>:: StringPostingSearchContext(BaseSC&& base_sc, bool useBitVector, const AttrT &toBeSearched) : Parent(std::move(base_sc), useBitVector, toBeSearched) { + // after benchmarking prefix search performance on single, array, and weighted set fast-aggregate string attributes + // with 1M values the following constant has been derived: + this->_FSTC = 0.000028; + + // after benchmarking prefix search performance on single, array, and weighted set fast-search string attributes + // with 1M values the following constant has been derived: + this->_PLSTC = 0.000000; + if (this->valid()) { if (this->isPrefix()) { auto comp = _enumStore.make_folded_comparator_prefix(this->queryTerm()->getTerm()); @@ -314,11 +363,11 @@ StringPostingSearchContext<BaseSC, AttrT, DataT>::use_dictionary_entry(PostingLi template <typename BaseSC, typename AttrT, typename DataT> bool -StringPostingSearchContext<BaseSC, AttrT, DataT>::use_posting_lists_when_non_strict(const queryeval::ExecuteInfo& info) const +StringPostingSearchContext<BaseSC, AttrT, DataT>::use_posting_list_when_non_strict(const queryeval::ExecuteInfo& info) const { if (this->isFuzzy()) { uint32_t exp_doc_hits = this->_docIdLimit * info.hitRate(); - constexpr uint32_t fuzzy_use_posting_lists_doc_limit = 10000; + constexpr uint32_t fuzzy_use_posting_list_doc_limit = 10000; /** * The above constant was derived after a query latency experiment with fuzzy matching * on 2M documents with a dictionary size of 292070. @@ -341,7 +390,7 @@ StringPostingSearchContext<BaseSC, AttrT, DataT>::use_posting_lists_when_non_str * is already performed at this point. * The only work remaining if returning true is merging the posting lists. */ - if (exp_doc_hits > fuzzy_use_posting_lists_doc_limit) { + if (exp_doc_hits > fuzzy_use_posting_list_doc_limit) { return true; } } @@ -354,6 +403,11 @@ NumericPostingSearchContext(BaseSC&& base_sc, const Params & params_in, const At : Parent(std::move(base_sc), params_in.useBitVector(), toBeSearched), _params(params_in) { + // after simplyfying the formula and simple benchmarking and thumbs in the air + // a ratio of 8 between numvalues and estimated number of hits has been found. + this->_FSTC = 1; + + this->_PLSTC = 8; if (valid()) { if (_low == _high) { auto comp = _enumStore.make_comparator(_low); @@ -401,68 +455,7 @@ getIterators(bool shouldApplyRangeLimit) } } -template <typename BaseSC, typename AttrT, typename DataT> -bool -NumericPostingSearchContext<BaseSC, AttrT, DataT>::use_posting_lists_when_non_strict(const queryeval::ExecuteInfo& info) const -{ - // The following initial constants are derived after running parts of - // the range search performance test with 10M documents on an Apple M1 Pro with 32 GB memory. - // This code was compiled with two different behaviors: - // 1) 'filter matching' (never use posting lists). - // 2) 'posting list matching' (always use posting lists). - // https://github.com/vespa-engine/system-test/tree/master/tests/performance/range_search - // - // The following test case was used to establish the baseline cost of producing different number of hits as cheap as possible: - // range_hits_ratio=[1, 10, 50, 100, 200, 500], values_in_range=1, fast_search=true, filter_hits_ratio=0. - // The 6 range queries end up using a single posting list that produces the following number of hits: [10k, 100k, 500k, 1M, 2M, 5M] - // Avg query latency (ms) results: [5.43, 8.56, 11.68, 14.68, 22.77, 42.88] - // - // Then the following test case was executed for both 1) 'filter matching' and 2) 'posting list matching': - // range_hits_ratio=[1, 10, 50, 100, 200, 500], values_in_range=100, fast_search=true, filter_hits_ratio=0. - // Avg query latency (ms) results: - // 1) 'filter matching': [47.52, 51.06, 59.68, 79.3, 118.7, 145.26] - // 2) 'posting list matching': [4.79, 11.6, 13.54, 20.24, 32.65, 67.28] - // - // For 1) 'filter matching' we use the result from range_hits_ratio=1 (10k hits) compared to the baseline - // to calculate the cost per document (in ns) to do filter matching: 1M*(47.52-5.43)/10M = 4.2 - // - // For 2) 'posting list matching' we use the results from range_hits_ratio=[50, 100, 200, 500] compared to the baseline - // to calculate the average cost per hit (in ns) when merging the 100 posting lists: - // 1M*(13.54-11.68)/500k = 3.7 - // 1M*(20.24-14.68)/1M = 5.6 - // 1M*(32.65-22.77)/2M = 4.9 - // 1M*(67.28-42.88)/5M = 4.9 - // - // The average is 4.8. - - constexpr float filtering_match_constant = 4.2; - constexpr float posting_list_merge_constant = 4.8; - - uint32_t exp_doc_hits = this->_docIdLimit * info.hitRate(); - float avg_values_per_document = static_cast<float>(this->_numValues) / static_cast<float>(this->_docIdLimit); - float filtering_cost = exp_doc_hits * avg_values_per_document * filtering_match_constant; - float posting_list_cost = this->estimated_hits_in_range() * posting_list_merge_constant; - return posting_list_cost < filtering_cost; -} -template <typename BaseSC, typename AttrT, typename DataT> -size_t -NumericPostingSearchContext<BaseSC, AttrT, DataT>::calc_estimated_hits_in_range() const -{ - size_t exact_sum = 0; - size_t estimated_sum = 0; - constexpr uint32_t max_posting_lists_to_count = 1000; - auto it = this->_lowerDictItr; - for (uint32_t count = 0; (it != this->_upperDictItr) && (count < max_posting_lists_to_count); ++it, ++count) { - exact_sum += this->_postingList.frozenSize(it.getData().load_acquire()); - } - if (it != this->_upperDictItr) { - uint32_t remaining_posting_lists = this->_upperDictItr - it; - float hits_per_posting_list = static_cast<float>(exact_sum) / static_cast<float>(max_posting_lists_to_count); - estimated_sum = remaining_posting_lists * hits_per_posting_list; - } - return exact_sum + estimated_sum; -} extern template class PostingListSearchContextT<vespalib::btree::BTreeNoLeafData>; extern template class PostingListSearchContextT<int32_t>; diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp index 9144a85d0e1..7c7b9117a30 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp @@ -58,6 +58,21 @@ PostingListSearchContextT<DataT>::lookupSingle() } template <typename DataT> +size_t +PostingListSearchContextT<DataT>::countHits() const +{ + if (_counted_hits.has_value()) { + return _counted_hits.value(); + } + size_t sum(0); + for (auto it(_lowerDictItr); it != _upperDictItr; ++it) { + sum += _postingList.frozenSize(it.getData().load_acquire()); + } + _counted_hits = sum; + return sum; +} + +template <typename DataT> void PostingListSearchContextT<DataT>::fillArray() { @@ -82,9 +97,9 @@ template <typename DataT> void PostingListSearchContextT<DataT>::fetchPostings(const queryeval::ExecuteInfo & execInfo) { - if (!_merger.merge_done() && _uniqueValues >= 2u && this->_dictionary.get_has_btree_dictionary()) { - if (execInfo.isStrict() || use_posting_lists_when_non_strict(execInfo)) { - size_t sum = estimated_hits_in_range(); + if (!_merger.merge_done() && _uniqueValues >= 2u) { + if ((execInfo.isStrict() || use_posting_list_when_non_strict(execInfo)) && !fallbackToFiltering()) { + size_t sum(countHits()); if (sum < _docIdLimit / 64) { _merger.reserveArray(_uniqueValues, sum); fillArray(); @@ -207,11 +222,18 @@ PostingListSearchContextT<DataT>::approximateHits() const } else if (_uniqueValues == 1u) { numHits = singleHits(); } else { - numHits = estimated_hits_in_range(); + if (this->fallbackToFiltering()) { + numHits = _docIdLimit; + } else if (this->fallback_to_approx_num_hits()) { + numHits = this->calculateApproxNumHits(); + } else { + numHits = countHits(); + } } return std::min(numHits, size_t(std::numeric_limits<uint32_t>::max())); } + template <typename DataT> void PostingListSearchContextT<DataT>::applyRangeLimit(int rangeLimit) @@ -251,10 +273,20 @@ template <typename DataT> PostingListFoldedSearchContextT<DataT>::~PostingListFoldedSearchContextT() = default; template <typename DataT> +bool +PostingListFoldedSearchContextT<DataT>::fallback_to_approx_num_hits() const +{ + return false; +} + +template <typename DataT> size_t -PostingListFoldedSearchContextT<DataT>::calc_estimated_hits_in_range() const +PostingListFoldedSearchContextT<DataT>::countHits() const { - size_t sum = 0; + if (_counted_hits.has_value()) { + return _counted_hits.value(); + } + size_t sum(0); bool overflow = false; for (auto it(_lowerDictItr); it != _upperDictItr;) { if (use_dictionary_entry(it)) { @@ -273,6 +305,7 @@ PostingListFoldedSearchContextT<DataT>::calc_estimated_hits_in_range() const ++it; } } + _counted_hits = sum; return sum; } |