diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-12-17 13:54:53 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2023-12-17 13:54:53 +0000 |
commit | 7251d807994bde066a1d05586d8596498657c55a (patch) | |
tree | 581f7c1eb4946b38ac24093a38850d3c5416e352 | |
parent | 19d714651b07bdce9f7b6bb0def2f77237a1471f (diff) |
- When estimating number of hits in range measure at both ends of range, as they might be very different.
- Reduce the effect of outliers in the measured range by considering the global rate.
-rw-r--r-- | searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h | 39 |
1 files changed, 28 insertions, 11 deletions
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 50c1e4d2498..83267dc4bfb 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -57,6 +57,13 @@ protected: ~PostingListSearchContext() override; + double avg_values_per_document() const noexcept { + return static_cast<double>(_numValues) / static_cast<double>(_docIdLimit); + } + double avg_postinglist_size() const noexcept { + return static_cast<double>(_numValues) / _dictSize; + } + void lookupTerm(const vespalib::datastore::EntryComparator &comp); void lookupRange(const vespalib::datastore::EntryComparator &low, const vespalib::datastore::EntryComparator &high); void lookupSingle(); @@ -347,12 +354,11 @@ NumericPostingSearchContext<BaseSC, AttrT, DataT>::use_posting_lists_when_non_st if ( ! info.create_postinglist_when_non_strict()) return false; - constexpr float lookup_match_constant = 5.0; - constexpr float posting_list_merge_constant = 1.0; + constexpr double lookup_match_constant = 5.0; + constexpr double posting_list_merge_constant = 1.0; uint32_t exp_doc_hits = this->_docIdLimit * info.hit_rate(); - float avg_values_per_document = static_cast<float>(this->_numValues) / static_cast<float>(this->_docIdLimit); - float lookup_match_cost = exp_doc_hits * avg_values_per_document * lookup_match_constant; + float lookup_match_cost = exp_doc_hits * this->avg_values_per_document() * lookup_match_constant; float posting_list_cost = this->estimated_hits_in_range() * posting_list_merge_constant; return posting_list_cost < lookup_match_cost; } @@ -364,14 +370,25 @@ NumericPostingSearchContext<BaseSC, AttrT, DataT>::calc_estimated_hits_in_range( size_t exact_sum = 0; size_t estimated_sum = 0; - auto it = this->_lowerDictItr; - for (uint32_t count = 0; (it != this->_upperDictItr) && (count < this->max_posting_lists_to_count); ++it, ++count) { - exact_sum += this->_posting_store.frozenSize(it.getData().load_acquire()); + // Sample lower range + auto it_forward = this->_lowerDictItr; + for (uint32_t count = 0; (it_forward != this->_upperDictItr) && (count < this->max_posting_lists_to_count); ++it_forward, ++count) { + exact_sum += this->_posting_store.frozenSize(it_forward.getData().load_acquire()); } - if (it != this->_upperDictItr) { - uint32_t remaining_posting_lists = this->_upperDictItr - it; - float hits_per_posting_list = static_cast<float>(exact_sum) / static_cast<float>(this->max_posting_lists_to_count); - estimated_sum = remaining_posting_lists * hits_per_posting_list; + if (it_forward != this->_upperDictItr) { + //Sample upper range + auto it_backward = this->_upperDictItr; + for (uint32_t count = 0; (it_backward != it_forward) && (count < this->max_posting_lists_to_count);++count) { + --it_backward; + exact_sum += this->_posting_store.frozenSize(it_backward.getData().load_acquire()); + } + if (it_forward != it_backward) { + // Estimate the rest + uint32_t remaining_posting_lists = it_backward - it_forward; + double measured_hits_per_posting_list = static_cast<double>(exact_sum) / (this->max_posting_lists_to_count * 2); + // Let measure and global rate count equally, to reduce the effect of outlayers. + estimated_sum = remaining_posting_lists * (measured_hits_per_posting_list + this->avg_postinglist_size())/2; + } } return exact_sum + estimated_sum; } |