diff options
author | Geir Storli <geirst@yahooinc.com> | 2024-02-14 10:31:36 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-14 10:31:36 +0100 |
commit | 4afe7452be17bcbfb388cf5f3588b29ed30de4ee (patch) | |
tree | 867d4951f6da70f11bdeeb2db4e9efaf56ee7595 | |
parent | 70b306992a38d92d478822aabca0cc7a264e364f (diff) | |
parent | 9b204f9727d97cbf2c0de8cf90b65b382b328d18 (diff) |
Merge pull request #30263 from vespa-engine/havardpe/baseline-flow-stats-for-complex-leafs
baseline flow stats for complex leafs
13 files changed, 141 insertions, 24 deletions
diff --git a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp index 510c6be4bf3..ee7c201f093 100644 --- a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp +++ b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp @@ -1278,6 +1278,10 @@ TEST("require that OR blueprint use saturated sum as estimate") { TEST_DO(verify_or_est({{100, false},{300, false},{200, false}}, {300, false})); } +std::vector<FlowStats> child_stats({{0.2, 1.1, 0.2*1.1}, + {0.3, 1.2, 0.3*1.2}, + {0.5, 1.3, 1.3}}); + void verify_relative_estimate(make &&mk, double expect) { EXPECT_EQUAL(mk.making->estimate(), 0.0); Blueprint::UP bp = std::move(mk).leafs({200,300,950}); @@ -1316,8 +1320,10 @@ TEST("relative estimate for ONEAR") { } TEST("relative estimate for WEAKAND") { - verify_relative_estimate(make::WEAKAND(1000), 1.0-0.8*0.7*0.5); - verify_relative_estimate(make::WEAKAND(50), 0.05); + double est1 = (Blueprint::abs_to_rel_est(1000, 1000) + OrFlow::estimate_of(child_stats)) / 2.0; + double est2 = (Blueprint::abs_to_rel_est(50, 1000) + OrFlow::estimate_of(child_stats)) / 2.0; + verify_relative_estimate(make::WEAKAND(1000), est1); + verify_relative_estimate(make::WEAKAND(50), est2); } void verify_cost(make &&mk, double expect, double expect_strict) { @@ -1333,10 +1339,6 @@ void verify_cost(make &&mk, double expect, double expect_strict) { EXPECT_EQUAL(bp->strict_cost(), expect_strict); } -std::vector<FlowStats> child_stats({{0.2, 1.1, 0.2*1.1}, - {0.3, 1.2, 0.3*1.2}, - {0.5, 1.3, 1.3}}); - TEST("cost for OR") { verify_cost(make::OR(), OrFlow::cost_of(child_stats, false), @@ -1377,9 +1379,10 @@ TEST("cost for ONEAR") { } TEST("cost for WEAKAND") { + double est = (Blueprint::abs_to_rel_est(1000, 1000) + OrFlow::estimate_of(child_stats)) / 2.0; verify_cost(make::WEAKAND(1000), OrFlow::cost_of(child_stats, false), - OrFlow::cost_of(child_stats, true)); + OrFlow::cost_of(child_stats, true) + flow::heap_cost(est, 3)); } TEST_MAIN() { TEST_DEBUG("lhs.out", "rhs.out"); TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp index f358ee56dd4..8de8f6247c9 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp @@ -31,6 +31,7 @@ #include <vespa/searchlib/queryeval/matching_elements_search.h> #include <vespa/searchlib/queryeval/nearest_neighbor_blueprint.h> #include <vespa/searchlib/queryeval/orlikesearch.h> +#include <vespa/searchlib/queryeval/flow_tuning.h> #include <vespa/searchlib/queryeval/predicate_blueprint.h> #include <vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.h> #include <vespa/searchlib/queryeval/wand/parallel_weak_and_search.h> @@ -229,8 +230,10 @@ struct LocationPreFilterIterator : public OrLikeSearch<is_strict, NoUnpack> class LocationPreFilterBlueprint : public ComplexLeafBlueprint { private: + using AttrHitEstimate = attribute::HitEstimate; const IAttributeVector &_attribute; std::vector<ISearchContext::UP> _rangeSearches; + std::vector<AttrHitEstimate> _estimates; bool _should_use; public: @@ -238,6 +241,7 @@ public: : ComplexLeafBlueprint(field), _attribute(attribute), _rangeSearches(), + _estimates(), _should_use(false) { uint64_t estHits(0); @@ -247,7 +251,8 @@ public: query::SimpleRangeTerm rt(qr, "", 0, query::Weight(0)); string stack(StackDumpCreator::create(rt)); _rangeSearches.push_back(attr.createSearchContext(QueryTermDecoder::decodeTerm(stack), scParams)); - estHits += _rangeSearches.back()->calc_hit_estimate().est_hits(); + _estimates.push_back(_rangeSearches.back()->calc_hit_estimate()); + estHits += _estimates.back().est_hits(); LOG(debug, "Range '%s' estHits %" PRId64, qr.getRangeString().c_str(), estHits); } if (estHits > attr.getNumDocs()) { @@ -266,9 +271,23 @@ public: bool should_use() const { return _should_use; } queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override { - return default_flow_stats(docid_limit, getState().estimate().estHits, _rangeSearches.size()); + using OrFlow = search::queryeval::OrFlow; + struct MyAdapter { + uint32_t docid_limit; + MyAdapter(uint32_t docid_limit_in) noexcept : docid_limit(docid_limit_in) {} + double estimate(const AttrHitEstimate &est) const noexcept { + return est.is_unknown() ? 0.5 : abs_to_rel_est(est.est_hits(), docid_limit); + } + double cost(const AttrHitEstimate &) const noexcept { return 1.0; } + double strict_cost(const AttrHitEstimate &est) const noexcept { + return est.is_unknown() ? 1.0 : abs_to_rel_est(est.est_hits(), docid_limit); + } + }; + double est = OrFlow::estimate_of(MyAdapter(docid_limit), _estimates); + return {est, OrFlow::cost_of(MyAdapter(docid_limit), _estimates, false), + OrFlow::cost_of(MyAdapter(docid_limit), _estimates, true) + queryeval::flow::array_cost(est, _estimates.size())}; } - + SearchIterator::UP createLeafSearch(const TermFieldMatchDataArray &tfmda, bool strict) const override { @@ -458,7 +477,23 @@ public: } queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override { - return default_flow_stats(docid_limit, getState().estimate().estHits, _terms.size()); + using OrFlow = search::queryeval::OrFlow; + struct MyAdapter { + uint32_t docid_limit; + MyAdapter(uint32_t docid_limit_in) noexcept : docid_limit(docid_limit_in) {} + double estimate(const IDirectPostingStore::LookupResult &term) const noexcept { + return abs_to_rel_est(term.posting_size, docid_limit); + } + double cost(const IDirectPostingStore::LookupResult &) const noexcept { return 1.0; } + double strict_cost(const IDirectPostingStore::LookupResult &term) const noexcept { + return abs_to_rel_est(term.posting_size, docid_limit); + } + }; + double child_est = OrFlow::estimate_of(MyAdapter(docid_limit), _terms); + double my_est = abs_to_rel_est(_scores.getScoresToTrack(), docid_limit); + double est = (child_est + my_est) / 2.0; + return {est, OrFlow::cost_of(MyAdapter(docid_limit), _terms, false), + OrFlow::cost_of(MyAdapter(docid_limit), _terms, true) + queryeval::flow::heap_cost(est, _terms.size())}; } SearchIterator::UP createLeafSearch(const TermFieldMatchDataArray &tfmda, bool strict) const override { diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp index ede2ecd0ad5..18748641ca4 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp @@ -8,6 +8,8 @@ #include <vespa/searchlib/query/query_term_ucs4.h> #include <vespa/searchlib/queryeval/filter_wrapper.h> #include <vespa/searchlib/queryeval/orsearch.h> +#include <vespa/searchlib/queryeval/flow.h> +#include <vespa/searchlib/queryeval/flow_tuning.h> #include <vespa/searchlib/queryeval/weighted_set_term_search.h> #include <vespa/vespalib/objects/visit.h> #include <vespa/vespalib/stllike/hash_map.hpp> @@ -20,6 +22,7 @@ namespace { using attribute::ISearchContext; using attribute::IAttributeVector; +using queryeval::OrFlow; class AttrWrapper { @@ -94,7 +97,8 @@ AttributeWeightedSetBlueprint::AttributeWeightedSetBlueprint(const queryeval::Fi _estHits(0), _weights(), _attr(attr), - _contexts() + _contexts(), + _estimates() { set_allow_termwise_eval(true); } @@ -110,7 +114,8 @@ AttributeWeightedSetBlueprint::~AttributeWeightedSetBlueprint() void AttributeWeightedSetBlueprint::addToken(std::unique_ptr<ISearchContext> context, int32_t weight) { - _estHits = std::min(_estHits + context->calc_hit_estimate().est_hits(), _numDocs); + _estimates.push_back(context->calc_hit_estimate()); + _estHits = std::min(_estHits + _estimates.back().est_hits(), _numDocs); setEstimate(HitEstimate(_estHits, (_estHits == 0))); _weights.push_back(weight); _contexts.push_back(context.release()); @@ -119,7 +124,20 @@ AttributeWeightedSetBlueprint::addToken(std::unique_ptr<ISearchContext> context, queryeval::FlowStats AttributeWeightedSetBlueprint::calculate_flow_stats(uint32_t docid_limit) const { - return default_flow_stats(docid_limit, _estHits, _weights.size()); + struct MyAdapter { + uint32_t docid_limit; + MyAdapter(uint32_t docid_limit_in) noexcept : docid_limit(docid_limit_in) {} + double estimate(const AttrHitEstimate &est) const noexcept { + return est.is_unknown() ? 0.5 : abs_to_rel_est(est.est_hits(), docid_limit); + } + double cost(const AttrHitEstimate &) const noexcept { return 1.0; } + double strict_cost(const AttrHitEstimate &est) const noexcept { + return est.is_unknown() ? 1.0 : abs_to_rel_est(est.est_hits(), docid_limit); + } + }; + double est = OrFlow::estimate_of(MyAdapter(docid_limit), _estimates); + return {est, OrFlow::cost_of(MyAdapter(docid_limit), _estimates, false), + OrFlow::cost_of(MyAdapter(docid_limit), _estimates, true) + queryeval::flow::heap_cost(est, _estimates.size())}; } queryeval::SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.h b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.h index 16319654024..32632403e42 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.h @@ -5,6 +5,7 @@ #include "attributeguard.h" #include <vespa/searchlib/queryeval/blueprint.h> #include <vespa/searchlib/queryeval/searchiterator.h> +#include <vespa/searchcommon/attribute/hit_estimate.h> #include <vector> namespace search { @@ -16,12 +17,14 @@ class AttributeWeightedSetBlueprint : public queryeval::ComplexLeafBlueprint private: using ISearchContext = attribute::ISearchContext; using IAttributeVector = attribute::IAttributeVector; + using AttrHitEstimate = attribute::HitEstimate; size_t _numDocs; size_t _estHits; std::vector<int32_t> _weights; const IAttributeVector & _attr; std::vector<ISearchContext*> _contexts; - + std::vector<AttrHitEstimate> _estimates; + public: AttributeWeightedSetBlueprint(const AttributeWeightedSetBlueprint &) = delete; AttributeWeightedSetBlueprint &operator=(const AttributeWeightedSetBlueprint &) = delete; diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h index e0b206dbdd9..076c375091a 100644 --- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h +++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h @@ -8,6 +8,7 @@ #include <vespa/searchlib/common/matching_elements_fields.h> #include <vespa/searchlib/fef/termfieldmatchdataarray.h> #include <vespa/searchlib/queryeval/blueprint.h> +#include <vespa/searchlib/queryeval/flow_tuning.h> #include <vespa/searchlib/queryeval/field_spec.h> #include <vespa/searchlib/queryeval/matching_elements_search.h> #include <variant> @@ -72,7 +73,21 @@ public: } queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override { - return default_flow_stats(docid_limit, getState().estimate().estHits, _terms.size()); + using OrFlow = search::queryeval::OrFlow; + struct MyAdapter { + uint32_t docid_limit; + MyAdapter(uint32_t docid_limit_in) noexcept : docid_limit(docid_limit_in) {} + double estimate(const IDirectPostingStore::LookupResult &term) const noexcept { + return abs_to_rel_est(term.posting_size, docid_limit); + } + double cost(const IDirectPostingStore::LookupResult &) const noexcept { return 1.0; } + double strict_cost(const IDirectPostingStore::LookupResult &term) const noexcept { + return abs_to_rel_est(term.posting_size, docid_limit); + } + }; + double est = OrFlow::estimate_of(MyAdapter(docid_limit), _terms); + return {est, OrFlow::cost_of(MyAdapter(docid_limit), _terms, false), + OrFlow::cost_of(MyAdapter(docid_limit), _terms, true) + queryeval::flow::heap_cost(est, _terms.size())}; } std::unique_ptr<queryeval::SearchIterator> createLeafSearch(const fef::TermFieldMatchDataArray &tfmda, bool) const override; diff --git a/searchlib/src/vespa/searchlib/queryeval/dot_product_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/dot_product_blueprint.cpp index 3b8975f9883..7cd00d02bb3 100644 --- a/searchlib/src/vespa/searchlib/queryeval/dot_product_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/dot_product_blueprint.cpp @@ -2,6 +2,7 @@ #include "dot_product_blueprint.h" #include "dot_product_search.h" +#include "flow_tuning.h" #include "field_spec.hpp" #include <vespa/vespalib/objects/visit.hpp> @@ -41,7 +42,12 @@ DotProductBlueprint::addTerm(Blueprint::UP term, int32_t weight, HitEstimate & e FlowStats DotProductBlueprint::calculate_flow_stats(uint32_t docid_limit) const { - return default_flow_stats(docid_limit, getState().estimate().estHits, _terms.size()); + for (auto &term: _terms) { + term->update_flow_stats(docid_limit); + } + double est = OrFlow::estimate_of(_terms); + return {est, OrFlow::cost_of(_terms, false), + OrFlow::cost_of(_terms, true) + flow::heap_cost(est, _terms.size())}; } SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/queryeval/equiv_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/equiv_blueprint.cpp index 84745fdd0bd..27624abf515 100644 --- a/searchlib/src/vespa/searchlib/queryeval/equiv_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/equiv_blueprint.cpp @@ -3,6 +3,7 @@ #include "equiv_blueprint.h" #include "equivsearch.h" #include "field_spec.hpp" +#include "flow_tuning.h" #include <vespa/vespalib/objects/visit.hpp> #include <vespa/vespalib/stllike/hash_map.hpp> @@ -55,7 +56,12 @@ EquivBlueprint::~EquivBlueprint() = default; FlowStats EquivBlueprint::calculate_flow_stats(uint32_t docid_limit) const { - return default_flow_stats(docid_limit, _estimate.estHits, _terms.size()); + for (auto &term: _terms) { + term->update_flow_stats(docid_limit); + } + double est = OrFlow::estimate_of(_terms); + return {est, OrFlow::cost_of(_terms, false), + OrFlow::cost_of(_terms, true) + flow::array_cost(est, _terms.size())}; } SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h b/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h index ab0381fa12c..491f0ad5571 100644 --- a/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h +++ b/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h @@ -10,4 +10,8 @@ inline double heap_cost(double my_est, size_t num_children) { return my_est * std::log2(std::max(size_t(1),num_children)); } +inline double array_cost(double my_est, size_t num_children) { + return my_est * num_children; +} + } diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp index 938044c0c19..993639becf2 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp @@ -421,9 +421,10 @@ FlowStats WeakAndBlueprint::calculate_flow_stats(uint32_t docid_limit) const { double child_est = OrFlow::estimate_of(get_children()); double my_est = abs_to_rel_est(_n, docid_limit); - return {std::min(my_est, child_est), + double est = (child_est + my_est) / 2.0; + return {est, OrFlow::cost_of(get_children(), false), - OrFlow::cost_of(get_children(), true)}; + OrFlow::cost_of(get_children(), true) + flow::heap_cost(est, get_children().size())}; } Blueprint::HitEstimate diff --git a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp index 3d4f917aeac..628020cfea2 100644 --- a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp @@ -47,7 +47,13 @@ SameElementBlueprint::addTerm(Blueprint::UP term) FlowStats SameElementBlueprint::calculate_flow_stats(uint32_t docid_limit) const { - return default_flow_stats(docid_limit, _estimate.estHits, _terms.size()); + for (auto &term: _terms) { + term->update_flow_stats(docid_limit); + } + double est = AndFlow::estimate_of(_terms); + return {est, + AndFlow::cost_of(_terms, false) + est * _terms.size(), + AndFlow::cost_of(_terms, true) + est * _terms.size()}; } void diff --git a/searchlib/src/vespa/searchlib/queryeval/simple_phrase_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/simple_phrase_blueprint.cpp index ace7d12c32b..953e7350074 100644 --- a/searchlib/src/vespa/searchlib/queryeval/simple_phrase_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/simple_phrase_blueprint.cpp @@ -48,7 +48,13 @@ SimplePhraseBlueprint::addTerm(Blueprint::UP term) FlowStats SimplePhraseBlueprint::calculate_flow_stats(uint32_t docid_limit) const { - return default_flow_stats(docid_limit, _estimate.estHits, _terms.size()); + for (auto &term: _terms) { + term->update_flow_stats(docid_limit); + } + double est = AndFlow::estimate_of(_terms); + return {est, + AndFlow::cost_of(_terms, false) + est * _terms.size(), + AndFlow::cost_of(_terms, true) + est * _terms.size()}; } SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp index 4e400b3c055..78fe3882aab 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp @@ -5,6 +5,7 @@ #include "parallel_weak_and_search.h" #include <vespa/searchlib/queryeval/field_spec.hpp> #include <vespa/searchlib/queryeval/searchiterator.h> +#include <vespa/searchlib/queryeval/flow_tuning.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> #include <vespa/vespalib/objects/visit.hpp> #include <algorithm> @@ -68,7 +69,14 @@ ParallelWeakAndBlueprint::addTerm(Blueprint::UP term, int32_t weight, HitEstimat FlowStats ParallelWeakAndBlueprint::calculate_flow_stats(uint32_t docid_limit) const { - return default_flow_stats(docid_limit, getState().estimate().estHits, _terms.size()); + for (auto &term: _terms) { + term->update_flow_stats(docid_limit); + } + double child_est = OrFlow::estimate_of(_terms); + double my_est = abs_to_rel_est(_scores.getScoresToTrack(), docid_limit); + double est = (child_est + my_est) / 2.0; + return {est, OrFlow::cost_of(_terms, false), + OrFlow::cost_of(_terms, true) + flow::heap_cost(est, _terms.size())}; } SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_blueprint.cpp index b3bbec8428f..8bf2dd53470 100644 --- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_blueprint.cpp @@ -4,6 +4,7 @@ #include "weighted_set_term_search.h" #include "orsearch.h" #include "matching_elements_search.h" +#include "flow_tuning.h" #include <vespa/searchlib/common/matching_elements.h> #include <vespa/searchlib/common/matching_elements_fields.h> #include <vespa/vespalib/objects/visit.hpp> @@ -95,7 +96,12 @@ WeightedSetTermBlueprint::addTerm(Blueprint::UP term, int32_t weight, HitEstimat FlowStats WeightedSetTermBlueprint::calculate_flow_stats(uint32_t docid_limit) const { - return default_flow_stats(docid_limit, getState().estimate().estHits, _terms.size()); + for (auto &term: _terms) { + term->update_flow_stats(docid_limit); + } + double est = OrFlow::estimate_of(_terms); + return {est, OrFlow::cost_of(_terms, false), + OrFlow::cost_of(_terms, true) + flow::heap_cost(est, _terms.size())}; } SearchIterator::UP |