Share code to calculate FlowStats based on attribute::HitEstimate.

This also changes to using the formulas in flow_tuning.h for lookup and btree costs.
author: Geir Storli <geirst@yahooinc.com> 2024-04-23 07:56:19 +0000
committer: Geir Storli <geirst@yahooinc.com> 2024-04-23 07:56:19 +0000
commit: 39fede2d5fe7e668f1baf306711e43726a6fee36 (patch)
tree: 1fcb406c186cb46aa7e6ef8d9f4e74e6071b2039 /searchlib
parent: e1165fba6e2b6d42fc71372567f886ddfdd4b998 (diff)
4 files changed, 69 insertions, 48 deletions
diff --git a/searchlib/src/vespa/searchcommon/attribute/hit_estimate_flow_stats_adapter.h b/searchlib/src/vespa/searchcommon/attribute/hit_estimate_flow_stats_adapter.h
new file mode 100644
index 00000000000..33031069d8e
--- /dev/null
+++ b/searchlib/src/vespa/searchcommon/attribute/hit_estimate_flow_stats_adapter.h
@@ -0,0 +1,33 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "hit_estimate.h"
+#include <vespa/searchlib/queryeval/blueprint.h>
+#include <vespa/searchlib/queryeval/flow_tuning.h>
+
+namespace search::attribute {
+
+/**
+ * Adapter used when calculating FlowStats based on HitEstimate per term.
+ */
+struct HitEstimateFlowStatsAdapter {
+    uint32_t docid_limit;
+    uint32_t num_indirections;
+    explicit HitEstimateFlowStatsAdapter(uint32_t docid_limit_in, uint32_t num_indirections_in) noexcept
+        : docid_limit(docid_limit_in), num_indirections(num_indirections_in) {}
+    double abs_to_rel_est(const HitEstimate& est) const noexcept {
+        return queryeval::Blueprint::abs_to_rel_est(est.est_hits(), docid_limit);
+    }
+    double estimate(const HitEstimate& est) const noexcept {
+        return est.is_unknown() ? 0.5 : abs_to_rel_est(est);
+    }
+    double cost(const HitEstimate& est) const noexcept {
+        return est.is_unknown() ? queryeval::flow::lookup_cost(num_indirections) : queryeval::flow::btree_cost(abs_to_rel_est(est));
+    }
+    double strict_cost(const HitEstimate &est) const noexcept {
+        return est.is_unknown() ? queryeval::flow::lookup_strict_cost(num_indirections) : queryeval::flow::btree_strict_cost(abs_to_rel_est(est));
+    }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
index cafff1d25cd..5b17b491a20 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
@@ -13,6 +13,7 @@
 #include "predicate_attribute.h"
 #include <vespa/eval/eval/value.h>
 #include <vespa/searchcommon/attribute/config.h>
+#include <vespa/searchcommon/attribute/hit_estimate_flow_stats_adapter.h>
 #include <vespa/searchlib/common/location.h>
 #include <vespa/searchlib/common/locationiterators.h>
 #include <vespa/searchlib/query/query_term_decoder.h>
@@ -25,23 +26,23 @@
 #include <vespa/searchlib/queryeval/emptysearch.h>
 #include <vespa/searchlib/queryeval/field_spec.hpp>
 #include <vespa/searchlib/queryeval/filter_wrapper.h>
+#include <vespa/searchlib/queryeval/flow_tuning.h>
 #include <vespa/searchlib/queryeval/get_weight_from_node.h>
 #include <vespa/searchlib/queryeval/intermediate_blueprints.h>
+#include <vespa/searchlib/queryeval/irequestcontext.h>
 #include <vespa/searchlib/queryeval/leaf_blueprints.h>
 #include <vespa/searchlib/queryeval/nearest_neighbor_blueprint.h>
 #include <vespa/searchlib/queryeval/orlikesearch.h>
-#include <vespa/searchlib/queryeval/flow_tuning.h>
 #include <vespa/searchlib/queryeval/predicate_blueprint.h>
 #include <vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.h>
 #include <vespa/searchlib/queryeval/wand/parallel_weak_and_search.h>
 #include <vespa/searchlib/queryeval/weighted_set_term_blueprint.h>
 #include <vespa/searchlib/queryeval/weighted_set_term_search.h>
-#include <vespa/searchlib/queryeval/irequestcontext.h>
 #include <vespa/searchlib/tensor/dense_tensor_attribute.h>
-#include <vespa/vespalib/util/regexp.h>
-#include <vespa/vespalib/util/stringfmt.h>
 #include <vespa/vespalib/util/exceptions.h>
 #include <vespa/vespalib/util/issue.h>
+#include <vespa/vespalib/util/regexp.h>
+#include <vespa/vespalib/util/stringfmt.h>
 #include <charconv>
 
 #include <vespa/log/log.h>
@@ -93,6 +94,7 @@ using search::queryeval::StrictHeapOrSearch;
 using search::queryeval::WeightedSetTermBlueprint;
 using search::queryeval::flow::btree_cost;
 using search::queryeval::flow::btree_strict_cost;
+using search::queryeval::flow::get_num_indirections;
 using search::queryeval::flow::lookup_cost;
 using search::queryeval::flow::lookup_strict_cost;
 using search::tensor::DenseTensorAttribute;
@@ -123,19 +125,6 @@ private:
 };
 //-----------------------------------------------------------------------------
 
-size_t
-get_num_indirections(const BasicType& basic_type, const CollectionType& col_type)
-{
-    size_t res = 0;
-    if (basic_type == BasicType::STRING) {
-        res += 1;
-    }
-    if (col_type != CollectionType::SINGLE) {
-        res += 1;
-    }
-    return res;
-}
-
 /**
  * Blueprint for creating regular, stack-based attribute iterators.
  **/
@@ -293,20 +282,11 @@ public:
     }
     queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override {
         using OrFlow = search::queryeval::OrFlow;
-        struct MyAdapter {
-            uint32_t docid_limit;
-            explicit MyAdapter(uint32_t docid_limit_in) noexcept : docid_limit(docid_limit_in) {}
-            double estimate(const AttrHitEstimate &est) const noexcept {
-                return est.is_unknown() ? 0.5 : abs_to_rel_est(est.est_hits(), docid_limit);
-            }
-            double cost(const AttrHitEstimate &) const noexcept { return 1.0; }
-            double strict_cost(const AttrHitEstimate &est) const noexcept {
-                return est.is_unknown() ? 1.0 : abs_to_rel_est(est.est_hits(), docid_limit);
-            }
-        };
-        double est = OrFlow::estimate_of(MyAdapter(docid_limit), _estimates);
-        return {est, OrFlow::cost_of(MyAdapter(docid_limit), _estimates, false),
-                OrFlow::cost_of(MyAdapter(docid_limit), _estimates, true) + queryeval::flow::heap_cost(est, _estimates.size())};
+        using MyAdapter = attribute::HitEstimateFlowStatsAdapter;
+        size_t indirections = queryeval::flow::get_num_indirections(_attribute.getBasicType(), _attribute.getCollectionType());
+        double est = OrFlow::estimate_of(MyAdapter(docid_limit, indirections), _estimates);
+        return {est, OrFlow::cost_of(MyAdapter(docid_limit, indirections), _estimates, false),
+                OrFlow::cost_of(MyAdapter(docid_limit, indirections), _estimates, true) + queryeval::flow::heap_cost(est, _estimates.size())};
     }
 
     SearchIterator::UP
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp
index 2ad67c85429..928023c0f94 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp
@@ -2,20 +2,19 @@
 
 #include "attribute_weighted_set_blueprint.h"
 #include "multi_term_hash_filter.hpp"
+#include <vespa/searchcommon/attribute/hit_estimate_flow_stats_adapter.h>
 #include <vespa/searchcommon/attribute/i_search_context.h>
-#include <vespa/searchlib/common/bitvector.h>
 #include <vespa/searchlib/fef/matchdatalayout.h>
 #include <vespa/searchlib/query/query_term_ucs4.h>
 #include <vespa/searchlib/queryeval/filter_wrapper.h>
-#include <vespa/searchlib/queryeval/orsearch.h>
 #include <vespa/searchlib/queryeval/flow.h>
 #include <vespa/searchlib/queryeval/flow_tuning.h>
+#include <vespa/searchlib/queryeval/orsearch.h>
 #include <vespa/searchlib/queryeval/weighted_set_term_search.h>
 #include <vespa/vespalib/objects/visit.h>
 #include <vespa/vespalib/stllike/hash_map.hpp>
 #include <vespa/vespalib/util/stringfmt.h>
 
-
 namespace search {
 
 namespace {
@@ -130,20 +129,11 @@ AttributeWeightedSetBlueprint::sort(queryeval::InFlow in_flow)
 queryeval::FlowStats
 AttributeWeightedSetBlueprint::calculate_flow_stats(uint32_t docid_limit) const
 {
-    struct MyAdapter {
-        uint32_t docid_limit;
-        MyAdapter(uint32_t docid_limit_in) noexcept : docid_limit(docid_limit_in) {}
-        double estimate(const AttrHitEstimate &est) const noexcept {
-            return est.is_unknown() ? 0.5 : abs_to_rel_est(est.est_hits(), docid_limit);
-        }
-        double cost(const AttrHitEstimate &) const noexcept { return 1.0; }
-        double strict_cost(const AttrHitEstimate &est) const noexcept {
-            return est.is_unknown() ? 1.0 : abs_to_rel_est(est.est_hits(), docid_limit);
-        }
-    };
-    double est = OrFlow::estimate_of(MyAdapter(docid_limit), _estimates);
-    return {est, OrFlow::cost_of(MyAdapter(docid_limit), _estimates, false),
-            OrFlow::cost_of(MyAdapter(docid_limit), _estimates, true) + queryeval::flow::heap_cost(est, _estimates.size())};
+    using MyAdapter = attribute::HitEstimateFlowStatsAdapter;
+    size_t num_indirections = queryeval::flow::get_num_indirections(_attr.getBasicType(), _attr.getCollectionType());
+    double est = OrFlow::estimate_of(MyAdapter(docid_limit, num_indirections), _estimates);
+    return {est, OrFlow::cost_of(MyAdapter(docid_limit, num_indirections), _estimates, false),
+            OrFlow::cost_of(MyAdapter(docid_limit, num_indirections), _estimates, true) + queryeval::flow::heap_cost(est, _estimates.size())};
 }
 
 queryeval::SearchIterator::UP
diff --git a/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h b/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h
index e8a58bba9dc..6389de35264 100644
--- a/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h
+++ b/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h
@@ -1,6 +1,8 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #pragma once
+#include <vespa/searchcommon/attribute/basictype.h>
+#include <vespa/searchcommon/attribute/collectiontype.h>
 #include <cmath>
 #include <cstddef>
 
@@ -41,6 +43,22 @@ inline double heap_cost(double my_est, size_t num_children) {
     return my_est * std::log2(std::max(size_t(1), num_children));
 }
 
+/**
+ * Returns the number of memory indirections needed when doing lookups
+ * in an attribute with the given type.
+ */
+inline size_t get_num_indirections(const attribute::BasicType& basic_type,
+                                   const attribute::CollectionType& col_type) {
+    size_t res = 0;
+    if (basic_type == attribute::BasicType::STRING) {
+        res += 1;
+    }
+    if (col_type != attribute::CollectionType::SINGLE) {
+        res += 1;
+    }
+    return res;
+}
+
 // Non-strict cost of lookup based matching in an attribute (not fast-search).
 // Test used: IteratorBenchmark::analyze_term_search_in_attributes_non_strict
 inline double lookup_cost(size_t num_indirections) {
author	Geir Storli <geirst@yahooinc.com>	2024-04-23 07:56:19 +0000
committer	Geir Storli <geirst@yahooinc.com>	2024-04-23 07:56:19 +0000
commit	39fede2d5fe7e668f1baf306711e43726a6fee36 (patch)
tree	1fcb406c186cb46aa7e6ef8d9f4e74e6071b2039 /searchlib
parent	e1165fba6e2b6d42fc71372567f886ddfdd4b998 (diff)