find crossover

author: Håvard Pettersen <havardpe@yahooinc.com> 2024-03-21 11:18:09 +0000
committer: Håvard Pettersen <havardpe@yahooinc.com> 2024-03-25 10:38:12 +0000
commit: a0b9fe6bb5b4a64bc3ce65074f923ba9a2bc4940 (patch)
tree: 53a68dceca538bb664f16b8d905eb666cd8c28c4
parent: c3be0b5826152973ec422d32558ba41a1dc6311d (diff)
2 files changed, 151 insertions, 4 deletions
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h
index 1459cbfe856..d3e529fcd65 100644
--- a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h
@@ -24,4 +24,3 @@ make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op,
                        double op_hit_ratio, uint32_t children);
 
 }
-
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
index 5b061697220..b08fde50d7c 100644
--- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
@@ -6,6 +6,7 @@
 #include <vespa/searchlib/queryeval/blueprint.h>
 #include <vespa/vespalib/gtest/gtest.h>
 #include <vespa/vespalib/util/benchmark_timer.h>
+#include <vespa/vespalib/util/stringfmt.h>
 #include <cmath>
 #include <numeric>
 #include <vector>
@@ -19,6 +20,8 @@ using namespace vespalib;
 
 using search::index::Schema;
 
+using vespalib::make_string_short::fmt;
+
 const vespalib::string field_name = "myfield";
 double budget_sec = 1.0;
 
@@ -43,10 +46,12 @@ struct BenchmarkResult {
           iterator_name(iterator_name_in),
           blueprint_name(blueprint_name_in)
     {}
+    ~BenchmarkResult();
     double ns_per_seek() const { return (time_ms / seeks) * 1000.0 * 1000.0; }
     double ms_per_actual_cost() const { return (time_ms / actual_cost); }
     double ms_per_alt_cost() const { return (time_ms / alt_cost); }
 };
+BenchmarkResult::~BenchmarkResult() = default;
 
 struct Stats {
     double average;
@@ -176,11 +181,11 @@ struct MatchLoopContext {
 MatchLoopContext::~MatchLoopContext() = default;
 
 MatchLoopContext
-make_match_loop_context(BenchmarkBlueprintFactory& factory, bool strict, uint32_t docid_limit)
+make_match_loop_context(BenchmarkBlueprintFactory& factory, InFlow in_flow, uint32_t docid_limit)
 {
     auto blueprint = factory.make_blueprint();
     assert(blueprint);
-    blueprint->basic_plan(strict, docid_limit);
+    blueprint->basic_plan(in_flow, docid_limit);
     blueprint->fetchPostings(ExecuteInfo::FULL);
     // Note: All blueprints get the same TermFieldMatchData instance.
     //       This is OK as long as we don't do unpacking and only use 1 thread.
@@ -232,7 +237,7 @@ non_strict_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit, doub
     uint32_t docid_skip = 1.0 / filter_hit_ratio;
     MatchLoopContext ctx;
     while (timer.has_budget()) {
-        ctx = make_match_loop_context(factory, force_strict, docid_limit);
+        ctx = make_match_loop_context(factory, InFlow(force_strict, filter_hit_ratio), docid_limit);
         auto* itr = ctx.iterator.get();
         timer.before();
         seeks = 0;
@@ -274,6 +279,131 @@ benchmark_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit, bool
     }
 }
 
+//-----------------------------------------------------------------------------
+
+double est_forced_strict_cost(double estimate, double strict_cost, double rate) {
+    return (rate - estimate) * 0.2 + strict_cost;
+}
+
+struct Sample {
+    double value;
+    double other;
+    bool forced_strict;
+    Sample(double v) noexcept : value(v), other(v), forced_strict(false) {}
+    Sample(double v, double o, bool fs) noexcept : value(v), other(o), forced_strict(fs) {}
+    bool operator<(const Sample &rhs) const noexcept { return value < rhs.value; }
+    vespalib::string str() const noexcept {
+        if (other == value) {
+            return fmt("%g", value);
+        }
+        auto fs = [](bool forced){ return forced ? "_FS" : ""; };
+        return fmt("%g%s(%g%s)", value, fs(forced_strict), other, fs(!forced_strict));
+    }
+};
+
+double find_crossover(const char *type, const auto &calculate_at, double delta) {
+    double min = delta;
+    double max = 1.0;
+    fprintf(stderr, "looking for %s crossover in the range [%g, %g]...\n", type, min, max);
+    auto at_min = calculate_at(min);
+    auto at_max = calculate_at(max);
+    fprintf(stderr, "  before: [%s, %s], after: [%s, %s]\n",
+            at_min.first.str().c_str(), at_max.first.str().c_str(),
+            at_min.second.str().c_str(), at_max.second.str().c_str());
+    auto best_before = [](auto values) { return (values.first < values.second); };
+    if (best_before(at_min) == best_before(at_max)) {
+        fprintf(stderr, "  NO %s CROSSOVER FOUND\n", type);
+        return 0.0;
+    }
+    while (max > (min + delta)) {
+        double x = (min + max) / 2.0;
+        auto at_x = calculate_at(x);
+        fprintf(stderr, "  best@%g: %s (%s vs %s)\n", x, best_before(at_x) ? "before" : "after",
+                at_x.first.str().c_str(), at_x.second.str().c_str());
+        if (best_before(at_min) == best_before(at_x)) {
+            min = x;
+            at_min = at_x;
+        } else {
+            max = x;
+            at_max = at_x;
+        }
+    }
+    double result = (min + max) / 2.0;
+    fprintf(stderr, "  %s CROSSOVER AT %g\n", type, result);
+    return result;
+}
+
+void sample_at(const char *type, const auto &calculate_at, const std::vector<double> &values, const std::vector<const char *> &names) {
+    assert(values.size() == names.size());
+    for (size_t i = 0; i < values.size(); ++i) {
+        double x = values[i];
+        const char *x_name = names[i];
+        auto at_x = calculate_at(x);
+        fprintf(stderr, "%s@%s(%g): before: %s, after: %s\n", type, x_name, x,
+                at_x.first.str().c_str(), at_x.second.str().c_str());
+    }
+}
+
+void analyze_crossover(BenchmarkBlueprintFactory &fixed, std::function<std::unique_ptr<BenchmarkBlueprintFactory>(double arg)> variable,
+                       uint32_t docid_limit, bool allow_force_strict, double delta)
+{
+    auto estimate_AND_time_ms = [&](auto &first, auto &last) {
+                                    auto a = first.make_blueprint();
+                                    a->basic_plan(true, docid_limit);
+                                    double est_a = a->estimate();
+                                    double a_ms = benchmark_search(first, docid_limit, true, false, false, 1.0).time_ms;
+                                    double b_ms = benchmark_search(last, docid_limit, false, false, false, est_a).time_ms;
+                                    if (!allow_force_strict) {
+                                        return Sample(a_ms + b_ms);
+                                    }
+                                    double c_ms = benchmark_search(last, docid_limit, false, true, false, est_a).time_ms;
+                                    if (c_ms < b_ms) {
+                                        return Sample(a_ms + c_ms, a_ms + b_ms, true);
+                                    }
+                                    return Sample(a_ms + b_ms, a_ms + c_ms, false);
+                                };
+    auto calculate_AND_cost = [&](auto &first, auto &last) {
+                                  auto a = first.make_blueprint();
+                                  auto b = last.make_blueprint();
+                                  a->basic_plan(true, docid_limit);
+                                  double a_cost = a->strict_cost();
+                                  b->basic_plan(a->estimate(), docid_limit);
+                                  double b_cost = b->cost() * a->estimate();
+                                  if (!allow_force_strict) {
+                                      return Sample(a_cost + b_cost);
+                                  }
+                                  auto c = last.make_blueprint();
+                                  c->basic_plan(true, docid_limit);
+                                  double c_cost = est_forced_strict_cost(c->estimate(), c->strict_cost(), a->estimate());
+                                  if (c_cost < b_cost) {
+                                      return Sample(a_cost + c_cost, a_cost + b_cost, true);
+                                  }
+                                  return Sample(a_cost + b_cost, a_cost + c_cost, false);
+                              };
+    auto first_abs_est = [&](auto &first, auto &) {
+                             auto a = first.make_blueprint();
+                             return Sample(a->getState().estimate().estHits);
+                         };
+    auto combine = [&](auto &fun) {
+                       return [&](double rate) {
+                                  auto variable_at_rate = variable(rate);
+                                  return std::make_pair(fun(*variable_at_rate, fixed), fun(fixed, *variable_at_rate));
+                              };
+                   };
+    std::vector<double> results;
+    std::vector<const char *> names;
+    names.push_back("time crossover");
+    results.push_back(find_crossover("TIME", combine(estimate_AND_time_ms), delta));
+    names.push_back("cost crossover");
+    results.push_back(find_crossover("COST", combine(calculate_AND_cost), delta));
+    names.push_back("abs_est crossover");
+    results.push_back(find_crossover("ABS_EST", combine(first_abs_est), delta));
+    sample_at("COST", combine(calculate_AND_cost), results, names);
+    sample_at("TIME", combine(estimate_AND_time_ms), results, names);
+}
+
+//-----------------------------------------------------------------------------
+
 vespalib::string
 to_string(bool val)
 {
@@ -632,6 +762,24 @@ TEST(IteratorBenchmark, or_benchmark)
     run_benchmarks(setup);
 }
 
+TEST(IteratorBenchmark, or_vs_filter_crossover)
+{
+    auto fixed_or = make_blueprint_factory(int32_array_fs, QueryOperator::Or, num_docs, 0, 0.1, 100);
+    auto variable_term = [](double rate) {
+                             return make_blueprint_factory(int32_array_fs, QueryOperator::Term, num_docs, 0, rate, 1);
+                         };
+    analyze_crossover(*fixed_or, variable_term, num_docs + 1, false, 0.0001);
+}
+
+TEST(IteratorBenchmark, or_vs_filter_crossover_with_allow_force_strict)
+{
+    auto fixed_or = make_blueprint_factory(int32_array_fs, QueryOperator::Or, num_docs, 0, 0.1, 100);
+    auto variable_term = [](double rate) {
+                             return make_blueprint_factory(int32_array_fs, QueryOperator::Term, num_docs, 0, rate, 1);
+                         };
+    analyze_crossover(*fixed_or, variable_term, num_docs + 1, true, 0.0001);
+}
+
 int main(int argc, char **argv) {
     ::testing::InitGoogleTest(&argc, argv);
     int res = RUN_ALL_TESTS();
author	Håvard Pettersen <havardpe@yahooinc.com>	2024-03-21 11:18:09 +0000
committer	Håvard Pettersen <havardpe@yahooinc.com>	2024-03-25 10:38:12 +0000
commit	a0b9fe6bb5b4a64bc3ce65074f923ba9a2bc4940 (patch)
tree	53a68dceca538bb664f16b8d905eb666cd8c28c4
parent	c3be0b5826152973ec422d32558ba41a1dc6311d (diff)