Merge pull request #30973 from vespa-engine/geirst/better-non-strict-cost-estimates-for-btree-iterators

Use better non-strict cost estimates for btree and disk index iterators.
author: Håvard Pettersen <3535158+havardpe@users.noreply.github.com> 2024-04-19 15:19:44 +0200
committer: GitHub <noreply@github.com> 2024-04-19 15:19:44 +0200
commit: 3809a09b0e33ea2203f5165430631d4f1aa74adc (patch)
tree: ecb6acffec03107e46c1d8794720b8de95291280 /searchlib
parent: 433cb01e19f6bb51d6a2d029482a6e16431cb055 (diff)
parent: 4d66b9a6ac8c6991bed43ae195820a8c7f8a74d6 (diff)
6 files changed, 154 insertions, 90 deletions
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
index f7a358efb26..f4a1ade8a66 100644
--- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
@@ -750,19 +750,36 @@ void
 run_intermediate_blueprint_benchmark(const BlueprintFactorySetup& a, const BlueprintFactorySetup& b, size_t num_docs)
 {
     print_intermediate_blueprint_result_header(2);
+    double max_speedup = 0.0;
+    double min_speedup = std::numeric_limits<double>::max();
     for (double b_hit_ratio: b.op_hit_ratios) {
         auto b_factory = b.make_factory_shared(num_docs, b_hit_ratio);
         for (double a_hit_ratio : a.op_hit_ratios) {
             IntermediateBlueprintFactoryType factory;
             factory.add_child(a.make_factory(num_docs, a_hit_ratio));
             factory.add_child(b_factory);
-            for (auto algo: {PlanningAlgo::Order, PlanningAlgo::Estimate, PlanningAlgo::Cost, PlanningAlgo::CostForceStrict}) {
+            double time_ms_esti = 0.0;
+            for (auto algo: {PlanningAlgo::Order, PlanningAlgo::Estimate, PlanningAlgo::Cost,
+                             PlanningAlgo::CostForceStrict}) {
                 auto res = benchmark_search(factory, num_docs + 1, true, false, false, 1.0, algo);
                 print_intermediate_blueprint_result(res, {a_hit_ratio, b_hit_ratio}, algo, num_docs);
+                if (algo == PlanningAlgo::Estimate) {
+                    time_ms_esti = res.time_ms;
+                }
+                if (algo == PlanningAlgo::CostForceStrict) {
+                    double speedup = time_ms_esti / res.time_ms;
+                    if (speedup > max_speedup) {
+                        max_speedup = speedup;
+                    }
+                    if (speedup < min_speedup) {
+                        min_speedup = speedup;
+                    }
+                    std::cout << "speedup (esti/forc)=" << std::setprecision(4) << speedup << std::endl;
+                }
             }
-            std::cout << std::endl;
         }
     }
+    std::cout << "max_speedup=" << max_speedup << ", min_speedup=" << min_speedup << std::endl << std::endl;
 }
 
 void
@@ -787,6 +804,12 @@ gen_ratios(double middle, double range_multiplier, size_t num_samples)
     for (size_t i = 0; i < num_samples; ++i) {
         res.push_back(ratio);
         ratio *= factor;
+        if (ratio > 1.0) {
+            if (res.size() < num_samples) {
+                res.push_back(1.0);
+            }
+            break;
+        }
     }
     return res;
 }
@@ -831,7 +854,6 @@ TEST(IteratorBenchmark, analyze_term_search_in_disk_index)
 {
     BenchmarkSetup setup(num_docs, {str_index}, {QueryOperator::Term}, {true, false}, base_hit_ratios);
     setup.filter_hit_ratios = filter_hit_ratios;
-    setup.filter_crossover_factor = 1.0;
     run_benchmarks(setup, global_summary);
 }
 
@@ -863,31 +885,24 @@ TEST(IteratorBenchmark, analyze_term_search_in_fast_search_attributes)
     run_benchmarks(setup, global_summary);
 }
 
-TEST(IteratorBenchmark, analyze_in_operator_non_strict)
+TEST(IteratorBenchmark, analyze_IN_non_strict)
 {
-    const std::vector<double> hit_ratios = {0.001, 0.01, 0.1, 0.2, 0.4, 0.6, 0.8};
-    BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::In}, {false}, hit_ratios, {5, 9, 10, 100, 1000, 10000});
-    setup.disjunct_children = true;
-    run_benchmarks(setup);
+    for (auto in_hit_ratio : {0.01, 0.1, 0.5}) {
+        BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::In}, {false}, {in_hit_ratio}, {2, 5, 9, 10, 100, 1000, 10000});
+        setup.filter_hit_ratios = gen_ratios(in_hit_ratio, 10.0, 13);
+        setup.disjunct_children = true;
+        run_benchmarks(setup);
+    }
 }
 
-TEST(IteratorBenchmark, analyze_in_operator_strict)
+TEST(IteratorBenchmark, analyze_IN_strict)
 {
     const std::vector<double> hit_ratios = {0.001, 0.01, 0.1, 0.2, 0.4, 0.6, 0.8};
-    BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::In}, {true}, hit_ratios, {5, 9, 10, 100, 1000, 10000});
+    BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::In}, {true}, hit_ratios, {2, 5, 9, 10, 100, 1000, 10000});
     setup.disjunct_children = true;
     run_benchmarks(setup);
 }
 
-TEST(IteratorBenchmark, analyze_complex_leaf_operators)
-{
-    std::vector<FieldConfig> field_cfgs = {int32_array_fs};
-    std::vector<QueryOperator> query_ops = {QueryOperator::In, QueryOperator::DotProduct};
-    const std::vector<double> hit_ratios = {0.001, 0.01, 0.1, 0.2, 0.4, 0.6, 0.8};
-    BenchmarkSetup setup(num_docs, field_cfgs, query_ops, {true, false}, hit_ratios, {1, 2, 10, 100});
-    run_benchmarks(setup);
-}
-
 TEST(IteratorBenchmark, analyze_weak_and_operators)
 {
     std::vector<FieldConfig> field_cfgs = {int32_wset_fs};
@@ -897,24 +912,6 @@ TEST(IteratorBenchmark, analyze_weak_and_operators)
     run_benchmarks(setup);
 }
 
-TEST(IteratorBenchmark, term_benchmark)
-{
-    BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::Term}, {true, false}, base_hit_ratios);
-    run_benchmarks(setup);
-}
-
-TEST(IteratorBenchmark, and_benchmark)
-{
-    BenchmarkSetup setup(num_docs, {int32_array_fs}, {QueryOperator::And}, {true, false}, base_hit_ratios, {1, 2, 4, 8});
-    run_benchmarks(setup);
-}
-
-TEST(IteratorBenchmark, or_benchmark)
-{
-    BenchmarkSetup setup(num_docs, {int32_array_fs}, {QueryOperator::Or}, {true, false}, base_hit_ratios, {1, 10, 100, 1000});
-    run_benchmarks(setup);
-}
-
 TEST(IteratorBenchmark, or_vs_filter_crossover)
 {
     auto fixed_or = make_blueprint_factory(int32_array_fs, QueryOperator::Or, num_docs, 0, 0.1, 100, false);
@@ -933,16 +930,38 @@ TEST(IteratorBenchmark, or_vs_filter_crossover_with_allow_force_strict)
     analyze_crossover(*fixed_or, variable_term, num_docs + 1, true, 0.0001);
 }
 
-TEST(IteratorBenchmark, analyze_and_with_filter_vs_in)
+TEST(IteratorBenchmark, analyze_AND_filter_vs_IN)
+{
+    for (auto in_filter_ratio : {0.01, 0.1, 0.5}) {
+        for (uint32_t children: {2, 10, 100, 1000}) {
+            run_and_benchmark({int32_fs, QueryOperator::Term, gen_ratios(in_filter_ratio, 10.0, 13)},
+                              {int32_fs, QueryOperator::In, {in_filter_ratio}, children, false},
+                              num_docs);
+        }
+    }
+}
+
+TEST(IteratorBenchmark, analyze_AND_filter_vs_OR)
+{
+    for (auto or_filter_ratio : {0.01, 0.1, 0.5}) {
+        for (uint32_t children: {2, 10, 100, 1000}) {
+            run_and_benchmark({int32_fs, QueryOperator::Term, gen_ratios(or_filter_ratio, 10, 13)},
+                              {int32_fs, QueryOperator::Or, {or_filter_ratio}, children, false},
+                              num_docs);
+        }
+    }
+}
+
+TEST(IteratorBenchmark, analyze_AND_filter_vs_IN_array)
 {
-    for (uint32_t children: {10, 100, 1000}) {
-        run_and_benchmark({int32_fs, QueryOperator::Term, gen_ratios(0.1, 8.0, 15)},
-                          {int32_fs, QueryOperator::In, {0.1}, children, false},
+    for (uint32_t children: {2, 10, 100, 1000}) {
+        run_and_benchmark({int32_fs, QueryOperator::Term, gen_ratios(0.1, 10.0, 13)},
+                          {int32_array_fs, QueryOperator::In, {0.1}, children, false},
                           num_docs);
     }
 }
 
-TEST(IteratorBenchmark, analyze_and_with_bitvector_vs_in)
+TEST(IteratorBenchmark, analyze_AND_bitvector_vs_IN)
 {
     for (uint32_t children: {10, 100, 1000, 10000}) {
         run_and_benchmark({int32_fs, QueryOperator::In, {0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60}, children, true},
@@ -951,21 +970,35 @@ TEST(IteratorBenchmark, analyze_and_with_bitvector_vs_in)
     }
 }
 
-TEST(IteratorBenchmark, analyze_and_with_filter_vs_in_array)
+TEST(IteratorBenchmark, analyze_OR_non_strict_fs)
 {
-    for (uint32_t children: {10, 100, 1000}) {
-        run_and_benchmark({int32_fs, QueryOperator::Term, gen_ratios(0.1, 8.0, 15)},
-                          {int32_array_fs, QueryOperator::In, {0.1}, children, false},
-                          num_docs);
+    for (auto or_hit_ratio : {0.01, 0.1, 0.5}) {
+        BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::Or}, {false}, {or_hit_ratio},
+                             {2, 4, 6, 8, 10, 100, 1000});
+        setup.filter_hit_ratios = gen_ratios(or_hit_ratio, 10.0, 13);
+        run_benchmarks(setup);
     }
 }
 
-TEST(IteratorBenchmark, analyze_and_with_filter_vs_or)
+TEST(IteratorBenchmark, analyze_OR_non_strict_non_fs)
 {
-    for (uint32_t children: {10, 100, 1000}) {
-        run_and_benchmark({int32_fs, QueryOperator::Term, gen_ratios(0.1, 8.0, 15)},
-                          {int32_fs, QueryOperator::Or, {0.1}, children, false},
-                          num_docs);
+    BenchmarkSetup setup(num_docs, {int32}, {QueryOperator::Or}, {false}, {0.1}, {2, 4, 6, 8, 10});
+    setup.filter_hit_ratios = gen_ratios(0.1, 10.0, 13);
+    run_benchmarks(setup);
+}
+
+TEST(IteratorBenchmark, analyze_OR_strict)
+{
+    BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::Or}, {true}, {0.01, 0.1, 0.5}, {2, 4, 6, 8, 10, 100, 1000});
+    run_benchmarks(setup);
+}
+
+TEST(IteratorBenchmark, analyze_btree_iterator_non_strict)
+{
+    for (auto term_ratio : {0.01, 0.1, 0.5, 1.0}) {
+        BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::Term}, {false}, {term_ratio}, {1});
+        setup.filter_hit_ratios = gen_ratios(term_ratio, 10.0, 15);
+        run_benchmarks(setup);
     }
 }
 
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
index aebfda8fffd..569edc0a4ab 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
@@ -166,7 +166,7 @@ public:
             return {0.5, lookup_cost(indirections), lookup_strict_cost(indirections)};
         } else {
             double rel_est = abs_to_rel_est(_hit_estimate.est_hits(), docid_limit);
-            return {rel_est, btree_cost(), btree_strict_cost(rel_est)};
+            return {rel_est, btree_cost(rel_est), btree_strict_cost(rel_est)};
         }
     }
 
@@ -519,8 +519,9 @@ public:
             double estimate(const IDirectPostingStore::LookupResult &term) const noexcept {
                 return abs_to_rel_est(term.posting_size, docid_limit);
             }
-            double cost(const IDirectPostingStore::LookupResult &) const noexcept {
-                return btree_cost();
+            double cost(const IDirectPostingStore::LookupResult &term) const noexcept {
+                double rel_est = abs_to_rel_est(term.posting_size, docid_limit);
+                return btree_cost(rel_est);
             }
             double strict_cost(const IDirectPostingStore::LookupResult &term) const noexcept {
                 double rel_est = abs_to_rel_est(term.posting_size, docid_limit);
diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp
index 160bb199fb8..4ede8957f4e 100644
--- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp
@@ -192,8 +192,9 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::calculate_flow_stats(uin
         double estimate(const IDirectPostingStore::LookupResult &term) const noexcept {
             return abs_to_rel_est(term.posting_size, docid_limit);
         }
-        double cost(const IDirectPostingStore::LookupResult &) const noexcept {
-            return search::queryeval::flow::btree_cost();
+        double cost(const IDirectPostingStore::LookupResult &term) const noexcept {
+            double rel_est = abs_to_rel_est(term.posting_size, docid_limit);
+            return search::queryeval::flow::btree_cost(rel_est);
         }
         double strict_cost(const IDirectPostingStore::LookupResult &term) const noexcept {
             double rel_est = abs_to_rel_est(term.posting_size, docid_limit);
diff --git a/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp b/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp
index c0d0b1baa8c..c30b02a0c37 100644
--- a/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp
@@ -72,7 +72,7 @@ queryeval::FlowStats
 DiskTermBlueprint::calculate_flow_stats(uint32_t docid_limit) const
 {
     double rel_est = abs_to_rel_est(_lookupRes->counts._numDocs, docid_limit);
-    return {rel_est, disk_index_cost(), disk_index_strict_cost(rel_est)};
+    return {rel_est, disk_index_cost(rel_est), disk_index_strict_cost(rel_est)};
 }
 
 SearchIterator::UP
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
index eba135d8519..2bc94073c92 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
@@ -261,7 +261,7 @@ public:
 
     queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override {
         double rel_est = abs_to_rel_est(_posting_itr.size(), docid_limit);
-        return {rel_est, btree_cost(), btree_strict_cost(rel_est)};
+        return {rel_est, btree_cost(rel_est), btree_strict_cost(rel_est)};
     }
     
     SearchIterator::UP createLeafSearch(const TermFieldMatchDataArray& tfmda) const override {
diff --git a/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h b/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h
index 356ecd4c992..e8a58bba9dc 100644
--- a/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h
+++ b/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h
@@ -7,60 +7,87 @@
 namespace search::queryeval::flow {
 
 /**
- * This function is used when calculating the strict cost of
- * intermediate and complex leaf blueprints that use a heap for their strict iterator implementation.
- *
- * Iterator benchmarking has shown the need to increase the strict cost
- * of complex blueprints, to avoid that they are forced strict too early.
- * The 5.0 multiplier reflects this.
- *
- * Program used: searchlib/src/tests/queryeval/iterator_benchmark
- * Tests used: analyze_and_with_filter_vs_*
- */
-inline double heap_cost(double my_est, size_t num_children) {
-    return 5.0 * my_est * std::log2(std::max(size_t(1), num_children));
-}
-
-/**
- * The following constants and formulas were derived after analyzing term search over attributes
- * (with and without fast-search) and disk index by using the iterator benchmark program:
+ * The following constants and formulas were derived after benchmarking and analyzing
+ * using the following benchmark program:
  * searchlib/src/tests/queryeval/iterator_benchmark
  *
- * The following tests were executed on a machine with an Intel Xeon 2.5 GHz CPU with 48 cores and 256 Gb of memory:
- * ./searchlib_iterator_benchmark_test_app --gtest_filter='*analyze_term_search*'
- *
- * TODO: Add details on OR benchmarking.
+ * The tests were executed on:
+ *   - Machine with an Intel Xeon 2.5 GHz CPU with 48 cores and 256 Gb of memory.
+ *   - Apple M1 MacBook Pro (2021) 32 GB.
  *
  * The benchmark summary shows the 'average ms per cost' of the different benchmark cases.
- * The following constants and formulas were derived to balance 'average ms per cost' to be similar
+ * The constants and formulas are adjusted to balance 'average ms per cost' to be similar
  * across the different benchmark cases.
+ *
+ * The AND benchmark cases outputs the ratio (esti/forc) of the time_ms used by two query planning algorithms:
+ * 'estimate' (legacy) and 'cost with allowed force strict' (new).
+ * 'max_speedup' indicates the gain of using the new cost model, while 'min_speedup' indicates the loss.
+ * The constants and formulas are also adjusted to maximize speedup, while reducing loss.
+ * Tests used:
+ *   - IteratorBenchmark::analyze_AND_filter_vs_IN
+ *   - IteratorBenchmark::analyze_AND_filter_vs_OR
+ *   - IteratorBenchmark::analyze_AND_filter_vs_IN_array
+ *   - IteratorBenchmark::analyze_AND_bitvector_vs_IN
  */
 
+/**
+ * This function is used when calculating the strict cost of
+ * intermediate and complex leaf blueprints that use a heap for their strict iterator implementation.
+ * Tests used:
+ *   - IteratorBenchmark::analyze_IN_strict
+ *   - IteratorBenchmark::analyze_OR_strict
+ */
+inline double heap_cost(double my_est, size_t num_children) {
+    return my_est * std::log2(std::max(size_t(1), num_children));
+}
+
 // Non-strict cost of lookup based matching in an attribute (not fast-search).
+// Test used: IteratorBenchmark::analyze_term_search_in_attributes_non_strict
 inline double lookup_cost(size_t num_indirections) {
     return 1.0 + (num_indirections * 1.0);
 }
 
 // Non-strict cost of reverse lookup into a hash table (containing terms from a multi-term operator).
+// Test used: IteratorBenchmark::analyze_IN_non_strict
 inline double reverse_hash_lookup() {
-    return 5.0;
+    return 1.0;
 }
 
 // Strict cost of lookup based matching in an attribute (not fast-search).
+// IteratorBenchmark::analyze_term_search_in_attributes_strict
 inline double lookup_strict_cost(size_t num_indirections) {
     return lookup_cost(num_indirections);
 }
 
-// Non-strict cost of matching in a btree posting list (e.g. fast-search attribute or memory index field).
-inline double btree_cost() {
-    return 1.0;
+/**
+ * Estimates the cost of evaluating an always strict iterator (e.g. btree) in a non-strict context.
+ *
+ * When the estimate and strict cost is low, this models the cost of checking whether
+ * the seek docid matches the docid the iterator is already positioned at (the 0.2 factor).
+ *
+ * The resulting non-strict cost is most accurate when the inflow is 1.0.
+ * The resulting non-strict cost is highly underestimated when the inflow goes to 0.0.
+ * It is important to have a better estimate at higher inflows,
+ * as the latency (time) penalty is higher if choosing wrong.
+ *
+ * Note: This formula is equal to forced_strict_cost() in flow.h.
+ */
+inline double non_strict_cost_of_strict_iterator(double estimate, double strict_cost) {
+    return 0.2 * (1.0 - estimate) + strict_cost;
 }
 
 // Strict cost of matching in a btree posting list (e.g. fast-search attribute or memory index field).
+// Test used: IteratorBenchmark::analyze_term_search_in_fast_search_attributes
 inline double btree_strict_cost(double my_est) {
     return my_est;
 }
 
+// Non-strict cost of matching in a btree posting list (e.g. fast-search attribute or memory index field).
+// Test used: IteratorBenchmark::analyze_btree_iterator_non_strict
+inline double btree_cost(double my_est) {
+    return non_strict_cost_of_strict_iterator(my_est, btree_strict_cost(my_est));
+}
+
 // Non-strict cost of matching in a bitvector.
 inline double bitvector_cost() {
     return 1.0;
@@ -72,14 +99,16 @@ inline double bitvector_strict_cost(double my_est) {
     return 1.5 * my_est;
 }
 
-// Non-strict cost of matching in a disk index posting list.
-inline double disk_index_cost() {
-    return 1.5;
-}
-
 // Strict cost of matching in a disk index posting list.
+// Test used: IteratorBenchmark::analyze_term_search_in_disk_index
 inline double disk_index_strict_cost(double my_est) {
     return 1.5 * my_est;
 }
 
+// Non-strict cost of matching in a disk index posting list.
+// Test used: IteratorBenchmark::analyze_term_search_in_disk_index
+inline double disk_index_cost(double my_est) {
+    return non_strict_cost_of_strict_iterator(my_est, disk_index_strict_cost(my_est));
+}
+
 }
author	Håvard Pettersen <3535158+havardpe@users.noreply.github.com>	2024-04-19 15:19:44 +0200
committer	GitHub <noreply@github.com>	2024-04-19 15:19:44 +0200
commit	3809a09b0e33ea2203f5165430631d4f1aa74adc (patch)
tree	ecb6acffec03107e46c1d8794720b8de95291280 /searchlib
parent	433cb01e19f6bb51d6a2d029482a6e16431cb055 (diff)
parent	4d66b9a6ac8c6991bed43ae195820a8c7f8a74d6 (diff)