Merge pull request #27547 from vespa-engine/havardpe/est-80-percentile-as-result

use estimated 80 percentile as benchmark result
author: Henning Baldersheim <balder@yahoo-inc.com> 2023-06-26 18:56:45 +0200
committer: GitHub <noreply@github.com> 2023-06-26 18:56:45 +0200
commit: 7e1c15d0d0ab6312612fe67e75104139e278d097 (patch)
tree: cbe69c29fd09499428e47726d8a844e61922073e
parent: 55fb18fa6800ddfc8abc33d6bad602710a5c91ae (diff)
parent: 0716bd52e13fcb5526b9ab7af38836cb3eddb11e (diff)
3 files changed, 129 insertions, 41 deletions
diff --git a/vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp b/vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp
index c6f0581bceb..afd18a13d2e 100644
--- a/vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp
+++ b/vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp
@@ -11,6 +11,7 @@
 #include <ranges>
 #include <random>
 #include <array>
+#include <algorithm>
 
 using namespace vespalib;
 using namespace vespalib::test;
@@ -23,6 +24,71 @@ size_t state_loop = 1;
 
 //-----------------------------------------------------------------------------
 
+/**
+ * Estimates the 80th percentile by throwing away the 2 best samples
+ * in each set of 10 samples, using the best remaining sample as a
+ * representative for the set. Representatives are hierarchically
+ * matched against representatives from other sample sets. Result
+ * extraction is simplified in that it does not try to estimate the
+ * actual 80th percentile, but rather tries to drop the best samples
+ * if possible.
+ *
+ * The goal is to have a more robust way of combining repeated
+ * micro-benchmark samples than simply using minimum time. With simple
+ * single-threaded CPU-bound tasks, minimum time is a good measure of
+ * how expensive something is, but when we start benchmarking
+ * operations that may conflict with themselves, we do not want to
+ * account for being super lucky. However, we still want to account
+ * for the benchmark conditions being as good as possible.
+ **/
+struct Est80P {
+    struct Level {
+        int cnt;
+        std::array<double,3> data;
+        Level(double value) noexcept
+          : cnt(1), data{value, 0.0, 0.0} {}
+        bool empty() const noexcept { return (cnt == 0); }
+        bool full() const noexcept { return (cnt == 10); }
+        void add(double value) noexcept {
+            assert(!full());
+            if (cnt < 3 || data[2] > value) {
+                size_t i = std::min(cnt, 2);
+                while (i > 0 && data[i - 1] > value) {
+                    data[i] = data[i - 1];
+                    --i;
+                }
+                data[i] = value;
+            }
+            ++cnt;
+        }
+        double get() const noexcept {
+            assert(!empty());
+            return data[std::min(2, cnt - 1)];
+        }
+        void clear() noexcept {
+            cnt = 0;
+        }
+    };
+    std::vector<Level> levels;
+    void add_sample(double value) {
+        for (auto &level: levels) {
+            level.add(value);
+            if (!level.full()) [[likely]] {
+                return;
+            }
+            value = level.get();
+            level.clear();
+        }
+        levels.emplace_back(value);
+    }
+    double get_result() {
+        assert(!levels.empty());
+        return levels.back().get();
+    }
+};
+
+//-----------------------------------------------------------------------------
+
 struct DummyLock {
     constexpr DummyLock() noexcept {}
     // BasicLockable
@@ -158,47 +224,31 @@ double measure_ns(auto &work) {
 
 struct BenchmarkResult {
     double cost_ns;
-    double range_ns;
-    size_t threads;
-    BenchmarkResult(size_t num_threads)
-      : cost_ns(std::numeric_limits<double>::max()), range_ns(0.0), threads(num_threads) {}
+    BenchmarkResult(double cost_ns_in) : cost_ns(cost_ns_in) {}
     void report(vespalib::string desc) {
-        if (threads == 1) {
-            fprintf(stderr, "%s: cost_ns: %g\n",
-                    desc.c_str(), cost_ns);
-        } else {
-            fprintf(stderr, "%s: cost_ns: %g, range_ns: %g (%zu threads)\n",
-                    desc.c_str(), cost_ns, range_ns, threads);
-        }
+        fprintf(stderr, "%s: cost_ns: %g\n", desc.c_str(), cost_ns);
     }
     void report(vespalib::string name, vespalib::string desc) {
         report(name + "(" + desc + ")");
     }
 };
 
-struct Meets {
-    vespalib::test::ThreadMeets::Avg avg;
-    vespalib::test::ThreadMeets::Range<double> range;
-    Meets(size_t num_threads) : avg(num_threads), range(num_threads) {}
-};
-
 BenchmarkResult benchmark_ns(auto &&work, size_t num_threads = 1) {
-    Meets meets(num_threads);
+    Est80P collector;
+    vespalib::test::ThreadMeets::Avg avg(num_threads);
     auto entry = [&](Nexus &ctx) {
         Timer timer;
         BenchmarkResult result(ctx.num_threads());
         for (bool once_more = true; ctx.vote(once_more); once_more = (timer.elapsed() < budget)) {
-            auto my_ns = measure_ns(work);
-            auto cost_ns = meets.avg(my_ns);
-            auto range_ns = meets.range(my_ns);
-            if (cost_ns < result.cost_ns) {
-                result.cost_ns = cost_ns;
-                result.range_ns = range_ns;
+            auto cost_ns = avg(measure_ns(work));
+            if (ctx.is_main()) {
+                collector.add_sample(cost_ns);
             }
         }
-        return result;
     };
-    return Nexus::run(num_threads, entry);
+    Nexus::run(num_threads, entry);
+    auto result = collector.get_result();
+    return {result};
 }
 
 //-----------------------------------------------------------------------------
@@ -224,7 +274,7 @@ void estimate_cost() {
 //-----------------------------------------------------------------------------
 
 template <typename T>
-void thread_safety_loop(Nexus &ctx, T &lock, MyState &state, Meets &meets, int read_bp) {
+void thread_safety_loop(Nexus &ctx, T &lock, MyState &state, auto &max, int read_bp) {
     Rnd rnd(ctx.thread_id());
     size_t write_cnt = 0;
     size_t bad_reads = 0;
@@ -247,16 +297,11 @@ void thread_safety_loop(Nexus &ctx, T &lock, MyState &state, Meets &meets, int r
             }
         }
     }
-    auto t1 = steady_clock::now();
-    ctx.barrier();
-    auto t2 = steady_clock::now();
-    auto my_ms = count_ns(t1 - t0) / 1'000'000.0;
-    auto total_ms = count_ns(t2 - t0) / 1'000'000.0;
-    auto cost_ms = meets.avg(my_ms);
-    auto range_ms = meets.range(my_ms);
-    if (ctx.thread_id() == 0) {
-        fprintf(stderr, "---> %s with %2zu threads (%5d bp r): avg: %10.2f ms, range: %10.2f ms, max: %10.2f ms\n",
-                getClassName(lock).c_str(), ctx.num_threads(), read_bp, cost_ms, range_ms, total_ms);
+    auto my_ms = count_ns(steady_clock::now() - t0) / 1'000'000.0;
+    auto actual_ms = max(my_ms);
+    if (ctx.is_main()) {
+        fprintf(stderr, "---> %s with %2zu threads (%5d bp r): time: %10.2f ms\n",
+                getClassName(lock).c_str(), ctx.num_threads(), read_bp, actual_ms);
     }
     state.commit_inconsistent_reads(bad_reads);
     state.commit_expected_writes(write_cnt);
@@ -290,9 +335,9 @@ void benchmark_lock() {
     for (size_t bp: {10000, 9999, 5000, 0}) {
         for (size_t num_threads: {8, 4, 2, 1}) {
             if (bench || (bp == 9999 && num_threads == 8)) {
-                Meets meets(num_threads);
+                vespalib::test::ThreadMeets::Max<double> max(num_threads);
                 Nexus::run(num_threads, [&](Nexus &ctx) {
-                    thread_safety_loop(ctx, *lock, *state, meets, bp);
+                    thread_safety_loop(ctx, *lock, *state, max, bp);
                 });
             }
         }
diff --git a/vespalib/src/vespa/vespalib/test/nexus.h b/vespalib/src/vespa/vespalib/test/nexus.h
index 659b563e43c..7be35d42463 100644
--- a/vespalib/src/vespa/vespalib/test/nexus.h
+++ b/vespalib/src/vespa/vespalib/test/nexus.h
@@ -36,6 +36,7 @@ public:
     Nexus &operator=(const Nexus &) = delete;
     size_t num_threads() const noexcept { return _vote.size(); }
     size_t thread_id() const noexcept { return _thread_id; }
+    bool is_main() const noexcept { return _thread_id == 0; }
     bool vote(bool my_vote) { return _vote(my_vote); }
     void barrier() { REQUIRE_EQ(_vote(true), true); }
     struct select_thread_0 {};
diff --git a/vespalib/src/vespa/vespalib/test/thread_meets.h b/vespalib/src/vespa/vespalib/test/thread_meets.h
index 7ef4dcb9921..26ca560641d 100644
--- a/vespalib/src/vespa/vespalib/test/thread_meets.h
+++ b/vespalib/src/vespa/vespalib/test/thread_meets.h
@@ -38,8 +38,8 @@ struct ThreadMeets {
         explicit Sum(size_t N) : vespalib::Rendezvous<T,T>(N) {}
         T operator()(T value) { return rendezvous(value); }
         void mingle() override {
-            T acc{};
-            for (size_t i = 0; i < size(); ++i) {
+            T acc = in(0);
+            for (size_t i = 1; i < size(); ++i) {
                 acc += in(i);
             }
             for (size_t i = 0; i < size(); ++i) {
@@ -47,6 +47,48 @@ struct ThreadMeets {
             }
         }
     };
+    // maximum of values across all threads
+    template <typename T>
+    struct Max : vespalib::Rendezvous<T,T> {
+        using vespalib::Rendezvous<T,T>::in;
+        using vespalib::Rendezvous<T,T>::out;
+        using vespalib::Rendezvous<T,T>::size;
+        using vespalib::Rendezvous<T,T>::rendezvous;
+        explicit Max(size_t N) : vespalib::Rendezvous<T,T>(N) {}
+        T operator()(T value) { return rendezvous(value); }
+        void mingle() override {
+            T max = in(0);
+            for (size_t i = 1; i < size(); ++i) {
+                if (in(i) > max) {
+                    max = in(i);
+                }
+            }
+            for (size_t i = 0; i < size(); ++i) {
+                out(i) = max;
+            }
+        }
+    };
+    // minimum of values across all threads
+    template <typename T>
+    struct Min : vespalib::Rendezvous<T,T> {
+        using vespalib::Rendezvous<T,T>::in;
+        using vespalib::Rendezvous<T,T>::out;
+        using vespalib::Rendezvous<T,T>::size;
+        using vespalib::Rendezvous<T,T>::rendezvous;
+        explicit Min(size_t N) : vespalib::Rendezvous<T,T>(N) {}
+        T operator()(T value) { return rendezvous(value); }
+        void mingle() override {
+            T min = in(0);
+            for (size_t i = 1; i < size(); ++i) {
+                if (in(i) < min) {
+                    min = in(i);
+                }
+            }
+            for (size_t i = 0; i < size(); ++i) {
+                out(i) = min;
+            }
+        }
+    };
     // range of values across all threads
     template <typename T>
     struct Range : vespalib::Rendezvous<T,T> {
author	Henning Baldersheim <balder@yahoo-inc.com>	2023-06-26 18:56:45 +0200
committer	GitHub <noreply@github.com>	2023-06-26 18:56:45 +0200
commit	7e1c15d0d0ab6312612fe67e75104139e278d097 (patch)
tree	cbe69c29fd09499428e47726d8a844e61922073e
parent	55fb18fa6800ddfc8abc33d6bad602710a5c91ae (diff)
parent	0716bd52e13fcb5526b9ab7af38836cb3eddb11e (diff)