diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-06-26 18:56:45 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-26 18:56:45 +0200 |
commit | 7e1c15d0d0ab6312612fe67e75104139e278d097 (patch) | |
tree | cbe69c29fd09499428e47726d8a844e61922073e | |
parent | 55fb18fa6800ddfc8abc33d6bad602710a5c91ae (diff) | |
parent | 0716bd52e13fcb5526b9ab7af38836cb3eddb11e (diff) |
Merge pull request #27547 from vespa-engine/havardpe/est-80-percentile-as-result
use estimated 80 percentile as benchmark result
-rw-r--r-- | vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp | 123 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/test/nexus.h | 1 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/test/thread_meets.h | 46 |
3 files changed, 129 insertions, 41 deletions
diff --git a/vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp b/vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp index c6f0581bceb..afd18a13d2e 100644 --- a/vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp +++ b/vespalib/src/tests/rw_spin_lock/rw_spin_lock_test.cpp @@ -11,6 +11,7 @@ #include <ranges> #include <random> #include <array> +#include <algorithm> using namespace vespalib; using namespace vespalib::test; @@ -23,6 +24,71 @@ size_t state_loop = 1; //----------------------------------------------------------------------------- +/** + * Estimates the 80th percentile by throwing away the 2 best samples + * in each set of 10 samples, using the best remaining sample as a + * representative for the set. Representatives are hierarchically + * matched against representatives from other sample sets. Result + * extraction is simplified in that it does not try to estimate the + * actual 80th percentile, but rather tries to drop the best samples + * if possible. + * + * The goal is to have a more robust way of combining repeated + * micro-benchmark samples than simply using minimum time. With simple + * single-threaded CPU-bound tasks, minimum time is a good measure of + * how expensive something is, but when we start benchmarking + * operations that may conflict with themselves, we do not want to + * account for being super lucky. However, we still want to account + * for the benchmark conditions being as good as possible. + **/ +struct Est80P { + struct Level { + int cnt; + std::array<double,3> data; + Level(double value) noexcept + : cnt(1), data{value, 0.0, 0.0} {} + bool empty() const noexcept { return (cnt == 0); } + bool full() const noexcept { return (cnt == 10); } + void add(double value) noexcept { + assert(!full()); + if (cnt < 3 || data[2] > value) { + size_t i = std::min(cnt, 2); + while (i > 0 && data[i - 1] > value) { + data[i] = data[i - 1]; + --i; + } + data[i] = value; + } + ++cnt; + } + double get() const noexcept { + assert(!empty()); + return data[std::min(2, cnt - 1)]; + } + void clear() noexcept { + cnt = 0; + } + }; + std::vector<Level> levels; + void add_sample(double value) { + for (auto &level: levels) { + level.add(value); + if (!level.full()) [[likely]] { + return; + } + value = level.get(); + level.clear(); + } + levels.emplace_back(value); + } + double get_result() { + assert(!levels.empty()); + return levels.back().get(); + } +}; + +//----------------------------------------------------------------------------- + struct DummyLock { constexpr DummyLock() noexcept {} // BasicLockable @@ -158,47 +224,31 @@ double measure_ns(auto &work) { struct BenchmarkResult { double cost_ns; - double range_ns; - size_t threads; - BenchmarkResult(size_t num_threads) - : cost_ns(std::numeric_limits<double>::max()), range_ns(0.0), threads(num_threads) {} + BenchmarkResult(double cost_ns_in) : cost_ns(cost_ns_in) {} void report(vespalib::string desc) { - if (threads == 1) { - fprintf(stderr, "%s: cost_ns: %g\n", - desc.c_str(), cost_ns); - } else { - fprintf(stderr, "%s: cost_ns: %g, range_ns: %g (%zu threads)\n", - desc.c_str(), cost_ns, range_ns, threads); - } + fprintf(stderr, "%s: cost_ns: %g\n", desc.c_str(), cost_ns); } void report(vespalib::string name, vespalib::string desc) { report(name + "(" + desc + ")"); } }; -struct Meets { - vespalib::test::ThreadMeets::Avg avg; - vespalib::test::ThreadMeets::Range<double> range; - Meets(size_t num_threads) : avg(num_threads), range(num_threads) {} -}; - BenchmarkResult benchmark_ns(auto &&work, size_t num_threads = 1) { - Meets meets(num_threads); + Est80P collector; + vespalib::test::ThreadMeets::Avg avg(num_threads); auto entry = [&](Nexus &ctx) { Timer timer; BenchmarkResult result(ctx.num_threads()); for (bool once_more = true; ctx.vote(once_more); once_more = (timer.elapsed() < budget)) { - auto my_ns = measure_ns(work); - auto cost_ns = meets.avg(my_ns); - auto range_ns = meets.range(my_ns); - if (cost_ns < result.cost_ns) { - result.cost_ns = cost_ns; - result.range_ns = range_ns; + auto cost_ns = avg(measure_ns(work)); + if (ctx.is_main()) { + collector.add_sample(cost_ns); } } - return result; }; - return Nexus::run(num_threads, entry); + Nexus::run(num_threads, entry); + auto result = collector.get_result(); + return {result}; } //----------------------------------------------------------------------------- @@ -224,7 +274,7 @@ void estimate_cost() { //----------------------------------------------------------------------------- template <typename T> -void thread_safety_loop(Nexus &ctx, T &lock, MyState &state, Meets &meets, int read_bp) { +void thread_safety_loop(Nexus &ctx, T &lock, MyState &state, auto &max, int read_bp) { Rnd rnd(ctx.thread_id()); size_t write_cnt = 0; size_t bad_reads = 0; @@ -247,16 +297,11 @@ void thread_safety_loop(Nexus &ctx, T &lock, MyState &state, Meets &meets, int r } } } - auto t1 = steady_clock::now(); - ctx.barrier(); - auto t2 = steady_clock::now(); - auto my_ms = count_ns(t1 - t0) / 1'000'000.0; - auto total_ms = count_ns(t2 - t0) / 1'000'000.0; - auto cost_ms = meets.avg(my_ms); - auto range_ms = meets.range(my_ms); - if (ctx.thread_id() == 0) { - fprintf(stderr, "---> %s with %2zu threads (%5d bp r): avg: %10.2f ms, range: %10.2f ms, max: %10.2f ms\n", - getClassName(lock).c_str(), ctx.num_threads(), read_bp, cost_ms, range_ms, total_ms); + auto my_ms = count_ns(steady_clock::now() - t0) / 1'000'000.0; + auto actual_ms = max(my_ms); + if (ctx.is_main()) { + fprintf(stderr, "---> %s with %2zu threads (%5d bp r): time: %10.2f ms\n", + getClassName(lock).c_str(), ctx.num_threads(), read_bp, actual_ms); } state.commit_inconsistent_reads(bad_reads); state.commit_expected_writes(write_cnt); @@ -290,9 +335,9 @@ void benchmark_lock() { for (size_t bp: {10000, 9999, 5000, 0}) { for (size_t num_threads: {8, 4, 2, 1}) { if (bench || (bp == 9999 && num_threads == 8)) { - Meets meets(num_threads); + vespalib::test::ThreadMeets::Max<double> max(num_threads); Nexus::run(num_threads, [&](Nexus &ctx) { - thread_safety_loop(ctx, *lock, *state, meets, bp); + thread_safety_loop(ctx, *lock, *state, max, bp); }); } } diff --git a/vespalib/src/vespa/vespalib/test/nexus.h b/vespalib/src/vespa/vespalib/test/nexus.h index 659b563e43c..7be35d42463 100644 --- a/vespalib/src/vespa/vespalib/test/nexus.h +++ b/vespalib/src/vespa/vespalib/test/nexus.h @@ -36,6 +36,7 @@ public: Nexus &operator=(const Nexus &) = delete; size_t num_threads() const noexcept { return _vote.size(); } size_t thread_id() const noexcept { return _thread_id; } + bool is_main() const noexcept { return _thread_id == 0; } bool vote(bool my_vote) { return _vote(my_vote); } void barrier() { REQUIRE_EQ(_vote(true), true); } struct select_thread_0 {}; diff --git a/vespalib/src/vespa/vespalib/test/thread_meets.h b/vespalib/src/vespa/vespalib/test/thread_meets.h index 7ef4dcb9921..26ca560641d 100644 --- a/vespalib/src/vespa/vespalib/test/thread_meets.h +++ b/vespalib/src/vespa/vespalib/test/thread_meets.h @@ -38,8 +38,8 @@ struct ThreadMeets { explicit Sum(size_t N) : vespalib::Rendezvous<T,T>(N) {} T operator()(T value) { return rendezvous(value); } void mingle() override { - T acc{}; - for (size_t i = 0; i < size(); ++i) { + T acc = in(0); + for (size_t i = 1; i < size(); ++i) { acc += in(i); } for (size_t i = 0; i < size(); ++i) { @@ -47,6 +47,48 @@ struct ThreadMeets { } } }; + // maximum of values across all threads + template <typename T> + struct Max : vespalib::Rendezvous<T,T> { + using vespalib::Rendezvous<T,T>::in; + using vespalib::Rendezvous<T,T>::out; + using vespalib::Rendezvous<T,T>::size; + using vespalib::Rendezvous<T,T>::rendezvous; + explicit Max(size_t N) : vespalib::Rendezvous<T,T>(N) {} + T operator()(T value) { return rendezvous(value); } + void mingle() override { + T max = in(0); + for (size_t i = 1; i < size(); ++i) { + if (in(i) > max) { + max = in(i); + } + } + for (size_t i = 0; i < size(); ++i) { + out(i) = max; + } + } + }; + // minimum of values across all threads + template <typename T> + struct Min : vespalib::Rendezvous<T,T> { + using vespalib::Rendezvous<T,T>::in; + using vespalib::Rendezvous<T,T>::out; + using vespalib::Rendezvous<T,T>::size; + using vespalib::Rendezvous<T,T>::rendezvous; + explicit Min(size_t N) : vespalib::Rendezvous<T,T>(N) {} + T operator()(T value) { return rendezvous(value); } + void mingle() override { + T min = in(0); + for (size_t i = 1; i < size(); ++i) { + if (in(i) < min) { + min = in(i); + } + } + for (size_t i = 0; i < size(); ++i) { + out(i) = min; + } + } + }; // range of values across all threads template <typename T> struct Range : vespalib::Rendezvous<T,T> { |