Merge pull request #15412 from vespa-engine/havardpe/improved-benchmarking-fairness

Havardpe/improved benchmarking fairness
author: Arne H Juul <arnej27959@users.noreply.github.com> 2020-11-21 18:12:03 +0100
committer: GitHub <noreply@github.com> 2020-11-21 18:12:03 +0100
commit: ce02f0498515a3de7b8559fc8adce0b47dcbd2e5 (patch)
tree: 8ce01aad5d7adec83f59b864f2e9e0eb4de7b4df /eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
parent: 1d82ff21476a9918ded0b70242397b7ee3157096 (diff)
parent: 7d305eb524afa7a5640af5b1d3581152633113c5 (diff)
1 files changed, 80 insertions, 29 deletions
diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
index 1a796aac88f..e4c1af3100a 100644
--- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
+++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
@@ -354,6 +354,7 @@ MyParam::~MyParam() = default;
 
 struct EvalOp {
     using UP = std::unique_ptr<EvalOp>;
+    Stash                    my_stash;
     const Impl              &impl;
     MyParam                  my_param;
     std::vector<Value::UP>   values;
@@ -361,8 +362,8 @@ struct EvalOp {
     EvalSingle               single;
     EvalOp(const EvalOp &) = delete;
     EvalOp &operator=(const EvalOp &) = delete;
-    EvalOp(Instruction op, const std::vector<CREF<TensorSpec>> &stack_spec, const Impl &impl_in)
-        : impl(impl_in), my_param(), values(), stack(), single(impl.engine, op)
+    EvalOp(Stash &&stash_in, Instruction op, const std::vector<CREF<TensorSpec>> &stack_spec, const Impl &impl_in)
+        : my_stash(std::move(stash_in)), impl(impl_in), my_param(), values(), stack(), single(impl.engine, op)
     {
         for (const TensorSpec &spec: stack_spec) {
             values.push_back(impl.create_value(spec));
@@ -371,14 +372,51 @@ struct EvalOp {
             stack.push_back(*value.get());
         }
     }
-    EvalOp(Instruction op, const TensorSpec &p0, const Impl &impl_in)
-        : impl(impl_in), my_param(p0, impl), values(), stack(), single(impl.engine, op, my_param)
+    EvalOp(Stash &&stash_in, Instruction op, const TensorSpec &p0, const Impl &impl_in) // overload taking one parameter spec; values and stack are left empty
+        : my_stash(std::move(stash_in)), impl(impl_in), my_param(p0, impl), values(), stack(), single(impl.engine, op, my_param) // stash is moved into this object; NOTE(review): presumably so data referenced by op lives as long as the EvalOp -- confirm
     {
     }
     TensorSpec result() { return impl.create_spec(single.eval(stack)); }
-    double estimate_cost_us() {
-        auto actual = [&](){ single.eval(stack); };
-        return BenchmarkTimer::benchmark(actual, budget) * 1000.0 * 1000.0;
+    size_t suggest_loop_cnt() { // Finds how many eval calls one timed sample should contain; the returned count is never below 8.
+        size_t loop_cnt = 1; // candidate count; captured by reference below, so doubling it lengthens my_loop
+        auto my_loop = [&](){
+            for (size_t i = 0; i < loop_cnt; ++i) {
+                single.eval(stack);
+            }
+        };
+        for (;;) { // keep doubling loop_cnt until one sample is long enough
+            vespalib::BenchmarkTimer timer(0.0); // fresh timer per attempt, driven manually via before()/after(); NOTE(review): 0.0 is presumably a zero time budget -- confirm
+            for (size_t i = 0; i < 5; ++i) { // take 5 samples at the current loop_cnt
+                timer.before();
+                my_loop();
+                timer.after();
+            }
+            double min_time = timer.min_time();
+            if (min_time > 0.004) { // NOTE(review): presumably seconds (estimate_cost_us scales min_time() by 1e6), i.e. stop above 4 ms -- confirm
+                break;
+            } else {
+                loop_cnt *= 2; // starts at 1, so it is always a power of two
+            }
+        }
+        return std::max(loop_cnt, size_t(8)); // a power of two that is at least 8 is divisible by 8, which estimate_cost_us asserts
+    }
+    double estimate_cost_us(size_t self_loop_cnt, size_t ref_loop_cnt) { // Returns the smallest sampled time divided by the number of eval calls per sample, scaled by 1e6 (microseconds if min_time() is in seconds -- TODO confirm).
+        size_t loop_cnt = ((self_loop_cnt * 128) < ref_loop_cnt) ? self_loop_cnt : ref_loop_cnt; // use the reference count unless it is more than 128 times our own suggestion
+        assert((loop_cnt % 8) == 0); // the loop below advances in steps of 8
+        auto my_loop = [&](){
+            for (size_t i = 0; (i + 7) < loop_cnt; i += 8) { // loop_cnt / 8 outer iterations
+                for (size_t j = 0; j < 8; ++j) { // fixed trip count of 8; NOTE(review): presumably to allow unrolling and reduce loop overhead -- confirm
+                    single.eval(stack);
+                }
+            }
+        };
+        BenchmarkTimer timer(budget); // budget is defined outside this block
+        while (timer.has_budget()) { // sample repeatedly until the timer reports its budget is used up
+            timer.before();
+            my_loop();
+            timer.after();
+        }
+        return timer.min_time() * 1000.0 * 1000.0 / double(loop_cnt); // divide by loop_cnt to get the cost of a single eval call
     }
 };
 
@@ -396,8 +434,12 @@ void benchmark(const vespalib::string &desc, const std::vector<EvalOp::UP> &list
         }
     }
     BenchmarkResult result(desc, list.size());
+    std::vector<size_t> loop_cnt(list.size());
     for (const auto &eval: list) {
-        double time = eval->estimate_cost_us();
+        loop_cnt[eval->impl.order] = eval->suggest_loop_cnt();
+    }
+    for (const auto &eval: list) {
+        double time = eval->estimate_cost_us(loop_cnt[eval->impl.order], loop_cnt[1]);
         result.sample(eval->impl.order, time);
         fprintf(stderr, "    %s(%s): %10.3f us\n", eval->impl.name.c_str(), eval->impl.short_name.c_str(), time);
     }
@@ -420,9 +462,10 @@ void benchmark_join(const vespalib::string &desc, const TensorSpec &lhs,
     ASSERT_FALSE(res_type.is_error());
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_join(lhs_type, rhs_type, function, stash);
+        Stash my_stash;
+        auto op = impl.create_join(lhs_type, rhs_type, function, my_stash);
         std::vector<CREF<TensorSpec>> stack_spec({lhs, rhs});
-        list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, stack_spec, impl));
     }
     benchmark(desc, list);
 }
@@ -439,9 +482,10 @@ void benchmark_reduce(const vespalib::string &desc, const TensorSpec &lhs,
     ASSERT_FALSE(res_type.is_error());
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_reduce(lhs_type, aggr, dims, stash);
+        Stash my_stash;
+        auto op = impl.create_reduce(lhs_type, aggr, dims, my_stash);
         std::vector<CREF<TensorSpec>> stack_spec({lhs});
-        list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, stack_spec, impl));
     }
     benchmark(desc, list);
 }
@@ -459,9 +503,10 @@ void benchmark_rename(const vespalib::string &desc, const TensorSpec &lhs,
     ASSERT_FALSE(res_type.is_error());
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_rename(lhs_type, from, to, stash);
+        Stash my_stash;
+        auto op = impl.create_rename(lhs_type, from, to, my_stash);
         std::vector<CREF<TensorSpec>> stack_spec({lhs});
-        list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, stack_spec, impl));
     }
     benchmark(desc, list);
 }
@@ -480,9 +525,10 @@ void benchmark_merge(const vespalib::string &desc, const TensorSpec &lhs,
     ASSERT_FALSE(res_type.is_error());
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_merge(lhs_type, rhs_type, function, stash);
+        Stash my_stash;
+        auto op = impl.create_merge(lhs_type, rhs_type, function, my_stash);
         std::vector<CREF<TensorSpec>> stack_spec({lhs, rhs});
-        list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, stack_spec, impl));
     }
     benchmark(desc, list);
 }
@@ -496,9 +542,10 @@ void benchmark_map(const vespalib::string &desc, const TensorSpec &lhs, operatio
     ASSERT_FALSE(lhs_type.is_error());
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_map(lhs_type, function, stash);
+        Stash my_stash;
+        auto op = impl.create_map(lhs_type, function, my_stash);
         std::vector<CREF<TensorSpec>> stack_spec({lhs});
-        list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, stack_spec, impl));
     }
     benchmark(desc, list);
 }
@@ -517,9 +564,10 @@ void benchmark_concat(const vespalib::string &desc, const TensorSpec &lhs,
     ASSERT_FALSE(res_type.is_error());
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_concat(lhs_type, rhs_type, dimension, stash);
+        Stash my_stash;
+        auto op = impl.create_concat(lhs_type, rhs_type, dimension, my_stash);
         std::vector<CREF<TensorSpec>> stack_spec({lhs, rhs});
-        list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, stack_spec, impl));
     }
     benchmark(desc, list);
 }
@@ -536,10 +584,11 @@ void benchmark_tensor_create(const vespalib::string &desc, const TensorSpec &pro
     }
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_tensor_create(proto_type, proto, stash);
-        list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+        Stash my_stash;
+        auto op = impl.create_tensor_create(proto_type, proto, my_stash);
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, stack_spec, impl));
     }
-    benchmark(desc, list);    
+    benchmark(desc, list);
 }
 
 //-----------------------------------------------------------------------------
@@ -550,8 +599,9 @@ void benchmark_tensor_lambda(const vespalib::string &desc, const ValueType &type
     ASSERT_FALSE(p0_type.is_error());
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_tensor_lambda(type, function, p0_type, stash);
-        list.push_back(std::make_unique<EvalOp>(op, p0, impl));
+        Stash my_stash;
+        auto op = impl.create_tensor_lambda(type, function, p0_type, my_stash);
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, p0, impl));
     }
     benchmark(desc, list);
 }
@@ -571,8 +621,9 @@ void benchmark_tensor_peek(const vespalib::string &desc, const TensorSpec &lhs,
     }
     std::vector<EvalOp::UP> list;
     for (const Impl &impl: impl_list) {
-        auto op = impl.create_tensor_peek(type, peek_spec, stash);
-        list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+        Stash my_stash;
+        auto op = impl.create_tensor_peek(type, peek_spec, my_stash);
+        list.push_back(std::make_unique<EvalOp>(std::move(my_stash), op, stack_spec, impl));
     }
     benchmark(desc, list);
 }
@@ -610,11 +661,11 @@ void benchmark_encode_decode(const vespalib::string &desc, const TensorSpec &pro
     BenchmarkResult encode_result(desc + " <encode>", impl_list.size());
     BenchmarkResult decode_result(desc + " <decode>", impl_list.size());
     for (const Impl &impl: impl_list) {
-        constexpr size_t loop_cnt = 16;
+        constexpr size_t loop_cnt = 32;
         auto value = impl.create_value(proto);
         BenchmarkTimer encode_timer(2 * budget);
         BenchmarkTimer decode_timer(2 * budget);
-        while (encode_timer.has_budget() || decode_timer.has_budget()) {
+        while (encode_timer.has_budget()) {
             std::array<vespalib::nbostream, loop_cnt> data;
             std::array<Value::UP, loop_cnt> object;
             encode_timer.before();
author	Arne H Juul <arnej27959@users.noreply.github.com>	2020-11-21 18:12:03 +0100
committer	GitHub <noreply@github.com>	2020-11-21 18:12:03 +0100
commit	ce02f0498515a3de7b8559fc8adce0b47dcbd2e5 (patch)
tree	8ce01aad5d7adec83f59b864f2e9e0eb4de7b4df /eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
parent	1d82ff21476a9918ded0b70242397b7ee3157096 (diff)
parent	7d305eb524afa7a5640af5b1d3581152633113c5 (diff)