author      Håvard Pettersen <havardpe@oath.com>    2020-11-20 15:05:03 +0000
committer   Håvard Pettersen <havardpe@oath.com>    2020-11-20 15:05:03 +0000
commit      7d305eb524afa7a5640af5b1d3581152633113c5 (patch)
tree        574d21a82b08159a99a8cf61c26e8f87f6695a79 /eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
parent      89f750817275de285f6d02b04b7fc521bf404af4 (diff)
use same loop_cnt when benchmarking if possible
Diffstat (limited to 'eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp')
-rw-r--r--  eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp  53
1 file changed, 47 insertions, 6 deletions
diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
index 49d21f2de52..e5bd8e7c054 100644
--- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
+++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
@@ -377,9 +377,46 @@ struct EvalOp {
     {
     }
     TensorSpec result() { return impl.create_spec(single.eval(stack)); }
-    double estimate_cost_us() {
-        auto actual = [&](){ single.eval(stack); };
-        return BenchmarkTimer::benchmark(actual, budget) * 1000.0 * 1000.0;
+    size_t suggest_loop_cnt() {
+        size_t loop_cnt = 1;
+        auto my_loop = [&](){
+            for (size_t i = 0; i < loop_cnt; ++i) {
+                single.eval(stack);
+            }
+        };
+        for (;;) {
+            vespalib::BenchmarkTimer timer(0.0);
+            for (size_t i = 0; i < 5; ++i) {
+                timer.before();
+                my_loop();
+                timer.after();
+            }
+            double min_time = timer.min_time();
+            if (min_time > 0.004) {
+                break;
+            } else {
+                loop_cnt *= 2;
+            }
+        }
+        return std::max(loop_cnt, size_t(8));
+    }
+    double estimate_cost_us(size_t self_loop_cnt, size_t ref_loop_cnt) {
+        size_t loop_cnt = ((self_loop_cnt * 128) < ref_loop_cnt) ? self_loop_cnt : ref_loop_cnt;
+        assert((loop_cnt % 8) == 0);
+        auto my_loop = [&](){
+            for (size_t i = 0; (i + 7) < loop_cnt; i += 8) {
+                for (size_t j = 0; j < 8; ++j) {
+                    single.eval(stack);
+                }
+            }
+        };
+        BenchmarkTimer timer(budget);
+        while (timer.has_budget()) {
+            timer.before();
+            my_loop();
+            timer.after();
+        }
+        return timer.min_time() * 1000.0 * 1000.0 / double(loop_cnt);
     }
 };
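
Note: the new suggest_loop_cnt() calibrates a per-implementation loop count by doubling it until the fastest of five timed runs of the inner loop exceeds 4 ms, and estimate_cost_us() then measures the operation in blocks of 8 iterations and reports the best per-operation time in microseconds. Below is a minimal standalone sketch of the same calibrate-then-measure pattern, using std::chrono instead of vespalib::BenchmarkTimer; the op callback, the time_once helper, and the main driver are illustrative additions, not part of the patch.

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <functional>

// Wall-clock time of one call, in seconds.
static double time_once(const std::function<void()> &fn) {
    auto t0 = std::chrono::steady_clock::now();
    fn();
    auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(t1 - t0).count();
}

// Double loop_cnt until the fastest of 5 timed runs of the inner loop
// takes more than 4 ms, then return at least 8 (loop_cnt is a power of
// two, so >= 8 also means a multiple of 8).
static size_t suggest_loop_cnt(const std::function<void()> &op) {
    size_t loop_cnt = 1;
    for (;;) {
        double min_time = 1e9;
        for (size_t i = 0; i < 5; ++i) {
            min_time = std::min(min_time, time_once([&]{
                for (size_t j = 0; j < loop_cnt; ++j) { op(); }
            }));
        }
        if (min_time > 0.004) {
            break;
        }
        loop_cnt *= 2;
    }
    return std::max(loop_cnt, size_t(8));
}

// Run the operation in blocks of 8 until the time budget is spent and
// report the best per-operation time in microseconds.
static double estimate_cost_us(const std::function<void()> &op,
                               size_t loop_cnt, double budget_s) {
    double spent = 0.0;
    double min_time = 1e9;
    while (spent < budget_s) {
        double t = time_once([&]{
            for (size_t i = 0; (i + 7) < loop_cnt; i += 8) {
                for (size_t j = 0; j < 8; ++j) { op(); }
            }
        });
        spent += t;
        min_time = std::min(min_time, t);
    }
    return min_time * 1000.0 * 1000.0 / double(loop_cnt);
}

int main() {
    volatile double sink = 0.0;
    auto op = [&]{ for (int i = 0; i < 1000; ++i) { sink = sink + i; } };
    size_t loop_cnt = suggest_loop_cnt(op);
    printf("loop_cnt: %zu, cost: %.3f us\n",
           loop_cnt, estimate_cost_us(op, loop_cnt, 0.25));
}

Taking the minimum over repeated runs filters out scheduling noise, and blocking the inner loop by 8 keeps loop overhead small relative to very cheap operations.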
@@ -397,8 +434,12 @@ void benchmark(const vespalib::string &desc, const std::vector<EvalOp::UP> &list
         }
     }
     BenchmarkResult result(desc, list.size());
+    std::vector<size_t> loop_cnt(list.size());
+    for (const auto &eval: list) {
+        loop_cnt[eval->impl.order] = eval->suggest_loop_cnt();
+    }
     for (const auto &eval: list) {
-        double time = eval->estimate_cost_us();
+        double time = eval->estimate_cost_us(loop_cnt[eval->impl.order], loop_cnt[1]);
         result.sample(eval->impl.order, time);
         fprintf(stderr, " %s(%s): %10.3f us\n", eval->impl.name.c_str(), eval->impl.short_name.c_str(), time);
     }
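
Note: the driver now collects a suggested loop count from every implementation up front and passes a common reference count (loop_cnt[1], which by the look of the surrounding code is the count suggested by the implementation with order 1) so that, where possible, all implementations are measured over the same number of evaluations. The 128x escape hatch in estimate_cost_us() above keeps a much slower implementation from blowing the time budget. A small hypothetical helper isolating that selection rule (the function name pick_loop_cnt is illustrative, not from the patch):

#include <cassert>
#include <cstddef>

// Use the reference implementation's loop count so every implementation
// does the same amount of work per sample, unless this implementation is
// more than 128x slower than the reference (its own suggestion is more
// than 128x smaller), in which case fall back to its own suggestion to
// stay within the time budget.
size_t pick_loop_cnt(size_t self_loop_cnt, size_t ref_loop_cnt) {
    size_t loop_cnt = ((self_loop_cnt * 128) < ref_loop_cnt) ? self_loop_cnt : ref_loop_cnt;
    assert((loop_cnt % 8) == 0); // both counts are powers of two >= 8
    return loop_cnt;
}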
@@ -620,11 +661,11 @@ void benchmark_encode_decode(const vespalib::string &desc, const TensorSpec &pro
     BenchmarkResult encode_result(desc + " <encode>", impl_list.size());
     BenchmarkResult decode_result(desc + " <decode>", impl_list.size());
     for (const Impl &impl: impl_list) {
-        constexpr size_t loop_cnt = 16;
+        constexpr size_t loop_cnt = 32;
         auto value = impl.create_value(proto);
         BenchmarkTimer encode_timer(2 * budget);
         BenchmarkTimer decode_timer(2 * budget);
-        while (encode_timer.has_budget() || decode_timer.has_budget()) {
+        while (encode_timer.has_budget()) {
             std::array<vespalib::nbostream, loop_cnt> data;
             std::array<Value::UP, loop_cnt> object;
             encode_timer.before();
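
Note: this hunk doubles the encode/decode batch size from 16 to 32 and drops decode_timer from the loop condition; presumably both timers are advanced exactly once per pass (the rest of the loop body is not shown here), so checking the encode timer's budget alone is enough to bound the loop. A rough standalone sketch of that lock-step two-timer pattern, with a stand-in SketchTimer instead of vespalib::BenchmarkTimer and string copies in place of the real tensor encode/decode:

#include <algorithm>
#include <array>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <string>

// Stand-in with the before()/after()/has_budget()/min_time() shape seen
// in the diff; the real class is vespalib::BenchmarkTimer.
struct SketchTimer {
    using Clock = std::chrono::steady_clock;
    double budget;
    double spent = 0.0;
    double best = 1e9;
    Clock::time_point t0;
    explicit SketchTimer(double budget_in) : budget(budget_in) {}
    bool has_budget() const { return spent < budget; }
    void before() { t0 = Clock::now(); }
    void after() {
        double t = std::chrono::duration<double>(Clock::now() - t0).count();
        spent += t;
        best = std::min(best, t);
    }
    double min_time() const { return best; }
};

int main() {
    constexpr size_t loop_cnt = 32;
    std::string value(4096, 'x'); // stand-in for a tensor value
    SketchTimer encode_timer(0.1);
    SketchTimer decode_timer(0.1);
    // Both timers advance exactly once per pass, so checking only the
    // encode timer's budget bounds the whole loop.
    while (encode_timer.has_budget()) {
        std::array<std::string, loop_cnt> data;   // stand-in for nbostream buffers
        std::array<std::string, loop_cnt> object; // stand-in for decoded values
        encode_timer.before();
        for (size_t i = 0; i < loop_cnt; ++i) { data[i] = value; }     // "encode"
        encode_timer.after();
        decode_timer.before();
        for (size_t i = 0; i < loop_cnt; ++i) { object[i] = data[i]; } // "decode"
        decode_timer.after();
    }
    printf("encode: %.3f us, decode: %.3f us\n",
           encode_timer.min_time() * 1e6 / loop_cnt,
           decode_timer.min_time() * 1e6 / loop_cnt);
}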