Merge pull request #20915 from vespa-engine/havardpe/proton-cpu-util-metrics

added cpu util metrics
author: Geir Storli <geirst@yahooinc.com> 2022-01-25 13:51:23 +0100
committer: GitHub <noreply@github.com> 2022-01-25 13:51:23 +0100
commit: b204e44404142250b79be20c492951de1287609d (patch)
tree: 68a2d1248a1c08125cc1b40945495b33764d1504
parent: 0dcd724d233f3833180030907e969007a81a9682 (diff)
parent: 949932e310963df295c27af0dc62b77d4625288a (diff)
7 files changed, 104 insertions, 64 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp
index 9bc20f95d13..9cf0339986c 100644
--- a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp
+++ b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp
@@ -4,6 +4,18 @@
 
 namespace proton {
 
+ResourceUsageMetrics::CpuUtilMetrics::CpuUtilMetrics(metrics::MetricSet *parent) // builds the "cpu_util" metric set; 'parent' is forwarded to the MetricSet base
+  : MetricSet("cpu_util", {}, "Unnormalized cpu utilization for various categories", parent),
+    setup("setup", {}, "cpu used by system init and (re-)configuration", this), // every value metric below gets this set as its last constructor argument
+    read("read", {}, "cpu used by reading data from the system", this),
+    write("write", {}, "cpu used by writing data to the system", this),
+    compact("compact", {}, "cpu used by internal data re-structuring", this),
+    other("other", {}, "cpu used by work not classified as a specific category", this)
+{ // nothing to do beyond constructing the base and the five members
+}
+
+ResourceUsageMetrics::CpuUtilMetrics::~CpuUtilMetrics() = default;
+
 ResourceUsageMetrics::ResourceUsageMetrics(metrics::MetricSet *parent)
     : MetricSet("resource_usage", {}, "Usage metrics for various resources in this search engine", parent),
       disk("disk", {}, "The relative amount of disk space used on this machine (value in the range [0, 1])", this),
@@ -15,7 +27,8 @@ ResourceUsageMetrics::ResourceUsageMetrics(metrics::MetricSet *parent)
       memoryMappings("memory_mappings", {}, "The number of mapped memory areas", this),
       openFileDescriptors("open_file_descriptors", {}, "The number of open files", this),
       feedingBlocked("feeding_blocked", {}, "Whether feeding is blocked due to resource limits being reached (value is either 0 or 1)", this),
-      mallocArena("malloc_arena", {}, "Size of malloc arena", this)
+      mallocArena("malloc_arena", {}, "Size of malloc arena", this),
+      cpu_util(this)
 {
 }
 
diff --git a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h
index 774bb645c84..35100084cf7 100644
--- a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h
+++ b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h
@@ -12,6 +12,17 @@ namespace proton {
  */
 struct ResourceUsageMetrics : metrics::MetricSet
 {
+    struct CpuUtilMetrics : metrics::MetricSet { // nested metric set holding one cpu utilization value per category
+        metrics::DoubleValueMetric setup; // cpu used by system init and (re-)configuration
+        metrics::DoubleValueMetric read; // cpu used by reading data from the system
+        metrics::DoubleValueMetric write; // cpu used by writing data to the system
+        metrics::DoubleValueMetric compact; // cpu used by internal data re-structuring
+        metrics::DoubleValueMetric other; // cpu used by work not classified as a specific category
+
+        CpuUtilMetrics(metrics::MetricSet *parent); // 'parent' is handed on to the MetricSet base class
+        ~CpuUtilMetrics();
+    };
+
     metrics::DoubleValueMetric disk;
     metrics::DoubleValueMetric diskUtilization;
     metrics::DoubleValueMetric memory;
@@ -22,6 +33,7 @@ struct ResourceUsageMetrics : metrics::MetricSet
     metrics::LongValueMetric openFileDescriptors;
     metrics::LongValueMetric feedingBlocked;
     metrics::LongValueMetric mallocArena;
+    CpuUtilMetrics           cpu_util;
 
     ResourceUsageMetrics(metrics::MetricSet *parent);
     ~ResourceUsageMetrics();
diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.cpp b/searchcore/src/vespa/searchcore/proton/server/proton.cpp
index b128fe16e5e..b2a86102cd1 100644
--- a/searchcore/src/vespa/searchcore/proton/server/proton.cpp
+++ b/searchcore/src/vespa/searchcore/proton/server/proton.cpp
@@ -61,6 +61,7 @@ using vespalib::Slime;
 using vespalib::makeLambdaTask;
 using vespalib::slime::ArrayInserter;
 using vespalib::slime::Cursor;
+using CpuCategory = vespalib::CpuUsage::Category;
 
 using search::transactionlog::DomainStats;
 using vespa::config::search::core::ProtonConfig;
@@ -207,6 +208,7 @@ Proton::Proton(const config::ConfigUri & configUri,
       StatusProducer(),
       IPersistenceEngineOwner(),
       ComponentConfigProducer(),
+      _cpu_util(),
       _configUri(configUri),
       _mutex(),
       _metricsHook(std::make_unique<MetricsUpdateHook>(*this)),
@@ -775,6 +777,12 @@ Proton::updateMetrics(const metrics::MetricLockGuard &)
 #else
         metrics.resourceUsage.mallocArena.set(UINT64_C(0));
 #endif
+        auto cpu_util = _cpu_util.get_util();
+        metrics.resourceUsage.cpu_util.setup.set(cpu_util[CpuCategory::SETUP]);
+        metrics.resourceUsage.cpu_util.read.set(cpu_util[CpuCategory::READ]);
+        metrics.resourceUsage.cpu_util.write.set(cpu_util[CpuCategory::WRITE]);
+        metrics.resourceUsage.cpu_util.compact.set(cpu_util[CpuCategory::COMPACT]);
+        metrics.resourceUsage.cpu_util.other.set(cpu_util[CpuCategory::OTHER]);
     }
     {
         ContentProtonMetrics::ProtonExecutorMetrics &metrics = _metricsEngine->root().executor;
diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.h b/searchcore/src/vespa/searchcore/proton/server/proton.h
index 573f215c722..90a257a0aaa 100644
--- a/searchcore/src/vespa/searchcore/proton/server/proton.h
+++ b/searchcore/src/vespa/searchcore/proton/server/proton.h
@@ -25,6 +25,7 @@
 #include <vespa/vespalib/net/json_handler_repo.h>
 #include <vespa/vespalib/net/state_explorer.h>
 #include <vespa/vespalib/util/varholder.h>
+#include <vespa/vespalib/util/cpu_usage.h>
 #include <mutex>
 #include <shared_mutex>
 
@@ -81,6 +82,7 @@ private:
         void setClusterName(const vespalib::string &clusterName, const vespalib::string &baseDir);
     };
 
+    vespalib::CpuUtil                      _cpu_util;
     const config::ConfigUri                _configUri;
     mutable std::shared_mutex              _mutex;
     std::unique_ptr<metrics::UpdateHook>   _metricsHook;
diff --git a/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp b/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp
index fde02dc8435..83f49d6c73b 100644
--- a/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp
+++ b/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp
@@ -68,22 +68,22 @@ void verify_sampling(size_t thread_id, size_t num_threads, std::vector<Sampler*>
         auto post_total = cpu_usage::total_cpu_usage();
         TEST_BARRIER(); // #4
         double wall = to_s(t1 - t0);
-        std::vector<double> load(4, 0.0);
+        std::vector<double> util(4, 0.0);
         for (size_t i = 0; i < 4; ++i) {
-            load[i] = to_s(post_usage[i] - pre_usage[i]) / wall;
+            util[i] = to_s(post_usage[i] - pre_usage[i]) / wall;
         }
-        double total_load = to_s(post_total - pre_total) / wall;
-        EXPECT_GREATER(load[3], load[0]);
-        // NB: cannot expect total_load to be greater than load[3]
-        // here due to mock loads being 'as expected' while valgrind
-        // will cut all loads in about half.
-        EXPECT_GREATER(total_load, load[0]);
-        fprintf(stderr, "loads: { %.3f, %.3f, %.3f, %.3f }\n", load[0], load[1], load[2], load[3]);
-        fprintf(stderr, "total load: %.3f\n", total_load);
+        double total_util = to_s(post_total - pre_total) / wall;
+        EXPECT_GREATER(util[3], util[0]);
+        // NB: cannot expect total_util to be greater than util[3]
+        // here due to mock utils being 'as expected' while valgrind
+        // will cut all utils in about half.
+        EXPECT_GREATER(total_util, util[0]);
+        fprintf(stderr, "utils: { %.3f, %.3f, %.3f, %.3f }\n", util[0], util[1], util[2], util[3]);
+        fprintf(stderr, "total util: %.3f\n", total_util);
     } else {
         int idx = (thread_id - 1);
-        double target_load = double(thread_id - 1) / (num_threads - 2);
-        auto sampler = cpu_usage::create_thread_sampler(force_mock, target_load);
+        double target_util = double(thread_id - 1) / (num_threads - 2);
+        auto sampler = cpu_usage::create_thread_sampler(force_mock, target_util);
         samplers[idx] = sampler.get();
         TEST_BARRIER(); // #1
         TEST_BARRIER(); // #2
@@ -97,7 +97,7 @@ void verify_sampling(size_t thread_id, size_t num_threads, std::vector<Sampler*>
 
 //-----------------------------------------------------------------------------
 
-TEST_MT_F("require that dummy thread-based CPU usage sampling with known expected load works", 5, std::vector<Sampler*>(4, nullptr)) {
+TEST_MT_F("require that dummy thread-based CPU usage sampling with known expected util works", 5, std::vector<Sampler*>(4, nullptr)) {
     TEST_DO(verify_sampling(thread_id, num_threads, f1, true));
 }
 
@@ -396,46 +396,18 @@ TEST("require that thread tracker implementation does not track OTHER cpu use")
 
 //-----------------------------------------------------------------------------
 
-// prototype for the class we want to use when integrating CPU usage
-// into metrics as load values. NB: this class is not thread safe.
-
-class CpuMonitor {
-private:
-    duration _min_delay;
-    CpuUsage::TimedSample _old_sample;
-    std::array<double,CpuUsage::num_categories> _load;
-
-public:
-    CpuMonitor(duration min_delay)
-      : _min_delay(min_delay),
-        _old_sample(CpuUsage::sample()),
-        _load() {}
-
-    std::array<double,CpuUsage::num_categories> get_load() {
-        if (steady_clock::now() >= (_old_sample.first + _min_delay)) {
-            auto new_sample = CpuUsage::sample();
-            auto dt = to_s(new_sample.first - _old_sample.first);
-            for (size_t i = 0; i < CpuUsage::num_categories; ++i) {
-                _load[i] = to_s(new_sample.second[i] - _old_sample.second[i]) / dt;
-            }
-            _old_sample = new_sample;
-        }
-        return _load;
-    }
-};
-
 void do_sample_cpu_usage(const EndTime &end_time) {
     auto my_usage = CpuUsage::use(CpuUsage::Category::SETUP);
-    CpuMonitor monitor(8ms);
+    CpuUtil cpu(8ms);
     while (!end_time()) {
         std::this_thread::sleep_for(verbose ? 1s : 10ms);
-        auto load = monitor.get_load();
+        auto util = cpu.get_util();
         vespalib::string body;
-        for (size_t i = 0; i < load.size(); ++i) {
+        for (size_t i = 0; i < util.size(); ++i) {
             if (!body.empty()) {
                 body.append(", ");
             }
-            body.append(fmt("%s: %.3f", CpuUsage::name_of(CpuUsage::Category(i)).c_str(), load[i]));
+            body.append(fmt("%s: %.3f", CpuUsage::name_of(CpuUsage::Category(i)).c_str(), util[i]));
         }
         fprintf(stderr, "CPU: %s\n", body.c_str());
     }
diff --git a/vespalib/src/vespa/vespalib/util/cpu_usage.cpp b/vespalib/src/vespa/vespalib/util/cpu_usage.cpp
index 5609b0d8d09..5307239118d 100644
--- a/vespalib/src/vespa/vespalib/util/cpu_usage.cpp
+++ b/vespalib/src/vespa/vespalib/util/cpu_usage.cpp
@@ -17,11 +17,11 @@ namespace {
 class DummyThreadSampler : public ThreadSampler {
 private:
     steady_time _start;
-    double _load;
+    double _util; // factor applied to elapsed wall time to produce the fake cpu usage
 public:
-    DummyThreadSampler(double load) : _start(steady_clock::now()), _load(load) {}
+    DummyThreadSampler(double util) : _start(steady_clock::now()), _util(util) {} // the construction time is the start of the fake measurement
     duration sample() const noexcept override {
-        return from_s(to_s(steady_clock::now() - _start) * _load);
+        return from_s(to_s(steady_clock::now() - _start) * _util); // time elapsed since _start, scaled by _util
     }
 };
 
@@ -53,14 +53,14 @@ duration total_cpu_usage() noexcept {
         return from_timespec(ts);
 }
 
-ThreadSampler::UP create_thread_sampler(bool force_mock_impl, double expected_load) {
+ThreadSampler::UP create_thread_sampler(bool force_mock_impl, double expected_util) { // expected_util is only consumed by the dummy sampler
     if (force_mock_impl) {
-        return std::make_unique<DummyThreadSampler>(expected_load);
+        return std::make_unique<DummyThreadSampler>(expected_util); // mock explicitly requested by the caller
     }
 #ifdef __linux__
     return std::make_unique<LinuxThreadSampler>();
 #endif
-    return std::make_unique<DummyThreadSampler>(expected_load);
+    return std::make_unique<DummyThreadSampler>(expected_util); // fallback; only reached when __linux__ is not defined
 }
 
 } // cpu_usage
diff --git a/vespalib/src/vespa/vespalib/util/cpu_usage.h b/vespalib/src/vespa/vespalib/util/cpu_usage.h
index 3c30937151c..87e0a289e87 100644
--- a/vespalib/src/vespa/vespalib/util/cpu_usage.h
+++ b/vespalib/src/vespa/vespalib/util/cpu_usage.h
@@ -28,7 +28,7 @@ struct ThreadSampler {
     virtual ~ThreadSampler() {}
 };
 
-ThreadSampler::UP create_thread_sampler(bool force_mock_impl = false, double expected_load = 0.16);
+ThreadSampler::UP create_thread_sampler(bool force_mock_impl = false, double expected_util = 0.16);
 
 } // cpu_usage
 
@@ -61,21 +61,25 @@ public:
     static constexpr size_t index_of(Category cat) { return static_cast<size_t>(cat); }
     static constexpr size_t num_categories = 5;
 
-    // A sample contains how much CPU has been spent in various
-    // categories.
-    class Sample {
+    template <typename T> // T is the value kept per category (used in this header with duration and with double)
+    class PerCategory {
+    private:
+        std::array<T,num_categories> _array; // one slot per category
+    public:
+        PerCategory() : _array() {} // value-initializes every slot
+        size_t size() const { return _array.size(); } // always num_categories
+        T &operator[](size_t idx) { return _array[idx]; } // plain index access, no bounds check
+        T &operator[](Category cat) { return _array[index_of(cat)]; } // category is mapped to its slot via index_of()
+        const T &operator[](size_t idx) const { return _array[idx]; }
+        const T &operator[](Category cat) const { return _array[index_of(cat)]; }
+    };
+
+    // A sample contains how much CPU has been spent in each category; storage and indexing are inherited from PerCategory<duration>.
+    class Sample : public PerCategory<duration> {
     public:
-        Sample() : _usage() {}
-        size_t size() const { return _usage.size(); }
-        duration &operator[](size_t idx) { return _usage[idx]; }
-        duration &operator[](Category cat) { return _usage[index_of(cat)]; }
-        const duration &operator[](size_t idx) const { return _usage[idx]; }
-        const duration &operator[](Category cat) const { return _usage[index_of(cat)]; }
         void merge(const Sample &rhs) {
             for (size_t i = 0; i < size(); ++i) {
-                _usage[i] += rhs._usage[i];
+                (*this)[i] += rhs[i]; // add rhs into this sample slot by slot, using the inherited operator[]
             }
         }
     };
@@ -176,4 +180,33 @@ public:
     static TimedSample sample();
 };
 
+/**
+ * Tracks cpu utilization per CpuUsage category: get_util() reports cpu time spent divided by the time elapsed between two samples. No locking is done here.
+ **/
+class CpuUtil
+{
+private:
+    duration _min_delay; // minimum time that must pass before get_util() takes a new sample
+    CpuUsage::TimedSample _old_sample; // baseline: .first is the time point, .second the per-category cpu usage
+    CpuUsage::PerCategory<double> _util; // last computed values, returned unchanged until the next re-sampling
+
+public:
+    CpuUtil(duration min_delay = 850ms) // takes the baseline sample at construction time
+      : _min_delay(min_delay),
+        _old_sample(CpuUsage::sample()),
+        _util() {} // value-initialized, so get_util() reports 0.0 everywhere until the first re-sampling
+
+    CpuUsage::PerCategory<double> get_util() { // returns a copy of the current per-category utilization
+        if (steady_clock::now() >= (_old_sample.first + _min_delay)) { // re-sample only when at least _min_delay has passed
+            auto new_sample = CpuUsage::sample();
+            auto dt = to_s(new_sample.first - _old_sample.first); // time between the samples; NOTE(review): could be 0 if min_delay is 0 - confirm callers pass a positive delay
+            for (size_t i = 0; i < _util.size(); ++i) {
+                _util[i] = to_s(new_sample.second[i] - _old_sample.second[i]) / dt; // cpu time per elapsed time; not clamped to 1.0
+            }
+            _old_sample = new_sample; // the new sample becomes the baseline for the next call
+        }
+        return _util;
+    }
+};
+
 } // namespace
author	Geir Storli <geirst@yahooinc.com>	2022-01-25 13:51:23 +0100
committer	GitHub <noreply@github.com>	2022-01-25 13:51:23 +0100
commit	b204e44404142250b79be20c492951de1287609d (patch)
tree	68a2d1248a1c08125cc1b40945495b33764d1504
parent	0dcd724d233f3833180030907e969007a81a9682 (diff)
parent	949932e310963df295c27af0dc62b77d4625288a (diff)