diff options
author | Geir Storli <geirst@yahooinc.com> | 2022-01-25 13:51:23 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-25 13:51:23 +0100 |
commit | b204e44404142250b79be20c492951de1287609d (patch) | |
tree | 68a2d1248a1c08125cc1b40945495b33764d1504 | |
parent | 0dcd724d233f3833180030907e969007a81a9682 (diff) | |
parent | 949932e310963df295c27af0dc62b77d4625288a (diff) |
Merge pull request #20915 from vespa-engine/havardpe/proton-cpu-util-metrics
added cpu util metrics
7 files changed, 104 insertions, 64 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp index 9bc20f95d13..9cf0339986c 100644 --- a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp +++ b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp @@ -4,6 +4,18 @@ namespace proton { +ResourceUsageMetrics::CpuUtilMetrics::CpuUtilMetrics(metrics::MetricSet *parent) + : MetricSet("cpu_util", {}, "Unnormalized cpu utilization for various categories", parent), + setup("setup", {}, "cpu used by system init and (re-)configuration", this), + read("read", {}, "cpu used by reading data from the system", this), + write("write", {}, "cpu used by writing data to the system", this), + compact("compact", {}, "cpu used by internal data re-structuring", this), + other("other", {}, "cpu used by work not classified as a specific category", this) +{ +} + +ResourceUsageMetrics::CpuUtilMetrics::~CpuUtilMetrics() = default; + ResourceUsageMetrics::ResourceUsageMetrics(metrics::MetricSet *parent) : MetricSet("resource_usage", {}, "Usage metrics for various resources in this search engine", parent), disk("disk", {}, "The relative amount of disk space used on this machine (value in the range [0, 1])", this), @@ -15,7 +27,8 @@ ResourceUsageMetrics::ResourceUsageMetrics(metrics::MetricSet *parent) memoryMappings("memory_mappings", {}, "The number of mapped memory areas", this), openFileDescriptors("open_file_descriptors", {}, "The number of open files", this), feedingBlocked("feeding_blocked", {}, "Whether feeding is blocked due to resource limits being reached (value is either 0 or 1)", this), - mallocArena("malloc_arena", {}, "Size of malloc arena", this) + mallocArena("malloc_arena", {}, "Size of malloc arena", this), + cpu_util(this) { } diff --git a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h index 774bb645c84..35100084cf7 100644 --- a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h +++ b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h @@ -12,6 +12,17 @@ namespace proton { */ struct ResourceUsageMetrics : metrics::MetricSet { + struct CpuUtilMetrics : metrics::MetricSet { + metrics::DoubleValueMetric setup; + metrics::DoubleValueMetric read; + metrics::DoubleValueMetric write; + metrics::DoubleValueMetric compact; + metrics::DoubleValueMetric other; + + CpuUtilMetrics(metrics::MetricSet *parent); + ~CpuUtilMetrics(); + }; + metrics::DoubleValueMetric disk; metrics::DoubleValueMetric diskUtilization; metrics::DoubleValueMetric memory; @@ -22,6 +33,7 @@ struct ResourceUsageMetrics : metrics::MetricSet metrics::LongValueMetric openFileDescriptors; metrics::LongValueMetric feedingBlocked; metrics::LongValueMetric mallocArena; + CpuUtilMetrics cpu_util; ResourceUsageMetrics(metrics::MetricSet *parent); ~ResourceUsageMetrics(); diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.cpp b/searchcore/src/vespa/searchcore/proton/server/proton.cpp index b128fe16e5e..b2a86102cd1 100644 --- a/searchcore/src/vespa/searchcore/proton/server/proton.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/proton.cpp @@ -61,6 +61,7 @@ using vespalib::Slime; using vespalib::makeLambdaTask; using vespalib::slime::ArrayInserter; using vespalib::slime::Cursor; +using CpuCategory = vespalib::CpuUsage::Category; using search::transactionlog::DomainStats; using vespa::config::search::core::ProtonConfig; @@ -207,6 +208,7 @@ Proton::Proton(const config::ConfigUri & configUri, StatusProducer(), IPersistenceEngineOwner(), ComponentConfigProducer(), + _cpu_util(), _configUri(configUri), _mutex(), _metricsHook(std::make_unique<MetricsUpdateHook>(*this)), @@ -775,6 +777,12 @@ Proton::updateMetrics(const metrics::MetricLockGuard &) #else metrics.resourceUsage.mallocArena.set(UINT64_C(0)); #endif + auto cpu_util = _cpu_util.get_util(); + metrics.resourceUsage.cpu_util.setup.set(cpu_util[CpuCategory::SETUP]); + metrics.resourceUsage.cpu_util.read.set(cpu_util[CpuCategory::READ]); + metrics.resourceUsage.cpu_util.write.set(cpu_util[CpuCategory::WRITE]); + metrics.resourceUsage.cpu_util.compact.set(cpu_util[CpuCategory::COMPACT]); + metrics.resourceUsage.cpu_util.other.set(cpu_util[CpuCategory::OTHER]); } { ContentProtonMetrics::ProtonExecutorMetrics &metrics = _metricsEngine->root().executor; diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.h b/searchcore/src/vespa/searchcore/proton/server/proton.h index 573f215c722..90a257a0aaa 100644 --- a/searchcore/src/vespa/searchcore/proton/server/proton.h +++ b/searchcore/src/vespa/searchcore/proton/server/proton.h @@ -25,6 +25,7 @@ #include <vespa/vespalib/net/json_handler_repo.h> #include <vespa/vespalib/net/state_explorer.h> #include <vespa/vespalib/util/varholder.h> +#include <vespa/vespalib/util/cpu_usage.h> #include <mutex> #include <shared_mutex> @@ -81,6 +82,7 @@ private: void setClusterName(const vespalib::string &clusterName, const vespalib::string &baseDir); }; + vespalib::CpuUtil _cpu_util; const config::ConfigUri _configUri; mutable std::shared_mutex _mutex; std::unique_ptr<metrics::UpdateHook> _metricsHook; diff --git a/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp b/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp index fde02dc8435..83f49d6c73b 100644 --- a/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp +++ b/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp @@ -68,22 +68,22 @@ void verify_sampling(size_t thread_id, size_t num_threads, std::vector<Sampler*> auto post_total = cpu_usage::total_cpu_usage(); TEST_BARRIER(); // #4 double wall = to_s(t1 - t0); - std::vector<double> load(4, 0.0); + std::vector<double> util(4, 0.0); for (size_t i = 0; i < 4; ++i) { - load[i] = to_s(post_usage[i] - pre_usage[i]) / wall; + util[i] = to_s(post_usage[i] - pre_usage[i]) / wall; } - double total_load = to_s(post_total - pre_total) / wall; - EXPECT_GREATER(load[3], load[0]); - // NB: cannot expect total_load to be greater than load[3] - // here due to mock loads being 'as expected' while valgrind - // will cut all loads in about half. - EXPECT_GREATER(total_load, load[0]); - fprintf(stderr, "loads: { %.3f, %.3f, %.3f, %.3f }\n", load[0], load[1], load[2], load[3]); - fprintf(stderr, "total load: %.3f\n", total_load); + double total_util = to_s(post_total - pre_total) / wall; + EXPECT_GREATER(util[3], util[0]); + // NB: cannot expect total_util to be greater than util[3] + // here due to mock utils being 'as expected' while valgrind + // will cut all utils in about half. + EXPECT_GREATER(total_util, util[0]); + fprintf(stderr, "utils: { %.3f, %.3f, %.3f, %.3f }\n", util[0], util[1], util[2], util[3]); + fprintf(stderr, "total util: %.3f\n", total_util); } else { int idx = (thread_id - 1); - double target_load = double(thread_id - 1) / (num_threads - 2); - auto sampler = cpu_usage::create_thread_sampler(force_mock, target_load); + double target_util = double(thread_id - 1) / (num_threads - 2); + auto sampler = cpu_usage::create_thread_sampler(force_mock, target_util); samplers[idx] = sampler.get(); TEST_BARRIER(); // #1 TEST_BARRIER(); // #2 @@ -97,7 +97,7 @@ void verify_sampling(size_t thread_id, size_t num_threads, std::vector<Sampler*> //----------------------------------------------------------------------------- -TEST_MT_F("require that dummy thread-based CPU usage sampling with known expected load works", 5, std::vector<Sampler*>(4, nullptr)) { +TEST_MT_F("require that dummy thread-based CPU usage sampling with known expected util works", 5, std::vector<Sampler*>(4, nullptr)) { TEST_DO(verify_sampling(thread_id, num_threads, f1, true)); } @@ -396,46 +396,18 @@ TEST("require that thread tracker implementation does not track OTHER cpu use") //----------------------------------------------------------------------------- -// prototype for the class we want to use when integrating CPU usage -// into metrics as load values. NB: this class is not thread safe. - -class CpuMonitor { -private: - duration _min_delay; - CpuUsage::TimedSample _old_sample; - std::array<double,CpuUsage::num_categories> _load; - -public: - CpuMonitor(duration min_delay) - : _min_delay(min_delay), - _old_sample(CpuUsage::sample()), - _load() {} - - std::array<double,CpuUsage::num_categories> get_load() { - if (steady_clock::now() >= (_old_sample.first + _min_delay)) { - auto new_sample = CpuUsage::sample(); - auto dt = to_s(new_sample.first - _old_sample.first); - for (size_t i = 0; i < CpuUsage::num_categories; ++i) { - _load[i] = to_s(new_sample.second[i] - _old_sample.second[i]) / dt; - } - _old_sample = new_sample; - } - return _load; - } -}; - void do_sample_cpu_usage(const EndTime &end_time) { auto my_usage = CpuUsage::use(CpuUsage::Category::SETUP); - CpuMonitor monitor(8ms); + CpuUtil cpu(8ms); while (!end_time()) { std::this_thread::sleep_for(verbose ? 1s : 10ms); - auto load = monitor.get_load(); + auto util = cpu.get_util(); vespalib::string body; - for (size_t i = 0; i < load.size(); ++i) { + for (size_t i = 0; i < util.size(); ++i) { if (!body.empty()) { body.append(", "); } - body.append(fmt("%s: %.3f", CpuUsage::name_of(CpuUsage::Category(i)).c_str(), load[i])); + body.append(fmt("%s: %.3f", CpuUsage::name_of(CpuUsage::Category(i)).c_str(), util[i])); } fprintf(stderr, "CPU: %s\n", body.c_str()); } diff --git a/vespalib/src/vespa/vespalib/util/cpu_usage.cpp b/vespalib/src/vespa/vespalib/util/cpu_usage.cpp index 5609b0d8d09..5307239118d 100644 --- a/vespalib/src/vespa/vespalib/util/cpu_usage.cpp +++ b/vespalib/src/vespa/vespalib/util/cpu_usage.cpp @@ -17,11 +17,11 @@ namespace { class DummyThreadSampler : public ThreadSampler { private: steady_time _start; - double _load; + double _util; public: - DummyThreadSampler(double load) : _start(steady_clock::now()), _load(load) {} + DummyThreadSampler(double util) : _start(steady_clock::now()), _util(util) {} duration sample() const noexcept override { - return from_s(to_s(steady_clock::now() - _start) * _load); + return from_s(to_s(steady_clock::now() - _start) * _util); } }; @@ -53,14 +53,14 @@ duration total_cpu_usage() noexcept { return from_timespec(ts); } -ThreadSampler::UP create_thread_sampler(bool force_mock_impl, double expected_load) { +ThreadSampler::UP create_thread_sampler(bool force_mock_impl, double expected_util) { if (force_mock_impl) { - return std::make_unique<DummyThreadSampler>(expected_load); + return std::make_unique<DummyThreadSampler>(expected_util); } #ifdef __linux__ return std::make_unique<LinuxThreadSampler>(); #endif - return std::make_unique<DummyThreadSampler>(expected_load); + return std::make_unique<DummyThreadSampler>(expected_util); } } // cpu_usage diff --git a/vespalib/src/vespa/vespalib/util/cpu_usage.h b/vespalib/src/vespa/vespalib/util/cpu_usage.h index 3c30937151c..87e0a289e87 100644 --- a/vespalib/src/vespa/vespalib/util/cpu_usage.h +++ b/vespalib/src/vespa/vespalib/util/cpu_usage.h @@ -28,7 +28,7 @@ struct ThreadSampler { virtual ~ThreadSampler() {} }; -ThreadSampler::UP create_thread_sampler(bool force_mock_impl = false, double expected_load = 0.16); +ThreadSampler::UP create_thread_sampler(bool force_mock_impl = false, double expected_util = 0.16); } // cpu_usage @@ -61,21 +61,25 @@ public: static constexpr size_t index_of(Category cat) { return static_cast<size_t>(cat); } static constexpr size_t num_categories = 5; - // A sample contains how much CPU has been spent in various - // categories. - class Sample { + template <typename T> + class PerCategory { private: - std::array<duration,num_categories> _usage; + std::array<T,num_categories> _array; + public: + PerCategory() : _array() {} + size_t size() const { return _array.size(); } + T &operator[](size_t idx) { return _array[idx]; } + T &operator[](Category cat) { return _array[index_of(cat)]; } + const T &operator[](size_t idx) const { return _array[idx]; } + const T &operator[](Category cat) const { return _array[index_of(cat)]; } + }; + + // A sample contains how much CPU has been spent in each category. + class Sample : public PerCategory<duration> { public: - Sample() : _usage() {} - size_t size() const { return _usage.size(); } - duration &operator[](size_t idx) { return _usage[idx]; } - duration &operator[](Category cat) { return _usage[index_of(cat)]; } - const duration &operator[](size_t idx) const { return _usage[idx]; } - const duration &operator[](Category cat) const { return _usage[index_of(cat)]; } void merge(const Sample &rhs) { for (size_t i = 0; i < size(); ++i) { - _usage[i] += rhs._usage[i]; + (*this)[i] += rhs[i]; } } }; @@ -176,4 +180,33 @@ public: static TimedSample sample(); }; +/** + * Simple class used to track cpu utilization over time. + **/ +class CpuUtil +{ +private: + duration _min_delay; + CpuUsage::TimedSample _old_sample; + CpuUsage::PerCategory<double> _util; + +public: + CpuUtil(duration min_delay = 850ms) + : _min_delay(min_delay), + _old_sample(CpuUsage::sample()), + _util() {} + + CpuUsage::PerCategory<double> get_util() { + if (steady_clock::now() >= (_old_sample.first + _min_delay)) { + auto new_sample = CpuUsage::sample(); + auto dt = to_s(new_sample.first - _old_sample.first); + for (size_t i = 0; i < _util.size(); ++i) { + _util[i] = to_s(new_sample.second[i] - _old_sample.second[i]) / dt; + } + _old_sample = new_sample; + } + return _util; + } +}; + } // namespace |