summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Geir Storli <geirst@yahooinc.com> 2022-01-25 13:51:23 +0100
committer: GitHub <noreply@github.com> 2022-01-25 13:51:23 +0100
commit: b204e44404142250b79be20c492951de1287609d (patch)
tree: 68a2d1248a1c08125cc1b40945495b33764d1504
parent: 0dcd724d233f3833180030907e969007a81a9682 (diff)
parent: 949932e310963df295c27af0dc62b77d4625288a (diff)
Merge pull request #20915 from vespa-engine/havardpe/proton-cpu-util-metrics
added cpu util metrics
-rw-r--r-- searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp 15
-rw-r--r-- searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h 12
-rw-r--r-- searchcore/src/vespa/searchcore/proton/server/proton.cpp 8
-rw-r--r-- searchcore/src/vespa/searchcore/proton/server/proton.h 2
-rw-r--r-- vespalib/src/tests/cpu_usage/cpu_usage_test.cpp 62
-rw-r--r-- vespalib/src/vespa/vespalib/util/cpu_usage.cpp 12
-rw-r--r-- vespalib/src/vespa/vespalib/util/cpu_usage.h 57
7 files changed, 104 insertions, 64 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp
index 9bc20f95d13..9cf0339986c 100644
--- a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp
+++ b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp
@@ -4,6 +4,18 @@
namespace proton {
+ResourceUsageMetrics::CpuUtilMetrics::CpuUtilMetrics(metrics::MetricSet *parent) // builds the "cpu_util" metric set; `parent` is forwarded to the MetricSet base
+ : MetricSet("cpu_util", {}, "Unnormalized cpu utilization for various categories", parent),
+ setup("setup", {}, "cpu used by system init and (re-)configuration", this), // each member metric receives `this` as its last argument
+ read("read", {}, "cpu used by reading data from the system", this),
+ write("write", {}, "cpu used by writing data to the system", this),
+ compact("compact", {}, "cpu used by internal data re-structuring", this),
+ other("other", {}, "cpu used by work not classified as a specific category", this)
+{ // empty body: all setup happens in the initializer list
+}
+
+ResourceUsageMetrics::CpuUtilMetrics::~CpuUtilMetrics() = default; // defaulted here, out of line; the header only declares the destructor
+
ResourceUsageMetrics::ResourceUsageMetrics(metrics::MetricSet *parent)
: MetricSet("resource_usage", {}, "Usage metrics for various resources in this search engine", parent),
disk("disk", {}, "The relative amount of disk space used on this machine (value in the range [0, 1])", this),
@@ -15,7 +27,8 @@ ResourceUsageMetrics::ResourceUsageMetrics(metrics::MetricSet *parent)
memoryMappings("memory_mappings", {}, "The number of mapped memory areas", this),
openFileDescriptors("open_file_descriptors", {}, "The number of open files", this),
feedingBlocked("feeding_blocked", {}, "Whether feeding is blocked due to resource limits being reached (value is either 0 or 1)", this),
- mallocArena("malloc_arena", {}, "Size of malloc arena", this)
+ mallocArena("malloc_arena", {}, "Size of malloc arena", this),
+ cpu_util(this)
{
}
diff --git a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h
index 774bb645c84..35100084cf7 100644
--- a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h
+++ b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h
@@ -12,6 +12,17 @@ namespace proton {
*/
struct ResourceUsageMetrics : metrics::MetricSet
{
+ struct CpuUtilMetrics : metrics::MetricSet { // nested metric set: one double-valued metric per cpu usage category
+ metrics::DoubleValueMetric setup; // cpu used by system init and (re-)configuration
+ metrics::DoubleValueMetric read; // cpu used by reading data from the system
+ metrics::DoubleValueMetric write; // cpu used by writing data to the system
+ metrics::DoubleValueMetric compact; // cpu used by internal data re-structuring
+ metrics::DoubleValueMetric other; // cpu used by work not classified as a specific category
+
+ CpuUtilMetrics(metrics::MetricSet *parent); // parent: forwarded to the MetricSet base; presumably the set this one is attached to (confirm in MetricSet)
+ ~CpuUtilMetrics();
+ };
+
metrics::DoubleValueMetric disk;
metrics::DoubleValueMetric diskUtilization;
metrics::DoubleValueMetric memory;
@@ -22,6 +33,7 @@ struct ResourceUsageMetrics : metrics::MetricSet
metrics::LongValueMetric openFileDescriptors;
metrics::LongValueMetric feedingBlocked;
metrics::LongValueMetric mallocArena;
+ CpuUtilMetrics cpu_util;
ResourceUsageMetrics(metrics::MetricSet *parent);
~ResourceUsageMetrics();
diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.cpp b/searchcore/src/vespa/searchcore/proton/server/proton.cpp
index b128fe16e5e..b2a86102cd1 100644
--- a/searchcore/src/vespa/searchcore/proton/server/proton.cpp
+++ b/searchcore/src/vespa/searchcore/proton/server/proton.cpp
@@ -61,6 +61,7 @@ using vespalib::Slime;
using vespalib::makeLambdaTask;
using vespalib::slime::ArrayInserter;
using vespalib::slime::Cursor;
+using CpuCategory = vespalib::CpuUsage::Category;
using search::transactionlog::DomainStats;
using vespa::config::search::core::ProtonConfig;
@@ -207,6 +208,7 @@ Proton::Proton(const config::ConfigUri & configUri,
StatusProducer(),
IPersistenceEngineOwner(),
ComponentConfigProducer(),
+ _cpu_util(),
_configUri(configUri),
_mutex(),
_metricsHook(std::make_unique<MetricsUpdateHook>(*this)),
@@ -775,6 +777,12 @@ Proton::updateMetrics(const metrics::MetricLockGuard &)
#else
metrics.resourceUsage.mallocArena.set(UINT64_C(0));
#endif
+ auto cpu_util = _cpu_util.get_util();
+ metrics.resourceUsage.cpu_util.setup.set(cpu_util[CpuCategory::SETUP]);
+ metrics.resourceUsage.cpu_util.read.set(cpu_util[CpuCategory::READ]);
+ metrics.resourceUsage.cpu_util.write.set(cpu_util[CpuCategory::WRITE]);
+ metrics.resourceUsage.cpu_util.compact.set(cpu_util[CpuCategory::COMPACT]);
+ metrics.resourceUsage.cpu_util.other.set(cpu_util[CpuCategory::OTHER]);
}
{
ContentProtonMetrics::ProtonExecutorMetrics &metrics = _metricsEngine->root().executor;
diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.h b/searchcore/src/vespa/searchcore/proton/server/proton.h
index 573f215c722..90a257a0aaa 100644
--- a/searchcore/src/vespa/searchcore/proton/server/proton.h
+++ b/searchcore/src/vespa/searchcore/proton/server/proton.h
@@ -25,6 +25,7 @@
#include <vespa/vespalib/net/json_handler_repo.h>
#include <vespa/vespalib/net/state_explorer.h>
#include <vespa/vespalib/util/varholder.h>
+#include <vespa/vespalib/util/cpu_usage.h>
#include <mutex>
#include <shared_mutex>
@@ -81,6 +82,7 @@ private:
void setClusterName(const vespalib::string &clusterName, const vespalib::string &baseDir);
};
+ vespalib::CpuUtil _cpu_util;
const config::ConfigUri _configUri;
mutable std::shared_mutex _mutex;
std::unique_ptr<metrics::UpdateHook> _metricsHook;
diff --git a/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp b/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp
index fde02dc8435..83f49d6c73b 100644
--- a/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp
+++ b/vespalib/src/tests/cpu_usage/cpu_usage_test.cpp
@@ -68,22 +68,22 @@ void verify_sampling(size_t thread_id, size_t num_threads, std::vector<Sampler*>
auto post_total = cpu_usage::total_cpu_usage();
TEST_BARRIER(); // #4
double wall = to_s(t1 - t0);
- std::vector<double> load(4, 0.0);
+ std::vector<double> util(4, 0.0);
for (size_t i = 0; i < 4; ++i) {
- load[i] = to_s(post_usage[i] - pre_usage[i]) / wall;
+ util[i] = to_s(post_usage[i] - pre_usage[i]) / wall;
}
- double total_load = to_s(post_total - pre_total) / wall;
- EXPECT_GREATER(load[3], load[0]);
- // NB: cannot expect total_load to be greater than load[3]
- // here due to mock loads being 'as expected' while valgrind
- // will cut all loads in about half.
- EXPECT_GREATER(total_load, load[0]);
- fprintf(stderr, "loads: { %.3f, %.3f, %.3f, %.3f }\n", load[0], load[1], load[2], load[3]);
- fprintf(stderr, "total load: %.3f\n", total_load);
+ double total_util = to_s(post_total - pre_total) / wall;
+ EXPECT_GREATER(util[3], util[0]);
+ // NB: cannot expect total_util to be greater than util[3]
+ // here due to mock utils being 'as expected' while valgrind
+ // will cut all utils in about half.
+ EXPECT_GREATER(total_util, util[0]);
+ fprintf(stderr, "utils: { %.3f, %.3f, %.3f, %.3f }\n", util[0], util[1], util[2], util[3]);
+ fprintf(stderr, "total util: %.3f\n", total_util);
} else {
int idx = (thread_id - 1);
- double target_load = double(thread_id - 1) / (num_threads - 2);
- auto sampler = cpu_usage::create_thread_sampler(force_mock, target_load);
+ double target_util = double(thread_id - 1) / (num_threads - 2);
+ auto sampler = cpu_usage::create_thread_sampler(force_mock, target_util);
samplers[idx] = sampler.get();
TEST_BARRIER(); // #1
TEST_BARRIER(); // #2
@@ -97,7 +97,7 @@ void verify_sampling(size_t thread_id, size_t num_threads, std::vector<Sampler*>
//-----------------------------------------------------------------------------
-TEST_MT_F("require that dummy thread-based CPU usage sampling with known expected load works", 5, std::vector<Sampler*>(4, nullptr)) {
+TEST_MT_F("require that dummy thread-based CPU usage sampling with known expected util works", 5, std::vector<Sampler*>(4, nullptr)) {
TEST_DO(verify_sampling(thread_id, num_threads, f1, true));
}
@@ -396,46 +396,18 @@ TEST("require that thread tracker implementation does not track OTHER cpu use")
//-----------------------------------------------------------------------------
-// prototype for the class we want to use when integrating CPU usage
-// into metrics as load values. NB: this class is not thread safe.
-
-class CpuMonitor {
-private:
- duration _min_delay;
- CpuUsage::TimedSample _old_sample;
- std::array<double,CpuUsage::num_categories> _load;
-
-public:
- CpuMonitor(duration min_delay)
- : _min_delay(min_delay),
- _old_sample(CpuUsage::sample()),
- _load() {}
-
- std::array<double,CpuUsage::num_categories> get_load() {
- if (steady_clock::now() >= (_old_sample.first + _min_delay)) {
- auto new_sample = CpuUsage::sample();
- auto dt = to_s(new_sample.first - _old_sample.first);
- for (size_t i = 0; i < CpuUsage::num_categories; ++i) {
- _load[i] = to_s(new_sample.second[i] - _old_sample.second[i]) / dt;
- }
- _old_sample = new_sample;
- }
- return _load;
- }
-};
-
void do_sample_cpu_usage(const EndTime &end_time) {
auto my_usage = CpuUsage::use(CpuUsage::Category::SETUP);
- CpuMonitor monitor(8ms);
+ CpuUtil cpu(8ms);
while (!end_time()) {
std::this_thread::sleep_for(verbose ? 1s : 10ms);
- auto load = monitor.get_load();
+ auto util = cpu.get_util();
vespalib::string body;
- for (size_t i = 0; i < load.size(); ++i) {
+ for (size_t i = 0; i < util.size(); ++i) {
if (!body.empty()) {
body.append(", ");
}
- body.append(fmt("%s: %.3f", CpuUsage::name_of(CpuUsage::Category(i)).c_str(), load[i]));
+ body.append(fmt("%s: %.3f", CpuUsage::name_of(CpuUsage::Category(i)).c_str(), util[i]));
}
fprintf(stderr, "CPU: %s\n", body.c_str());
}
diff --git a/vespalib/src/vespa/vespalib/util/cpu_usage.cpp b/vespalib/src/vespa/vespalib/util/cpu_usage.cpp
index 5609b0d8d09..5307239118d 100644
--- a/vespalib/src/vespa/vespalib/util/cpu_usage.cpp
+++ b/vespalib/src/vespa/vespalib/util/cpu_usage.cpp
@@ -17,11 +17,11 @@ namespace {
class DummyThreadSampler : public ThreadSampler {
private:
steady_time _start;
- double _load;
+ double _util;
public:
- DummyThreadSampler(double load) : _start(steady_clock::now()), _load(load) {}
+ DummyThreadSampler(double util) : _start(steady_clock::now()), _util(util) {}
duration sample() const noexcept override {
- return from_s(to_s(steady_clock::now() - _start) * _load);
+ return from_s(to_s(steady_clock::now() - _start) * _util);
}
};
@@ -53,14 +53,14 @@ duration total_cpu_usage() noexcept {
return from_timespec(ts);
}
-ThreadSampler::UP create_thread_sampler(bool force_mock_impl, double expected_load) {
+ThreadSampler::UP create_thread_sampler(bool force_mock_impl, double expected_util) {
if (force_mock_impl) {
- return std::make_unique<DummyThreadSampler>(expected_load);
+ return std::make_unique<DummyThreadSampler>(expected_util);
}
#ifdef __linux__
return std::make_unique<LinuxThreadSampler>();
#endif
- return std::make_unique<DummyThreadSampler>(expected_load);
+ return std::make_unique<DummyThreadSampler>(expected_util);
}
} // cpu_usage
diff --git a/vespalib/src/vespa/vespalib/util/cpu_usage.h b/vespalib/src/vespa/vespalib/util/cpu_usage.h
index 3c30937151c..87e0a289e87 100644
--- a/vespalib/src/vespa/vespalib/util/cpu_usage.h
+++ b/vespalib/src/vespa/vespalib/util/cpu_usage.h
@@ -28,7 +28,7 @@ struct ThreadSampler {
virtual ~ThreadSampler() {}
};
-ThreadSampler::UP create_thread_sampler(bool force_mock_impl = false, double expected_load = 0.16);
+ThreadSampler::UP create_thread_sampler(bool force_mock_impl = false, double expected_util = 0.16);
} // cpu_usage
@@ -61,21 +61,25 @@ public:
static constexpr size_t index_of(Category cat) { return static_cast<size_t>(cat); }
static constexpr size_t num_categories = 5;
- // A sample contains how much CPU has been spent in various
- // categories.
- class Sample {
+ template <typename T>
+ class PerCategory {
private:
- std::array<duration,num_categories> _usage;
+ std::array<T,num_categories> _array;
+ public:
+ PerCategory() : _array() {}
+ size_t size() const { return _array.size(); }
+ T &operator[](size_t idx) { return _array[idx]; }
+ T &operator[](Category cat) { return _array[index_of(cat)]; }
+ const T &operator[](size_t idx) const { return _array[idx]; }
+ const T &operator[](Category cat) const { return _array[index_of(cat)]; }
+ };
+
+ // A sample contains how much CPU has been spent in each category.
+ class Sample : public PerCategory<duration> {
public:
- Sample() : _usage() {}
- size_t size() const { return _usage.size(); }
- duration &operator[](size_t idx) { return _usage[idx]; }
- duration &operator[](Category cat) { return _usage[index_of(cat)]; }
- const duration &operator[](size_t idx) const { return _usage[idx]; }
- const duration &operator[](Category cat) const { return _usage[index_of(cat)]; }
void merge(const Sample &rhs) {
for (size_t i = 0; i < size(); ++i) {
- _usage[i] += rhs._usage[i];
+ (*this)[i] += rhs[i];
}
}
};
@@ -176,4 +180,33 @@ public:
static TimedSample sample();
};
+/**
+ * Tracks cpu utilization per category: cpu usage between two samples divided by the time elapsed between them.
+ **/
+class CpuUtil
+{
+private:
+ duration _min_delay; // minimum time between two samples; get_util() returns the cached result until it has passed
+ CpuUsage::TimedSample _old_sample; // baseline for the next delta: .first is the sample time, .second the per-category usage
+ CpuUsage::PerCategory<double> _util; // most recently computed utilization, one value per category
+
+public:
+ CpuUtil(duration min_delay = 850ms) // takes the baseline sample at construction time
+ : _min_delay(min_delay),
+ _old_sample(CpuUsage::sample()),
+ _util() {} // PerCategory value-initializes its array, so every category starts at 0
+
+ CpuUsage::PerCategory<double> get_util() { // returns a copy of the per-category utilization; mutates members without any locking here
+ if (steady_clock::now() >= (_old_sample.first + _min_delay)) { // re-sample only once _min_delay has passed since the baseline
+ auto new_sample = CpuUsage::sample();
+ auto dt = to_s(new_sample.first - _old_sample.first); // time between the two samples (to_s presumably yields seconds as double; confirm)
+ for (size_t i = 0; i < _util.size(); ++i) {
+ _util[i] = to_s(new_sample.second[i] - _old_sample.second[i]) / dt; // NOTE(review): dt would be 0 if both samples carry the same timestamp; looks possible only with a zero min_delay, so confirm
+ }
+ _old_sample = new_sample; // the new sample becomes the baseline for the next call
+ }
+ return _util;
+ }
+};
+
} // namespace