diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2022-01-27 13:37:49 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-27 13:37:49 +0100 |
commit | 76ce14e1560213d61d9dc86991f2f3b1c68b79ec (patch) | |
tree | d03b71336b76605365efaf207e9b06bcd98ec127 /searchcore | |
parent | 4472baa8620ca485ec1bc65eaf7577b60d3a3987 (diff) | |
parent | dc6cb592ca330a00e9d70778acd76f79e6193fef (diff) |
Merge pull request #20953 from vespa-engine/geirst/align-resource-usage-metrics
Align resource usage metrics with what is tracked and reported to the…
Diffstat (limited to 'searchcore')
9 files changed, 131 insertions, 59 deletions
diff --git a/searchcore/src/tests/proton/server/disk_mem_usage_filter/disk_mem_usage_filter_test.cpp b/searchcore/src/tests/proton/server/disk_mem_usage_filter/disk_mem_usage_filter_test.cpp index ce85517ee09..db32c1e77c4 100644 --- a/searchcore/src/tests/proton/server/disk_mem_usage_filter/disk_mem_usage_filter_test.cpp +++ b/searchcore/src/tests/proton/server/disk_mem_usage_filter/disk_mem_usage_filter_test.cpp @@ -121,18 +121,22 @@ TEST_F(DiskMemUsageFilterTest, both_disk_limit_and_memory_limit_can_be_reached) "capacity: 100, used: 90, diskUsed: 0.9, diskLimit: 0.8}}"); } -TEST_F(DiskMemUsageFilterTest, transient_disk_usage_is_tracked_in_usage_state_and_metrics) +TEST_F(DiskMemUsageFilterTest, transient_and_non_transient_disk_usage_tracked_in_usage_state_and_metrics) { - _filter.set_transient_resource_usage({40, 0}); - EXPECT_EQ(0.4, _filter.usageState().transient_disk_usage()); - EXPECT_EQ(0.4, _filter.get_metrics().get_transient_disk_usage()); + _filter.set_transient_resource_usage({15, 0}); + EXPECT_DOUBLE_EQ(0.15, _filter.usageState().transient_disk_usage()); + EXPECT_DOUBLE_EQ(0.15, _filter.get_metrics().transient_disk_usage()); + EXPECT_DOUBLE_EQ(0.05, _filter.usageState().non_transient_disk_usage()); + EXPECT_DOUBLE_EQ(0.05, _filter.get_metrics().non_transient_disk_usage()); } -TEST_F(DiskMemUsageFilterTest, transient_memory_usage_is_tracked_in_usage_state_and_metrics) +TEST_F(DiskMemUsageFilterTest, transient_and_non_transient_memory_usage_tracked_in_usage_state_and_metrics) { - _filter.set_transient_resource_usage({0, 200}); - EXPECT_EQ(0.2, _filter.usageState().transient_memory_usage()); - EXPECT_EQ(0.2, _filter.get_metrics().get_transient_memory_usage()); + _filter.set_transient_resource_usage({0, 100}); + EXPECT_DOUBLE_EQ(0.1, _filter.usageState().transient_memory_usage()); + EXPECT_DOUBLE_EQ(0.1, _filter.get_metrics().transient_memory_usage()); + EXPECT_DOUBLE_EQ(0.2, _filter.usageState().non_transient_memory_usage()); + EXPECT_DOUBLE_EQ(0.2, 
_filter.get_metrics().non_transient_memory_usage()); } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchcore/src/tests/proton/server/disk_mem_usage_metrics/disk_mem_usage_metrics_test.cpp b/searchcore/src/tests/proton/server/disk_mem_usage_metrics/disk_mem_usage_metrics_test.cpp index 60240e06fc4..8ba02875dc0 100644 --- a/searchcore/src/tests/proton/server/disk_mem_usage_metrics/disk_mem_usage_metrics_test.cpp +++ b/searchcore/src/tests/proton/server/disk_mem_usage_metrics/disk_mem_usage_metrics_test.cpp @@ -10,30 +10,46 @@ using proton::DiskMemUsageState; using proton::ResourceUsageState; bool -expect_metrics(double disk_usage, double disk_utilization, double memory_usage, double memory_utilization, const DiskMemUsageMetrics &dm_metrics) +expect_metrics(double disk_usage, double disk_utilization, double transient_disk, double non_transient_disk, + double memory_usage, double memory_utilization, double transient_memory, double non_transient_memory, + const DiskMemUsageMetrics &dm_metrics) { bool result = true; - EXPECT_DOUBLE_EQ(disk_usage, dm_metrics.get_disk_usage()) << (result = false, ""); - EXPECT_DOUBLE_EQ(disk_utilization, dm_metrics.get_disk_utilization()) << (result = false, ""); - EXPECT_DOUBLE_EQ(memory_usage, dm_metrics.get_memory_usage()) << (result = false, ""); - EXPECT_DOUBLE_EQ(memory_utilization, dm_metrics.get_memory_utilization()) << (result = false, ""); + EXPECT_DOUBLE_EQ(disk_usage, dm_metrics.total_disk_usage()) << (result = false, ""); + EXPECT_DOUBLE_EQ(disk_utilization, dm_metrics.total_disk_utilization()) << (result = false, ""); + EXPECT_DOUBLE_EQ(transient_disk, dm_metrics.transient_disk_usage()) << (result = false, ""); + EXPECT_DOUBLE_EQ(non_transient_disk, dm_metrics.non_transient_disk_usage()) << (result = false, ""); + EXPECT_DOUBLE_EQ(memory_usage, dm_metrics.total_memory_usage()) << (result = false, ""); + EXPECT_DOUBLE_EQ(memory_utilization, dm_metrics.total_memory_utilization()) << (result = false, ""); + 
EXPECT_DOUBLE_EQ(transient_memory, dm_metrics.transient_memory_usage()) << (result = false, ""); + EXPECT_DOUBLE_EQ(non_transient_memory, dm_metrics.non_transient_memory_usage()) << (result = false, ""); return result; } TEST(DiskMemUsageMetricsTest, default_value_is_zero) { DiskMemUsageMetrics dm_metrics; - EXPECT_TRUE(expect_metrics(0.0, 0.0, 0.0, 0.0, dm_metrics)); + EXPECT_TRUE(expect_metrics(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, dm_metrics)); } TEST(DiskMemUsageMetricsTest, merging_uses_max) { - DiskMemUsageMetrics dm_metrics({ResourceUsageState(0.5, 0.4), ResourceUsageState(0.5, 0.3)}); - EXPECT_TRUE(expect_metrics(0.4, 0.8, 0.3, 0.6, dm_metrics)); - dm_metrics.merge({ResourceUsageState(0.4, 0.4), ResourceUsageState(0.5, 0.4)}); - EXPECT_TRUE(expect_metrics(0.4, 1.0, 0.4, 0.8, dm_metrics)); - dm_metrics.merge({ResourceUsageState(0.5, 0.4), ResourceUsageState(0.5, 0.3)}); - EXPECT_TRUE(expect_metrics(0.4, 1.0, 0.4, 0.8, dm_metrics)); + DiskMemUsageMetrics dm_metrics({ResourceUsageState(0.5, 0.4), + ResourceUsageState(0.5, 0.3), 0.1, 0.05}); + EXPECT_TRUE(expect_metrics(0.4, 0.8, 0.1, 0.3, + 0.3, 0.6, 0.05, 0.25, dm_metrics)); + dm_metrics.merge({ResourceUsageState(0.4, 0.4), + ResourceUsageState(0.3, 0.3), 0.1, 0.05}); + EXPECT_TRUE(expect_metrics(0.4, 1.0, 0.1, 0.3, + 0.3, 1.0, 0.05, 0.25, dm_metrics)); + dm_metrics.merge({ResourceUsageState(0.5, 0.45), + ResourceUsageState(0.5, 0.35), 0.1, 0.05}); + EXPECT_TRUE(expect_metrics(0.45, 1.0, 0.1, 0.35, + 0.35, 1.0, 0.05, 0.3, dm_metrics)); + dm_metrics.merge({ResourceUsageState(0.5, 0.4), + ResourceUsageState(0.5, 0.3), 0.15, 0.1}); + EXPECT_TRUE(expect_metrics(0.45, 1.0, 0.15, 0.35, + 0.35, 1.0, 0.10, 0.3, dm_metrics)); } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp index 9cf0339986c..d026ef549d4 100644 --- 
a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp +++ b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.cpp @@ -1,6 +1,9 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "resource_usage_metrics.h" +#include <vespa/vespalib/util/stringfmt.h> + +using vespalib::make_string; namespace proton { @@ -16,14 +19,30 @@ ResourceUsageMetrics::CpuUtilMetrics::CpuUtilMetrics(metrics::MetricSet *parent) ResourceUsageMetrics::CpuUtilMetrics::~CpuUtilMetrics() = default; +ResourceUsageMetrics::DetailedResourceMetrics::DetailedResourceMetrics(const vespalib::string& resource_type, metrics::MetricSet* parent) + : MetricSet(make_string("%s_usage", resource_type.c_str()), {}, make_string("Detailed resource usage metrics for %s", + resource_type.c_str()), parent), + total("total", {}, make_string("The total relative amount of %s used by this content node (value in the range [0, 1])", + resource_type.c_str()), this), + total_util("total_utilization", {}, make_string("The relative amount of %s used compared to the content node %s resource limit", + resource_type.c_str(), resource_type.c_str()), this), + transient("transient", {}, make_string("The relative amount of transient %s used by this content node (value in the range [0, 1])", + resource_type.c_str()), this) +{ +} + +ResourceUsageMetrics::DetailedResourceMetrics::~DetailedResourceMetrics() = default; + ResourceUsageMetrics::ResourceUsageMetrics(metrics::MetricSet *parent) - : MetricSet("resource_usage", {}, "Usage metrics for various resources in this search engine", parent), - disk("disk", {}, "The relative amount of disk space used on this machine (value in the range [0, 1])", this), + : MetricSet("resource_usage", {}, "Usage metrics for various resources in this content node", parent), + disk("disk", {}, "The relative amount of disk used by this content node (transient usage not included, value in the range [0, 
1]). Same value as reported to the cluster controller", this), diskUtilization("disk_utilization", {}, "The relative amount of disk used compared to the disk resource limit", this), - memory("memory", {}, "The relative amount of memory used by this process (value in the range [0, 1])", this), + memory("memory", {}, "The relative amount of memory used by this content node (transient usage not included, value in the range [0, 1]). Same value as reported to the cluster controller", this), memoryUtilization("memory_utilization", {}, "The relative amount of memory used compared to the memory resource limit", this), transient_memory("transient_memory", {}, "The relative amount of transient memory needed for loading attributes. Max value among all attributes (value in the range [0, 1])", this), transient_disk("transient_disk", {}, "The relative amount of transient disk needed for running disk index fusion. Max value among all disk indexes (value in the range [0, 1])", this), + disk_usage("disk", this), + memory_usage("memory", this), memoryMappings("memory_mappings", {}, "The number of mapped memory areas", this), openFileDescriptors("open_file_descriptors", {}, "The number of open files", this), feedingBlocked("feeding_blocked", {}, "Whether feeding is blocked due to resource limits being reached (value is either 0 or 1)", this), diff --git a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h index 35100084cf7..97cad935dba 100644 --- a/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h +++ b/searchcore/src/vespa/searchcore/proton/metrics/resource_usage_metrics.h @@ -23,12 +23,26 @@ struct ResourceUsageMetrics : metrics::MetricSet ~CpuUtilMetrics(); }; + struct DetailedResourceMetrics : metrics::MetricSet { + metrics::DoubleValueMetric total; + metrics::DoubleValueMetric total_util; + metrics::DoubleValueMetric transient; + + DetailedResourceMetrics(const 
vespalib::string& resource_type, metrics::MetricSet* parent); + ~DetailedResourceMetrics(); + }; + + // TODO Vespa 8: Remove diskUtilization, memoryUtilization, transient_memory, transient_disk. + // These are now included in disk_usage and memory_usage. + metrics::DoubleValueMetric disk; metrics::DoubleValueMetric diskUtilization; metrics::DoubleValueMetric memory; metrics::DoubleValueMetric memoryUtilization; metrics::DoubleValueMetric transient_memory; metrics::DoubleValueMetric transient_disk; + DetailedResourceMetrics disk_usage; + DetailedResourceMetrics memory_usage; metrics::LongValueMetric memoryMappings; metrics::LongValueMetric openFileDescriptors; metrics::LongValueMetric feedingBlocked; diff --git a/searchcore/src/vespa/searchcore/proton/persistenceengine/resource_usage_tracker.cpp b/searchcore/src/vespa/searchcore/proton/persistenceengine/resource_usage_tracker.cpp index 63e8b1d5196..9c8e6591730 100644 --- a/searchcore/src/vespa/searchcore/proton/persistenceengine/resource_usage_tracker.cpp +++ b/searchcore/src/vespa/searchcore/proton/persistenceengine/resource_usage_tracker.cpp @@ -94,13 +94,13 @@ void ResourceUsageTracker::notifyDiskMemUsage(DiskMemUsageState state) { std::lock_guard guard(_lock); - // The transient resource usage is subtracted from the absolute resource usage + // The transient resource usage is subtracted from the total resource usage // before it eventually is reported to the cluster controller (to decide whether to block client feed). // This ensures that the transient resource usage is covered by the resource headroom on the content node, // instead of leading to feed blocked due to natural fluctuations. 
- double adj_disk_usage = std::max(0.0, state.diskState().usage() - state.transient_disk_usage()); - double adj_memory_usage = std::max(0.0, state.memoryState().usage() - state.transient_memory_usage()); - _resource_usage = ResourceUsage(adj_disk_usage, adj_memory_usage, _resource_usage.get_attribute_address_space_usage()); + _resource_usage = ResourceUsage(state.non_transient_disk_usage(), + state.non_transient_memory_usage(), + _resource_usage.get_attribute_address_space_usage()); if (_listener != nullptr) { _listener->update_resource_usage(_resource_usage); } diff --git a/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_metrics.cpp b/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_metrics.cpp index 230593c2c1d..d740d5b129d 100644 --- a/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_metrics.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_metrics.cpp @@ -11,25 +11,29 @@ DiskMemUsageMetrics::DiskMemUsageMetrics() noexcept { } -DiskMemUsageMetrics::DiskMemUsageMetrics(const DiskMemUsageState &usage_state) noexcept - : _disk_usage(usage_state.diskState().usage()), - _disk_utilization(usage_state.diskState().utilization()), - _transient_disk_usage(usage_state.transient_disk_usage()), - _memory_usage(usage_state.memoryState().usage()), - _memory_utilization(usage_state.memoryState().utilization()), - _transient_memory_usage(usage_state.transient_memory_usage()) +DiskMemUsageMetrics::DiskMemUsageMetrics(const DiskMemUsageState& usage) noexcept + : _total_disk_usage(usage.diskState().usage()), + _total_disk_utilization(usage.diskState().utilization()), + _transient_disk_usage(usage.transient_disk_usage()), + _non_transient_disk_usage(usage.non_transient_disk_usage()), + _total_memory_usage(usage.memoryState().usage()), + _total_memory_utilization(usage.memoryState().utilization()), + _transient_memory_usage(usage.transient_memory_usage()), + _non_transient_memory_usage(usage.non_transient_memory_usage()) { } 
void -DiskMemUsageMetrics::merge(const DiskMemUsageState &usage_state) noexcept +DiskMemUsageMetrics::merge(const DiskMemUsageState& usage) noexcept { - _disk_usage = std::max(_disk_usage, usage_state.diskState().usage()); - _disk_utilization = std::max(_disk_utilization, usage_state.diskState().utilization()); - _transient_disk_usage = std::max(_transient_disk_usage, usage_state.transient_disk_usage()); - _memory_usage = std::max(_memory_usage, usage_state.memoryState().usage()); - _memory_utilization = std::max(_memory_utilization, usage_state.memoryState().utilization()); - _transient_memory_usage = std::max(_transient_memory_usage, usage_state.transient_memory_usage()); + _total_disk_usage = std::max(_total_disk_usage, usage.diskState().usage()); + _total_disk_utilization = std::max(_total_disk_utilization, usage.diskState().utilization()); + _transient_disk_usage = std::max(_transient_disk_usage, usage.transient_disk_usage()); + _non_transient_disk_usage = std::max(_non_transient_disk_usage, usage.non_transient_disk_usage()); + _total_memory_usage = std::max(_total_memory_usage, usage.memoryState().usage()); + _total_memory_utilization = std::max(_total_memory_utilization, usage.memoryState().utilization()); + _transient_memory_usage = std::max(_transient_memory_usage, usage.transient_memory_usage()); + _non_transient_memory_usage = std::max(_non_transient_memory_usage, usage.non_transient_memory_usage()); } } diff --git a/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_metrics.h b/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_metrics.h index cb97eb4c891..3e3d6fdc752 100644 --- a/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_metrics.h +++ b/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_metrics.h @@ -12,23 +12,27 @@ class DiskMemUsageState; */ class DiskMemUsageMetrics { - double _disk_usage; - double _disk_utilization; + double _total_disk_usage; + double _total_disk_utilization; double 
_transient_disk_usage; - double _memory_usage; - double _memory_utilization; + double _non_transient_disk_usage; + double _total_memory_usage; + double _total_memory_utilization; double _transient_memory_usage; + double _non_transient_memory_usage; public: DiskMemUsageMetrics() noexcept; - DiskMemUsageMetrics(const DiskMemUsageState &usage_state) noexcept; - void merge(const DiskMemUsageState &usage_state) noexcept; - double get_disk_usage() const noexcept { return _disk_usage; } - double get_disk_utilization() const noexcept { return _disk_utilization; } - double get_transient_disk_usage() const noexcept { return _transient_disk_usage; } - double get_memory_usage() const noexcept { return _memory_usage; } - double get_memory_utilization() const noexcept { return _memory_utilization; } - double get_transient_memory_usage() const noexcept { return _transient_memory_usage; } + DiskMemUsageMetrics(const DiskMemUsageState& usage) noexcept; + void merge(const DiskMemUsageState& usage) noexcept; + double total_disk_usage() const noexcept { return _total_disk_usage; } + double total_disk_utilization() const noexcept { return _total_disk_utilization; } + double transient_disk_usage() const noexcept { return _transient_disk_usage; } + double non_transient_disk_usage() const noexcept { return _non_transient_disk_usage; } + double total_memory_usage() const noexcept { return _total_memory_usage; } + double total_memory_utilization() const noexcept { return _total_memory_utilization; } + double transient_memory_usage() const noexcept { return _transient_memory_usage; } + double non_transient_memory_usage() const noexcept { return _non_transient_memory_usage; } }; } // namespace proton diff --git a/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_state.h b/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_state.h index b205b441bcf..2730388de9a 100644 --- a/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_state.h +++ 
b/searchcore/src/vespa/searchcore/proton/server/disk_mem_usage_state.h @@ -3,6 +3,7 @@ #pragma once #include "resource_usage_state.h" +#include <algorithm> namespace proton { @@ -42,6 +43,8 @@ public: const ResourceUsageState &memoryState() const { return _memoryState; } double transient_disk_usage() const { return _transient_disk_usage; } double transient_memory_usage() const { return _transient_memory_usage; } + double non_transient_disk_usage() const { return std::max(0.0, _diskState.usage() - _transient_disk_usage); } + double non_transient_memory_usage() const { return std::max(0.0, _memoryState.usage() - _transient_memory_usage); } bool aboveDiskLimit(double resourceLimitFactor) const { return diskState().aboveLimit(resourceLimitFactor); } bool aboveMemoryLimit(double resourceLimitFactor) const { return memoryState().aboveLimit(resourceLimitFactor); } }; diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.cpp b/searchcore/src/vespa/searchcore/proton/server/proton.cpp index b2a86102cd1..f2bd68e53d7 100644 --- a/searchcore/src/vespa/searchcore/proton/server/proton.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/proton.cpp @@ -755,12 +755,20 @@ Proton::updateMetrics(const metrics::MetricLockGuard &) const DiskMemUsageFilter &usageFilter = _diskMemUsageSampler->writeFilter(); auto dm_metrics = usageFilter.get_metrics(); - metrics.resourceUsage.disk.set(dm_metrics.get_disk_usage()); - metrics.resourceUsage.diskUtilization.set(dm_metrics.get_disk_utilization()); - metrics.resourceUsage.memory.set(dm_metrics.get_memory_usage()); - metrics.resourceUsage.memoryUtilization.set(dm_metrics.get_memory_utilization()); - metrics.resourceUsage.transient_memory.set(dm_metrics.get_transient_memory_usage()); - metrics.resourceUsage.transient_disk.set(dm_metrics.get_transient_disk_usage()); + metrics.resourceUsage.disk.set(dm_metrics.non_transient_disk_usage()); + metrics.resourceUsage.diskUtilization.set(dm_metrics.total_disk_utilization()); + 
metrics.resourceUsage.transient_disk.set(dm_metrics.transient_disk_usage()); + metrics.resourceUsage.disk_usage.total.set(dm_metrics.total_disk_usage()); + metrics.resourceUsage.disk_usage.total_util.set(dm_metrics.total_disk_utilization()); + metrics.resourceUsage.disk_usage.transient.set(dm_metrics.transient_disk_usage()); + + metrics.resourceUsage.memory.set(dm_metrics.non_transient_memory_usage()); + metrics.resourceUsage.memoryUtilization.set(dm_metrics.total_memory_utilization()); + metrics.resourceUsage.transient_memory.set(dm_metrics.transient_memory_usage()); + metrics.resourceUsage.memory_usage.total.set(dm_metrics.total_memory_usage()); + metrics.resourceUsage.memory_usage.total_util.set(dm_metrics.total_memory_utilization()); + metrics.resourceUsage.memory_usage.transient.set(dm_metrics.transient_memory_usage()); + metrics.resourceUsage.memoryMappings.set(usageFilter.getMemoryStats().getMappingsCount()); metrics.resourceUsage.openFileDescriptors.set(FastOS_File::count_open_files()); metrics.resourceUsage.feedingBlocked.set((usageFilter.acceptWriteOperation() ? 0.0 : 1.0)); |