summaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
author: Tor Brede Vekterli <vekterli@yahooinc.com> 2021-09-28 11:59:29 +0000
committer: Tor Brede Vekterli <vekterli@yahooinc.com> 2021-09-28 11:59:29 +0000
commit: d0448a2b6d56a5957a50c98c5803bad4acea77f3 (patch)
tree: 0f3b8c4408fa96f1f3e8ece1f956828d32e59bce /storage
parent: b0b6933959c4974ab02d706c6bf5fa9b2723ad9f (diff)
Expose aggregated low-level data movement statistics as metrics
Adds metrics for the following:
  * Bucket replicas that should be moved out, e.g. in the retirement case or when a node with higher ideal state priority has been added to the cluster.
  * Bucket replicas that should be copied out, e.g. the node is in the ideal state but might have to provide data to other nodes in a merge.
  * Bucket replicas that should be copied in, e.g. the node does not have a replica for a bucket that it is in the ideal state for.
  * Bucket replicas that need syncing due to mismatching metadata.

These are aggregates across all bucket replicas, buckets and bucket spaces. This should aid visibility into data movement during node retirements when there are concurrent replica out-of-sync events.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/distributor_stripe_test.cpp9
-rw-r--r--storage/src/vespa/storage/distributor/distributor_stripe.cpp10
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemanager.cpp6
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemetricsset.cpp16
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemetricsset.h7
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp4
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h33
7 files changed, 62 insertions, 23 deletions
diff --git a/storage/src/tests/distributor/distributor_stripe_test.cpp b/storage/src/tests/distributor/distributor_stripe_test.cpp
index a38b40d3682..067b0efdd5c 100644
--- a/storage/src/tests/distributor/distributor_stripe_test.cpp
+++ b/storage/src/tests/distributor/distributor_stripe_test.cpp
@@ -475,7 +475,7 @@ TEST_F(DistributorStripeTest, merge_stats_are_accumulated_during_database_iterat
// added to existing.
tickDistributorNTimes(50);
- const auto& stats = stripe_maintenance_stats();
+ const auto stats = stripe_maintenance_stats();
{
NodeMaintenanceStats wanted;
wanted.syncing = 1;
@@ -501,6 +501,11 @@ TEST_F(DistributorStripeTest, merge_stats_are_accumulated_during_database_iterat
assertBucketSpaceStats(1, 3, 0, "default", bucketStats);
assertBucketSpaceStats(0, 1, 1, "default", bucketStats);
assertBucketSpaceStats(3, 1, 2, "default", bucketStats);
+
+ EXPECT_EQ(stats.perNodeStats.total_replica_stats().movingOut, 1);
+ EXPECT_EQ(stats.perNodeStats.total_replica_stats().copyingOut, 2);
+ EXPECT_EQ(stats.perNodeStats.total_replica_stats().copyingIn, 2);
+ EXPECT_EQ(stats.perNodeStats.total_replica_stats().syncing, 2);
}
void
@@ -534,7 +539,7 @@ TEST_F(DistributorStripeTest, stats_generated_for_preempted_operations)
// by activation, we'll see no merge stats at all.
addNodesToBucketDB(document::BucketId(16, 1), "0=1/1/1,1=2/2/2");
tickDistributorNTimes(50);
- const auto& stats = stripe_maintenance_stats();
+ const auto stats = stripe_maintenance_stats();
{
NodeMaintenanceStats wanted;
wanted.syncing = 1;
diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.cpp b/storage/src/vespa/storage/distributor/distributor_stripe.cpp
index 21059649ef9..da32b7ad4c6 100644
--- a/storage/src/vespa/storage/distributor/distributor_stripe.cpp
+++ b/storage/src/vespa/storage/distributor/distributor_stripe.cpp
@@ -558,8 +558,14 @@ DistributorStripe::propagateInternalScanMetricsToExternal()
// All shared values are written when _metricLock is held, so no races.
if (_bucketDBMetricUpdater.hasCompletedRound()) {
- _bucketDbStats.propagateMetrics(_idealStateManager.getMetrics(), getMetrics());
- _idealStateManager.getMetrics().setPendingOperations(_maintenanceStats.global.pending);
+ auto& ideal_state_metrics = _idealStateManager.getMetrics();
+ _bucketDbStats.propagateMetrics(ideal_state_metrics, getMetrics());
+ ideal_state_metrics.setPendingOperations(_maintenanceStats.global.pending);
+ const auto& total_stats = _maintenanceStats.perNodeStats.total_replica_stats();
+ ideal_state_metrics.buckets_replicas_moving_out.set(total_stats.movingOut);
+ ideal_state_metrics.buckets_replicas_copying_out.set(total_stats.copyingOut);
+ ideal_state_metrics.buckets_replicas_copying_in.set(total_stats.copyingIn);
+ ideal_state_metrics.buckets_replicas_syncing.set(total_stats.syncing);
}
}
diff --git a/storage/src/vespa/storage/distributor/idealstatemanager.cpp b/storage/src/vespa/storage/distributor/idealstatemanager.cpp
index 124f75ec169..f1d2b163623 100644
--- a/storage/src/vespa/storage/distributor/idealstatemanager.cpp
+++ b/storage/src/vespa/storage/distributor/idealstatemanager.cpp
@@ -21,8 +21,7 @@ using document::BucketSpace;
using storage::lib::Node;
using storage::lib::NodeType;
-namespace storage {
-namespace distributor {
+namespace storage::distributor {
IdealStateManager::IdealStateManager(
const DistributorNodeContext& node_ctx,
@@ -298,5 +297,4 @@ void IdealStateManager::getBucketStatus(std::ostream& out) const {
}
}
-} // distributor
-} // storage
+} // storage::distributor
diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
index fd193ad6fd8..e786d81df91 100644
--- a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
+++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
@@ -86,7 +86,21 @@ IdealStateMetricSet::IdealStateMetricSet()
{{"logdefault"},{"yamasdefault"}},
"The number of buckets that we are rechecking for "
"ideal state operations", this),
- startOperationsLatency("start_operations_latency", {}, "Time used in startOperations()", this),
+ buckets_replicas_moving_out("bucket_replicas_moving_out",
+ {{"logdefault"},{"yamasdefault"}},
+ "Bucket replicas that should be moved out, e.g. retirement case or node "
+ "added to cluster that has higher ideal state priority.", this),
+ buckets_replicas_copying_in("bucket_replicas_copying_in",
+ {{"logdefault"},{"yamasdefault"}},
+ "Bucket replicas that should be copied in, e.g. node does not have a "
+ "replica for a bucket that it is in ideal state for", this),
+ buckets_replicas_copying_out("bucket_replicas_copying_out",
+ {{"logdefault"},{"yamasdefault"}},
+ "Bucket replicas that should be copied out, e.g. node is in ideal state "
+ "but might have to provide data other nodes in a merge", this),
+ buckets_replicas_syncing("bucket_replicas_syncing",
+ {{"logdefault"},{"yamasdefault"}},
+ "Bucket replicas that need syncing due to mismatching metadata", this),
nodesPerMerge("nodes_per_merge", {}, "The number of nodes involved in a single merge operation.", this)
{
createOperationMetrics();
diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.h b/storage/src/vespa/storage/distributor/idealstatemetricsset.h
index c1fb39bb50a..e9ccef9f93e 100644
--- a/storage/src/vespa/storage/distributor/idealstatemetricsset.h
+++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.h
@@ -38,8 +38,11 @@ public:
metrics::LongValueMetric buckets_toomanycopies;
metrics::LongValueMetric buckets;
metrics::LongValueMetric buckets_notrusted;
- metrics::LongValueMetric buckets_rechecking;
- metrics::LongAverageMetric startOperationsLatency;
+ metrics::LongValueMetric buckets_rechecking; // TODO remove, not used (but exposed by VespaMetricSet)
+ metrics::LongValueMetric buckets_replicas_moving_out;
+ metrics::LongValueMetric buckets_replicas_copying_in;
+ metrics::LongValueMetric buckets_replicas_copying_out;
+ metrics::LongValueMetric buckets_replicas_syncing;
metrics::DoubleAverageMetric nodesPerMerge;
void createOperationMetrics();
diff --git a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp
index 4e7f7d9d89d..db2eb6aadc9 100644
--- a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp
+++ b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp
@@ -34,9 +34,9 @@ merge_bucket_spaces_stats(NodeMaintenanceStatsTracker::BucketSpacesStats& dest,
void
NodeMaintenanceStatsTracker::merge(const NodeMaintenanceStatsTracker& rhs)
{
- for (const auto& entry : rhs._stats) {
+ for (const auto& entry : rhs._node_stats) {
auto node_index = entry.first;
- merge_bucket_spaces_stats(_stats[node_index], entry.second);
+ merge_bucket_spaces_stats(_node_stats[node_index], entry.second);
}
}
diff --git a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h
index 6399e53089b..3c45bcdd5e5 100644
--- a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h
+++ b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h
@@ -50,7 +50,8 @@ public:
using PerNodeStats = std::unordered_map<uint16_t, BucketSpacesStats>;
private:
- PerNodeStats _stats;
+ PerNodeStats _node_stats;
+ NodeMaintenanceStats _total_stats;
static const NodeMaintenanceStats _emptyNodeMaintenanceStats;
public:
@@ -58,23 +59,28 @@ public:
~NodeMaintenanceStatsTracker();
void incMovingOut(uint16_t node, document::BucketSpace bucketSpace) {
- ++_stats[node][bucketSpace].movingOut;
+ ++_node_stats[node][bucketSpace].movingOut;
+ ++_total_stats.movingOut;
}
void incSyncing(uint16_t node, document::BucketSpace bucketSpace) {
- ++_stats[node][bucketSpace].syncing;
+ ++_node_stats[node][bucketSpace].syncing;
+ ++_total_stats.syncing;
}
void incCopyingIn(uint16_t node, document::BucketSpace bucketSpace) {
- ++_stats[node][bucketSpace].copyingIn;
+ ++_node_stats[node][bucketSpace].copyingIn;
+ ++_total_stats.copyingIn;
}
void incCopyingOut(uint16_t node, document::BucketSpace bucketSpace) {
- ++_stats[node][bucketSpace].copyingOut;
+ ++_node_stats[node][bucketSpace].copyingOut;
+ ++_total_stats.copyingOut;
}
void incTotal(uint16_t node, document::BucketSpace bucketSpace) {
- ++_stats[node][bucketSpace].total;
+ ++_node_stats[node][bucketSpace].total;
+ ++_total_stats.total;
}
/**
@@ -82,8 +88,8 @@ public:
* if none have been recorded yet
*/
const NodeMaintenanceStats& forNode(uint16_t node, document::BucketSpace bucketSpace) const {
- auto nodeItr = _stats.find(node);
- if (nodeItr != _stats.end()) {
+ auto nodeItr = _node_stats.find(node);
+ if (nodeItr != _node_stats.end()) {
auto bucketSpaceItr = nodeItr->second.find(bucketSpace);
if (bucketSpaceItr != nodeItr->second.end()) {
return bucketSpaceItr->second;
@@ -93,11 +99,18 @@ public:
}
const PerNodeStats& perNodeStats() const {
- return _stats;
+ return _node_stats;
+ }
+
+ // Note: the total statistics are across all replicas across all buckets across all bucket spaces.
+ // That means it's possible for a single bucket to count more than once, up to once per replica.
+ // So this should not be treated as a bucket-level statistic.
+ const NodeMaintenanceStats& total_replica_stats() const noexcept {
+ return _total_stats;
}
bool operator==(const NodeMaintenanceStatsTracker& rhs) const {
- return _stats == rhs._stats;
+ return _node_stats == rhs._node_stats;
}
void merge(const NodeMaintenanceStatsTracker& rhs);
};