diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2021-10-14 10:16:50 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahooinc.com> | 2021-10-14 10:16:50 +0000 |
commit | cd39f17c3f09ef72825aa545c8ba3d0e604efaf0 (patch) | |
tree | f1147fead50425cb3f4c2c8562e477b475f123f0 /storage | |
parent | e796dcf2409e6c9bfef432eded6cd90d69ce0c27 (diff) |
Add metric for max time since bucket GC was last run
Max time is aggregated across all buckets. If this metric value grows
substantially larger than the configured GC period it indicates that
GC is being starved.
Diffstat (limited to 'storage')
9 files changed, 89 insertions, 16 deletions
diff --git a/storage/src/tests/distributor/simplemaintenancescannertest.cpp b/storage/src/tests/distributor/simplemaintenancescannertest.cpp index 46732624b46..723b0baa6cd 100644 --- a/storage/src/tests/distributor/simplemaintenancescannertest.cpp +++ b/storage/src/tests/distributor/simplemaintenancescannertest.cpp @@ -16,6 +16,7 @@ using document::BucketId; using document::test::makeBucketSpace; using Priority = MaintenancePriority; using namespace ::testing; +using namespace std::chrono_literals; struct SimpleMaintenanceScannerTest : Test { using PendingStats = SimpleMaintenanceScanner::PendingMaintenanceStats; @@ -255,6 +256,7 @@ TEST_F(SimpleMaintenanceScannerTest, merge_pending_maintenance_stats) { stats_a.perNodeStats.incCopyingIn(5, default_space); stats_a.perNodeStats.incCopyingOut(5, global_space); stats_a.perNodeStats.incTotal(5, default_space); + stats_a.perNodeStats.update_observed_time_since_last_gc(100s); PendingStats stats_b; stats_b.global.pending[MaintenanceOperation::DELETE_BUCKET] = 10; @@ -268,6 +270,7 @@ TEST_F(SimpleMaintenanceScannerTest, merge_pending_maintenance_stats) { stats_b.perNodeStats.incCopyingIn(5, default_space); stats_b.perNodeStats.incCopyingOut(5, global_space); stats_b.perNodeStats.incTotal(5, default_space); + stats_b.perNodeStats.update_observed_time_since_last_gc(300s); PendingStats result; result.merge(stats_a); @@ -290,6 +293,7 @@ TEST_F(SimpleMaintenanceScannerTest, merge_pending_maintenance_stats) { exp.perNodeStats.incTotal(5, default_space); exp.perNodeStats.incMovingOut(7, default_space); exp.perNodeStats.incSyncing(7, global_space); + exp.perNodeStats.update_observed_time_since_last_gc(300s); EXPECT_EQ(exp.global, result.global); EXPECT_EQ(exp.perNodeStats, result.perNodeStats); } diff --git a/storage/src/tests/distributor/statecheckerstest.cpp b/storage/src/tests/distributor/statecheckerstest.cpp index c7d2638aa81..25009156f18 100644 --- a/storage/src/tests/distributor/statecheckerstest.cpp +++ b/storage/src/tests/distributor/statecheckerstest.cpp @@ -20,6 +20,7 @@ using document::test::makeBucketSpace; using document::test::makeDocumentBucket; using namespace ::testing; +using namespace std::chrono_literals; namespace storage::distributor { @@ -1564,6 +1565,18 @@ public: return *this; } + StateCheckerRunner& time_now(uint32_t secs_since_epoch) { + _fixture.getClock().setAbsoluteTimeInSeconds(secs_since_epoch); + return *this; + } + + StateCheckerRunner& last_gc_at_time(const document::BucketId& bucket, uint32_t secs_since_epoch) { + auto entry = _fixture.getBucket(bucket); + entry->setLastGarbageCollectionTime(secs_since_epoch); + _fixture.getBucketDatabase(makeBucketSpace()).update(entry); + return *this; + } + // Run the templated state checker with the provided parameters, updating // _result with the ideal state operations triggered. // NOTE: resets the bucket database! @@ -1650,4 +1663,22 @@ TEST_F(StateCheckersTest, stats_updated_when_merging_due_to_out_of_sync_copies) } } +TEST_F(StateCheckersTest, stats_updates_for_maximum_time_since_gc_run) { + setup_stripe(1, 1, "distributor:1 storage:1"); + + auto cfg = make_config(); + cfg->setGarbageCollection("music", 1000s); + cfg->setLastGarbageCollectionChangeTime(vespalib::steady_time(vespalib::duration::zero())); + configure_stripe(cfg); + + StateCheckerRunner<GarbageCollectionStateChecker> runner(*this); + runner.addToDb({17, 0}, "1=1,3=2") + .clusterState("distributor:1 storage:4") + .time_now(2000) + .last_gc_at_time({17, 0}, 100) + .runFor({17, 0}); + + EXPECT_EQ(runner.stats().max_observed_time_since_last_gc().count(), 1900); +} + } diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.cpp b/storage/src/vespa/storage/distributor/distributor_stripe.cpp index d87b2db1638..60f387e9d16 100644 --- a/storage/src/vespa/storage/distributor/distributor_stripe.cpp +++ b/storage/src/vespa/storage/distributor/distributor_stripe.cpp @@ -566,6 +566,8 @@ DistributorStripe::propagateInternalScanMetricsToExternal() ideal_state_metrics.buckets_replicas_copying_out.set(total_stats.copyingOut); ideal_state_metrics.buckets_replicas_copying_in.set(total_stats.copyingIn); ideal_state_metrics.buckets_replicas_syncing.set(total_stats.syncing); + ideal_state_metrics.max_observed_time_since_last_gc_sec.set( + _maintenanceStats.perNodeStats.max_observed_time_since_last_gc().count()); } } diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp index 89dc1ada39d..5e57213bc73 100644 --- a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp +++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp @@ -107,6 +107,10 @@ IdealStateMetricSet::IdealStateMetricSet() buckets_replicas_syncing("bucket_replicas_syncing", {{"logdefault"},{"yamasdefault"}}, "Bucket replicas that need syncing due to mismatching metadata", this), + max_observed_time_since_last_gc_sec("max_observed_time_since_last_gc_sec", + {{"logdefault"},{"yamasdefault"}}, + "Maximum time (in seconds) since GC was last successfully run for a bucket. " + "Aggregated max value across all buckets on the distributor.", this), nodesPerMerge("nodes_per_merge", {}, "The number of nodes involved in a single merge operation.", this) { createOperationMetrics(); diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.h b/storage/src/vespa/storage/distributor/idealstatemetricsset.h index ada1bd50e5c..a9805a41556 100644 --- a/storage/src/vespa/storage/distributor/idealstatemetricsset.h +++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.h @@ -45,6 +45,7 @@ public: metrics::LongValueMetric buckets_replicas_copying_in; metrics::LongValueMetric buckets_replicas_copying_out; metrics::LongValueMetric buckets_replicas_syncing; + metrics::LongValueMetric max_observed_time_since_last_gc_sec; metrics::DoubleAverageMetric nodesPerMerge; void createOperationMetrics(); diff --git a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp index 5a6dedf8db4..47b8aec4aa6 100644 --- a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp +++ b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp @@ -10,11 +10,11 @@ const NodeMaintenanceStats NodeMaintenanceStatsTracker::_emptyNodeMaintenanceSta void NodeMaintenanceStats::merge(const NodeMaintenanceStats& rhs) { - movingOut += rhs.movingOut; - syncing += rhs.syncing; - copyingIn += rhs.copyingIn; + movingOut += rhs.movingOut; + syncing += rhs.syncing; + copyingIn += rhs.copyingIn; copyingOut += rhs.copyingOut; - total += rhs.total; + total += rhs.total; } namespace { @@ -38,6 +38,8 @@ NodeMaintenanceStatsTracker::merge(const NodeMaintenanceStatsTracker& rhs) auto node_index = entry.first; merge_bucket_spaces_stats(_node_stats[node_index], entry.second); } + _max_observed_time_since_last_gc = std::max(_max_observed_time_since_last_gc, + rhs._max_observed_time_since_last_gc); } std::ostream& @@ -53,7 +55,12 @@ operator<<(std::ostream& os, const NodeMaintenanceStats& stats) return os; } -NodeMaintenanceStatsTracker::NodeMaintenanceStatsTracker() = default; +NodeMaintenanceStatsTracker::NodeMaintenanceStatsTracker() + : _node_stats(), + _total_stats(), + _max_observed_time_since_last_gc(0) +{} + NodeMaintenanceStatsTracker::~NodeMaintenanceStatsTracker() = default; } diff --git a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h index eb00c8076ea..0e6c2bb23c2 100644 --- a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h +++ b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once +#include <chrono> #include <unordered_map> #include <iosfwd> #include <stdint.h> @@ -50,8 +51,10 @@ public: using PerNodeStats = std::unordered_map<uint16_t, BucketSpacesStats>; private: - PerNodeStats _node_stats; + PerNodeStats _node_stats; NodeMaintenanceStats _total_stats; + std::chrono::seconds _max_observed_time_since_last_gc; + static const NodeMaintenanceStats _emptyNodeMaintenanceStats; public: @@ -83,6 +86,10 @@ public: ++_total_stats.total; } + void update_observed_time_since_last_gc(std::chrono::seconds time_since_gc) noexcept { + _max_observed_time_since_last_gc = std::max(time_since_gc, _max_observed_time_since_last_gc); + } + /** * Returned statistics for a given node index and bucket space, or all zero statistics * if none have been recorded yet @@ -109,8 +116,13 @@ public: return _total_stats; } + std::chrono::seconds max_observed_time_since_last_gc() const noexcept { + return _max_observed_time_since_last_gc; + } + bool operator==(const NodeMaintenanceStatsTracker& rhs) const { - return _node_stats == rhs._node_stats; + return ((_node_stats == rhs._node_stats) && + (_max_observed_time_since_last_gc == rhs._max_observed_time_since_last_gc)); } void merge(const NodeMaintenanceStatsTracker& rhs); }; diff --git a/storage/src/vespa/storage/distributor/statecheckers.cpp b/storage/src/vespa/storage/distributor/statecheckers.cpp index 053f5956b10..a2069219d5d 100644 --- a/storage/src/vespa/storage/distributor/statecheckers.cpp +++ b/storage/src/vespa/storage/distributor/statecheckers.cpp @@ -1127,25 +1127,35 @@ BucketStateStateChecker::check(StateChecker::Context& c) } bool -GarbageCollectionStateChecker::needsGarbageCollection(const Context& c) const +GarbageCollectionStateChecker::garbage_collection_disabled(const Context& c) const noexcept { - if (c.entry->getNodeCount() == 0 || c.distributorConfig.getGarbageCollectionInterval() == vespalib::duration::zero()) { + return (c.distributorConfig.getGarbageCollectionInterval() == vespalib::duration::zero()); +} + +bool +GarbageCollectionStateChecker::needs_garbage_collection(const Context& c, std::chrono::seconds time_since_epoch) const +{ + if (c.entry->getNodeCount() == 0) { return false; } if (containsMaintenanceNode(c.idealState, c)) { return false; } std::chrono::seconds lastRunAt(c.entry->getLastGarbageCollectionTime()); - std::chrono::seconds currentTime( - c.node_ctx.clock().getTimeInSeconds().getTime()); - - return c.gcTimeCalculator.shouldGc(c.getBucketId(), currentTime, lastRunAt); + return c.gcTimeCalculator.shouldGc(c.getBucketId(), time_since_epoch, lastRunAt); } StateChecker::Result GarbageCollectionStateChecker::check(Context& c) { - if (needsGarbageCollection(c)) { + if (garbage_collection_disabled(c)) { + return Result::noMaintenanceNeeded(); + } + const std::chrono::seconds now(c.node_ctx.clock().getTimeInSeconds().getTime()); + const std::chrono::seconds last_run_at(c.entry->getLastGarbageCollectionTime()); + c.stats.update_observed_time_since_last_gc(now - last_run_at); + + if (needs_garbage_collection(c, now)) { auto op = std::make_unique<GarbageCollectionOperation>( c.node_ctx, BucketAndNodes(c.getBucket(), c.entry->getNodes())); @@ -1154,7 +1164,7 @@ GarbageCollectionStateChecker::check(Context& c) reason << "[Needs garbage collection: Last check at " << c.entry->getLastGarbageCollectionTime() << ", current time " - << c.node_ctx.clock().getTimeInSeconds().getTime() + << now.count() << ", configured interval " << vespalib::to_s(c.distributorConfig.getGarbageCollectionInterval()) << "]"; diff --git a/storage/src/vespa/storage/distributor/statecheckers.h b/storage/src/vespa/storage/distributor/statecheckers.h index 32e6623785f..cce66dadec6 100644 --- a/storage/src/vespa/storage/distributor/statecheckers.h +++ b/storage/src/vespa/storage/distributor/statecheckers.h @@ -108,9 +108,11 @@ class GarbageCollectionStateChecker : public StateChecker { public: std::string getStatusText() const override { return "Garbage collection"; } - bool needsGarbageCollection(const Context& c) const; Result check(Context& c) override; const char* getName() const override { return "GarbageCollection"; } +private: + [[nodiscard]] bool garbage_collection_disabled(const Context& c) const noexcept; + [[nodiscard]] bool needs_garbage_collection(const Context& c, std::chrono::seconds time_since_epoch) const; }; } |