summaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2021-10-14 10:16:50 +0000
committerTor Brede Vekterli <vekterli@yahooinc.com>2021-10-14 10:16:50 +0000
commitcd39f17c3f09ef72825aa545c8ba3d0e604efaf0 (patch)
treef1147fead50425cb3f4c2c8562e477b475f123f0 /storage
parente796dcf2409e6c9bfef432eded6cd90d69ce0c27 (diff)
Add metric for max time since bucket GC was last run
Max time is aggregated across all buckets. If this metric value grows substantially larger than the configured GC period it indicates that GC is being starved.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/simplemaintenancescannertest.cpp4
-rw-r--r--storage/src/tests/distributor/statecheckerstest.cpp31
-rw-r--r--storage/src/vespa/storage/distributor/distributor_stripe.cpp2
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemetricsset.cpp4
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemetricsset.h1
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp17
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h16
-rw-r--r--storage/src/vespa/storage/distributor/statecheckers.cpp26
-rw-r--r--storage/src/vespa/storage/distributor/statecheckers.h4
9 files changed, 89 insertions, 16 deletions
diff --git a/storage/src/tests/distributor/simplemaintenancescannertest.cpp b/storage/src/tests/distributor/simplemaintenancescannertest.cpp
index 46732624b46..723b0baa6cd 100644
--- a/storage/src/tests/distributor/simplemaintenancescannertest.cpp
+++ b/storage/src/tests/distributor/simplemaintenancescannertest.cpp
@@ -16,6 +16,7 @@ using document::BucketId;
using document::test::makeBucketSpace;
using Priority = MaintenancePriority;
using namespace ::testing;
+using namespace std::chrono_literals;
struct SimpleMaintenanceScannerTest : Test {
using PendingStats = SimpleMaintenanceScanner::PendingMaintenanceStats;
@@ -255,6 +256,7 @@ TEST_F(SimpleMaintenanceScannerTest, merge_pending_maintenance_stats) {
stats_a.perNodeStats.incCopyingIn(5, default_space);
stats_a.perNodeStats.incCopyingOut(5, global_space);
stats_a.perNodeStats.incTotal(5, default_space);
+ stats_a.perNodeStats.update_observed_time_since_last_gc(100s);
PendingStats stats_b;
stats_b.global.pending[MaintenanceOperation::DELETE_BUCKET] = 10;
@@ -268,6 +270,7 @@ TEST_F(SimpleMaintenanceScannerTest, merge_pending_maintenance_stats) {
stats_b.perNodeStats.incCopyingIn(5, default_space);
stats_b.perNodeStats.incCopyingOut(5, global_space);
stats_b.perNodeStats.incTotal(5, default_space);
+ stats_b.perNodeStats.update_observed_time_since_last_gc(300s);
PendingStats result;
result.merge(stats_a);
@@ -290,6 +293,7 @@ TEST_F(SimpleMaintenanceScannerTest, merge_pending_maintenance_stats) {
exp.perNodeStats.incTotal(5, default_space);
exp.perNodeStats.incMovingOut(7, default_space);
exp.perNodeStats.incSyncing(7, global_space);
+ exp.perNodeStats.update_observed_time_since_last_gc(300s);
EXPECT_EQ(exp.global, result.global);
EXPECT_EQ(exp.perNodeStats, result.perNodeStats);
}
diff --git a/storage/src/tests/distributor/statecheckerstest.cpp b/storage/src/tests/distributor/statecheckerstest.cpp
index c7d2638aa81..25009156f18 100644
--- a/storage/src/tests/distributor/statecheckerstest.cpp
+++ b/storage/src/tests/distributor/statecheckerstest.cpp
@@ -20,6 +20,7 @@
using document::test::makeBucketSpace;
using document::test::makeDocumentBucket;
using namespace ::testing;
+using namespace std::chrono_literals;
namespace storage::distributor {
@@ -1564,6 +1565,18 @@ public:
return *this;
}
+ StateCheckerRunner& time_now(uint32_t secs_since_epoch) {
+ _fixture.getClock().setAbsoluteTimeInSeconds(secs_since_epoch);
+ return *this;
+ }
+
+ StateCheckerRunner& last_gc_at_time(const document::BucketId& bucket, uint32_t secs_since_epoch) {
+ auto entry = _fixture.getBucket(bucket);
+ entry->setLastGarbageCollectionTime(secs_since_epoch);
+ _fixture.getBucketDatabase(makeBucketSpace()).update(entry);
+ return *this;
+ }
+
// Run the templated state checker with the provided parameters, updating
// _result with the ideal state operations triggered.
// NOTE: resets the bucket database!
@@ -1650,4 +1663,22 @@ TEST_F(StateCheckersTest, stats_updated_when_merging_due_to_out_of_sync_copies)
}
}
+TEST_F(StateCheckersTest, stats_updates_for_maximum_time_since_gc_run) {
+ setup_stripe(1, 1, "distributor:1 storage:1");
+
+ auto cfg = make_config();
+ cfg->setGarbageCollection("music", 1000s);
+ cfg->setLastGarbageCollectionChangeTime(vespalib::steady_time(vespalib::duration::zero()));
+ configure_stripe(cfg);
+
+ StateCheckerRunner<GarbageCollectionStateChecker> runner(*this);
+ runner.addToDb({17, 0}, "1=1,3=2")
+ .clusterState("distributor:1 storage:4")
+ .time_now(2000)
+ .last_gc_at_time({17, 0}, 100)
+ .runFor({17, 0});
+
+ EXPECT_EQ(runner.stats().max_observed_time_since_last_gc().count(), 1900);
+}
+
}
diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.cpp b/storage/src/vespa/storage/distributor/distributor_stripe.cpp
index d87b2db1638..60f387e9d16 100644
--- a/storage/src/vespa/storage/distributor/distributor_stripe.cpp
+++ b/storage/src/vespa/storage/distributor/distributor_stripe.cpp
@@ -566,6 +566,8 @@ DistributorStripe::propagateInternalScanMetricsToExternal()
ideal_state_metrics.buckets_replicas_copying_out.set(total_stats.copyingOut);
ideal_state_metrics.buckets_replicas_copying_in.set(total_stats.copyingIn);
ideal_state_metrics.buckets_replicas_syncing.set(total_stats.syncing);
+ ideal_state_metrics.max_observed_time_since_last_gc_sec.set(
+ _maintenanceStats.perNodeStats.max_observed_time_since_last_gc().count());
}
}
diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
index 89dc1ada39d..5e57213bc73 100644
--- a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
+++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
@@ -107,6 +107,10 @@ IdealStateMetricSet::IdealStateMetricSet()
buckets_replicas_syncing("bucket_replicas_syncing",
{{"logdefault"},{"yamasdefault"}},
"Bucket replicas that need syncing due to mismatching metadata", this),
+ max_observed_time_since_last_gc_sec("max_observed_time_since_last_gc_sec",
+ {{"logdefault"},{"yamasdefault"}},
+ "Maximum time (in seconds) since GC was last successfully run for a bucket. "
+ "Aggregated max value across all buckets on the distributor.", this),
nodesPerMerge("nodes_per_merge", {}, "The number of nodes involved in a single merge operation.", this)
{
createOperationMetrics();
diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.h b/storage/src/vespa/storage/distributor/idealstatemetricsset.h
index ada1bd50e5c..a9805a41556 100644
--- a/storage/src/vespa/storage/distributor/idealstatemetricsset.h
+++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.h
@@ -45,6 +45,7 @@ public:
metrics::LongValueMetric buckets_replicas_copying_in;
metrics::LongValueMetric buckets_replicas_copying_out;
metrics::LongValueMetric buckets_replicas_syncing;
+ metrics::LongValueMetric max_observed_time_since_last_gc_sec;
metrics::DoubleAverageMetric nodesPerMerge;
void createOperationMetrics();
diff --git a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp
index 5a6dedf8db4..47b8aec4aa6 100644
--- a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp
+++ b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp
@@ -10,11 +10,11 @@ const NodeMaintenanceStats NodeMaintenanceStatsTracker::_emptyNodeMaintenanceSta
void
NodeMaintenanceStats::merge(const NodeMaintenanceStats& rhs)
{
- movingOut += rhs.movingOut;
- syncing += rhs.syncing;
- copyingIn += rhs.copyingIn;
+ movingOut += rhs.movingOut;
+ syncing += rhs.syncing;
+ copyingIn += rhs.copyingIn;
copyingOut += rhs.copyingOut;
- total += rhs.total;
+ total += rhs.total;
}
namespace {
@@ -38,6 +38,8 @@ NodeMaintenanceStatsTracker::merge(const NodeMaintenanceStatsTracker& rhs)
auto node_index = entry.first;
merge_bucket_spaces_stats(_node_stats[node_index], entry.second);
}
+ _max_observed_time_since_last_gc = std::max(_max_observed_time_since_last_gc,
+ rhs._max_observed_time_since_last_gc);
}
std::ostream&
@@ -53,7 +55,12 @@ operator<<(std::ostream& os, const NodeMaintenanceStats& stats)
return os;
}
-NodeMaintenanceStatsTracker::NodeMaintenanceStatsTracker() = default;
+NodeMaintenanceStatsTracker::NodeMaintenanceStatsTracker()
+ : _node_stats(),
+ _total_stats(),
+ _max_observed_time_since_last_gc(0)
+{}
+
NodeMaintenanceStatsTracker::~NodeMaintenanceStatsTracker() = default;
}
diff --git a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h
index eb00c8076ea..0e6c2bb23c2 100644
--- a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h
+++ b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
+#include <chrono>
#include <unordered_map>
#include <iosfwd>
#include <stdint.h>
@@ -50,8 +51,10 @@ public:
using PerNodeStats = std::unordered_map<uint16_t, BucketSpacesStats>;
private:
- PerNodeStats _node_stats;
+ PerNodeStats _node_stats;
NodeMaintenanceStats _total_stats;
+ std::chrono::seconds _max_observed_time_since_last_gc;
+
static const NodeMaintenanceStats _emptyNodeMaintenanceStats;
public:
@@ -83,6 +86,10 @@ public:
++_total_stats.total;
}
+ void update_observed_time_since_last_gc(std::chrono::seconds time_since_gc) noexcept {
+ _max_observed_time_since_last_gc = std::max(time_since_gc, _max_observed_time_since_last_gc);
+ }
+
/**
* Returned statistics for a given node index and bucket space, or all zero statistics
* if none have been recorded yet
@@ -109,8 +116,13 @@ public:
return _total_stats;
}
+ std::chrono::seconds max_observed_time_since_last_gc() const noexcept {
+ return _max_observed_time_since_last_gc;
+ }
+
bool operator==(const NodeMaintenanceStatsTracker& rhs) const {
- return _node_stats == rhs._node_stats;
+ return ((_node_stats == rhs._node_stats) &&
+ (_max_observed_time_since_last_gc == rhs._max_observed_time_since_last_gc));
}
void merge(const NodeMaintenanceStatsTracker& rhs);
};
diff --git a/storage/src/vespa/storage/distributor/statecheckers.cpp b/storage/src/vespa/storage/distributor/statecheckers.cpp
index 053f5956b10..a2069219d5d 100644
--- a/storage/src/vespa/storage/distributor/statecheckers.cpp
+++ b/storage/src/vespa/storage/distributor/statecheckers.cpp
@@ -1127,25 +1127,35 @@ BucketStateStateChecker::check(StateChecker::Context& c)
}
bool
-GarbageCollectionStateChecker::needsGarbageCollection(const Context& c) const
+GarbageCollectionStateChecker::garbage_collection_disabled(const Context& c) const noexcept
{
- if (c.entry->getNodeCount() == 0 || c.distributorConfig.getGarbageCollectionInterval() == vespalib::duration::zero()) {
+ return (c.distributorConfig.getGarbageCollectionInterval() == vespalib::duration::zero());
+}
+
+bool
+GarbageCollectionStateChecker::needs_garbage_collection(const Context& c, std::chrono::seconds time_since_epoch) const
+{
+ if (c.entry->getNodeCount() == 0) {
return false;
}
if (containsMaintenanceNode(c.idealState, c)) {
return false;
}
std::chrono::seconds lastRunAt(c.entry->getLastGarbageCollectionTime());
- std::chrono::seconds currentTime(
- c.node_ctx.clock().getTimeInSeconds().getTime());
-
- return c.gcTimeCalculator.shouldGc(c.getBucketId(), currentTime, lastRunAt);
+ return c.gcTimeCalculator.shouldGc(c.getBucketId(), time_since_epoch, lastRunAt);
}
StateChecker::Result
GarbageCollectionStateChecker::check(Context& c)
{
- if (needsGarbageCollection(c)) {
+ if (garbage_collection_disabled(c)) {
+ return Result::noMaintenanceNeeded();
+ }
+ const std::chrono::seconds now(c.node_ctx.clock().getTimeInSeconds().getTime());
+ const std::chrono::seconds last_run_at(c.entry->getLastGarbageCollectionTime());
+ c.stats.update_observed_time_since_last_gc(now - last_run_at);
+
+ if (needs_garbage_collection(c, now)) {
auto op = std::make_unique<GarbageCollectionOperation>(
c.node_ctx,
BucketAndNodes(c.getBucket(), c.entry->getNodes()));
@@ -1154,7 +1164,7 @@ GarbageCollectionStateChecker::check(Context& c)
reason << "[Needs garbage collection: Last check at "
<< c.entry->getLastGarbageCollectionTime()
<< ", current time "
- << c.node_ctx.clock().getTimeInSeconds().getTime()
+ << now.count()
<< ", configured interval "
<< vespalib::to_s(c.distributorConfig.getGarbageCollectionInterval()) << "]";
diff --git a/storage/src/vespa/storage/distributor/statecheckers.h b/storage/src/vespa/storage/distributor/statecheckers.h
index 32e6623785f..cce66dadec6 100644
--- a/storage/src/vespa/storage/distributor/statecheckers.h
+++ b/storage/src/vespa/storage/distributor/statecheckers.h
@@ -108,9 +108,11 @@ class GarbageCollectionStateChecker : public StateChecker
{
public:
std::string getStatusText() const override { return "Garbage collection"; }
- bool needsGarbageCollection(const Context& c) const;
Result check(Context& c) override;
const char* getName() const override { return "GarbageCollection"; }
+private:
+ [[nodiscard]] bool garbage_collection_disabled(const Context& c) const noexcept;
+ [[nodiscard]] bool needs_garbage_collection(const Context& c, std::chrono::seconds time_since_epoch) const;
};
}