summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--storage/src/tests/distributor/top_level_distributor_test.cpp30
-rw-r--r--storage/src/vespa/storage/common/hostreporter/hostinfo.cpp3
-rw-r--r--storage/src/vespa/storage/common/hostreporter/hostreporter.h2
-rw-r--r--storage/src/vespa/storage/distributor/distributor_stripe.cpp36
-rw-r--r--storage/src/vespa/storage/distributor/distributor_stripe.h4
-rw-r--r--storage/src/vespa/storage/distributor/min_replica_provider.h2
6 files changed, 61 insertions, 16 deletions
diff --git a/storage/src/tests/distributor/top_level_distributor_test.cpp b/storage/src/tests/distributor/top_level_distributor_test.cpp
index 548f0f6be2c..a0477e352d1 100644
--- a/storage/src/tests/distributor/top_level_distributor_test.cpp
+++ b/storage/src/tests/distributor/top_level_distributor_test.cpp
@@ -90,6 +90,10 @@ struct TopLevelDistributorTest : Test, TopLevelDistributorTestUtil {
return _distributor->getBucketSpacesStats();
}
+ std::unordered_map<uint16_t, uint32_t> distributor_min_replica_stats() {
+ return _distributor->getMinReplica();
+ }
+
uint64_t db_sample_interval_sec() const noexcept {
// Sampling interval is equal across stripes, so just grab the first one and go with it.
return std::chrono::duration_cast<std::chrono::seconds>(
@@ -471,7 +475,7 @@ TEST_F(TopLevelDistributorTest, host_info_reporter_config_is_propagated_to_repor
namespace {
-void assert_invalid_stats_for_all_spaces(
+void assert_invalid_bucket_stats_for_all_spaces(
const BucketSpacesStatsProvider::PerNodeBucketSpacesStats& stats,
uint16_t node_index)
{
@@ -486,9 +490,15 @@ void assert_invalid_stats_for_all_spaces(
ASSERT_FALSE(space_iter->second.valid());
}
+void assert_min_replica_stats_zeroed(const std::unordered_map<uint16_t, uint32_t>& stats, uint16_t node_index) {
+ auto iter = stats.find(node_index);
+ ASSERT_TRUE(iter != stats.cend());
+ EXPECT_EQ(iter->second, 0);
}
-TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_stats_across_all_stripes) {
+}
+
+TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_and_min_replica_stats_across_all_stripes) {
// Set up a cluster state + DB contents which implies merge maintenance ops
setup_distributor(Redundancy(2), NodeCount(2), "version:1 distributor:1 storage:2");
add_nodes_to_stripe_bucket_db(document::BucketId(16, 1), "0=1/1/1/t/a");
@@ -503,10 +513,18 @@ TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_stats
// from state version 2. Exposing stats from version 1 risks reporting stale
// information back to the cluster controller.
const auto stats = distributor_bucket_spaces_stats();
- ASSERT_EQ(2, stats.size());
-
- assert_invalid_stats_for_all_spaces(stats, 0);
- assert_invalid_stats_for_all_spaces(stats, 2);
+ ASSERT_EQ(stats.size(), 2);
+
+ assert_invalid_bucket_stats_for_all_spaces(stats, 0);
+ assert_invalid_bucket_stats_for_all_spaces(stats, 2);
+
+ auto min_replica_stats = distributor_min_replica_stats();
+ ASSERT_EQ(min_replica_stats.size(), 2);
+ assert_min_replica_stats_zeroed(min_replica_stats, 0);
+ // Even though we don't have any replicas on node 2 in the DB, we don't know this until
+ // we've completed a full DB scan and updated the stats. Until that point in time we
+ // have to assume we _do_ have replicas with an unknown replication factor.
+ assert_min_replica_stats_zeroed(min_replica_stats, 2);
}
TEST_F(TopLevelDistributorTest, leaving_recovery_mode_immediately_sends_getnodestate_replies) {
diff --git a/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp b/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp
index f15885769e6..7ff999735c0 100644
--- a/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp
+++ b/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp
@@ -9,8 +9,7 @@ HostInfo::HostInfo() {
registerReporter(&versionReporter);
}
-HostInfo::~HostInfo() {
-}
+HostInfo::~HostInfo() = default;
void HostInfo::printReport(vespalib::JsonStream& report) {
for (HostReporter* reporter : customReporters) {
diff --git a/storage/src/vespa/storage/common/hostreporter/hostreporter.h b/storage/src/vespa/storage/common/hostreporter/hostreporter.h
index 115115328cc..6ce6bd803df 100644
--- a/storage/src/vespa/storage/common/hostreporter/hostreporter.h
+++ b/storage/src/vespa/storage/common/hostreporter/hostreporter.h
@@ -11,7 +11,7 @@ namespace storage {
class HostReporter {
public:
virtual void report(vespalib::JsonStream& jsonreport) = 0;
- virtual ~HostReporter() {}
+ virtual ~HostReporter() = default;
};
}
diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.cpp b/storage/src/vespa/storage/distributor/distributor_stripe.cpp
index 50c70306d92..bcba976f2c3 100644
--- a/storage/src/vespa/storage/distributor/distributor_stripe.cpp
+++ b/storage/src/vespa/storage/distributor/distributor_stripe.cpp
@@ -316,9 +316,12 @@ DistributorStripe::enterRecoveryMode()
LOG(debug, "Entering recovery mode");
_schedulingMode = MaintenanceScheduler::RECOVERY_SCHEDULING_MODE;
_scanner->reset();
- _bucketDBMetricUpdater.reset();
- // TODO reset _bucketDbStats?
- invalidate_bucket_spaces_stats();
+ // We enter recovery mode due to cluster state or distribution config changes.
+ // Until we have completed a new DB scan round, we don't know the state of our
+ // newly owned buckets and must not report stats for these out to the cluster
+ // controller as they will be stale (valid only for the _previous_ state/config).
+ // As a consequence, we must explicitly invalidate all such statistics in this edge.
+ invalidate_internal_db_dependent_stats();
_recoveryTimeStarted = framework::MilliSecTimer(_component.getClock());
}
@@ -337,6 +340,17 @@ DistributorStripe::leaveRecoveryMode()
_schedulingMode = MaintenanceScheduler::NORMAL_SCHEDULING_MODE;
}
+void
+DistributorStripe::invalidate_internal_db_dependent_stats()
+{
+ _bucketDBMetricUpdater.reset();
+ {
+ std::lock_guard guard(_metricLock);
+ invalidate_bucket_spaces_stats(guard);
+ invalidate_min_replica_stats(guard);
+ }
+}
+
template <typename NodeFunctor>
void DistributorStripe::for_each_available_content_node_in(const lib::ClusterState& state, NodeFunctor&& func) {
const auto node_count = state.getNodeCount(lib::NodeType::STORAGE);
@@ -357,8 +371,9 @@ BucketSpacesStatsProvider::BucketSpacesStats DistributorStripe::make_invalid_sta
return invalid_space_stats;
}
-void DistributorStripe::invalidate_bucket_spaces_stats() {
- std::lock_guard guard(_metricLock);
+void
+DistributorStripe::invalidate_bucket_spaces_stats([[maybe_unused]] std::lock_guard<std::mutex>& held_metric_lock)
+{
_bucketSpacesStats = BucketSpacesStatsProvider::PerNodeBucketSpacesStats();
auto invalid_space_stats = make_invalid_stats_per_configured_space();
@@ -369,6 +384,17 @@ void DistributorStripe::invalidate_bucket_spaces_stats() {
}
void
+DistributorStripe::invalidate_min_replica_stats([[maybe_unused]] std::lock_guard<std::mutex>& held_metric_lock)
+{
+ _bucketDbStats._minBucketReplica.clear();
+ // Insert an explicit zero value for all nodes that are up in the pending/current cluster state
+ const auto& baseline = *_clusterStateBundle.getBaselineClusterState();
+ for_each_available_content_node_in(baseline, [this](const lib::Node& node) {
+ _bucketDbStats._minBucketReplica[node.getIndex()] = 0;
+ });
+}
+
+void
DistributorStripe::recheckBucketInfo(uint16_t nodeIdx, const document::Bucket &bucket) {
_bucketDBUpdater.recheckBucketInfo(nodeIdx, bucket);
}
diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.h b/storage/src/vespa/storage/distributor/distributor_stripe.h
index ce6a2071efd..809b4dd0e41 100644
--- a/storage/src/vespa/storage/distributor/distributor_stripe.h
+++ b/storage/src/vespa/storage/distributor/distributor_stripe.h
@@ -259,7 +259,9 @@ private:
BucketSpacesStatsProvider::BucketSpacesStats make_invalid_stats_per_configured_space() const;
template <typename NodeFunctor>
void for_each_available_content_node_in(const lib::ClusterState&, NodeFunctor&&);
- void invalidate_bucket_spaces_stats();
+ void invalidate_internal_db_dependent_stats();
+ void invalidate_bucket_spaces_stats(std::lock_guard<std::mutex>& held_metric_lock);
+ void invalidate_min_replica_stats(std::lock_guard<std::mutex>& held_metric_lock);
void send_updated_host_info_if_required();
void propagate_config_snapshot_to_internal_components();
diff --git a/storage/src/vespa/storage/distributor/min_replica_provider.h b/storage/src/vespa/storage/distributor/min_replica_provider.h
index 56fd1e8fc81..a4374b906fe 100644
--- a/storage/src/vespa/storage/distributor/min_replica_provider.h
+++ b/storage/src/vespa/storage/distributor/min_replica_provider.h
@@ -9,7 +9,7 @@ namespace storage::distributor {
class MinReplicaProvider
{
public:
- virtual ~MinReplicaProvider() {}
+ virtual ~MinReplicaProvider() = default;
/**
* Get a snapshot of the minimum bucket replica for each of the nodes.