diff options
6 files changed, 61 insertions, 16 deletions
diff --git a/storage/src/tests/distributor/top_level_distributor_test.cpp b/storage/src/tests/distributor/top_level_distributor_test.cpp index 548f0f6be2c..a0477e352d1 100644 --- a/storage/src/tests/distributor/top_level_distributor_test.cpp +++ b/storage/src/tests/distributor/top_level_distributor_test.cpp @@ -90,6 +90,10 @@ struct TopLevelDistributorTest : Test, TopLevelDistributorTestUtil { return _distributor->getBucketSpacesStats(); } + std::unordered_map<uint16_t, uint32_t> distributor_min_replica_stats() { + return _distributor->getMinReplica(); + } + uint64_t db_sample_interval_sec() const noexcept { // Sampling interval is equal across stripes, so just grab the first one and go with it. return std::chrono::duration_cast<std::chrono::seconds>( @@ -471,7 +475,7 @@ TEST_F(TopLevelDistributorTest, host_info_reporter_config_is_propagated_to_repor namespace { -void assert_invalid_stats_for_all_spaces( +void assert_invalid_bucket_stats_for_all_spaces( const BucketSpacesStatsProvider::PerNodeBucketSpacesStats& stats, uint16_t node_index) { @@ -486,9 +490,15 @@ void assert_invalid_stats_for_all_spaces( ASSERT_FALSE(space_iter->second.valid()); } +void assert_min_replica_stats_zeroed(const std::unordered_map<uint16_t, uint32_t>& stats, uint16_t node_index) { + auto iter = stats.find(node_index); + ASSERT_TRUE(iter != stats.cend()); + EXPECT_EQ(iter->second, 0); } -TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_stats_across_all_stripes) { +} + +TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_and_min_replica_stats_across_all_stripes) { // Set up a cluster state + DB contents which implies merge maintenance ops setup_distributor(Redundancy(2), NodeCount(2), "version:1 distributor:1 storage:2"); add_nodes_to_stripe_bucket_db(document::BucketId(16, 1), "0=1/1/1/t/a"); @@ -503,10 +513,18 @@ TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_stats // from state version 2. Exposing stats from version 1 risks reporting stale // information back to the cluster controller. const auto stats = distributor_bucket_spaces_stats(); - ASSERT_EQ(2, stats.size()); - - assert_invalid_stats_for_all_spaces(stats, 0); - assert_invalid_stats_for_all_spaces(stats, 2); + ASSERT_EQ(stats.size(), 2); + + assert_invalid_bucket_stats_for_all_spaces(stats, 0); + assert_invalid_bucket_stats_for_all_spaces(stats, 2); + + auto min_replica_stats = distributor_min_replica_stats(); + ASSERT_EQ(min_replica_stats.size(), 2); + assert_min_replica_stats_zeroed(min_replica_stats, 0); + // Even though we don't have any replicas on node 2 in the DB, we don't know this until + // we've completed a full DB scan and updated the stats. Until that point in time we + // have to assume we _do_ have replicas with an unknown replication factor. + assert_min_replica_stats_zeroed(min_replica_stats, 2); } TEST_F(TopLevelDistributorTest, leaving_recovery_mode_immediately_sends_getnodestate_replies) { diff --git a/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp b/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp index f15885769e6..7ff999735c0 100644 --- a/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp +++ b/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp @@ -9,8 +9,7 @@ HostInfo::HostInfo() { registerReporter(&versionReporter); } -HostInfo::~HostInfo() { -} +HostInfo::~HostInfo() = default; void HostInfo::printReport(vespalib::JsonStream& report) { for (HostReporter* reporter : customReporters) { diff --git a/storage/src/vespa/storage/common/hostreporter/hostreporter.h b/storage/src/vespa/storage/common/hostreporter/hostreporter.h index 115115328cc..6ce6bd803df 100644 --- a/storage/src/vespa/storage/common/hostreporter/hostreporter.h +++ b/storage/src/vespa/storage/common/hostreporter/hostreporter.h @@ -11,7 +11,7 @@ namespace storage { class HostReporter { public: virtual void report(vespalib::JsonStream& jsonreport) = 0; - virtual ~HostReporter() {} + virtual ~HostReporter() = default; }; } diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.cpp b/storage/src/vespa/storage/distributor/distributor_stripe.cpp index 50c70306d92..bcba976f2c3 100644 --- a/storage/src/vespa/storage/distributor/distributor_stripe.cpp +++ b/storage/src/vespa/storage/distributor/distributor_stripe.cpp @@ -316,9 +316,12 @@ DistributorStripe::enterRecoveryMode() LOG(debug, "Entering recovery mode"); _schedulingMode = MaintenanceScheduler::RECOVERY_SCHEDULING_MODE; _scanner->reset(); - _bucketDBMetricUpdater.reset(); - // TODO reset _bucketDbStats? - invalidate_bucket_spaces_stats(); + // We enter recovery mode due to cluster state or distribution config changes. + // Until we have completed a new DB scan round, we don't know the state of our + // newly owned buckets and must not report stats for these out to the cluster + // controller as they will be stale (valid only for the _previous_ state/config). + // As a consequence, we must explicitly invalidate all such statistics in this edge. + invalidate_internal_db_dependent_stats(); _recoveryTimeStarted = framework::MilliSecTimer(_component.getClock()); } @@ -337,6 +340,17 @@ DistributorStripe::leaveRecoveryMode() _schedulingMode = MaintenanceScheduler::NORMAL_SCHEDULING_MODE; } +void +DistributorStripe::invalidate_internal_db_dependent_stats() +{ + _bucketDBMetricUpdater.reset(); + { + std::lock_guard guard(_metricLock); + invalidate_bucket_spaces_stats(guard); + invalidate_min_replica_stats(guard); + } +} + template <typename NodeFunctor> void DistributorStripe::for_each_available_content_node_in(const lib::ClusterState& state, NodeFunctor&& func) { const auto node_count = state.getNodeCount(lib::NodeType::STORAGE); @@ -357,8 +371,9 @@ BucketSpacesStatsProvider::BucketSpacesStats DistributorStripe::make_invalid_sta return invalid_space_stats; } -void DistributorStripe::invalidate_bucket_spaces_stats() { - std::lock_guard guard(_metricLock); +void +DistributorStripe::invalidate_bucket_spaces_stats([[maybe_unused]] std::lock_guard<std::mutex>& held_metric_lock) +{ _bucketSpacesStats = BucketSpacesStatsProvider::PerNodeBucketSpacesStats(); auto invalid_space_stats = make_invalid_stats_per_configured_space(); @@ -369,6 +384,17 @@ void DistributorStripe::invalidate_bucket_spaces_stats() { } void +DistributorStripe::invalidate_min_replica_stats([[maybe_unused]] std::lock_guard<std::mutex>& held_metric_lock) +{ + _bucketDbStats._minBucketReplica.clear(); + // Insert an explicit zero value for all nodes that are up in the pending/current cluster state + const auto& baseline = *_clusterStateBundle.getBaselineClusterState(); + for_each_available_content_node_in(baseline, [this](const lib::Node& node) { + _bucketDbStats._minBucketReplica[node.getIndex()] = 0; + }); +} + +void DistributorStripe::recheckBucketInfo(uint16_t nodeIdx, const document::Bucket &bucket) { _bucketDBUpdater.recheckBucketInfo(nodeIdx, bucket); } diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.h b/storage/src/vespa/storage/distributor/distributor_stripe.h index ce6a2071efd..809b4dd0e41 100644 --- a/storage/src/vespa/storage/distributor/distributor_stripe.h +++ b/storage/src/vespa/storage/distributor/distributor_stripe.h @@ -259,7 +259,9 @@ private: BucketSpacesStatsProvider::BucketSpacesStats make_invalid_stats_per_configured_space() const; template <typename NodeFunctor> void for_each_available_content_node_in(const lib::ClusterState&, NodeFunctor&&); - void invalidate_bucket_spaces_stats(); + void invalidate_internal_db_dependent_stats(); + void invalidate_bucket_spaces_stats(std::lock_guard<std::mutex>& held_metric_lock); + void invalidate_min_replica_stats(std::lock_guard<std::mutex>& held_metric_lock); void send_updated_host_info_if_required(); void propagate_config_snapshot_to_internal_components(); diff --git a/storage/src/vespa/storage/distributor/min_replica_provider.h b/storage/src/vespa/storage/distributor/min_replica_provider.h index 56fd1e8fc81..a4374b906fe 100644 --- a/storage/src/vespa/storage/distributor/min_replica_provider.h +++ b/storage/src/vespa/storage/distributor/min_replica_provider.h @@ -9,7 +9,7 @@ namespace storage::distributor { class MinReplicaProvider { public: - virtual ~MinReplicaProvider() {} + virtual ~MinReplicaProvider() = default; /** * Get a snapshot of the minimum bucket replica for each of the nodes. |