author    Tor Brede Vekterli <vekterli@yahooinc.com>  2022-01-03 13:54:40 +0000
committer Tor Brede Vekterli <vekterli@yahooinc.com>  2022-01-03 13:54:40 +0000
commit    831cd97c7ef749ba486e524122a90e1513a139fd (patch)
tree      fea066f3c11c793ac33fb577ce4303ea14464458
parent    1f3e5d1003a33d9a7076575eab8059c582611cdb (diff)
Invalidate bucket DB replica statistics upon recovery mode entry
The replica stats track the minimum replication factor across all buckets the distributor maintains for a given content node. These statistics may be asynchronously queried by the cluster controller through the host info reporting API. If we do not invalidate the statistics upon a cluster state change, there is a very small window of time where the distributor may report back _stale_ statistics that were valid for the _prior_ cluster state version but not for the new one. This can happen if the cluster controller fetches host info from the node between the start of the recovery period and the completion of the recovery-mode DB scan. Receiving stale replication statistics may cause the cluster controller to erroneously believe that replication due to node retirements etc. has completed earlier than it really has, possibly leading to suboptimal orchestration decisions.
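To make the fix concrete, the following is a minimal standalone sketch of the invalidation pattern this commit applies (hypothetical types and names, not the actual Vespa classes): on every cluster state change the per-node min-replica map is reset to explicit zeroes for all available nodes, and real values only reappear once the next full bucket DB scan completes.

// Minimal standalone sketch (hypothetical, not Vespa code) of the pattern:
// zero out per-node min-replica stats on state change, repopulate after scan.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <utility>
#include <vector>

struct MinReplicaStats {
    // node index -> minimum replication factor observed for that node
    std::unordered_map<uint16_t, uint32_t> min_replica;

    // Called on recovery mode entry (new cluster state / distribution config).
    // An explicit zero means "scan pending; replication factor unknown".
    void invalidate_for_new_state(const std::vector<uint16_t>& available_nodes) {
        min_replica.clear();
        for (uint16_t node : available_nodes) {
            min_replica[node] = 0;
        }
    }

    // Called once the recovery-mode DB scan has visited every bucket.
    void publish_scan_result(std::unordered_map<uint16_t, uint32_t> scanned) {
        min_replica = std::move(scanned);
    }
};

int main() {
    MinReplicaStats stats;
    stats.publish_scan_result({{0, 2}, {2, 2}}); // stats valid for state v1
    stats.invalidate_for_new_state({0, 2});      // state v2 arrives; v1 stats dropped
    // A host info poll landing here sees explicit zeroes, never stale v1 numbers.
    for (const auto& [node, factor] : stats.min_replica) {
        std::cout << "node " << node << ": min replica " << factor << '\n';
    }
    return 0;
}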
-rw-r--r--  storage/src/tests/distributor/top_level_distributor_test.cpp  30
-rw-r--r--  storage/src/vespa/storage/common/hostreporter/hostinfo.cpp     3
-rw-r--r--  storage/src/vespa/storage/common/hostreporter/hostreporter.h   2
-rw-r--r--  storage/src/vespa/storage/distributor/distributor_stripe.cpp  36
-rw-r--r--  storage/src/vespa/storage/distributor/distributor_stripe.h     4
-rw-r--r--  storage/src/vespa/storage/distributor/min_replica_provider.h   2
6 files changed, 61 insertions, 16 deletions
diff --git a/storage/src/tests/distributor/top_level_distributor_test.cpp b/storage/src/tests/distributor/top_level_distributor_test.cpp
index 548f0f6be2c..a0477e352d1 100644
--- a/storage/src/tests/distributor/top_level_distributor_test.cpp
+++ b/storage/src/tests/distributor/top_level_distributor_test.cpp
@@ -90,6 +90,10 @@ struct TopLevelDistributorTest : Test, TopLevelDistributorTestUtil {
return _distributor->getBucketSpacesStats();
}
+ std::unordered_map<uint16_t, uint32_t> distributor_min_replica_stats() {
+ return _distributor->getMinReplica();
+ }
+
uint64_t db_sample_interval_sec() const noexcept {
// Sampling interval is equal across stripes, so just grab the first one and go with it.
return std::chrono::duration_cast<std::chrono::seconds>(
@@ -471,7 +475,7 @@ TEST_F(TopLevelDistributorTest, host_info_reporter_config_is_propagated_to_repor
namespace {
-void assert_invalid_stats_for_all_spaces(
+void assert_invalid_bucket_stats_for_all_spaces(
const BucketSpacesStatsProvider::PerNodeBucketSpacesStats& stats,
uint16_t node_index)
{
@@ -486,9 +490,15 @@ void assert_invalid_stats_for_all_spaces(
ASSERT_FALSE(space_iter->second.valid());
}
+void assert_min_replica_stats_zeroed(const std::unordered_map<uint16_t, uint32_t>& stats, uint16_t node_index) {
+ auto iter = stats.find(node_index);
+ ASSERT_TRUE(iter != stats.cend());
+ EXPECT_EQ(iter->second, 0);
}
-TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_stats_across_all_stripes) {
+}
+
+TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_and_min_replica_stats_across_all_stripes) {
// Set up a cluster state + DB contents which implies merge maintenance ops
setup_distributor(Redundancy(2), NodeCount(2), "version:1 distributor:1 storage:2");
add_nodes_to_stripe_bucket_db(document::BucketId(16, 1), "0=1/1/1/t/a");
@@ -503,10 +513,18 @@ TEST_F(TopLevelDistributorTest, entering_recovery_mode_resets_bucket_space_stats
// from state version 2. Exposing stats from version 1 risks reporting stale
// information back to the cluster controller.
const auto stats = distributor_bucket_spaces_stats();
- ASSERT_EQ(2, stats.size());
-
- assert_invalid_stats_for_all_spaces(stats, 0);
- assert_invalid_stats_for_all_spaces(stats, 2);
+ ASSERT_EQ(stats.size(), 2);
+
+ assert_invalid_bucket_stats_for_all_spaces(stats, 0);
+ assert_invalid_bucket_stats_for_all_spaces(stats, 2);
+
+ auto min_replica_stats = distributor_min_replica_stats();
+ ASSERT_EQ(min_replica_stats.size(), 2);
+ assert_min_replica_stats_zeroed(min_replica_stats, 0);
+ // Even though we don't have any replicas on node 2 in the DB, we don't know this until
+ // we've completed a full DB scan and updated the stats. Until that point in time we
+ // have to assume we _do_ have replicas with an unknown replication factor.
+ assert_min_replica_stats_zeroed(min_replica_stats, 2);
}
TEST_F(TopLevelDistributorTest, leaving_recovery_mode_immediately_sends_getnodestate_replies) {
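The zero-as-unknown convention the test asserts above matters on the consuming side as well. As a purely illustrative sketch (hypothetical helper, not actual cluster controller code), a consumer of the reported stats should treat a zero minimum replication factor as inconclusive rather than as evidence that replication has completed:

// Illustrative only: interpreting reported min-replica stats. A value of 0
// means "unknown / DB scan pending" and must not be read as proof that
// replication has completed for the node.
#include <cstdint>
#include <optional>
#include <unordered_map>

std::optional<bool> node_fully_replicated(
        const std::unordered_map<uint16_t, uint32_t>& min_replica_per_node,
        uint16_t node, uint32_t redundancy) {
    auto iter = min_replica_per_node.find(node);
    if (iter == min_replica_per_node.end() || iter->second == 0) {
        return std::nullopt; // inconclusive; defer orchestration decisions
    }
    return iter->second >= redundancy;
}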
diff --git a/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp b/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp
index f15885769e6..7ff999735c0 100644
--- a/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp
+++ b/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp
@@ -9,8 +9,7 @@ HostInfo::HostInfo() {
registerReporter(&versionReporter);
}
-HostInfo::~HostInfo() {
-}
+HostInfo::~HostInfo() = default;
void HostInfo::printReport(vespalib::JsonStream& report) {
for (HostReporter* reporter : customReporters) {
diff --git a/storage/src/vespa/storage/common/hostreporter/hostreporter.h b/storage/src/vespa/storage/common/hostreporter/hostreporter.h
index 115115328cc..6ce6bd803df 100644
--- a/storage/src/vespa/storage/common/hostreporter/hostreporter.h
+++ b/storage/src/vespa/storage/common/hostreporter/hostreporter.h
@@ -11,7 +11,7 @@ namespace storage {
class HostReporter {
public:
virtual void report(vespalib::JsonStream& jsonreport) = 0;
- virtual ~HostReporter() {}
+ virtual ~HostReporter() = default;
};
}
diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.cpp b/storage/src/vespa/storage/distributor/distributor_stripe.cpp
index 50c70306d92..bcba976f2c3 100644
--- a/storage/src/vespa/storage/distributor/distributor_stripe.cpp
+++ b/storage/src/vespa/storage/distributor/distributor_stripe.cpp
@@ -316,9 +316,12 @@ DistributorStripe::enterRecoveryMode()
LOG(debug, "Entering recovery mode");
_schedulingMode = MaintenanceScheduler::RECOVERY_SCHEDULING_MODE;
_scanner->reset();
- _bucketDBMetricUpdater.reset();
- // TODO reset _bucketDbStats?
- invalidate_bucket_spaces_stats();
+ // We enter recovery mode due to cluster state or distribution config changes.
+ // Until we have completed a new DB scan round, we don't know the state of our
+ // newly owned buckets and must not report stats for these out to the cluster
+ // controller as they will be stale (valid only for the _previous_ state/config).
+ // As a consequence, we must explicitly invalidate all such statistics in this edge.
+ invalidate_internal_db_dependent_stats();
_recoveryTimeStarted = framework::MilliSecTimer(_component.getClock());
}
@@ -337,6 +340,17 @@ DistributorStripe::leaveRecoveryMode()
_schedulingMode = MaintenanceScheduler::NORMAL_SCHEDULING_MODE;
}
+void
+DistributorStripe::invalidate_internal_db_dependent_stats()
+{
+ _bucketDBMetricUpdater.reset();
+ {
+ std::lock_guard guard(_metricLock);
+ invalidate_bucket_spaces_stats(guard);
+ invalidate_min_replica_stats(guard);
+ }
+}
+
template <typename NodeFunctor>
void DistributorStripe::for_each_available_content_node_in(const lib::ClusterState& state, NodeFunctor&& func) {
const auto node_count = state.getNodeCount(lib::NodeType::STORAGE);
@@ -357,8 +371,9 @@ BucketSpacesStatsProvider::BucketSpacesStats DistributorStripe::make_invalid_sta
return invalid_space_stats;
}
-void DistributorStripe::invalidate_bucket_spaces_stats() {
- std::lock_guard guard(_metricLock);
+void
+DistributorStripe::invalidate_bucket_spaces_stats([[maybe_unused]] std::lock_guard<std::mutex>& held_metric_lock)
+{
_bucketSpacesStats = BucketSpacesStatsProvider::PerNodeBucketSpacesStats();
auto invalid_space_stats = make_invalid_stats_per_configured_space();
@@ -369,6 +384,17 @@ void DistributorStripe::invalidate_bucket_spaces_stats() {
}
void
+DistributorStripe::invalidate_min_replica_stats([[maybe_unused]] std::lock_guard<std::mutex>& held_metric_lock)
+{
+ _bucketDbStats._minBucketReplica.clear();
+ // Insert an explicit zero value for all nodes that are up in the pending/current cluster state
+ const auto& baseline = *_clusterStateBundle.getBaselineClusterState();
+ for_each_available_content_node_in(baseline, [this](const lib::Node& node) {
+ _bucketDbStats._minBucketReplica[node.getIndex()] = 0;
+ });
+}
+
+void
DistributorStripe::recheckBucketInfo(uint16_t nodeIdx, const document::Bucket &bucket) {
_bucketDBUpdater.recheckBucketInfo(nodeIdx, bucket);
}
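One refactoring detail above is worth calling out: invalidate_bucket_spaces_stats and invalidate_min_replica_stats now take a std::lock_guard<std::mutex>& parameter solely as compile-time evidence that the caller holds _metricLock. A minimal standalone sketch of the idiom, with hypothetical names:

// Sketch of the "pass the lock_guard as proof of locking" idiom: helpers that
// touch guarded state require a reference to a live guard, so they cannot be
// called without the mutex held. The parameter is otherwise unused, hence
// [[maybe_unused]].
#include <cstdint>
#include <mutex>
#include <unordered_map>

class StatsHolder {
public:
    void invalidate_all() {
        std::lock_guard guard(_lock);
        invalidate_stats(guard); // compiles only with a guard in hand
    }
private:
    void invalidate_stats([[maybe_unused]] std::lock_guard<std::mutex>& held_lock) {
        _min_replica.clear(); // safe: caller holds _lock
    }
    std::mutex _lock;
    std::unordered_map<uint16_t, uint32_t> _min_replica;
};

The guard reference is never used at runtime; it exists purely so the compiler enforces the locking protocol at every call site.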
diff --git a/storage/src/vespa/storage/distributor/distributor_stripe.h b/storage/src/vespa/storage/distributor/distributor_stripe.h
index ce6a2071efd..809b4dd0e41 100644
--- a/storage/src/vespa/storage/distributor/distributor_stripe.h
+++ b/storage/src/vespa/storage/distributor/distributor_stripe.h
@@ -259,7 +259,9 @@ private:
BucketSpacesStatsProvider::BucketSpacesStats make_invalid_stats_per_configured_space() const;
template <typename NodeFunctor>
void for_each_available_content_node_in(const lib::ClusterState&, NodeFunctor&&);
- void invalidate_bucket_spaces_stats();
+ void invalidate_internal_db_dependent_stats();
+ void invalidate_bucket_spaces_stats(std::lock_guard<std::mutex>& held_metric_lock);
+ void invalidate_min_replica_stats(std::lock_guard<std::mutex>& held_metric_lock);
void send_updated_host_info_if_required();
void propagate_config_snapshot_to_internal_components();
diff --git a/storage/src/vespa/storage/distributor/min_replica_provider.h b/storage/src/vespa/storage/distributor/min_replica_provider.h
index 56fd1e8fc81..a4374b906fe 100644
--- a/storage/src/vespa/storage/distributor/min_replica_provider.h
+++ b/storage/src/vespa/storage/distributor/min_replica_provider.h
@@ -9,7 +9,7 @@ namespace storage::distributor {
class MinReplicaProvider
{
public:
- virtual ~MinReplicaProvider() {}
+ virtual ~MinReplicaProvider() = default;
/**
* Get a snapshot of the minimum bucket replica for each of the nodes.