Do not block scanning all buckets on first cluster state edge

Scanning all buckets is expensive and blocks the main distributor thread during the entire process. Instead, use the standard "recovery mode" functionality that is triggered for all subsequent state transitions. Recovery mode scans allow client operations to be scheduled alongside the scanning, but still tries to scan the DB as quickly as possible. There shouldn't be anything that special with the first cluster state that implies a full scan is inherently required.
author: Tor Brede Vekterli <vekterli@verizonmedia.com> 2019-06-04 15:48:16 +0000
committer: Tor Brede Vekterli <vekterli@verizonmedia.com> 2019-06-06 11:50:23 +0000
commit: 3c8f88a252127b98ea7a94e123b5d65a56ffd571 (patch)
tree: 581c77dd672109dd1f9d0e6d3d3ca2c793cac05a /storage
parent: 60f28da57c182f342f2af0f22cf3244a105c7d2c (diff)
3 files changed, 16 insertions, 10 deletions
diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp
index cdaa6e9aaa3..2ddf41236c9 100644
--- a/storage/src/tests/distributor/bucketdbupdatertest.cpp
+++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp
@@ -544,6 +544,10 @@ public:
     }
 
     uint32_t populate_bucket_db_via_request_bucket_info_for_benchmarking();
+
+    void complete_recovery_mode() {
+        _distributor->scanAllBuckets();
+    }
 };
 
 BucketDBUpdaterTest::BucketDBUpdaterTest()
@@ -1900,8 +1904,8 @@ TEST_F(BucketDBUpdaterTest, testClusterStateAlwaysSendsFullFetchWhenDistribution
 TEST_F(BucketDBUpdaterTest, testChangedDistributionConfigTriggersRecoveryMode) {
     ASSERT_NO_FATAL_FAILURE(setAndEnableClusterState(lib::ClusterState("distributor:6 storage:6"), messageCount(6), 20));
     _sender.clear();
-    // First cluster state; implicit scan of all buckets which does not
-    // use normal recovery mode ticking-path.
+    EXPECT_TRUE(_distributor->isInRecoveryMode());
+    complete_recovery_mode();
     EXPECT_FALSE(_distributor->isInRecoveryMode());
 
     std::string distConfig(getDistConfig6Nodes4Groups());
@@ -1920,6 +1924,8 @@ TEST_F(BucketDBUpdaterTest, testChangedDistributionConfigTriggersRecoveryMode) {
     // Pending cluster state (i.e. distribution) has been enabled, which should
     // cause recovery mode to be entered.
     EXPECT_TRUE(_distributor->isInRecoveryMode());
+    complete_recovery_mode();
+    EXPECT_FALSE(_distributor->isInRecoveryMode());
 }
 
 namespace {
diff --git a/storage/src/tests/distributor/distributortest.cpp b/storage/src/tests/distributor/distributortest.cpp
index 46c756001d9..c519ef0713b 100644
--- a/storage/src/tests/distributor/distributortest.cpp
+++ b/storage/src/tests/distributor/distributortest.cpp
@@ -1072,6 +1072,7 @@ void Distributor_Test::leaving_recovery_mode_immediately_sends_getnodestate_repl
 
 void Distributor_Test::do_test_pending_merge_getnodestate_reply_edge(BucketSpace space) {
     setupDistributor(Redundancy(2), NodeCount(2), "version:1 distributor:1 storage:2");
+    CPPUNIT_ASSERT(_distributor->isInRecoveryMode());
     // 2 buckets with missing replicas triggering merge pending stats
     addNodesToBucketDB(Bucket(space, BucketId(16, 1)), "0=1/1/1/t/a");
     addNodesToBucketDB(Bucket(space, BucketId(16, 2)), "0=1/1/1/t/a");
@@ -1079,29 +1080,30 @@ void Distributor_Test::do_test_pending_merge_getnodestate_reply_edge(BucketSpace
     CPPUNIT_ASSERT(!_distributor->isInRecoveryMode());
     const auto space_name = FixedBucketSpaces::to_string(space);
     assertBucketSpaceStats(2, 0, 1, space_name, _distributor->getBucketSpacesStats());
-    CPPUNIT_ASSERT_EQUAL(size_t(0), explicit_node_state_reply_send_invocations());
+    // First completed scan sends off merge stats et al to cluster controller
+    CPPUNIT_ASSERT_EQUAL(size_t(1), explicit_node_state_reply_send_invocations());
 
     // Edge not triggered when 1 bucket with missing replica left
     addNodesToBucketDB(Bucket(space, BucketId(16, 1)), "0=1/1/1/t/a,1=1/1/1/t");
     tickDistributorNTimes(3);
     assertBucketSpaceStats(1, 1, 1, space_name, _distributor->getBucketSpacesStats());
-    CPPUNIT_ASSERT_EQUAL(size_t(0), explicit_node_state_reply_send_invocations());
+    CPPUNIT_ASSERT_EQUAL(size_t(1), explicit_node_state_reply_send_invocations());
 
     // Edge triggered when no more buckets with requiring merge
     addNodesToBucketDB(Bucket(space, BucketId(16, 2)), "0=1/1/1/t/a,1=1/1/1/t");
     tickDistributorNTimes(3);
     assertBucketSpaceStats(0, 2, 1, space_name, _distributor->getBucketSpacesStats());
-    CPPUNIT_ASSERT_EQUAL(size_t(1), explicit_node_state_reply_send_invocations());
+    CPPUNIT_ASSERT_EQUAL(size_t(2), explicit_node_state_reply_send_invocations());
 
     // Should only send when edge happens, not in subsequent DB iterations
     tickDistributorNTimes(10);
-    CPPUNIT_ASSERT_EQUAL(size_t(1), explicit_node_state_reply_send_invocations());
+    CPPUNIT_ASSERT_EQUAL(size_t(2), explicit_node_state_reply_send_invocations());
 
     // Going back to merges pending should _not_ send a getnodestate reply (at least for now)
     addNodesToBucketDB(Bucket(space, BucketId(16, 1)), "0=1/1/1/t/a");
     tickDistributorNTimes(3);
     assertBucketSpaceStats(1, 1, 1, space_name, _distributor->getBucketSpacesStats());
-    CPPUNIT_ASSERT_EQUAL(size_t(1), explicit_node_state_reply_send_invocations());
+    CPPUNIT_ASSERT_EQUAL(size_t(2), explicit_node_state_reply_send_invocations());
 }
 
 void Distributor_Test::pending_to_no_pending_default_merges_edge_immediately_sends_getnodestate_replies() {
diff --git a/storage/src/vespa/storage/distributor/distributor.cpp b/storage/src/vespa/storage/distributor/distributor.cpp
index 771baa04fca..7cb7d687446 100644
--- a/storage/src/vespa/storage/distributor/distributor.cpp
+++ b/storage/src/vespa/storage/distributor/distributor.cpp
@@ -359,12 +359,10 @@ Distributor::enableClusterStateBundle(const lib::ClusterStateBundle& state)
     if (!_doneInitializing &&
         baselineState.getNodeState(myNode).getState() == lib::State::UP)
     {
-        scanAllBuckets();
         _doneInitializing = true;
         _doneInitializeHandler.notifyDoneInitializing();
-    } else {
-        enterRecoveryMode();
     }
+    enterRecoveryMode();
 
     // Clear all active messages on nodes that are down.
     for (uint16_t i = 0; i < baselineState.getNodeCount(lib::NodeType::STORAGE); ++i) {
author	Tor Brede Vekterli <vekterli@verizonmedia.com>	2019-06-04 15:48:16 +0000
committer	Tor Brede Vekterli <vekterli@verizonmedia.com>	2019-06-06 11:50:23 +0000
commit	3c8f88a252127b98ea7a94e123b5d65a56ffd571 (patch)
tree	581c77dd672109dd1f9d0e6d3d3ca2c793cac05a /storage
parent	60f28da57c182f342f2af0f22cf3244a105c7d2c (diff)