From 7fe561bae2574f6ce70fe3255c57ff7308310b9a Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Tue, 23 Nov 2021 15:46:19 +0000 Subject: Handle case where bucket spaces have differing maintenance state for a node Only skip deactivating buckets if the entire _node_ is marked as maintenance state, i.e. the node has maintenance state across all bucket spaces provided in the bundle. Otherwise treat the state transition as if the node goes down, deactivating all buckets. Also ensure that the bucket deactivation logic above the SPI is identical to that within Proton. This avoids bucket DBs getting out of sync between the two. --- .../documentdb/buckethandler/buckethandler_test.cpp | 13 +++++++++++++ .../vespa/searchcore/proton/server/buckethandler.cpp | 18 +++++++++++++----- .../searchcore/proton/server/clusterstatehandler.cpp | 3 ++- .../src/vespa/searchcore/proton/server/proton.cpp | 4 ++-- searchcore/src/vespa/searchcore/proton/server/proton.h | 1 + 5 files changed, 31 insertions(+), 8 deletions(-) (limited to 'searchcore/src') diff --git a/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp b/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp index 82410d28610..cadfa8cd72f 100644 --- a/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp +++ b/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp @@ -281,6 +281,19 @@ TEST_F("node going from maintenance to up state deactivates all buckets", Fixtur EXPECT_FALSE(f._bucketInfo.getInfo().isActive()); } +TEST_F("node going from maintenance to down state deactivates all buckets", Fixture) +{ + f._handler.handleSetCurrentState(f._ready.bucket(2), + BucketInfo::ACTIVE, f._genResult); + f.sync(); + f.setNodeMaintenance(true); + f.sync(); + f.setNodeUp(false); + f.sync(); + f.handleGetBucketInfo(f._ready.bucket(2)); + EXPECT_FALSE(f._bucketInfo.getInfo().isActive()); +} + TEST_MAIN() { TEST_RUN_ALL(); diff --git a/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp b/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp index d329d9ce27c..0d9b2f6aaf6 100644 --- a/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp @@ -144,6 +144,12 @@ BucketHandler::handlePopulateActiveBuckets(document::BucketId::List &buckets, })); } +namespace { +constexpr const char* bool_str(bool v) noexcept { + return v ? "true" : "false"; +} +} + void BucketHandler::notifyClusterStateChanged(const std::shared_ptr & newCalc) { @@ -151,17 +157,19 @@ BucketHandler::notifyClusterStateChanged(const std::shared_ptrnodeUp(); // Up, Retired or Initializing _nodeMaintenance = newCalc->nodeMaintenance(); - LOG(spam, "notifyClusterStateChanged: %s -> %s", oldNodeUp ? "up" : "down", _nodeUp ? "up" : "down"); + LOG(spam, "notifyClusterStateChanged; up: %s -> %s, maintenance: %s -> %s", + bool_str(oldNodeUp), bool_str(_nodeUp), + bool_str(oldNodeMaintenance), bool_str(_nodeMaintenance)); if (_nodeMaintenance) { return; // Don't deactivate buckets in maintenance mode; let query traffic drain away naturally. } // We implicitly deactivate buckets in two edge cases: // - Up -> Down (not maintenance; handled above), since the node can not be expected to offer // any graceful query draining when set Down. - // - Maintenance -> Up, since we'd otherwise introduce a bunch of transient duplicate results - // into queries. The assumption is that the system has already activated buckets on other - // nodes in such a scenario. - if ((oldNodeUp && !_nodeUp) || (oldNodeMaintenance && _nodeUp)) { + // - Maintenance -> !Maintenance, since we'd otherwise introduce a bunch of transient duplicate + // results into queries if we transition to an available state. + // The assumption is that the system has already activated buckets on other nodes in such a scenario. + if ((oldNodeUp && !_nodeUp) || oldNodeMaintenance) { deactivateAllActiveBuckets(); } } diff --git a/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp b/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp index 137b2596eeb..20cd4ced6b2 100644 --- a/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp @@ -56,11 +56,12 @@ ClusterStateHandler::performSetClusterState(const ClusterState *calc, IGenericRe { LOG(debug, "performSetClusterState(): " - "clusterUp(%s), nodeUp(%s), nodeInitializing(%s)" + "clusterUp(%s), nodeUp(%s), nodeInitializing(%s), nodeMaintenance(%s)" "changedHandlers.size() = %zu", (calc->clusterUp() ? "true" : "false"), (calc->nodeUp() ? "true" : "false"), (calc->nodeInitializing() ? "true" : "false"), + (calc->nodeMaintenance() ? "true" : "false"), _changedHandlers.size()); if (!_changedHandlers.empty()) { auto newCalc = std::make_shared(*calc); diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.cpp b/searchcore/src/vespa/searchcore/proton/server/proton.cpp index 621f3acd072..5b21242e397 100644 --- a/searchcore/src/vespa/searchcore/proton/server/proton.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/proton.cpp @@ -865,11 +865,11 @@ Proton::setClusterState(BucketSpace bucketSpace, const storage::spi::ClusterStat // forward info sent by cluster controller to persistence engine // about whether node is supposed to be up or not. Match engine // needs to know this in order to stop serving queries. - bool nodeUpInBucketSpace(calc.nodeUp()); + bool nodeUpInBucketSpace(calc.nodeUp()); // TODO rename calculator function to imply bucket space affinity bool nodeRetired(calc.nodeRetired()); bool nodeUp = updateNodeUp(bucketSpace, nodeUpInBucketSpace); _matchEngine->setNodeUp(nodeUp); - _matchEngine->setNodeMaintenance(calc.nodeMaintenance()); + _matchEngine->setNodeMaintenance(calc.nodeMaintenance()); // Note: _all_ bucket spaces in maintenance if (_memoryFlushConfigUpdater) { _memoryFlushConfigUpdater->setNodeRetired(nodeRetired); } diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.h b/searchcore/src/vespa/searchcore/proton/server/proton.h index 6b0bef50cae..6fd31352051 100644 --- a/searchcore/src/vespa/searchcore/proton/server/proton.h +++ b/searchcore/src/vespa/searchcore/proton/server/proton.h @@ -133,6 +133,7 @@ private: uint32_t getDistributionKey() const override { return _distributionKey; } BootstrapConfig::SP getActiveConfigSnapshot() const; std::shared_ptr getDocumentDBReferenceRegistry() const override; + // Returns true if the node is up in _any_ bucket space bool updateNodeUp(BucketSpace bucketSpace, bool nodeUpInBucketSpace); void closeDocumentDBs(vespalib::ThreadStackExecutorBase & executor); public: -- cgit v1.2.3