diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2021-11-23 21:12:52 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-11-23 21:12:52 +0100 |
commit | 6e7385e7858ee5491f028c7012d9928ea340d678 (patch) | |
tree | 20706e75771d68979fa5b6219f49791ed4dd4c8f /searchcore | |
parent | a2f9a7b7d6afcd9e9567e53e5a0f489bddaf3cb4 (diff) | |
parent | 3d31f8967fc8970835b14a615f393ec4acda3394 (diff) |
Merge pull request #20156 from vespa-engine/vekterli/allow-searches-when-node-is-in-maintenance-mode
Continue serving search queries when in Maintenance node state [run-systemtest]
Diffstat (limited to 'searchcore')
11 files changed, 164 insertions, 21 deletions
diff --git a/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp b/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp index 29748a2010c..cadfa8cd72f 100644 --- a/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp +++ b/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp @@ -140,6 +140,11 @@ struct Fixture setNodeUp(bool value) { _calc->setNodeUp(value); + _calc->setNodeMaintenance(false); + _handler.notifyClusterStateChanged(_calc); + } + void setNodeMaintenance(bool value) { + _calc->setNodeMaintenance(value); _handler.notifyClusterStateChanged(_calc); } }; @@ -223,7 +228,7 @@ TEST_F("require that unready bucket can be reported as active", Fixture) } -TEST_F("require that node being down deactivates buckets", Fixture) +TEST_F("node going down (but not into maintenance state) deactivates all buckets", Fixture) { f._handler.handleSetCurrentState(f._ready.bucket(2), BucketInfo::ACTIVE, f._genResult); @@ -252,6 +257,42 @@ TEST_F("require that node being down deactivates buckets", Fixture) EXPECT_EQUAL(true, f._bucketInfo.getInfo().isActive()); } +TEST_F("node going into maintenance state does _not_ deactivate any buckets", Fixture) +{ + f._handler.handleSetCurrentState(f._ready.bucket(2), + BucketInfo::ACTIVE, f._genResult); + f.sync(); + f.setNodeMaintenance(true); + f.sync(); + f.handleGetBucketInfo(f._ready.bucket(2)); + EXPECT_TRUE(f._bucketInfo.getInfo().isActive()); +} + +TEST_F("node going from maintenance to up state deactivates all buckets", Fixture) +{ + f._handler.handleSetCurrentState(f._ready.bucket(2), + BucketInfo::ACTIVE, f._genResult); + f.sync(); + f.setNodeMaintenance(true); + f.sync(); + f.setNodeUp(true); + f.sync(); + f.handleGetBucketInfo(f._ready.bucket(2)); + EXPECT_FALSE(f._bucketInfo.getInfo().isActive()); +} + +TEST_F("node going from maintenance to down state deactivates all buckets", Fixture) +{ + f._handler.handleSetCurrentState(f._ready.bucket(2), + BucketInfo::ACTIVE, f._genResult); + f.sync(); + f.setNodeMaintenance(true); + f.sync(); + f.setNodeUp(false); + f.sync(); + f.handleGetBucketInfo(f._ready.bucket(2)); + EXPECT_FALSE(f._bucketInfo.getInfo().isActive()); +} TEST_MAIN() { diff --git a/searchcore/src/tests/proton/matchengine/matchengine.cpp b/searchcore/src/tests/proton/matchengine/matchengine.cpp index 481a9f061be..34c36fd9a72 100644 --- a/searchcore/src/tests/proton/matchengine/matchengine.cpp +++ b/searchcore/src/tests/proton/matchengine/matchengine.cpp @@ -17,7 +17,7 @@ class MySearchHandler : public ISearchHandler { std::string _name; std::string _reply; public: - MySearchHandler(size_t numHits = 0) : + explicit MySearchHandler(size_t numHits = 0) : _numHits(numHits), _name("my"), _reply("myreply") {} DocsumReply::UP getDocsums(const DocsumRequest &) override { @@ -91,6 +91,7 @@ assertSearchReply(MatchEngine & engine, const std::string & searchDocType, size_ LocalSearchClient client; engine.search(SearchRequest::Source(request), client); SearchReply::UP reply = client.getReply(10000); + ASSERT_TRUE(reply); return EXPECT_EQUAL(expHits, reply->hits.size()); } @@ -173,11 +174,24 @@ TEST("requireThatEmptySearchReplyIsReturnedWhenEngineIsClosed") LocalSearchClient client; SearchRequest::Source request(new SearchRequest()); SearchReply::UP reply = engine.search(std::move(request), client); - EXPECT_TRUE(reply ); + ASSERT_TRUE(reply); EXPECT_EQUAL(0u, reply->hits.size()); EXPECT_EQUAL(7u, reply->getDistributionKey()); } +namespace { + +constexpr const char* search_interface_offline_slime_str() noexcept { + return "{\n" + " \"status\": {\n" + " \"state\": \"OFFLINE\",\n" + " \"message\": \"Search interface is offline\"\n" + " }\n" + "}\n"; +} + +} + TEST("requireThatStateIsReported") { MatchEngine engine(1, 1, 7); @@ -185,14 +199,44 @@ TEST("requireThatStateIsReported") Slime slime; SlimeInserter inserter(slime); engine.get_state(inserter, false); - EXPECT_EQUAL( - "{\n" - " \"status\": {\n" - " \"state\": \"OFFLINE\",\n" - " \"message\": \"Search interface is offline\"\n" - " }\n" - "}\n", - slime.toString()); + EXPECT_EQUAL(search_interface_offline_slime_str(), + slime.toString()); +} + +TEST("searches are executed when node is in maintenance mode") +{ + MatchEngine engine(1, 1, 7); + engine.setNodeMaintenance(true); + engine.putSearchHandler(DocTypeName("foo"), std::make_shared<MySearchHandler>(3)); + EXPECT_TRUE(assertSearchReply(engine, "foo", 3)); +} + +TEST("setNodeMaintenance(true) implies setNodeUp(false)") +{ + MatchEngine engine(1, 1, 7); + engine.setNodeUp(true); + engine.setNodeMaintenance(true); + EXPECT_FALSE(engine.isOnline()); +} + +TEST("setNodeMaintenance(false) does not imply setNodeUp(false)") +{ + MatchEngine engine(1, 1, 7); + engine.setNodeUp(true); + engine.setNodeMaintenance(false); + EXPECT_TRUE(engine.isOnline()); +} + +TEST("search interface is reported as offline when node is in maintenance mode") +{ + MatchEngine engine(1, 1, 7); + engine.setNodeMaintenance(true); + + Slime slime; + SlimeInserter inserter(slime); + engine.get_state(inserter, false); + EXPECT_EQUAL(search_interface_offline_slime_str(), + slime.toString()); } TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp index 5ad4a7ed52b..58dc473b85e 100644 --- a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp +++ b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp @@ -50,7 +50,8 @@ MatchEngine::MatchEngine(size_t numThreads, size_t threadsPerSearch, uint32_t di _handlers(), _executor(std::max(size_t(1), numThreads / threadsPerSearch), 256_Ki, match_engine_executor), _threadBundlePool(std::max(size_t(1), threadsPerSearch)), - _nodeUp(false) + _nodeUp(false), + _nodeMaintenance(false) { } @@ -98,7 +99,8 @@ search::engine::SearchReply::UP MatchEngine::search(search::engine::SearchRequest::Source request, search::engine::SearchClient &client) { - if (_closed || !_nodeUp) { + // We continue to allow searches if the node is in Maintenance mode + if (_closed || (!_nodeUp && !_nodeMaintenance)) { auto ret = std::make_unique<search::engine::SearchReply>(); ret->setDistributionKey(_distributionKey); @@ -177,6 +179,14 @@ MatchEngine::setNodeUp(bool nodeUp) _nodeUp = nodeUp; } +void +MatchEngine::setNodeMaintenance(bool nodeMaintenance) +{ + _nodeMaintenance = nodeMaintenance; + if (nodeMaintenance) { + _nodeUp = false; + } +} StatusReport::UP MatchEngine::reportStatus() const diff --git a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h index b4e32c45003..7cc0c97048b 100644 --- a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h +++ b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h @@ -26,6 +26,7 @@ private: vespalib::ThreadStackExecutor _executor; vespalib::SimpleThreadBundle::Pool _threadBundlePool; bool _nodeUp; + bool _nodeMaintenance; public: /** @@ -131,6 +132,13 @@ public: */ void setNodeUp(bool nodeUp); + /** + * Set node into maintenance, based on info from cluster controller. Note that + * nodeMaintenance == true also implies setNodeUp(false), as the node is technically + * not in a Up state. + */ + void setNodeMaintenance(bool nodeMaintenance); + StatusReport::UP reportStatus() const; search::engine::SearchReply::UP search( diff --git a/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp b/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp index c15be9336fe..0d9b2f6aaf6 100644 --- a/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp @@ -68,7 +68,8 @@ BucketHandler::BucketHandler(vespalib::Executor &executor) _executor(executor), _ready(nullptr), _changedHandlers(), - _nodeUp(false) + _nodeUp(false), + _nodeMaintenance(false) { LOG(spam, "BucketHandler::BucketHandler"); } @@ -143,13 +144,32 @@ BucketHandler::handlePopulateActiveBuckets(document::BucketId::List &buckets, })); } +namespace { +constexpr const char* bool_str(bool v) noexcept { + return v ? "true" : "false"; +} +} + void BucketHandler::notifyClusterStateChanged(const std::shared_ptr<IBucketStateCalculator> & newCalc) { bool oldNodeUp = _nodeUp; - _nodeUp = newCalc->nodeUp(); - LOG(spam, "notifyClusterStateChanged: %s -> %s", oldNodeUp ? "up" : "down", _nodeUp ? "up" : "down"); - if (oldNodeUp && !_nodeUp) { + bool oldNodeMaintenance = _nodeMaintenance; + _nodeUp = newCalc->nodeUp(); // Up, Retired or Initializing + _nodeMaintenance = newCalc->nodeMaintenance(); + LOG(spam, "notifyClusterStateChanged; up: %s -> %s, maintenance: %s -> %s", + bool_str(oldNodeUp), bool_str(_nodeUp), + bool_str(oldNodeMaintenance), bool_str(_nodeMaintenance)); + if (_nodeMaintenance) { + return; // Don't deactivate buckets in maintenance mode; let query traffic drain away naturally. + } + // We implicitly deactivate buckets in two edge cases: + // - Up -> Down (not maintenance; handled above), since the node can not be expected to offer + // any graceful query draining when set Down. + // - Maintenance -> !Maintenance, since we'd otherwise introduce a bunch of transient duplicate + // results into queries if we transition to an available state. + // The assumption is that the system has already activated buckets on other nodes in such a scenario. + if ((oldNodeUp && !_nodeUp) || oldNodeMaintenance) { deactivateAllActiveBuckets(); } } diff --git a/searchcore/src/vespa/searchcore/proton/server/buckethandler.h b/searchcore/src/vespa/searchcore/proton/server/buckethandler.h index 7f44d2ebd71..2344e080450 100644 --- a/searchcore/src/vespa/searchcore/proton/server/buckethandler.h +++ b/searchcore/src/vespa/searchcore/proton/server/buckethandler.h @@ -25,6 +25,7 @@ private: documentmetastore::IBucketHandler *_ready; std::vector<IBucketStateChangedHandler *> _changedHandlers; bool _nodeUp; + bool _nodeMaintenance; void performSetCurrentState(document::BucketId bucketId, storage::spi::BucketInfo::ActiveState newState, diff --git a/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp b/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp index 3d709bd19d1..20cd4ced6b2 100644 --- a/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp @@ -27,6 +27,7 @@ private: bool _nodeUp; bool _nodeInitializing; bool _nodeRetired; + bool _nodeMaintenance; public: ClusterStateAdapter(const ClusterState &calc) @@ -34,7 +35,8 @@ public: _clusterUp(_calc.clusterUp()), _nodeUp(_calc.nodeUp()), _nodeInitializing(_calc.nodeInitializing()), - _nodeRetired(_calc.nodeRetired()) + _nodeRetired(_calc.nodeRetired()), + _nodeMaintenance(_calc.nodeMaintenance()) { } vespalib::Trinary shouldBeReady(const document::Bucket &bucket) const override { @@ -44,6 +46,7 @@ public: bool nodeUp() const override { return _nodeUp; } bool nodeInitializing() const override { return _nodeInitializing; } bool nodeRetired() const override { return _nodeRetired; } + bool nodeMaintenance() const noexcept override { return _nodeMaintenance; } }; } @@ -53,11 +56,12 @@ ClusterStateHandler::performSetClusterState(const ClusterState *calc, IGenericRe { LOG(debug, "performSetClusterState(): " - "clusterUp(%s), nodeUp(%s), nodeInitializing(%s)" + "clusterUp(%s), nodeUp(%s), nodeInitializing(%s), nodeMaintenance(%s)" "changedHandlers.size() = %zu", (calc->clusterUp() ? "true" : "false"), (calc->nodeUp() ? "true" : "false"), (calc->nodeInitializing() ? "true" : "false"), + (calc->nodeMaintenance() ? "true" : "false"), _changedHandlers.size()); if (!_changedHandlers.empty()) { auto newCalc = std::make_shared<ClusterStateAdapter>(*calc); diff --git a/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h b/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h index b60a5ad8175..9534f346d1f 100644 --- a/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h +++ b/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h @@ -15,6 +15,7 @@ struct IBucketStateCalculator virtual bool nodeUp() const = 0; virtual bool nodeInitializing() const = 0; virtual bool nodeRetired() const = 0; + virtual bool nodeMaintenance() const noexcept = 0; virtual ~IBucketStateCalculator() = default; }; diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.cpp b/searchcore/src/vespa/searchcore/proton/server/proton.cpp index 275f9029107..5b21242e397 100644 --- a/searchcore/src/vespa/searchcore/proton/server/proton.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/proton.cpp @@ -865,10 +865,11 @@ Proton::setClusterState(BucketSpace bucketSpace, const storage::spi::ClusterStat // forward info sent by cluster controller to persistence engine // about whether node is supposed to be up or not. Match engine // needs to know this in order to stop serving queries. - bool nodeUpInBucketSpace(calc.nodeUp()); + bool nodeUpInBucketSpace(calc.nodeUp()); // TODO rename calculator function to imply bucket space affinity bool nodeRetired(calc.nodeRetired()); bool nodeUp = updateNodeUp(bucketSpace, nodeUpInBucketSpace); _matchEngine->setNodeUp(nodeUp); + _matchEngine->setNodeMaintenance(calc.nodeMaintenance()); // Note: _all_ bucket spaces in maintenance if (_memoryFlushConfigUpdater) { _memoryFlushConfigUpdater->setNodeRetired(nodeRetired); } diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.h b/searchcore/src/vespa/searchcore/proton/server/proton.h index 6b0bef50cae..6fd31352051 100644 --- a/searchcore/src/vespa/searchcore/proton/server/proton.h +++ b/searchcore/src/vespa/searchcore/proton/server/proton.h @@ -133,6 +133,7 @@ private: uint32_t getDistributionKey() const override { return _distributionKey; } BootstrapConfig::SP getActiveConfigSnapshot() const; std::shared_ptr<IDocumentDBReferenceRegistry> getDocumentDBReferenceRegistry() const override; + // Returns true if the node is up in _any_ bucket space bool updateNodeUp(BucketSpace bucketSpace, bool nodeUpInBucketSpace); void closeDocumentDBs(vespalib::ThreadStackExecutorBase & executor); public: diff --git a/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h b/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h index a5a0185d787..e218058f01e 100644 --- a/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h +++ b/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h @@ -20,6 +20,7 @@ private: bool _clusterUp; bool _nodeUp; bool _nodeRetired; + bool _nodeMaintenance; public: typedef std::shared_ptr<BucketStateCalculator> SP; @@ -28,7 +29,8 @@ public: _asked(), _clusterUp(true), _nodeUp(true), - _nodeRetired(false) + _nodeRetired(false), + _nodeMaintenance(false) { } BucketStateCalculator &addReady(const document::BucketId &bucket) { @@ -54,6 +56,15 @@ public: return *this; } + BucketStateCalculator& setNodeMaintenance(bool maintenance) noexcept { + _nodeMaintenance = maintenance; + if (maintenance) { + _nodeUp = false; + _nodeRetired = false; + } + return *this; + } + const BucketIdVector &asked() const noexcept { return _asked; } void resetAsked() { _asked.clear(); } @@ -67,6 +78,7 @@ public: bool nodeUp() const override { return _nodeUp; } bool nodeRetired() const override { return _nodeRetired; } bool nodeInitializing() const override { return false; } + bool nodeMaintenance() const noexcept override { return _nodeMaintenance; } }; } |