diff options
13 files changed, 184 insertions, 28 deletions
diff --git a/persistence/src/tests/spi/clusterstatetest.cpp b/persistence/src/tests/spi/clusterstatetest.cpp index 8a303c1a1ac..9f7507c992b 100644 --- a/persistence/src/tests/spi/clusterstatetest.cpp +++ b/persistence/src/tests/spi/clusterstatetest.cpp @@ -233,4 +233,31 @@ TEST(ClusterStateTest, can_infer_own_node_retired_state) EXPECT_TRUE(!node_marked_as_retired_in_state("distributor:3 storage:3 .1.s:r", d, 0)); } +namespace { + +bool +node_marked_as_maintenance_in_state(const std::string& stateStr, + const lib::Distribution& d, + uint16_t node) +{ + lib::ClusterState s(stateStr); + ClusterState state(s, node, d); + return state.nodeMaintenance(); +} + +} + +TEST(ClusterStateTest, can_infer_own_node_maintenance_state) +{ + lib::Distribution d(lib::Distribution::getDefaultDistributionConfig(3, 3)); + + EXPECT_FALSE(node_marked_as_maintenance_in_state("distributor:3 storage:3", d, 0)); + EXPECT_FALSE(node_marked_as_maintenance_in_state("distributor:3 storage:3 .0.s:i", d, 0)); + EXPECT_FALSE(node_marked_as_maintenance_in_state("distributor:3 storage:3 .0.s:d", d, 0)); + EXPECT_TRUE( node_marked_as_maintenance_in_state("distributor:3 storage:3 .0.s:m", d, 0)); + EXPECT_FALSE(node_marked_as_maintenance_in_state("distributor:3 storage:3 .0.s:r", d, 0)); + EXPECT_FALSE(node_marked_as_maintenance_in_state("distributor:3 storage:3 .0.s:m", d, 1)); + EXPECT_FALSE(node_marked_as_maintenance_in_state("distributor:3 storage:3 .1.s:m", d, 0)); +} + } diff --git a/persistence/src/vespa/persistence/spi/clusterstate.cpp b/persistence/src/vespa/persistence/spi/clusterstate.cpp index 4bc538996ca..a244b607391 100644 --- a/persistence/src/vespa/persistence/spi/clusterstate.cpp +++ b/persistence/src/vespa/persistence/spi/clusterstate.cpp @@ -68,28 +68,32 @@ ClusterState::shouldBeReady(const Bucket& b) const { return Trinary::False; } -bool ClusterState::clusterUp() const { +bool ClusterState::clusterUp() const noexcept { return _state && _state->getClusterState() == lib::State::UP; } -bool ClusterState::nodeHasStateOneOf(const char* states) const { +bool ClusterState::nodeHasStateOneOf(const char* states) const noexcept { return _state && _state->getNodeState(lib::Node(lib::NodeType::STORAGE, _nodeIndex)). getState().oneOf(states); } -bool ClusterState::nodeUp() const { +bool ClusterState::nodeUp() const noexcept { return nodeHasStateOneOf("uir"); } -bool ClusterState::nodeInitializing() const { +bool ClusterState::nodeInitializing() const noexcept { return nodeHasStateOneOf("i"); } -bool ClusterState::nodeRetired() const { +bool ClusterState::nodeRetired() const noexcept { return nodeHasStateOneOf("r"); } +bool ClusterState::nodeMaintenance() const noexcept { + return nodeHasStateOneOf("m"); +} + void ClusterState::serialize(vespalib::nbostream& o) const { assert(_distribution); assert(_state); diff --git a/persistence/src/vespa/persistence/spi/clusterstate.h b/persistence/src/vespa/persistence/spi/clusterstate.h index 8e48758e243..df556d09cb7 100644 --- a/persistence/src/vespa/persistence/spi/clusterstate.h +++ b/persistence/src/vespa/persistence/spi/clusterstate.h @@ -45,23 +45,28 @@ public: * compared to the complete list of nodes, and deigns the system to be * unusable. */ - bool clusterUp() const; + bool clusterUp() const noexcept; /** * Returns false if this node has been set in a state where it should not * receive external load. */ - bool nodeUp() const; + bool nodeUp() const noexcept; /** * Returns true iff this node is marked as Initializing in the cluster state. */ - bool nodeInitializing() const; + bool nodeInitializing() const noexcept; /** * Returns true iff this node is marked as Retired in the cluster state. */ - bool nodeRetired() const; + bool nodeRetired() const noexcept; + + /** + * Returns true iff this node is marked as Maintenance in the cluster state. + */ + bool nodeMaintenance() const noexcept; /** * Returns a serialized form of this object. @@ -74,7 +79,7 @@ private: uint16_t _nodeIndex; void deserialize(vespalib::nbostream&); - bool nodeHasStateOneOf(const char* states) const; + bool nodeHasStateOneOf(const char* states) const noexcept; }; } diff --git a/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp b/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp index 29748a2010c..82410d28610 100644 --- a/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp +++ b/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp @@ -140,6 +140,11 @@ struct Fixture setNodeUp(bool value) { _calc->setNodeUp(value); + _calc->setNodeMaintenance(false); + _handler.notifyClusterStateChanged(_calc); + } + void setNodeMaintenance(bool value) { + _calc->setNodeMaintenance(value); _handler.notifyClusterStateChanged(_calc); } }; @@ -223,7 +228,7 @@ TEST_F("require that unready bucket can be reported as active", Fixture) } -TEST_F("require that node being down deactivates buckets", Fixture) +TEST_F("node going down (but not into maintenance state) deactivates all buckets", Fixture) { f._handler.handleSetCurrentState(f._ready.bucket(2), BucketInfo::ACTIVE, f._genResult); @@ -252,6 +257,29 @@ TEST_F("require that node being down deactivates buckets", Fixture) EXPECT_EQUAL(true, f._bucketInfo.getInfo().isActive()); } +TEST_F("node going into maintenance state does _not_ deactivate any buckets", Fixture) +{ + f._handler.handleSetCurrentState(f._ready.bucket(2), + BucketInfo::ACTIVE, f._genResult); + f.sync(); + f.setNodeMaintenance(true); + f.sync(); + f.handleGetBucketInfo(f._ready.bucket(2)); + EXPECT_TRUE(f._bucketInfo.getInfo().isActive()); +} + +TEST_F("node going from maintenance to up state deactivates all buckets", Fixture) +{ + f._handler.handleSetCurrentState(f._ready.bucket(2), + BucketInfo::ACTIVE, f._genResult); + f.sync(); + f.setNodeMaintenance(true); + f.sync(); + f.setNodeUp(true); + f.sync(); + f.handleGetBucketInfo(f._ready.bucket(2)); + EXPECT_FALSE(f._bucketInfo.getInfo().isActive()); +} TEST_MAIN() { diff --git a/searchcore/src/tests/proton/matchengine/matchengine.cpp b/searchcore/src/tests/proton/matchengine/matchengine.cpp index 481a9f061be..34c36fd9a72 100644 --- a/searchcore/src/tests/proton/matchengine/matchengine.cpp +++ b/searchcore/src/tests/proton/matchengine/matchengine.cpp @@ -17,7 +17,7 @@ class MySearchHandler : public ISearchHandler { std::string _name; std::string _reply; public: - MySearchHandler(size_t numHits = 0) : + explicit MySearchHandler(size_t numHits = 0) : _numHits(numHits), _name("my"), _reply("myreply") {} DocsumReply::UP getDocsums(const DocsumRequest &) override { @@ -91,6 +91,7 @@ assertSearchReply(MatchEngine & engine, const std::string & searchDocType, size_ LocalSearchClient client; engine.search(SearchRequest::Source(request), client); SearchReply::UP reply = client.getReply(10000); + ASSERT_TRUE(reply); return EXPECT_EQUAL(expHits, reply->hits.size()); } @@ -173,11 +174,24 @@ TEST("requireThatEmptySearchReplyIsReturnedWhenEngineIsClosed") LocalSearchClient client; SearchRequest::Source request(new SearchRequest()); SearchReply::UP reply = engine.search(std::move(request), client); - EXPECT_TRUE(reply ); + ASSERT_TRUE(reply); EXPECT_EQUAL(0u, reply->hits.size()); EXPECT_EQUAL(7u, reply->getDistributionKey()); } +namespace { + +constexpr const char* search_interface_offline_slime_str() noexcept { + return "{\n" + " \"status\": {\n" + " \"state\": \"OFFLINE\",\n" + " \"message\": \"Search interface is offline\"\n" + " }\n" + "}\n"; +} + +} + TEST("requireThatStateIsReported") { MatchEngine engine(1, 1, 7); @@ -185,14 +199,44 @@ TEST("requireThatStateIsReported") Slime slime; SlimeInserter inserter(slime); engine.get_state(inserter, false); - EXPECT_EQUAL( - "{\n" - " \"status\": {\n" - " \"state\": \"OFFLINE\",\n" - " \"message\": \"Search interface is offline\"\n" - " }\n" - "}\n", - slime.toString()); + EXPECT_EQUAL(search_interface_offline_slime_str(), + slime.toString()); +} + +TEST("searches are executed when node is in maintenance mode") +{ + MatchEngine engine(1, 1, 7); + engine.setNodeMaintenance(true); + engine.putSearchHandler(DocTypeName("foo"), std::make_shared<MySearchHandler>(3)); + EXPECT_TRUE(assertSearchReply(engine, "foo", 3)); +} + +TEST("setNodeMaintenance(true) implies setNodeUp(false)") +{ + MatchEngine engine(1, 1, 7); + engine.setNodeUp(true); + engine.setNodeMaintenance(true); + EXPECT_FALSE(engine.isOnline()); +} + +TEST("setNodeMaintenance(false) does not imply setNodeUp(false)") +{ + MatchEngine engine(1, 1, 7); + engine.setNodeUp(true); + engine.setNodeMaintenance(false); + EXPECT_TRUE(engine.isOnline()); +} + +TEST("search interface is reported as offline when node is in maintenance mode") +{ + MatchEngine engine(1, 1, 7); + engine.setNodeMaintenance(true); + + Slime slime; + SlimeInserter inserter(slime); + engine.get_state(inserter, false); + EXPECT_EQUAL(search_interface_offline_slime_str(), + slime.toString()); } TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp index 5ad4a7ed52b..58dc473b85e 100644 --- a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp +++ b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp @@ -50,7 +50,8 @@ MatchEngine::MatchEngine(size_t numThreads, size_t threadsPerSearch, uint32_t di _handlers(), _executor(std::max(size_t(1), numThreads / threadsPerSearch), 256_Ki, match_engine_executor), _threadBundlePool(std::max(size_t(1), threadsPerSearch)), - _nodeUp(false) + _nodeUp(false), + _nodeMaintenance(false) { } @@ -98,7 +99,8 @@ search::engine::SearchReply::UP MatchEngine::search(search::engine::SearchRequest::Source request, search::engine::SearchClient &client) { - if (_closed || !_nodeUp) { + // We continue to allow searches if the node is in Maintenance mode + if (_closed || (!_nodeUp && !_nodeMaintenance)) { auto ret = std::make_unique<search::engine::SearchReply>(); ret->setDistributionKey(_distributionKey); @@ -177,6 +179,14 @@ MatchEngine::setNodeUp(bool nodeUp) _nodeUp = nodeUp; } +void +MatchEngine::setNodeMaintenance(bool nodeMaintenance) +{ + _nodeMaintenance = nodeMaintenance; + if (nodeMaintenance) { + _nodeUp = false; + } +} StatusReport::UP MatchEngine::reportStatus() const diff --git a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h index b4e32c45003..7cc0c97048b 100644 --- a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h +++ b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h @@ -26,6 +26,7 @@ private: vespalib::ThreadStackExecutor _executor; vespalib::SimpleThreadBundle::Pool _threadBundlePool; bool _nodeUp; + bool _nodeMaintenance; public: /** @@ -131,6 +132,13 @@ public: */ void setNodeUp(bool nodeUp); + /** + * Set node into maintenance, based on info from cluster controller. Note that + * nodeMaintenance == true also implies setNodeUp(false), as the node is technically + * not in a Up state. + */ + void setNodeMaintenance(bool nodeMaintenance); + StatusReport::UP reportStatus() const; search::engine::SearchReply::UP search( diff --git a/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp b/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp index c15be9336fe..d329d9ce27c 100644 --- a/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp @@ -68,7 +68,8 @@ BucketHandler::BucketHandler(vespalib::Executor &executor) _executor(executor), _ready(nullptr), _changedHandlers(), - _nodeUp(false) + _nodeUp(false), + _nodeMaintenance(false) { LOG(spam, "BucketHandler::BucketHandler"); } @@ -147,9 +148,20 @@ void BucketHandler::notifyClusterStateChanged(const std::shared_ptr<IBucketStateCalculator> & newCalc) { bool oldNodeUp = _nodeUp; - _nodeUp = newCalc->nodeUp(); + bool oldNodeMaintenance = _nodeMaintenance; + _nodeUp = newCalc->nodeUp(); // Up, Retired or Initializing + _nodeMaintenance = newCalc->nodeMaintenance(); LOG(spam, "notifyClusterStateChanged: %s -> %s", oldNodeUp ? "up" : "down", _nodeUp ? "up" : "down"); - if (oldNodeUp && !_nodeUp) { + if (_nodeMaintenance) { + return; // Don't deactivate buckets in maintenance mode; let query traffic drain away naturally. + } + // We implicitly deactivate buckets in two edge cases: + // - Up -> Down (not maintenance; handled above), since the node can not be expected to offer + // any graceful query draining when set Down. + // - Maintenance -> Up, since we'd otherwise introduce a bunch of transient duplicate results + // into queries. The assumption is that the system has already activated buckets on other + // nodes in such a scenario. + if ((oldNodeUp && !_nodeUp) || (oldNodeMaintenance && _nodeUp)) { deactivateAllActiveBuckets(); } } diff --git a/searchcore/src/vespa/searchcore/proton/server/buckethandler.h b/searchcore/src/vespa/searchcore/proton/server/buckethandler.h index 7f44d2ebd71..2344e080450 100644 --- a/searchcore/src/vespa/searchcore/proton/server/buckethandler.h +++ b/searchcore/src/vespa/searchcore/proton/server/buckethandler.h @@ -25,6 +25,7 @@ private: documentmetastore::IBucketHandler *_ready; std::vector<IBucketStateChangedHandler *> _changedHandlers; bool _nodeUp; + bool _nodeMaintenance; void performSetCurrentState(document::BucketId bucketId, storage::spi::BucketInfo::ActiveState newState, diff --git a/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp b/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp index 3d709bd19d1..137b2596eeb 100644 --- a/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp @@ -27,6 +27,7 @@ private: bool _nodeUp; bool _nodeInitializing; bool _nodeRetired; + bool _nodeMaintenance; public: ClusterStateAdapter(const ClusterState &calc) @@ -34,7 +35,8 @@ public: _clusterUp(_calc.clusterUp()), _nodeUp(_calc.nodeUp()), _nodeInitializing(_calc.nodeInitializing()), - _nodeRetired(_calc.nodeRetired()) + _nodeRetired(_calc.nodeRetired()), + _nodeMaintenance(_calc.nodeMaintenance()) { } vespalib::Trinary shouldBeReady(const document::Bucket &bucket) const override { @@ -44,6 +46,7 @@ public: bool nodeUp() const override { return _nodeUp; } bool nodeInitializing() const override { return _nodeInitializing; } bool nodeRetired() const override { return _nodeRetired; } + bool nodeMaintenance() const noexcept override { return _nodeMaintenance; } }; } diff --git a/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h b/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h index b60a5ad8175..9534f346d1f 100644 --- a/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h +++ b/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h @@ -15,6 +15,7 @@ struct IBucketStateCalculator virtual bool nodeUp() const = 0; virtual bool nodeInitializing() const = 0; virtual bool nodeRetired() const = 0; + virtual bool nodeMaintenance() const noexcept = 0; virtual ~IBucketStateCalculator() = default; }; diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.cpp b/searchcore/src/vespa/searchcore/proton/server/proton.cpp index 275f9029107..621f3acd072 100644 --- a/searchcore/src/vespa/searchcore/proton/server/proton.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/proton.cpp @@ -869,6 +869,7 @@ Proton::setClusterState(BucketSpace bucketSpace, const storage::spi::ClusterStat bool nodeRetired(calc.nodeRetired()); bool nodeUp = updateNodeUp(bucketSpace, nodeUpInBucketSpace); _matchEngine->setNodeUp(nodeUp); + _matchEngine->setNodeMaintenance(calc.nodeMaintenance()); if (_memoryFlushConfigUpdater) { _memoryFlushConfigUpdater->setNodeRetired(nodeRetired); } diff --git a/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h b/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h index a5a0185d787..e218058f01e 100644 --- a/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h +++ b/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h @@ -20,6 +20,7 @@ private: bool _clusterUp; bool _nodeUp; bool _nodeRetired; + bool _nodeMaintenance; public: typedef std::shared_ptr<BucketStateCalculator> SP; @@ -28,7 +29,8 @@ public: _asked(), _clusterUp(true), _nodeUp(true), - _nodeRetired(false) + _nodeRetired(false), + _nodeMaintenance(false) { } BucketStateCalculator &addReady(const document::BucketId &bucket) { @@ -54,6 +56,15 @@ public: return *this; } + BucketStateCalculator& setNodeMaintenance(bool maintenance) noexcept { + _nodeMaintenance = maintenance; + if (maintenance) { + _nodeUp = false; + _nodeRetired = false; + } + return *this; + } + const BucketIdVector &asked() const noexcept { return _asked; } void resetAsked() { _asked.clear(); } @@ -67,6 +78,7 @@ public: bool nodeUp() const override { return _nodeUp; } bool nodeRetired() const override { return _nodeRetired; } bool nodeInitializing() const override { return false; } + bool nodeMaintenance() const noexcept override { return _nodeMaintenance; } }; } |