aboutsummaryrefslogtreecommitdiffstats
path: root/searchcore
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2021-11-22 14:06:42 +0000
committerTor Brede Vekterli <vekterli@yahooinc.com>2021-11-24 10:12:49 +0000
commita428b8f2754eec0b3748451b7aca79bc00d0a04e (patch)
treef6c65436980cafa8b5d89bec01f965d48acf1146 /searchcore
parent6dd2e13d43f5870d404196f9294ef867e77d1c73 (diff)
Continue serving search queries when in Maintenance node state
Previously, entering maintenance state would implicitly deactivate all buckets on the searchnode and cause empty responses to be returned for searches. However, container query dispatch uses async health pings to decide which nodes to route queries to, so it would be possible for a node to still be used for queries for a few seconds until the ping discovered that the node should not be used. In the case of multiple groups without multiple ready replicas within the group, this would cause transient coverage loss since the dispatcher would not realize it should route queries to other groups instead. With this commit, maintenance edge behavior is changed as follows: - Buckets are _not_ deactivated when going from an available state to the maintenance state. However, they _are_ deactivate when going from maintenance state to an available state in order to avoid transient query duplicates immediately after the change. - Searches are executed as normal instead of returning empty replies when the node is in maintenance state. The following behavior is intentionally _not_ changed: - The search interface is still marked as offline when in maintenance state, as this signals that the node should be taken out of rotation. In particular, it's critical that the RPC health ping response is explicitly tagged as having zero active docs when the search interface is offline, even though many buckets may now actually be active. Otherwise, queries would not be gracefully drained from the node.
Diffstat (limited to 'searchcore')
-rw-r--r--searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp30
-rw-r--r--searchcore/src/tests/proton/matchengine/matchengine.cpp64
-rw-r--r--searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp14
-rw-r--r--searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h8
-rw-r--r--searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp18
-rw-r--r--searchcore/src/vespa/searchcore/proton/server/buckethandler.h1
-rw-r--r--searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp5
-rw-r--r--searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h1
-rw-r--r--searchcore/src/vespa/searchcore/proton/server/proton.cpp1
-rw-r--r--searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h14
10 files changed, 138 insertions, 18 deletions
diff --git a/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp b/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp
index 29748a2010c..82410d28610 100644
--- a/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp
+++ b/searchcore/src/tests/proton/documentdb/buckethandler/buckethandler_test.cpp
@@ -140,6 +140,11 @@ struct Fixture
setNodeUp(bool value)
{
_calc->setNodeUp(value);
+ _calc->setNodeMaintenance(false);
+ _handler.notifyClusterStateChanged(_calc);
+ }
+ void setNodeMaintenance(bool value) {
+ _calc->setNodeMaintenance(value);
_handler.notifyClusterStateChanged(_calc);
}
};
@@ -223,7 +228,7 @@ TEST_F("require that unready bucket can be reported as active", Fixture)
}
-TEST_F("require that node being down deactivates buckets", Fixture)
+TEST_F("node going down (but not into maintenance state) deactivates all buckets", Fixture)
{
f._handler.handleSetCurrentState(f._ready.bucket(2),
BucketInfo::ACTIVE, f._genResult);
@@ -252,6 +257,29 @@ TEST_F("require that node being down deactivates buckets", Fixture)
EXPECT_EQUAL(true, f._bucketInfo.getInfo().isActive());
}
+TEST_F("node going into maintenance state does _not_ deactivate any buckets", Fixture)
+{
+ f._handler.handleSetCurrentState(f._ready.bucket(2),
+ BucketInfo::ACTIVE, f._genResult);
+ f.sync();
+ f.setNodeMaintenance(true);
+ f.sync();
+ f.handleGetBucketInfo(f._ready.bucket(2));
+ EXPECT_TRUE(f._bucketInfo.getInfo().isActive());
+}
+
+TEST_F("node going from maintenance to up state deactivates all buckets", Fixture)
+{
+ f._handler.handleSetCurrentState(f._ready.bucket(2),
+ BucketInfo::ACTIVE, f._genResult);
+ f.sync();
+ f.setNodeMaintenance(true);
+ f.sync();
+ f.setNodeUp(true);
+ f.sync();
+ f.handleGetBucketInfo(f._ready.bucket(2));
+ EXPECT_FALSE(f._bucketInfo.getInfo().isActive());
+}
TEST_MAIN()
{
diff --git a/searchcore/src/tests/proton/matchengine/matchengine.cpp b/searchcore/src/tests/proton/matchengine/matchengine.cpp
index 481a9f061be..34c36fd9a72 100644
--- a/searchcore/src/tests/proton/matchengine/matchengine.cpp
+++ b/searchcore/src/tests/proton/matchengine/matchengine.cpp
@@ -17,7 +17,7 @@ class MySearchHandler : public ISearchHandler {
std::string _name;
std::string _reply;
public:
- MySearchHandler(size_t numHits = 0) :
+ explicit MySearchHandler(size_t numHits = 0) :
_numHits(numHits), _name("my"), _reply("myreply")
{}
DocsumReply::UP getDocsums(const DocsumRequest &) override {
@@ -91,6 +91,7 @@ assertSearchReply(MatchEngine & engine, const std::string & searchDocType, size_
LocalSearchClient client;
engine.search(SearchRequest::Source(request), client);
SearchReply::UP reply = client.getReply(10000);
+ ASSERT_TRUE(reply);
return EXPECT_EQUAL(expHits, reply->hits.size());
}
@@ -173,11 +174,24 @@ TEST("requireThatEmptySearchReplyIsReturnedWhenEngineIsClosed")
LocalSearchClient client;
SearchRequest::Source request(new SearchRequest());
SearchReply::UP reply = engine.search(std::move(request), client);
- EXPECT_TRUE(reply );
+ ASSERT_TRUE(reply);
EXPECT_EQUAL(0u, reply->hits.size());
EXPECT_EQUAL(7u, reply->getDistributionKey());
}
+namespace {
+
+constexpr const char* search_interface_offline_slime_str() noexcept {
+ return "{\n"
+ " \"status\": {\n"
+ " \"state\": \"OFFLINE\",\n"
+ " \"message\": \"Search interface is offline\"\n"
+ " }\n"
+ "}\n";
+}
+
+}
+
TEST("requireThatStateIsReported")
{
MatchEngine engine(1, 1, 7);
@@ -185,14 +199,44 @@ TEST("requireThatStateIsReported")
Slime slime;
SlimeInserter inserter(slime);
engine.get_state(inserter, false);
- EXPECT_EQUAL(
- "{\n"
- " \"status\": {\n"
- " \"state\": \"OFFLINE\",\n"
- " \"message\": \"Search interface is offline\"\n"
- " }\n"
- "}\n",
- slime.toString());
+ EXPECT_EQUAL(search_interface_offline_slime_str(),
+ slime.toString());
+}
+
+TEST("searches are executed when node is in maintenance mode")
+{
+ MatchEngine engine(1, 1, 7);
+ engine.setNodeMaintenance(true);
+ engine.putSearchHandler(DocTypeName("foo"), std::make_shared<MySearchHandler>(3));
+ EXPECT_TRUE(assertSearchReply(engine, "foo", 3));
+}
+
+TEST("setNodeMaintenance(true) implies setNodeUp(false)")
+{
+ MatchEngine engine(1, 1, 7);
+ engine.setNodeUp(true);
+ engine.setNodeMaintenance(true);
+ EXPECT_FALSE(engine.isOnline());
+}
+
+TEST("setNodeMaintenance(false) does not imply setNodeUp(false)")
+{
+ MatchEngine engine(1, 1, 7);
+ engine.setNodeUp(true);
+ engine.setNodeMaintenance(false);
+ EXPECT_TRUE(engine.isOnline());
+}
+
+TEST("search interface is reported as offline when node is in maintenance mode")
+{
+ MatchEngine engine(1, 1, 7);
+ engine.setNodeMaintenance(true);
+
+ Slime slime;
+ SlimeInserter inserter(slime);
+ engine.get_state(inserter, false);
+ EXPECT_EQUAL(search_interface_offline_slime_str(),
+ slime.toString());
}
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp
index 5ad4a7ed52b..58dc473b85e 100644
--- a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp
+++ b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.cpp
@@ -50,7 +50,8 @@ MatchEngine::MatchEngine(size_t numThreads, size_t threadsPerSearch, uint32_t di
_handlers(),
_executor(std::max(size_t(1), numThreads / threadsPerSearch), 256_Ki, match_engine_executor),
_threadBundlePool(std::max(size_t(1), threadsPerSearch)),
- _nodeUp(false)
+ _nodeUp(false),
+ _nodeMaintenance(false)
{
}
@@ -98,7 +99,8 @@ search::engine::SearchReply::UP
MatchEngine::search(search::engine::SearchRequest::Source request,
search::engine::SearchClient &client)
{
- if (_closed || !_nodeUp) {
+ // We continue to allow searches if the node is in Maintenance mode
+ if (_closed || (!_nodeUp && !_nodeMaintenance)) {
auto ret = std::make_unique<search::engine::SearchReply>();
ret->setDistributionKey(_distributionKey);
@@ -177,6 +179,14 @@ MatchEngine::setNodeUp(bool nodeUp)
_nodeUp = nodeUp;
}
+void
+MatchEngine::setNodeMaintenance(bool nodeMaintenance)
+{
+ _nodeMaintenance = nodeMaintenance;
+ if (nodeMaintenance) {
+ _nodeUp = false;
+ }
+}
StatusReport::UP
MatchEngine::reportStatus() const
diff --git a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h
index b4e32c45003..7cc0c97048b 100644
--- a/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h
+++ b/searchcore/src/vespa/searchcore/proton/matchengine/matchengine.h
@@ -26,6 +26,7 @@ private:
vespalib::ThreadStackExecutor _executor;
vespalib::SimpleThreadBundle::Pool _threadBundlePool;
bool _nodeUp;
+ bool _nodeMaintenance;
public:
/**
@@ -131,6 +132,13 @@ public:
*/
void setNodeUp(bool nodeUp);
+ /**
+ * Set node into maintenance, based on info from cluster controller. Note that
+ * nodeMaintenance == true also implies setNodeUp(false), as the node is technically
+ * not in a Up state.
+ */
+ void setNodeMaintenance(bool nodeMaintenance);
+
StatusReport::UP reportStatus() const;
search::engine::SearchReply::UP search(
diff --git a/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp b/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp
index c15be9336fe..d329d9ce27c 100644
--- a/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp
+++ b/searchcore/src/vespa/searchcore/proton/server/buckethandler.cpp
@@ -68,7 +68,8 @@ BucketHandler::BucketHandler(vespalib::Executor &executor)
_executor(executor),
_ready(nullptr),
_changedHandlers(),
- _nodeUp(false)
+ _nodeUp(false),
+ _nodeMaintenance(false)
{
LOG(spam, "BucketHandler::BucketHandler");
}
@@ -147,9 +148,20 @@ void
BucketHandler::notifyClusterStateChanged(const std::shared_ptr<IBucketStateCalculator> & newCalc)
{
bool oldNodeUp = _nodeUp;
- _nodeUp = newCalc->nodeUp();
+ bool oldNodeMaintenance = _nodeMaintenance;
+ _nodeUp = newCalc->nodeUp(); // Up, Retired or Initializing
+ _nodeMaintenance = newCalc->nodeMaintenance();
LOG(spam, "notifyClusterStateChanged: %s -> %s", oldNodeUp ? "up" : "down", _nodeUp ? "up" : "down");
- if (oldNodeUp && !_nodeUp) {
+ if (_nodeMaintenance) {
+ return; // Don't deactivate buckets in maintenance mode; let query traffic drain away naturally.
+ }
+ // We implicitly deactivate buckets in two edge cases:
+ // - Up -> Down (not maintenance; handled above), since the node can not be expected to offer
+ // any graceful query draining when set Down.
+ // - Maintenance -> Up, since we'd otherwise introduce a bunch of transient duplicate results
+ // into queries. The assumption is that the system has already activated buckets on other
+ // nodes in such a scenario.
+ if ((oldNodeUp && !_nodeUp) || (oldNodeMaintenance && _nodeUp)) {
deactivateAllActiveBuckets();
}
}
diff --git a/searchcore/src/vespa/searchcore/proton/server/buckethandler.h b/searchcore/src/vespa/searchcore/proton/server/buckethandler.h
index 7f44d2ebd71..2344e080450 100644
--- a/searchcore/src/vespa/searchcore/proton/server/buckethandler.h
+++ b/searchcore/src/vespa/searchcore/proton/server/buckethandler.h
@@ -25,6 +25,7 @@ private:
documentmetastore::IBucketHandler *_ready;
std::vector<IBucketStateChangedHandler *> _changedHandlers;
bool _nodeUp;
+ bool _nodeMaintenance;
void performSetCurrentState(document::BucketId bucketId,
storage::spi::BucketInfo::ActiveState newState,
diff --git a/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp b/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp
index 3d709bd19d1..137b2596eeb 100644
--- a/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp
+++ b/searchcore/src/vespa/searchcore/proton/server/clusterstatehandler.cpp
@@ -27,6 +27,7 @@ private:
bool _nodeUp;
bool _nodeInitializing;
bool _nodeRetired;
+ bool _nodeMaintenance;
public:
ClusterStateAdapter(const ClusterState &calc)
@@ -34,7 +35,8 @@ public:
_clusterUp(_calc.clusterUp()),
_nodeUp(_calc.nodeUp()),
_nodeInitializing(_calc.nodeInitializing()),
- _nodeRetired(_calc.nodeRetired())
+ _nodeRetired(_calc.nodeRetired()),
+ _nodeMaintenance(_calc.nodeMaintenance())
{
}
vespalib::Trinary shouldBeReady(const document::Bucket &bucket) const override {
@@ -44,6 +46,7 @@ public:
bool nodeUp() const override { return _nodeUp; }
bool nodeInitializing() const override { return _nodeInitializing; }
bool nodeRetired() const override { return _nodeRetired; }
+ bool nodeMaintenance() const noexcept override { return _nodeMaintenance; }
};
}
diff --git a/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h b/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h
index b60a5ad8175..9534f346d1f 100644
--- a/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h
+++ b/searchcore/src/vespa/searchcore/proton/server/ibucketstatecalculator.h
@@ -15,6 +15,7 @@ struct IBucketStateCalculator
virtual bool nodeUp() const = 0;
virtual bool nodeInitializing() const = 0;
virtual bool nodeRetired() const = 0;
+ virtual bool nodeMaintenance() const noexcept = 0;
virtual ~IBucketStateCalculator() = default;
};
diff --git a/searchcore/src/vespa/searchcore/proton/server/proton.cpp b/searchcore/src/vespa/searchcore/proton/server/proton.cpp
index 275f9029107..621f3acd072 100644
--- a/searchcore/src/vespa/searchcore/proton/server/proton.cpp
+++ b/searchcore/src/vespa/searchcore/proton/server/proton.cpp
@@ -869,6 +869,7 @@ Proton::setClusterState(BucketSpace bucketSpace, const storage::spi::ClusterStat
bool nodeRetired(calc.nodeRetired());
bool nodeUp = updateNodeUp(bucketSpace, nodeUpInBucketSpace);
_matchEngine->setNodeUp(nodeUp);
+ _matchEngine->setNodeMaintenance(calc.nodeMaintenance());
if (_memoryFlushConfigUpdater) {
_memoryFlushConfigUpdater->setNodeRetired(nodeRetired);
}
diff --git a/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h b/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h
index a5a0185d787..e218058f01e 100644
--- a/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h
+++ b/searchcore/src/vespa/searchcore/proton/test/bucketstatecalculator.h
@@ -20,6 +20,7 @@ private:
bool _clusterUp;
bool _nodeUp;
bool _nodeRetired;
+ bool _nodeMaintenance;
public:
typedef std::shared_ptr<BucketStateCalculator> SP;
@@ -28,7 +29,8 @@ public:
_asked(),
_clusterUp(true),
_nodeUp(true),
- _nodeRetired(false)
+ _nodeRetired(false),
+ _nodeMaintenance(false)
{
}
BucketStateCalculator &addReady(const document::BucketId &bucket) {
@@ -54,6 +56,15 @@ public:
return *this;
}
+ BucketStateCalculator& setNodeMaintenance(bool maintenance) noexcept {
+ _nodeMaintenance = maintenance;
+ if (maintenance) {
+ _nodeUp = false;
+ _nodeRetired = false;
+ }
+ return *this;
+ }
+
const BucketIdVector &asked() const noexcept { return _asked; }
void resetAsked() { _asked.clear(); }
@@ -67,6 +78,7 @@ public:
bool nodeUp() const override { return _nodeUp; }
bool nodeRetired() const override { return _nodeRetired; }
bool nodeInitializing() const override { return false; }
+ bool nodeMaintenance() const noexcept override { return _nodeMaintenance; }
};
}