summaryrefslogtreecommitdiffstats
path: root/configd
diff options
context:
space:
mode:
authorArne H Juul <arnej27959@users.noreply.github.com>2021-06-10 14:13:21 +0200
committerGitHub <noreply@github.com>2021-06-10 14:13:21 +0200
commiteb93f635cde554ab681c1029d68412b8dbcd5487 (patch)
treeb4aaf1ed37524622f606d3f245ada22250bb2e68 /configd
parent8b55850e0abc38722c3c172f8287d07b1a5837a6 (diff)
parent97f4c2fd8ed1e1fd80bc4a2ee950f2d6d98384d6 (diff)
Merge pull request #18171 from vespa-engine/arnej/add-corner-probes
Arnej/add corner probes
Diffstat (limited to 'configd')
-rw-r--r--configd/src/apps/sentinel/cc-result.h9
-rw-r--r--configd/src/apps/sentinel/connectivity.cpp132
-rw-r--r--configd/src/apps/sentinel/connectivity.h12
-rw-r--r--configd/src/apps/sentinel/outward-check.cpp21
-rw-r--r--configd/src/apps/sentinel/outward-check.h17
5 files changed, 151 insertions, 40 deletions
diff --git a/configd/src/apps/sentinel/cc-result.h b/configd/src/apps/sentinel/cc-result.h
new file mode 100644
index 00000000000..3468cf4324a
--- /dev/null
+++ b/configd/src/apps/sentinel/cc-result.h
@@ -0,0 +1,9 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+namespace config::sentinel {
+
+enum class CcResult { UNKNOWN, CONN_FAIL, UNREACHABLE_UP, INDIRECT_PING_FAIL, INDIRECT_PING_UNAVAIL, ALL_OK };
+
+}
diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp
index 132b57fc884..8d1aa0e9673 100644
--- a/configd/src/apps/sentinel/connectivity.cpp
+++ b/configd/src/apps/sentinel/connectivity.cpp
@@ -27,25 +27,33 @@ namespace {
std::string toString(CcResult value) {
switch (value) {
case CcResult::UNKNOWN: return "BAD: missing result"; // very very bad
- case CcResult::REVERSE_FAIL: return "connect OK, but reverse check FAILED"; // very bad
+ case CcResult::INDIRECT_PING_FAIL: return "connect OK, but reverse check FAILED"; // very bad
+ case CcResult::UNREACHABLE_UP: return "unreachable from me, but up"; // very bad
case CcResult::CONN_FAIL: return "failed to connect"; // bad
- case CcResult::REVERSE_UNAVAIL: return "connect OK (but reverse check unavailable)"; // unfortunate
+ case CcResult::INDIRECT_PING_UNAVAIL: return "connect OK (but reverse check unavailable)"; // unfortunate
case CcResult::ALL_OK: return "OK: both ways connectivity verified"; // good
}
LOG(error, "Unknown CcResult enum value: %d", (int)value);
LOG_ABORT("Unknown CcResult enum value");
}
-std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
- std::map<std::string, std::string> checkSpecs;
+using ConnectivityMap = std::map<std::string, OutwardCheck>;
+using HostAndPort = Connectivity::HostAndPort;
+using SpecMap = Connectivity::SpecMap;
+
+std::string spec(const SpecMap::value_type &host_and_port) {
+ return fmt("tcp/%s:%d", host_and_port.first.c_str(), host_and_port.second);
+}
+
+SpecMap specsFrom(const ModelConfig &model) {
+ SpecMap checkSpecs;
for (const auto & h : model.hosts) {
bool foundSentinelPort = false;
for (const auto & s : h.services) {
if (s.name == "config-sentinel") {
for (const auto & p : s.ports) {
if (p.tags.find("rpc") != p.tags.npos) {
- auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number);
- checkSpecs[h.name] = spec;
+ checkSpecs[h.name] = p.number;
foundSentinelPort = true;
}
}
@@ -59,8 +67,57 @@ std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
return checkSpecs;
}
+void classifyConnFails(ConnectivityMap &connectivityMap,
+ const SpecMap &specMap,
+ RpcServer &rpcServer)
+{
+ std::vector<HostAndPort> failedConnSpecs;
+ std::vector<HostAndPort> goodNeighborSpecs;
+ std::string myHostname = vespa::Defaults::vespaHostname();
+ for (auto & [hostname, check] : connectivityMap) {
+ if (hostname == myHostname) {
+ if (check.result() == CcResult::CONN_FAIL) {
+ check.classifyResult(CcResult::UNREACHABLE_UP);
+ }
+ } else {
+ auto iter = specMap.find(hostname);
+ LOG_ASSERT(iter != specMap.end());
+ if (check.result() == CcResult::ALL_OK) {
+ goodNeighborSpecs.push_back(*iter);
+ }
+ if (check.result() == CcResult::CONN_FAIL) {
+ failedConnSpecs.push_back(*iter);
+ }
+ }
+ }
+ if ((failedConnSpecs.size() == 0) || (goodNeighborSpecs.size() == 0)) {
+ return;
+ }
+ for (const auto & [ nameToCheck, portToCheck ] : failedConnSpecs) {
+ auto cmIter = connectivityMap.find(nameToCheck);
+ LOG_ASSERT(cmIter != connectivityMap.end());
+ OutwardCheckContext checkContext(goodNeighborSpecs.size(), nameToCheck, portToCheck, rpcServer.orb());
+ ConnectivityMap cornerProbes;
+ for (const auto & hp : goodNeighborSpecs) {
+ cornerProbes.try_emplace(hp.first, spec(hp), checkContext);
+ }
+ checkContext.latch.await();
+ size_t numReportsUp = 0;
+ size_t numReportsDown = 0;
+ for (const auto & [hostname, probe] : cornerProbes) {
+ if (probe.result() == CcResult::INDIRECT_PING_FAIL) ++numReportsDown;
+ if (probe.result() == CcResult::ALL_OK) ++numReportsUp;
+ }
+ if (numReportsUp > 0) {
+ LOG(debug, "Unreachable: %s is up according to %zd hosts (down according to me + %zd others)",
+ nameToCheck.c_str(), numReportsUp, numReportsDown);
+ cmIter->second.classifyResult(CcResult::UNREACHABLE_UP);
+ }
+ }
}
+} // namespace <unnamed>
+
void Connectivity::configure(const SentinelConfig::Connectivity &config) {
_config = config;
LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount);
@@ -77,18 +134,18 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) {
LOG(warning, "could not get model config, skipping connectivity checks");
return true;
}
+ std::string myHostname = vespa::Defaults::vespaHostname();
OutwardCheckContext checkContext(clusterSize,
- vespa::Defaults::vespaHostname(),
+ myHostname,
rpcServer.getPort(),
rpcServer.orb());
- std::map<std::string, OutwardCheck> connectivityMap;
- for (const auto & [ hn, spec ] : _checkSpecs) {
- connectivityMap.try_emplace(hn, spec, checkContext);
+ ConnectivityMap connectivityMap;
+ for (const auto &host_and_port : _checkSpecs) {
+ connectivityMap.try_emplace(host_and_port.first, spec(host_and_port), checkContext);
}
checkContext.latch.await();
- size_t numFailedConns = 0;
- size_t numFailedReverse = 0;
- bool allChecksOk = true;
+ classifyConnFails(connectivityMap, _checkSpecs, rpcServer);
+ Accumulated accumulated;
for (const auto & [hostname, check] : connectivityMap) {
std::string detail = toString(check.result());
std::string prev = _detailsPerHost[hostname];
@@ -97,26 +154,47 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) {
}
_detailsPerHost[hostname] = detail;
LOG_ASSERT(check.result() != CcResult::UNKNOWN);
- if (check.result() == CcResult::CONN_FAIL) ++numFailedConns;
- if (check.result() == CcResult::REVERSE_FAIL) ++numFailedReverse;
+ accumulate(accumulated, check.result());
+ }
+ return enoughOk(accumulated, clusterSize);
+}
+
+void Connectivity::accumulate(Accumulated &target, CcResult value) {
+ switch (value) {
+ case CcResult::UNKNOWN:
+ case CcResult::UNREACHABLE_UP:
+ case CcResult::INDIRECT_PING_FAIL:
+ ++target.numSeriousIssues;
+ ++target.numIssues;
+ break;
+ case CcResult::CONN_FAIL:
+ ++target.numIssues;
+ break;
+ case CcResult::INDIRECT_PING_UNAVAIL:
+ case CcResult::ALL_OK:
+ break;
}
- if (numFailedReverse > size_t(_config.maxBadReverseCount)) {
- LOG(warning, "%zu of %zu nodes report problems connecting to me (max is %d)",
- numFailedReverse, clusterSize, _config.maxBadReverseCount);
- allChecksOk = false;
+}
+
+bool Connectivity::enoughOk(const Accumulated &results, size_t clusterSize) {
+ bool enough = true;
+ if (results.numSeriousIssues > size_t(_config.maxBadReverseCount)) {
+ LOG(warning, "%zu of %zu nodes up but with network connectivity problems (max is %d)",
+ results.numSeriousIssues, clusterSize, _config.maxBadReverseCount);
+ enough = false;
}
- if (numFailedConns * 100.0 > _config.maxBadOutPercent * clusterSize) {
- double pct = numFailedConns * 100.0 / clusterSize;
- LOG(warning, "Problems connecting to %zu of %zu nodes, %.2f %% (max is %d)",
- numFailedConns, clusterSize, pct, _config.maxBadOutPercent);
- allChecksOk = false;
+ if (results.numIssues * 100.0 > _config.maxBadOutPercent * clusterSize) {
+ double pct = results.numIssues * 100.0 / clusterSize;
+ LOG(warning, "Problems with connection to %zu of %zu nodes, %.1f%% (max is %d%%)",
+ results.numIssues, clusterSize, pct, _config.maxBadOutPercent);
+ enough = false;
}
- if (allChecksOk && (numFailedConns == 0) && (numFailedReverse == 0)) {
+ if (results.numIssues == 0) {
LOG(info, "All connectivity checks OK, proceeding with service startup");
- } else if (allChecksOk) {
+ } else if (enough) {
LOG(info, "Enough connectivity checks OK, proceeding with service startup");
}
- return allChecksOk;
+ return enough;
}
}
diff --git a/configd/src/apps/sentinel/connectivity.h b/configd/src/apps/sentinel/connectivity.h
index 1c7ee8ddc57..a1e454a255a 100644
--- a/configd/src/apps/sentinel/connectivity.h
+++ b/configd/src/apps/sentinel/connectivity.h
@@ -3,6 +3,7 @@
#pragma once
#include "rpcserver.h"
+#include "cc-result.h"
#include <vespa/config-sentinel.h>
#include <vespa/config-model.h>
#include <string>
@@ -18,13 +19,22 @@ namespace config::sentinel {
**/
class Connectivity {
public:
+ using SpecMap = std::map<std::string, int>;
+ using HostAndPort = SpecMap::value_type;
+
Connectivity();
~Connectivity();
void configure(const SentinelConfig::Connectivity &config);
bool checkConnectivity(RpcServer &rpcServer);
private:
+ struct Accumulated {
+ size_t numIssues = 0;
+ size_t numSeriousIssues = 0;
+ };
+ void accumulate(Accumulated &target, CcResult value);
+ bool enoughOk(const Accumulated &results, size_t clusterSize);
SentinelConfig::Connectivity _config;
- std::map<std::string, std::string> _checkSpecs;
+ SpecMap _checkSpecs;
std::map<std::string, std::string> _detailsPerHost;
};
diff --git a/configd/src/apps/sentinel/outward-check.cpp b/configd/src/apps/sentinel/outward-check.cpp
index 5fed69d0b6e..c9c94443274 100644
--- a/configd/src/apps/sentinel/outward-check.cpp
+++ b/configd/src/apps/sentinel/outward-check.cpp
@@ -7,6 +7,8 @@ LOG_SETUP(".outward-check");
namespace config::sentinel {
+OutwardCheckContext::~OutwardCheckContext() = default;
+
OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context)
: _spec(spec),
_context(context)
@@ -14,8 +16,8 @@ OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context
_target = context.orb.GetTarget(spec.c_str());
_req = context.orb.AllocRPCRequest();
_req->SetMethodName("sentinel.check.connectivity");
- _req->GetParams()->AddString(context.myHostname);
- _req->GetParams()->AddInt32(context.myPortnum);
+ _req->GetParams()->AddString(context.targetHostname.c_str());
+ _req->GetParams()->AddInt32(context.targetPortnum);
_req->GetParams()->AddInt32(500);
_target->InvokeAsync(_req, 1.500, this);
}
@@ -29,17 +31,21 @@ void OutwardCheck::RequestDone(FRT_RPCRequest *req) {
if (answer == "ok") {
LOG(debug, "ping to %s with reverse connectivity OK", _spec.c_str());
_result = CcResult::ALL_OK;
- } else {
+ } else if (answer == "bad") {
LOG(debug, "connected to %s, but reverse connectivity fails: %s",
_spec.c_str(), answer.c_str());
- _result = CcResult::REVERSE_FAIL;
+ _result = CcResult::INDIRECT_PING_FAIL;
+ } else {
+ LOG(warning, "connected to %s, but strange reverse connectivity: %s",
+ _spec.c_str(), answer.c_str());
+ _result = CcResult::INDIRECT_PING_UNAVAIL;
}
} else if (req->GetErrorCode() == FRTE_RPC_NO_SUCH_METHOD ||
req->GetErrorCode() == FRTE_RPC_WRONG_PARAMS ||
req->GetErrorCode() == FRTE_RPC_WRONG_RETURN)
{
LOG(debug, "Connected OK to %s but no reverse connectivity check available", _spec.c_str());
- _result = CcResult::REVERSE_UNAVAIL;
+ _result = CcResult::INDIRECT_PING_UNAVAIL;
} else {
LOG(debug, "error on request to %s : %s (%d)", _spec.c_str(),
req->GetErrorMessage(), req->GetErrorCode());
@@ -52,4 +58,9 @@ void OutwardCheck::RequestDone(FRT_RPCRequest *req) {
_context.latch.countDown();
}
+void OutwardCheck::classifyResult(CcResult value) {
+ LOG_ASSERT(_result == CcResult::CONN_FAIL);
+ _result = value;
+}
+
}
diff --git a/configd/src/apps/sentinel/outward-check.h b/configd/src/apps/sentinel/outward-check.h
index 01a298aee18..0e53b9010dc 100644
--- a/configd/src/apps/sentinel/outward-check.h
+++ b/configd/src/apps/sentinel/outward-check.h
@@ -1,5 +1,8 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "cc-result.h"
#include <string>
#include <vespa/vespalib/util/count_down_latch.h>
#include <vespa/fnet/frt/supervisor.h>
@@ -12,22 +15,21 @@ namespace config::sentinel {
struct OutwardCheckContext {
vespalib::CountDownLatch latch;
- const char * myHostname;
- int myPortnum;
+ std::string targetHostname;
+ int targetPortnum;
FRT_Supervisor &orb;
OutwardCheckContext(size_t count,
- const char * hostname,
+ const std::string &hostname,
int portnumber,
FRT_Supervisor &supervisor)
: latch(count),
- myHostname(hostname),
- myPortnum(portnumber),
+ targetHostname(hostname),
+ targetPortnum(portnumber),
orb(supervisor)
{}
+ ~OutwardCheckContext();
};
-enum class CcResult { UNKNOWN, CONN_FAIL, REVERSE_FAIL, REVERSE_UNAVAIL, ALL_OK };
-
class OutwardCheck : public FRT_IRequestWait {
private:
CcResult _result = CcResult::UNKNOWN;
@@ -41,6 +43,7 @@ public:
void RequestDone(FRT_RPCRequest *req) override;
bool ok() const { return _result == CcResult::ALL_OK; }
CcResult result() const { return _result; }
+ void classifyResult(CcResult value);
};
}