diff options
author | Arne Juul <arnej@verizonmedia.com> | 2021-06-08 13:41:51 +0000 |
---|---|---|
committer | Arne Juul <arnej@verizonmedia.com> | 2021-06-09 07:39:53 +0000 |
commit | 047b641beec9ea62b54789374bae41948bae5d54 (patch) | |
tree | 693806375b498d1702b65636f2bd915f5edfad7c /configd | |
parent | 7a7296a89fecd6238bf845741157b6c6971602ff (diff) |
add probes around the corner, take 1
Diffstat (limited to 'configd')
-rw-r--r-- | configd/src/apps/sentinel/connectivity.cpp | 74 | ||||
-rw-r--r-- | configd/src/apps/sentinel/connectivity.h | 9 | ||||
-rw-r--r-- | configd/src/apps/sentinel/outward-check.cpp | 10 | ||||
-rw-r--r-- | configd/src/apps/sentinel/outward-check.h | 5 |
4 files changed, 85 insertions, 13 deletions
diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp index 132b57fc884..84c03d5f3c1 100644 --- a/configd/src/apps/sentinel/connectivity.cpp +++ b/configd/src/apps/sentinel/connectivity.cpp @@ -36,16 +36,19 @@ std::string toString(CcResult value) { LOG_ABORT("Unknown CcResult enum value"); } -std::map<std::string, std::string> specsFrom(const ModelConfig &model) { - std::map<std::string, std::string> checkSpecs; +using ConnectivityMap = std::map<std::string, OutwardCheck>; +using HostAndPort = Connectivity::HostAndPort; +using SpecMap = Connectivity::SpecMap; + +SpecMap specsFrom(const ModelConfig &model) { + SpecMap checkSpecs; for (const auto & h : model.hosts) { bool foundSentinelPort = false; for (const auto & s : h.services) { if (s.name == "config-sentinel") { for (const auto & p : s.ports) { if (p.tags.find("rpc") != p.tags.npos) { - auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number); - checkSpecs[h.name] = spec; + checkSpecs[h.name] = HostAndPort{h.name, p.number}; foundSentinelPort = true; } } @@ -59,6 +62,48 @@ std::map<std::string, std::string> specsFrom(const ModelConfig &model) { return checkSpecs; } +size_t countUnreachable(const ConnectivityMap &connectivityMap, + const SpecMap &specMap, + RpcServer &rpcServer) +{ + std::vector<HostAndPort> failedConnSpecs; + std::vector<HostAndPort> goodNeighborSpecs; + std::string myHostname = vespa::Defaults::vespaHostname(); + for (const auto & [hostname, check] : connectivityMap) { + auto iter = specMap.find(hostname); + LOG_ASSERT(iter != specMap.end()); + if ((check.result() == CcResult::ALL_OK) && (hostname != myHostname)) { + goodNeighborSpecs.push_back(iter->second); + } + if (check.result() == CcResult::CONN_FAIL) { + failedConnSpecs.push_back(iter->second); + } + } + size_t counter = 0; + for (const auto & toCheck : failedConnSpecs) { + OutwardCheckContext checkContext(goodNeighborSpecs.size(), toCheck.host, toCheck.port, rpcServer.orb()); + ConnectivityMap cornerProbes; + for (const auto & hp : goodNeighborSpecs) { + cornerProbes.try_emplace(hp.host, hp.spec(), checkContext); + } + checkContext.latch.await(); + size_t numReportsUp = 0; + size_t numReportsDown = 0; + for (const auto & [hostname, probe] : cornerProbes) { + if (probe.result() == CcResult::REVERSE_FAIL) ++numReportsDown; + if (probe.result() == CcResult::ALL_OK) ++numReportsUp; + } + if (numReportsUp > numReportsDown) { + ++counter; + } + } + return counter; +} + +} + +std::string Connectivity::HostAndPort::spec() const { + return fmt("tcp/%s:%d", host.c_str(), port); } void Connectivity::configure(const SentinelConfig::Connectivity &config) { @@ -77,15 +122,17 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) { LOG(warning, "could not get model config, skipping connectivity checks"); return true; } + std::string myHostname = vespa::Defaults::vespaHostname(); OutwardCheckContext checkContext(clusterSize, - vespa::Defaults::vespaHostname(), + myHostname, rpcServer.getPort(), rpcServer.orb()); - std::map<std::string, OutwardCheck> connectivityMap; - for (const auto & [ hn, spec ] : _checkSpecs) { - connectivityMap.try_emplace(hn, spec, checkContext); + ConnectivityMap connectivityMap; + for (const auto & [ hn, host_and_port ] : _checkSpecs) { + connectivityMap.try_emplace(hn, host_and_port.spec(), checkContext); } checkContext.latch.await(); + size_t numAllGood = 0; size_t numFailedConns = 0; size_t numFailedReverse = 0; bool allChecksOk = true; @@ -97,6 +144,9 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) { } _detailsPerHost[hostname] = detail; LOG_ASSERT(check.result() != CcResult::UNKNOWN); + if ((check.result() == CcResult::ALL_OK) && (hostname != myHostname)) { + ++numAllGood; + } if (check.result() == CcResult::CONN_FAIL) ++numFailedConns; if (check.result() == CcResult::REVERSE_FAIL) ++numFailedReverse; } @@ -111,6 +161,14 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) { numFailedConns, clusterSize, pct, _config.maxBadOutPercent); allChecksOk = false; } + size_t numUnreachable = (numFailedConns > 0) + ? countUnreachable(connectivityMap, _checkSpecs, rpcServer) + : 0; + if (numUnreachable > size_t(_config.maxBadReverseCount)) { + LOG(warning, "%zu of %zu nodes are up but unreachable from me (max is %d)", + numUnreachable, clusterSize, _config.maxBadReverseCount); + allChecksOk = false; + } if (allChecksOk && (numFailedConns == 0) && (numFailedReverse == 0)) { LOG(info, "All connectivity checks OK, proceeding with service startup"); } else if (allChecksOk) { diff --git a/configd/src/apps/sentinel/connectivity.h b/configd/src/apps/sentinel/connectivity.h index 1c7ee8ddc57..93bc2c865b5 100644 --- a/configd/src/apps/sentinel/connectivity.h +++ b/configd/src/apps/sentinel/connectivity.h @@ -18,13 +18,20 @@ namespace config::sentinel { **/ class Connectivity { public: + struct HostAndPort { + std::string host; + int port; + std::string spec() const; + }; + using SpecMap = std::map<std::string, HostAndPort>; + Connectivity(); ~Connectivity(); void configure(const SentinelConfig::Connectivity &config); bool checkConnectivity(RpcServer &rpcServer); private: SentinelConfig::Connectivity _config; - std::map<std::string, std::string> _checkSpecs; + SpecMap _checkSpecs; std::map<std::string, std::string> _detailsPerHost; }; diff --git a/configd/src/apps/sentinel/outward-check.cpp b/configd/src/apps/sentinel/outward-check.cpp index 5fed69d0b6e..500ea835fbf 100644 --- a/configd/src/apps/sentinel/outward-check.cpp +++ b/configd/src/apps/sentinel/outward-check.cpp @@ -7,6 +7,8 @@ LOG_SETUP(".outward-check"); namespace config::sentinel { +OutwardCheckContext::~OutwardCheckContext() = default; + OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context) : _spec(spec), _context(context) @@ -14,7 +16,7 @@ OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context _target = context.orb.GetTarget(spec.c_str()); _req = context.orb.AllocRPCRequest(); _req->SetMethodName("sentinel.check.connectivity"); - _req->GetParams()->AddString(context.myHostname); + _req->GetParams()->AddString(context.myHostname.c_str()); _req->GetParams()->AddInt32(context.myPortnum); _req->GetParams()->AddInt32(500); _target->InvokeAsync(_req, 1.500, this); @@ -29,10 +31,14 @@ void OutwardCheck::RequestDone(FRT_RPCRequest *req) { if (answer == "ok") { LOG(debug, "ping to %s with reverse connectivity OK", _spec.c_str()); _result = CcResult::ALL_OK; - } else { + } else if (answer == "bad") { LOG(debug, "connected to %s, but reverse connectivity fails: %s", _spec.c_str(), answer.c_str()); _result = CcResult::REVERSE_FAIL; + } else { + LOG(warning, "connected to %s, but strange reverse connectivity: %s", + _spec.c_str(), answer.c_str()); + _result = CcResult::REVERSE_UNAVAIL; } } else if (req->GetErrorCode() == FRTE_RPC_NO_SUCH_METHOD || req->GetErrorCode() == FRTE_RPC_WRONG_PARAMS || diff --git a/configd/src/apps/sentinel/outward-check.h b/configd/src/apps/sentinel/outward-check.h index 01a298aee18..11faee1d707 100644 --- a/configd/src/apps/sentinel/outward-check.h +++ b/configd/src/apps/sentinel/outward-check.h @@ -12,11 +12,11 @@ namespace config::sentinel { struct OutwardCheckContext { vespalib::CountDownLatch latch; - const char * myHostname; + std::string myHostname; int myPortnum; FRT_Supervisor &orb; OutwardCheckContext(size_t count, - const char * hostname, + const std::string &hostname, int portnumber, FRT_Supervisor &supervisor) : latch(count), @@ -24,6 +24,7 @@ struct OutwardCheckContext { myPortnum(portnumber), orb(supervisor) {} + ~OutwardCheckContext(); }; enum class CcResult { UNKNOWN, CONN_FAIL, REVERSE_FAIL, REVERSE_UNAVAIL, ALL_OK }; |