diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2021-06-10 14:13:21 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-06-10 14:13:21 +0200 |
commit | eb93f635cde554ab681c1029d68412b8dbcd5487 (patch) | |
tree | b4aaf1ed37524622f606d3f245ada22250bb2e68 /configd | |
parent | 8b55850e0abc38722c3c172f8287d07b1a5837a6 (diff) | |
parent | 97f4c2fd8ed1e1fd80bc4a2ee950f2d6d98384d6 (diff) |
Merge pull request #18171 from vespa-engine/arnej/add-corner-probes
Arnej/add corner probes
Diffstat (limited to 'configd')
-rw-r--r-- | configd/src/apps/sentinel/cc-result.h | 9 | ||||
-rw-r--r-- | configd/src/apps/sentinel/connectivity.cpp | 132 | ||||
-rw-r--r-- | configd/src/apps/sentinel/connectivity.h | 12 | ||||
-rw-r--r-- | configd/src/apps/sentinel/outward-check.cpp | 21 | ||||
-rw-r--r-- | configd/src/apps/sentinel/outward-check.h | 17 |
5 files changed, 151 insertions, 40 deletions
diff --git a/configd/src/apps/sentinel/cc-result.h b/configd/src/apps/sentinel/cc-result.h new file mode 100644 index 00000000000..3468cf4324a --- /dev/null +++ b/configd/src/apps/sentinel/cc-result.h @@ -0,0 +1,9 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace config::sentinel { + +enum class CcResult { UNKNOWN, CONN_FAIL, UNREACHABLE_UP, INDIRECT_PING_FAIL, INDIRECT_PING_UNAVAIL, ALL_OK }; + +} diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp index 132b57fc884..8d1aa0e9673 100644 --- a/configd/src/apps/sentinel/connectivity.cpp +++ b/configd/src/apps/sentinel/connectivity.cpp @@ -27,25 +27,33 @@ namespace { std::string toString(CcResult value) { switch (value) { case CcResult::UNKNOWN: return "BAD: missing result"; // very very bad - case CcResult::REVERSE_FAIL: return "connect OK, but reverse check FAILED"; // very bad + case CcResult::INDIRECT_PING_FAIL: return "connect OK, but reverse check FAILED"; // very bad + case CcResult::UNREACHABLE_UP: return "unreachable from me, but up"; // very bad case CcResult::CONN_FAIL: return "failed to connect"; // bad - case CcResult::REVERSE_UNAVAIL: return "connect OK (but reverse check unavailable)"; // unfortunate + case CcResult::INDIRECT_PING_UNAVAIL: return "connect OK (but reverse check unavailable)"; // unfortunate case CcResult::ALL_OK: return "OK: both ways connectivity verified"; // good } LOG(error, "Unknown CcResult enum value: %d", (int)value); LOG_ABORT("Unknown CcResult enum value"); } -std::map<std::string, std::string> specsFrom(const ModelConfig &model) { - std::map<std::string, std::string> checkSpecs; +using ConnectivityMap = std::map<std::string, OutwardCheck>; +using HostAndPort = Connectivity::HostAndPort; +using SpecMap = Connectivity::SpecMap; + +std::string spec(const SpecMap::value_type &host_and_port) { + return fmt("tcp/%s:%d", host_and_port.first.c_str(), host_and_port.second); +} + +SpecMap specsFrom(const ModelConfig &model) { + SpecMap checkSpecs; for (const auto & h : model.hosts) { bool foundSentinelPort = false; for (const auto & s : h.services) { if (s.name == "config-sentinel") { for (const auto & p : s.ports) { if (p.tags.find("rpc") != p.tags.npos) { - auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number); - checkSpecs[h.name] = spec; + checkSpecs[h.name] = p.number; foundSentinelPort = true; } } @@ -59,8 +67,57 @@ std::map<std::string, std::string> specsFrom(const ModelConfig &model) { return checkSpecs; } +void classifyConnFails(ConnectivityMap &connectivityMap, + const SpecMap &specMap, + RpcServer &rpcServer) +{ + std::vector<HostAndPort> failedConnSpecs; + std::vector<HostAndPort> goodNeighborSpecs; + std::string myHostname = vespa::Defaults::vespaHostname(); + for (auto & [hostname, check] : connectivityMap) { + if (hostname == myHostname) { + if (check.result() == CcResult::CONN_FAIL) { + check.classifyResult(CcResult::UNREACHABLE_UP); + } + } else { + auto iter = specMap.find(hostname); + LOG_ASSERT(iter != specMap.end()); + if (check.result() == CcResult::ALL_OK) { + goodNeighborSpecs.push_back(*iter); + } + if (check.result() == CcResult::CONN_FAIL) { + failedConnSpecs.push_back(*iter); + } + } + } + if ((failedConnSpecs.size() == 0) || (goodNeighborSpecs.size() == 0)) { + return; + } + for (const auto & [ nameToCheck, portToCheck ] : failedConnSpecs) { + auto cmIter = connectivityMap.find(nameToCheck); + LOG_ASSERT(cmIter != connectivityMap.end()); + OutwardCheckContext checkContext(goodNeighborSpecs.size(), nameToCheck, portToCheck, rpcServer.orb()); + ConnectivityMap cornerProbes; + for (const auto & hp : goodNeighborSpecs) { + cornerProbes.try_emplace(hp.first, spec(hp), checkContext); + } + checkContext.latch.await(); + size_t numReportsUp = 0; + size_t numReportsDown = 0; + for (const auto & [hostname, probe] : cornerProbes) { + if (probe.result() == CcResult::INDIRECT_PING_FAIL) ++numReportsDown; + if (probe.result() == CcResult::ALL_OK) ++numReportsUp; + } + if (numReportsUp > 0) { + LOG(debug, "Unreachable: %s is up according to %zd hosts (down according to me + %zd others)", + nameToCheck.c_str(), numReportsUp, numReportsDown); + cmIter->second.classifyResult(CcResult::UNREACHABLE_UP); + } + } } +} // namespace <unnamed> + void Connectivity::configure(const SentinelConfig::Connectivity &config) { _config = config; LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount); @@ -77,18 +134,18 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) { LOG(warning, "could not get model config, skipping connectivity checks"); return true; } + std::string myHostname = vespa::Defaults::vespaHostname(); OutwardCheckContext checkContext(clusterSize, - vespa::Defaults::vespaHostname(), + myHostname, rpcServer.getPort(), rpcServer.orb()); - std::map<std::string, OutwardCheck> connectivityMap; - for (const auto & [ hn, spec ] : _checkSpecs) { - connectivityMap.try_emplace(hn, spec, checkContext); + ConnectivityMap connectivityMap; + for (const auto &host_and_port : _checkSpecs) { + connectivityMap.try_emplace(host_and_port.first, spec(host_and_port), checkContext); } checkContext.latch.await(); - size_t numFailedConns = 0; - size_t numFailedReverse = 0; - bool allChecksOk = true; + classifyConnFails(connectivityMap, _checkSpecs, rpcServer); + Accumulated accumulated; for (const auto & [hostname, check] : connectivityMap) { std::string detail = toString(check.result()); std::string prev = _detailsPerHost[hostname]; @@ -97,26 +154,47 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) { } _detailsPerHost[hostname] = detail; LOG_ASSERT(check.result() != CcResult::UNKNOWN); - if (check.result() == CcResult::CONN_FAIL) ++numFailedConns; - if (check.result() == CcResult::REVERSE_FAIL) ++numFailedReverse; + accumulate(accumulated, check.result()); + } + return enoughOk(accumulated, clusterSize); +} + +void Connectivity::accumulate(Accumulated &target, CcResult value) { + switch (value) { + case CcResult::UNKNOWN: + case CcResult::UNREACHABLE_UP: + case CcResult::INDIRECT_PING_FAIL: + ++target.numSeriousIssues; + ++target.numIssues; + break; + case CcResult::CONN_FAIL: + ++target.numIssues; + break; + case CcResult::INDIRECT_PING_UNAVAIL: + case CcResult::ALL_OK: + break; } - if (numFailedReverse > size_t(_config.maxBadReverseCount)) { - LOG(warning, "%zu of %zu nodes report problems connecting to me (max is %d)", - numFailedReverse, clusterSize, _config.maxBadReverseCount); - allChecksOk = false; +} + +bool Connectivity::enoughOk(const Accumulated &results, size_t clusterSize) { + bool enough = true; + if (results.numSeriousIssues > size_t(_config.maxBadReverseCount)) { + LOG(warning, "%zu of %zu nodes up but with network connectivity problems (max is %d)", + results.numSeriousIssues, clusterSize, _config.maxBadReverseCount); + enough = false; } - if (numFailedConns * 100.0 > _config.maxBadOutPercent * clusterSize) { - double pct = numFailedConns * 100.0 / clusterSize; - LOG(warning, "Problems connecting to %zu of %zu nodes, %.2f %% (max is %d)", - numFailedConns, clusterSize, pct, _config.maxBadOutPercent); - allChecksOk = false; + if (results.numIssues * 100.0 > _config.maxBadOutPercent * clusterSize) { + double pct = results.numIssues * 100.0 / clusterSize; + LOG(warning, "Problems with connection to %zu of %zu nodes, %.1f%% (max is %d%%)", + results.numIssues, clusterSize, pct, _config.maxBadOutPercent); + enough = false; } - if (allChecksOk && (numFailedConns == 0) && (numFailedReverse == 0)) { + if (results.numIssues == 0) { LOG(info, "All connectivity checks OK, proceeding with service startup"); - } else if (allChecksOk) { + } else if (enough) { LOG(info, "Enough connectivity checks OK, proceeding with service startup"); } - return allChecksOk; + return enough; } } diff --git a/configd/src/apps/sentinel/connectivity.h b/configd/src/apps/sentinel/connectivity.h index 1c7ee8ddc57..a1e454a255a 100644 --- a/configd/src/apps/sentinel/connectivity.h +++ b/configd/src/apps/sentinel/connectivity.h @@ -3,6 +3,7 @@ #pragma once #include "rpcserver.h" +#include "cc-result.h" #include <vespa/config-sentinel.h> #include <vespa/config-model.h> #include <string> @@ -18,13 +19,22 @@ namespace config::sentinel { **/ class Connectivity { public: + using SpecMap = std::map<std::string, int>; + using HostAndPort = SpecMap::value_type; + Connectivity(); ~Connectivity(); void configure(const SentinelConfig::Connectivity &config); bool checkConnectivity(RpcServer &rpcServer); private: + struct Accumulated { + size_t numIssues = 0; + size_t numSeriousIssues = 0; + }; + void accumulate(Accumulated &target, CcResult value); + bool enoughOk(const Accumulated &results, size_t clusterSize); SentinelConfig::Connectivity _config; - std::map<std::string, std::string> _checkSpecs; + SpecMap _checkSpecs; std::map<std::string, std::string> _detailsPerHost; }; diff --git a/configd/src/apps/sentinel/outward-check.cpp b/configd/src/apps/sentinel/outward-check.cpp index 5fed69d0b6e..c9c94443274 100644 --- a/configd/src/apps/sentinel/outward-check.cpp +++ b/configd/src/apps/sentinel/outward-check.cpp @@ -7,6 +7,8 @@ LOG_SETUP(".outward-check"); namespace config::sentinel { +OutwardCheckContext::~OutwardCheckContext() = default; + OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context) : _spec(spec), _context(context) @@ -14,8 +16,8 @@ OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context _target = context.orb.GetTarget(spec.c_str()); _req = context.orb.AllocRPCRequest(); _req->SetMethodName("sentinel.check.connectivity"); - _req->GetParams()->AddString(context.myHostname); - _req->GetParams()->AddInt32(context.myPortnum); + _req->GetParams()->AddString(context.targetHostname.c_str()); + _req->GetParams()->AddInt32(context.targetPortnum); _req->GetParams()->AddInt32(500); _target->InvokeAsync(_req, 1.500, this); } @@ -29,17 +31,21 @@ void OutwardCheck::RequestDone(FRT_RPCRequest *req) { if (answer == "ok") { LOG(debug, "ping to %s with reverse connectivity OK", _spec.c_str()); _result = CcResult::ALL_OK; - } else { + } else if (answer == "bad") { LOG(debug, "connected to %s, but reverse connectivity fails: %s", _spec.c_str(), answer.c_str()); - _result = CcResult::REVERSE_FAIL; + _result = CcResult::INDIRECT_PING_FAIL; + } else { + LOG(warning, "connected to %s, but strange reverse connectivity: %s", + _spec.c_str(), answer.c_str()); + _result = CcResult::INDIRECT_PING_UNAVAIL; } } else if (req->GetErrorCode() == FRTE_RPC_NO_SUCH_METHOD || req->GetErrorCode() == FRTE_RPC_WRONG_PARAMS || req->GetErrorCode() == FRTE_RPC_WRONG_RETURN) { LOG(debug, "Connected OK to %s but no reverse connectivity check available", _spec.c_str()); - _result = CcResult::REVERSE_UNAVAIL; + _result = CcResult::INDIRECT_PING_UNAVAIL; } else { LOG(debug, "error on request to %s : %s (%d)", _spec.c_str(), req->GetErrorMessage(), req->GetErrorCode()); @@ -52,4 +58,9 @@ void OutwardCheck::RequestDone(FRT_RPCRequest *req) { _context.latch.countDown(); } +void OutwardCheck::classifyResult(CcResult value) { + LOG_ASSERT(_result == CcResult::CONN_FAIL); + _result = value; +} + } diff --git a/configd/src/apps/sentinel/outward-check.h b/configd/src/apps/sentinel/outward-check.h index 01a298aee18..0e53b9010dc 100644 --- a/configd/src/apps/sentinel/outward-check.h +++ b/configd/src/apps/sentinel/outward-check.h @@ -1,5 +1,8 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "cc-result.h" #include <string> #include <vespa/vespalib/util/count_down_latch.h> #include <vespa/fnet/frt/supervisor.h> @@ -12,22 +15,21 @@ namespace config::sentinel { struct OutwardCheckContext { vespalib::CountDownLatch latch; - const char * myHostname; - int myPortnum; + std::string targetHostname; + int targetPortnum; FRT_Supervisor &orb; OutwardCheckContext(size_t count, - const char * hostname, + const std::string &hostname, int portnumber, FRT_Supervisor &supervisor) : latch(count), - myHostname(hostname), - myPortnum(portnumber), + targetHostname(hostname), + targetPortnum(portnumber), orb(supervisor) {} + ~OutwardCheckContext(); }; -enum class CcResult { UNKNOWN, CONN_FAIL, REVERSE_FAIL, REVERSE_UNAVAIL, ALL_OK }; - class OutwardCheck : public FRT_IRequestWait { private: CcResult _result = CcResult::UNKNOWN; @@ -41,6 +43,7 @@ public: void RequestDone(FRT_RPCRequest *req) override; bool ok() const { return _result == CcResult::ALL_OK; } CcResult result() const { return _result; } + void classifyResult(CcResult value); }; } |