aboutsummaryrefslogtreecommitdiffstats
path: root/configd
diff options
context:
space:
mode:
author Arne Juul <arnej@verizonmedia.com> 2021-06-09 07:22:42 +0000
committer Arne Juul <arnej@verizonmedia.com> 2021-06-09 07:39:53 +0000
commit 8d76a69b1b0f70e10fbfdde4e2b9e1ab442632e6 (patch)
tree 6ababbea743f288b8a82e56f877103a8f3cc8838 /configd
parent baac9f2eb87de12980f1586d35f9b1c71648bf06 (diff)
classify CONN_FAIL results using corner probes
Diffstat (limited to 'configd')
-rw-r--r-- configd/src/apps/sentinel/connectivity.cpp 99
-rw-r--r-- configd/src/apps/sentinel/outward-check.cpp 5
-rw-r--r-- configd/src/apps/sentinel/outward-check.h 3
3 files changed, 64 insertions, 43 deletions
diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp
index 05e6a3fb4be..e9897247101 100644
--- a/configd/src/apps/sentinel/connectivity.cpp
+++ b/configd/src/apps/sentinel/connectivity.cpp
@@ -28,7 +28,9 @@ std::string toString(CcResult value) {
switch (value) {
case CcResult::UNKNOWN: return "BAD: missing result"; // very very bad
case CcResult::REVERSE_FAIL: return "connect OK, but reverse check FAILED"; // very bad
+ case CcResult::UNREACHABLE_UP: return "unreachable from me, but up"; // very bad
case CcResult::CONN_FAIL: return "failed to connect"; // bad
+ case CcResult::AFFIRMED_DOWN: return "affirmed down"; // a problem, but probably not on this end
case CcResult::REVERSE_UNAVAIL: return "connect OK (but reverse check unavailable)"; // unfortunate
case CcResult::ALL_OK: return "OK: both ways connectivity verified"; // good
}
@@ -66,26 +68,36 @@ SpecMap specsFrom(const ModelConfig &model) {
return checkSpecs;
}
-size_t countUnreachable(const ConnectivityMap &connectivityMap,
- const SpecMap &specMap,
- RpcServer &rpcServer)
+void classifyConnFails(ConnectivityMap &connectivityMap,
+ const SpecMap &specMap,
+ RpcServer &rpcServer)
{
std::vector<HostAndPort> failedConnSpecs;
std::vector<HostAndPort> goodNeighborSpecs;
std::string myHostname = vespa::Defaults::vespaHostname();
- for (const auto & [hostname, check] : connectivityMap) {
- auto iter = specMap.find(hostname);
- LOG_ASSERT(iter != specMap.end());
- if ((check.result() == CcResult::ALL_OK) && (hostname != myHostname)) {
- goodNeighborSpecs.push_back(*iter);
- }
- if (check.result() == CcResult::CONN_FAIL) {
- failedConnSpecs.push_back(*iter);
+ for (auto & [hostname, check] : connectivityMap) {
+ if (hostname == myHostname) {
+ if (check.result() == CcResult::CONN_FAIL) {
+ check.classifyResult(CcResult::UNREACHABLE_UP);
+ }
+ } else {
+ auto iter = specMap.find(hostname);
+ LOG_ASSERT(iter != specMap.end());
+ if (check.result() == CcResult::ALL_OK) {
+ goodNeighborSpecs.push_back(*iter);
+ }
+ if (check.result() == CcResult::CONN_FAIL) {
+ failedConnSpecs.push_back(*iter);
+ }
}
}
- size_t counter = 0;
- for (const auto & toCheck : failedConnSpecs) {
- OutwardCheckContext checkContext(goodNeighborSpecs.size(), toCheck.first, toCheck.second, rpcServer.orb());
+ if ((failedConnSpecs.size() == 0) || (goodNeighborSpecs.size() == 0)) {
+ return;
+ }
+ for (const auto & [ nameToCheck, portToCheck ] : failedConnSpecs) {
+ auto cmIter = connectivityMap.find(nameToCheck);
+ LOG_ASSERT(cmIter != connectivityMap.end());
+ OutwardCheckContext checkContext(goodNeighborSpecs.size(), nameToCheck, portToCheck, rpcServer.orb());
ConnectivityMap cornerProbes;
for (const auto & hp : goodNeighborSpecs) {
cornerProbes.try_emplace(hp.first, spec(hp), checkContext);
@@ -98,15 +110,16 @@ size_t countUnreachable(const ConnectivityMap &connectivityMap,
if (probe.result() == CcResult::ALL_OK) ++numReportsUp;
}
if (numReportsUp > numReportsDown) {
- LOG(warning, "Unreachable: %s is up according to %zd hosts (down according to me + %zd others)",
- toCheck.first.c_str(), numReportsUp, numReportsDown);
- ++counter;
+ LOG(info, "Unreachable: %s is up according to %zd hosts (down according to me + %zd others)",
+ nameToCheck.c_str(), numReportsUp, numReportsDown);
+ cmIter->second.classifyResult(CcResult::UNREACHABLE_UP);
+ } else if ((numReportsUp == 0) && (numReportsDown > 0)) {
+ cmIter->second.classifyResult(CcResult::AFFIRMED_DOWN);
}
}
- return counter;
}
-}
+} // namespace <unnamed>
void Connectivity::configure(const SentinelConfig::Connectivity &config) {
_config = config;
@@ -134,9 +147,9 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) {
connectivityMap.try_emplace(host_and_port.first, spec(host_and_port), checkContext);
}
checkContext.latch.await();
- size_t numAllGood = 0;
- size_t numFailedConns = 0;
- size_t numFailedReverse = 0;
+ classifyConnFails(connectivityMap, _checkSpecs, rpcServer);
+ size_t numProblematic = 0;
+ size_t numUpButBad = 0;
bool allChecksOk = true;
for (const auto & [hostname, check] : connectivityMap) {
std::string detail = toString(check.result());
@@ -146,32 +159,34 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) {
}
_detailsPerHost[hostname] = detail;
LOG_ASSERT(check.result() != CcResult::UNKNOWN);
- if ((check.result() == CcResult::ALL_OK) && (hostname != myHostname)) {
- ++numAllGood;
+ switch (check.result()) {
+ case CcResult::UNREACHABLE_UP:
+ case CcResult::REVERSE_FAIL:
+ ++numUpButBad;
+ ++numProblematic;
+ break;
+ case CcResult::AFFIRMED_DOWN:
+ case CcResult::CONN_FAIL:
+ ++numProblematic;
+ break;
+ case CcResult::UNKNOWN:
+ case CcResult::REVERSE_UNAVAIL:
+ case CcResult::ALL_OK:
+ break;
}
- if (check.result() == CcResult::CONN_FAIL) ++numFailedConns;
- if (check.result() == CcResult::REVERSE_FAIL) ++numFailedReverse;
- }
- if (numFailedReverse > size_t(_config.maxBadReverseCount)) {
- LOG(warning, "%zu of %zu nodes report problems connecting to me (max is %d)",
- numFailedReverse, clusterSize, _config.maxBadReverseCount);
- allChecksOk = false;
}
- if (numFailedConns * 100.0 > _config.maxBadOutPercent * clusterSize) {
- double pct = numFailedConns * 100.0 / clusterSize;
- LOG(warning, "Problems connecting to %zu of %zu nodes, %.2f %% (max is %d)",
- numFailedConns, clusterSize, pct, _config.maxBadOutPercent);
+ if (numUpButBad > size_t(_config.maxBadReverseCount)) {
+ LOG(warning, "%zu of %zu nodes up but with network connectivity problems (max is %d)",
+ numUpButBad, clusterSize, _config.maxBadReverseCount);
allChecksOk = false;
}
- size_t numUnreachable = (numFailedConns > 0)
- ? countUnreachable(connectivityMap, _checkSpecs, rpcServer)
- : 0;
- if (numUnreachable > size_t(_config.maxBadReverseCount)) {
- LOG(warning, "%zu of %zu nodes are up but unreachable from me (max is %d)",
- numUnreachable, clusterSize, _config.maxBadReverseCount);
+ if (numProblematic * 100.0 > _config.maxBadOutPercent * clusterSize) {
+ double pct = numProblematic * 100.0 / clusterSize;
+ LOG(warning, "Problems with connection to %zu of %zu nodes, %.1f%% (max is %d%%)",
+ numProblematic, clusterSize, pct, _config.maxBadOutPercent);
allChecksOk = false;
}
- if (allChecksOk && (numFailedConns == 0) && (numFailedReverse == 0)) {
+ if (numProblematic == 0) {
LOG(info, "All connectivity checks OK, proceeding with service startup");
} else if (allChecksOk) {
LOG(info, "Enough connectivity checks OK, proceeding with service startup");
diff --git a/configd/src/apps/sentinel/outward-check.cpp b/configd/src/apps/sentinel/outward-check.cpp
index 500ea835fbf..1f3e5b7524d 100644
--- a/configd/src/apps/sentinel/outward-check.cpp
+++ b/configd/src/apps/sentinel/outward-check.cpp
@@ -58,4 +58,9 @@ void OutwardCheck::RequestDone(FRT_RPCRequest *req) {
_context.latch.countDown();
}
+void OutwardCheck::classifyResult(CcResult value) {
+ LOG_ASSERT(_result == CcResult::CONN_FAIL);
+ _result = value;
+}
+
}
diff --git a/configd/src/apps/sentinel/outward-check.h b/configd/src/apps/sentinel/outward-check.h
index 11faee1d707..d8803e22a02 100644
--- a/configd/src/apps/sentinel/outward-check.h
+++ b/configd/src/apps/sentinel/outward-check.h
@@ -27,7 +27,7 @@ struct OutwardCheckContext {
~OutwardCheckContext();
};
-enum class CcResult { UNKNOWN, CONN_FAIL, REVERSE_FAIL, REVERSE_UNAVAIL, ALL_OK };
+enum class CcResult { UNKNOWN, AFFIRMED_DOWN, CONN_FAIL, UNREACHABLE_UP, REVERSE_FAIL, REVERSE_UNAVAIL, ALL_OK };
class OutwardCheck : public FRT_IRequestWait {
private:
@@ -42,6 +42,7 @@ public:
void RequestDone(FRT_RPCRequest *req) override;
bool ok() const { return _result == CcResult::ALL_OK; }
CcResult result() const { return _result; }
+ void classifyResult(CcResult value);
};
}