about | summary | refs | log | tree | commit | diff | stats
path: root/configd
diff options
context:
space:
mode:
author: Arne Juul <arnej@verizonmedia.com> 2021-06-08 13:41:51 +0000
committer: Arne Juul <arnej@verizonmedia.com> 2021-06-09 07:39:53 +0000
commit: 047b641beec9ea62b54789374bae41948bae5d54 (patch)
tree: 693806375b498d1702b65636f2bd915f5edfad7c /configd
parent: 7a7296a89fecd6238bf845741157b6c6971602ff (diff)
add probes around the corner, take 1
Diffstat (limited to 'configd')
-rw-r--r-- configd/src/apps/sentinel/connectivity.cpp 74
-rw-r--r-- configd/src/apps/sentinel/connectivity.h 9
-rw-r--r-- configd/src/apps/sentinel/outward-check.cpp 10
-rw-r--r-- configd/src/apps/sentinel/outward-check.h 5
4 files changed, 85 insertions, 13 deletions
diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp
index 132b57fc884..84c03d5f3c1 100644
--- a/configd/src/apps/sentinel/connectivity.cpp
+++ b/configd/src/apps/sentinel/connectivity.cpp
@@ -36,16 +36,19 @@ std::string toString(CcResult value) {
LOG_ABORT("Unknown CcResult enum value");
}
-std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
- std::map<std::string, std::string> checkSpecs;
+using ConnectivityMap = std::map<std::string, OutwardCheck>;
+using HostAndPort = Connectivity::HostAndPort;
+using SpecMap = Connectivity::SpecMap;
+
+SpecMap specsFrom(const ModelConfig &model) {
+ SpecMap checkSpecs;
for (const auto & h : model.hosts) {
bool foundSentinelPort = false;
for (const auto & s : h.services) {
if (s.name == "config-sentinel") {
for (const auto & p : s.ports) {
if (p.tags.find("rpc") != p.tags.npos) {
- auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number);
- checkSpecs[h.name] = spec;
+ checkSpecs[h.name] = HostAndPort{h.name, p.number};
foundSentinelPort = true;
}
}
@@ -59,6 +62,48 @@ std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
return checkSpecs;
}
+size_t countUnreachable(const ConnectivityMap &connectivityMap,
+ const SpecMap &specMap,
+ RpcServer &rpcServer)
+{
+ std::vector<HostAndPort> failedConnSpecs;
+ std::vector<HostAndPort> goodNeighborSpecs;
+ std::string myHostname = vespa::Defaults::vespaHostname();
+ for (const auto & [hostname, check] : connectivityMap) {
+ auto iter = specMap.find(hostname);
+ LOG_ASSERT(iter != specMap.end());
+ if ((check.result() == CcResult::ALL_OK) && (hostname != myHostname)) {
+ goodNeighborSpecs.push_back(iter->second);
+ }
+ if (check.result() == CcResult::CONN_FAIL) {
+ failedConnSpecs.push_back(iter->second);
+ }
+ }
+ size_t counter = 0;
+ for (const auto & toCheck : failedConnSpecs) {
+ OutwardCheckContext checkContext(goodNeighborSpecs.size(), toCheck.host, toCheck.port, rpcServer.orb());
+ ConnectivityMap cornerProbes;
+ for (const auto & hp : goodNeighborSpecs) {
+ cornerProbes.try_emplace(hp.host, hp.spec(), checkContext);
+ }
+ checkContext.latch.await();
+ size_t numReportsUp = 0;
+ size_t numReportsDown = 0;
+ for (const auto & [hostname, probe] : cornerProbes) {
+ if (probe.result() == CcResult::REVERSE_FAIL) ++numReportsDown;
+ if (probe.result() == CcResult::ALL_OK) ++numReportsUp;
+ }
+ if (numReportsUp > numReportsDown) {
+ ++counter;
+ }
+ }
+ return counter;
+}
+
+}
+
+std::string Connectivity::HostAndPort::spec() const {
+ return fmt("tcp/%s:%d", host.c_str(), port);
}
void Connectivity::configure(const SentinelConfig::Connectivity &config) {
@@ -77,15 +122,17 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) {
LOG(warning, "could not get model config, skipping connectivity checks");
return true;
}
+ std::string myHostname = vespa::Defaults::vespaHostname();
OutwardCheckContext checkContext(clusterSize,
- vespa::Defaults::vespaHostname(),
+ myHostname,
rpcServer.getPort(),
rpcServer.orb());
- std::map<std::string, OutwardCheck> connectivityMap;
- for (const auto & [ hn, spec ] : _checkSpecs) {
- connectivityMap.try_emplace(hn, spec, checkContext);
+ ConnectivityMap connectivityMap;
+ for (const auto & [ hn, host_and_port ] : _checkSpecs) {
+ connectivityMap.try_emplace(hn, host_and_port.spec(), checkContext);
}
checkContext.latch.await();
+ size_t numAllGood = 0;
size_t numFailedConns = 0;
size_t numFailedReverse = 0;
bool allChecksOk = true;
@@ -97,6 +144,9 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) {
}
_detailsPerHost[hostname] = detail;
LOG_ASSERT(check.result() != CcResult::UNKNOWN);
+ if ((check.result() == CcResult::ALL_OK) && (hostname != myHostname)) {
+ ++numAllGood;
+ }
if (check.result() == CcResult::CONN_FAIL) ++numFailedConns;
if (check.result() == CcResult::REVERSE_FAIL) ++numFailedReverse;
}
@@ -111,6 +161,14 @@ Connectivity::checkConnectivity(RpcServer &rpcServer) {
numFailedConns, clusterSize, pct, _config.maxBadOutPercent);
allChecksOk = false;
}
+ size_t numUnreachable = (numFailedConns > 0)
+ ? countUnreachable(connectivityMap, _checkSpecs, rpcServer)
+ : 0;
+ if (numUnreachable > size_t(_config.maxBadReverseCount)) {
+ LOG(warning, "%zu of %zu nodes are up but unreachable from me (max is %d)",
+ numUnreachable, clusterSize, _config.maxBadReverseCount);
+ allChecksOk = false;
+ }
if (allChecksOk && (numFailedConns == 0) && (numFailedReverse == 0)) {
LOG(info, "All connectivity checks OK, proceeding with service startup");
} else if (allChecksOk) {
diff --git a/configd/src/apps/sentinel/connectivity.h b/configd/src/apps/sentinel/connectivity.h
index 1c7ee8ddc57..93bc2c865b5 100644
--- a/configd/src/apps/sentinel/connectivity.h
+++ b/configd/src/apps/sentinel/connectivity.h
@@ -18,13 +18,20 @@ namespace config::sentinel {
**/
class Connectivity {
public:
+ struct HostAndPort {
+ std::string host;
+ int port;
+ std::string spec() const;
+ };
+ using SpecMap = std::map<std::string, HostAndPort>;
+
Connectivity();
~Connectivity();
void configure(const SentinelConfig::Connectivity &config);
bool checkConnectivity(RpcServer &rpcServer);
private:
SentinelConfig::Connectivity _config;
- std::map<std::string, std::string> _checkSpecs;
+ SpecMap _checkSpecs;
std::map<std::string, std::string> _detailsPerHost;
};
diff --git a/configd/src/apps/sentinel/outward-check.cpp b/configd/src/apps/sentinel/outward-check.cpp
index 5fed69d0b6e..500ea835fbf 100644
--- a/configd/src/apps/sentinel/outward-check.cpp
+++ b/configd/src/apps/sentinel/outward-check.cpp
@@ -7,6 +7,8 @@ LOG_SETUP(".outward-check");
namespace config::sentinel {
+OutwardCheckContext::~OutwardCheckContext() = default;
+
OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context)
: _spec(spec),
_context(context)
@@ -14,7 +16,7 @@ OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context
_target = context.orb.GetTarget(spec.c_str());
_req = context.orb.AllocRPCRequest();
_req->SetMethodName("sentinel.check.connectivity");
- _req->GetParams()->AddString(context.myHostname);
+ _req->GetParams()->AddString(context.myHostname.c_str());
_req->GetParams()->AddInt32(context.myPortnum);
_req->GetParams()->AddInt32(500);
_target->InvokeAsync(_req, 1.500, this);
@@ -29,10 +31,14 @@ void OutwardCheck::RequestDone(FRT_RPCRequest *req) {
if (answer == "ok") {
LOG(debug, "ping to %s with reverse connectivity OK", _spec.c_str());
_result = CcResult::ALL_OK;
- } else {
+ } else if (answer == "bad") {
LOG(debug, "connected to %s, but reverse connectivity fails: %s",
_spec.c_str(), answer.c_str());
_result = CcResult::REVERSE_FAIL;
+ } else {
+ LOG(warning, "connected to %s, but strange reverse connectivity: %s",
+ _spec.c_str(), answer.c_str());
+ _result = CcResult::REVERSE_UNAVAIL;
}
} else if (req->GetErrorCode() == FRTE_RPC_NO_SUCH_METHOD ||
req->GetErrorCode() == FRTE_RPC_WRONG_PARAMS ||
diff --git a/configd/src/apps/sentinel/outward-check.h b/configd/src/apps/sentinel/outward-check.h
index 01a298aee18..11faee1d707 100644
--- a/configd/src/apps/sentinel/outward-check.h
+++ b/configd/src/apps/sentinel/outward-check.h
@@ -12,11 +12,11 @@ namespace config::sentinel {
struct OutwardCheckContext {
vespalib::CountDownLatch latch;
- const char * myHostname;
+ std::string myHostname;
int myPortnum;
FRT_Supervisor &orb;
OutwardCheckContext(size_t count,
- const char * hostname,
+ const std::string &hostname,
int portnumber,
FRT_Supervisor &supervisor)
: latch(count),
@@ -24,6 +24,7 @@ struct OutwardCheckContext {
myPortnum(portnumber),
orb(supervisor)
{}
+ ~OutwardCheckContext();
};
enum class CcResult { UNKNOWN, CONN_FAIL, REVERSE_FAIL, REVERSE_UNAVAIL, ALL_OK };