about summary refs log tree commit diff stats
path: root/configd/src/apps/sentinel/connectivity.cpp
diff options
context:
space:
mode:
author: Arne H Juul <arnej27959@users.noreply.github.com> 2021-06-07 23:10:18 +0200
committer: GitHub <noreply@github.com> 2021-06-07 23:10:18 +0200
commit: 429ff98c842a124515f66b8bcdaf0c5f64c678e9 (patch)
tree: 99d73b2f7810d005ff29f5db78685290fc3292f3 /configd/src/apps/sentinel/connectivity.cpp
parent: 4ffd09f5678559a5f71d3957514f1d52c61e88f0 (diff)
parent: a402fb4fab902b9f8c9a8859b1142e436bf439e3 (diff)
Merge pull request #18132 from vespa-engine/arnej/actually-wait-for-connectivity
Arnej/actually wait for connectivity
Diffstat (limited to 'configd/src/apps/sentinel/connectivity.cpp')
-rw-r--r-- configd/src/apps/sentinel/connectivity.cpp 58
1 files changed, 34 insertions, 24 deletions
diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp
index 9cced1d3475..132b57fc884 100644
--- a/configd/src/apps/sentinel/connectivity.cpp
+++ b/configd/src/apps/sentinel/connectivity.cpp
@@ -1,5 +1,6 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "config-owner.h"
#include "connectivity.h"
#include "outward-check.h"
#include <vespa/defaults.h>
@@ -16,19 +17,14 @@ using namespace std::chrono_literals;
namespace config::sentinel {
-Connectivity::Connectivity(const SentinelConfig::Connectivity & config, RpcServer &rpcServer)
- : _config(config),
- _rpcServer(rpcServer)
-{
- LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount);
- LOG(config, "connectivity.maxBadOutPercent = %d", _config.maxBadOutPercent);
-}
+constexpr std::chrono::milliseconds MODEL_TIMEOUT_MS = 60s;
+Connectivity::Connectivity() = default;
Connectivity::~Connectivity() = default;
namespace {
-const char *toString(CcResult value) {
+std::string toString(CcResult value) {
switch (value) {
case CcResult::UNKNOWN: return "BAD: missing result"; // very very bad
case CcResult::REVERSE_FAIL: return "connect OK, but reverse check FAILED"; // very bad
@@ -65,16 +61,28 @@ std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
}
-Connectivity::CheckResult
-Connectivity::checkConnectivity(const ModelConfig &model) {
- const auto checkSpecs = specsFrom(model);
- size_t clusterSize = checkSpecs.size();
+void Connectivity::configure(const SentinelConfig::Connectivity &config) {
+ _config = config;
+ LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount);
+ LOG(config, "connectivity.maxBadOutPercent = %d", _config.maxBadOutPercent);
+ if (auto up = ConfigOwner::fetchModelConfig(MODEL_TIMEOUT_MS)) {
+ _checkSpecs = specsFrom(*up);
+ }
+}
+
+bool
+Connectivity::checkConnectivity(RpcServer &rpcServer) {
+ size_t clusterSize = _checkSpecs.size();
+ if (clusterSize == 0) {
+ LOG(warning, "could not get model config, skipping connectivity checks");
+ return true;
+ }
OutwardCheckContext checkContext(clusterSize,
vespa::Defaults::vespaHostname(),
- _rpcServer.getPort(),
- _rpcServer.orb());
+ rpcServer.getPort(),
+ rpcServer.orb());
std::map<std::string, OutwardCheck> connectivityMap;
- for (const auto & [ hn, spec ] : checkSpecs) {
+ for (const auto & [ hn, spec ] : _checkSpecs) {
connectivityMap.try_emplace(hn, spec, checkContext);
}
checkContext.latch.await();
@@ -82,6 +90,12 @@ Connectivity::checkConnectivity(const ModelConfig &model) {
size_t numFailedReverse = 0;
bool allChecksOk = true;
for (const auto & [hostname, check] : connectivityMap) {
+ std::string detail = toString(check.result());
+ std::string prev = _detailsPerHost[hostname];
+ if (prev != detail) {
+ LOG(info, "Connectivity check details: %s -> %s", hostname.c_str(), detail.c_str());
+ }
+ _detailsPerHost[hostname] = detail;
LOG_ASSERT(check.result() != CcResult::UNKNOWN);
if (check.result() == CcResult::CONN_FAIL) ++numFailedConns;
if (check.result() == CcResult::REVERSE_FAIL) ++numFailedReverse;
@@ -97,16 +111,12 @@ Connectivity::checkConnectivity(const ModelConfig &model) {
numFailedConns, clusterSize, pct, _config.maxBadOutPercent);
allChecksOk = false;
}
- std::vector<std::string> details;
- for (const auto & [hostname, check] : connectivityMap) {
- std::string detail = fmt("%s -> %s", hostname.c_str(), toString(check.result()));
- details.push_back(detail);
+ if (allChecksOk && (numFailedConns == 0) && (numFailedReverse == 0)) {
+ LOG(info, "All connectivity checks OK, proceeding with service startup");
+ } else if (allChecksOk) {
+ LOG(info, "Enough connectivity checks OK, proceeding with service startup");
}
- CheckResult result{false, false, {}};
- result.enoughOk = allChecksOk;
- result.allOk = (numFailedConns == 0) && (numFailedReverse == 0);
- result.details = std::move(details);
- return result;
+ return allChecksOk;
}
}