diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2021-06-07 23:10:18 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-06-07 23:10:18 +0200 |
commit | 429ff98c842a124515f66b8bcdaf0c5f64c678e9 (patch) | |
tree | 99d73b2f7810d005ff29f5db78685290fc3292f3 /configd/src/apps/sentinel/env.cpp | |
parent | 4ffd09f5678559a5f71d3957514f1d52c61e88f0 (diff) | |
parent | a402fb4fab902b9f8c9a8859b1142e436bf439e3 (diff) |
Merge pull request #18132 from vespa-engine/arnej/actually-wait-for-connectivity
Arnej/actually wait for connectivity
Diffstat (limited to 'configd/src/apps/sentinel/env.cpp')
-rw-r--r-- | configd/src/apps/sentinel/env.cpp | 111 |
1 files changed, 44 insertions, 67 deletions
diff --git a/configd/src/apps/sentinel/env.cpp b/configd/src/apps/sentinel/env.cpp
index e4174ee450d..5bbbfd8f0bd 100644
--- a/configd/src/apps/sentinel/env.cpp
+++ b/configd/src/apps/sentinel/env.cpp
@@ -2,11 +2,12 @@
 
 #include "env.h"
 #include "check-completion-handler.h"
-#include "outward-check.h"
+#include "connectivity.h"
 #include <vespa/defaults.h>
 #include <vespa/log/log.h>
 #include <vespa/config/common/exceptions.h>
 #include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/signalhandler.h>
 #include <vespa/vespalib/util/stringfmt.h>
 #include <thread>
 #include <chrono>
@@ -18,8 +19,20 @@ using namespace std::chrono_literals;
 
 namespace config::sentinel {
 
+namespace {
+
+void maybeStopNow() {
+    if (vespalib::SignalHandler::INT.check() ||
+        vespalib::SignalHandler::TERM.check())
+    {
+        throw vespalib::FatalException("got signal during boot()");
+    }
+}
+
 constexpr std::chrono::milliseconds CONFIG_TIMEOUT_MS = 3min;
-constexpr std::chrono::milliseconds MODEL_TIMEOUT_MS = 1500ms;
+constexpr int maxConnectivityRetries = 100;
+
+} // namespace <unnamed>
 
 Env::Env()
   : _cfgOwner(),
@@ -31,6 +44,7 @@ Env::Env()
     _statePort(0)
 {
     _startMetrics.startedTime = vespalib::steady_clock::now();
+    _stateApi.myHealth.setFailed("initializing...");
 }
 
 Env::~Env() = default;
@@ -38,17 +52,36 @@ Env::~Env() = default;
 
 void Env::boot(const std::string &configId) {
     LOG(debug, "Reading configuration for ID: %s", configId.c_str());
     _cfgOwner.subscribe(configId, CONFIG_TIMEOUT_MS);
-    bool ok = _cfgOwner.checkForConfigUpdate(); // subscribe() should throw if something is not OK
-    LOG_ASSERT(ok && _cfgOwner.hasConfig());
-    const auto & cfg = _cfgOwner.getConfig();
-    LOG(config, "Booting sentinel '%s' with [stateserver port %d] and [rpc port %d]",
-        configId.c_str(), cfg.port.telnet, cfg.port.rpc);
-    rpcPort(cfg.port.rpc);
-    statePort(cfg.port.telnet);
-    if (auto up = ConfigOwner::fetchModelConfig(MODEL_TIMEOUT_MS)) {
-        waitForConnectivity(*up);
+    Connectivity checker;
+    for (int retry = 0; retry < maxConnectivityRetries; ++retry) {
+        bool changed = _cfgOwner.checkForConfigUpdate();
+        LOG_ASSERT(changed || retry > 0);
+        if (changed) {
+            LOG_ASSERT(_cfgOwner.hasConfig());
+            const auto & cfg = _cfgOwner.getConfig();
+            LOG(config, "Booting sentinel '%s' with [stateserver port %d] and [rpc port %d]",
+                configId.c_str(), cfg.port.telnet, cfg.port.rpc);
+            rpcPort(cfg.port.rpc);
+            statePort(cfg.port.telnet);
+            checker.configure(cfg.connectivity);
+        }
+        if (checker.checkConnectivity(*_rpcServer)) {
+            _stateApi.myHealth.setOk();
+            return;
+        } else {
+            _stateApi.myHealth.setFailed("FAILED connectivity check");
+            if ((retry % 10) == 0) {
+                LOG(warning, "Bad network connectivity (try %d)", 1+retry);
+            }
+            for (int i = 0; i < 5; ++i) {
+                respondAsEmpty();
+                maybeStopNow();
+                std::this_thread::sleep_for(600ms);
+            }
+        }
     }
+    throw vespalib::FatalException("Giving up - too many connectivity check failures");
 }
 
 void Env::rpcPort(int port) {
@@ -93,60 +126,4 @@ void Env::respondAsEmpty() {
     }
 }
 
-namespace {
-
-const char *toString(CcResult value) {
-    switch (value) {
-        case CcResult::UNKNOWN: return "unknown";
-        case CcResult::CONN_FAIL: return "failed to connect";
-        case CcResult::REVERSE_FAIL: return "connect OK, but reverse check FAILED";
-        case CcResult::REVERSE_UNAVAIL: return "connect OK, but reverse check unavailable";
-        case CcResult::ALL_OK: return "both ways connectivity OK";
-    }
-    LOG(error, "Unknown CcResult enum value: %d", (int)value);
-    LOG_ABORT("Unknown CcResult enum value");
-}
-
-std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
-    std::map<std::string, std::string> checkSpecs;
-    for (const auto & h : model.hosts) {
-        bool foundSentinelPort = false;
-        for (const auto & s : h.services) {
-            if (s.name == "config-sentinel") {
-                for (const auto & p : s.ports) {
-                    if (p.tags.find("rpc") != p.tags.npos) {
-                        auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number);
-                        checkSpecs[h.name] = spec;
-                        foundSentinelPort = true;
-                    }
-                }
-            }
-        }
-        if (! foundSentinelPort) {
-            LOG(warning, "Did not find 'config-sentinel' RPC port in model for host %s [%zd services]",
-                h.name.c_str(), h.services.size());
-        }
-    }
-    return checkSpecs;
-}
-
-}
-
-void Env::waitForConnectivity(const ModelConfig &model) {
-    auto checkSpecs = specsFrom(model);
-    OutwardCheckContext checkContext(checkSpecs.size(),
-                                     vespa::Defaults::vespaHostname(),
-                                     _rpcServer->getPort(),
-                                     _rpcServer->orb());
-    std::map<std::string, OutwardCheck> connectivityMap;
-    for (const auto & [ hn, spec ] : checkSpecs) {
-        connectivityMap.try_emplace(hn, spec, checkContext);
-    }
-    checkContext.latch.await();
-    for (const auto & [hostname, check] : connectivityMap) {
-        LOG(info, "outward check status for host %s is: %s",
-            hostname.c_str(), toString(check.result()));
-    }
-}
-
 } |