about | summary | refs | log | tree | commit | diff | stats
path: root/configd
diff options
context:
space:
mode:
author Arne Juul <arnej@verizonmedia.com> 2021-06-07 10:26:32 +0000
committer Arne Juul <arnej@verizonmedia.com> 2021-06-07 11:45:04 +0000
commit f16f846f7486febb25069284278034ce597dc164 (patch)
tree f1ee2619d91cf2be24e581666d9609543a1f9f2e /configd
parent fb725932a4a9e0b206d79282606e046761caea70 (diff)
simplify after review
Diffstat (limited to 'configd')
-rw-r--r-- configd/src/apps/sentinel/connectivity.cpp 62
-rw-r--r-- configd/src/apps/sentinel/connectivity.h 20
-rw-r--r-- configd/src/apps/sentinel/env.cpp 68
-rw-r--r-- configd/src/apps/sentinel/env.h 4
4 files changed, 64 insertions, 90 deletions
diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp
index 7d9134ca51f..74f11f086d6 100644
--- a/configd/src/apps/sentinel/connectivity.cpp
+++ b/configd/src/apps/sentinel/connectivity.cpp
@@ -1,5 +1,6 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "config-owner.h"
#include "connectivity.h"
#include "outward-check.h"
#include <vespa/defaults.h>
@@ -16,20 +17,9 @@ using namespace std::chrono_literals;
namespace config::sentinel {
-void Connectivity::CheckResult::logDetails() const {
- for (const std::string &detail : details) {
- LOG(info, "Connectivity check details: %s", detail.c_str());
- }
-}
-
-Connectivity::Connectivity(const SentinelConfig::Connectivity & config, RpcServer &rpcServer)
- : _config(config),
- _rpcServer(rpcServer)
-{
- LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount);
- LOG(config, "connectivity.maxBadOutPercent = %d", _config.maxBadOutPercent);
-}
+constexpr std::chrono::milliseconds MODEL_TIMEOUT_MS = 60s;
+Connectivity::Connectivity() = default;
Connectivity::~Connectivity() = default;
namespace {
@@ -71,16 +61,28 @@ std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
}
-Connectivity::CheckResult
-Connectivity::checkConnectivity(const ModelConfig &model) {
- const auto checkSpecs = specsFrom(model);
- size_t clusterSize = checkSpecs.size();
+void Connectivity::configure(const SentinelConfig::Connectivity &config) {
+ _config = config;
+ LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount);
+ LOG(config, "connectivity.maxBadOutPercent = %d", _config.maxBadOutPercent);
+ if (auto up = ConfigOwner::fetchModelConfig(MODEL_TIMEOUT_MS)) {
+ _checkSpecs = specsFrom(*up);
+ }
+}
+
+bool
+Connectivity::checkConnectivity(RpcServer &rpcServer) {
+ size_t clusterSize = _checkSpecs.size();
+ if (clusterSize == 0) {
+ LOG(warning, "could not get model config, skipping connectivity checks");
+ return true;
+ }
OutwardCheckContext checkContext(clusterSize,
vespa::Defaults::vespaHostname(),
- _rpcServer.getPort(),
- _rpcServer.orb());
+ rpcServer.getPort(),
+ rpcServer.orb());
std::map<std::string, OutwardCheck> connectivityMap;
- for (const auto & [ hn, spec ] : checkSpecs) {
+ for (const auto & [ hn, spec ] : _checkSpecs) {
connectivityMap.try_emplace(hn, spec, checkContext);
}
checkContext.latch.await();
@@ -88,6 +90,12 @@ Connectivity::checkConnectivity(const ModelConfig &model) {
size_t numFailedReverse = 0;
bool allChecksOk = true;
for (const auto & [hostname, check] : connectivityMap) {
+ const char *detail = toString(check.result());
+ std::string prev = _detailsPerHost[hostname];
+ if (prev != detail) {
+ LOG(info, "Connectivity check details: %s -> %s", hostname.c_str(), detail);
+ }
+ _detailsPerHost[hostname] = detail;
LOG_ASSERT(check.result() != CcResult::UNKNOWN);
if (check.result() == CcResult::CONN_FAIL) ++numFailedConns;
if (check.result() == CcResult::REVERSE_FAIL) ++numFailedReverse;
@@ -103,16 +111,12 @@ Connectivity::checkConnectivity(const ModelConfig &model) {
numFailedConns, clusterSize, pct, _config.maxBadOutPercent);
allChecksOk = false;
}
- std::vector<std::string> details;
- for (const auto & [hostname, check] : connectivityMap) {
- std::string detail = fmt("%s -> %s", hostname.c_str(), toString(check.result()));
- details.push_back(detail);
+ if (allChecksOk && (numFailedConns == 0) && (numFailedReverse == 0)) {
+ LOG(info, "All connectivity checks OK, proceeding with service startup");
+ } else if (allChecksOk) {
+ LOG(info, "Enough connectivity checks OK, proceeding with service startup");
}
- CheckResult result{false, false, {}};
- result.enoughOk = allChecksOk;
- result.allOk = (numFailedConns == 0) && (numFailedReverse == 0);
- result.details = std::move(details);
- return result;
+ return allChecksOk;
}
}
diff --git a/configd/src/apps/sentinel/connectivity.h b/configd/src/apps/sentinel/connectivity.h
index 69cea835da6..1c7ee8ddc57 100644
--- a/configd/src/apps/sentinel/connectivity.h
+++ b/configd/src/apps/sentinel/connectivity.h
@@ -6,7 +6,7 @@
#include <vespa/config-sentinel.h>
#include <vespa/config-model.h>
#include <string>
-#include <vector>
+#include <map>
using cloud::config::SentinelConfig;
using cloud::config::ModelConfig;
@@ -18,20 +18,14 @@ namespace config::sentinel {
**/
class Connectivity {
public:
- Connectivity(const SentinelConfig::Connectivity & config, RpcServer &rpcServer);
+ Connectivity();
~Connectivity();
-
- struct CheckResult {
- bool enoughOk;
- bool allOk;
- std::vector<std::string> details;
- void logDetails() const;
- };
-
- CheckResult checkConnectivity(const ModelConfig &model);
+ void configure(const SentinelConfig::Connectivity &config);
+ bool checkConnectivity(RpcServer &rpcServer);
private:
- const SentinelConfig::Connectivity _config;
- RpcServer &_rpcServer;
+ SentinelConfig::Connectivity _config;
+ std::map<std::string, std::string> _checkSpecs;
+ std::map<std::string, std::string> _detailsPerHost;
};
}
diff --git a/configd/src/apps/sentinel/env.cpp b/configd/src/apps/sentinel/env.cpp
index ded615a3c4a..9763956bada 100644
--- a/configd/src/apps/sentinel/env.cpp
+++ b/configd/src/apps/sentinel/env.cpp
@@ -30,7 +30,7 @@ void maybeStopNow() {
}
constexpr std::chrono::milliseconds CONFIG_TIMEOUT_MS = 3min;
-constexpr std::chrono::milliseconds MODEL_TIMEOUT_MS = 1500ms;
+constexpr int maxConnectivityRetries = 100;
} // namespace <unnamed>
@@ -53,22 +53,34 @@ void Env::boot(const std::string &configId) {
LOG(debug, "Reading configuration for ID: %s", configId.c_str());
_cfgOwner.subscribe(configId, CONFIG_TIMEOUT_MS);
// subscribe() should throw if something is not OK
- for (int retry = 0; retry < maxRetryLoops; ++retry) {
- _cfgOwner.checkForConfigUpdate();
- LOG_ASSERT(_cfgOwner.hasConfig());
- const auto & cfg = _cfgOwner.getConfig();
- LOG(config, "Booting sentinel '%s' with [stateserver port %d] and [rpc port %d]",
- configId.c_str(), cfg.port.telnet, cfg.port.rpc);
- rpcPort(cfg.port.rpc);
- statePort(cfg.port.telnet);
- if (waitForConnectivity(retry)) {
+ Connectivity checker;
+ for (int retry = 0; retry < maxConnectivityRetries; ++retry) {
+ bool changed = _cfgOwner.checkForConfigUpdate();
+ if (changed) {
+ LOG_ASSERT(_cfgOwner.hasConfig());
+ const auto & cfg = _cfgOwner.getConfig();
+ LOG(config, "Booting sentinel '%s' with [stateserver port %d] and [rpc port %d]",
+ configId.c_str(), cfg.port.telnet, cfg.port.rpc);
+ rpcPort(cfg.port.rpc);
+ statePort(cfg.port.telnet);
+ checker.configure(cfg.connectivity);
+ }
+ if (checker.checkConnectivity(*_rpcServer)) {
_stateApi.myHealth.setOk();
return;
} else {
- LOG(warning, "Bad network connectivity, retry from start");
+ _stateApi.myHealth.setFailed("FAILED connectivity check");
+ if ((retry % 10) == 0) {
+ LOG(warning, "Bad network connectivity (try %d)", 1+retry);
+ }
+ for (int i = 0; i < 5; ++i) {
+ respondAsEmpty();
+ maybeStopNow();
+ std::this_thread::sleep_for(600ms);
+ }
}
}
- throw InvalidConfigException("Giving up - too many connectivity check failures");
+ throw vespalib::FatalException("Giving up - too many connectivity check failures");
}
void Env::rpcPort(int port) {
@@ -113,36 +125,4 @@ void Env::respondAsEmpty() {
}
}
-bool Env::waitForConnectivity(int outerRetry) {
- auto up = ConfigOwner::fetchModelConfig(MODEL_TIMEOUT_MS);
- if (! up) {
- LOG(warning, "could not get model config, skipping connectivity checks");
- return true;
- }
- Connectivity::CheckResult lastCheckResult;
- Connectivity checker(_cfgOwner.getConfig().connectivity, *_rpcServer);
- for (int retry = 0; retry < maxRetriesInsideLoop; ++retry) {
- auto res = checker.checkConnectivity(*up);
- if (res.enoughOk) {
- LOG(info, "Connectivity check OK, proceeding with service startup");
- if (retry > 0 || ! res.allOk) {
- res.logDetails();
- }
- return true;
- }
- LOG(warning, "Connectivity check FAILED (try %d)", 1 + retry + maxRetriesInsideLoop*outerRetry);
- _stateApi.myHealth.setFailed("FAILED connectivity check");
- if (lastCheckResult.details != res.details) {
- res.logDetails();
- lastCheckResult = std::move(res);
- }
- for (int i = 0; i <= outerRetry; ++i) {
- respondAsEmpty();
- maybeStopNow();
- std::this_thread::sleep_for(1s);
- }
- }
- return false;
-}
-
}
diff --git a/configd/src/apps/sentinel/env.h b/configd/src/apps/sentinel/env.h
index 9a347e8cd85..f71fb537068 100644
--- a/configd/src/apps/sentinel/env.h
+++ b/configd/src/apps/sentinel/env.h
@@ -25,9 +25,6 @@ public:
CommandQueue &commandQueue() { return _rpcCommandQueue; }
StartMetrics &metrics() { return _startMetrics; }
- static constexpr int maxRetryLoops = 5;
- static constexpr int maxRetriesInsideLoop = 10;
-
void boot(const std::string &configId);
void rpcPort(int portnum);
void statePort(int portnum);
@@ -35,7 +32,6 @@ public:
void notifyConfigUpdated();
private:
void respondAsEmpty();
- bool waitForConnectivity(int outerRetry);
ConfigOwner _cfgOwner;
CommandQueue _rpcCommandQueue;
std::unique_ptr<RpcServer> _rpcServer;