summaryrefslogtreecommitdiffstats
path: root/configd
diff options
context:
space:
mode:
author Arne H Juul <arnej27959@users.noreply.github.com> 2021-06-04 16:28:30 +0200
committer GitHub <noreply@github.com> 2021-06-04 16:28:30 +0200
commit 30fde50db70ee02af21c1d2f92096a76a475da1c (patch)
tree 6a2aa035672cbdcea33d9e0580963de23876c0dc /configd
parent de2d16aee7fc98317a7282687886683a4a6499a2 (diff)
parent dae3aad759563085301d651c438eb783c271c27e (diff)
Merge pull request #18128 from vespa-engine/arnej/new-connectivity-check
Arnej/new connectivity check
Diffstat (limited to 'configd')
-rw-r--r-- configd/src/apps/sentinel/CMakeLists.txt 1
-rw-r--r-- configd/src/apps/sentinel/config-owner.cpp 2
-rw-r--r-- configd/src/apps/sentinel/connectivity.cpp 112
-rw-r--r-- configd/src/apps/sentinel/connectivity.h 36
4 files changed, 150 insertions, 1 deletions
diff --git a/configd/src/apps/sentinel/CMakeLists.txt b/configd/src/apps/sentinel/CMakeLists.txt
index e77abc19077..43b4f79a0b2 100644
--- a/configd/src/apps/sentinel/CMakeLists.txt
+++ b/configd/src/apps/sentinel/CMakeLists.txt
@@ -4,6 +4,7 @@ vespa_add_executable(configd_config-sentinel_app
check-completion-handler.cpp
cmdq.cpp
config-owner.cpp
+ connectivity.cpp
env.cpp
line-splitter.cpp
manager.cpp
diff --git a/configd/src/apps/sentinel/config-owner.cpp b/configd/src/apps/sentinel/config-owner.cpp
index d5f06dff76b..26972911b29 100644
--- a/configd/src/apps/sentinel/config-owner.cpp
+++ b/configd/src/apps/sentinel/config-owner.cpp
@@ -52,7 +52,7 @@ ConfigOwner::fetchModelConfig(std::chrono::milliseconds timeout)
tempSubscriber.subscribe<ModelConfig>("admin/model", timeout);
if (tempSubscriber.nextGenerationNow()) {
modelConfig = modelHandle->getConfig();
- LOG(config, "Sentinel got model info [version %s] for %zd hosts [config generation %zd",
+ LOG(config, "Sentinel got model info [version %s] for %zd hosts [config generation %zd]",
modelConfig->vespaVersion.c_str(), modelConfig->hosts.size(),
tempSubscriber.getGeneration());
}
diff --git a/configd/src/apps/sentinel/connectivity.cpp b/configd/src/apps/sentinel/connectivity.cpp
new file mode 100644
index 00000000000..9cced1d3475
--- /dev/null
+++ b/configd/src/apps/sentinel/connectivity.cpp
@@ -0,0 +1,112 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "connectivity.h"
+#include "outward-check.h"
+#include <vespa/defaults.h>
+#include <vespa/log/log.h>
+#include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <thread>
+#include <chrono>
+
+LOG_SETUP(".connectivity");
+
+using vespalib::make_string_short::fmt;
+using namespace std::chrono_literals;
+
+namespace config::sentinel {
+
+// Keeps a copy of the connectivity settings and a reference to the RPC
+// server (the caller must keep the server alive for as long as this object
+// is used), then logs the two configured thresholds.
+Connectivity::Connectivity(const SentinelConfig::Connectivity & config, RpcServer &rpcServer)
+ : _config(config),
+ _rpcServer(rpcServer)
+{
+ LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount);
+ LOG(config, "connectivity.maxBadOutPercent = %d", _config.maxBadOutPercent);
+}
+
+// Defaulted, but defined here in the implementation file rather than inline in the header.
+Connectivity::~Connectivity() = default;
+
+namespace {
+
+/**
+ * Gives a human-readable description of a connectivity check result.
+ * A value outside the known enumerators is logged as an error and
+ * then aborts the process.
+ **/
+const char *toString(CcResult value) {
+    switch (value) {
+    case CcResult::UNKNOWN:             // very very bad
+        return "BAD: missing result";
+    case CcResult::REVERSE_FAIL:        // very bad
+        return "connect OK, but reverse check FAILED";
+    case CcResult::CONN_FAIL:           // bad
+        return "failed to connect";
+    case CcResult::REVERSE_UNAVAIL:     // unfortunate
+        return "connect OK (but reverse check unavailable)";
+    case CcResult::ALL_OK:              // good
+        return "OK: both ways connectivity verified";
+    }
+    // only reachable if the enum holds a value with no matching case above
+    LOG(error, "Unknown CcResult enum value: %d", static_cast<int>(value));
+    LOG_ABORT("Unknown CcResult enum value");
+}
+
+/**
+ * Collects, for every host in the model, the connection spec of its
+ * 'config-sentinel' RPC port.
+ *
+ * @param model the model config listing hosts, their services and ports
+ * @return map from host name to a spec of the form "tcp/<host>:<port>".
+ *         Hosts without a matching port are left out (a warning is logged
+ *         for each). If a host lists several matching ports, the last one
+ *         seen overwrites the earlier ones.
+ **/
+std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
+    std::map<std::string, std::string> checkSpecs;
+    for (const auto & h : model.hosts) {
+        bool foundSentinelPort = false;
+        for (const auto & s : h.services) {
+            if (s.name == "config-sentinel") {
+                for (const auto & p : s.ports) {
+                    // a port qualifies when "rpc" occurs anywhere in its tags string
+                    if (p.tags.find("rpc") != p.tags.npos) {
+                        auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number);
+                        checkSpecs[h.name] = spec;
+                        foundSentinelPort = true;
+                    }
+                }
+            }
+        }
+        if (! foundSentinelPort) {
+            // size() is unsigned, so it is printed with %zu (not the signed %zd)
+            LOG(warning, "Did not find 'config-sentinel' RPC port in model for host %s [%zu services]",
+                h.name.c_str(), h.services.size());
+        }
+    }
+    return checkSpecs;
+}
+
+}
+
+/**
+ * Starts one outward check per host that has a 'config-sentinel' RPC port
+ * in the model, waits on the shared latch, and summarizes the results.
+ *
+ * @param model the model config used to find the hosts to check
+ * @return a CheckResult where
+ *         - enoughOk is false if the number of failed reverse checks is
+ *           above maxBadReverseCount, or the percentage of failed outgoing
+ *           connections is above maxBadOutPercent (each case logs a warning);
+ *         - allOk is true only when no connect and no reverse check failed;
+ *         - details holds one "<host> -> <status>" line per checked host.
+ **/
+Connectivity::CheckResult
+Connectivity::checkConnectivity(const ModelConfig &model) {
+    const auto checkSpecs = specsFrom(model);
+    const size_t clusterSize = checkSpecs.size();
+    OutwardCheckContext checkContext(clusterSize,
+                                     vespa::Defaults::vespaHostname(),
+                                     _rpcServer.getPort(),
+                                     _rpcServer.orb());
+
+    // one check object per host, all sharing the same context
+    std::map<std::string, OutwardCheck> connectivityMap;
+    for (const auto & [ host, target ] : checkSpecs) {
+        connectivityMap.try_emplace(host, target, checkContext);
+    }
+    checkContext.latch.await();
+
+    // tally the two failure kinds that count against the thresholds
+    size_t numFailedConns = 0;
+    size_t numFailedReverse = 0;
+    for (const auto & [hostname, check] : connectivityMap) {
+        LOG_ASSERT(check.result() != CcResult::UNKNOWN);
+        switch (check.result()) {
+        case CcResult::CONN_FAIL:
+            ++numFailedConns;
+            break;
+        case CcResult::REVERSE_FAIL:
+            ++numFailedReverse;
+            break;
+        default:
+            break;
+        }
+    }
+
+    const bool tooManyReverseFails = numFailedReverse > size_t(_config.maxBadReverseCount);
+    if (tooManyReverseFails) {
+        LOG(warning, "%zu of %zu nodes report problems connecting to me (max is %d)",
+            numFailedReverse, clusterSize, _config.maxBadReverseCount);
+    }
+    const bool tooManyConnFails = numFailedConns * 100.0 > _config.maxBadOutPercent * clusterSize;
+    if (tooManyConnFails) {
+        double pct = numFailedConns * 100.0 / clusterSize;
+        LOG(warning, "Problems connecting to %zu of %zu nodes, %.2f %% (max is %d)",
+            numFailedConns, clusterSize, pct, _config.maxBadOutPercent);
+    }
+
+    std::vector<std::string> details;
+    details.reserve(connectivityMap.size());
+    for (const auto & [hostname, check] : connectivityMap) {
+        std::string detail = fmt("%s -> %s", hostname.c_str(), toString(check.result()));
+        details.push_back(std::move(detail));
+    }
+
+    const bool enoughOk = !(tooManyReverseFails || tooManyConnFails);
+    const bool allOk = (numFailedConns == 0) && (numFailedReverse == 0);
+    return CheckResult{enoughOk, allOk, std::move(details)};
+}
+
+}
diff --git a/configd/src/apps/sentinel/connectivity.h b/configd/src/apps/sentinel/connectivity.h
new file mode 100644
index 00000000000..0e32b5243e0
--- /dev/null
+++ b/configd/src/apps/sentinel/connectivity.h
@@ -0,0 +1,36 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "rpcserver.h"
+#include <vespa/config-sentinel.h>
+#include <vespa/config-model.h>
+#include <string>
+#include <vector>
+
+using cloud::config::SentinelConfig;
+using cloud::config::ModelConfig;
+
+namespace config::sentinel {
+
+/**
+ * Utility class for running connectivity check.
+ *
+ * Holds the connectivity thresholds from the sentinel config and the
+ * RPC server used when checking the hosts listed in a model config.
+ **/
+class Connectivity {
+public:
+ // Copies the settings; only a reference to rpcServer is kept.
+ Connectivity(const SentinelConfig::Connectivity & config, RpcServer &rpcServer);
+ ~Connectivity();
+
+ // Summary of one connectivity check run.
+ struct CheckResult {
+ // true when neither configured failure threshold was exceeded
+ bool enoughOk;
+ // true when no connect failure and no reverse-check failure was seen
+ bool allOk;
+ // one "<host> -> <status>" line per checked host
+ std::vector<std::string> details;
+ };
+
+ // Checks the hosts found in the given model and summarizes the results.
+ CheckResult checkConnectivity(const ModelConfig &model);
+private:
+ // own copy of the thresholds (maxBadReverseCount, maxBadOutPercent)
+ const SentinelConfig::Connectivity _config;
+ // not owned
+ RpcServer &_rpcServer;
+};
+
+}