1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "connectivity.h"
#include "outward-check.h"
#include <vespa/defaults.h>
#include <vespa/log/log.h>
#include <vespa/vespalib/util/exceptions.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <thread>
#include <chrono>
LOG_SETUP(".connectivity");
using vespalib::make_string_short::fmt;
using namespace std::chrono_literals;
namespace config::sentinel {
// Construct a connectivity checker.
//
// Initialises the _config and _rpcServer members from the two arguments and
// logs, at 'config' level, the two limits that checkConnectivity() compares
// its failure counts against.
Connectivity::Connectivity(const SentinelConfig::Connectivity & config, RpcServer &rpcServer)
: _config(config),
_rpcServer(rpcServer)
{
// limit on the number of peers reporting a failed reverse check
LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount);
// limit on the percentage of peers we fail to connect to
LOG(config, "connectivity.maxBadOutPercent = %d", _config.maxBadOutPercent);
}
// Nothing to release explicitly; the compiler-generated destructor is used.
Connectivity::~Connectivity() = default;
namespace {
// Map a connectivity check result to the human readable text used in the
// per-host detail lines.
//
// Every enumerator handled below yields a string literal.  Any other value
// is logged as an error and then the process is aborted via LOG_ABORT.
const char *toString(CcResult value) {
    const char *description = nullptr;
    switch (value) {
    case CcResult::UNKNOWN:          // very very bad
        description = "BAD: missing result";
        break;
    case CcResult::REVERSE_FAIL:     // very bad
        description = "connect OK, but reverse check FAILED";
        break;
    case CcResult::CONN_FAIL:        // bad
        description = "failed to connect";
        break;
    case CcResult::REVERSE_UNAVAIL:  // unfortunate
        description = "connect OK (but reverse check unavailable)";
        break;
    case CcResult::ALL_OK:           // good
        description = "OK: both ways connectivity verified";
        break;
    }
    if (description != nullptr) {
        return description;
    }
    // Only reachable when 'value' holds something outside the enumerators above.
    LOG(error, "Unknown CcResult enum value: %d", static_cast<int>(value));
    LOG_ABORT("Unknown CcResult enum value");
}
// Collect the connection spec of the config-sentinel RPC port for each host
// in the model.
//
// Returns a map from host name to a spec of the form "tcp/<host>:<port>".
// A port qualifies when it belongs to a service named "config-sentinel" and
// its tags contain the substring "rpc".  Hosts without such a port get no
// entry in the map, and a warning is logged for each of them.
std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
    std::map<std::string, std::string> checkSpecs;
    for (const auto & h : model.hosts) {
        bool foundSentinelPort = false;
        for (const auto & s : h.services) {
            if (s.name == "config-sentinel") {
                for (const auto & p : s.ports) {
                    if (p.tags.find("rpc") != p.tags.npos) {
                        auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number);
                        // If several ports match, the last one seen wins.
                        checkSpecs[h.name] = spec;
                        foundSentinelPort = true;
                    }
                }
            }
        }
        if (! foundSentinelPort) {
            // The service count is an unsigned size, so it is printed with
            // %zu (the signed %zd does not match the argument type).
            LOG(warning, "Did not find 'config-sentinel' RPC port in model for host %s [%zu services]",
                h.name.c_str(), h.services.size());
        }
    }
    return checkSpecs;
}
}
// Check connectivity between this node and every host in the model that
// exposes a config-sentinel RPC port.
//
// One OutwardCheck is created per host, all sharing a single
// OutwardCheckContext, and the call then blocks on the context's latch.
// NOTE(review): presumably each OutwardCheck counts the latch down when it
// completes, so that every result is final once await() returns - confirm
// against outward-check.h.
//
// The returned CheckResult holds:
//   enoughOk - false when the number of failed reverse checks exceeds
//              maxBadReverseCount, or the percentage of failed outgoing
//              connections exceeds maxBadOutPercent; true otherwise
//   allOk    - true only when no check failed in either direction
//   details  - one "<hostname> -> <result text>" line per checked host
Connectivity::CheckResult
Connectivity::checkConnectivity(const ModelConfig &model) {
    const auto checkSpecs = specsFrom(model);
    const size_t clusterSize = checkSpecs.size();
    OutwardCheckContext checkContext(clusterSize,
                                     vespa::Defaults::vespaHostname(),
                                     _rpcServer.getPort(),
                                     _rpcServer.orb());
    std::map<std::string, OutwardCheck> connectivityMap;
    for (const auto & [ hn, spec ] : checkSpecs) {
        connectivityMap.try_emplace(hn, spec, checkContext);
    }
    checkContext.latch.await();

    // Single pass over the results: count the failures and build the
    // per-host detail lines at the same time.
    size_t numFailedConns = 0;
    size_t numFailedReverse = 0;
    std::vector<std::string> details;
    details.reserve(connectivityMap.size());
    for (const auto & [hostname, check] : connectivityMap) {
        const auto outcome = check.result();
        LOG_ASSERT(outcome != CcResult::UNKNOWN);
        if (outcome == CcResult::CONN_FAIL) ++numFailedConns;
        if (outcome == CcResult::REVERSE_FAIL) ++numFailedReverse;
        std::string detail = fmt("%s -> %s", hostname.c_str(), toString(outcome));
        details.push_back(std::move(detail));
    }

    bool allChecksOk = true;
    if (numFailedReverse > size_t(_config.maxBadReverseCount)) {
        LOG(warning, "%zu of %zu nodes report problems connecting to me (max is %d)",
            numFailedReverse, clusterSize, _config.maxBadReverseCount);
        allChecksOk = false;
    }
    // Compare failed/clusterSize against the percentage limit without dividing.
    // With an empty cluster both sides are zero, so the division below is
    // never reached in that case.
    if (numFailedConns * 100.0 > _config.maxBadOutPercent * clusterSize) {
        double pct = numFailedConns * 100.0 / clusterSize;
        LOG(warning, "Problems connecting to %zu of %zu nodes, %.2f %% (max is %d)",
            numFailedConns, clusterSize, pct, _config.maxBadOutPercent);
        allChecksOk = false;
    }

    CheckResult result{false, false, {}};
    result.enoughOk = allChecksOk;
    result.allOk = (numFailedConns == 0) && (numFailedReverse == 0);
    result.details = std::move(details);
    return result;
}
}
|