aboutsummaryrefslogtreecommitdiffstats
path: root/configd/src/apps/sentinel/connectivity.cpp
blob: 9cced1d3475a23d08af3f2278021560813a371d7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "connectivity.h"
#include "outward-check.h"
#include <vespa/defaults.h>
#include <vespa/log/log.h>
#include <vespa/vespalib/util/exceptions.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <thread>
#include <chrono>

LOG_SETUP(".connectivity");

using vespalib::make_string_short::fmt;
using namespace std::chrono_literals;

namespace config::sentinel {

/**
 * Set up a connectivity checker.
 *
 * @param config    connectivity limits; initializes the _config member
 * @param rpcServer RPC server; initializes the _rpcServer member
 *
 * The two configured limits are written to the log at "config" level
 * so the effective settings can be seen at startup.
 */
Connectivity::Connectivity(const SentinelConfig::Connectivity & config, RpcServer &rpcServer)
  : _config(config), _rpcServer(rpcServer)
{
    // Limit on the number of peers allowed to report a failed reverse check.
    LOG(config, "connectivity.maxBadReverseCount = %d", _config.maxBadReverseCount);
    // Limit on the percentage of peers we are allowed to fail connecting to.
    LOG(config, "connectivity.maxBadOutPercent = %d", _config.maxBadOutPercent);
}

// Defaulted destructor, defined out of line in this translation unit.
Connectivity::~Connectivity() = default;

namespace {

/**
 * Translate a connectivity check result into a human-readable description.
 *
 * @param value the result to describe
 * @return a pointer to a string literal describing the result
 *
 * A value that matches none of the known enumerators is logged as an
 * error, after which LOG_ABORT is invoked.
 */
const char *toString(CcResult value) {
    const char *description = nullptr;
    // Cases are listed from worst to best outcome.
    switch (value) {
        case CcResult::UNKNOWN: // very very bad
            description = "BAD: missing result";
            break;
        case CcResult::REVERSE_FAIL: // very bad
            description = "connect OK, but reverse check FAILED";
            break;
        case CcResult::CONN_FAIL: // bad
            description = "failed to connect";
            break;
        case CcResult::REVERSE_UNAVAIL: // unfortunate
            description = "connect OK (but reverse check unavailable)";
            break;
        case CcResult::ALL_OK: // good
            description = "OK: both ways connectivity verified";
            break;
    }
    if (description != nullptr) {
        return description;
    }
    // Only reachable when 'value' holds something outside the enumerators above.
    LOG(error, "Unknown CcResult enum value: %d", static_cast<int>(value));
    LOG_ABORT("Unknown CcResult enum value");
}

/**
 * Collect the connection spec of the config-sentinel RPC port for each host
 * in the model.
 *
 * @param model the model config listing hosts, their services and ports
 * @return map from host name to a spec of the form "tcp/<host>:<port>"
 *
 * For every host, all services named "config-sentinel" are inspected and
 * each port whose tags contain the substring "rpc" produces a spec. If a
 * host has several matching ports the last one seen is kept. Hosts with
 * no matching port are left out of the result and a warning is logged.
 */
std::map<std::string, std::string> specsFrom(const ModelConfig &model) {
    std::map<std::string, std::string> checkSpecs;
    for (const auto & h : model.hosts) {
        bool foundSentinelPort = false;
        for (const auto & s : h.services) {
            if (s.name == "config-sentinel") {
                for (const auto & p : s.ports) {
                    // Substring match: any tag text containing "rpc" qualifies.
                    if (p.tags.find("rpc") != p.tags.npos) {
                        auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number);
                        // operator[] overwrites, so a later matching port replaces an earlier one.
                        checkSpecs[h.name] = spec;
                        foundSentinelPort = true;
                    }
                }
            }
        }
        if (! foundSentinelPort) {
            // size() is an unsigned size_t, so the conversion must be %zu (not the signed %zd).
            LOG(warning, "Did not find 'config-sentinel' RPC port in model for host %s [%zu services]",
                h.name.c_str(), h.services.size());
        }
    }
    return checkSpecs;
}

}

/**
 * Run a connectivity check against the config-sentinel of every host found
 * in the model and summarize the outcome.
 *
 * @param model model config used to find the hosts and ports to check
 * @return a CheckResult where
 *         - enoughOk is false if the number of failed reverse checks exceeds
 *           maxBadReverseCount, or if the share of failed outgoing
 *           connections exceeds maxBadOutPercent (a warning is logged for
 *           each limit that is exceeded);
 *         - allOk is true only when no check ended in CONN_FAIL or
 *           REVERSE_FAIL;
 *         - details holds one "<host> -> <description>" line per host.
 *
 * Results are read only after the check context's latch has been awaited;
 * a result that is still UNKNOWN at that point trips LOG_ASSERT.
 */
Connectivity::CheckResult
Connectivity::checkConnectivity(const ModelConfig &model) {
    const auto targets = specsFrom(model);
    const size_t clusterSize = targets.size();
    OutwardCheckContext checkContext(clusterSize,
                                     vespa::Defaults::vespaHostname(),
                                     _rpcServer.getPort(),
                                     _rpcServer.orb());
    // One OutwardCheck per host, constructed in place inside the map.
    std::map<std::string, OutwardCheck> checksByHost;
    for (const auto & target : targets) {
        checksByHost.try_emplace(target.first, target.second, checkContext);
    }
    checkContext.latch.await();

    // Tally the two failure categories that count against the limits.
    size_t failedOutgoing = 0;
    size_t failedReverse = 0;
    for (const auto & entry : checksByHost) {
        const auto & check = entry.second;
        LOG_ASSERT(check.result() != CcResult::UNKNOWN);
        if (check.result() == CcResult::CONN_FAIL) {
            ++failedOutgoing;
        }
        if (check.result() == CcResult::REVERSE_FAIL) {
            ++failedReverse;
        }
    }

    bool withinLimits = true;
    if (failedReverse > static_cast<size_t>(_config.maxBadReverseCount)) {
        LOG(warning, "%zu of %zu nodes report problems connecting to me (max is %d)",
            failedReverse, clusterSize, _config.maxBadReverseCount);
        withinLimits = false;
    }
    // Compare as failed*100 > maxPercent*size so the test itself needs no division.
    const double failedTimes100 = failedOutgoing * 100.0;
    if (failedTimes100 > _config.maxBadOutPercent * clusterSize) {
        const double pct = failedTimes100 / clusterSize;
        LOG(warning, "Problems connecting to %zu of %zu nodes, %.2f %% (max is %d)",
            failedOutgoing, clusterSize, pct, _config.maxBadOutPercent);
        withinLimits = false;
    }

    // Human-readable per-host summary, in map (host name) order.
    std::vector<std::string> lines;
    for (const auto & entry : checksByHost) {
        std::string line = fmt("%s -> %s", entry.first.c_str(), toString(entry.second.result()));
        lines.push_back(std::move(line));
    }

    CheckResult summary{false, false, {}};
    summary.details = std::move(lines);
    summary.allOk = (failedOutgoing == 0) && (failedReverse == 0);
    summary.enoughOk = withinLimits;
    return summary;
}

}