diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2021-06-02 16:16:50 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-06-02 16:16:50 +0200 |
commit | f17496e440389c12f1f8028a5937ef0c93a5c7d8 (patch) | |
tree | 1d6147a9bd26290217450ee7d4406153bd336d67 | |
parent | 41cf24e114e7c0f3aa9e8e882725c52b77470c8e (diff) | |
parent | e8d528c3c1357b7c009734db01f63155bfb15613 (diff) |
Merge pull request #18082 from vespa-engine/arnej/outward-conn-check
Arnej/outward conn check
-rw-r--r-- | configd/src/apps/sentinel/CMakeLists.txt | 1 | ||||
-rw-r--r-- | configd/src/apps/sentinel/env.cpp | 87 | ||||
-rw-r--r-- | configd/src/apps/sentinel/env.h | 3 | ||||
-rw-r--r-- | configd/src/apps/sentinel/manager.cpp | 2 | ||||
-rw-r--r-- | configd/src/apps/sentinel/manager.h | 2 | ||||
-rw-r--r-- | configd/src/apps/sentinel/outward-check.cpp | 55 | ||||
-rw-r--r-- | configd/src/apps/sentinel/outward-check.h | 46 | ||||
-rw-r--r-- | configd/src/apps/sentinel/peer-check.cpp | 6 | ||||
-rw-r--r-- | configd/src/apps/sentinel/peer-check.h | 4 | ||||
-rw-r--r-- | configd/src/apps/sentinel/rpchooks.cpp | 10 | ||||
-rw-r--r-- | configd/src/apps/sentinel/rpcserver.h | 1 | ||||
-rw-r--r-- | configd/src/apps/sentinel/sentinel.cpp | 4 |
12 files changed, 187 insertions, 34 deletions
diff --git a/configd/src/apps/sentinel/CMakeLists.txt b/configd/src/apps/sentinel/CMakeLists.txt index d67a41f2a75..e77abc19077 100644 --- a/configd/src/apps/sentinel/CMakeLists.txt +++ b/configd/src/apps/sentinel/CMakeLists.txt @@ -9,6 +9,7 @@ vespa_add_executable(configd_config-sentinel_app manager.cpp metrics.cpp output-connection.cpp + outward-check.cpp peer-check.cpp rpchooks.cpp rpcserver.cpp diff --git a/configd/src/apps/sentinel/env.cpp b/configd/src/apps/sentinel/env.cpp index 45eea3c6417..e4174ee450d 100644 --- a/configd/src/apps/sentinel/env.cpp +++ b/configd/src/apps/sentinel/env.cpp @@ -2,14 +2,18 @@ #include "env.h" #include "check-completion-handler.h" +#include "outward-check.h" +#include <vespa/defaults.h> #include <vespa/log/log.h> #include <vespa/config/common/exceptions.h> #include <vespa/vespalib/util/exceptions.h> +#include <vespa/vespalib/util/stringfmt.h> #include <thread> #include <chrono> LOG_SETUP(".env"); +using vespalib::make_string_short::fmt; using namespace std::chrono_literals; namespace config::sentinel { @@ -43,21 +47,7 @@ void Env::boot(const std::string &configId) { rpcPort(cfg.port.rpc); statePort(cfg.port.telnet); if (auto up = ConfigOwner::fetchModelConfig(MODEL_TIMEOUT_MS)) { - const ModelConfig &model = *up; - for (const auto & h : model.hosts) { - LOG(info, "- Model for host %s with %zd services", h.name.c_str(), h.services.size()); - for (const auto & s : h.services) { - if (s.name == "config-sentinel") { - LOG(info, " - Model for service %s type %s configid %s with %zd ports", - s.name.c_str(), s.type.c_str(), s.configid.c_str(), s.ports.size()); - for (const auto & p : s.ports) { - if (p.tags.find("rpc") != p.tags.npos) { - LOG(info, " - Model for port %d has tags %s", p.number, p.tags.c_str()); - } - } - } - } - } + waitForConnectivity(*up); } } @@ -79,12 +69,12 @@ void Env::statePort(int port) { throw vespalib::FatalException("Bad port " + std::to_string(port) + ", expected range [1, 65535]", VESPA_STRLOC); } if (port == 0) { - port = 19098; + port = 19098; // default in config } if (_stateServer && port == _statePort) { return; // ok already } - LOG(debug, "Config-sentinel accepts connections on port %d", port); + LOG(debug, "Config-sentinel accepts state connections on port %d", port); _stateServer = std::make_unique<vespalib::StateServer>( port, _stateApi.myHealth, _startMetrics.producer, _stateApi.myComponents); _statePort = port; @@ -96,8 +86,67 @@ void Env::notifyConfigUpdated() { } -void Env::handleCmd(Cmd::UP cmd) { - cmd->retError("still booting, not ready for all RPC commands"); +void Env::respondAsEmpty() { + auto commands = _rpcCommandQueue.drain(); + for (Cmd::UP &cmd : commands) { + cmd->retError("still booting, not ready for all RPC commands"); + } +} + +namespace { + +const char *toString(CcResult value) { + switch (value) { + case CcResult::UNKNOWN: return "unknown"; + case CcResult::CONN_FAIL: return "failed to connect"; + case CcResult::REVERSE_FAIL: return "connect OK, but reverse check FAILED"; + case CcResult::REVERSE_UNAVAIL: return "connect OK, but reverse check unavailable"; + case CcResult::ALL_OK: return "both ways connectivity OK"; + } + LOG(error, "Unknown CcResult enum value: %d", (int)value); + LOG_ABORT("Unknown CcResult enum value"); +} + +std::map<std::string, std::string> specsFrom(const ModelConfig &model) { + std::map<std::string, std::string> checkSpecs; + for (const auto & h : model.hosts) { + bool foundSentinelPort = false; + for (const auto & s : h.services) { + if (s.name == "config-sentinel") { + for (const auto & p : s.ports) { + if (p.tags.find("rpc") != p.tags.npos) { + auto spec = fmt("tcp/%s:%d", h.name.c_str(), p.number); + checkSpecs[h.name] = spec; + foundSentinelPort = true; + } + } + } + } + if (! foundSentinelPort) { + LOG(warning, "Did not find 'config-sentinel' RPC port in model for host %s [%zd services]", + h.name.c_str(), h.services.size()); + } + } + return checkSpecs; +} + +} + +void Env::waitForConnectivity(const ModelConfig &model) { + auto checkSpecs = specsFrom(model); + OutwardCheckContext checkContext(checkSpecs.size(), + vespa::Defaults::vespaHostname(), + _rpcServer->getPort(), + _rpcServer->orb()); + std::map<std::string, OutwardCheck> connectivityMap; + for (const auto & [ hn, spec ] : checkSpecs) { + connectivityMap.try_emplace(hn, spec, checkContext); + } + checkContext.latch.await(); + for (const auto & [hostname, check] : connectivityMap) { + LOG(info, "outward check status for host %s is: %s", + hostname.c_str(), toString(check.result())); + } } } diff --git a/configd/src/apps/sentinel/env.h b/configd/src/apps/sentinel/env.h index 0213fd09460..f117854f006 100644 --- a/configd/src/apps/sentinel/env.h +++ b/configd/src/apps/sentinel/env.h @@ -30,8 +30,9 @@ public: void statePort(int portnum); void notifyConfigUpdated(); - void handleCmd(Cmd::UP cmd); private: + void respondAsEmpty(); + void waitForConnectivity(const ModelConfig &model); ConfigOwner _cfgOwner; CommandQueue _rpcCommandQueue; std::unique_ptr<RpcServer> _rpcServer; diff --git a/configd/src/apps/sentinel/manager.cpp b/configd/src/apps/sentinel/manager.cpp index 80dd2a3fda8..6e0ed78211c 100644 --- a/configd/src/apps/sentinel/manager.cpp +++ b/configd/src/apps/sentinel/manager.cpp @@ -109,7 +109,7 @@ Manager::doConfigure() } -int +bool Manager::doWork() { // Return true if there are any running services, false if not. diff --git a/configd/src/apps/sentinel/manager.h b/configd/src/apps/sentinel/manager.h index 9d3d14c6aeb..24bd67cbc49 100644 --- a/configd/src/apps/sentinel/manager.h +++ b/configd/src/apps/sentinel/manager.h @@ -55,7 +55,7 @@ public: Manager(Env &env); virtual ~Manager(); bool terminate(); - int doWork(); + bool doWork(); void updateActiveFdset(fd_set *fds, int *maxNum); }; diff --git a/configd/src/apps/sentinel/outward-check.cpp b/configd/src/apps/sentinel/outward-check.cpp new file mode 100644 index 00000000000..5fed69d0b6e --- /dev/null +++ b/configd/src/apps/sentinel/outward-check.cpp @@ -0,0 +1,55 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "outward-check.h" +#include <vespa/log/log.h> + +LOG_SETUP(".outward-check"); + +namespace config::sentinel { + +OutwardCheck::OutwardCheck(const std::string &spec, OutwardCheckContext &context) + : _spec(spec), + _context(context) +{ + _target = context.orb.GetTarget(spec.c_str()); + _req = context.orb.AllocRPCRequest(); + _req->SetMethodName("sentinel.check.connectivity"); + _req->GetParams()->AddString(context.myHostname); + _req->GetParams()->AddInt32(context.myPortnum); + _req->GetParams()->AddInt32(500); + _target->InvokeAsync(_req, 1.500, this); +} + +OutwardCheck::~OutwardCheck() = default; + +void OutwardCheck::RequestDone(FRT_RPCRequest *req) { + LOG_ASSERT(req == _req); + if (req->CheckReturnTypes("s")) { + std::string answer = _req->GetReturn()->GetValue(0)._string._str; + if (answer == "ok") { + LOG(debug, "ping to %s with reverse connectivity OK", _spec.c_str()); + _result = CcResult::ALL_OK; + } else { + LOG(debug, "connected to %s, but reverse connectivity fails: %s", + _spec.c_str(), answer.c_str()); + _result = CcResult::REVERSE_FAIL; + } + } else if (req->GetErrorCode() == FRTE_RPC_NO_SUCH_METHOD || + req->GetErrorCode() == FRTE_RPC_WRONG_PARAMS || + req->GetErrorCode() == FRTE_RPC_WRONG_RETURN) + { + LOG(debug, "Connected OK to %s but no reverse connectivity check available", _spec.c_str()); + _result = CcResult::REVERSE_UNAVAIL; + } else { + LOG(debug, "error on request to %s : %s (%d)", _spec.c_str(), + req->GetErrorMessage(), req->GetErrorCode()); + _result = CcResult::CONN_FAIL; + } + _req->SubRef(); + _req = nullptr; + _target->SubRef(); + _target = nullptr; + _context.latch.countDown(); +} + +} diff --git a/configd/src/apps/sentinel/outward-check.h b/configd/src/apps/sentinel/outward-check.h new file mode 100644 index 00000000000..01a298aee18 --- /dev/null +++ b/configd/src/apps/sentinel/outward-check.h @@ -0,0 +1,46 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <string> +#include <vespa/vespalib/util/count_down_latch.h> +#include <vespa/fnet/frt/supervisor.h> +#include <vespa/fnet/frt/invoker.h> +#include <vespa/fnet/frt/rpcrequest.h> +#include <vespa/fnet/frt/supervisor.h> +#include <vespa/fnet/frt/target.h> + +namespace config::sentinel { + +struct OutwardCheckContext { + vespalib::CountDownLatch latch; + const char * myHostname; + int myPortnum; + FRT_Supervisor &orb; + OutwardCheckContext(size_t count, + const char * hostname, + int portnumber, + FRT_Supervisor &supervisor) + : latch(count), + myHostname(hostname), + myPortnum(portnumber), + orb(supervisor) + {} +}; + +enum class CcResult { UNKNOWN, CONN_FAIL, REVERSE_FAIL, REVERSE_UNAVAIL, ALL_OK }; + +class OutwardCheck : public FRT_IRequestWait { +private: + CcResult _result = CcResult::UNKNOWN; + FRT_Target *_target = nullptr; + FRT_RPCRequest *_req = nullptr; + std::string _spec; + OutwardCheckContext &_context; +public: + OutwardCheck(const std::string &spec, OutwardCheckContext &context); + virtual ~OutwardCheck(); + void RequestDone(FRT_RPCRequest *req) override; + bool ok() const { return _result == CcResult::ALL_OK; } + CcResult result() const { return _result; } +}; + +} diff --git a/configd/src/apps/sentinel/peer-check.cpp b/configd/src/apps/sentinel/peer-check.cpp index 024f928c994..60c3d9c96c9 100644 --- a/configd/src/apps/sentinel/peer-check.cpp +++ b/configd/src/apps/sentinel/peer-check.cpp @@ -10,7 +10,7 @@ using vespalib::make_string_short::fmt; namespace config::sentinel { -PeerCheck::PeerCheck(StatusCallback &callback, const std::string &host, int port, FRT_Supervisor &orb) +PeerCheck::PeerCheck(StatusCallback &callback, const std::string &host, int port, FRT_Supervisor &orb, int timeout_ms) : _callback(callback), _hostname(host), _portnum(port), @@ -21,7 +21,7 @@ PeerCheck::PeerCheck(StatusCallback &callback, const std::string &host, int port _target = orb.GetTarget(spec.c_str()); _req = orb.AllocRPCRequest(); _req->SetMethodName("frt.rpc.ping"); - _target->InvokeAsync(_req, 0.500, this); + _target->InvokeAsync(_req, timeout_ms * 0.001, this); } PeerCheck::~PeerCheck() { @@ -36,7 +36,7 @@ void PeerCheck::RequestDone(FRT_RPCRequest *req) { LOG(warning, "error on ping to %s [port %d]: %s (%d)", _hostname.c_str(), _portnum, req->GetErrorMessage(), req->GetErrorCode()); } else { - LOG(info, "OK ping to %s [port %d]", _hostname.c_str(), _portnum); + LOG(debug, "OK ping to %s [port %d]", _hostname.c_str(), _portnum); statusOk = true; } _req->SubRef(); diff --git a/configd/src/apps/sentinel/peer-check.h b/configd/src/apps/sentinel/peer-check.h index 658375a8d7b..096f304467b 100644 --- a/configd/src/apps/sentinel/peer-check.h +++ b/configd/src/apps/sentinel/peer-check.h @@ -4,7 +4,6 @@ #include "status-callback.h" #include <string> -#include <vespa/fnet/task.h> #include <vespa/fnet/frt/invoker.h> #include <vespa/fnet/frt/rpcrequest.h> #include <vespa/fnet/frt/supervisor.h> @@ -15,7 +14,7 @@ namespace config::sentinel { class PeerCheck : public FRT_IRequestWait { public: - PeerCheck(StatusCallback &callback, const std::string &host, int portnum, FRT_Supervisor &orb); + PeerCheck(StatusCallback &callback, const std::string &host, int portnum, FRT_Supervisor &orb, int timeout_ms); ~PeerCheck(); PeerCheck(const PeerCheck &) = delete; @@ -25,7 +24,6 @@ public: /** from FRT_IRequestWait **/ void RequestDone(FRT_RPCRequest *req) override; - private: StatusCallback &_callback; std::string _hostname; diff --git a/configd/src/apps/sentinel/rpchooks.cpp b/configd/src/apps/sentinel/rpchooks.cpp index d364e74154c..24e3cd53509 100644 --- a/configd/src/apps/sentinel/rpchooks.cpp +++ b/configd/src/apps/sentinel/rpchooks.cpp @@ -45,11 +45,12 @@ RPCHooks::initRPC(FRT_Supervisor *supervisor) FRT_METHOD(RPCHooks::rpc_startService), this); rb.MethodDesc("start a service"); //------------------------------------------------------------------------- - rb.DefineMethod("sentinel.check.connectivity", "si", "s", + rb.DefineMethod("sentinel.check.connectivity", "sii", "s", FRT_METHOD(RPCHooks::rpc_checkConnectivity), this); rb.MethodDesc("check connectivity for peer sentinel"); rb.ParamDesc("name", "Hostname of peer sentinel"); rb.ParamDesc("port", "Port number of peer sentinel"); + rb.ParamDesc("timeout", "Timeout for check in milliseconds"); rb.ReturnDesc("status", "Status (ok, bad, or unknown) for peer"); //------------------------------------------------------------------------- } @@ -97,11 +98,12 @@ RPCHooks::rpc_checkConnectivity(FRT_RPCRequest *req) { FRT_Values &args = *req->GetParams(); const char *hostname = args[0]._string._str; - uint32_t portnum = args[1]._intval32; - LOG(debug, "got checkConnectivity %s [port %d]", hostname, portnum); + int portnum = args[1]._intval32; + int timeout = args[2]._intval32; + LOG(debug, "got checkConnectivity %s [port %d] timeout %d", hostname, portnum, timeout); req->Detach(); auto & completionHandler = req->getStash().create<CheckCompletionHandler>(req); - req->getStash().create<PeerCheck>(completionHandler, hostname, portnum, _orb); + req->getStash().create<PeerCheck>(completionHandler, hostname, portnum, _orb, timeout); } } // namespace slobrok diff --git a/configd/src/apps/sentinel/rpcserver.h b/configd/src/apps/sentinel/rpcserver.h index ef4b394fdca..4c6dea00ddf 100644 --- a/configd/src/apps/sentinel/rpcserver.h +++ b/configd/src/apps/sentinel/rpcserver.h @@ -22,6 +22,7 @@ public: ~RpcServer(); int getPort() const { return _port; } + FRT_Supervisor &orb() { return _server.supervisor(); } }; } // namespace config::sentinel diff --git a/configd/src/apps/sentinel/sentinel.cpp b/configd/src/apps/sentinel/sentinel.cpp index ed0e6c1cfe5..18d4dc28f8a 100644 --- a/configd/src/apps/sentinel/sentinel.cpp +++ b/configd/src/apps/sentinel/sentinel.cpp @@ -118,6 +118,6 @@ main(int argc, char **argv) } EV_STOPPING("config-sentinel", "normal exit"); - int rv = manager.terminate(); - return rv; + bool rv = manager.terminate(); + return rv ? EXIT_SUCCESS : EXIT_FAILURE; } |