diff options
author | Tor Egge <Tor.Egge@yahooinc.com> | 2023-07-19 18:16:09 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-19 18:16:09 +0200 |
commit | a26f304d7b4b7ed046d583fd6422df244b00fb7d (patch) | |
tree | cdafeca0c9b447aff7194372dec32e8129035ca2 | |
parent | 48212ae35bea422c44f09511984fdcdabae23106 (diff) | |
parent | 7c9ae83e1a2cffa9e13d53115eda8cdfc20a288d (diff) |
Merge pull request #27835 from vespa-engine/toregge/warn-on-missing-health-ping
Warn on missing health ping.
-rw-r--r-- | storage/src/vespa/storage/storageserver/statemanager.cpp | 30 | ||||
-rw-r--r-- | storage/src/vespa/storage/storageserver/statemanager.h | 5 |
2 files changed, 35 insertions, 0 deletions
diff --git a/storage/src/vespa/storage/storageserver/statemanager.cpp b/storage/src/vespa/storage/storageserver/statemanager.cpp
index 654fe0e1f5d..c228229e4ef 100644
--- a/storage/src/vespa/storage/storageserver/statemanager.cpp
+++ b/storage/src/vespa/storage/storageserver/statemanager.cpp
@@ -17,6 +17,7 @@
 #include <vespa/vespalib/util/exceptions.h>
 #include <vespa/vespalib/util/string_escape.h>
 #include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/util/time.h>
 #include <fstream>
 
 #include <vespa/log/log.h>
@@ -68,6 +69,10 @@ StateManager::StateManager(StorageComponentRegister& compReg,
       _threadLock(),
       _systemStateHistory(),
       _systemStateHistorySize(50),
+      _start_time(vespalib::steady_clock::now()),
+      _health_ping_time(),
+      _health_ping_warn_interval(5min),
+      _health_ping_warn_time(_start_time + _health_ping_warn_interval),
       _hostInfo(std::move(hostInfo)),
       _controllers_observed_explicit_node_state(),
       _noThreadTestMode(testMode),
@@ -391,6 +396,8 @@ StateManager::onGetNodeState(const api::GetNodeStateCommand::SP& cmd)
     std::shared_ptr<api::GetNodeStateReply> reply;
     {
         std::unique_lock guard(_stateLock);
+        _health_ping_time = vespalib::steady_clock::now();
+        _health_ping_warn_time = _health_ping_time.value() + _health_ping_warn_interval;
         const bool is_up_to_date = (_controllers_observed_explicit_node_state.find(cmd->getSourceIndex())
                                     != _controllers_observed_explicit_node_state.end());
         if ((cmd->getExpectedState() != nullptr)
@@ -479,6 +486,28 @@ StateManager::run(framework::ThreadHandle& thread)
 }
 
 void
+StateManager::warn_on_missing_health_ping()
+{
+    vespalib::steady_time now(vespalib::steady_clock::now());
+    std::optional<vespalib::steady_time> health_ping_time;
+    {
+        std::lock_guard lock(_stateLock);
+        if (now <= _health_ping_warn_time) {
+            return;
+        }
+        health_ping_time = _health_ping_time;
+        _health_ping_warn_time = now + _health_ping_warn_interval;
+    }
+    if (health_ping_time.has_value()) {
+        vespalib::duration duration = now - health_ping_time.value();
+        LOG(warning, "Last health ping from cluster controller was %1.1f seconds ago", vespalib::to_s(duration));
+    } else {
+        vespalib::duration duration = now - _start_time;
+        LOG(warning, "No health pings from cluster controller since startup %1.1f seconds ago", vespalib::to_s(duration));
+    }
+}
+
+void
 StateManager::tick() {
     bool almost_immediate_replies = _requested_almost_immediate_node_state_replies.load(std::memory_order_relaxed);
     if (almost_immediate_replies) {
@@ -487,6 +516,7 @@ StateManager::tick() {
     } else {
         sendGetNodeStateReplies(_component.getClock().getMonotonicTime());
     }
+    warn_on_missing_health_ping();
 }
 
 bool
diff --git a/storage/src/vespa/storage/storageserver/statemanager.h b/storage/src/vespa/storage/storageserver/statemanager.h
index 0b9a47c2515..3b1291b1c3f 100644
--- a/storage/src/vespa/storage/storageserver/statemanager.h
+++ b/storage/src/vespa/storage/storageserver/statemanager.h
@@ -65,6 +65,10 @@ class StateManager : public NodeStateUpdater,
     std::condition_variable _threadCond;
     std::deque<TimeSysStatePair> _systemStateHistory;
     uint32_t _systemStateHistorySize;
+    const vespalib::steady_time _start_time;
+    std::optional<vespalib::steady_time> _health_ping_time;
+    vespalib::duration _health_ping_warn_interval;
+    vespalib::steady_time _health_ping_warn_time;
     std::unique_ptr<HostInfo> _hostInfo;
     std::unique_ptr<framework::Thread> _thread;
     // Controllers that have observed a GetNodeState response sent _after_
@@ -84,6 +88,7 @@ public:
     void onClose() override;
 
     void tick();
+    void warn_on_missing_health_ping();
 
     void print(std::ostream& out, bool verbose, const std::string& indent) const override;
     void reportHtmlStatus(std::ostream&, const framework::HttpUrlPath&) const override;