From 0f6bdb99acea472c7f109ef3426a5388ef49c6e6 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Wed, 19 Jul 2023 16:33:37 +0200 Subject: Warn on missing health ping. --- .../vespa/storage/storageserver/statemanager.cpp | 30 ++++++++++++++++++++++ .../src/vespa/storage/storageserver/statemanager.h | 5 ++++ 2 files changed, 35 insertions(+) diff --git a/storage/src/vespa/storage/storageserver/statemanager.cpp b/storage/src/vespa/storage/storageserver/statemanager.cpp index 654fe0e1f5d..cb3bfcf6400 100644 --- a/storage/src/vespa/storage/storageserver/statemanager.cpp +++ b/storage/src/vespa/storage/storageserver/statemanager.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -68,6 +69,10 @@ StateManager::StateManager(StorageComponentRegister& compReg, _threadLock(), _systemStateHistory(), _systemStateHistorySize(50), + _start_time(vespalib::steady_clock::now()), + _health_ping_time(), + _health_ping_warn_interval(5min), + _health_ping_warn_time(_start_time + _health_ping_warn_interval), _hostInfo(std::move(hostInfo)), _controllers_observed_explicit_node_state(), _noThreadTestMode(testMode), @@ -391,6 +396,8 @@ StateManager::onGetNodeState(const api::GetNodeStateCommand::SP& cmd) std::shared_ptr reply; { std::unique_lock guard(_stateLock); + _health_ping_time = vespalib::steady_clock::now(); + _health_ping_warn_time = _health_ping_time.value() + _health_ping_warn_interval; const bool is_up_to_date = (_controllers_observed_explicit_node_state.find(cmd->getSourceIndex()) != _controllers_observed_explicit_node_state.end()); if ((cmd->getExpectedState() != nullptr) @@ -478,6 +485,28 @@ StateManager::run(framework::ThreadHandle& thread) } +void +StateManager::warn_on_missing_health_ping() +{ + vespalib::steady_time now(vespalib::steady_clock::now()); + std::optional health_ping_time; + { + std::lock_guard lock(_stateLock); + if (now <= _health_ping_warn_time) { + return; + } + health_ping_time = _health_ping_time; + _health_ping_warn_time = now + _health_ping_warn_interval; + } + if (health_ping_time.has_value()) { + vespalib::duration duration = now - health_ping_time.value(); + LOG(warning, "Last health ping was %1.1f seconds ago", vespalib::to_s(duration)); + } else { + vespalib::duration duration = now - _start_time; + LOG(warning, "No health pings since startup %1.1f seconds ago", vespalib::to_s(duration)); + } +} + void StateManager::tick() { bool almost_immediate_replies = _requested_almost_immediate_node_state_replies.load(std::memory_order_relaxed); @@ -487,6 +516,7 @@ StateManager::tick() { } else { sendGetNodeStateReplies(_component.getClock().getMonotonicTime()); } + warn_on_missing_health_ping(); } bool diff --git a/storage/src/vespa/storage/storageserver/statemanager.h b/storage/src/vespa/storage/storageserver/statemanager.h index 0b9a47c2515..3b1291b1c3f 100644 --- a/storage/src/vespa/storage/storageserver/statemanager.h +++ b/storage/src/vespa/storage/storageserver/statemanager.h @@ -65,6 +65,10 @@ class StateManager : public NodeStateUpdater, std::condition_variable _threadCond; std::deque _systemStateHistory; uint32_t _systemStateHistorySize; + const vespalib::steady_time _start_time; + std::optional _health_ping_time; + vespalib::duration _health_ping_warn_interval; + vespalib::steady_time _health_ping_warn_time; std::unique_ptr _hostInfo; std::unique_ptr _thread; // Controllers that have observed a GetNodeState response sent _after_ @@ -84,6 +88,7 @@ public: void onClose() override; void tick(); + void warn_on_missing_health_ping(); void print(std::ostream& out, bool verbose, const std::string& indent) const override; void reportHtmlStatus(std::ostream&, const framework::HttpUrlPath&) const override; -- cgit v1.2.3