diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2020-07-13 15:12:13 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2020-07-13 15:12:13 +0000 |
commit | 0e9a9eb04dce4268420cd15a743aaf3c2eecd448 (patch) | |
tree | 53f93dbb3cd304c3a15eecfd7a0782d324fd0648 /storage | |
parent | 9baa67e082830e31e47b2881737351cb1828d64f (diff) |
Emit log warning on repeated content node bucket info fetch failures
The warning is currently only emitted after a certain number of repeated
failures for a given cluster state transition. The rationale is that problematic
nodes usually fail for a prolonged period of time, so it's wise to reduce log
noise from more transient failures. The threshold is to be adjusted later as needed.
Diffstat (limited to 'storage')
3 files changed, 31 insertions, 7 deletions
diff --git a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h index 695f80750aa..232f1186879 100644 --- a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h +++ b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h @@ -50,6 +50,7 @@ private: uint16_t _distributorIndex; bool _bucketOwnershipTransfer; std::unordered_map<uint16_t, size_t> _rejectedRequests; + std::unordered_map<uint16_t, size_t> _failed_requests; // Also includes rejections BucketDatabase::MergingProcessor::Result merge(BucketDatabase::Merger&) override; void insert_remaining_at_end(BucketDatabase::TrailingInserter&) override; @@ -122,6 +123,13 @@ public: auto iter = _rejectedRequests.find(node); return ((iter != _rejectedRequests.end()) ? iter->second : 0); } + void increment_request_failures(uint16_t node) { + _failed_requests[node]++; + } + [[nodiscard]] size_t request_failures(uint16_t node) const noexcept { + auto iter = _failed_requests.find(node); + return ((iter != _failed_requests.end()) ? 
iter->second : 0); + } }; } diff --git a/storage/src/vespa/storage/distributor/pendingclusterstate.cpp b/storage/src/vespa/storage/distributor/pendingclusterstate.cpp index 62a520abc87..3dda989ff74 100644 --- a/storage/src/vespa/storage/distributor/pendingclusterstate.cpp +++ b/storage/src/vespa/storage/distributor/pendingclusterstate.cpp @@ -12,7 +12,7 @@ #include <vespa/vespalib/util/xmlstream.hpp> #include <climits> -#include <vespa/log/log.h> +#include <vespa/log/bufferedlogger.h> LOG_SETUP(".pendingclusterstate"); using document::BucketSpace; @@ -248,7 +248,22 @@ PendingClusterState::Summary::Summary(const std::string& prevClusterState, PendingClusterState::Summary::Summary(const Summary &) = default; PendingClusterState::Summary & PendingClusterState::Summary::operator = (const Summary &) = default; -PendingClusterState::Summary::~Summary() { } +PendingClusterState::Summary::~Summary() = default; + +void PendingClusterState::update_reply_failure_statistics(const api::ReturnCode& result, const BucketSpaceAndNode& source) { + auto transition_iter = _pendingTransitions.find(source.bucketSpace); + assert(transition_iter != _pendingTransitions.end()); + auto& transition = *transition_iter->second; + transition.increment_request_failures(source.node); + // Edge triggered (rate limited) warning for content node bucket fetching failures + if (transition.request_failures(source.node) == RequestFailureWarningEdgeTriggerThreshold) { + LOGBP(warning, "Have failed multiple bucket info fetch requests towards node %u. 
Last received error is: %s", + source.node, result.toString().c_str()); + } + if (result.getResult() == api::ReturnCode::REJECTED) { + transition.incrementRequestRejections(source.node); + } +} bool PendingClusterState::onRequestBucketInfoReply(const std::shared_ptr<api::RequestBucketInfoReply>& reply) @@ -266,11 +281,7 @@ PendingClusterState::onRequestBucketInfoReply(const std::shared_ptr<api::Request resendTime += framework::MilliSecTime(100); _delayedRequests.emplace_back(resendTime, bucketSpaceAndNode); _sentMessages.erase(iter); - if (result.getResult() == api::ReturnCode::REJECTED) { - auto transitionIter = _pendingTransitions.find(bucketSpaceAndNode.bucketSpace); - assert(transitionIter != _pendingTransitions.end()); - transitionIter->second->incrementRequestRejections(bucketSpaceAndNode.node); - } + update_reply_failure_statistics(result, bucketSpaceAndNode); return true; } diff --git a/storage/src/vespa/storage/distributor/pendingclusterstate.h b/storage/src/vespa/storage/distributor/pendingclusterstate.h index 7aa35b32b8e..f79a3185c67 100644 --- a/storage/src/vespa/storage/distributor/pendingclusterstate.h +++ b/storage/src/vespa/storage/distributor/pendingclusterstate.h @@ -155,6 +155,10 @@ public: std::string requestNodesToString() const; private: + // With 100ms resend timeout, this requires a particular node to have failed + // for _at least_ threshold/10 seconds before a log warning is emitted. + constexpr static size_t RequestFailureWarningEdgeTriggerThreshold = 20; + /** * Creates a pending cluster state that represents * a set system state command from the fleet controller. @@ -211,6 +215,7 @@ private: std::string getPrevClusterStateBundleString() const { return _prevClusterStateBundle.getBaselineClusterState()->toString(); } + void update_reply_failure_statistics(const api::ReturnCode& result, const BucketSpaceAndNode& source); std::shared_ptr<api::SetSystemStateCommand> _cmd; |