aboutsummaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2020-07-13 15:12:13 +0000
committerTor Brede Vekterli <vekterli@verizonmedia.com>2020-07-13 15:12:13 +0000
commit0e9a9eb04dce4268420cd15a743aaf3c2eecd448 (patch)
tree53f93dbb3cd304c3a15eecfd7a0782d324fd0648 /storage
parent9baa67e082830e31e47b2881737351cb1828d64f (diff)
Emit log warning on repeated content node bucket info fetch failures
The warning is currently only emitted after a certain number of repeated failures within a given cluster state transition. The rationale is that problematic nodes usually fail for a prolonged period of time, so it's wise to reduce log noise from more transient failures. The threshold can be adjusted later as needed.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h8
-rw-r--r--storage/src/vespa/storage/distributor/pendingclusterstate.cpp25
-rw-r--r--storage/src/vespa/storage/distributor/pendingclusterstate.h5
3 files changed, 31 insertions, 7 deletions
diff --git a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h
index 695f80750aa..232f1186879 100644
--- a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h
+++ b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.h
@@ -50,6 +50,7 @@ private:
uint16_t _distributorIndex;
bool _bucketOwnershipTransfer;
std::unordered_map<uint16_t, size_t> _rejectedRequests;
+ std::unordered_map<uint16_t, size_t> _failed_requests; // Also includes rejections
BucketDatabase::MergingProcessor::Result merge(BucketDatabase::Merger&) override;
void insert_remaining_at_end(BucketDatabase::TrailingInserter&) override;
@@ -122,6 +123,13 @@ public:
auto iter = _rejectedRequests.find(node);
return ((iter != _rejectedRequests.end()) ? iter->second : 0);
}
+ void increment_request_failures(uint16_t node) { // Adds one to the failure count kept for `node`.
+ _failed_requests[node]++; // operator[] inserts a zero-valued entry the first time a node is seen
+ }
+ [[nodiscard]] size_t request_failures(uint16_t node) const noexcept { // Failure count recorded for `node`.
+ auto iter = _failed_requests.find(node); // find() rather than operator[] so the lookup never inserts
+ return ((iter != _failed_requests.end()) ? iter->second : 0); // nodes without an entry report 0
+ }
};
}
diff --git a/storage/src/vespa/storage/distributor/pendingclusterstate.cpp b/storage/src/vespa/storage/distributor/pendingclusterstate.cpp
index 62a520abc87..3dda989ff74 100644
--- a/storage/src/vespa/storage/distributor/pendingclusterstate.cpp
+++ b/storage/src/vespa/storage/distributor/pendingclusterstate.cpp
@@ -12,7 +12,7 @@
#include <vespa/vespalib/util/xmlstream.hpp>
#include <climits>
-#include <vespa/log/log.h>
+#include <vespa/log/bufferedlogger.h>
LOG_SETUP(".pendingclusterstate");
using document::BucketSpace;
@@ -248,7 +248,22 @@ PendingClusterState::Summary::Summary(const std::string& prevClusterState,
PendingClusterState::Summary::Summary(const Summary &) = default;
PendingClusterState::Summary & PendingClusterState::Summary::operator = (const Summary &) = default;
-PendingClusterState::Summary::~Summary() { }
+PendingClusterState::Summary::~Summary() = default;
+
+void PendingClusterState::update_reply_failure_statistics(const api::ReturnCode& result, const BucketSpaceAndNode& source) { // Records one failed reply from source.node on the pending transition of source.bucketSpace.
+ auto transition_iter = _pendingTransitions.find(source.bucketSpace);
+ assert(transition_iter != _pendingTransitions.end()); // a pending transition is expected to exist for the reply's bucket space
+ auto& transition = *transition_iter->second;
+ transition.increment_request_failures(source.node); // counted for every failure, whatever the return code
+ // Edge triggered (rate limited) warning for content node bucket fetching failures
+ if (transition.request_failures(source.node) == RequestFailureWarningEdgeTriggerThreshold) { // '==' rather than '>=': true only on the call where the count lands exactly on the threshold
+ LOGBP(warning, "Have failed multiple bucket info fetch requests towards node %u. Last received error is: %s",
+ source.node, result.toString().c_str());
+ }
+ if (result.getResult() == api::ReturnCode::REJECTED) { // rejections are additionally tracked in their own counter
+ transition.incrementRequestRejections(source.node);
+ }
+}
bool
PendingClusterState::onRequestBucketInfoReply(const std::shared_ptr<api::RequestBucketInfoReply>& reply)
@@ -266,11 +281,7 @@ PendingClusterState::onRequestBucketInfoReply(const std::shared_ptr<api::Request
resendTime += framework::MilliSecTime(100);
_delayedRequests.emplace_back(resendTime, bucketSpaceAndNode);
_sentMessages.erase(iter);
- if (result.getResult() == api::ReturnCode::REJECTED) {
- auto transitionIter = _pendingTransitions.find(bucketSpaceAndNode.bucketSpace);
- assert(transitionIter != _pendingTransitions.end());
- transitionIter->second->incrementRequestRejections(bucketSpaceAndNode.node);
- }
+ update_reply_failure_statistics(result, bucketSpaceAndNode);
return true;
}
diff --git a/storage/src/vespa/storage/distributor/pendingclusterstate.h b/storage/src/vespa/storage/distributor/pendingclusterstate.h
index 7aa35b32b8e..f79a3185c67 100644
--- a/storage/src/vespa/storage/distributor/pendingclusterstate.h
+++ b/storage/src/vespa/storage/distributor/pendingclusterstate.h
@@ -155,6 +155,10 @@ public:
std::string requestNodesToString() const;
private:
+ // With 100ms resend timeout, this requires a particular node to have failed
+ // for _at least_ threshold/10 seconds before a log warning is emitted.
+ constexpr static size_t RequestFailureWarningEdgeTriggerThreshold = 20;
+
/**
* Creates a pending cluster state that represents
* a set system state command from the fleet controller.
@@ -211,6 +215,7 @@ private:
std::string getPrevClusterStateBundleString() const { // String form of the previous bundle's baseline cluster state.
return _prevClusterStateBundle.getBaselineClusterState()->toString();
}
+ void update_reply_failure_statistics(const api::ReturnCode& result, const BucketSpaceAndNode& source);
std::shared_ptr<api::SetSystemStateCommand> _cmd;