diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-10-07 11:11:27 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-10-07 11:11:27 +0000 |
commit | f7f4b31a947443ffe1fb088ef6dcf1a0a65f76af (patch) | |
tree | 06b260a075c93b86eb1ae9181aa7268bf1bf363d | |
parent | 385ff3f0d79e76eba8c6cf688bc730fb14b0dd38 (diff) |
Add test-and-set failures as own distributor metric
Would otherwise be counted under mysterious "storagefailure" catch-all
category. Currently not tracked under aggregate failure sum metric,
as these are not really "failures" since TaS-failures are expected
to happen and do not indicate problems in the backend.
3 files changed, 31 insertions, 4 deletions
diff --git a/storage/src/tests/distributor/updateoperationtest.cpp b/storage/src/tests/distributor/updateoperationtest.cpp index 6bc000f6780..1d6fb5fe2ea 100644 --- a/storage/src/tests/distributor/updateoperationtest.cpp +++ b/storage/src/tests/distributor/updateoperationtest.cpp @@ -42,7 +42,8 @@ struct UpdateOperationTest : Test, DistributorTestUtil { } void replyToMessage(UpdateOperation& callback, DistributorMessageSenderStub& sender, uint32_t index, - uint64_t oldTimestamp, const api::BucketInfo& info = api::BucketInfo(2,4,6)); + uint64_t oldTimestamp, const api::BucketInfo& info = api::BucketInfo(2,4,6), + const api::ReturnCode& result = api::ReturnCode()); std::shared_ptr<UpdateOperation> sendUpdate(const std::string& bucketState); @@ -72,7 +73,7 @@ UpdateOperationTest::sendUpdate(const std::string& bucketState) void UpdateOperationTest::replyToMessage(UpdateOperation& callback, DistributorMessageSenderStub& sender, uint32_t index, - uint64_t oldTimestamp, const api::BucketInfo& info) + uint64_t oldTimestamp, const api::BucketInfo& info, const api::ReturnCode& result) { std::shared_ptr<api::StorageMessage> msg2 = sender.command(index); auto* updatec = dynamic_cast<UpdateCommand*>(msg2.get()); @@ -80,6 +81,7 @@ UpdateOperationTest::replyToMessage(UpdateOperation& callback, DistributorMessag auto* updateR = static_cast<api::UpdateReply*>(reply.get()); updateR->setOldTimestamp(oldTimestamp); updateR->setBucketInfo(info); + updateR->setResult(result); callback.onReceive(sender, std::shared_ptr<StorageReply>(reply.release())); } @@ -163,3 +165,21 @@ TEST_F(UpdateOperationTest, multi_node_inconsistent_timestamp) { EXPECT_EQ(1, metrics.diverging_timestamp_updates.getValue()); } +TEST_F(UpdateOperationTest, test_and_set_failures_increment_tas_metric) { + setupDistributor(2, 2, "distributor:1 storage:1"); + std::shared_ptr<UpdateOperation> cb(sendUpdate("0=1/2/3")); + DistributorMessageSenderStub sender; + cb->start(sender, framework::MilliSecTime(0)); + ASSERT_EQ("Update => 0", sender.getCommands(true)); + api::ReturnCode result(api::ReturnCode::TEST_AND_SET_CONDITION_FAILED, "bork bork"); + replyToMessage(*cb, sender, 0, 1234, api::BucketInfo(), result); + + ASSERT_EQ("UpdateReply(id:ns:text/html::1, BucketId(0x0000000000000000), " + "timestamp 100, timestamp of updated doc: 0) " + "ReturnCode(TEST_AND_SET_CONDITION_FAILED, bork bork)", + sender.getLastReply(true)); + + auto& metrics = getDistributor().getMetrics().updates[documentapi::LoadType::DEFAULT]; + EXPECT_EQ(1, metrics.failures.test_and_set_failed.getValue()); +} + diff --git a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp index d3ae5d547ed..457d1e051b9 100644 --- a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp +++ b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp @@ -28,7 +28,9 @@ PersistenceFailuresMetricSet::PersistenceFailuresMetricSet(MetricSet* owner) "being in an inconsistent state or not found", this), notfound("notfound", {}, "The number of operations that failed because the document did not exist", this), concurrent_mutations("concurrent_mutations", {}, "The number of operations that were transiently failed due " - "to a mutating operation already being in progress for its document ID", this) + "to a mutating operation already being in progress for its document ID", this), + test_and_set_failed("test_and_set_failed", {}, "The number of mutating operations that failed because " + "they specified a test-and-set condition that did not match the existing document", this) { sum.addMetricToSum(notready); sum.addMetricToSum(notconnected); @@ -39,6 +41,8 @@ PersistenceFailuresMetricSet::PersistenceFailuresMetricSet(MetricSet* owner) sum.addMetricToSum(busy); sum.addMetricToSum(inconsistent_bucket); sum.addMetricToSum(notfound); + // TaS/concurrent mutation failures not added to the main failure metric, as they're not "failures" as per se. + // TODO introduce separate aggregate for such metrics } PersistenceFailuresMetricSet::~PersistenceFailuresMetricSet() = default; @@ -61,7 +65,7 @@ PersistenceOperationMetricSet::PersistenceOperationMetricSet(const std::string& failures(this) { } -PersistenceOperationMetricSet::~PersistenceOperationMetricSet() { } +PersistenceOperationMetricSet::~PersistenceOperationMetricSet() = default; MetricSet * PersistenceOperationMetricSet::clone(std::vector<Metric::UP>& ownerList, CopyType copyType, @@ -84,6 +88,8 @@ PersistenceOperationMetricSet::updateFromResult(const api::ReturnCode& result) failures.wrongdistributor.inc(); } else if (result.getResult() == api::ReturnCode::TIMEOUT) { failures.timeout.inc(); + } else if (result.getResult() == api::ReturnCode::TEST_AND_SET_CONDITION_FAILED) { + failures.test_and_set_failed.inc(); } else if (result.isBusy()) { failures.busy.inc(); } else if (result.isBucketDisappearance()) { diff --git a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h index 4f51c664daf..52249529e4f 100644 --- a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h +++ b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h @@ -28,6 +28,7 @@ public: metrics::LongCountMetric inconsistent_bucket; metrics::LongCountMetric notfound; metrics::LongCountMetric concurrent_mutations; + metrics::LongCountMetric test_and_set_failed; MetricSet * clone(std::vector<Metric::UP>& ownerList, CopyType copyType, metrics::MetricSet* owner, bool includeUnused) const override; |