aboutsummaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2021-12-17 13:33:00 +0000
committerTor Brede Vekterli <vekterli@yahooinc.com>2021-12-17 13:33:00 +0000
commit18f0ad03c867241f72fcb463b1b482a34c25622e (patch)
tree8eb649f5a59cbda5866a1f14f1c19ffe7478618d /storage
parentda221b81f17026e4c7489d9c7dd84f659cb807a4 (diff)
Complete wiring of OK/failure metric reporting during update write-repair
This also changes the aggregation semantics of "not found" results. An update reply indicating "not found" is protocol-wise reported back to the client as successful but with a not-found field set that has to be explicitly checked. But on the distributor the `notfound` metric is under the `updates.failures` metric set, i.e. not counted as a success. To avoid double bookkeeping caused by treating it as both OK and not, we choose to not count "not found" updates under the `ok` metric. But to avoid artificially inflating the `failures.sum` metric, we remove the `notfound` metric from the implicit sum aggregation. This means that visibility into "not found" updates requires explicitly tracking the `notfound` metric as opposed to just the aggregate, but this should be an acceptable tradeoff to avoid false failure positives.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/twophaseupdateoperationtest.cpp28
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp15
-rw-r--r--storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp11
3 files changed, 43 insertions, 11 deletions
diff --git a/storage/src/tests/distributor/twophaseupdateoperationtest.cpp b/storage/src/tests/distributor/twophaseupdateoperationtest.cpp
index 876aa4a258c..509f417baf1 100644
--- a/storage/src/tests/distributor/twophaseupdateoperationtest.cpp
+++ b/storage/src/tests/distributor/twophaseupdateoperationtest.cpp
@@ -632,6 +632,8 @@ TEST_F(TwoPhaseUpdateOperationTest, safe_path_updates_newest_received_document)
"timestamp 0, timestamp of updated doc: 70) "
"ReturnCode(NONE)",
_sender.getLastReply(true));
+
+ EXPECT_EQ(metrics().updates.ok.getValue(), 1);
}
TEST_F(TwoPhaseUpdateOperationTest, create_if_non_existent_creates_document_if_all_empty_gets) {
@@ -661,6 +663,8 @@ TEST_F(TwoPhaseUpdateOperationTest, create_if_non_existent_creates_document_if_a
"timestamp 0, timestamp of updated doc: 200000000) "
"ReturnCode(NONE)",
_sender.getLastReply(true));
+
+ EXPECT_EQ(metrics().updates.ok.getValue(), 1);
}
TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_safe_path_has_failed_put) {
@@ -685,6 +689,9 @@ TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_safe_path_has_failed_put) {
"timestamp 0, timestamp of updated doc: 200000000) "
"ReturnCode(IO_FAILURE)",
_sender.getLastReply(true));
+
+ EXPECT_EQ(metrics().updates.ok.getValue(), 0);
+ EXPECT_EQ(metrics().updates.failures.storagefailure.getValue(), 1);
}
TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_safe_path_gets_fail) {
@@ -701,6 +708,9 @@ TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_safe_path_gets_fail) {
"timestamp 0, timestamp of updated doc: 0) "
"ReturnCode(IO_FAILURE)",
_sender.getLastReply(true));
+
+ EXPECT_EQ(metrics().updates.ok.getValue(), 0);
+ EXPECT_EQ(metrics().updates.failures.storagefailure.getValue(), 1);
}
TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_apply_throws_exception) {
@@ -747,6 +757,7 @@ TEST_F(TwoPhaseUpdateOperationTest, non_existing_with_auto_create) {
"ReturnCode(NONE)",
_sender.getLastReply(true));
+ EXPECT_EQ(metrics().updates.ok.getValue(), 1);
// "Not found" failure not counted when create: true is set, since the update itself isn't failed.
EXPECT_EQ(metrics().updates.failures.notfound.getValue(), 0);
}
@@ -767,6 +778,9 @@ TEST_F(TwoPhaseUpdateOperationTest, safe_path_fails_update_when_mismatching_time
"ReturnCode(NONE, No document with requested "
"timestamp found)",
_sender.getLastReply(true));
+
+ EXPECT_EQ(metrics().updates.ok.getValue(), 0);
+ EXPECT_EQ(metrics().updates.failures.notfound.getValue(), 1);
}
TEST_F(TwoPhaseUpdateOperationTest, safe_path_update_propagates_message_settings_to_gets_and_puts) {
@@ -951,12 +965,22 @@ TEST_F(TwoPhaseUpdateOperationTest, safe_path_condition_with_missing_doc_and_aut
.createIfNonExistent(true));
cb->start(_sender, framework::MilliSecTime(0));
- replyToGet(*cb, _sender, 0, 100, false);
- replyToGet(*cb, _sender, 1, 110, false);
+ replyToGet(*cb, _sender, 0, 0, false);
+ replyToGet(*cb, _sender, 1, 0, false);
ASSERT_EQ("Put => 1,Put => 0", _sender.getCommands(true, false, 2));
+ replyToPut(*cb, _sender, 2);
+ replyToPut(*cb, _sender, 3);
+
+ EXPECT_EQ("UpdateReply(id:ns:testdoctype1::1, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 200000000) "
+ "ReturnCode(NONE)",
+ _sender.getLastReply(true));
+
EXPECT_EQ(metrics().updates.failures.notfound.getValue(), 0); // Not counted as "not found" failure when we auto create
EXPECT_EQ(metrics().updates.failures.test_and_set_failed.getValue(), 0);
+ EXPECT_EQ(metrics().updates.ok.getValue(), 1);
}
void
diff --git a/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp
index 00c783d1eae..af3b727e07b 100644
--- a/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp
@@ -155,18 +155,20 @@ TwoPhaseUpdateOperation::sendReply(
_replySent = true;
}
+// This particular method is called when we synthesize our own UpdateReply,
+// not when we take over an already produced one from an UpdateOperation.
+// The latter will already increment _updateMetric fields implicitly.
void
TwoPhaseUpdateOperation::sendReplyWithResult(
DistributorStripeMessageSender& sender,
const api::ReturnCode& result)
{
ensureUpdateReplyCreated();
- // This particular method is called when we synthesize our own UpdateReply,
- // not when we take over an already produced one from an UpdateOperation.
- // The latter will already increment the TaS metric implicitly.
- if (result.getResult() == api::ReturnCode::Result::TEST_AND_SET_CONDITION_FAILED) {
- _updateMetric.failures.test_and_set_failed.inc();
- }
+ // Don't bump metrics if document not found but otherwise OK.
+ // Already counted in metrics prior to calling this method.
+ if (!(result.success() && (_updateReply->getOldTimestamp() == 0))) {
+ _updateMetric.updateFromResult(result);
+ } // else: `notfound` metric already incremented.
_updateReply->setResult(result);
sendReply(sender, _updateReply);
}
@@ -553,6 +555,7 @@ TwoPhaseUpdateOperation::handleSafePathReceivedGet(DistributorStripeMessageSende
if (reply.getDocument()) {
api::Timestamp receivedTimestamp = reply.getLastModifiedTimestamp();
if (!satisfiesUpdateTimestampConstraint(receivedTimestamp)) {
+ _updateMetric.failures.notfound.inc();
sendReplyWithResult(sender, api::ReturnCode(api::ReturnCode::OK,
"No document with requested timestamp found"));
return;
diff --git a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp
index 9a7e2aba5e4..0e42a505567 100644
--- a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp
+++ b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp
@@ -39,9 +39,14 @@ PersistenceFailuresMetricSet::PersistenceFailuresMetricSet(MetricSet* owner)
sum.addMetricToSum(timeout);
sum.addMetricToSum(busy);
sum.addMetricToSum(inconsistent_bucket);
- sum.addMetricToSum(notfound);
- // TaS/concurrent mutation failures not added to the main failure metric, as they're not "failures" as per se.
- // TODO introduce separate aggregate for such metrics
+ // We don't consider the following as explicit failures (even though they're in the failure set)
+ // and therefore don't count them as part of the aggregate sum:
+ //
+ // - Test-and-set mismatches
+ // - Concurrent mutation failures
+ // - Document to be updated not found
+ //
+ // TODO introduce separate aggregate for such metrics. Presumably when we deprecate legacy metric paths.
}
PersistenceFailuresMetricSet::~PersistenceFailuresMetricSet() = default;