Complete wiring of OK/failure metric reporting during update write-repair

This also changes the aggregation semantics of "not found" results. An update reply indicating "not found" is protocol-wise reported back to the client as successful but with a not-found field set that has to be explicitly checked. But on the distributor the `notfound` metric is under the `updates.failures` metric set, i.e. not counted as a success. To avoid double bookkeeping caused by treating it as both OK and not, we choose to not count "not found" updates under the `ok` metric. But to avoid artificially inflating the `failures.sum` metric, we remove the `notfound` metric from the implicit sum aggregation. This means that visibility into "not found" updates requires explicitly tracking the `notfound` metric as opposed to just the aggregate, but this should be an acceptable tradeoff to avoid false failure positives.
author: Tor Brede Vekterli <vekterli@yahooinc.com> 2021-12-17 13:33:00 +0000
committer: Tor Brede Vekterli <vekterli@yahooinc.com> 2021-12-17 13:33:00 +0000
commit: 18f0ad03c867241f72fcb463b1b482a34c25622e (patch)
tree: 8eb649f5a59cbda5866a1f14f1c19ffe7478618d /storage
parent: da221b81f17026e4c7489d9c7dd84f659cb807a4 (diff)
3 files changed, 43 insertions, 11 deletions
diff --git a/storage/src/tests/distributor/twophaseupdateoperationtest.cpp b/storage/src/tests/distributor/twophaseupdateoperationtest.cpp
index 876aa4a258c..509f417baf1 100644
--- a/storage/src/tests/distributor/twophaseupdateoperationtest.cpp
+++ b/storage/src/tests/distributor/twophaseupdateoperationtest.cpp
@@ -632,6 +632,8 @@ TEST_F(TwoPhaseUpdateOperationTest, safe_path_updates_newest_received_document)
               "timestamp 0, timestamp of updated doc: 70) "
               "ReturnCode(NONE)",
               _sender.getLastReply(true));
+
+    EXPECT_EQ(metrics().updates.ok.getValue(), 1);
 }
 
 TEST_F(TwoPhaseUpdateOperationTest, create_if_non_existent_creates_document_if_all_empty_gets) {
@@ -661,6 +663,8 @@ TEST_F(TwoPhaseUpdateOperationTest, create_if_non_existent_creates_document_if_a
               "timestamp 0, timestamp of updated doc: 200000000) "
               "ReturnCode(NONE)",
               _sender.getLastReply(true));
+
+    EXPECT_EQ(metrics().updates.ok.getValue(), 1);
 }
 
 TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_safe_path_has_failed_put) {
@@ -685,6 +689,9 @@ TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_safe_path_has_failed_put) {
               "timestamp 0, timestamp of updated doc: 200000000) "
               "ReturnCode(IO_FAILURE)",
               _sender.getLastReply(true));
+
+    EXPECT_EQ(metrics().updates.ok.getValue(), 0);
+    EXPECT_EQ(metrics().updates.failures.storagefailure.getValue(), 1);
 }
 
 TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_safe_path_gets_fail) {
@@ -701,6 +708,9 @@ TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_safe_path_gets_fail) {
               "timestamp 0, timestamp of updated doc: 0) "
               "ReturnCode(IO_FAILURE)",
               _sender.getLastReply(true));
+
+    EXPECT_EQ(metrics().updates.ok.getValue(), 0);
+    EXPECT_EQ(metrics().updates.failures.storagefailure.getValue(), 1);
 }
 
 TEST_F(TwoPhaseUpdateOperationTest, update_fails_if_apply_throws_exception) {
@@ -747,6 +757,7 @@ TEST_F(TwoPhaseUpdateOperationTest, non_existing_with_auto_create) {
               "ReturnCode(NONE)",
               _sender.getLastReply(true));
 
+    EXPECT_EQ(metrics().updates.ok.getValue(), 1);
     // "Not found" failure not counted when create: true is set, since the update itself isn't failed.
     EXPECT_EQ(metrics().updates.failures.notfound.getValue(), 0);
 }
@@ -767,6 +778,9 @@ TEST_F(TwoPhaseUpdateOperationTest, safe_path_fails_update_when_mismatching_time
               "ReturnCode(NONE, No document with requested "
                          "timestamp found)",
               _sender.getLastReply(true));
+
+    EXPECT_EQ(metrics().updates.ok.getValue(), 0);
+    EXPECT_EQ(metrics().updates.failures.notfound.getValue(), 1);
 }
 
 TEST_F(TwoPhaseUpdateOperationTest, safe_path_update_propagates_message_settings_to_gets_and_puts) {
@@ -951,12 +965,22 @@ TEST_F(TwoPhaseUpdateOperationTest, safe_path_condition_with_missing_doc_and_aut
                     .createIfNonExistent(true));
 
     cb->start(_sender, framework::MilliSecTime(0));
-    replyToGet(*cb, _sender, 0, 100, false);
-    replyToGet(*cb, _sender, 1, 110, false);
+    replyToGet(*cb, _sender, 0, 0, false);
+    replyToGet(*cb, _sender, 1, 0, false);
     ASSERT_EQ("Put => 1,Put => 0", _sender.getCommands(true, false, 2));
 
+    replyToPut(*cb, _sender, 2);
+    replyToPut(*cb, _sender, 3);
+
+    EXPECT_EQ("UpdateReply(id:ns:testdoctype1::1, "
+                          "BucketId(0x0000000000000000), "
+                          "timestamp 0, timestamp of updated doc: 200000000) "
+                          "ReturnCode(NONE)",
+              _sender.getLastReply(true));
+
     EXPECT_EQ(metrics().updates.failures.notfound.getValue(), 0); // Not counted as "not found" failure when we auto create
     EXPECT_EQ(metrics().updates.failures.test_and_set_failed.getValue(), 0);
+    EXPECT_EQ(metrics().updates.ok.getValue(), 1);
 }
 
 void
diff --git a/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp
index 00c783d1eae..af3b727e07b 100644
--- a/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp
@@ -155,18 +155,20 @@ TwoPhaseUpdateOperation::sendReply(
     _replySent = true;
 }
 
+// This particular method is called when we synthesize our own UpdateReply,
+// not when we take over an already produced one from an UpdateOperation.
+// The latter will already increment _updateMetric fields implicitly.
 void
 TwoPhaseUpdateOperation::sendReplyWithResult(
         DistributorStripeMessageSender& sender,
         const api::ReturnCode& result)
 {
     ensureUpdateReplyCreated();
-    // This particular method is called when we synthesize our own UpdateReply,
-    // not when we take over an already produced one from an UpdateOperation.
-    // The latter will already increment the TaS metric implicitly.
-    if (result.getResult() == api::ReturnCode::Result::TEST_AND_SET_CONDITION_FAILED) {
-        _updateMetric.failures.test_and_set_failed.inc();
-    }
+    // Don't bump metrics if document not found but otherwise OK.
+    // Already counted in metrics prior to calling this method.
+    if (!(result.success() && (_updateReply->getOldTimestamp() == 0))) {
+        _updateMetric.updateFromResult(result);
+    } // else: `notfound` metric already incremented.
     _updateReply->setResult(result);
     sendReply(sender, _updateReply);
 }
@@ -553,6 +555,7 @@ TwoPhaseUpdateOperation::handleSafePathReceivedGet(DistributorStripeMessageSende
     if (reply.getDocument()) {
         api::Timestamp receivedTimestamp = reply.getLastModifiedTimestamp();
         if (!satisfiesUpdateTimestampConstraint(receivedTimestamp)) {
+            _updateMetric.failures.notfound.inc();
             sendReplyWithResult(sender, api::ReturnCode(api::ReturnCode::OK,
                                                         "No document with requested timestamp found"));
             return;
diff --git a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp
index 9a7e2aba5e4..0e42a505567 100644
--- a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp
+++ b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp
@@ -39,9 +39,14 @@ PersistenceFailuresMetricSet::PersistenceFailuresMetricSet(MetricSet* owner)
     sum.addMetricToSum(timeout);
     sum.addMetricToSum(busy);
     sum.addMetricToSum(inconsistent_bucket);
-    sum.addMetricToSum(notfound);
-    // TaS/concurrent mutation failures not added to the main failure metric, as they're not "failures" as per se.
-    // TODO introduce separate aggregate for such metrics
+    // We don't consider the following as explicit failures (even though they're in the failure set)
+    // and therefore don't count them as part of the aggregate sum:
+    //
+    //  - Test-and-set mismatches
+    //  - Concurrent mutation failures
+    //  - Document to be updated not found
+    //
+    // TODO introduce separate aggregate for such metrics. Presumably when we deprecate legacy metric paths.
 }
 
 PersistenceFailuresMetricSet::~PersistenceFailuresMetricSet() = default;
author	Tor Brede Vekterli <vekterli@yahooinc.com>	2021-12-17 13:33:00 +0000
committer	Tor Brede Vekterli <vekterli@yahooinc.com>	2021-12-17 13:33:00 +0000
commit	18f0ad03c867241f72fcb463b1b482a34c25622e (patch)
tree	8eb649f5a59cbda5866a1f14f1c19ffe7478618d /storage
parent	da221b81f17026e4c7489d9c7dd84f659cb807a4 (diff)