diff options
author | Geir Storli <geirst@yahooinc.com> | 2021-10-14 14:46:26 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-10-14 14:46:26 +0200 |
commit | 73384b28424890ac344ea04c3efdf936b279fbc4 (patch) | |
tree | fe6bb577e43b80cb572cafe5988b4d59459ab202 /storage | |
parent | 299e8fea68d4a0f28c365bd3ff7866f18ca290dd (diff) | |
parent | f78d629d643c24a20a1f4622c12a236bf008f1e6 (diff) |
Merge pull request #19547 from vespa-engine/toregge/add-detailed-metrics-for-failed-merge-operations
Add detailed metrics for failed merge operations.
Diffstat (limited to 'storage')
5 files changed, 111 insertions, 33 deletions
diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp index d174a6335b7..65ee5254193 100644 --- a/storage/src/tests/distributor/mergeoperationtest.cpp +++ b/storage/src/tests/distributor/mergeoperationtest.cpp @@ -37,6 +37,10 @@ struct MergeOperationTest : Test, DistributorStripeTestUtil { } std::shared_ptr<MergeOperation> setup_minimal_merge_op(); + std::shared_ptr<MergeOperation> setup_simple_merge_op(); + void assert_simple_merge_bucket_command(); + void assert_simple_delete_bucket_command(); + MergeBucketMetricSet& get_merge_metrics(); }; std::shared_ptr<MergeOperation> @@ -48,7 +52,9 @@ MergeOperationTest::setup_minimal_merge_op() return op; } -TEST_F(MergeOperationTest, simple) { +std::shared_ptr<MergeOperation> +MergeOperationTest::setup_simple_merge_op() +{ getClock().setAbsoluteTimeInSeconds(10); addNodesToBucketDB(document::BucketId(16, 1), @@ -58,44 +64,48 @@ TEST_F(MergeOperationTest, simple) { enable_cluster_state("distributor:1 storage:3"); - MergeOperation op(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)), - toVector<uint16_t>(0, 1, 2))); - op.setIdealStateManager(&getIdealStateManager()); - op.start(_sender, framework::MilliSecTime(0)); + auto op = std::make_shared<MergeOperation>(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)), toVector<uint16_t>(0, 1, 2))); + op->setIdealStateManager(&getIdealStateManager()); + op->start(_sender, framework::MilliSecTime(0)); + return op; +} +void +MergeOperationTest::assert_simple_merge_bucket_command() +{ ASSERT_EQ("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, " "cluster state version: 0, nodes: [0, 2, 1 (source only)], chain: [], " "reasons to start: ) => 0", _sender.getLastCommand(true)); +} - sendReply(op); - +void +MergeOperationTest::assert_simple_delete_bucket_command() +{ ASSERT_EQ("DeleteBucketCommand(BucketId(0x4000000000000001)) " "Reasons to start: => 1", _sender.getLastCommand(true)); - } -TEST_F(MergeOperationTest, fail_if_source_only_copies_changed) { - getClock().setAbsoluteTimeInSeconds(10); - - addNodesToBucketDB(document::BucketId(16, 1), - "0=10/1/1/t," - "1=20/1/1," - "2=10/1/1/t"); - - enable_cluster_state("distributor:1 storage:3"); - - MergeOperation op(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)), - toVector<uint16_t>(0, 1, 2))); - op.setIdealStateManager(&getIdealStateManager()); - op.start(_sender, framework::MilliSecTime(0)); +MergeBucketMetricSet& +MergeOperationTest::get_merge_metrics() +{ + return dynamic_cast<MergeBucketMetricSet&>(*getIdealStateManager().getMetrics().operations[IdealStateOperation::MERGE_BUCKET]); +} - std::string merge("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, " - "cluster state version: 0, nodes: [0, 2, 1 (source only)], chain: [], " - "reasons to start: ) => 0"); +TEST_F(MergeOperationTest, simple) { + auto op = setup_simple_merge_op(); + ASSERT_NO_FATAL_FAILURE(assert_simple_merge_bucket_command()); + sendReply(*op); + ASSERT_NO_FATAL_FAILURE(assert_simple_delete_bucket_command()); + EXPECT_EQ(0, get_merge_metrics().ok.getValue()); + sendReply(*op); + EXPECT_EQ(1, get_merge_metrics().ok.getValue()); +} - ASSERT_EQ(merge, _sender.getLastCommand(true)); +TEST_F(MergeOperationTest, fail_if_source_only_copies_changed) { + auto op = setup_simple_merge_op(); + ASSERT_NO_FATAL_FAILURE(assert_simple_merge_bucket_command()); { auto& cmd = dynamic_cast<api::MergeBucketCommand&>(*_sender.command(0)); EXPECT_EQ(0, cmd.getSourceIndex()); @@ -106,10 +116,22 @@ TEST_F(MergeOperationTest, fail_if_source_only_copies_changed) { "0=10/1/1/t," "1=40/1/1," "2=10/1/1/t"); - sendReply(op); + sendReply(*op); // Should not be a remove here! - ASSERT_EQ(merge, _sender.getLastCommand(true)); - EXPECT_FALSE(op.ok()); + ASSERT_NO_FATAL_FAILURE(assert_simple_merge_bucket_command()); + EXPECT_FALSE(op->ok()); + EXPECT_EQ(1, get_merge_metrics().failed.getValue()); + EXPECT_EQ(1, get_merge_metrics().source_only_copy_changed.getValue()); +} + +TEST_F(MergeOperationTest, fail_if_delete_bucket_fails) { + auto op = setup_simple_merge_op(); + ASSERT_NO_FATAL_FAILURE(assert_simple_merge_bucket_command()); + sendReply(*op); + ASSERT_NO_FATAL_FAILURE(assert_simple_delete_bucket_command()); + sendReply(*op, -1, api::ReturnCode::ABORTED); + EXPECT_EQ(1, get_merge_metrics().failed.getValue()); + EXPECT_EQ(1, get_merge_metrics().source_only_copy_delete_failed.getValue()); } namespace { @@ -276,6 +298,8 @@ TEST_F(MergeOperationTest, do_not_remove_copies_with_pending_messages) { // Should not be a remove here! ASSERT_EQ(merge, _sender.getLastCommand(true)); EXPECT_FALSE(op.ok()); + EXPECT_EQ(1, get_merge_metrics().failed.getValue()); + EXPECT_EQ(1, get_merge_metrics().source_only_copy_delete_blocked.getValue()); } /* diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp index 89dc1ada39d..1a05a2e3baa 100644 --- a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp +++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp @@ -35,6 +35,22 @@ GcMetricSet::GcMetricSet(const std::string& name, metrics::Metric::Tags tags, co GcMetricSet::~GcMetricSet() = default; +MergeBucketMetricSet::MergeBucketMetricSet(const std::string& name, metrics::Metric::Tags tags, const std::string& description, MetricSet* owner) + : OperationMetricSet(name, std::move(tags), description, owner), + source_only_copy_changed("source_only_copy_changed", + {{"logdefault"},{"yamasdefault"}}, + "The number of merge operations where source-only copy changed"), + source_only_copy_delete_blocked("source_only_copy_delete_blocked", + {{"logdefault"},{"yamasdefault"}}, + "The number of merge operations where delete of unchanged source-only copies was blocked"), + source_only_copy_delete_failed("source_only_copy_delete_failed", + {{"logdefault"},{"yamasdefault"}}, + "The number of merge operations where delete of unchanged source-only copies failed") +{ +} + +MergeBucketMetricSet::~MergeBucketMetricSet() = default; + void IdealStateMetricSet::createOperationMetrics() { typedef IdealStateOperation ISO; @@ -45,10 +61,10 @@ IdealStateMetricSet::createOperationMetrics() { new OperationMetricSet("delete_bucket", {{"logdefault"},{"yamasdefault"}}, "Operations to delete excess buckets on storage nodes", this)); - operations[ISO::MERGE_BUCKET] = std::shared_ptr<OperationMetricSet>( - new OperationMetricSet("merge_bucket", - {{"logdefault"},{"yamasdefault"}}, - "Operations to merge buckets that are out of sync", this)); + operations[ISO::MERGE_BUCKET] = std::make_shared<MergeBucketMetricSet> + ("merge_bucket", + metrics::Metric::Tags{{"logdefault"},{"yamasdefault"}}, + "Operations to merge buckets that are out of sync", this); operations[ISO::SPLIT_BUCKET] = std::shared_ptr<OperationMetricSet>( new OperationMetricSet("split_bucket", {{"logdefault"},{"yamasdefault"}}, diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.h b/storage/src/vespa/storage/distributor/idealstatemetricsset.h index ada1bd50e5c..a0cbe86dac4 100644 --- a/storage/src/vespa/storage/distributor/idealstatemetricsset.h +++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.h @@ -31,6 +31,16 @@ struct GcMetricSet : OperationMetricSet { ~GcMetricSet() override; }; +class MergeBucketMetricSet : public OperationMetricSet +{ +public: + metrics::LongCountMetric source_only_copy_changed; + metrics::LongCountMetric source_only_copy_delete_blocked; + metrics::LongCountMetric source_only_copy_delete_failed; + MergeBucketMetricSet(const std::string& name, metrics::Metric::Tags tags, const std::string& description, MetricSet* owner); + ~MergeBucketMetricSet() override; +}; + class IdealStateMetricSet : public metrics::MetricSet { public: diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp index 69bbb67b1f3..7cfe4172b2c 100644 --- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp +++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp @@ -2,6 +2,7 @@ #include "mergeoperation.h" #include <vespa/document/bucket/fixed_bucket_spaces.h> #include <vespa/storage/distributor/idealstatemanager.h> +#include <vespa/storage/distributor/idealstatemetricsset.h> #include <vespa/storage/distributor/distributor_bucket_space.h> #include <vespa/storage/distributor/pendingmessagetracker.h> #include <vespa/vdslib/distribution/distribution.h> @@ -239,6 +240,10 @@ MergeOperation::deleteSourceOnlyNodes( LOG(debug, "Source only removal for %s was blocked by a pending operation", getBucketId().toString().c_str()); _ok = false; + auto merge_metrics = get_merge_metrics(); + if (merge_metrics) { + merge_metrics->source_only_copy_delete_blocked.inc(1); + } done(); return; } @@ -260,6 +265,12 @@ MergeOperation::onReceive(DistributorStripeMessageSender& sender, if (_removeOperation.get()) { if (_removeOperation->onReceiveInternal(msg)) { _ok = _removeOperation->ok(); + if (!_ok) { + auto merge_metrics = get_merge_metrics(); + if (merge_metrics) { + merge_metrics->source_only_copy_delete_failed.inc(1); + } + } done(); } @@ -284,6 +295,10 @@ MergeOperation::onReceive(DistributorStripeMessageSender& sender, } if (sourceOnlyCopyChangedDuringMerge(entry)) { _ok = false; + auto merge_metrics = get_merge_metrics(); + if (merge_metrics) { + merge_metrics->source_only_copy_changed.inc(1); + } done(); return; } @@ -352,4 +367,14 @@ bool MergeOperation::is_global_bucket_merge() const noexcept { return getBucket().getBucketSpace() == document::FixedBucketSpaces::global_space(); } +MergeBucketMetricSet* +MergeOperation::get_merge_metrics() +{ + if (_manager) { + return dynamic_cast<MergeBucketMetricSet *>(_manager->getMetrics().operations[getType()].get()); + } else { + return nullptr; + } +} + } diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h index 2be98785fdb..1bca1f7389f 100644 --- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h +++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h @@ -12,6 +12,8 @@ namespace storage::lib { class Distribution; } namespace storage::distributor { +class MergeBucketMetricSet; + class MergeOperation : public IdealStateOperation { protected: @@ -62,6 +64,7 @@ private: void deleteSourceOnlyNodes(const BucketDatabase::Entry& currentState, DistributorStripeMessageSender& sender); bool is_global_bucket_merge() const noexcept; + MergeBucketMetricSet* get_merge_metrics(); }; } |