summaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2021-10-14 14:46:26 +0200
committerGitHub <noreply@github.com>2021-10-14 14:46:26 +0200
commit73384b28424890ac344ea04c3efdf936b279fbc4 (patch)
treefe6bb577e43b80cb572cafe5988b4d59459ab202 /storage
parent299e8fea68d4a0f28c365bd3ff7866f18ca290dd (diff)
parentf78d629d643c24a20a1f4622c12a236bf008f1e6 (diff)
Merge pull request #19547 from vespa-engine/toregge/add-detailed-metrics-for-failed-merge-operations
Add detailed metrics for failed merge operations.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/mergeoperationtest.cpp82
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemetricsset.cpp24
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemetricsset.h10
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp25
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h3
5 files changed, 111 insertions, 33 deletions
diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp
index d174a6335b7..65ee5254193 100644
--- a/storage/src/tests/distributor/mergeoperationtest.cpp
+++ b/storage/src/tests/distributor/mergeoperationtest.cpp
@@ -37,6 +37,10 @@ struct MergeOperationTest : Test, DistributorStripeTestUtil {
}
std::shared_ptr<MergeOperation> setup_minimal_merge_op();
+ std::shared_ptr<MergeOperation> setup_simple_merge_op();
+ void assert_simple_merge_bucket_command();
+ void assert_simple_delete_bucket_command();
+ MergeBucketMetricSet& get_merge_metrics();
};
std::shared_ptr<MergeOperation>
@@ -48,7 +52,9 @@ MergeOperationTest::setup_minimal_merge_op()
return op;
}
-TEST_F(MergeOperationTest, simple) {
+std::shared_ptr<MergeOperation>
+MergeOperationTest::setup_simple_merge_op()
+{
getClock().setAbsoluteTimeInSeconds(10);
addNodesToBucketDB(document::BucketId(16, 1),
@@ -58,44 +64,48 @@ TEST_F(MergeOperationTest, simple) {
enable_cluster_state("distributor:1 storage:3");
- MergeOperation op(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)),
- toVector<uint16_t>(0, 1, 2)));
- op.setIdealStateManager(&getIdealStateManager());
- op.start(_sender, framework::MilliSecTime(0));
+ auto op = std::make_shared<MergeOperation>(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)), toVector<uint16_t>(0, 1, 2)));
+ op->setIdealStateManager(&getIdealStateManager());
+ op->start(_sender, framework::MilliSecTime(0));
+ return op;
+}
+void
+MergeOperationTest::assert_simple_merge_bucket_command()
+{
ASSERT_EQ("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, "
"cluster state version: 0, nodes: [0, 2, 1 (source only)], chain: [], "
"reasons to start: ) => 0",
_sender.getLastCommand(true));
+}
- sendReply(op);
-
+void
+MergeOperationTest::assert_simple_delete_bucket_command()
+{
ASSERT_EQ("DeleteBucketCommand(BucketId(0x4000000000000001)) "
"Reasons to start: => 1",
_sender.getLastCommand(true));
-
}
-TEST_F(MergeOperationTest, fail_if_source_only_copies_changed) {
- getClock().setAbsoluteTimeInSeconds(10);
-
- addNodesToBucketDB(document::BucketId(16, 1),
- "0=10/1/1/t,"
- "1=20/1/1,"
- "2=10/1/1/t");
-
- enable_cluster_state("distributor:1 storage:3");
-
- MergeOperation op(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)),
- toVector<uint16_t>(0, 1, 2)));
- op.setIdealStateManager(&getIdealStateManager());
- op.start(_sender, framework::MilliSecTime(0));
+MergeBucketMetricSet&
+MergeOperationTest::get_merge_metrics()
+{
+ return dynamic_cast<MergeBucketMetricSet&>(*getIdealStateManager().getMetrics().operations[IdealStateOperation::MERGE_BUCKET]);
+}
- std::string merge("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, "
- "cluster state version: 0, nodes: [0, 2, 1 (source only)], chain: [], "
- "reasons to start: ) => 0");
+TEST_F(MergeOperationTest, simple) {
+ auto op = setup_simple_merge_op();
+ ASSERT_NO_FATAL_FAILURE(assert_simple_merge_bucket_command());
+ sendReply(*op);
+ ASSERT_NO_FATAL_FAILURE(assert_simple_delete_bucket_command());
+ EXPECT_EQ(0, get_merge_metrics().ok.getValue());
+ sendReply(*op);
+ EXPECT_EQ(1, get_merge_metrics().ok.getValue());
+}
- ASSERT_EQ(merge, _sender.getLastCommand(true));
+TEST_F(MergeOperationTest, fail_if_source_only_copies_changed) {
+ auto op = setup_simple_merge_op();
+ ASSERT_NO_FATAL_FAILURE(assert_simple_merge_bucket_command());
{
auto& cmd = dynamic_cast<api::MergeBucketCommand&>(*_sender.command(0));
EXPECT_EQ(0, cmd.getSourceIndex());
@@ -106,10 +116,22 @@ TEST_F(MergeOperationTest, fail_if_source_only_copies_changed) {
"0=10/1/1/t,"
"1=40/1/1,"
"2=10/1/1/t");
- sendReply(op);
+ sendReply(*op);
// Should not be a remove here!
- ASSERT_EQ(merge, _sender.getLastCommand(true));
- EXPECT_FALSE(op.ok());
+ ASSERT_NO_FATAL_FAILURE(assert_simple_merge_bucket_command());
+ EXPECT_FALSE(op->ok());
+ EXPECT_EQ(1, get_merge_metrics().failed.getValue());
+ EXPECT_EQ(1, get_merge_metrics().source_only_copy_changed.getValue());
+}
+
+TEST_F(MergeOperationTest, fail_if_delete_bucket_fails) {
+ auto op = setup_simple_merge_op();
+ ASSERT_NO_FATAL_FAILURE(assert_simple_merge_bucket_command());
+ sendReply(*op);
+ ASSERT_NO_FATAL_FAILURE(assert_simple_delete_bucket_command());
+ sendReply(*op, -1, api::ReturnCode::ABORTED);
+ EXPECT_EQ(1, get_merge_metrics().failed.getValue());
+ EXPECT_EQ(1, get_merge_metrics().source_only_copy_delete_failed.getValue());
}
namespace {
@@ -276,6 +298,8 @@ TEST_F(MergeOperationTest, do_not_remove_copies_with_pending_messages) {
// Should not be a remove here!
ASSERT_EQ(merge, _sender.getLastCommand(true));
EXPECT_FALSE(op.ok());
+ EXPECT_EQ(1, get_merge_metrics().failed.getValue());
+ EXPECT_EQ(1, get_merge_metrics().source_only_copy_delete_blocked.getValue());
}
/*
diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
index 89dc1ada39d..1a05a2e3baa 100644
--- a/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
+++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.cpp
@@ -35,6 +35,22 @@ GcMetricSet::GcMetricSet(const std::string& name, metrics::Metric::Tags tags, co
GcMetricSet::~GcMetricSet() = default;
+MergeBucketMetricSet::MergeBucketMetricSet(const std::string& name, metrics::Metric::Tags tags, const std::string& description, MetricSet* owner)
+ : OperationMetricSet(name, std::move(tags), description, owner),
+ source_only_copy_changed("source_only_copy_changed",
+ {{"logdefault"},{"yamasdefault"}},
+ "The number of merge operations where source-only copy changed"),
+ source_only_copy_delete_blocked("source_only_copy_delete_blocked",
+ {{"logdefault"},{"yamasdefault"}},
+ "The number of merge operations where delete of unchanged source-only copies was blocked"),
+ source_only_copy_delete_failed("source_only_copy_delete_failed",
+ {{"logdefault"},{"yamasdefault"}},
+ "The number of merge operations where delete of unchanged source-only copies failed")
+{
+}
+
+MergeBucketMetricSet::~MergeBucketMetricSet() = default;
+
void
IdealStateMetricSet::createOperationMetrics() {
typedef IdealStateOperation ISO;
@@ -45,10 +61,10 @@ IdealStateMetricSet::createOperationMetrics() {
new OperationMetricSet("delete_bucket",
{{"logdefault"},{"yamasdefault"}},
"Operations to delete excess buckets on storage nodes", this));
- operations[ISO::MERGE_BUCKET] = std::shared_ptr<OperationMetricSet>(
- new OperationMetricSet("merge_bucket",
- {{"logdefault"},{"yamasdefault"}},
- "Operations to merge buckets that are out of sync", this));
+ operations[ISO::MERGE_BUCKET] = std::make_shared<MergeBucketMetricSet>
+ ("merge_bucket",
+ metrics::Metric::Tags{{"logdefault"},{"yamasdefault"}},
+ "Operations to merge buckets that are out of sync", this);
operations[ISO::SPLIT_BUCKET] = std::shared_ptr<OperationMetricSet>(
new OperationMetricSet("split_bucket",
{{"logdefault"},{"yamasdefault"}},
diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.h b/storage/src/vespa/storage/distributor/idealstatemetricsset.h
index ada1bd50e5c..a0cbe86dac4 100644
--- a/storage/src/vespa/storage/distributor/idealstatemetricsset.h
+++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.h
@@ -31,6 +31,16 @@ struct GcMetricSet : OperationMetricSet {
~GcMetricSet() override;
};
+class MergeBucketMetricSet : public OperationMetricSet
+{
+public:
+ metrics::LongCountMetric source_only_copy_changed;
+ metrics::LongCountMetric source_only_copy_delete_blocked;
+ metrics::LongCountMetric source_only_copy_delete_failed;
+ MergeBucketMetricSet(const std::string& name, metrics::Metric::Tags tags, const std::string& description, MetricSet* owner);
+ ~MergeBucketMetricSet() override;
+};
+
class IdealStateMetricSet : public metrics::MetricSet
{
public:
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
index 69bbb67b1f3..7cfe4172b2c 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
@@ -2,6 +2,7 @@
#include "mergeoperation.h"
#include <vespa/document/bucket/fixed_bucket_spaces.h>
#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/idealstatemetricsset.h>
#include <vespa/storage/distributor/distributor_bucket_space.h>
#include <vespa/storage/distributor/pendingmessagetracker.h>
#include <vespa/vdslib/distribution/distribution.h>
@@ -239,6 +240,10 @@ MergeOperation::deleteSourceOnlyNodes(
LOG(debug, "Source only removal for %s was blocked by a pending operation",
getBucketId().toString().c_str());
_ok = false;
+ auto merge_metrics = get_merge_metrics();
+ if (merge_metrics) {
+ merge_metrics->source_only_copy_delete_blocked.inc(1);
+ }
done();
return;
}
@@ -260,6 +265,12 @@ MergeOperation::onReceive(DistributorStripeMessageSender& sender,
if (_removeOperation.get()) {
if (_removeOperation->onReceiveInternal(msg)) {
_ok = _removeOperation->ok();
+ if (!_ok) {
+ auto merge_metrics = get_merge_metrics();
+ if (merge_metrics) {
+ merge_metrics->source_only_copy_delete_failed.inc(1);
+ }
+ }
done();
}
@@ -284,6 +295,10 @@ MergeOperation::onReceive(DistributorStripeMessageSender& sender,
}
if (sourceOnlyCopyChangedDuringMerge(entry)) {
_ok = false;
+ auto merge_metrics = get_merge_metrics();
+ if (merge_metrics) {
+ merge_metrics->source_only_copy_changed.inc(1);
+ }
done();
return;
}
@@ -352,4 +367,14 @@ bool MergeOperation::is_global_bucket_merge() const noexcept {
return getBucket().getBucketSpace() == document::FixedBucketSpaces::global_space();
}
+MergeBucketMetricSet*
+MergeOperation::get_merge_metrics()
+{
+ if (_manager) {
+ return dynamic_cast<MergeBucketMetricSet *>(_manager->getMetrics().operations[getType()].get());
+ } else {
+ return nullptr;
+ }
+}
+
}
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
index 2be98785fdb..1bca1f7389f 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
@@ -12,6 +12,8 @@
namespace storage::lib { class Distribution; }
namespace storage::distributor {
+class MergeBucketMetricSet;
+
class MergeOperation : public IdealStateOperation
{
protected:
@@ -62,6 +64,7 @@ private:
void deleteSourceOnlyNodes(const BucketDatabase::Entry& currentState,
DistributorStripeMessageSender& sender);
bool is_global_bucket_merge() const noexcept;
+ MergeBucketMetricSet* get_merge_metrics();
};
}