aboutsummaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahoo-inc.com>2016-12-07 10:57:03 +0100
committerGitHub <noreply@github.com>2016-12-07 10:57:03 +0100
commit88a5a58938c773bc702d8f2d918f88a9c4482fc6 (patch)
tree084ead8dc0cac3c1fb17a368c8d322186bbe7ddf /storage
parent0ac893aa3398834ac6d0556be8cf8f57dd0f951c (diff)
parentba1bd01788166317d75805b976475297ce9794c7 (diff)
Merge pull request #1254 from yahoo/vekterli/add-more-distributor-visiting-metrics
Add more distributor visiting metrics
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/CMakeLists.txt1
-rw-r--r--storage/src/tests/distributor/persistence_metrics_set_test.cpp92
-rw-r--r--storage/src/tests/distributor/visitoroperationtest.cpp78
-rw-r--r--storage/src/vespa/storage/distributor/CMakeLists.txt1
-rw-r--r--storage/src/vespa/storage/distributor/distributormetricsset.cpp61
-rw-r--r--storage/src/vespa/storage/distributor/distributormetricsset.h35
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp18
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/visitoroperation.h1
-rw-r--r--storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp98
-rw-r--r--storage/src/vespa/storage/distributor/persistence_operation_metric_set.h57
-rw-r--r--storage/src/vespa/storage/distributor/persistencemessagetracker.cpp20
-rw-r--r--storage/src/vespa/storage/distributor/persistencemessagetracker.h4
-rw-r--r--storage/src/vespa/storage/distributor/visitormetricsset.cpp16
-rw-r--r--storage/src/vespa/storage/distributor/visitormetricsset.h8
14 files changed, 371 insertions, 119 deletions
diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt
index 600a8231a8e..2676a31e832 100644
--- a/storage/src/tests/distributor/CMakeLists.txt
+++ b/storage/src/tests/distributor/CMakeLists.txt
@@ -39,6 +39,7 @@ vespa_add_library(storage_testdistributor TEST
nodemaintenancestatstrackertest.cpp
distributor_host_info_reporter_test.cpp
ownership_transfer_safe_time_point_calculator_test.cpp
+ persistence_metrics_set_test.cpp
DEPENDS
storage_distributor
storage_testcommon
diff --git a/storage/src/tests/distributor/persistence_metrics_set_test.cpp b/storage/src/tests/distributor/persistence_metrics_set_test.cpp
new file mode 100644
index 00000000000..131a588413b
--- /dev/null
+++ b/storage/src/tests/distributor/persistence_metrics_set_test.cpp
@@ -0,0 +1,92 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+namespace storage {
+namespace distributor {
+
+struct PersistenceMetricsSetTest : CppUnit::TestFixture {
+ void successful_return_codes_are_counted_as_ok();
+ void wrong_distribution_failure_is_counted();
+ void timeout_failure_is_counted();
+ // Note for these tests: busy, connection failures et al are sets of
+ // failure codes and not just a single code. We only test certain members
+ // of these sets here. See api::ReturnCode implementation for an exhaustive
+ // list.
+ void busy_failure_is_counted();
+ void connection_failure_is_counted();
+ void inconsistent_bucket_is_counted();
+ void non_special_cased_failure_codes_are_catchall_counted();
+
+ CPPUNIT_TEST_SUITE(PersistenceMetricsSetTest);
+ CPPUNIT_TEST(successful_return_codes_are_counted_as_ok);
+ CPPUNIT_TEST(wrong_distribution_failure_is_counted);
+ CPPUNIT_TEST(timeout_failure_is_counted);
+ CPPUNIT_TEST(busy_failure_is_counted);
+ CPPUNIT_TEST(connection_failure_is_counted);
+ CPPUNIT_TEST(inconsistent_bucket_is_counted);
+ CPPUNIT_TEST(non_special_cased_failure_codes_are_catchall_counted);
+ CPPUNIT_TEST_SUITE_END();
+
+ void assert_failure_is_counted(PersistenceOperationMetricSet& metrics,
+ api::ReturnCode::Result failure_code,
+ const metrics::LongCountMetric& checked)
+ {
+ metrics.updateFromResult(api::ReturnCode(failure_code));
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), checked.getLongValue("count"));
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), metrics.ok.getLongValue("count"));
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(PersistenceMetricsSetTest);
+
+void PersistenceMetricsSetTest::successful_return_codes_are_counted_as_ok() {
+ PersistenceOperationMetricSet metrics("foo");
+ metrics.updateFromResult(api::ReturnCode());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), metrics.ok.getLongValue("count"));
+}
+
+void PersistenceMetricsSetTest::wrong_distribution_failure_is_counted() {
+ PersistenceOperationMetricSet metrics("foo");
+ assert_failure_is_counted(metrics, api::ReturnCode::WRONG_DISTRIBUTION,
+ metrics.failures.wrongdistributor);
+}
+
+void PersistenceMetricsSetTest::timeout_failure_is_counted() {
+ PersistenceOperationMetricSet metrics("foo");
+ assert_failure_is_counted(metrics, api::ReturnCode::TIMEOUT,
+ metrics.failures.timeout);
+}
+
+void PersistenceMetricsSetTest::busy_failure_is_counted() {
+ PersistenceOperationMetricSet metrics("foo");
+ assert_failure_is_counted(metrics, api::ReturnCode::BUSY,
+ metrics.failures.busy);
+}
+
+void PersistenceMetricsSetTest::connection_failure_is_counted() {
+ PersistenceOperationMetricSet metrics("foo");
+ // This is dirty enum value coercion, but this is how "parent protocol"
+ // error codes are handled already.
+ api::ReturnCode::Result error_code(static_cast<api::ReturnCode::Result>(
+ mbus::ErrorCode::CONNECTION_ERROR));
+ assert_failure_is_counted(metrics, error_code,
+ metrics.failures.notconnected);
+}
+
+void PersistenceMetricsSetTest::inconsistent_bucket_is_counted() {
+ PersistenceOperationMetricSet metrics("foo");
+ assert_failure_is_counted(metrics, api::ReturnCode::BUCKET_NOT_FOUND,
+ metrics.failures.inconsistent_bucket);
+}
+
+void PersistenceMetricsSetTest::non_special_cased_failure_codes_are_catchall_counted() {
+ PersistenceOperationMetricSet metrics("foo");
+ assert_failure_is_counted(metrics, api::ReturnCode::REJECTED,
+ metrics.failures.storagefailure);
+}
+
+}
+}
diff --git a/storage/src/tests/distributor/visitoroperationtest.cpp b/storage/src/tests/distributor/visitoroperationtest.cpp
index 9b674029959..7bd9eea29b0 100644
--- a/storage/src/tests/distributor/visitoroperationtest.cpp
+++ b/storage/src/tests/distributor/visitoroperationtest.cpp
@@ -64,6 +64,8 @@ class VisitorOperationTest : public CppUnit::TestFixture,
CPPUNIT_TEST(testNoClientReplyBeforeAllStorageRepliesReceived);
CPPUNIT_TEST(testSkipFailedSubBucketsWhenVisitingInconsistent);
CPPUNIT_TEST(testQueueTimeoutIsFactorOfTotalTimeout);
+ CPPUNIT_TEST(metrics_are_updated_with_visitor_statistics_upon_replying);
+ CPPUNIT_TEST(statistical_metrics_not_updated_on_wrong_distribution);
CPPUNIT_TEST_SUITE_END();
protected:
@@ -109,6 +111,8 @@ protected:
void testNoClientReplyBeforeAllStorageRepliesReceived();
void testSkipFailedSubBucketsWhenVisitingInconsistent();
void testQueueTimeoutIsFactorOfTotalTimeout();
+ void metrics_are_updated_with_visitor_statistics_upon_replying();
+ void statistical_metrics_not_updated_on_wrong_distribution();
public:
VisitorOperationTest()
: defaultConfig(framework::MilliSecTime(0),
@@ -188,6 +192,10 @@ private:
return ost.str();
}
+ VisitorMetricSet& defaultVisitorMetrics() {
+ return getDistributor().getMetrics().visits[documentapi::LoadType::DEFAULT];
+ }
+
std::unique_ptr<VisitorOperation> createOpWithConfig(
api::CreateVisitorCommand::SP msg,
const VisitorOperation::Config& config)
@@ -233,6 +241,8 @@ private:
std::unique_ptr<VisitorOperation> startOperationWith2StorageNodeVisitors(
bool inconsistent);
+
+ void do_visitor_roundtrip_with_statistics(const api::ReturnCode& result);
};
CPPUNIT_TEST_SUITE_REGISTRATION(VisitorOperationTest);
@@ -307,6 +317,8 @@ VisitorOperationTest::doStandardVisitTest(const std::string& clusterState)
"last=BucketId(0x000000007fffffff)) "
"ReturnCode(NONE)"),
_sender.getLastReply());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), defaultVisitorMetrics().
+ ok.getLongValue("count"));
}
void
@@ -603,6 +615,8 @@ VisitorOperationTest::testBucketRemovedWhileVisitorPending()
std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
"ReturnCode(BUCKET_NOT_FOUND)"),
_sender.getLastReply());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), defaultVisitorMetrics().failures.
+ inconsistent_bucket.getLongValue("count"));
}
void
@@ -725,6 +739,8 @@ VisitorOperationTest::testTimeoutDoesNotOverrideCriticalError()
"CreateVisitorReply(last=BucketId(0x0000000000000000)) "
"ReturnCode(INTERNAL_FAILURE, [from content node 0] )"s,
_sender.getLastReply());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), defaultVisitorMetrics().failures.
+ storagefailure.getLongValue("count"));
}
void
@@ -737,6 +753,8 @@ VisitorOperationTest::testWrongDistribution()
std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
"ReturnCode(WRONG_DISTRIBUTION, distributor:100 storage:2)"),
runEmptyVisitor(createVisitorCommand("wrongdist", id, nullId)));
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), defaultVisitorMetrics().failures.
+ wrongdistributor.getLongValue("count"));
}
void
@@ -1181,6 +1199,9 @@ VisitorOperationTest::testFailureOnAllNodes()
std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
"ReturnCode(BUCKET_NOT_FOUND)"),
_sender.getLastReply());
+ // TODO it'd be much more accurate to increase the "notconnected" metric
+ // here, but our metrics are currently based on the reply sent back to the
+ // client, not the ones sent from the content nodes to the distributor.
}
@@ -1606,5 +1627,62 @@ VisitorOperationTest::testQueueTimeoutIsFactorOfTotalTimeout()
CPPUNIT_ASSERT_EQUAL(uint32_t(5000), cmd.getQueueTimeout());
}
+void
+VisitorOperationTest::do_visitor_roundtrip_with_statistics(
+ const api::ReturnCode& result)
+{
+ document::BucketId id(0x400000000000007bULL);
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+ addNodesToBucketDB(id, "0=1/1/1/t");
+
+ auto op = createOpWithDefaultConfig(
+ createVisitorCommand("metricstats", id, nullId));
+
+ op->start(_sender, framework::MilliSecTime(0));
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+ auto& cmd(dynamic_cast<CreateVisitorCommand&>(*_sender.commands[0]));
+ auto reply = cmd.makeReply();
+ vdslib::VisitorStatistics stats;
+ stats.setBucketsVisited(50);
+ stats.setDocumentsVisited(100);
+ stats.setBytesVisited(2000);
+ static_cast<CreateVisitorReply&>(*reply).setVisitorStatistics(stats);
+ reply->setResult(result);
+
+ op->receive(_sender, api::StorageReply::SP(std::move(reply)));
+}
+
+void
+VisitorOperationTest::metrics_are_updated_with_visitor_statistics_upon_replying()
+{
+ do_visitor_roundtrip_with_statistics(api::ReturnCode(api::ReturnCode::OK));
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(50), defaultVisitorMetrics().buckets_per_visitor.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(100), defaultVisitorMetrics().docs_per_visitor.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(2000), defaultVisitorMetrics().bytes_per_visitor.getLast());
+}
+
+void
+VisitorOperationTest::statistical_metrics_not_updated_on_wrong_distribution()
+{
+ setupDistributor(1, 100, "distributor:100 storage:2");
+
+ document::BucketId id(uint64_t(0x400000000000127b));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(WRONG_DISTRIBUTION, distributor:100 storage:2)"),
+ runEmptyVisitor(createVisitorCommand("wrongdist", id, nullId)));
+
+ // Note that we're testing the number of _times_ the metric has been
+ // updated, not the value with which it's been updated (which would be zero
+ // even in the case we actually did update the statistical metrics).
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), defaultVisitorMetrics().buckets_per_visitor.getCount());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), defaultVisitorMetrics().docs_per_visitor.getCount());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), defaultVisitorMetrics().bytes_per_visitor.getCount());
+ // Fascinating that count is also a double...
+ CPPUNIT_ASSERT_EQUAL(0.0, defaultVisitorMetrics().latency.getCount());
+}
+
} // distributor
} // storage
diff --git a/storage/src/vespa/storage/distributor/CMakeLists.txt b/storage/src/vespa/storage/distributor/CMakeLists.txt
index 4f6dd7da153..3b8843d39ea 100644
--- a/storage/src/vespa/storage/distributor/CMakeLists.txt
+++ b/storage/src/vespa/storage/distributor/CMakeLists.txt
@@ -4,6 +4,7 @@ vespa_add_library(storage_distributor
distributor.cpp
distributormetricsset.cpp
visitormetricsset.cpp
+ persistence_operation_metric_set.cpp
idealstatemetricsset.cpp
operationowner.cpp
distributorcomponent.cpp
diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.cpp b/storage/src/vespa/storage/distributor/distributormetricsset.cpp
index 0760ab6223d..f842ed771c9 100644
--- a/storage/src/vespa/storage/distributor/distributormetricsset.cpp
+++ b/storage/src/vespa/storage/distributor/distributormetricsset.cpp
@@ -7,65 +7,6 @@ namespace storage {
using metrics::MetricSet;
-PersistenceFailuresMetricSet::PersistenceFailuresMetricSet(MetricSet* owner)
- : MetricSet("failures", "", "Detailed failure statistics", owner),
- sum("total", "logdefault yamasdefault", "Sum of all failures", this),
- notready("notready", "", "The number of operations discarded because distributor was not ready", this),
- notconnected("notconnected", "", "The number of operations discarded because there were no available storage nodes to send to", this),
- wrongdistributor("wrongdistributor", "", "The number of operations discarded because they were sent to the wrong distributor", this),
- safe_time_not_reached("safe_time_not_reached", "",
- "The number of operations that were transiently"
- " failed due to them arriving before the safe "
- "time point for bucket ownership handovers has "
- "passed", this),
- storagefailure("storagefailure", "", "The number of operations that failed in storage", this),
- timeout("timeout", "", "The number of operations that failed because the operation timed out towards storage", this),
- busy("busy", "", "The number of messages from storage that failed because the storage node was busy", this),
- notfound("notfound", "", "The number of operations that failed because the document did not exist", this)
-{
- sum.addMetricToSum(notready);
- sum.addMetricToSum(notconnected);
- sum.addMetricToSum(wrongdistributor);
- sum.addMetricToSum(safe_time_not_reached);
- sum.addMetricToSum(storagefailure);
- sum.addMetricToSum(timeout);
- sum.addMetricToSum(busy);
- sum.addMetricToSum(notfound);
-}
-PersistenceFailuresMetricSet::~PersistenceFailuresMetricSet() { }
-
-MetricSet *
-PersistenceFailuresMetricSet::clone(std::vector<Metric::LP>& ownerList, CopyType copyType,
- MetricSet* owner, bool includeUnused) const
-{
- if (copyType == INACTIVE) {
- return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused);
- }
- return (PersistenceFailuresMetricSet*)
- (new PersistenceFailuresMetricSet(owner))->assignValues(*this);
-}
-
-PersistenceOperationMetricSet::PersistenceOperationMetricSet(const std::string& name, MetricSet* owner)
- : MetricSet(name, "", vespalib::make_string("Statistics for the %s command", name.c_str()), owner, "operationtype"),
- latency("latency", "yamasdefault", vespalib::make_string("The average latency of %s operations", name.c_str()), this),
- ok("ok", "logdefault yamasdefault", vespalib::make_string("The number of successful %s operations performed", name.c_str()), this),
- failures(this)
-{ }
-
-PersistenceOperationMetricSet::~PersistenceOperationMetricSet() { }
-
-MetricSet *
-PersistenceOperationMetricSet::clone(std::vector<Metric::LP>& ownerList, CopyType copyType,
- MetricSet* owner, bool includeUnused) const
-{
- if (copyType == INACTIVE) {
- return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused);
- }
- return (PersistenceOperationMetricSet*)
- (new PersistenceOperationMetricSet(getName(), owner))
- ->assignValues(*this);
-}
-
DistributorMetricSet::DistributorMetricSet(const metrics::LoadTypeSet& lt)
: MetricSet("distributor", "distributor", ""),
puts(lt, PersistenceOperationMetricSet("puts"), this),
@@ -94,7 +35,7 @@ DistributorMetricSet::DistributorMetricSet(const metrics::LoadTypeSet& lt)
DistributorMetricSet::~DistributorMetricSet() { }
-}
+} // storage
template class metrics::LoadMetric<storage::PersistenceOperationMetricSet>;
template class metrics::SumMetric<storage::PersistenceOperationMetricSet>;
diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.h b/storage/src/vespa/storage/distributor/distributormetricsset.h
index b8458d119e7..705e7305e0d 100644
--- a/storage/src/vespa/storage/distributor/distributormetricsset.h
+++ b/storage/src/vespa/storage/distributor/distributormetricsset.h
@@ -1,46 +1,13 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
+#include "persistence_operation_metric_set.h"
#include "visitormetricsset.h"
#include <vespa/metrics/metrics.h>
#include <vespa/documentapi/loadtypes/loadtypeset.h>
namespace storage {
-class PersistenceFailuresMetricSet : public metrics::MetricSet
-{
-public:
- PersistenceFailuresMetricSet(metrics::MetricSet* owner);
- ~PersistenceFailuresMetricSet();
-
- metrics::SumMetric<metrics::LongCountMetric> sum;
- metrics::LongCountMetric notready;
- metrics::LongCountMetric notconnected;
- metrics::LongCountMetric wrongdistributor;
- metrics::LongCountMetric safe_time_not_reached;
- metrics::LongCountMetric storagefailure;
- metrics::LongCountMetric timeout;
- metrics::LongCountMetric busy;
- metrics::LongCountMetric notfound;
-
- MetricSet * clone(std::vector<Metric::LP>& ownerList, CopyType copyType,
- metrics::MetricSet* owner, bool includeUnused) const;
-};
-
-class PersistenceOperationMetricSet : public metrics::MetricSet
-{
-public:
- metrics::DoubleAverageMetric latency;
- metrics::LongCountMetric ok;
- PersistenceFailuresMetricSet failures;
-
- PersistenceOperationMetricSet(const std::string& name, metrics::MetricSet* owner = nullptr);
- ~PersistenceOperationMetricSet();
-
- MetricSet * clone(std::vector<Metric::LP>& ownerList, CopyType copyType,
- metrics::MetricSet* owner, bool includeUnused) const override;
-};
-
class DistributorMetricSet : public metrics::MetricSet
{
public:
diff --git a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp
index 7bb99ea4b43..1bcc6abafb6 100644
--- a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp
@@ -983,14 +983,30 @@ VisitorOperation::sendReply(const api::ReturnCode& code, DistributorMessageSende
code.toString().c_str(),
_msg->getInstanceId().c_str(), _msg->getMsgId());
+ updateReplyMetrics(code);
sender.sendReply(reply);
- _metrics.latency.addValue(_operationTimer.getElapsedTimeAsDouble());
_sentReply = true;
}
}
void
+VisitorOperation::updateReplyMetrics(const api::ReturnCode& result)
+{
+ _metrics.updateFromResult(result);
+ // WrongDistributionReply happens as a normal and expected part of a visitor
+ // session's lifetime. If we pollute the metrics with measurements taken
+ // from such replies, the averages will not be representative.
+ if (result.getResult() == api::ReturnCode::WRONG_DISTRIBUTION) {
+ return;
+ }
+ _metrics.latency.addValue(_operationTimer.getElapsedTimeAsDouble());
+ _metrics.buckets_per_visitor.inc(_visitorStatistics.getBucketsVisited());
+ _metrics.docs_per_visitor.inc(_visitorStatistics.getDocumentsVisited());
+ _metrics.bytes_per_visitor.inc(_visitorStatistics.getBytesVisited());
+}
+
+void
VisitorOperation::onClose(DistributorMessageSender& sender)
{
sendReply(api::ReturnCode(api::ReturnCode::ABORTED, "Process is shutting down"),
diff --git a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h
index 1824610355d..6fd18d8d01e 100644
--- a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h
+++ b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h
@@ -90,6 +90,7 @@ private:
typedef std::map<uint64_t, api::CreateVisitorCommand::SP> SentMessagesMap;
void sendReply(const api::ReturnCode& code, DistributorMessageSender& sender);
+ void updateReplyMetrics(const api::ReturnCode& result);
void verifyDistributorsAreAvailable();
void verifyVisitorDistributionBitCount(const document::BucketId&);
void verifyDistributorIsNotDown(const lib::ClusterState&);
diff --git a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp
new file mode 100644
index 00000000000..b7766713b01
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp
@@ -0,0 +1,98 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "distributormetricsset.h"
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/metrics/loadmetric.hpp>
+#include <vespa/metrics/summetric.hpp>
+
+namespace storage {
+
+using metrics::MetricSet;
+
+PersistenceFailuresMetricSet::PersistenceFailuresMetricSet(MetricSet* owner)
+ : MetricSet("failures", "", "Detailed failure statistics", owner),
+ sum("total", "logdefault yamasdefault", "Sum of all failures", this),
+ notready("notready", "", "The number of operations discarded because distributor was not ready", this),
+ notconnected("notconnected", "", "The number of operations discarded because there were no available storage nodes to send to", this),
+ wrongdistributor("wrongdistributor", "", "The number of operations discarded because they were sent to the wrong distributor", this),
+ safe_time_not_reached("safe_time_not_reached", "",
+ "The number of operations that were transiently"
+ " failed due to them arriving before the safe "
+ "time point for bucket ownership handovers has "
+ "passed", this),
+ storagefailure("storagefailure", "", "The number of operations that failed in storage", this),
+ timeout("timeout", "", "The number of operations that failed because the operation timed out towards storage", this),
+ busy("busy", "", "The number of messages from storage that failed because the storage node was busy", this),
+ inconsistent_bucket("inconsistent_bucket", "",
+ "The number of operations failed due to buckets "
+ "being in an inconsistent state or not found", this),
+ notfound("notfound", "", "The number of operations that failed because the document did not exist", this)
+{
+ sum.addMetricToSum(notready);
+ sum.addMetricToSum(notconnected);
+ sum.addMetricToSum(wrongdistributor);
+ sum.addMetricToSum(safe_time_not_reached);
+ sum.addMetricToSum(storagefailure);
+ sum.addMetricToSum(timeout);
+ sum.addMetricToSum(busy);
+ sum.addMetricToSum(inconsistent_bucket);
+ sum.addMetricToSum(notfound);
+}
+PersistenceFailuresMetricSet::~PersistenceFailuresMetricSet() { }
+
+MetricSet *
+PersistenceFailuresMetricSet::clone(std::vector<Metric::LP>& ownerList, CopyType copyType,
+ MetricSet* owner, bool includeUnused) const
+{
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (PersistenceFailuresMetricSet*)
+ (new PersistenceFailuresMetricSet(owner))->assignValues(*this);
+}
+
+PersistenceOperationMetricSet::PersistenceOperationMetricSet(const std::string& name, MetricSet* owner)
+ : MetricSet(name, "", vespalib::make_string("Statistics for the %s command", name.c_str()), owner, "operationtype"),
+ latency("latency", "yamasdefault", vespalib::make_string("The average latency of %s operations", name.c_str()), this),
+ ok("ok", "logdefault yamasdefault", vespalib::make_string("The number of successful %s operations performed", name.c_str()), this),
+ failures(this)
+{ }
+
+PersistenceOperationMetricSet::~PersistenceOperationMetricSet() { }
+
+MetricSet *
+PersistenceOperationMetricSet::clone(std::vector<Metric::LP>& ownerList, CopyType copyType,
+ MetricSet* owner, bool includeUnused) const
+{
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (PersistenceOperationMetricSet*)
+ (new PersistenceOperationMetricSet(getName(), owner))
+ ->assignValues(*this);
+}
+
+void
+PersistenceOperationMetricSet::updateFromResult(const api::ReturnCode& result)
+{
+ if (result.success()) {
+ ++ok;
+ } else if (result.getResult() == api::ReturnCode::WRONG_DISTRIBUTION) {
+ ++failures.wrongdistributor;
+ } else if (result.getResult() == api::ReturnCode::TIMEOUT) {
+ ++failures.timeout;
+ } else if (result.isBusy()) {
+ ++failures.busy;
+ } else if (result.isBucketDisappearance()) {
+ // Bucket not found/deleted codes imply that replicas are transiently
+ // inconsistent in our DB or across replica nodes.
+ ++failures.inconsistent_bucket;
+ } else if (result.isNodeDownOrNetwork()) {
+ ++failures.notconnected;
+ } else {
+ ++failures.storagefailure;
+ }
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h
new file mode 100644
index 00000000000..221662a2ba6
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h
@@ -0,0 +1,57 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/documentapi/loadtypes/loadtypeset.h>
+
+namespace storage {
+
+namespace api {
+class ReturnCode;
+}
+
+class PersistenceFailuresMetricSet : public metrics::MetricSet
+{
+public:
+ PersistenceFailuresMetricSet(metrics::MetricSet* owner);
+ ~PersistenceFailuresMetricSet();
+
+ metrics::SumMetric<metrics::LongCountMetric> sum;
+ metrics::LongCountMetric notready;
+ metrics::LongCountMetric notconnected;
+ metrics::LongCountMetric wrongdistributor;
+ metrics::LongCountMetric safe_time_not_reached;
+ metrics::LongCountMetric storagefailure;
+ metrics::LongCountMetric timeout;
+ metrics::LongCountMetric busy;
+ metrics::LongCountMetric inconsistent_bucket;
+ metrics::LongCountMetric notfound;
+
+ MetricSet * clone(std::vector<Metric::LP>& ownerList, CopyType copyType,
+ metrics::MetricSet* owner, bool includeUnused) const;
+};
+
+class PersistenceOperationMetricSet : public metrics::MetricSet
+{
+public:
+ metrics::DoubleAverageMetric latency;
+ metrics::LongCountMetric ok;
+ PersistenceFailuresMetricSet failures;
+
+ PersistenceOperationMetricSet(const std::string& name, metrics::MetricSet* owner = nullptr);
+ ~PersistenceOperationMetricSet();
+
+ MetricSet * clone(std::vector<Metric::LP>& ownerList, CopyType copyType,
+ metrics::MetricSet* owner, bool includeUnused) const override;
+
+ /**
+ * Increments appropriate success/failure count metrics based on the return
+ * code provided in `result`.
+ *
+ * Does _not_ update latency metric.
+ */
+ void updateFromResult(const api::ReturnCode& result);
+};
+
+}
diff --git a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp
index 34193bbb3c1..4ea765a2d4a 100644
--- a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp
+++ b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp
@@ -21,11 +21,11 @@ PersistenceMessageTrackerImpl::PersistenceMessageTrackerImpl(
_metric(metric),
_reply(reply),
_manager(link),
- _success(true),
_revertTimestamp(revertTimestamp),
- _priority(reply->getPriority())
+ _requestTimer(link.getClock()),
+ _priority(reply->getPriority()),
+ _success(true)
{
- _creationTime.SetNow();
}
void
@@ -51,18 +51,8 @@ void
PersistenceMessageTrackerImpl::updateMetrics()
{
const api::ReturnCode& result(_reply->getResult());
- if (result.success()) {
- ++_metric.ok;
- } else if (result.getResult() == api::ReturnCode::TIMEOUT) {
- ++_metric.failures.timeout;
- } else if (result.isBusy()) {
- ++_metric.failures.busy;
- } else if (result.isNodeDownOrNetwork()) {
- ++_metric.failures.notconnected;
- } else {
- ++_metric.failures.storagefailure;
- }
- _metric.latency.addValue(_creationTime.MilliSecsToNow());
+ _metric.updateFromResult(result);
+ _metric.latency.addValue(_requestTimer.getElapsedTimeAsDouble());
}
void
diff --git a/storage/src/vespa/storage/distributor/persistencemessagetracker.h b/storage/src/vespa/storage/distributor/persistencemessagetracker.h
index b5c3e57e05e..06c8bd1135c 100644
--- a/storage/src/vespa/storage/distributor/persistencemessagetracker.h
+++ b/storage/src/vespa/storage/distributor/persistencemessagetracker.h
@@ -78,12 +78,12 @@ private:
PersistenceOperationMetricSet& _metric;
std::shared_ptr<api::BucketInfoReply> _reply;
DistributorComponent& _manager;
- FastOS_Time _creationTime;
- bool _success;
api::Timestamp _revertTimestamp;
std::vector<std::pair<document::BucketId, uint16_t> > _revertNodes;
mbus::TraceNode _trace;
+ framework::MilliSecTimer _requestTimer;
uint8_t _priority;
+ bool _success;
bool canSendReplyEarly() const;
void addBucketInfoFromReply(uint16_t node,
diff --git a/storage/src/vespa/storage/distributor/visitormetricsset.cpp b/storage/src/vespa/storage/distributor/visitormetricsset.cpp
index ae8aeeb8463..fb2bab51d1a 100644
--- a/storage/src/vespa/storage/distributor/visitormetricsset.cpp
+++ b/storage/src/vespa/storage/distributor/visitormetricsset.cpp
@@ -9,10 +9,18 @@ namespace storage {
using metrics::MetricSet;
VisitorMetricSet::VisitorMetricSet(MetricSet* owner)
- : MetricSet("visitor", "visitor", "", owner),
- latency("latency", "", "Latency of visitor (in ms)", this),
- failed("failed", "", "Number of visitors that failed or were aborted by the user", this)
-{ }
+ : PersistenceOperationMetricSet("visitor", owner),
+ buckets_per_visitor("buckets_per_visitor", "",
+ "The number of sub buckets visited as part of a "
+ "single client visitor command", this),
+ docs_per_visitor("docs_per_visitor", "",
+ "The number of documents visited on content nodes as "
+ "part of a single client visitor command", this),
+ bytes_per_visitor("bytes_per_visitor", "",
+ "The number of bytes visited on content nodes as part "
+ "of a single client visitor command", this)
+{
+}
VisitorMetricSet::~VisitorMetricSet() { }
diff --git a/storage/src/vespa/storage/distributor/visitormetricsset.h b/storage/src/vespa/storage/distributor/visitormetricsset.h
index 24a3bbcce2f..34d65b9ec07 100644
--- a/storage/src/vespa/storage/distributor/visitormetricsset.h
+++ b/storage/src/vespa/storage/distributor/visitormetricsset.h
@@ -1,14 +1,16 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
+#include "persistence_operation_metric_set.h"
#include <vespa/metrics/metrics.h>
#include <vespa/documentapi/loadtypes/loadtypeset.h>
namespace storage {
-struct VisitorMetricSet : public metrics::MetricSet {
- metrics::DoubleAverageMetric latency;
- metrics::LongCountMetric failed;
+struct VisitorMetricSet : public PersistenceOperationMetricSet {
+ metrics::LongAverageMetric buckets_per_visitor;
+ metrics::LongAverageMetric docs_per_visitor;
+ metrics::LongAverageMetric bytes_per_visitor;
VisitorMetricSet(MetricSet* owner = nullptr);
~VisitorMetricSet();