diff options
author | Tor Brede Vekterli <vekterli@yahoo-inc.com> | 2016-12-07 10:57:03 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-12-07 10:57:03 +0100 |
commit | 88a5a58938c773bc702d8f2d918f88a9c4482fc6 (patch) | |
tree | 084ead8dc0cac3c1fb17a368c8d322186bbe7ddf /storage | |
parent | 0ac893aa3398834ac6d0556be8cf8f57dd0f951c (diff) | |
parent | ba1bd01788166317d75805b976475297ce9794c7 (diff) |
Merge pull request #1254 from yahoo/vekterli/add-more-distributor-visiting-metrics
Add more distributor visiting metrics
Diffstat (limited to 'storage')
14 files changed, 371 insertions, 119 deletions
diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt index 600a8231a8e..2676a31e832 100644 --- a/storage/src/tests/distributor/CMakeLists.txt +++ b/storage/src/tests/distributor/CMakeLists.txt @@ -39,6 +39,7 @@ vespa_add_library(storage_testdistributor TEST nodemaintenancestatstrackertest.cpp distributor_host_info_reporter_test.cpp ownership_transfer_safe_time_point_calculator_test.cpp + persistence_metrics_set_test.cpp DEPENDS storage_distributor storage_testcommon diff --git a/storage/src/tests/distributor/persistence_metrics_set_test.cpp b/storage/src/tests/distributor/persistence_metrics_set_test.cpp new file mode 100644 index 00000000000..131a588413b --- /dev/null +++ b/storage/src/tests/distributor/persistence_metrics_set_test.cpp @@ -0,0 +1,92 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/storage/distributor/distributormetricsset.h> +#include <vespa/storageapi/messageapi/returncode.h> +#include <vespa/vdstestlib/cppunit/macros.h> + +namespace storage { +namespace distributor { + +struct PersistenceMetricsSetTest : CppUnit::TestFixture { + void successful_return_codes_are_counted_as_ok(); + void wrong_distribution_failure_is_counted(); + void timeout_failure_is_counted(); + // Note for these tests: busy, connection failures et al are sets of + // failure codes and not just a single code. We only test certain members + // of these sets here. See api::ReturnCode implementation for an exhaustive + // list. + void busy_failure_is_counted(); + void connection_failure_is_counted(); + void inconsistent_bucket_is_counted(); + void non_special_cased_failure_codes_are_catchall_counted(); + + CPPUNIT_TEST_SUITE(PersistenceMetricsSetTest); + CPPUNIT_TEST(successful_return_codes_are_counted_as_ok); + CPPUNIT_TEST(wrong_distribution_failure_is_counted); + CPPUNIT_TEST(timeout_failure_is_counted); + CPPUNIT_TEST(busy_failure_is_counted); + CPPUNIT_TEST(connection_failure_is_counted); + CPPUNIT_TEST(inconsistent_bucket_is_counted); + CPPUNIT_TEST(non_special_cased_failure_codes_are_catchall_counted); + CPPUNIT_TEST_SUITE_END(); + + void assert_failure_is_counted(PersistenceOperationMetricSet& metrics, + api::ReturnCode::Result failure_code, + const metrics::LongCountMetric& checked) + { + metrics.updateFromResult(api::ReturnCode(failure_code)); + CPPUNIT_ASSERT_EQUAL(int64_t(1), checked.getLongValue("count")); + CPPUNIT_ASSERT_EQUAL(int64_t(0), metrics.ok.getLongValue("count")); + } +}; + +CPPUNIT_TEST_SUITE_REGISTRATION(PersistenceMetricsSetTest); + +void PersistenceMetricsSetTest::successful_return_codes_are_counted_as_ok() { + PersistenceOperationMetricSet metrics("foo"); + metrics.updateFromResult(api::ReturnCode()); + CPPUNIT_ASSERT_EQUAL(int64_t(1), metrics.ok.getLongValue("count")); +} + +void PersistenceMetricsSetTest::wrong_distribution_failure_is_counted() { + PersistenceOperationMetricSet metrics("foo"); + assert_failure_is_counted(metrics, api::ReturnCode::WRONG_DISTRIBUTION, + metrics.failures.wrongdistributor); +} + +void PersistenceMetricsSetTest::timeout_failure_is_counted() { + PersistenceOperationMetricSet metrics("foo"); + assert_failure_is_counted(metrics, api::ReturnCode::TIMEOUT, + metrics.failures.timeout); +} + +void PersistenceMetricsSetTest::busy_failure_is_counted() { + PersistenceOperationMetricSet metrics("foo"); + assert_failure_is_counted(metrics, api::ReturnCode::BUSY, + metrics.failures.busy); +} + +void PersistenceMetricsSetTest::connection_failure_is_counted() { + PersistenceOperationMetricSet metrics("foo"); + // This is dirty enum value coercion, but this is how "parent protocol" + // error codes are handled already. + api::ReturnCode::Result error_code(static_cast<api::ReturnCode::Result>( + mbus::ErrorCode::CONNECTION_ERROR)); + assert_failure_is_counted(metrics, error_code, + metrics.failures.notconnected); +} + +void PersistenceMetricsSetTest::inconsistent_bucket_is_counted() { + PersistenceOperationMetricSet metrics("foo"); + assert_failure_is_counted(metrics, api::ReturnCode::BUCKET_NOT_FOUND, + metrics.failures.inconsistent_bucket); +} + +void PersistenceMetricsSetTest::non_special_cased_failure_codes_are_catchall_counted() { + PersistenceOperationMetricSet metrics("foo"); + assert_failure_is_counted(metrics, api::ReturnCode::REJECTED, + metrics.failures.storagefailure); +} + +} +} diff --git a/storage/src/tests/distributor/visitoroperationtest.cpp b/storage/src/tests/distributor/visitoroperationtest.cpp index 9b674029959..7bd9eea29b0 100644 --- a/storage/src/tests/distributor/visitoroperationtest.cpp +++ b/storage/src/tests/distributor/visitoroperationtest.cpp @@ -64,6 +64,8 @@ class VisitorOperationTest : public CppUnit::TestFixture, CPPUNIT_TEST(testNoClientReplyBeforeAllStorageRepliesReceived); CPPUNIT_TEST(testSkipFailedSubBucketsWhenVisitingInconsistent); CPPUNIT_TEST(testQueueTimeoutIsFactorOfTotalTimeout); + CPPUNIT_TEST(metrics_are_updated_with_visitor_statistics_upon_replying); + CPPUNIT_TEST(statistical_metrics_not_updated_on_wrong_distribution); CPPUNIT_TEST_SUITE_END(); protected: @@ -109,6 +111,8 @@ protected: void testNoClientReplyBeforeAllStorageRepliesReceived(); void testSkipFailedSubBucketsWhenVisitingInconsistent(); void testQueueTimeoutIsFactorOfTotalTimeout(); + void metrics_are_updated_with_visitor_statistics_upon_replying(); + void statistical_metrics_not_updated_on_wrong_distribution(); public: VisitorOperationTest() : defaultConfig(framework::MilliSecTime(0), @@ -188,6 +192,10 @@ private: return ost.str(); } + VisitorMetricSet& defaultVisitorMetrics() { + return getDistributor().getMetrics().visits[documentapi::LoadType::DEFAULT]; + } + std::unique_ptr<VisitorOperation> createOpWithConfig( api::CreateVisitorCommand::SP msg, const VisitorOperation::Config& config) @@ -233,6 +241,8 @@ private: std::unique_ptr<VisitorOperation> startOperationWith2StorageNodeVisitors( bool inconsistent); + + void do_visitor_roundtrip_with_statistics(const api::ReturnCode& result); }; CPPUNIT_TEST_SUITE_REGISTRATION(VisitorOperationTest); @@ -307,6 +317,8 @@ VisitorOperationTest::doStandardVisitTest(const std::string& clusterState) "last=BucketId(0x000000007fffffff)) " "ReturnCode(NONE)"), _sender.getLastReply()); + CPPUNIT_ASSERT_EQUAL(int64_t(1), defaultVisitorMetrics(). + ok.getLongValue("count")); } void @@ -603,6 +615,8 @@ VisitorOperationTest::testBucketRemovedWhileVisitorPending() std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) " "ReturnCode(BUCKET_NOT_FOUND)"), _sender.getLastReply()); + CPPUNIT_ASSERT_EQUAL(int64_t(1), defaultVisitorMetrics().failures. + inconsistent_bucket.getLongValue("count")); } void @@ -725,6 +739,8 @@ VisitorOperationTest::testTimeoutDoesNotOverrideCriticalError() "CreateVisitorReply(last=BucketId(0x0000000000000000)) " "ReturnCode(INTERNAL_FAILURE, [from content node 0] )"s, _sender.getLastReply()); + CPPUNIT_ASSERT_EQUAL(int64_t(1), defaultVisitorMetrics().failures. + storagefailure.getLongValue("count")); } void @@ -737,6 +753,8 @@ VisitorOperationTest::testWrongDistribution() std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) " "ReturnCode(WRONG_DISTRIBUTION, distributor:100 storage:2)"), runEmptyVisitor(createVisitorCommand("wrongdist", id, nullId))); + CPPUNIT_ASSERT_EQUAL(int64_t(1), defaultVisitorMetrics().failures. + wrongdistributor.getLongValue("count")); } void @@ -1181,6 +1199,9 @@ VisitorOperationTest::testFailureOnAllNodes() std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) " "ReturnCode(BUCKET_NOT_FOUND)"), _sender.getLastReply()); + // TODO it'd be much more accurate to increase the "notconnected" metric + // here, but our metrics are currently based on the reply sent back to the + // client, not the ones sent from the content nodes to the distributor. } @@ -1606,5 +1627,62 @@ VisitorOperationTest::testQueueTimeoutIsFactorOfTotalTimeout() CPPUNIT_ASSERT_EQUAL(uint32_t(5000), cmd.getQueueTimeout()); } +void +VisitorOperationTest::do_visitor_roundtrip_with_statistics( + const api::ReturnCode& result) +{ + document::BucketId id(0x400000000000007bULL); + _distributor->enableClusterState(ClusterState("distributor:1 storage:1")); + addNodesToBucketDB(id, "0=1/1/1/t"); + + auto op = createOpWithDefaultConfig( + createVisitorCommand("metricstats", id, nullId)); + + op->start(_sender, framework::MilliSecTime(0)); + CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"), + _sender.getCommands(true)); + auto& cmd(dynamic_cast<CreateVisitorCommand&>(*_sender.commands[0])); + auto reply = cmd.makeReply(); + vdslib::VisitorStatistics stats; + stats.setBucketsVisited(50); + stats.setDocumentsVisited(100); + stats.setBytesVisited(2000); + static_cast<CreateVisitorReply&>(*reply).setVisitorStatistics(stats); + reply->setResult(result); + + op->receive(_sender, api::StorageReply::SP(std::move(reply))); +} + +void +VisitorOperationTest::metrics_are_updated_with_visitor_statistics_upon_replying() +{ + do_visitor_roundtrip_with_statistics(api::ReturnCode(api::ReturnCode::OK)); + + CPPUNIT_ASSERT_EQUAL(int64_t(50), defaultVisitorMetrics().buckets_per_visitor.getLast()); + CPPUNIT_ASSERT_EQUAL(int64_t(100), defaultVisitorMetrics().docs_per_visitor.getLast()); + CPPUNIT_ASSERT_EQUAL(int64_t(2000), defaultVisitorMetrics().bytes_per_visitor.getLast()); +} + +void +VisitorOperationTest::statistical_metrics_not_updated_on_wrong_distribution() +{ + setupDistributor(1, 100, "distributor:100 storage:2"); + + document::BucketId id(uint64_t(0x400000000000127b)); + CPPUNIT_ASSERT_EQUAL( + std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) " + "ReturnCode(WRONG_DISTRIBUTION, distributor:100 storage:2)"), + runEmptyVisitor(createVisitorCommand("wrongdist", id, nullId))); + + // Note that we're testing the number of _times_ the metric has been + // updated, not the value with which it's been updated (which would be zero + // even in the case we actually did update the statistical metrics). + CPPUNIT_ASSERT_EQUAL(int64_t(0), defaultVisitorMetrics().buckets_per_visitor.getCount()); + CPPUNIT_ASSERT_EQUAL(int64_t(0), defaultVisitorMetrics().docs_per_visitor.getCount()); + CPPUNIT_ASSERT_EQUAL(int64_t(0), defaultVisitorMetrics().bytes_per_visitor.getCount()); + // Fascinating that count is also a double... + CPPUNIT_ASSERT_EQUAL(0.0, defaultVisitorMetrics().latency.getCount()); +} + } // distributor } // storage diff --git a/storage/src/vespa/storage/distributor/CMakeLists.txt b/storage/src/vespa/storage/distributor/CMakeLists.txt index 4f6dd7da153..3b8843d39ea 100644 --- a/storage/src/vespa/storage/distributor/CMakeLists.txt +++ b/storage/src/vespa/storage/distributor/CMakeLists.txt @@ -4,6 +4,7 @@ vespa_add_library(storage_distributor distributor.cpp distributormetricsset.cpp visitormetricsset.cpp + persistence_operation_metric_set.cpp idealstatemetricsset.cpp operationowner.cpp distributorcomponent.cpp diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.cpp b/storage/src/vespa/storage/distributor/distributormetricsset.cpp index 0760ab6223d..f842ed771c9 100644 --- a/storage/src/vespa/storage/distributor/distributormetricsset.cpp +++ b/storage/src/vespa/storage/distributor/distributormetricsset.cpp @@ -7,65 +7,6 @@ namespace storage { using metrics::MetricSet; -PersistenceFailuresMetricSet::PersistenceFailuresMetricSet(MetricSet* owner) - : MetricSet("failures", "", "Detailed failure statistics", owner), - sum("total", "logdefault yamasdefault", "Sum of all failures", this), - notready("notready", "", "The number of operations discarded because distributor was not ready", this), - notconnected("notconnected", "", "The number of operations discarded because there were no available storage nodes to send to", this), - wrongdistributor("wrongdistributor", "", "The number of operations discarded because they were sent to the wrong distributor", this), - safe_time_not_reached("safe_time_not_reached", "", - "The number of operations that were transiently" - " failed due to them arriving before the safe " - "time point for bucket ownership handovers has " - "passed", this), - storagefailure("storagefailure", "", "The number of operations that failed in storage", this), - timeout("timeout", "", "The number of operations that failed because the operation timed out towards storage", this), - busy("busy", "", "The number of messages from storage that failed because the storage node was busy", this), - notfound("notfound", "", "The number of operations that failed because the document did not exist", this) -{ - sum.addMetricToSum(notready); - sum.addMetricToSum(notconnected); - sum.addMetricToSum(wrongdistributor); - sum.addMetricToSum(safe_time_not_reached); - sum.addMetricToSum(storagefailure); - sum.addMetricToSum(timeout); - sum.addMetricToSum(busy); - sum.addMetricToSum(notfound); -} -PersistenceFailuresMetricSet::~PersistenceFailuresMetricSet() { } - -MetricSet * -PersistenceFailuresMetricSet::clone(std::vector<Metric::LP>& ownerList, CopyType copyType, - MetricSet* owner, bool includeUnused) const -{ - if (copyType == INACTIVE) { - return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused); - } - return (PersistenceFailuresMetricSet*) - (new PersistenceFailuresMetricSet(owner))->assignValues(*this); -} - -PersistenceOperationMetricSet::PersistenceOperationMetricSet(const std::string& name, MetricSet* owner) - : MetricSet(name, "", vespalib::make_string("Statistics for the %s command", name.c_str()), owner, "operationtype"), - latency("latency", "yamasdefault", vespalib::make_string("The average latency of %s operations", name.c_str()), this), - ok("ok", "logdefault yamasdefault", vespalib::make_string("The number of successful %s operations performed", name.c_str()), this), - failures(this) -{ } - -PersistenceOperationMetricSet::~PersistenceOperationMetricSet() { } - -MetricSet * -PersistenceOperationMetricSet::clone(std::vector<Metric::LP>& ownerList, CopyType copyType, - MetricSet* owner, bool includeUnused) const -{ - if (copyType == INACTIVE) { - return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused); - } - return (PersistenceOperationMetricSet*) - (new PersistenceOperationMetricSet(getName(), owner)) - ->assignValues(*this); -} - DistributorMetricSet::DistributorMetricSet(const metrics::LoadTypeSet& lt) : MetricSet("distributor", "distributor", ""), puts(lt, PersistenceOperationMetricSet("puts"), this), @@ -94,7 +35,7 @@ DistributorMetricSet::DistributorMetricSet(const metrics::LoadTypeSet& lt) DistributorMetricSet::~DistributorMetricSet() { } -} +} // storage template class metrics::LoadMetric<storage::PersistenceOperationMetricSet>; template class metrics::SumMetric<storage::PersistenceOperationMetricSet>; diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.h b/storage/src/vespa/storage/distributor/distributormetricsset.h index b8458d119e7..705e7305e0d 100644 --- a/storage/src/vespa/storage/distributor/distributormetricsset.h +++ b/storage/src/vespa/storage/distributor/distributormetricsset.h @@ -1,46 +1,13 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once +#include "persistence_operation_metric_set.h" #include "visitormetricsset.h" #include <vespa/metrics/metrics.h> #include <vespa/documentapi/loadtypes/loadtypeset.h> namespace storage { -class PersistenceFailuresMetricSet : public metrics::MetricSet -{ -public: - PersistenceFailuresMetricSet(metrics::MetricSet* owner); - ~PersistenceFailuresMetricSet(); - - metrics::SumMetric<metrics::LongCountMetric> sum; - metrics::LongCountMetric notready; - metrics::LongCountMetric notconnected; - metrics::LongCountMetric wrongdistributor; - metrics::LongCountMetric safe_time_not_reached; - metrics::LongCountMetric storagefailure; - metrics::LongCountMetric timeout; - metrics::LongCountMetric busy; - metrics::LongCountMetric notfound; - - MetricSet * clone(std::vector<Metric::LP>& ownerList, CopyType copyType, - metrics::MetricSet* owner, bool includeUnused) const; -}; - -class PersistenceOperationMetricSet : public metrics::MetricSet -{ -public: - metrics::DoubleAverageMetric latency; - metrics::LongCountMetric ok; - PersistenceFailuresMetricSet failures; - - PersistenceOperationMetricSet(const std::string& name, metrics::MetricSet* owner = nullptr); - ~PersistenceOperationMetricSet(); - - MetricSet * clone(std::vector<Metric::LP>& ownerList, CopyType copyType, - metrics::MetricSet* owner, bool includeUnused) const override; -}; - class DistributorMetricSet : public metrics::MetricSet { public: diff --git a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp index 7bb99ea4b43..1bcc6abafb6 100644 --- a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp +++ b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp @@ -983,14 +983,30 @@ VisitorOperation::sendReply(const api::ReturnCode& code, DistributorMessageSende code.toString().c_str(), _msg->getInstanceId().c_str(), _msg->getMsgId()); + updateReplyMetrics(code); sender.sendReply(reply); - _metrics.latency.addValue(_operationTimer.getElapsedTimeAsDouble()); _sentReply = true; } } void +VisitorOperation::updateReplyMetrics(const api::ReturnCode& result) +{ + _metrics.updateFromResult(result); + // WrongDistributionReply happens as a normal and expected part of a visitor + // session's lifetime. If we pollute the metrics with measurements taken + // from such replies, the averages will not be representative. + if (result.getResult() == api::ReturnCode::WRONG_DISTRIBUTION) { + return; + } + _metrics.latency.addValue(_operationTimer.getElapsedTimeAsDouble()); + _metrics.buckets_per_visitor.inc(_visitorStatistics.getBucketsVisited()); + _metrics.docs_per_visitor.inc(_visitorStatistics.getDocumentsVisited()); + _metrics.bytes_per_visitor.inc(_visitorStatistics.getBytesVisited()); +} + +void VisitorOperation::onClose(DistributorMessageSender& sender) { sendReply(api::ReturnCode(api::ReturnCode::ABORTED, "Process is shutting down"), diff --git a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h index 1824610355d..6fd18d8d01e 100644 --- a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h +++ b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h @@ -90,6 +90,7 @@ private: typedef std::map<uint64_t, api::CreateVisitorCommand::SP> SentMessagesMap; void sendReply(const api::ReturnCode& code, DistributorMessageSender& sender); + void updateReplyMetrics(const api::ReturnCode& result); void verifyDistributorsAreAvailable(); void verifyVisitorDistributionBitCount(const document::BucketId&); void verifyDistributorIsNotDown(const lib::ClusterState&); diff --git a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp new file mode 100644 index 00000000000..b7766713b01 --- /dev/null +++ b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.cpp @@ -0,0 +1,98 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "distributormetricsset.h" +#include <vespa/storageapi/messageapi/returncode.h> +#include <vespa/metrics/loadmetric.hpp> +#include <vespa/metrics/summetric.hpp> + +namespace storage { + +using metrics::MetricSet; + +PersistenceFailuresMetricSet::PersistenceFailuresMetricSet(MetricSet* owner) + : MetricSet("failures", "", "Detailed failure statistics", owner), + sum("total", "logdefault yamasdefault", "Sum of all failures", this), + notready("notready", "", "The number of operations discarded because distributor was not ready", this), + notconnected("notconnected", "", "The number of operations discarded because there were no available storage nodes to send to", this), + wrongdistributor("wrongdistributor", "", "The number of operations discarded because they were sent to the wrong distributor", this), + safe_time_not_reached("safe_time_not_reached", "", + "The number of operations that were transiently" + " failed due to them arriving before the safe " + "time point for bucket ownership handovers has " + "passed", this), + storagefailure("storagefailure", "", "The number of operations that failed in storage", this), + timeout("timeout", "", "The number of operations that failed because the operation timed out towards storage", this), + busy("busy", "", "The number of messages from storage that failed because the storage node was busy", this), + inconsistent_bucket("inconsistent_bucket", "", + "The number of operations failed due to buckets " + "being in an inconsistent state or not found", this), + notfound("notfound", "", "The number of operations that failed because the document did not exist", this) +{ + sum.addMetricToSum(notready); + sum.addMetricToSum(notconnected); + sum.addMetricToSum(wrongdistributor); + sum.addMetricToSum(safe_time_not_reached); + sum.addMetricToSum(storagefailure); + sum.addMetricToSum(timeout); + sum.addMetricToSum(busy); + sum.addMetricToSum(inconsistent_bucket); + sum.addMetricToSum(notfound); +} +PersistenceFailuresMetricSet::~PersistenceFailuresMetricSet() { } + +MetricSet * +PersistenceFailuresMetricSet::clone(std::vector<Metric::LP>& ownerList, CopyType copyType, + MetricSet* owner, bool includeUnused) const +{ + if (copyType == INACTIVE) { + return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused); + } + return (PersistenceFailuresMetricSet*) + (new PersistenceFailuresMetricSet(owner))->assignValues(*this); +} + +PersistenceOperationMetricSet::PersistenceOperationMetricSet(const std::string& name, MetricSet* owner) + : MetricSet(name, "", vespalib::make_string("Statistics for the %s command", name.c_str()), owner, "operationtype"), + latency("latency", "yamasdefault", vespalib::make_string("The average latency of %s operations", name.c_str()), this), + ok("ok", "logdefault yamasdefault", vespalib::make_string("The number of successful %s operations performed", name.c_str()), this), + failures(this) +{ } + +PersistenceOperationMetricSet::~PersistenceOperationMetricSet() { } + +MetricSet * +PersistenceOperationMetricSet::clone(std::vector<Metric::LP>& ownerList, CopyType copyType, + MetricSet* owner, bool includeUnused) const +{ + if (copyType == INACTIVE) { + return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused); + } + return (PersistenceOperationMetricSet*) + (new PersistenceOperationMetricSet(getName(), owner)) + ->assignValues(*this); +} + +void +PersistenceOperationMetricSet::updateFromResult(const api::ReturnCode& result) +{ + if (result.success()) { + ++ok; + } else if (result.getResult() == api::ReturnCode::WRONG_DISTRIBUTION) { + ++failures.wrongdistributor; + } else if (result.getResult() == api::ReturnCode::TIMEOUT) { + ++failures.timeout; + } else if (result.isBusy()) { + ++failures.busy; + } else if (result.isBucketDisappearance()) { + // Bucket not found/deleted codes imply that replicas are transiently + // inconsistent in our DB or across replica nodes. + ++failures.inconsistent_bucket; + } else if (result.isNodeDownOrNetwork()) { + ++failures.notconnected; + } else { + ++failures.storagefailure; + } +} + +} // storage + diff --git a/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h new file mode 100644 index 00000000000..221662a2ba6 --- /dev/null +++ b/storage/src/vespa/storage/distributor/persistence_operation_metric_set.h @@ -0,0 +1,57 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/metrics/metrics.h> +#include <vespa/documentapi/loadtypes/loadtypeset.h> + +namespace storage { + +namespace api { +class ReturnCode; +} + +class PersistenceFailuresMetricSet : public metrics::MetricSet +{ +public: + PersistenceFailuresMetricSet(metrics::MetricSet* owner); + ~PersistenceFailuresMetricSet(); + + metrics::SumMetric<metrics::LongCountMetric> sum; + metrics::LongCountMetric notready; + metrics::LongCountMetric notconnected; + metrics::LongCountMetric wrongdistributor; + metrics::LongCountMetric safe_time_not_reached; + metrics::LongCountMetric storagefailure; + metrics::LongCountMetric timeout; + metrics::LongCountMetric busy; + metrics::LongCountMetric inconsistent_bucket; + metrics::LongCountMetric notfound; + + MetricSet * clone(std::vector<Metric::LP>& ownerList, CopyType copyType, + metrics::MetricSet* owner, bool includeUnused) const; +}; + +class PersistenceOperationMetricSet : public metrics::MetricSet +{ +public: + metrics::DoubleAverageMetric latency; + metrics::LongCountMetric ok; + PersistenceFailuresMetricSet failures; + + PersistenceOperationMetricSet(const std::string& name, metrics::MetricSet* owner = nullptr); + ~PersistenceOperationMetricSet(); + + MetricSet * clone(std::vector<Metric::LP>& ownerList, CopyType copyType, + metrics::MetricSet* owner, bool includeUnused) const override; + + /** + * Increments appropriate success/failure count metrics based on the return + * code provided in `result`. + * + * Does _not_ update latency metric. + */ + void updateFromResult(const api::ReturnCode& result); +}; + +} diff --git a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp index 34193bbb3c1..4ea765a2d4a 100644 --- a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp +++ b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp @@ -21,11 +21,11 @@ PersistenceMessageTrackerImpl::PersistenceMessageTrackerImpl( _metric(metric), _reply(reply), _manager(link), - _success(true), _revertTimestamp(revertTimestamp), - _priority(reply->getPriority()) + _requestTimer(link.getClock()), + _priority(reply->getPriority()), + _success(true) { - _creationTime.SetNow(); } void @@ -51,18 +51,8 @@ void PersistenceMessageTrackerImpl::updateMetrics() { const api::ReturnCode& result(_reply->getResult()); - if (result.success()) { - ++_metric.ok; - } else if (result.getResult() == api::ReturnCode::TIMEOUT) { - ++_metric.failures.timeout; - } else if (result.isBusy()) { - ++_metric.failures.busy; - } else if (result.isNodeDownOrNetwork()) { - ++_metric.failures.notconnected; - } else { - ++_metric.failures.storagefailure; - } - _metric.latency.addValue(_creationTime.MilliSecsToNow()); + _metric.updateFromResult(result); + _metric.latency.addValue(_requestTimer.getElapsedTimeAsDouble()); } void diff --git a/storage/src/vespa/storage/distributor/persistencemessagetracker.h b/storage/src/vespa/storage/distributor/persistencemessagetracker.h index b5c3e57e05e..06c8bd1135c 100644 --- a/storage/src/vespa/storage/distributor/persistencemessagetracker.h +++ b/storage/src/vespa/storage/distributor/persistencemessagetracker.h @@ -78,12 +78,12 @@ private: PersistenceOperationMetricSet& _metric; std::shared_ptr<api::BucketInfoReply> _reply; DistributorComponent& _manager; - FastOS_Time _creationTime; - bool _success; api::Timestamp _revertTimestamp; std::vector<std::pair<document::BucketId, uint16_t> > _revertNodes; mbus::TraceNode _trace; + framework::MilliSecTimer _requestTimer; uint8_t _priority; + bool _success; bool canSendReplyEarly() const; void addBucketInfoFromReply(uint16_t node, diff --git a/storage/src/vespa/storage/distributor/visitormetricsset.cpp b/storage/src/vespa/storage/distributor/visitormetricsset.cpp index ae8aeeb8463..fb2bab51d1a 100644 --- a/storage/src/vespa/storage/distributor/visitormetricsset.cpp +++ b/storage/src/vespa/storage/distributor/visitormetricsset.cpp @@ -9,10 +9,18 @@ namespace storage { using metrics::MetricSet; VisitorMetricSet::VisitorMetricSet(MetricSet* owner) - : MetricSet("visitor", "visitor", "", owner), - latency("latency", "", "Latency of visitor (in ms)", this), - failed("failed", "", "Number of visitors that failed or were aborted by the user", this) -{ } + : PersistenceOperationMetricSet("visitor", owner), + buckets_per_visitor("buckets_per_visitor", "", + "The number of sub buckets visited as part of a " + "single client visitor command", this), + docs_per_visitor("docs_per_visitor", "", + "The number of documents visited on content nodes as " + "part of a single client visitor command", this), + bytes_per_visitor("bytes_per_visitor", "", + "The number of bytes visited on content nodes as part " + "of a single client visitor command", this) +{ +} VisitorMetricSet::~VisitorMetricSet() { } diff --git a/storage/src/vespa/storage/distributor/visitormetricsset.h b/storage/src/vespa/storage/distributor/visitormetricsset.h index 24a3bbcce2f..34d65b9ec07 100644 --- a/storage/src/vespa/storage/distributor/visitormetricsset.h +++ b/storage/src/vespa/storage/distributor/visitormetricsset.h @@ -1,14 +1,16 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once +#include "persistence_operation_metric_set.h" #include <vespa/metrics/metrics.h> #include <vespa/documentapi/loadtypes/loadtypeset.h> namespace storage { -struct VisitorMetricSet : public metrics::MetricSet { - metrics::DoubleAverageMetric latency; - metrics::LongCountMetric failed; +struct VisitorMetricSet : public PersistenceOperationMetricSet { + metrics::LongAverageMetric buckets_per_visitor; + metrics::LongAverageMetric docs_per_visitor; + metrics::LongAverageMetric bytes_per_visitor; VisitorMetricSet(MetricSet* owner = nullptr); ~VisitorMetricSet(); |