diff options
author | Tor Brede Vekterli <vekterli@yahoo-inc.com> | 2017-01-12 17:22:11 +0100 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahoo-inc.com> | 2017-01-12 17:22:11 +0100 |
commit | a2dff014ac0bf54860ef885711e3ae3a6ed8fdee (patch) | |
tree | 9e7f740064f1a5ba7f432c08040f88c100a28c83 /storage | |
parent | d89de4d0f4ef794a36c5e0a0c0b1824f94f4ca5f (diff) |
Track time spent transitioning between cluster states
Allows observing whether state transitions take an unexpected amount of
time.
Diffstat (limited to 'storage')
5 files changed, 120 insertions, 12 deletions
diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp index d76132e24ab..e0c0a76f3e7 100644 --- a/storage/src/tests/distributor/bucketdbupdatertest.cpp +++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp @@ -69,6 +69,10 @@ class BucketDBUpdaterTest : public CppUnit::TestFixture, CPPUNIT_TEST(changed_distributor_set_implies_ownership_transfer); CPPUNIT_TEST(unchanged_distributor_set_implies_no_ownership_transfer); CPPUNIT_TEST(changed_distribution_config_implies_ownership_transfer); + CPPUNIT_TEST(transition_time_tracked_for_single_state_change); + CPPUNIT_TEST(transition_time_reset_across_non_preempting_state_changes); + CPPUNIT_TEST(transition_time_tracked_for_distribution_config_change); + CPPUNIT_TEST(transition_time_tracked_across_preempted_transitions); CPPUNIT_TEST_SUITE_END(); protected: @@ -119,6 +123,10 @@ protected: void changed_distributor_set_implies_ownership_transfer(); void unchanged_distributor_set_implies_no_ownership_transfer(); void changed_distribution_config_implies_ownership_transfer(); + void transition_time_tracked_for_single_state_change(); + void transition_time_reset_across_non_preempting_state_changes(); + void transition_time_tracked_for_distribution_config_change(); + void transition_time_tracked_across_preempted_transitions(); bool bucketExistsThatHasNode(int bucketCount, uint16_t node) const; @@ -302,11 +310,10 @@ public: sortSentMessagesByIndex(_sender, sizeBeforeState); } - void setAndEnableClusterState(const lib::ClusterState& state, - uint32_t expectedMsgs, - uint32_t nBuckets) { - _sender.clear(); - setSystemState(state); + void completeBucketInfoGathering(const lib::ClusterState& state, + uint32_t expectedMsgs, + uint32_t nBuckets = 1) + { CPPUNIT_ASSERT_EQUAL(size_t(expectedMsgs), _sender.commands.size()); for (uint32_t i = 0; i < _sender.commands.size(); i++) { @@ -323,6 +330,29 @@ public: } } + void setAndEnableClusterState(const lib::ClusterState& state, + uint32_t expectedMsgs, + uint32_t nBuckets) + { + _sender.clear(); + setSystemState(state); + completeBucketInfoGathering(state, expectedMsgs, nBuckets); + } + + void completeStateTransitionInSeconds(const std::string& stateStr, + uint32_t seconds, + uint32_t expectedMsgs) + { + _sender.clear(); + lib::ClusterState state(stateStr); + setSystemState(state); + getClock().addSecondsToTime(seconds); + completeBucketInfoGathering(state, expectedMsgs); + } + + uint64_t lastTransitionTimeInMillis() { + return uint64_t(getDistributor().getMetrics().stateTransitionTime.getLast()); + } void setStorageNodes(uint32_t numStorageNodes) { _sender.clear(); @@ -2338,5 +2368,51 @@ BucketDBUpdaterTest::changed_distribution_config_implies_ownership_transfer() CPPUNIT_ASSERT(fixture->state->hasBucketOwnershipTransfer()); } +void +BucketDBUpdaterTest::transition_time_tracked_for_single_state_change() +{ + completeStateTransitionInSeconds("distributor:2 storage:2", 5, 2); + + CPPUNIT_ASSERT_EQUAL(uint64_t(5000), lastTransitionTimeInMillis()); +} + +void +BucketDBUpdaterTest::transition_time_reset_across_non_preempting_state_changes() +{ + completeStateTransitionInSeconds("distributor:2 storage:2", 5, 2); + completeStateTransitionInSeconds("distributor:2 storage:3", 3, 1); + + CPPUNIT_ASSERT_EQUAL(uint64_t(3000), lastTransitionTimeInMillis()); +} + +void +BucketDBUpdaterTest::transition_time_tracked_for_distribution_config_change() +{ + lib::ClusterState state("distributor:2 storage:2"); + setAndEnableClusterState(state, 2, 1); + + _sender.clear(); + std::string distConfig(getDistConfig3Nodes1Group()); + setDistribution(distConfig); + getClock().addSecondsToTime(4); + completeBucketInfoGathering(state, 2); + CPPUNIT_ASSERT_EQUAL(uint64_t(4000), lastTransitionTimeInMillis()); +} + +void +BucketDBUpdaterTest::transition_time_tracked_across_preempted_transitions() +{ + _sender.clear(); + lib::ClusterState state("distributor:2 storage:2"); + setSystemState(state); + getClock().addSecondsToTime(5); + // Pre-empted with new state here, which will push out the old pending + // state and replace it with a new one. We should still count the time + // used processing the old state. + completeStateTransitionInSeconds("distributor:2 storage:3", 3, 3); + + CPPUNIT_ASSERT_EQUAL(uint64_t(8000), lastTransitionTimeInMillis()); +} + } // distributor } // storage diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp index b88a22c8042..b3e2cf36955 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp @@ -28,7 +28,8 @@ BucketDBUpdater::BucketDBUpdater(Distributor& owner, DistributorComponentRegister& compReg) : framework::StatusReporter("bucketdb", "Bucket DB Updater"), _bucketSpaceComponent(owner, bucketSpace, compReg, "Bucket DB Updater"), - _sender(sender) + _sender(sender), + _transitionTimer(_bucketSpaceComponent.getClock()) { } @@ -160,9 +161,29 @@ BucketDBUpdater::removeSuperfluousBuckets( } void +BucketDBUpdater::ensureTransitionTimerStarted() +{ + if (!_pendingClusterState) { + // Don't overwrite start time if we're already processing a state, as + // that will make transition times appear artificially low. + _transitionTimer = framework::MilliSecTimer( + _bucketSpaceComponent.getClock()); + } +} + +void +BucketDBUpdater::completeTransitionTimer() +{ + _bucketSpaceComponent.getDistributor().getMetrics() + .stateTransitionTime.addValue(_transitionTimer.getElapsedTimeAsDouble()); +} + +void BucketDBUpdater::storageDistributionChanged( const lib::Distribution& distribution) { + ensureTransitionTimerStarted(); + removeSuperfluousBuckets(distribution, _bucketSpaceComponent.getClusterState()); @@ -204,6 +225,7 @@ BucketDBUpdater::onSetSystemState( if (state == oldState) { return false; } + ensureTransitionTimerStarted(); removeSuperfluousBuckets( _bucketSpaceComponent.getDistribution(), @@ -524,6 +546,7 @@ BucketDBUpdater::processCompletedPendingClusterState() _pendingClusterState.reset(); _outdatedNodes.clear(); sendAllQueuedBucketRechecks(); + completeTransitionTimer(); } void diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.h b/storage/src/vespa/storage/distributor/bucketdbupdater.h index c87907fe0b2..8f050d87d95 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.h +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.h @@ -170,6 +170,8 @@ private: const BucketDatabase::Entry& e, uint16_t node, BucketListMerger::BucketList& existing) const; + void ensureTransitionTimerStarted(); + void completeTransitionTimer(); /** * Adds all buckets contained in the bucket database * that are either contained @@ -272,6 +274,7 @@ private: DistributorMessageSender& _sender; std::set<EnqueuedBucketRecheck> _enqueuedRechecks; std::unordered_set<uint16_t> _outdatedNodes; + framework::MilliSecTimer _transitionTimer; }; } diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.cpp b/storage/src/vespa/storage/distributor/distributormetricsset.cpp index f842ed771c9..af22ec52c24 100644 --- a/storage/src/vespa/storage/distributor/distributormetricsset.cpp +++ b/storage/src/vespa/storage/distributor/distributormetricsset.cpp @@ -19,15 +19,20 @@ DistributorMetricSet::DistributorMetricSet(const metrics::LoadTypeSet& lt) stats(lt, PersistenceOperationMetricSet("stats"), this), multioperations(lt, PersistenceOperationMetricSet("multioperations"), this), visits(lt, VisitorMetricSet(), this), + stateTransitionTime("state_transition_time", "", + "Time it takes to complete a cluster state transition. If a " + "state transition is preempted before completing, its elapsed " + "time is counted as part of the total time spent for the final, " + "completed state transition", this), recoveryModeTime("recoverymodeschedulingtime", "", - "Time spent scheduling operations in recovery mode " - "after receiving new cluster state", this), + "Time spent scheduling operations in recovery mode " + "after receiving new cluster state", this), docsStored("docsstored", "logdefault yamasdefault", - "Number of documents stored in all buckets controlled by " - "this distributor", this), + "Number of documents stored in all buckets controlled by " + "this distributor", this), bytesStored("bytesstored", "logdefault yamasdefault", - "Number of bytes stored in all buckets controlled by " - "this distributor", this) + "Number of bytes stored in all buckets controlled by " + "this distributor", this) { docsStored.logOnlyIfSet(); bytesStored.logOnlyIfSet(); diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.h b/storage/src/vespa/storage/distributor/distributormetricsset.h index 705e7305e0d..753d9ae0056 100644 --- a/storage/src/vespa/storage/distributor/distributormetricsset.h +++ b/storage/src/vespa/storage/distributor/distributormetricsset.h @@ -21,6 +21,7 @@ public: metrics::LoadMetric<PersistenceOperationMetricSet> stats; metrics::LoadMetric<PersistenceOperationMetricSet> multioperations; metrics::LoadMetric<VisitorMetricSet> visits; + metrics::DoubleAverageMetric stateTransitionTime; metrics::DoubleAverageMetric recoveryModeTime; metrics::LongValueMetric docsStored; metrics::LongValueMetric bytesStored; |