From 4a2642f2f11526ccc183e331f0644afa89bbf028 Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Mon, 29 Apr 2019 14:57:46 +0000 Subject: Elide bucket DB pruning scans when possible Only do a (potentially expensive) pruning pass over the bucket DB when the cluster state transition indicates that one or more nodes are not in the same availability-state as the currently active state. For instance, if a cluster state change is for a content node going from Maintenance to Down, no pruning action is required since that shall already have taken place during the processing of the initial Down edge (and no buckets shall have been created for it in the meantime). We do not currently attempt to elide distribution config changes, as these happen much more rarely than cluster state changes. --- storage/src/tests/distributor/CMakeLists.txt | 1 + .../distributor/bucket_db_prune_elision_test.cpp | 119 +++++++++++++++++++++ .../src/tests/distributor/bucketdbupdatertest.cpp | 43 ++++++++ .../src/vespa/storage/distributor/CMakeLists.txt | 1 + .../distributor/bucket_db_prune_elision.cpp | 58 ++++++++++ .../storage/distributor/bucket_db_prune_elision.h | 27 +++++ .../vespa/storage/distributor/bucketdbupdater.cpp | 33 ++++-- .../vespa/storage/distributor/bucketdbupdater.h | 3 +- 8 files changed, 275 insertions(+), 10 deletions(-) create mode 100644 storage/src/tests/distributor/bucket_db_prune_elision_test.cpp create mode 100644 storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp create mode 100644 storage/src/vespa/storage/distributor/bucket_db_prune_elision.h (limited to 'storage') diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt index 1e9dca3f7f1..f7a65000f6c 100644 --- a/storage/src/tests/distributor/CMakeLists.txt +++ b/storage/src/tests/distributor/CMakeLists.txt @@ -51,6 +51,7 @@ vespa_add_library(storage_gtestdistributor TEST bucketdbupdatertest.cpp # Fixture etc. dupes with non-gtest runner : distributortestutil.cpp + bucket_db_prune_elision_test.cpp messagesenderstub.cpp DEPENDS storage_distributor diff --git a/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp b/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp new file mode 100644 index 00000000000..4821d05a90c --- /dev/null +++ b/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp @@ -0,0 +1,119 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include +#include +#include + +using namespace ::testing; + +namespace storage::distributor { + +namespace { + +lib::ClusterState state_of(const char* str) { + return lib::ClusterState(str); +} + +} + +TEST(IdempotentStateTransitionTest, state_differing_only_in_version_allows_elision) { + EXPECT_TRUE(db_pruning_may_be_elided(state_of("version:1 bits:8 distributor:3 storage:3"), + state_of("version:2 bits:8 distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, differing_cluster_state_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("cluster:d distributor:3 storage:3"), + state_of("distributor:3 storage:3"))); + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("cluster:d distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, differing_distribution_bit_count_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("bits:8 distributor:3 storage:3"), + state_of("bits:9 distributor:3 storage:3"))); + // No explicit "bits:" implies 16 bits + EXPECT_FALSE(db_pruning_may_be_elided(state_of("bits:8 distributor:3 storage:3"), + state_of("distributor:3 storage:3"))); + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("bits:8 distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, same_implicit_distribution_bit_count_allows_elision) { + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("bits:16 distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, changed_distributor_node_count_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:4 storage:3"))); + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:4 storage:3"), + state_of("distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, changed_distributor_node_state_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"), + state_of("distributor:3 storage:3"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 .0.s:d storage:3"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"), + state_of("distributor:3 .0.s:u storage:3"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"), + state_of("distributor:3 .1.s:d storage:3"))); +} + +TEST(IdempotentStateTransitionTest, changed_storage_node_count_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 storage:4"))); + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:4"), + state_of("distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, changed_storage_node_state_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"), + state_of("distributor:3 storage:3"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 storage:3 .0.s:d"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"), + state_of("distributor:3 storage:3 .0.s:u"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"), + state_of("distributor:3 storage:3 .1.s:d"))); +} + +TEST(IdempotentStateTransitionTest, may_elide_for_transition_between_different_effective_storage_down_states) { + // Maintenance -> Down edge shall already have pruned DB on Maintenance edge + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:m"), + state_of("distributor:3 storage:3 .0.s:d"))); + + // Down -> Maintenance edge shall already have pruned DB on Down edge + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"), + state_of("distributor:3 storage:3 .0.s:m"))); +} + +TEST(IdempotentStateTransitionTest, may_elide_for_transition_between_different_effective_storage_up_states) { + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:i"), + state_of("distributor:3 storage:3"))); + + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .1.s:r"), + state_of("distributor:3 storage:3"))); + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 storage:3 .2.s:r"))); +} + +// Changed startup timestamps imply that bucket info should be fetched from a node, but +// does not imply that pruning is required. +TEST(IdempotentStateTransitionTest, may_elide_changed_startup_timestamps) { + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 storage:3 .0.t:123456"))); + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.t:123456"), + state_of("distributor:3 storage:3"))); + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.t:123456"), + state_of("distributor:3 storage:3 .0.t:654321"))); +} + +} diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp index 2beae2ac43b..51797e90b3e 100644 --- a/storage/src/tests/distributor/bucketdbupdatertest.cpp +++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp @@ -1864,6 +1864,7 @@ TEST_F(BucketDBUpdaterTest, testClusterStateAlwaysSendsFullFetchWhenDistribution _sender.clear(); std::string distConfig(getDistConfig6Nodes2Groups()); setDistribution(distConfig); + sortSentMessagesByIndex(_sender); ASSERT_EQ(messageCount(6), _sender.commands.size()); // Suddenly, a wild cluster state change appears! Even though this state @@ -1921,6 +1922,48 @@ TEST_F(BucketDBUpdaterTest, testChangedDistributionConfigTriggersRecoveryMode) { EXPECT_TRUE(_distributor->isInRecoveryMode()); } +namespace { + +template +struct FunctorProcessor : BucketDatabase::EntryProcessor { + Func _f; + + template + explicit FunctorProcessor(F&& f) : _f(std::forward(f)) {} + + bool process(const BucketDatabase::Entry& e) override { + _f(e); + return true; + } +}; + +template +std::unique_ptr func_processor(Func&& f) { + return std::make_unique>(std::forward(f)); +} + +} + +TEST_F(BucketDBUpdaterTest, changed_distribution_config_does_not_elide_bucket_db_pruning) { + setDistribution(getDistConfig3Nodes1Group()); + + constexpr uint32_t n_buckets = 100; + ASSERT_NO_FATAL_FAILURE( + setAndEnableClusterState(lib::ClusterState("distributor:6 storage:6"), messageCount(6), n_buckets)); + _sender.clear(); + + // Config implies a different node set than the current cluster state, so it's crucial that + // DB pruning is _not_ elided. Yes, this is inherently racing with cluster state changes and + // should be changed to be atomic and controlled by the cluster controller instead of config. + // But this is where we currently are. + setDistribution(getDistConfig6Nodes2Groups()); + + getBucketDatabase().forEach(*func_processor([&](const auto& e) { + EXPECT_TRUE(getBucketDBUpdater() + .checkOwnershipInPendingState(makeDocumentBucket(e.getBucketId())).isOwned()); + })); +} + TEST_F(BucketDBUpdaterTest, testNewlyAddedBucketsHaveCurrentTimeAsGcTimestamp) { getClock().setAbsoluteTimeInSeconds(101234); lib::ClusterState stateBefore("distributor:1 storage:1"); diff --git a/storage/src/vespa/storage/distributor/CMakeLists.txt b/storage/src/vespa/storage/distributor/CMakeLists.txt index bd82c0e4d6e..8c701033e67 100644 --- a/storage/src/vespa/storage/distributor/CMakeLists.txt +++ b/storage/src/vespa/storage/distributor/CMakeLists.txt @@ -4,6 +4,7 @@ vespa_add_library(storage_distributor activecopy.cpp blockingoperationstarter.cpp bucketdbupdater.cpp + bucket_db_prune_elision.cpp bucketgctimecalculator.cpp bucketlistmerger.cpp clusterinformation.cpp diff --git a/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp new file mode 100644 index 00000000000..78b5b576c16 --- /dev/null +++ b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp @@ -0,0 +1,58 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "bucket_db_prune_elision.h" +#include + +namespace storage::distributor { + +// Returns whether the set of nodes of type `node_type` across two cluster states +// are idempotent from the perspective of bucket pruning. This is the case iff +// the effective down/up state of each node is unchanged. I.e. if the only difference +// between two states is that a storage node goes from state Down to Maintenance (or +// vice versa), the buckets shall already have been pruned during processing of the +// first edge, so the subsequent pruning may be elided. +// +// If a node's state is not one of `up_states`, it is considered to be in an effective +// Down state, and vice versa. +// +// Precondition: a.getNodeCount(node_type) == b.getNodeCount(node_type) +bool node_states_are_idempotent_for_pruning(const lib::NodeType& node_type, + const lib::ClusterState& a, + const lib::ClusterState& b, + const char* up_states) +{ + const uint16_t node_count = a.getNodeCount(node_type); + for (uint16_t i = 0; i < node_count; ++i) { + lib::Node node(node_type, i); + const auto& a_s = a.getNodeState(node); + const auto& b_s = b.getNodeState(node); + // Transitioning from one effective Down state to another can elide DB pruning, + // as the DB shall already have been pruned of all relevant buckets on the _first_ + // effective Down edge. + // No pruning shall take place on a transition from one effective Up state to another + // effective Up state. + if (a_s.getState().oneOf(up_states) != b_s.getState().oneOf(up_states)) { + return false; + } + } + return true; +} + +bool db_pruning_may_be_elided(const lib::ClusterState& a, const lib::ClusterState& b, const char* up_states) { + if (a.getClusterState() != b.getClusterState()) { + return false; + } + if (a.getDistributionBitCount() != b.getDistributionBitCount()) { + return false; + } + if (a.getNodeCount(lib::NodeType::DISTRIBUTOR) != b.getNodeCount(lib::NodeType::DISTRIBUTOR)) { + return false; + } + if (a.getNodeCount(lib::NodeType::STORAGE) != b.getNodeCount(lib::NodeType::STORAGE)) { + return false; + } + return (node_states_are_idempotent_for_pruning(lib::NodeType::DISTRIBUTOR, a, b, up_states) && + node_states_are_idempotent_for_pruning(lib::NodeType::STORAGE, a, b, up_states)); +} + +} diff --git a/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h new file mode 100644 index 00000000000..7b0f554231a --- /dev/null +++ b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h @@ -0,0 +1,27 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace storage::lib { +class ClusterState; +} + +namespace storage::distributor { + +/* + * Returns whether the state transition from a -> b is idempotent in terms + * of buckets needing to be pruned from the distributor's bucket database. + * + * Examples of when this is the case: + * - `a` and `b` differ only in state version number + * - Storage node 1 is .s:d in `a`, and .s:m in `b`. Buckets have already + * been pruned when `a` was processed. + * - Node startup timestamps have been changed. This will trigger bucket + * info re-fetches if the distributor observes a higher startup timestamp + * than it currently was aware of, but does not need any pruning. + */ +bool db_pruning_may_be_elided(const lib::ClusterState& a, + const lib::ClusterState& b, + const char* up_states = "uri"); + +} diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp index 08c8cc59933..db2842b70f2 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "bucketdbupdater.h" +#include "bucket_db_prune_elision.h" #include "distributor.h" #include "distributor_bucket_space.h" #include "simpleclusterinformation.h" @@ -119,23 +120,37 @@ BucketDBUpdater::recheckBucketInfo(uint32_t nodeIdx, void BucketDBUpdater::removeSuperfluousBuckets( - const lib::ClusterStateBundle& newState) + const lib::ClusterStateBundle& newState, + bool is_distribution_config_change) { const bool move_to_read_only_db = shouldDeferStateEnabling(); - for (auto &elem : _distributorComponent.getBucketSpaceRepo()) { - const auto &newDistribution(elem.second->getDistribution()); - const auto &oldClusterState(elem.second->getClusterState()); - auto &bucketDb(elem.second->getBucketDatabase()); + const char* up_states = _distributorComponent.getDistributor().getStorageNodeUpStates(); + for (auto& elem : _distributorComponent.getBucketSpaceRepo()) { + const auto& newDistribution(elem.second->getDistribution()); + const auto& oldClusterState(elem.second->getClusterState()); + const auto& new_cluster_state = newState.getDerivedClusterState(elem.first); + + // Running a full DB sweep is expensive, so if the cluster state transition does + // not actually indicate that buckets should possibly be removed, we elide it entirely. + if (!is_distribution_config_change + && db_pruning_may_be_elided(oldClusterState, *new_cluster_state, up_states)) + { + LOG(debug, "Eliding DB pruning for state transition '%s' -> '%s'", + oldClusterState.toString().c_str(), new_cluster_state->toString().c_str()); + continue; + } + + auto& bucketDb(elem.second->getBucketDatabase()); auto& readOnlyDb(_distributorComponent.getReadOnlyBucketSpaceRepo().get(elem.first).getBucketDatabase()); // Remove all buckets not belonging to this distributor, or // being on storage nodes that are no longer up. NodeRemover proc( oldClusterState, - *newState.getDerivedClusterState(elem.first), + *new_cluster_state, _distributorComponent.getIndex(), newDistribution, - _distributorComponent.getDistributor().getStorageNodeUpStates()); + up_states); bucketDb.forEach(proc); for (const auto & bucket : proc.getBucketsToRemove()) { @@ -184,7 +199,7 @@ BucketDBUpdater::storageDistributionChanged() { ensureTransitionTimerStarted(); - removeSuperfluousBuckets(_distributorComponent.getClusterStateBundle()); + removeSuperfluousBuckets(_distributorComponent.getClusterStateBundle(), true); ClusterInformation::CSP clusterInfo(new SimpleClusterInformation( _distributorComponent.getIndex(), @@ -237,7 +252,7 @@ BucketDBUpdater::onSetSystemState( // Separate timer since _transitionTimer might span multiple pending states. framework::MilliSecTimer process_timer(_distributorComponent.getClock()); - removeSuperfluousBuckets(cmd->getClusterStateBundle()); + removeSuperfluousBuckets(cmd->getClusterStateBundle(), false); replyToPreviousPendingClusterStateIfAny(); ClusterInformation::CSP clusterInfo( diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.h b/storage/src/vespa/storage/distributor/bucketdbupdater.h index 9ff732a09f5..e9877d37e67 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.h +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.h @@ -163,7 +163,8 @@ private: void updateState(const lib::ClusterState& oldState, const lib::ClusterState& newState); - void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState); + void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState, + bool is_distribution_config_change); void replyToPreviousPendingClusterStateIfAny(); void replyToActivationWithActualVersion( -- cgit v1.2.3