diff options
Diffstat (limited to 'storage')
8 files changed, 275 insertions, 10 deletions
diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt index 1e9dca3f7f1..f7a65000f6c 100644 --- a/storage/src/tests/distributor/CMakeLists.txt +++ b/storage/src/tests/distributor/CMakeLists.txt @@ -51,6 +51,7 @@ vespa_add_library(storage_gtestdistributor TEST bucketdbupdatertest.cpp # Fixture etc. dupes with non-gtest runner : distributortestutil.cpp + bucket_db_prune_elision_test.cpp messagesenderstub.cpp DEPENDS storage_distributor diff --git a/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp b/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp new file mode 100644 index 00000000000..4821d05a90c --- /dev/null +++ b/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp @@ -0,0 +1,119 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/storage/distributor/bucket_db_prune_elision.h> +#include <vespa/vdslib/state/clusterstate.h> +#include <gtest/gtest.h> + +using namespace ::testing; + +namespace storage::distributor { + +namespace { + +lib::ClusterState state_of(const char* str) { + return lib::ClusterState(str); +} + +} + +TEST(IdempotentStateTransitionTest, state_differing_only_in_version_allows_elision) { + EXPECT_TRUE(db_pruning_may_be_elided(state_of("version:1 bits:8 distributor:3 storage:3"), + state_of("version:2 bits:8 distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, differing_cluster_state_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("cluster:d distributor:3 storage:3"), + state_of("distributor:3 storage:3"))); + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("cluster:d distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, differing_distribution_bit_count_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("bits:8 distributor:3 storage:3"), + state_of("bits:9 distributor:3 storage:3"))); + // No explicit "bits:" implies 16 bits + EXPECT_FALSE(db_pruning_may_be_elided(state_of("bits:8 distributor:3 storage:3"), + state_of("distributor:3 storage:3"))); + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("bits:8 distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, same_implicit_distribution_bit_count_allows_elision) { + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("bits:16 distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, changed_distributor_node_count_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:4 storage:3"))); + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:4 storage:3"), + state_of("distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, changed_distributor_node_state_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"), + state_of("distributor:3 storage:3"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 .0.s:d storage:3"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"), + state_of("distributor:3 .0.s:u storage:3"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"), + state_of("distributor:3 .1.s:d storage:3"))); +} + +TEST(IdempotentStateTransitionTest, changed_storage_node_count_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 storage:4"))); + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:4"), + state_of("distributor:3 storage:3"))); +} + +TEST(IdempotentStateTransitionTest, changed_storage_node_state_disallows_elision) { + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"), + state_of("distributor:3 storage:3"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 storage:3 .0.s:d"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"), + state_of("distributor:3 storage:3 .0.s:u"))); + + EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"), + state_of("distributor:3 storage:3 .1.s:d"))); +} + +TEST(IdempotentStateTransitionTest, may_elide_for_transition_between_different_effective_storage_down_states) { + // Maintenance -> Down edge shall already have pruned DB on Maintenance edge + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:m"), + state_of("distributor:3 storage:3 .0.s:d"))); + + // Down -> Maintenance edge shall already have pruned DB on Down edge + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"), + state_of("distributor:3 storage:3 .0.s:m"))); +} + +TEST(IdempotentStateTransitionTest, may_elide_for_transition_between_different_effective_storage_up_states) { + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:i"), + state_of("distributor:3 storage:3"))); + + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .1.s:r"), + state_of("distributor:3 storage:3"))); + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 storage:3 .2.s:r"))); +} + +// Changed startup timestamps imply that bucket info should be fetched from a node, but +// does not imply that pruning is required. +TEST(IdempotentStateTransitionTest, may_elide_changed_startup_timestamps) { + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"), + state_of("distributor:3 storage:3 .0.t:123456"))); + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.t:123456"), + state_of("distributor:3 storage:3"))); + EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.t:123456"), + state_of("distributor:3 storage:3 .0.t:654321"))); +} + +} diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp index 2beae2ac43b..51797e90b3e 100644 --- a/storage/src/tests/distributor/bucketdbupdatertest.cpp +++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp @@ -1864,6 +1864,7 @@ TEST_F(BucketDBUpdaterTest, testClusterStateAlwaysSendsFullFetchWhenDistribution _sender.clear(); std::string distConfig(getDistConfig6Nodes2Groups()); setDistribution(distConfig); + sortSentMessagesByIndex(_sender); ASSERT_EQ(messageCount(6), _sender.commands.size()); // Suddenly, a wild cluster state change appears! Even though this state @@ -1921,6 +1922,48 @@ TEST_F(BucketDBUpdaterTest, testChangedDistributionConfigTriggersRecoveryMode) { EXPECT_TRUE(_distributor->isInRecoveryMode()); } +namespace { + +template <typename Func> +struct FunctorProcessor : BucketDatabase::EntryProcessor { + Func _f; + + template <typename F> + explicit FunctorProcessor(F&& f) : _f(std::forward<F>(f)) {} + + bool process(const BucketDatabase::Entry& e) override { + _f(e); + return true; + } +}; + +template <typename Func> +std::unique_ptr<BucketDatabase::EntryProcessor> func_processor(Func&& f) { + return std::make_unique<FunctorProcessor<Func>>(std::forward<Func>(f)); +} + +} + +TEST_F(BucketDBUpdaterTest, changed_distribution_config_does_not_elide_bucket_db_pruning) { + setDistribution(getDistConfig3Nodes1Group()); + + constexpr uint32_t n_buckets = 100; + ASSERT_NO_FATAL_FAILURE( + setAndEnableClusterState(lib::ClusterState("distributor:6 storage:6"), messageCount(6), n_buckets)); + _sender.clear(); + + // Config implies a different node set than the current cluster state, so it's crucial that + // DB pruning is _not_ elided. Yes, this is inherently racing with cluster state changes and + // should be changed to be atomic and controlled by the cluster controller instead of config. + // But this is where we currently are. + setDistribution(getDistConfig6Nodes2Groups()); + + getBucketDatabase().forEach(*func_processor([&](const auto& e) { + EXPECT_TRUE(getBucketDBUpdater() + .checkOwnershipInPendingState(makeDocumentBucket(e.getBucketId())).isOwned()); + })); +} + TEST_F(BucketDBUpdaterTest, testNewlyAddedBucketsHaveCurrentTimeAsGcTimestamp) { getClock().setAbsoluteTimeInSeconds(101234); lib::ClusterState stateBefore("distributor:1 storage:1"); diff --git a/storage/src/vespa/storage/distributor/CMakeLists.txt b/storage/src/vespa/storage/distributor/CMakeLists.txt index bd82c0e4d6e..8c701033e67 100644 --- a/storage/src/vespa/storage/distributor/CMakeLists.txt +++ b/storage/src/vespa/storage/distributor/CMakeLists.txt @@ -4,6 +4,7 @@ vespa_add_library(storage_distributor activecopy.cpp blockingoperationstarter.cpp bucketdbupdater.cpp + bucket_db_prune_elision.cpp bucketgctimecalculator.cpp bucketlistmerger.cpp clusterinformation.cpp diff --git a/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp new file mode 100644 index 00000000000..78b5b576c16 --- /dev/null +++ b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp @@ -0,0 +1,58 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "bucket_db_prune_elision.h" +#include <vespa/vdslib/state/clusterstate.h> + +namespace storage::distributor { + +// Returns whether the set of nodes of type `node_type` across two cluster states +// are idempotent from the perspective of bucket pruning. This is the case iff +// the effective down/up state of each node is unchanged. I.e. if the only difference +// between two states is that a storage node goes from state Down to Maintenance (or +// vice versa), the buckets shall already have been pruned during processing of the +// first edge, so the subsequent pruning may be elided. +// +// If a node's state is not one of `up_states`, it is considered to be in an effective +// Down state, and vice versa. +// +// Precondition: a.getNodeCount(node_type) == b.getNodeCount(node_type) +bool node_states_are_idempotent_for_pruning(const lib::NodeType& node_type, + const lib::ClusterState& a, + const lib::ClusterState& b, + const char* up_states) +{ + const uint16_t node_count = a.getNodeCount(node_type); + for (uint16_t i = 0; i < node_count; ++i) { + lib::Node node(node_type, i); + const auto& a_s = a.getNodeState(node); + const auto& b_s = b.getNodeState(node); + // Transitioning from one effective Down state to another can elide DB pruning, + // as the DB shall already have been pruned of all relevant buckets on the _first_ + // effective Down edge. + // No pruning shall take place on a transition from one effective Up state to another + // effective Up state. + if (a_s.getState().oneOf(up_states) != b_s.getState().oneOf(up_states)) { + return false; + } + } + return true; +} + +bool db_pruning_may_be_elided(const lib::ClusterState& a, const lib::ClusterState& b, const char* up_states) { + if (a.getClusterState() != b.getClusterState()) { + return false; + } + if (a.getDistributionBitCount() != b.getDistributionBitCount()) { + return false; + } + if (a.getNodeCount(lib::NodeType::DISTRIBUTOR) != b.getNodeCount(lib::NodeType::DISTRIBUTOR)) { + return false; + } + if (a.getNodeCount(lib::NodeType::STORAGE) != b.getNodeCount(lib::NodeType::STORAGE)) { + return false; + } + return (node_states_are_idempotent_for_pruning(lib::NodeType::DISTRIBUTOR, a, b, up_states) && + node_states_are_idempotent_for_pruning(lib::NodeType::STORAGE, a, b, up_states)); +} + +} diff --git a/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h new file mode 100644 index 00000000000..7b0f554231a --- /dev/null +++ b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h @@ -0,0 +1,27 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace storage::lib { +class ClusterState; +} + +namespace storage::distributor { + +/* + * Returns whether the state transition from a -> b is idempotent in terms + * of buckets needing to be pruned from the distributor's bucket database. + * + * Examples of when this is the case: + * - `a` and `b` differ only in state version number + * - Storage node 1 is .s:d in `a`, and .s:m in `b`. Buckets have already + * been pruned when `a` was processed. + * - Node startup timestamps have been changed. This will trigger bucket + * info re-fetches if the distributor observes a higher startup timestamp + * than it currently was aware of, but does not need any pruning. + */ +bool db_pruning_may_be_elided(const lib::ClusterState& a, + const lib::ClusterState& b, + const char* up_states = "uri"); + +} diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp index 08c8cc59933..db2842b70f2 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "bucketdbupdater.h" +#include "bucket_db_prune_elision.h" #include "distributor.h" #include "distributor_bucket_space.h" #include "simpleclusterinformation.h" @@ -119,23 +120,37 @@ BucketDBUpdater::recheckBucketInfo(uint32_t nodeIdx, void BucketDBUpdater::removeSuperfluousBuckets( - const lib::ClusterStateBundle& newState) + const lib::ClusterStateBundle& newState, + bool is_distribution_config_change) { const bool move_to_read_only_db = shouldDeferStateEnabling(); - for (auto &elem : _distributorComponent.getBucketSpaceRepo()) { - const auto &newDistribution(elem.second->getDistribution()); - const auto &oldClusterState(elem.second->getClusterState()); - auto &bucketDb(elem.second->getBucketDatabase()); + const char* up_states = _distributorComponent.getDistributor().getStorageNodeUpStates(); + for (auto& elem : _distributorComponent.getBucketSpaceRepo()) { + const auto& newDistribution(elem.second->getDistribution()); + const auto& oldClusterState(elem.second->getClusterState()); + const auto& new_cluster_state = newState.getDerivedClusterState(elem.first); + + // Running a full DB sweep is expensive, so if the cluster state transition does + // not actually indicate that buckets should possibly be removed, we elide it entirely. + if (!is_distribution_config_change + && db_pruning_may_be_elided(oldClusterState, *new_cluster_state, up_states)) + { + LOG(debug, "Eliding DB pruning for state transition '%s' -> '%s'", + oldClusterState.toString().c_str(), new_cluster_state->toString().c_str()); + continue; + } + + auto& bucketDb(elem.second->getBucketDatabase()); auto& readOnlyDb(_distributorComponent.getReadOnlyBucketSpaceRepo().get(elem.first).getBucketDatabase()); // Remove all buckets not belonging to this distributor, or // being on storage nodes that are no longer up. NodeRemover proc( oldClusterState, - *newState.getDerivedClusterState(elem.first), + *new_cluster_state, _distributorComponent.getIndex(), newDistribution, - _distributorComponent.getDistributor().getStorageNodeUpStates()); + up_states); bucketDb.forEach(proc); for (const auto & bucket : proc.getBucketsToRemove()) { @@ -184,7 +199,7 @@ BucketDBUpdater::storageDistributionChanged() { ensureTransitionTimerStarted(); - removeSuperfluousBuckets(_distributorComponent.getClusterStateBundle()); + removeSuperfluousBuckets(_distributorComponent.getClusterStateBundle(), true); ClusterInformation::CSP clusterInfo(new SimpleClusterInformation( _distributorComponent.getIndex(), @@ -237,7 +252,7 @@ BucketDBUpdater::onSetSystemState( // Separate timer since _transitionTimer might span multiple pending states. framework::MilliSecTimer process_timer(_distributorComponent.getClock()); - removeSuperfluousBuckets(cmd->getClusterStateBundle()); + removeSuperfluousBuckets(cmd->getClusterStateBundle(), false); replyToPreviousPendingClusterStateIfAny(); ClusterInformation::CSP clusterInfo( diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.h b/storage/src/vespa/storage/distributor/bucketdbupdater.h index 9ff732a09f5..e9877d37e67 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.h +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.h @@ -163,7 +163,8 @@ private: void updateState(const lib::ClusterState& oldState, const lib::ClusterState& newState); - void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState); + void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState, + bool is_distribution_config_change); void replyToPreviousPendingClusterStateIfAny(); void replyToActivationWithActualVersion( |