author     Tor Brede Vekterli <vekterli@verizonmedia.com>    2019-04-29 14:57:46 +0000
committer  Tor Brede Vekterli <vekterli@verizonmedia.com>    2019-04-29 15:14:50 +0000
commit     4a2642f2f11526ccc183e331f0644afa89bbf028 (patch)
tree       ba98d1b9650c075d83973601d9a470a59faf9f2f /storage
parent     c9f9f467df69a65fe1a403f58411259e3c0fa823 (diff)
Elide bucket DB pruning scans when possible
Only do a (potentially expensive) pruning pass over the bucket DB when the cluster state transition indicates that one or more nodes are not in the same availability state as in the currently active state. For instance, if a cluster state change takes a content node from Maintenance to Down, no pruning action is required, since the pruning shall already have taken place during the processing of the initial Down edge (and no buckets shall have been created for the node in the meantime).

We do not currently attempt to elide pruning for distribution config changes, as these happen much more rarely than cluster state changes.
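For orientation before reading the diff (this condensed sketch is not part of the patch itself), the check added to removeSuperfluousBuckets() has roughly the following shape; the identifiers mirror the ones introduced below, and the surrounding loop over bucket spaces is assumed:

    // Condensed view of the elision logic added in this commit: pruning is
    // skipped only when the transition was not caused by a distribution config
    // change AND the old and new cluster states agree on every node's
    // effective up/down availability.
    if (!is_distribution_config_change
        && db_pruning_may_be_elided(oldClusterState, *new_cluster_state, up_states))
    {
        continue;  // no buckets can have become superfluous; skip the full DB sweep
    }
    // ...otherwise fall through to the existing NodeRemover pass over the bucket DB.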
Diffstat (limited to 'storage')
-rw-r--r--  storage/src/tests/distributor/CMakeLists.txt                         1
-rw-r--r--  storage/src/tests/distributor/bucket_db_prune_elision_test.cpp     119
-rw-r--r--  storage/src/tests/distributor/bucketdbupdatertest.cpp               43
-rw-r--r--  storage/src/vespa/storage/distributor/CMakeLists.txt                 1
-rw-r--r--  storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp   58
-rw-r--r--  storage/src/vespa/storage/distributor/bucket_db_prune_elision.h     27
-rw-r--r--  storage/src/vespa/storage/distributor/bucketdbupdater.cpp           33
-rw-r--r--  storage/src/vespa/storage/distributor/bucketdbupdater.h              3
8 files changed, 275 insertions, 10 deletions
diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt
index 1e9dca3f7f1..f7a65000f6c 100644
--- a/storage/src/tests/distributor/CMakeLists.txt
+++ b/storage/src/tests/distributor/CMakeLists.txt
@@ -51,6 +51,7 @@ vespa_add_library(storage_gtestdistributor TEST
bucketdbupdatertest.cpp
# Fixture etc. dupes with non-gtest runner :
distributortestutil.cpp
+ bucket_db_prune_elision_test.cpp
messagesenderstub.cpp
DEPENDS
storage_distributor
diff --git a/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp b/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp
new file mode 100644
index 00000000000..4821d05a90c
--- /dev/null
+++ b/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp
@@ -0,0 +1,119 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/distributor/bucket_db_prune_elision.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+
+namespace storage::distributor {
+
+namespace {
+
+lib::ClusterState state_of(const char* str) {
+ return lib::ClusterState(str);
+}
+
+}
+
+TEST(IdempotentStateTransitionTest, state_differing_only_in_version_allows_elision) {
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("version:1 bits:8 distributor:3 storage:3"),
+ state_of("version:2 bits:8 distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, differing_cluster_state_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("cluster:d distributor:3 storage:3"),
+ state_of("distributor:3 storage:3")));
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("cluster:d distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, differing_distribution_bit_count_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("bits:8 distributor:3 storage:3"),
+ state_of("bits:9 distributor:3 storage:3")));
+ // No explicit "bits:" implies 16 bits
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("bits:8 distributor:3 storage:3"),
+ state_of("distributor:3 storage:3")));
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("bits:8 distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, same_implicit_distribution_bit_count_allows_elision) {
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("bits:16 distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, changed_distributor_node_count_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:4 storage:3")));
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:4 storage:3"),
+ state_of("distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, changed_distributor_node_state_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"),
+ state_of("distributor:3 storage:3")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 .0.s:d storage:3")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"),
+ state_of("distributor:3 .0.s:u storage:3")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"),
+ state_of("distributor:3 .1.s:d storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, changed_storage_node_count_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 storage:4")));
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:4"),
+ state_of("distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, changed_storage_node_state_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"),
+ state_of("distributor:3 storage:3")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 storage:3 .0.s:d")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"),
+ state_of("distributor:3 storage:3 .0.s:u")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"),
+ state_of("distributor:3 storage:3 .1.s:d")));
+}
+
+TEST(IdempotentStateTransitionTest, may_elide_for_transition_between_different_effective_storage_down_states) {
+ // Maintenance -> Down edge shall already have pruned DB on Maintenance edge
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:m"),
+ state_of("distributor:3 storage:3 .0.s:d")));
+
+ // Down -> Maintenance edge shall already have pruned DB on Down edge
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"),
+ state_of("distributor:3 storage:3 .0.s:m")));
+}
+
+TEST(IdempotentStateTransitionTest, may_elide_for_transition_between_different_effective_storage_up_states) {
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:i"),
+ state_of("distributor:3 storage:3")));
+
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .1.s:r"),
+ state_of("distributor:3 storage:3")));
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 storage:3 .2.s:r")));
+}
+
+// Changed startup timestamps imply that bucket info should be fetched from a node, but
+// do not imply that pruning is required.
+TEST(IdempotentStateTransitionTest, may_elide_changed_startup_timestamps) {
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 storage:3 .0.t:123456")));
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.t:123456"),
+ state_of("distributor:3 storage:3")));
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.t:123456"),
+ state_of("distributor:3 storage:3 .0.t:654321")));
+}
+
+}
diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp
index 2beae2ac43b..51797e90b3e 100644
--- a/storage/src/tests/distributor/bucketdbupdatertest.cpp
+++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp
@@ -1864,6 +1864,7 @@ TEST_F(BucketDBUpdaterTest, testClusterStateAlwaysSendsFullFetchWhenDistribution
_sender.clear();
std::string distConfig(getDistConfig6Nodes2Groups());
setDistribution(distConfig);
+
sortSentMessagesByIndex(_sender);
ASSERT_EQ(messageCount(6), _sender.commands.size());
// Suddenly, a wild cluster state change appears! Even though this state
@@ -1921,6 +1922,48 @@ TEST_F(BucketDBUpdaterTest, testChangedDistributionConfigTriggersRecoveryMode) {
EXPECT_TRUE(_distributor->isInRecoveryMode());
}
+namespace {
+
+template <typename Func>
+struct FunctorProcessor : BucketDatabase::EntryProcessor {
+ Func _f;
+
+ template <typename F>
+ explicit FunctorProcessor(F&& f) : _f(std::forward<F>(f)) {}
+
+ bool process(const BucketDatabase::Entry& e) override {
+ _f(e);
+ return true;
+ }
+};
+
+template <typename Func>
+std::unique_ptr<BucketDatabase::EntryProcessor> func_processor(Func&& f) {
+ return std::make_unique<FunctorProcessor<Func>>(std::forward<Func>(f));
+}
+
+}
+
+TEST_F(BucketDBUpdaterTest, changed_distribution_config_does_not_elide_bucket_db_pruning) {
+ setDistribution(getDistConfig3Nodes1Group());
+
+ constexpr uint32_t n_buckets = 100;
+ ASSERT_NO_FATAL_FAILURE(
+ setAndEnableClusterState(lib::ClusterState("distributor:6 storage:6"), messageCount(6), n_buckets));
+ _sender.clear();
+
+ // Config implies a different node set than the current cluster state, so it's crucial that
+ // DB pruning is _not_ elided. Yes, this is inherently racing with cluster state changes and
+ // should be changed to be atomic and controlled by the cluster controller instead of config.
+ // But this is where we currently are.
+ setDistribution(getDistConfig6Nodes2Groups());
+
+ getBucketDatabase().forEach(*func_processor([&](const auto& e) {
+ EXPECT_TRUE(getBucketDBUpdater()
+ .checkOwnershipInPendingState(makeDocumentBucket(e.getBucketId())).isOwned());
+ }));
+}
+
TEST_F(BucketDBUpdaterTest, testNewlyAddedBucketsHaveCurrentTimeAsGcTimestamp) {
getClock().setAbsoluteTimeInSeconds(101234);
lib::ClusterState stateBefore("distributor:1 storage:1");
diff --git a/storage/src/vespa/storage/distributor/CMakeLists.txt b/storage/src/vespa/storage/distributor/CMakeLists.txt
index bd82c0e4d6e..8c701033e67 100644
--- a/storage/src/vespa/storage/distributor/CMakeLists.txt
+++ b/storage/src/vespa/storage/distributor/CMakeLists.txt
@@ -4,6 +4,7 @@ vespa_add_library(storage_distributor
activecopy.cpp
blockingoperationstarter.cpp
bucketdbupdater.cpp
+ bucket_db_prune_elision.cpp
bucketgctimecalculator.cpp
bucketlistmerger.cpp
clusterinformation.cpp
diff --git a/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp
new file mode 100644
index 00000000000..78b5b576c16
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp
@@ -0,0 +1,58 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "bucket_db_prune_elision.h"
+#include <vespa/vdslib/state/clusterstate.h>
+
+namespace storage::distributor {
+
+// Returns whether, for nodes of type `node_type`, the transition between two cluster
+// states is idempotent from the perspective of bucket pruning. This is the case iff
+// the effective down/up state of each node is unchanged. I.e. if the only difference
+// between two states is that a storage node goes from state Down to Maintenance (or
+// vice versa), the buckets shall already have been pruned during processing of the
+// first edge, so the subsequent pruning may be elided.
+//
+// If a node's state is not one of `up_states`, it is considered to be in an effective
+// Down state, and vice versa.
+//
+// Precondition: a.getNodeCount(node_type) == b.getNodeCount(node_type)
+bool node_states_are_idempotent_for_pruning(const lib::NodeType& node_type,
+ const lib::ClusterState& a,
+ const lib::ClusterState& b,
+ const char* up_states)
+{
+ const uint16_t node_count = a.getNodeCount(node_type);
+ for (uint16_t i = 0; i < node_count; ++i) {
+ lib::Node node(node_type, i);
+ const auto& a_s = a.getNodeState(node);
+ const auto& b_s = b.getNodeState(node);
+ // Transitioning from one effective Down state to another can elide DB pruning,
+ // as the DB shall already have been pruned of all relevant buckets on the _first_
+ // effective Down edge.
+ // No pruning shall take place on a transition from one effective Up state to another
+ // effective Up state.
+ if (a_s.getState().oneOf(up_states) != b_s.getState().oneOf(up_states)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool db_pruning_may_be_elided(const lib::ClusterState& a, const lib::ClusterState& b, const char* up_states) {
+ if (a.getClusterState() != b.getClusterState()) {
+ return false;
+ }
+ if (a.getDistributionBitCount() != b.getDistributionBitCount()) {
+ return false;
+ }
+ if (a.getNodeCount(lib::NodeType::DISTRIBUTOR) != b.getNodeCount(lib::NodeType::DISTRIBUTOR)) {
+ return false;
+ }
+ if (a.getNodeCount(lib::NodeType::STORAGE) != b.getNodeCount(lib::NodeType::STORAGE)) {
+ return false;
+ }
+ return (node_states_are_idempotent_for_pruning(lib::NodeType::DISTRIBUTOR, a, b, up_states) &&
+ node_states_are_idempotent_for_pruning(lib::NodeType::STORAGE, a, b, up_states));
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h
new file mode 100644
index 00000000000..7b0f554231a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h
@@ -0,0 +1,27 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+namespace storage::lib {
+class ClusterState;
+}
+
+namespace storage::distributor {
+
+/*
+ * Returns whether the state transition from a -> b is idempotent in terms
+ * of buckets needing to be pruned from the distributor's bucket database.
+ *
+ * Examples of when this is the case:
+ * - `a` and `b` differ only in state version number
+ * - Storage node 1 is .s:d in `a`, and .s:m in `b`. Buckets have already
+ * been pruned when `a` was processed.
+ * - Node startup timestamps have been changed. This will trigger bucket
+ * info re-fetches if the distributor observes a higher startup timestamp
+ * than it was previously aware of, but does not require any pruning.
+ */
+bool db_pruning_may_be_elided(const lib::ClusterState& a,
+ const lib::ClusterState& b,
+ const char* up_states = "uri");
+
+}
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
index 08c8cc59933..db2842b70f2 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "bucketdbupdater.h"
+#include "bucket_db_prune_elision.h"
#include "distributor.h"
#include "distributor_bucket_space.h"
#include "simpleclusterinformation.h"
@@ -119,23 +120,37 @@ BucketDBUpdater::recheckBucketInfo(uint32_t nodeIdx,
void
BucketDBUpdater::removeSuperfluousBuckets(
- const lib::ClusterStateBundle& newState)
+ const lib::ClusterStateBundle& newState,
+ bool is_distribution_config_change)
{
const bool move_to_read_only_db = shouldDeferStateEnabling();
- for (auto &elem : _distributorComponent.getBucketSpaceRepo()) {
- const auto &newDistribution(elem.second->getDistribution());
- const auto &oldClusterState(elem.second->getClusterState());
- auto &bucketDb(elem.second->getBucketDatabase());
+ const char* up_states = _distributorComponent.getDistributor().getStorageNodeUpStates();
+ for (auto& elem : _distributorComponent.getBucketSpaceRepo()) {
+ const auto& newDistribution(elem.second->getDistribution());
+ const auto& oldClusterState(elem.second->getClusterState());
+ const auto& new_cluster_state = newState.getDerivedClusterState(elem.first);
+
+ // Running a full DB sweep is expensive, so if the cluster state transition does
+ // not actually indicate that buckets should possibly be removed, we elide it entirely.
+ if (!is_distribution_config_change
+ && db_pruning_may_be_elided(oldClusterState, *new_cluster_state, up_states))
+ {
+ LOG(debug, "Eliding DB pruning for state transition '%s' -> '%s'",
+ oldClusterState.toString().c_str(), new_cluster_state->toString().c_str());
+ continue;
+ }
+
+ auto& bucketDb(elem.second->getBucketDatabase());
auto& readOnlyDb(_distributorComponent.getReadOnlyBucketSpaceRepo().get(elem.first).getBucketDatabase());
// Remove all buckets not belonging to this distributor, or
// being on storage nodes that are no longer up.
NodeRemover proc(
oldClusterState,
- *newState.getDerivedClusterState(elem.first),
+ *new_cluster_state,
_distributorComponent.getIndex(),
newDistribution,
- _distributorComponent.getDistributor().getStorageNodeUpStates());
+ up_states);
bucketDb.forEach(proc);
for (const auto & bucket : proc.getBucketsToRemove()) {
@@ -184,7 +199,7 @@ BucketDBUpdater::storageDistributionChanged()
{
ensureTransitionTimerStarted();
- removeSuperfluousBuckets(_distributorComponent.getClusterStateBundle());
+ removeSuperfluousBuckets(_distributorComponent.getClusterStateBundle(), true);
ClusterInformation::CSP clusterInfo(new SimpleClusterInformation(
_distributorComponent.getIndex(),
@@ -237,7 +252,7 @@ BucketDBUpdater::onSetSystemState(
// Separate timer since _transitionTimer might span multiple pending states.
framework::MilliSecTimer process_timer(_distributorComponent.getClock());
- removeSuperfluousBuckets(cmd->getClusterStateBundle());
+ removeSuperfluousBuckets(cmd->getClusterStateBundle(), false);
replyToPreviousPendingClusterStateIfAny();
ClusterInformation::CSP clusterInfo(
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.h b/storage/src/vespa/storage/distributor/bucketdbupdater.h
index 9ff732a09f5..e9877d37e67 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.h
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.h
@@ -163,7 +163,8 @@ private:
void updateState(const lib::ClusterState& oldState, const lib::ClusterState& newState);
- void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState);
+ void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState,
+ bool is_distribution_config_change);
void replyToPreviousPendingClusterStateIfAny();
void replyToActivationWithActualVersion(