summaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/CMakeLists.txt1
-rw-r--r--storage/src/tests/distributor/bucket_db_prune_elision_test.cpp119
-rw-r--r--storage/src/tests/distributor/bucketdbupdatertest.cpp43
-rw-r--r--storage/src/vespa/storage/distributor/CMakeLists.txt1
-rw-r--r--storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp58
-rw-r--r--storage/src/vespa/storage/distributor/bucket_db_prune_elision.h27
-rw-r--r--storage/src/vespa/storage/distributor/bucketdbupdater.cpp33
-rw-r--r--storage/src/vespa/storage/distributor/bucketdbupdater.h3
8 files changed, 275 insertions, 10 deletions
diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt
index 1e9dca3f7f1..f7a65000f6c 100644
--- a/storage/src/tests/distributor/CMakeLists.txt
+++ b/storage/src/tests/distributor/CMakeLists.txt
@@ -51,6 +51,7 @@ vespa_add_library(storage_gtestdistributor TEST
bucketdbupdatertest.cpp
# Fixture etc. dupes with non-gtest runner :
distributortestutil.cpp
+ bucket_db_prune_elision_test.cpp
messagesenderstub.cpp
DEPENDS
storage_distributor
diff --git a/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp b/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp
new file mode 100644
index 00000000000..4821d05a90c
--- /dev/null
+++ b/storage/src/tests/distributor/bucket_db_prune_elision_test.cpp
@@ -0,0 +1,119 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/distributor/bucket_db_prune_elision.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+
+namespace storage::distributor {
+
+namespace {
+
+lib::ClusterState state_of(const char* str) {
+ return lib::ClusterState(str);
+}
+
+}
+
+TEST(IdempotentStateTransitionTest, state_differing_only_in_version_allows_elision) {
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("version:1 bits:8 distributor:3 storage:3"),
+ state_of("version:2 bits:8 distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, differing_cluster_state_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("cluster:d distributor:3 storage:3"),
+ state_of("distributor:3 storage:3")));
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("cluster:d distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, differing_distribution_bit_count_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("bits:8 distributor:3 storage:3"),
+ state_of("bits:9 distributor:3 storage:3")));
+ // No explicit "bits:" implies 16 bits
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("bits:8 distributor:3 storage:3"),
+ state_of("distributor:3 storage:3")));
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("bits:8 distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, same_implicit_distribution_bit_count_allows_elision) {
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("bits:16 distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, changed_distributor_node_count_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:4 storage:3")));
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:4 storage:3"),
+ state_of("distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, changed_distributor_node_state_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"),
+ state_of("distributor:3 storage:3")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 .0.s:d storage:3")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"),
+ state_of("distributor:3 .0.s:u storage:3")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 .0.s:d storage:3"),
+ state_of("distributor:3 .1.s:d storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, changed_storage_node_count_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 storage:4")));
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:4"),
+ state_of("distributor:3 storage:3")));
+}
+
+TEST(IdempotentStateTransitionTest, changed_storage_node_state_disallows_elision) {
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"),
+ state_of("distributor:3 storage:3")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 storage:3 .0.s:d")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"),
+ state_of("distributor:3 storage:3 .0.s:u")));
+
+ EXPECT_FALSE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"),
+ state_of("distributor:3 storage:3 .1.s:d")));
+}
+
+TEST(IdempotentStateTransitionTest, may_elide_for_transition_between_different_effective_storage_down_states) {
+ // Maintenance -> Down edge shall already have pruned DB on Maintenance edge
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:m"),
+ state_of("distributor:3 storage:3 .0.s:d")));
+
+ // Down -> Maintenance edge shall already have pruned DB on Down edge
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:d"),
+ state_of("distributor:3 storage:3 .0.s:m")));
+}
+
+TEST(IdempotentStateTransitionTest, may_elide_for_transition_between_different_effective_storage_up_states) {
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.s:i"),
+ state_of("distributor:3 storage:3")));
+
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .1.s:r"),
+ state_of("distributor:3 storage:3")));
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 storage:3 .2.s:r")));
+}
+
+// Changed startup timestamps imply that bucket info should be fetched from a node, but
+// does not imply that pruning is required.
+TEST(IdempotentStateTransitionTest, may_elide_changed_startup_timestamps) {
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3"),
+ state_of("distributor:3 storage:3 .0.t:123456")));
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.t:123456"),
+ state_of("distributor:3 storage:3")));
+ EXPECT_TRUE(db_pruning_may_be_elided(state_of("distributor:3 storage:3 .0.t:123456"),
+ state_of("distributor:3 storage:3 .0.t:654321")));
+}
+
+}
diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp
index 2beae2ac43b..51797e90b3e 100644
--- a/storage/src/tests/distributor/bucketdbupdatertest.cpp
+++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp
@@ -1864,6 +1864,7 @@ TEST_F(BucketDBUpdaterTest, testClusterStateAlwaysSendsFullFetchWhenDistribution
_sender.clear();
std::string distConfig(getDistConfig6Nodes2Groups());
setDistribution(distConfig);
+
sortSentMessagesByIndex(_sender);
ASSERT_EQ(messageCount(6), _sender.commands.size());
// Suddenly, a wild cluster state change appears! Even though this state
@@ -1921,6 +1922,48 @@ TEST_F(BucketDBUpdaterTest, testChangedDistributionConfigTriggersRecoveryMode) {
EXPECT_TRUE(_distributor->isInRecoveryMode());
}
+namespace {
+
+template <typename Func>
+struct FunctorProcessor : BucketDatabase::EntryProcessor {
+ Func _f;
+
+ template <typename F>
+ explicit FunctorProcessor(F&& f) : _f(std::forward<F>(f)) {}
+
+ bool process(const BucketDatabase::Entry& e) override {
+ _f(e);
+ return true;
+ }
+};
+
+template <typename Func>
+std::unique_ptr<BucketDatabase::EntryProcessor> func_processor(Func&& f) {
+ return std::make_unique<FunctorProcessor<Func>>(std::forward<Func>(f));
+}
+
+}
+
+TEST_F(BucketDBUpdaterTest, changed_distribution_config_does_not_elide_bucket_db_pruning) {
+ setDistribution(getDistConfig3Nodes1Group());
+
+ constexpr uint32_t n_buckets = 100;
+ ASSERT_NO_FATAL_FAILURE(
+ setAndEnableClusterState(lib::ClusterState("distributor:6 storage:6"), messageCount(6), n_buckets));
+ _sender.clear();
+
+ // Config implies a different node set than the current cluster state, so it's crucial that
+ // DB pruning is _not_ elided. Yes, this is inherently racing with cluster state changes and
+ // should be changed to be atomic and controlled by the cluster controller instead of config.
+ // But this is where we currently are.
+ setDistribution(getDistConfig6Nodes2Groups());
+
+ getBucketDatabase().forEach(*func_processor([&](const auto& e) {
+ EXPECT_TRUE(getBucketDBUpdater()
+ .checkOwnershipInPendingState(makeDocumentBucket(e.getBucketId())).isOwned());
+ }));
+}
+
TEST_F(BucketDBUpdaterTest, testNewlyAddedBucketsHaveCurrentTimeAsGcTimestamp) {
getClock().setAbsoluteTimeInSeconds(101234);
lib::ClusterState stateBefore("distributor:1 storage:1");
diff --git a/storage/src/vespa/storage/distributor/CMakeLists.txt b/storage/src/vespa/storage/distributor/CMakeLists.txt
index bd82c0e4d6e..8c701033e67 100644
--- a/storage/src/vespa/storage/distributor/CMakeLists.txt
+++ b/storage/src/vespa/storage/distributor/CMakeLists.txt
@@ -4,6 +4,7 @@ vespa_add_library(storage_distributor
activecopy.cpp
blockingoperationstarter.cpp
bucketdbupdater.cpp
+ bucket_db_prune_elision.cpp
bucketgctimecalculator.cpp
bucketlistmerger.cpp
clusterinformation.cpp
diff --git a/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp
new file mode 100644
index 00000000000..78b5b576c16
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.cpp
@@ -0,0 +1,58 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "bucket_db_prune_elision.h"
+#include <vespa/vdslib/state/clusterstate.h>
+
+namespace storage::distributor {
+
+// Returns whether the set of nodes of type `node_type` across two cluster states
+// are idempotent from the perspective of bucket pruning. This is the case iff
+// the effective down/up state of each node is unchanged. I.e. if the only difference
+// between two states is that a storage node goes from state Down to Maintenance (or
+// vice versa), the buckets shall already have been pruned during processing of the
+// first edge, so the subsequent pruning may be elided.
+//
+// If a node's state is not one of `up_states`, it is considered to be in an effective
+// Down state, and vice versa.
+//
+// Precondition: a.getNodeCount(node_type) == b.getNodeCount(node_type)
+bool node_states_are_idempotent_for_pruning(const lib::NodeType& node_type,
+ const lib::ClusterState& a,
+ const lib::ClusterState& b,
+ const char* up_states)
+{
+ const uint16_t node_count = a.getNodeCount(node_type);
+ for (uint16_t i = 0; i < node_count; ++i) {
+ lib::Node node(node_type, i);
+ const auto& a_s = a.getNodeState(node);
+ const auto& b_s = b.getNodeState(node);
+ // Transitioning from one effective Down state to another can elide DB pruning,
+ // as the DB shall already have been pruned of all relevant buckets on the _first_
+ // effective Down edge.
+ // No pruning shall take place on a transition from one effective Up state to another
+ // effective Up state.
+ if (a_s.getState().oneOf(up_states) != b_s.getState().oneOf(up_states)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool db_pruning_may_be_elided(const lib::ClusterState& a, const lib::ClusterState& b, const char* up_states) {
+ if (a.getClusterState() != b.getClusterState()) {
+ return false;
+ }
+ if (a.getDistributionBitCount() != b.getDistributionBitCount()) {
+ return false;
+ }
+ if (a.getNodeCount(lib::NodeType::DISTRIBUTOR) != b.getNodeCount(lib::NodeType::DISTRIBUTOR)) {
+ return false;
+ }
+ if (a.getNodeCount(lib::NodeType::STORAGE) != b.getNodeCount(lib::NodeType::STORAGE)) {
+ return false;
+ }
+ return (node_states_are_idempotent_for_pruning(lib::NodeType::DISTRIBUTOR, a, b, up_states) &&
+ node_states_are_idempotent_for_pruning(lib::NodeType::STORAGE, a, b, up_states));
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h
new file mode 100644
index 00000000000..7b0f554231a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucket_db_prune_elision.h
@@ -0,0 +1,27 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+namespace storage::lib {
+class ClusterState;
+}
+
+namespace storage::distributor {
+
+/*
+ * Returns whether the state transition from a -> b is idempotent in terms
+ * of buckets needing to be pruned from the distributor's bucket database.
+ *
+ * Examples of when this is the case:
+ * - `a` and `b` differ only in state version number
+ * - Storage node 1 is .s:d in `a`, and .s:m in `b`. Buckets have already
+ * been pruned when `a` was processed.
+ * - Node startup timestamps have been changed. This will trigger bucket
+ * info re-fetches if the distributor observes a higher startup timestamp
+ * than it currently was aware of, but does not need any pruning.
+ */
+bool db_pruning_may_be_elided(const lib::ClusterState& a,
+ const lib::ClusterState& b,
+ const char* up_states = "uri");
+
+}
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
index 08c8cc59933..db2842b70f2 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
@@ -1,6 +1,7 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "bucketdbupdater.h"
+#include "bucket_db_prune_elision.h"
#include "distributor.h"
#include "distributor_bucket_space.h"
#include "simpleclusterinformation.h"
@@ -119,23 +120,37 @@ BucketDBUpdater::recheckBucketInfo(uint32_t nodeIdx,
void
BucketDBUpdater::removeSuperfluousBuckets(
- const lib::ClusterStateBundle& newState)
+ const lib::ClusterStateBundle& newState,
+ bool is_distribution_config_change)
{
const bool move_to_read_only_db = shouldDeferStateEnabling();
- for (auto &elem : _distributorComponent.getBucketSpaceRepo()) {
- const auto &newDistribution(elem.second->getDistribution());
- const auto &oldClusterState(elem.second->getClusterState());
- auto &bucketDb(elem.second->getBucketDatabase());
+ const char* up_states = _distributorComponent.getDistributor().getStorageNodeUpStates();
+ for (auto& elem : _distributorComponent.getBucketSpaceRepo()) {
+ const auto& newDistribution(elem.second->getDistribution());
+ const auto& oldClusterState(elem.second->getClusterState());
+ const auto& new_cluster_state = newState.getDerivedClusterState(elem.first);
+
+ // Running a full DB sweep is expensive, so if the cluster state transition does
+ // not actually indicate that buckets should possibly be removed, we elide it entirely.
+ if (!is_distribution_config_change
+ && db_pruning_may_be_elided(oldClusterState, *new_cluster_state, up_states))
+ {
+ LOG(debug, "Eliding DB pruning for state transition '%s' -> '%s'",
+ oldClusterState.toString().c_str(), new_cluster_state->toString().c_str());
+ continue;
+ }
+
+ auto& bucketDb(elem.second->getBucketDatabase());
auto& readOnlyDb(_distributorComponent.getReadOnlyBucketSpaceRepo().get(elem.first).getBucketDatabase());
// Remove all buckets not belonging to this distributor, or
// being on storage nodes that are no longer up.
NodeRemover proc(
oldClusterState,
- *newState.getDerivedClusterState(elem.first),
+ *new_cluster_state,
_distributorComponent.getIndex(),
newDistribution,
- _distributorComponent.getDistributor().getStorageNodeUpStates());
+ up_states);
bucketDb.forEach(proc);
for (const auto & bucket : proc.getBucketsToRemove()) {
@@ -184,7 +199,7 @@ BucketDBUpdater::storageDistributionChanged()
{
ensureTransitionTimerStarted();
- removeSuperfluousBuckets(_distributorComponent.getClusterStateBundle());
+ removeSuperfluousBuckets(_distributorComponent.getClusterStateBundle(), true);
ClusterInformation::CSP clusterInfo(new SimpleClusterInformation(
_distributorComponent.getIndex(),
@@ -237,7 +252,7 @@ BucketDBUpdater::onSetSystemState(
// Separate timer since _transitionTimer might span multiple pending states.
framework::MilliSecTimer process_timer(_distributorComponent.getClock());
- removeSuperfluousBuckets(cmd->getClusterStateBundle());
+ removeSuperfluousBuckets(cmd->getClusterStateBundle(), false);
replyToPreviousPendingClusterStateIfAny();
ClusterInformation::CSP clusterInfo(
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.h b/storage/src/vespa/storage/distributor/bucketdbupdater.h
index 9ff732a09f5..e9877d37e67 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.h
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.h
@@ -163,7 +163,8 @@ private:
void updateState(const lib::ClusterState& oldState, const lib::ClusterState& newState);
- void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState);
+ void removeSuperfluousBuckets(const lib::ClusterStateBundle& newState,
+ bool is_distribution_config_change);
void replyToPreviousPendingClusterStateIfAny();
void replyToActivationWithActualVersion(