diff options
author:    Tor Brede Vekterli <vekterli@verizonmedia.com>  2021-02-17 15:29:08 +0000
committer: Tor Brede Vekterli <vekterli@verizonmedia.com>  2021-02-17 15:29:08 +0000
commit:    847b722bb05b9e823cd147cbc0b4003447bef62a (patch)
tree:      c1a6d2212819c08e6d577e74210c3db7920822c3 /storage
parent:    2e55f8118174a1e6fe5faa5ca9daf88f4be82461 (diff)
Inhibit activation of replicas out of sync with a replica majority
Adds a configurable max number of groups (default 0) whose replica
activation is inhibited if the replica's bucket info is out of sync
with a majority of other replicas.
Intended to be used for the case where a group comes back up after
transient unavailability and where the nodes are out of sync and
should preferably not be activated until post-merging.
Diffstat (limited to 'storage')
12 files changed, 249 insertions, 21 deletions
diff --git a/storage/src/tests/bucketdb/bucketinfotest.cpp b/storage/src/tests/bucketdb/bucketinfotest.cpp index fe922b5d6bd..c1689014160 100644 --- a/storage/src/tests/bucketdb/bucketinfotest.cpp +++ b/storage/src/tests/bucketdb/bucketinfotest.cpp @@ -182,4 +182,58 @@ TEST(BucketInfoTest, remove_node_can_defer_update_of_trusted_flag) { EXPECT_FALSE(info.getNode(0)->trusted()); } +TEST(BucketInfoTest, no_majority_consistent_bucket_for_too_few_replicas) { + std::vector<uint16_t> order; + BucketInfo info; + // No majority with 0 nodes, for all the obvious reasons. + EXPECT_FALSE(info.majority_consistent_bucket_info().valid()); + // 1 is technically a majority of 1, but it doesn't make sense from the perspective + // of preventing activation of minority replicas. + info.addNode(BucketCopy(0, 0, api::BucketInfo(0x1, 2, 144)), order); + EXPECT_FALSE(info.majority_consistent_bucket_info().valid()); + // Similarly, for 2 out of 2 nodes in sync we have no minority (so no point in reporting), + // and with 1 out of 2 nodes we have no idea which of the nodes to treat as "authoritative". + info.addNode(BucketCopy(0, 1, api::BucketInfo(0x1, 2, 144)), order); + EXPECT_FALSE(info.majority_consistent_bucket_info().valid()); +} + +TEST(BucketInfoTest, majority_consistent_bucket_info_can_be_inferred) { + std::vector<uint16_t> order; + BucketInfo info; + info.addNode(BucketCopy(0, 0, api::BucketInfo(0x1, 2, 144)), order); + info.addNode(BucketCopy(0, 1, api::BucketInfo(0x1, 2, 144)), order); + info.addNode(BucketCopy(0, 2, api::BucketInfo(0x1, 2, 144)), order); + + auto maj_info = info.majority_consistent_bucket_info(); + ASSERT_TRUE(maj_info.valid()); + EXPECT_EQ(maj_info, api::BucketInfo(0x1, 2, 144)); + + // 3 of 4 in sync, still majority. 
+ info.addNode(BucketCopy(0, 3, api::BucketInfo(0x1, 3, 255)), order); + + maj_info = info.majority_consistent_bucket_info(); + ASSERT_TRUE(maj_info.valid()); + EXPECT_EQ(maj_info, api::BucketInfo(0x1, 2, 144)); + + // 3 of 5 in sync, still majority. + info.addNode(BucketCopy(0, 4, api::BucketInfo(0x1, 3, 255)), order); + + maj_info = info.majority_consistent_bucket_info(); + ASSERT_TRUE(maj_info.valid()); + EXPECT_EQ(maj_info, api::BucketInfo(0x1, 2, 144)); + + // 3 of 6 mutually in sync, no majority. + info.addNode(BucketCopy(0, 5, api::BucketInfo(0x1, 3, 255)), order); + + maj_info = info.majority_consistent_bucket_info(); + EXPECT_FALSE(maj_info.valid()); + + // 4 out of 7 in sync; majority. + info.addNode(BucketCopy(0, 6, api::BucketInfo(0x1, 3, 255)), order); + + maj_info = info.majority_consistent_bucket_info(); + ASSERT_TRUE(maj_info.valid()); + EXPECT_EQ(maj_info, api::BucketInfo(0x1, 3, 255)); +} + } // storage::distributor diff --git a/storage/src/tests/distributor/distributortest.cpp b/storage/src/tests/distributor/distributortest.cpp index 928373d516b..c06919a7489 100644 --- a/storage/src/tests/distributor/distributortest.cpp +++ b/storage/src/tests/distributor/distributortest.cpp @@ -208,6 +208,12 @@ struct DistributorTest : Test, DistributorTestUtil { configureDistributor(builder); } + void configure_max_activation_inhibited_out_of_sync_groups(uint32_t n_groups) { + ConfigBuilder builder; + builder.maxActivationInhibitedOutOfSyncGroups = n_groups; + configureDistributor(builder); + } + void configureMaxClusterClockSkew(int seconds); void sendDownClusterStateCommand(); void replyToSingleRequestBucketInfoCommandWith1Bucket(); @@ -1188,6 +1194,17 @@ TEST_F(DistributorTest, prioritize_global_bucket_merges_config_is_propagated_to_ EXPECT_FALSE(getConfig().prioritize_global_bucket_merges()); } +TEST_F(DistributorTest, max_activation_inhibited_out_of_sync_groups_config_is_propagated_to_internal_config) { + createLinks(); + setupDistributor(Redundancy(1), 
NodeCount(1), "distributor:1 storage:1"); + + configure_max_activation_inhibited_out_of_sync_groups(3); + EXPECT_EQ(getConfig().max_activation_inhibited_out_of_sync_groups(), 3); + + configure_max_activation_inhibited_out_of_sync_groups(0); + EXPECT_EQ(getConfig().max_activation_inhibited_out_of_sync_groups(), 0); +} + TEST_F(DistributorTest, wanted_split_bit_count_is_lower_bounded) { createLinks(); setupDistributor(Redundancy(1), NodeCount(1), "distributor:1 storage:1"); diff --git a/storage/src/tests/distributor/statecheckerstest.cpp b/storage/src/tests/distributor/statecheckerstest.cpp index 8f2a77572a4..8970ba09868 100644 --- a/storage/src/tests/distributor/statecheckerstest.cpp +++ b/storage/src/tests/distributor/statecheckerstest.cpp @@ -1109,8 +1109,7 @@ std::string StateCheckersTest::testBucketStatePerGroup( includePriority); } -TEST_F(StateCheckersTest, bucket_state_per_group) { - setupDistributor(6, 20, "distributor:1 storage:12 .2.s:d .4.s:d .7.s:d"); +std::shared_ptr<lib::Distribution> make_3x3_group_config() { vespa::config::content::StorDistributionConfigBuilder config; config.activePerLeafGroup = true; config.redundancy = 6; @@ -1136,8 +1135,12 @@ TEST_F(StateCheckersTest, bucket_state_per_group) { config.group[3].nodes[0].index = 9; config.group[3].nodes[1].index = 10; config.group[3].nodes[2].index = 11; - auto distr = std::make_shared<lib::Distribution>(config); - triggerDistributionChange(std::move(distr)); + return std::make_shared<lib::Distribution>(config); +} + +TEST_F(StateCheckersTest, bucket_state_per_group) { + setupDistributor(6, 20, "distributor:1 storage:12 .2.s:d .4.s:d .7.s:d"); + triggerDistributionChange(make_3x3_group_config()); { DistributorConfiguration::MaintenancePriorities mp; @@ -1184,6 +1187,70 @@ TEST_F(StateCheckersTest, bucket_state_per_group) { true)); } +TEST_F(StateCheckersTest, do_not_activate_replicas_that_are_out_of_sync_with_majority) { + // TODO why this strange distribution... 
+ // groups: [0, 1, 3] [5, 6, 8] [9, 10, 11] + setupDistributor(6, 12, "distributor:1 storage:12 .2.s:d .4.s:d .7.s:d"); + triggerDistributionChange(make_3x3_group_config()); + getConfig().set_max_activation_inhibited_out_of_sync_groups(3); + + // 5 is out of sync with 0 and 9 and will NOT be activated. + EXPECT_EQ("[Setting node 0 as active: first available copy]" + "[Setting node 9 as active: copy is ideal state priority 2]", + testBucketStatePerGroup("0=2/3/4, 5=3/4/5, 9=2/3/4")); + + // We also try the other indices:... + // 0 out of sync, 5 and 9 in sync (one hopes..!) + EXPECT_EQ("[Setting node 5 as active: first available copy]" + "[Setting node 9 as active: copy is ideal state priority 2]", + testBucketStatePerGroup("0=4/5/6, 5=2/3/4, 9=2/3/4")); + + // 9 out of sync, 0 and 5 in sync + EXPECT_EQ("[Setting node 0 as active: first available copy]" + "[Setting node 5 as active: first available copy]", + testBucketStatePerGroup("0=2/3/4, 5=2/3/4, 9=5/3/4")); + + // If there's no majority, we activate everything because there's really nothing + // better we can do. + EXPECT_EQ("[Setting node 0 as active: first available copy]" + "[Setting node 5 as active: first available copy]" + "[Setting node 9 as active: copy is ideal state priority 2]", + testBucketStatePerGroup("0=2/3/4, 5=5/6/7, 9=8/9/10")); + + // However, if a replica is _already_ active, we will not deactivate it. + EXPECT_EQ("[Setting node 0 as active: first available copy]" + "[Setting node 9 as active: copy is ideal state priority 2]", + testBucketStatePerGroup("0=2/3/4, 5=3/4/5/u/a, 9=2/3/4")); +} + +TEST_F(StateCheckersTest, replica_activation_inhibition_can_be_limited_to_max_n_groups) { + // groups: [0, 1, 3] [5, 6, 8] [9, 10, 11] + setupDistributor(6, 12, "distributor:1 storage:12 .2.s:d .4.s:d .7.s:d"); + triggerDistributionChange(make_3x3_group_config()); + getConfig().set_max_activation_inhibited_out_of_sync_groups(1); + + // We count metadata majorities independent of groups. 
Let there be 3 in-sync replicas in + group 0, 1 out of sync in group 1 and 1 out of sync in group 2. Unless we have + mechanisms in place to limit the number of affected groups, both groups 1 and 2 would + be inhibited for activation. Since we limit to 1, only group 1 should be affected. + EXPECT_EQ("[Setting node 1 as active: copy is ideal state priority 4]" + "[Setting node 9 as active: copy is ideal state priority 2]", + testBucketStatePerGroup("0=2/3/4, 1=2/3/4, 3=2/3/4, 5=3/4/5, 9=5/6/7")); +} + +TEST_F(StateCheckersTest, activate_replicas_that_are_out_of_sync_with_majority_if_inhibition_config_disabled) { + // groups: [0, 1, 3] [5, 6, 8] [9, 10, 11] + setupDistributor(6, 12, "distributor:1 storage:12 .2.s:d .4.s:d .7.s:d"); + triggerDistributionChange(make_3x3_group_config()); + getConfig().set_max_activation_inhibited_out_of_sync_groups(0); + + // 5 is out of sync with 0 and 9 but will still be activated since the feature is disabled (max groups is 0). + EXPECT_EQ("[Setting node 0 as active: first available copy]" + "[Setting node 5 as active: first available copy]" + "[Setting node 9 as active: copy is ideal state priority 2]", + testBucketStatePerGroup("0=2/3/4, 5=3/4/5, 9=2/3/4")); +} + TEST_F(StateCheckersTest, allow_activation_of_retired_nodes) { // All nodes in retired state implies that the ideal state is empty. But // we still want to be able to shuffle bucket activations around in order diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.h b/storage/src/vespa/storage/bucketdb/bucketinfo.h index 9616a1fae6e..1fd6c824904 100644 --- a/storage/src/vespa/storage/bucketdb/bucketinfo.h +++ b/storage/src/vespa/storage/bucketdb/bucketinfo.h @@ -106,6 +106,11 @@ public: return _nodes; } + // If there is a valid majority of replicas that have the same metadata + // (checksum and document count), return that bucket info. + // Otherwise, return default-constructed info with valid() == false. 
+ api::BucketInfo majority_consistent_bucket_info() const noexcept; + std::string toString() const; uint32_t getHighestDocumentCount() const; diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.hpp b/storage/src/vespa/storage/bucketdb/bucketinfo.hpp index 562e8d562fb..07316ff0d71 100644 --- a/storage/src/vespa/storage/bucketdb/bucketinfo.hpp +++ b/storage/src/vespa/storage/bucketdb/bucketinfo.hpp @@ -2,6 +2,7 @@ #include "bucketcopy.h" #include <vespa/vespalib/util/arrayref.h> +#include <vespa/vespalib/stllike/hash_map.hpp> #include <iostream> #include <sstream> @@ -65,6 +66,55 @@ bool BucketInfoBase<NodeSeq>::consistentNodes(bool countInvalidAsConsistent) con return true; } +namespace { + +// BucketInfo wrapper which only concerns itself with fields that indicate +// whether replicas are in sync. +struct ReplicaMetadata { + api::BucketInfo info; + + ReplicaMetadata() noexcept = default; + explicit ReplicaMetadata(const api::BucketInfo& info_) noexcept + : info(info_) + {} + bool operator==(const ReplicaMetadata& rhs) const noexcept { + // TODO merge state checker itself only considers checksum, should we do the same...? + return ((info.getChecksum() == rhs.info.getChecksum()) && + (info.getDocumentCount() == rhs.info.getDocumentCount())); + } + struct hash { + size_t operator()(const ReplicaMetadata& rm) const noexcept { + // We assume that just using the checksum is extremely likely to be unique in the table. 
+ return rm.info.getChecksum(); + } + }; +}; + +constexpr bool is_majority(size_t n, size_t m) { + return (n >= (m / 2) + 1); +} + +} + +template <typename NodeSeq> +api::BucketInfo BucketInfoBase<NodeSeq>::majority_consistent_bucket_info() const noexcept { + if (_nodes.size() < 3) { + return {}; + } + vespalib::hash_map<ReplicaMetadata, uint16_t, ReplicaMetadata::hash> meta_tracker; + for (const auto& n : _nodes) { + if (n.valid()) { + meta_tracker[ReplicaMetadata(n.getBucketInfo())]++; + } + } + for (const auto& meta : meta_tracker) { + if (is_majority(meta.second, _nodes.size())) { + return meta.first.info; + } + } + return {}; +} + template <typename NodeSeq> void BucketInfoBase<NodeSeq>::print(std::ostream& out, bool verbose, const std::string& indent) const { if (_nodes.size() == 0) { diff --git a/storage/src/vespa/storage/config/distributorconfiguration.cpp b/storage/src/vespa/storage/config/distributorconfiguration.cpp index a87977bec11..ea963b227e2 100644 --- a/storage/src/vespa/storage/config/distributorconfiguration.cpp +++ b/storage/src/vespa/storage/config/distributorconfiguration.cpp @@ -24,6 +24,7 @@ DistributorConfiguration::DistributorConfiguration(StorageComponent& component) _idealStateChunkSize(1000), _maxNodesPerMerge(16), _max_consecutively_inhibited_maintenance_ticks(20), + _max_activation_inhibited_out_of_sync_groups(0), _lastGarbageCollectionChange(vespalib::duration::zero()), _garbageCollectionInterval(0), _minPendingMaintenanceOps(100), @@ -165,6 +166,7 @@ DistributorConfiguration::configure(const vespa::config::content::core::StorDist _use_weak_internal_read_consistency_for_client_gets = config.useWeakInternalReadConsistencyForClientGets; _enable_metadata_only_fetch_phase_for_inconsistent_updates = config.enableMetadataOnlyFetchPhaseForInconsistentUpdates; _prioritize_global_bucket_merges = config.prioritizeGlobalBucketMerges; + _max_activation_inhibited_out_of_sync_groups = config.maxActivationInhibitedOutOfSyncGroups; _enable_revert = 
config.enableRevert; _minimumReplicaCountingMode = config.minimumReplicaCountingMode; diff --git a/storage/src/vespa/storage/config/distributorconfiguration.h b/storage/src/vespa/storage/config/distributorconfiguration.h index 5ff1a8b3503..9c1456fa9fd 100644 --- a/storage/src/vespa/storage/config/distributorconfiguration.h +++ b/storage/src/vespa/storage/config/distributorconfiguration.h @@ -253,6 +253,14 @@ public: bool prioritize_global_bucket_merges() const noexcept { return _prioritize_global_bucket_merges; } + + void set_max_activation_inhibited_out_of_sync_groups(uint32_t max_groups) noexcept { + _max_activation_inhibited_out_of_sync_groups = max_groups; + } + uint32_t max_activation_inhibited_out_of_sync_groups() const noexcept { + return _max_activation_inhibited_out_of_sync_groups; + } + bool enable_revert() const noexcept { return _enable_revert; } @@ -274,6 +282,7 @@ private: uint32_t _idealStateChunkSize; uint32_t _maxNodesPerMerge; uint32_t _max_consecutively_inhibited_maintenance_ticks; + uint32_t _max_activation_inhibited_out_of_sync_groups; std::string _garbageCollectionSelection; diff --git a/storage/src/vespa/storage/config/stor-distributormanager.def b/storage/src/vespa/storage/config/stor-distributormanager.def index 54f6006895e..6efaf8f8558 100644 --- a/storage/src/vespa/storage/config/stor-distributormanager.def +++ b/storage/src/vespa/storage/config/stor-distributormanager.def @@ -259,3 +259,17 @@ max_consecutively_inhibited_maintenance_ticks int default=20 ## This is a live config for that reason, i.e. it can be disabled in an emergency ## situation if needed. prioritize_global_bucket_merges bool default=true + +## If set, activation of bucket replicas is limited to only those replicas that have +## bucket info consistent with a majority of the other replicas for that bucket. 
+## Multiple active replicas is only a feature that is enabled for grouped clusters, +## and this feature is intended to prevent nodes in stale groups (whose buckets are +## likely to be out of sync) from serving query traffic until they have gotten back +## into sync. +## Up to the given number of groups can have their replica activation inhibited +## by this feature. If zero, the feature is functionally disabled. +## If more groups are out of sync than the configured number N, the inhibited groups +## will be the N first groups present in the distribution config. +## Note: this feature only kicks in if the number of groups in the cluster is greater +## than 1. +max_activation_inhibited_out_of_sync_groups int default=0 diff --git a/storage/src/vespa/storage/distributor/activecopy.cpp b/storage/src/vespa/storage/distributor/activecopy.cpp index 5654e986882..e5bfe489d26 100644 --- a/storage/src/vespa/storage/distributor/activecopy.cpp +++ b/storage/src/vespa/storage/distributor/activecopy.cpp @@ -112,24 +112,17 @@ namespace { } } -#undef DEBUG -#if 0 -#define DEBUG(a) a -#else -#define DEBUG(a) -#endif - ActiveList ActiveCopy::calculate(const std::vector<uint16_t>& idealState, const lib::Distribution& distribution, - BucketDatabase::Entry& e) + BucketDatabase::Entry& e, + uint32_t max_activation_inhibited_out_of_sync_groups) { - DEBUG(std::cerr << "Ideal state is " << idealState << "\n"); std::vector<uint16_t> validNodesWithCopy = buildValidNodeIndexList(e); if (validNodesWithCopy.empty()) { return ActiveList(); } - typedef std::vector<uint16_t> IndexList; + using IndexList = std::vector<uint16_t>; std::vector<IndexList> groups; if (distribution.activePerGroup()) { groups = distribution.splitNodesIntoLeafGroups(std::move(validNodesWithCopy)); @@ -138,11 +131,24 @@ ActiveCopy::calculate(const std::vector<uint16_t>& idealState, } std::vector<ActiveCopy> result; result.reserve(groups.size()); - for (uint32_t i=0; i<groups.size(); ++i) { - std::vector<ActiveCopy> 
entries = buildNodeList(e, groups[i], idealState); - DEBUG(std::cerr << "Finding active for group " << entries << "\n"); + + auto maybe_majority_info = ((max_activation_inhibited_out_of_sync_groups > 0) + ? e->majority_consistent_bucket_info() + : api::BucketInfo()); // Invalid by default + uint32_t inhibited_groups = 0; + for (const auto& group_nodes : groups) { + std::vector<ActiveCopy> entries = buildNodeList(e, group_nodes, idealState); auto best = std::min_element(entries.begin(), entries.end(), ActiveStateOrder()); - DEBUG(std::cerr << "Best copy " << *best << "\n"); + if ((groups.size() > 1) && + (inhibited_groups < max_activation_inhibited_out_of_sync_groups) && + maybe_majority_info.valid()) + { + const auto* candidate = e->getNode(best->_nodeIndex); + if (!candidate->getBucketInfo().equalDocumentInfo(maybe_majority_info) && !candidate->active()) { + ++inhibited_groups; + continue; // Do _not_ add candidate as activation target since it's out of sync with the majority + } + } result.emplace_back(*best); } return ActiveList(std::move(result)); diff --git a/storage/src/vespa/storage/distributor/activecopy.h b/storage/src/vespa/storage/distributor/activecopy.h index d9f83be3748..d22e7072cc7 100644 --- a/storage/src/vespa/storage/distributor/activecopy.h +++ b/storage/src/vespa/storage/distributor/activecopy.h @@ -17,7 +17,9 @@ struct ActiveCopy { friend std::ostream& operator<<(std::ostream& out, const ActiveCopy& e); static ActiveList calculate(const std::vector<uint16_t>& idealState, - const lib::Distribution&, BucketDatabase::Entry&); + const lib::Distribution&, + BucketDatabase::Entry&, + uint32_t max_activation_inhibited_out_of_sync_groups); uint16_t _nodeIndex; uint16_t _ideal; diff --git a/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp index 7a247f6c524..380c4fdf332 100644 --- 
a/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp +++ b/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp @@ -109,7 +109,8 @@ PutOperation::insertDatabaseEntryAndScheduleCreateBucket(const OperationTargetLi BucketDatabase::Entry entry(_bucketSpace.getBucketDatabase().get(lastBucket)); std::vector<uint16_t> idealState( _bucketSpace.get_ideal_service_layer_nodes_bundle(lastBucket).get_available_nodes()); - active = ActiveCopy::calculate(idealState, _bucketSpace.getDistribution(), entry); + active = ActiveCopy::calculate(idealState, _bucketSpace.getDistribution(), entry, + _op_ctx.distributor_config().max_activation_inhibited_out_of_sync_groups()); LOG(debug, "Active copies for bucket %s: %s", entry.getBucketId().toString().c_str(), active.toString().c_str()); for (uint32_t i=0; i<active.size(); ++i) { BucketCopy copy(*entry->getNode(active[i]._nodeIndex)); diff --git a/storage/src/vespa/storage/distributor/statecheckers.cpp b/storage/src/vespa/storage/distributor/statecheckers.cpp index 796a0434ed4..bdd850e1e23 100644 --- a/storage/src/vespa/storage/distributor/statecheckers.cpp +++ b/storage/src/vespa/storage/distributor/statecheckers.cpp @@ -1060,7 +1060,8 @@ BucketStateStateChecker::check(StateChecker::Context& c) } ActiveList activeNodes( - ActiveCopy::calculate(c.idealState, c.distribution, c.entry)); + ActiveCopy::calculate(c.idealState, c.distribution, c.entry, + c.distributorConfig.max_activation_inhibited_out_of_sync_groups())); if (activeNodes.empty()) { return Result::noMaintenanceNeeded(); } |