diff options
author | Tor Brede Vekterli <vekterli@vespa.ai> | 2024-05-22 12:58:45 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@vespa.ai> | 2024-05-22 13:42:45 +0000 |
commit | 6112b790f867fa464bb03045142a041a18974124 (patch) | |
tree | e8632409b38519bc81b48d259a3f44507370b0b5 /storage | |
parent | 0c63bb70d57d05444f3e899fbaa93d51057f0699 (diff) |
Make replica selection for Puts and bucket activation symmetric
The legacy Put replica selection behavior may route new versions
of a document to replicas that are not considered optimal for
activation. This is not normally an issue, but can manifest itself
as missing coverage when the system is in flux with replicas
moving away from Retired nodes containing ready replicas, as the
existing replicas on the Retired node would be preferred for
activation (and thus be used for searches) but incoming Puts would
instead be sent to non-retired nodes due to being in the ideal state.
The new replica ordering (and transitively, selection) behavior is
identical between Puts and activation. This should help ensure
that new versions of the document is routed to replica(s) that
are most likely to be visible as part of searches.
New selection behavior for Puts is config-gated and defaults to
the legacy behavior.
This also subtly changes the fallback ordering criteria for replica
activation to consider the replica's existing DB _entry_ order
instead of its node index. Since DB entries are always ordered by
their ideal state order (with Retired nodes included), this will
evenly distribute fallback activations rather than skewing them
towards lower indexes. It is not expected that this has any negative
effects in practice, and is therefore _not_ a config-gated change.
Diffstat (limited to 'storage')
8 files changed, 150 insertions, 31 deletions
diff --git a/storage/src/tests/distributor/operationtargetresolvertest.cpp b/storage/src/tests/distributor/operationtargetresolvertest.cpp index f0f8a4359fd..171dc5a42c0 100644 --- a/storage/src/tests/distributor/operationtargetresolvertest.cpp +++ b/storage/src/tests/distributor/operationtargetresolvertest.cpp @@ -27,8 +27,7 @@ struct OperationTargetResolverTest : Test, DistributorStripeTestUtil { const document::DocumentType* _html_type; std::unique_ptr<Operation> op; - BucketInstanceList getInstances(const BucketId& bid, - bool stripToRedundancy); + BucketInstanceList getInstances(const BucketId& bid, bool stripToRedundancy, bool symmetry_mode); void SetUp() override { _repo.reset(new document::DocumentTypeRepo( @@ -62,7 +61,7 @@ namespace { TestTargets::createTest(id, *this, *_asserters.back()) struct Asserter { - virtual ~Asserter() {} + virtual ~Asserter() = default; virtual void assertEqualMsg(std::string t1, OperationTargetList t2, OperationTargetList t3) = 0; @@ -73,21 +72,29 @@ struct TestTargets { OperationTargetList _expected; OperationTargetResolverTest& _test; Asserter& _asserter; + bool _symmetry_mode; TestTargets(const BucketId& id, OperationTargetResolverTest& test, Asserter& asserter) - : _id(id), _test(test), _asserter(asserter) {} + : _id(id), _test(test), _asserter(asserter), _symmetry_mode(true) + { + } ~TestTargets() { - BucketInstanceList result(_test.getInstances(_id, true)); - BucketInstanceList all(_test.getInstances(_id, false)); + BucketInstanceList result(_test.getInstances(_id, true, _symmetry_mode)); + BucketInstanceList all(_test.getInstances(_id, false, _symmetry_mode)); _asserter.assertEqualMsg( all.toString(), _expected, result.createTargets(makeBucketSpace())); delete _asserters.back(); _asserters.pop_back(); } + TestTargets& with_symmetric_replica_selection(bool symmetry) noexcept { + _symmetry_mode = symmetry; + return *this; + } + TestTargets& sendsTo(const BucketId& id, uint16_t node) { _expected.push_back(OperationTarget( makeDocumentBucket(id), lib::Node(lib::NodeType::STORAGE, node), false)); @@ -110,7 +117,7 @@ struct TestTargets { } // anonymous BucketInstanceList -OperationTargetResolverTest::getInstances(const BucketId& id, bool stripToRedundancy) +OperationTargetResolverTest::getInstances(const BucketId& id, bool stripToRedundancy, bool symmetry_mode) { auto &bucketSpaceRepo(operation_context().bucket_space_repo()); auto &distributorBucketSpace(bucketSpaceRepo.get(makeBucketSpace())); @@ -118,6 +125,7 @@ OperationTargetResolverTest::getInstances(const BucketId& id, bool stripToRedund distributorBucketSpace, distributorBucketSpace.getBucketDatabase(), 16, distributorBucketSpace.getDistribution().getRedundancy(), makeBucketSpace()); + resolver.use_symmetric_replica_selection(symmetry_mode); if (stripToRedundancy) { return resolver.getInstances(OperationTargetResolver::PUT, id); } else { @@ -143,14 +151,48 @@ TEST_F(OperationTargetResolverTest, choose_ideal_state_when_many_copies) { .sendsTo(BucketId(16, 0), 3); } -TEST_F(OperationTargetResolverTest, trusted_over_ideal_state) { +TEST_F(OperationTargetResolverTest, legacy_prefers_trusted_over_ideal_state) { setup_stripe(2, 4, "storage:4 distributor:1"); addNodesToBucketDB(BucketId(16, 0), "0=0/0/0/t,1=0,2=0/0/0/t,3=0"); // ideal nodes: 1, 3 + MY_ASSERT_THAT(BucketId(32, 0)).with_symmetric_replica_selection(false) + .sendsTo(BucketId(16, 0), 0) + .sendsTo(BucketId(16, 0), 2); +} + +TEST_F(OperationTargetResolverTest, prefer_ready_over_ideal_state_order) { + setup_stripe(2, 4, "storage:4 distributor:1"); + addNodesToBucketDB(BucketId(16, 0), "0=1/2/3/u/i/r,1=1/2/3,2=1/2/3/u/i/r,3=1/2/3"); + // ideal nodes: 1, 3. 0 and 2 are ready. MY_ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 0) .sendsTo(BucketId(16, 0), 2); } +TEST_F(OperationTargetResolverTest, prefer_ready_over_ideal_state_order_also_when_retired) { + setup_stripe(2, 4, "storage:4 .0.s:r distributor:1"); + addNodesToBucketDB(BucketId(16, 0), "0=1/2/3/u/i/r,1=1/2/3,2=1/2/3/u/i/r,3=1/2/3"); + // ideal nodes: 1, 3. 0 and 2 are ready. + MY_ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 0) + .sendsTo(BucketId(16, 0), 2); +} + +TEST_F(OperationTargetResolverTest, prefer_replicas_with_more_docs_over_replicas_with_fewer_docs) { + setup_stripe(2, 4, "storage:4 distributor:1"); + addNodesToBucketDB(BucketId(16, 0), "0=2/3/4,1=1/2/3,2=3/4/5,3=1/2/3"); + // ideal nodes: 1, 3. 0 and 2 have more docs. + MY_ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 2) + .sendsTo(BucketId(16, 0), 0); +} + +TEST_F(OperationTargetResolverTest, fall_back_to_active_state_and_db_index_if_all_other_fields_equal) { + // All replica nodes tagged as retired, which means none are part of the ideal state order + setup_stripe(2, 4, "storage:4 .0.s:r .2.s:r .3.s:r distributor:1"); + addNodesToBucketDB(BucketId(16, 0), "0=2/3/4/u/a,3=2/3/4,2=2/3/4"); + // ideal nodes: 1, 3. 0 is active and 3 is the remaining replica with the lowest DB order. + MY_ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 0) + .sendsTo(BucketId(16, 0), 3); +} + TEST_F(OperationTargetResolverTest, choose_highest_split_bucket) { setup_stripe(2, 2, "storage:2 distributor:1"); // 0, 1 are both in ideal state for both buckets. diff --git a/storage/src/vespa/storage/config/distributorconfiguration.cpp b/storage/src/vespa/storage/config/distributorconfiguration.cpp index 6957e541d6b..cfbc4caf82d 100644 --- a/storage/src/vespa/storage/config/distributorconfiguration.cpp +++ b/storage/src/vespa/storage/config/distributorconfiguration.cpp @@ -46,6 +46,7 @@ DistributorConfiguration::DistributorConfiguration(StorageComponent& component) _use_weak_internal_read_consistency_for_client_gets(false), _enable_metadata_only_fetch_phase_for_inconsistent_updates(true), _enable_operation_cancellation(false), + _symmetric_put_and_activate_replica_selection(false), _minimumReplicaCountingMode(ReplicaCountingMode::TRUSTED) { } @@ -150,6 +151,7 @@ DistributorConfiguration::configure(const DistributorManagerConfig & config) _max_activation_inhibited_out_of_sync_groups = config.maxActivationInhibitedOutOfSyncGroups; _enable_operation_cancellation = config.enableOperationCancellation; _minimumReplicaCountingMode = deriveReplicaCountingMode(config.minimumReplicaCountingMode); + _symmetric_put_and_activate_replica_selection = config.symmetricPutAndActivateReplicaSelection; if (config.maxClusterClockSkewSec >= 0) { _maxClusterClockSkew = std::chrono::seconds(config.maxClusterClockSkewSec); diff --git a/storage/src/vespa/storage/config/distributorconfiguration.h b/storage/src/vespa/storage/config/distributorconfiguration.h index 38fac13150c..2b73fdc0fa1 100644 --- a/storage/src/vespa/storage/config/distributorconfiguration.h +++ b/storage/src/vespa/storage/config/distributorconfiguration.h @@ -234,8 +234,11 @@ public: [[nodiscard]] bool enable_operation_cancellation() const noexcept { return _enable_operation_cancellation; } + [[nodiscard]] bool symmetric_put_and_activate_replica_selection() const noexcept { + return _symmetric_put_and_activate_replica_selection; + } - bool containsTimeStatement(const std::string& documentSelection) const; + [[nodiscard]] bool containsTimeStatement(const std::string& documentSelection) const; private: StorageComponent& _component; @@ -276,6 +279,7 @@ private: bool _use_weak_internal_read_consistency_for_client_gets; bool _enable_metadata_only_fetch_phase_for_inconsistent_updates; //TODO Rewrite tests and GC bool _enable_operation_cancellation; + bool _symmetric_put_and_activate_replica_selection; ReplicaCountingMode _minimumReplicaCountingMode; }; diff --git a/storage/src/vespa/storage/config/stor-distributormanager.def b/storage/src/vespa/storage/config/stor-distributormanager.def index 3f6028d7fa1..a4d4461ba68 100644 --- a/storage/src/vespa/storage/config/stor-distributormanager.def +++ b/storage/src/vespa/storage/config/stor-distributormanager.def @@ -159,6 +159,13 @@ num_distributor_stripes int default=0 restart ## requests partially or fully "invalidated" by such a change. enable_operation_cancellation bool default=false +## Iff true there will be an 1-1 symmetry between the replicas chosen as feed targets +## for Put operations and the replica selection logic for bucket activation. In particular, +## the most preferred replica for feed will be the most preferred bucket for activation. +## This helps ensure that new versions of documents are routed to replicas that are most +## likely to reflect these changes as part of visible search results. +symmetric_put_and_activate_replica_selection bool default=false + ## TODO GC very soon, it has no effect. priority_merge_out_of_sync_copies int default=120 diff --git a/storage/src/vespa/storage/distributor/activecopy.cpp b/storage/src/vespa/storage/distributor/activecopy.cpp index 35070bcee3b..b823978a0cc 100644 --- a/storage/src/vespa/storage/distributor/activecopy.cpp +++ b/storage/src/vespa/storage/distributor/activecopy.cpp @@ -108,6 +108,7 @@ buildNodeList(const BucketDatabase::Entry& e,vespalib::ConstArrayRef<uint16_t> n struct ActiveStateOrder { bool operator()(const ActiveCopy & e1, const ActiveCopy & e2) noexcept { + // Replica selection order should be kept in sync with OperationTargetResolverImpl's InstanceOrder. if (e1._ready != e2._ready) { return e1._ready; } @@ -120,7 +121,9 @@ struct ActiveStateOrder { if (e1._active != e2._active) { return e1._active; } - return e1.nodeIndex() < e2.nodeIndex(); + // Use _entry_ order instead of node index, as it is in ideal state order (even for retired + // nodes), which avoids unintentional affinities towards lower node indexes. + return e1.entryIndex() < e2.entryIndex(); } }; diff --git a/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp index 54087850e1b..a92896279b0 100644 --- a/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp +++ b/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp @@ -180,6 +180,8 @@ void PutOperation::start_direct_put_dispatch(DistributorStripeMessageSender& sen _op_ctx.distributor_config().getMinimalBucketSplit(), _bucket_space.getDistribution().getRedundancy(), _msg->getBucket().getBucketSpace()); + targetResolver.use_symmetric_replica_selection( + _op_ctx.distributor_config().symmetric_put_and_activate_replica_selection()); OperationTargetList targets(targetResolver.getTargets(OperationTargetResolver::PUT, _doc_id_bucket_id)); for (const auto& target : targets) { diff --git a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp index 394c13c2bad..618cfb56359 100644 --- a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp +++ b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp @@ -10,9 +10,12 @@ namespace storage::distributor { BucketInstance::BucketInstance(const document::BucketId& id, const api::BucketInfo& info, lib::Node node, - uint16_t idealLocationPriority, bool trusted, bool exist) noexcept + uint16_t ideal_location_priority, uint16_t db_entry_order, + bool trusted, bool exist) noexcept : _bucket(id), _info(info), _node(node), - _idealLocationPriority(idealLocationPriority), _trusted(trusted), _exist(exist) + _ideal_location_priority(ideal_location_priority), + _db_entry_order(db_entry_order), + _trusted(trusted), _exists(exist) { } @@ -24,8 +27,8 @@ BucketInstance::print(vespalib::asciistream& out, const PrintProperties&) const std::ostringstream ost; ost << std::hex << _bucket.getId(); - out << "(" << ost.str() << ", " << infoString << ", node " << _node.getIndex() << ", ideal " << _idealLocationPriority - << (_trusted ? ", trusted" : "") << (_exist ? "" : ", new copy") << ")"; + out << "(" << ost.str() << ", " << infoString << ", node " << _node.getIndex() << ", ideal " << _ideal_location_priority + << (_trusted ? ", trusted" : "") << (_exists ? "" : ", new copy") << ")"; } bool @@ -42,7 +45,7 @@ BucketInstanceList::add(const BucketDatabase::Entry& e, const IdealServiceLayerN for (uint32_t i = 0; i < e.getBucketInfo().getNodeCount(); ++i) { const BucketCopy& copy(e.getBucketInfo().getNodeRef(i)); lib::Node node(lib::NodeType::STORAGE, copy.getNode()); - _instances.emplace_back(e.getBucketId(), copy.getBucketInfo(), node, idealState.lookup(copy.getNode()), copy.trusted(), true); + _instances.emplace_back(e.getBucketId(), copy.getBucketInfo(), node, idealState.lookup(copy.getNode()), i, copy.trusted(), true); } } @@ -106,7 +109,8 @@ BucketInstanceList::extendToEnoughCopies(const DistributorBucketSpace& distribut for (uint32_t i=0; i<idealNodes.size(); ++i) { lib::Node node(lib::NodeType::STORAGE, idealNodes[i]); if (!contains(node)) { - _instances.emplace_back(newTarget, api::BucketInfo(), node, i, false, false); + // We don't sort `_instances` after extending, so just reuse `i` as dummy DB entry order. + _instances.emplace_back(newTarget, api::BucketInfo(), node, i, i, false, false); } } } @@ -116,7 +120,7 @@ BucketInstanceList::createTargets(document::BucketSpace bucketSpace) { OperationTargetList result; for (const auto& bi : _instances) { - result.emplace_back(document::Bucket(bucketSpace, bi._bucket), bi._node, !bi._exist); + result.emplace_back(document::Bucket(bucketSpace, bi._bucket), bi._node, !bi._exists); } return result; } @@ -129,6 +133,49 @@ BucketInstanceList::print(vespalib::asciistream& out, const PrintProperties& p) namespace { /** + * To maintain a symmetry between which replicas receive Puts and which versions are + * preferred for activation, use an identical ordering predicate for both (for the case + * where replicas are for the same concrete bucket). + * + * Must only be used with BucketInstances that have a distinct _db_entry_order set per instance. + */ +struct ActiveReplicaSymmetricInstanceOrder { + bool operator()(const BucketInstance& a, const BucketInstance& b) noexcept { + if (a._bucket == b._bucket) { + if (a._info.isReady() != b._info.isReady()) { + return a._info.isReady(); + } + if (a._info.getDocumentCount() != b._info.getDocumentCount()) { + return a._info.getDocumentCount() > b._info.getDocumentCount(); + } + if (a._ideal_location_priority != b._ideal_location_priority) { + return a._ideal_location_priority < b._ideal_location_priority; + } + if (a._info.isActive() != b._info.isActive()) { + return a._info.isActive(); + } + // If all else is equal, this implies both A and B are on retired nodes, which is unlikely + // but possible. Fall back to the existing DB _entry order_, which is equal to an ideal + // state order where retired nodes are considered part of the ideal state (which is not the + // case for most ideal state operations). Since the DB entry order is in ideal state order, + // using this instead of node _index_ avoids affinities to lower indexes in such edge cases. + return a._db_entry_order < b._db_entry_order; + } else { + // TODO this inconsistent split case is equal to the legacy logic (aside from the tie-breaking), + // but is considered to be extremely unlikely in practice, so not worth optimizing for. + if ((a._info.getMetaCount() == 0) ^ (b._info.getMetaCount() == 0)) { + return (a._info.getMetaCount() == 0); + } + if (a._bucket.getUsedBits() != b._bucket.getUsedBits()) { + return (a._bucket.getUsedBits() > b._bucket.getUsedBits()); + } + return a._db_entry_order < b._db_entry_order; + } + return false; + } +}; + +/** * - Trusted copies should be preferred over non-trusted copies for the same bucket. * - Buckets in ideal locations should be preferred over non-ideal locations for the * same bucket across several nodes. @@ -137,14 +184,14 @@ namespace { * - Right after split/join, bucket is often not in ideal location, but should be * preferred instead of source anyhow. */ -struct InstanceOrder { - bool operator()(const BucketInstance& a, const BucketInstance& b) { +struct LegacyInstanceOrder { + bool operator()(const BucketInstance& a, const BucketInstance& b) noexcept { if (a._bucket == b._bucket) { - // Trusted only makes sense within same bucket - // Prefer trusted buckets over non-trusted ones. + // Trusted only makes sense within same bucket + // Prefer trusted buckets over non-trusted ones. if (a._trusted != b._trusted) return a._trusted; - if (a._idealLocationPriority != b._idealLocationPriority) { - return a._idealLocationPriority < b._idealLocationPriority; + if (a._ideal_location_priority != b._ideal_location_priority) { + return a._ideal_location_priority < b._ideal_location_priority; } } else { if ((a._info.getMetaCount() == 0) ^ (b._info.getMetaCount() == 0)) { @@ -164,7 +211,11 @@ OperationTargetResolverImpl::getAllInstances(OperationType type, const document: BucketInstanceList instances; if (type == PUT) { instances.populate(id, _distributor_bucket_space, _bucketDatabase); - instances.sort(InstanceOrder()); + if (_symmetric_replica_selection) { + instances.sort(ActiveReplicaSymmetricInstanceOrder()); + } else { + instances.sort(LegacyInstanceOrder()); + } instances.removeNodeDuplicates(); instances.extendToEnoughCopies(_distributor_bucket_space, _bucketDatabase, _bucketDatabase.getAppropriateBucket(_minUsedBucketBits, id), id); diff --git a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h index 9f367a89cba..6ab38928200 100644 --- a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h +++ b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h @@ -15,15 +15,17 @@ struct BucketInstance : public vespalib::AsciiPrintable { document::BucketId _bucket; api::BucketInfo _info; lib::Node _node; - uint16_t _idealLocationPriority; - bool _trusted; - bool _exist; + uint16_t _ideal_location_priority; + uint16_t _db_entry_order; + bool _trusted; // TODO remove + bool _exists; BucketInstance() noexcept - : _idealLocationPriority(0xffff), _trusted(false), _exist(false) {} + : _ideal_location_priority(0xffff), _db_entry_order(0xffff), _trusted(false), _exists(false) + {} BucketInstance(const document::BucketId& id, const api::BucketInfo& info, - lib::Node node, uint16_t idealLocationPriority, bool trusted, - bool exist) noexcept; + lib::Node node, uint16_t ideal_location_priority, + uint16_t db_entry_order, bool trusted, bool exist) noexcept; void print(vespalib::asciistream& out, const PrintProperties&) const override; }; @@ -83,6 +85,7 @@ class OperationTargetResolverImpl : public OperationTargetResolver { uint32_t _minUsedBucketBits; uint16_t _redundancy; document::BucketSpace _bucketSpace; + bool _symmetric_replica_selection; public: OperationTargetResolverImpl(const DistributorBucketSpace& distributor_bucket_space, @@ -94,9 +97,14 @@ public: _bucketDatabase(bucketDatabase), _minUsedBucketBits(minUsedBucketBits), _redundancy(redundancy), - _bucketSpace(bucketSpace) + _bucketSpace(bucketSpace), + _symmetric_replica_selection(true) {} + void use_symmetric_replica_selection(bool symmetry) noexcept { + _symmetric_replica_selection = symmetry; + } + BucketInstanceList getAllInstances(OperationType type, const document::BucketId& id); BucketInstanceList getInstances(OperationType type, const document::BucketId& id) { BucketInstanceList result(getAllInstances(type, id)); |