aboutsummaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-05-22 12:58:45 +0000
committerTor Brede Vekterli <vekterli@vespa.ai>2024-05-22 13:42:45 +0000
commit6112b790f867fa464bb03045142a041a18974124 (patch)
treee8632409b38519bc81b48d259a3f44507370b0b5 /storage
parent0c63bb70d57d05444f3e899fbaa93d51057f0699 (diff)
Make replica selection for Puts and bucket activation symmetric
The legacy Put replica selection behavior may route new versions of a document to replicas that are not considered optimal for activation. This is not normally an issue, but can manifest itself as missing coverage when the system is in flux with replicas moving away from Retired nodes containing ready replicas, as the existing replicas on the Retired node would be preferred for activation (and thus be used for searches) but incoming Puts would instead be sent to non-retired nodes due to being in the ideal state. The new replica ordering (and transitively, selection) behavior is identical between Puts and activation. This should help ensure that new versions of the document is routed to replica(s) that are most likely to be visible as part of searches. New selection behavior for Puts is config-gated and defaults to the legacy behavior. This also subtly changes the fallback ordering criteria for replica activation to consider the replica's existing DB _entry_ order instead of its node index. Since DB entries are always ordered by their ideal state order (with Retired nodes included), this will evenly distribute fallback activations rather than skewing them towards lower indexes. It is not expected that this has any negative effects in practice, and is therefore _not_ a config-gated change.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/operationtargetresolvertest.cpp58
-rw-r--r--storage/src/vespa/storage/config/distributorconfiguration.cpp2
-rw-r--r--storage/src/vespa/storage/config/distributorconfiguration.h6
-rw-r--r--storage/src/vespa/storage/config/stor-distributormanager.def7
-rw-r--r--storage/src/vespa/storage/distributor/activecopy.cpp5
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/putoperation.cpp2
-rw-r--r--storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp79
-rw-r--r--storage/src/vespa/storage/distributor/operationtargetresolverimpl.h22
8 files changed, 150 insertions, 31 deletions
diff --git a/storage/src/tests/distributor/operationtargetresolvertest.cpp b/storage/src/tests/distributor/operationtargetresolvertest.cpp
index f0f8a4359fd..171dc5a42c0 100644
--- a/storage/src/tests/distributor/operationtargetresolvertest.cpp
+++ b/storage/src/tests/distributor/operationtargetresolvertest.cpp
@@ -27,8 +27,7 @@ struct OperationTargetResolverTest : Test, DistributorStripeTestUtil {
const document::DocumentType* _html_type;
std::unique_ptr<Operation> op;
- BucketInstanceList getInstances(const BucketId& bid,
- bool stripToRedundancy);
+ BucketInstanceList getInstances(const BucketId& bid, bool stripToRedundancy, bool symmetry_mode);
void SetUp() override {
_repo.reset(new document::DocumentTypeRepo(
@@ -62,7 +61,7 @@ namespace {
TestTargets::createTest(id, *this, *_asserters.back())
struct Asserter {
- virtual ~Asserter() {}
+ virtual ~Asserter() = default;
virtual void assertEqualMsg(std::string t1,
OperationTargetList t2,
OperationTargetList t3) = 0;
@@ -73,21 +72,29 @@ struct TestTargets {
OperationTargetList _expected;
OperationTargetResolverTest& _test;
Asserter& _asserter;
+ bool _symmetry_mode;
TestTargets(const BucketId& id,
OperationTargetResolverTest& test,
Asserter& asserter)
- : _id(id), _test(test), _asserter(asserter) {}
+ : _id(id), _test(test), _asserter(asserter), _symmetry_mode(true)
+ {
+ }
~TestTargets() {
- BucketInstanceList result(_test.getInstances(_id, true));
- BucketInstanceList all(_test.getInstances(_id, false));
+ BucketInstanceList result(_test.getInstances(_id, true, _symmetry_mode));
+ BucketInstanceList all(_test.getInstances(_id, false, _symmetry_mode));
_asserter.assertEqualMsg(
all.toString(), _expected, result.createTargets(makeBucketSpace()));
delete _asserters.back();
_asserters.pop_back();
}
+ TestTargets& with_symmetric_replica_selection(bool symmetry) noexcept {
+ _symmetry_mode = symmetry;
+ return *this;
+ }
+
TestTargets& sendsTo(const BucketId& id, uint16_t node) {
_expected.push_back(OperationTarget(
makeDocumentBucket(id), lib::Node(lib::NodeType::STORAGE, node), false));
@@ -110,7 +117,7 @@ struct TestTargets {
} // anonymous
BucketInstanceList
-OperationTargetResolverTest::getInstances(const BucketId& id, bool stripToRedundancy)
+OperationTargetResolverTest::getInstances(const BucketId& id, bool stripToRedundancy, bool symmetry_mode)
{
auto &bucketSpaceRepo(operation_context().bucket_space_repo());
auto &distributorBucketSpace(bucketSpaceRepo.get(makeBucketSpace()));
@@ -118,6 +125,7 @@ OperationTargetResolverTest::getInstances(const BucketId& id, bool stripToRedund
distributorBucketSpace, distributorBucketSpace.getBucketDatabase(), 16,
distributorBucketSpace.getDistribution().getRedundancy(),
makeBucketSpace());
+ resolver.use_symmetric_replica_selection(symmetry_mode);
if (stripToRedundancy) {
return resolver.getInstances(OperationTargetResolver::PUT, id);
} else {
@@ -143,14 +151,48 @@ TEST_F(OperationTargetResolverTest, choose_ideal_state_when_many_copies) {
.sendsTo(BucketId(16, 0), 3);
}
-TEST_F(OperationTargetResolverTest, trusted_over_ideal_state) {
+TEST_F(OperationTargetResolverTest, legacy_prefers_trusted_over_ideal_state) {
setup_stripe(2, 4, "storage:4 distributor:1");
addNodesToBucketDB(BucketId(16, 0), "0=0/0/0/t,1=0,2=0/0/0/t,3=0");
// ideal nodes: 1, 3
+ MY_ASSERT_THAT(BucketId(32, 0)).with_symmetric_replica_selection(false)
+ .sendsTo(BucketId(16, 0), 0)
+ .sendsTo(BucketId(16, 0), 2);
+}
+
+TEST_F(OperationTargetResolverTest, prefer_ready_over_ideal_state_order) {
+ setup_stripe(2, 4, "storage:4 distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "0=1/2/3/u/i/r,1=1/2/3,2=1/2/3/u/i/r,3=1/2/3");
+ // ideal nodes: 1, 3. 0 and 2 are ready.
MY_ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 0)
.sendsTo(BucketId(16, 0), 2);
}
+TEST_F(OperationTargetResolverTest, prefer_ready_over_ideal_state_order_also_when_retired) {
+ setup_stripe(2, 4, "storage:4 .0.s:r distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "0=1/2/3/u/i/r,1=1/2/3,2=1/2/3/u/i/r,3=1/2/3");
+ // ideal nodes: 1, 3. 0 and 2 are ready.
+ MY_ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 0)
+ .sendsTo(BucketId(16, 0), 2);
+}
+
+TEST_F(OperationTargetResolverTest, prefer_replicas_with_more_docs_over_replicas_with_fewer_docs) {
+ setup_stripe(2, 4, "storage:4 distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "0=2/3/4,1=1/2/3,2=3/4/5,3=1/2/3");
+ // ideal nodes: 1, 3. 0 and 2 have more docs.
+ MY_ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 2)
+ .sendsTo(BucketId(16, 0), 0);
+}
+
+TEST_F(OperationTargetResolverTest, fall_back_to_active_state_and_db_index_if_all_other_fields_equal) {
+ // All replica nodes tagged as retired, which means none are part of the ideal state order
+ setup_stripe(2, 4, "storage:4 .0.s:r .2.s:r .3.s:r distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "0=2/3/4/u/a,3=2/3/4,2=2/3/4");
+ // ideal nodes: 1, 3. 0 is active and 3 is the remaining replica with the lowest DB order.
+ MY_ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 0)
+ .sendsTo(BucketId(16, 0), 3);
+}
+
TEST_F(OperationTargetResolverTest, choose_highest_split_bucket) {
setup_stripe(2, 2, "storage:2 distributor:1");
// 0, 1 are both in ideal state for both buckets.
diff --git a/storage/src/vespa/storage/config/distributorconfiguration.cpp b/storage/src/vespa/storage/config/distributorconfiguration.cpp
index 6957e541d6b..cfbc4caf82d 100644
--- a/storage/src/vespa/storage/config/distributorconfiguration.cpp
+++ b/storage/src/vespa/storage/config/distributorconfiguration.cpp
@@ -46,6 +46,7 @@ DistributorConfiguration::DistributorConfiguration(StorageComponent& component)
_use_weak_internal_read_consistency_for_client_gets(false),
_enable_metadata_only_fetch_phase_for_inconsistent_updates(true),
_enable_operation_cancellation(false),
+ _symmetric_put_and_activate_replica_selection(false),
_minimumReplicaCountingMode(ReplicaCountingMode::TRUSTED)
{
}
@@ -150,6 +151,7 @@ DistributorConfiguration::configure(const DistributorManagerConfig & config)
_max_activation_inhibited_out_of_sync_groups = config.maxActivationInhibitedOutOfSyncGroups;
_enable_operation_cancellation = config.enableOperationCancellation;
_minimumReplicaCountingMode = deriveReplicaCountingMode(config.minimumReplicaCountingMode);
+ _symmetric_put_and_activate_replica_selection = config.symmetricPutAndActivateReplicaSelection;
if (config.maxClusterClockSkewSec >= 0) {
_maxClusterClockSkew = std::chrono::seconds(config.maxClusterClockSkewSec);
diff --git a/storage/src/vespa/storage/config/distributorconfiguration.h b/storage/src/vespa/storage/config/distributorconfiguration.h
index 38fac13150c..2b73fdc0fa1 100644
--- a/storage/src/vespa/storage/config/distributorconfiguration.h
+++ b/storage/src/vespa/storage/config/distributorconfiguration.h
@@ -234,8 +234,11 @@ public:
[[nodiscard]] bool enable_operation_cancellation() const noexcept {
return _enable_operation_cancellation;
}
+ [[nodiscard]] bool symmetric_put_and_activate_replica_selection() const noexcept {
+ return _symmetric_put_and_activate_replica_selection;
+ }
- bool containsTimeStatement(const std::string& documentSelection) const;
+ [[nodiscard]] bool containsTimeStatement(const std::string& documentSelection) const;
private:
StorageComponent& _component;
@@ -276,6 +279,7 @@ private:
bool _use_weak_internal_read_consistency_for_client_gets;
bool _enable_metadata_only_fetch_phase_for_inconsistent_updates; //TODO Rewrite tests and GC
bool _enable_operation_cancellation;
+ bool _symmetric_put_and_activate_replica_selection;
ReplicaCountingMode _minimumReplicaCountingMode;
};
diff --git a/storage/src/vespa/storage/config/stor-distributormanager.def b/storage/src/vespa/storage/config/stor-distributormanager.def
index 3f6028d7fa1..a4d4461ba68 100644
--- a/storage/src/vespa/storage/config/stor-distributormanager.def
+++ b/storage/src/vespa/storage/config/stor-distributormanager.def
@@ -159,6 +159,13 @@ num_distributor_stripes int default=0 restart
## requests partially or fully "invalidated" by such a change.
enable_operation_cancellation bool default=false
+## Iff true there will be an 1-1 symmetry between the replicas chosen as feed targets
+## for Put operations and the replica selection logic for bucket activation. In particular,
+## the most preferred replica for feed will be the most preferred bucket for activation.
+## This helps ensure that new versions of documents are routed to replicas that are most
+## likely to reflect these changes as part of visible search results.
+symmetric_put_and_activate_replica_selection bool default=false
+
## TODO GC very soon, it has no effect.
priority_merge_out_of_sync_copies int default=120
diff --git a/storage/src/vespa/storage/distributor/activecopy.cpp b/storage/src/vespa/storage/distributor/activecopy.cpp
index 35070bcee3b..b823978a0cc 100644
--- a/storage/src/vespa/storage/distributor/activecopy.cpp
+++ b/storage/src/vespa/storage/distributor/activecopy.cpp
@@ -108,6 +108,7 @@ buildNodeList(const BucketDatabase::Entry& e,vespalib::ConstArrayRef<uint16_t> n
struct ActiveStateOrder {
bool operator()(const ActiveCopy & e1, const ActiveCopy & e2) noexcept {
+ // Replica selection order should be kept in sync with OperationTargetResolverImpl's InstanceOrder.
if (e1._ready != e2._ready) {
return e1._ready;
}
@@ -120,7 +121,9 @@ struct ActiveStateOrder {
if (e1._active != e2._active) {
return e1._active;
}
- return e1.nodeIndex() < e2.nodeIndex();
+ // Use _entry_ order instead of node index, as it is in ideal state order (even for retired
+ // nodes), which avoids unintentional affinities towards lower node indexes.
+ return e1.entryIndex() < e2.entryIndex();
}
};
diff --git a/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp
index 54087850e1b..a92896279b0 100644
--- a/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp
@@ -180,6 +180,8 @@ void PutOperation::start_direct_put_dispatch(DistributorStripeMessageSender& sen
_op_ctx.distributor_config().getMinimalBucketSplit(),
_bucket_space.getDistribution().getRedundancy(),
_msg->getBucket().getBucketSpace());
+ targetResolver.use_symmetric_replica_selection(
+ _op_ctx.distributor_config().symmetric_put_and_activate_replica_selection());
OperationTargetList targets(targetResolver.getTargets(OperationTargetResolver::PUT, _doc_id_bucket_id));
for (const auto& target : targets) {
diff --git a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp
index 394c13c2bad..618cfb56359 100644
--- a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp
+++ b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp
@@ -10,9 +10,12 @@
namespace storage::distributor {
BucketInstance::BucketInstance(const document::BucketId& id, const api::BucketInfo& info, lib::Node node,
- uint16_t idealLocationPriority, bool trusted, bool exist) noexcept
+ uint16_t ideal_location_priority, uint16_t db_entry_order,
+ bool trusted, bool exist) noexcept
: _bucket(id), _info(info), _node(node),
- _idealLocationPriority(idealLocationPriority), _trusted(trusted), _exist(exist)
+ _ideal_location_priority(ideal_location_priority),
+ _db_entry_order(db_entry_order),
+ _trusted(trusted), _exists(exist)
{
}
@@ -24,8 +27,8 @@ BucketInstance::print(vespalib::asciistream& out, const PrintProperties&) const
std::ostringstream ost;
ost << std::hex << _bucket.getId();
- out << "(" << ost.str() << ", " << infoString << ", node " << _node.getIndex() << ", ideal " << _idealLocationPriority
- << (_trusted ? ", trusted" : "") << (_exist ? "" : ", new copy") << ")";
+ out << "(" << ost.str() << ", " << infoString << ", node " << _node.getIndex() << ", ideal " << _ideal_location_priority
+ << (_trusted ? ", trusted" : "") << (_exists ? "" : ", new copy") << ")";
}
bool
@@ -42,7 +45,7 @@ BucketInstanceList::add(const BucketDatabase::Entry& e, const IdealServiceLayerN
for (uint32_t i = 0; i < e.getBucketInfo().getNodeCount(); ++i) {
const BucketCopy& copy(e.getBucketInfo().getNodeRef(i));
lib::Node node(lib::NodeType::STORAGE, copy.getNode());
- _instances.emplace_back(e.getBucketId(), copy.getBucketInfo(), node, idealState.lookup(copy.getNode()), copy.trusted(), true);
+ _instances.emplace_back(e.getBucketId(), copy.getBucketInfo(), node, idealState.lookup(copy.getNode()), i, copy.trusted(), true);
}
}
@@ -106,7 +109,8 @@ BucketInstanceList::extendToEnoughCopies(const DistributorBucketSpace& distribut
for (uint32_t i=0; i<idealNodes.size(); ++i) {
lib::Node node(lib::NodeType::STORAGE, idealNodes[i]);
if (!contains(node)) {
- _instances.emplace_back(newTarget, api::BucketInfo(), node, i, false, false);
+ // We don't sort `_instances` after extending, so just reuse `i` as dummy DB entry order.
+ _instances.emplace_back(newTarget, api::BucketInfo(), node, i, i, false, false);
}
}
}
@@ -116,7 +120,7 @@ BucketInstanceList::createTargets(document::BucketSpace bucketSpace)
{
OperationTargetList result;
for (const auto& bi : _instances) {
- result.emplace_back(document::Bucket(bucketSpace, bi._bucket), bi._node, !bi._exist);
+ result.emplace_back(document::Bucket(bucketSpace, bi._bucket), bi._node, !bi._exists);
}
return result;
}
@@ -129,6 +133,49 @@ BucketInstanceList::print(vespalib::asciistream& out, const PrintProperties& p)
namespace {
/**
+ * To maintain a symmetry between which replicas receive Puts and which versions are
+ * preferred for activation, use an identical ordering predicate for both (for the case
+ * where replicas are for the same concrete bucket).
+ *
+ * Must only be used with BucketInstances that have a distinct _db_entry_order set per instance.
+ */
+struct ActiveReplicaSymmetricInstanceOrder {
+ bool operator()(const BucketInstance& a, const BucketInstance& b) noexcept {
+ if (a._bucket == b._bucket) {
+ if (a._info.isReady() != b._info.isReady()) {
+ return a._info.isReady();
+ }
+ if (a._info.getDocumentCount() != b._info.getDocumentCount()) {
+ return a._info.getDocumentCount() > b._info.getDocumentCount();
+ }
+ if (a._ideal_location_priority != b._ideal_location_priority) {
+ return a._ideal_location_priority < b._ideal_location_priority;
+ }
+ if (a._info.isActive() != b._info.isActive()) {
+ return a._info.isActive();
+ }
+ // If all else is equal, this implies both A and B are on retired nodes, which is unlikely
+ // but possible. Fall back to the existing DB _entry order_, which is equal to an ideal
+ // state order where retired nodes are considered part of the ideal state (which is not the
+ // case for most ideal state operations). Since the DB entry order is in ideal state order,
+ // using this instead of node _index_ avoids affinities to lower indexes in such edge cases.
+ return a._db_entry_order < b._db_entry_order;
+ } else {
+ // TODO this inconsistent split case is equal to the legacy logic (aside from the tie-breaking),
+ // but is considered to be extremely unlikely in practice, so not worth optimizing for.
+ if ((a._info.getMetaCount() == 0) ^ (b._info.getMetaCount() == 0)) {
+ return (a._info.getMetaCount() == 0);
+ }
+ if (a._bucket.getUsedBits() != b._bucket.getUsedBits()) {
+ return (a._bucket.getUsedBits() > b._bucket.getUsedBits());
+ }
+ return a._db_entry_order < b._db_entry_order;
+ }
+ return false;
+ }
+};
+
+/**
* - Trusted copies should be preferred over non-trusted copies for the same bucket.
* - Buckets in ideal locations should be preferred over non-ideal locations for the
* same bucket across several nodes.
@@ -137,14 +184,14 @@ namespace {
* - Right after split/join, bucket is often not in ideal location, but should be
* preferred instead of source anyhow.
*/
-struct InstanceOrder {
- bool operator()(const BucketInstance& a, const BucketInstance& b) {
+struct LegacyInstanceOrder {
+ bool operator()(const BucketInstance& a, const BucketInstance& b) noexcept {
if (a._bucket == b._bucket) {
- // Trusted only makes sense within same bucket
- // Prefer trusted buckets over non-trusted ones.
+ // Trusted only makes sense within same bucket
+ // Prefer trusted buckets over non-trusted ones.
if (a._trusted != b._trusted) return a._trusted;
- if (a._idealLocationPriority != b._idealLocationPriority) {
- return a._idealLocationPriority < b._idealLocationPriority;
+ if (a._ideal_location_priority != b._ideal_location_priority) {
+ return a._ideal_location_priority < b._ideal_location_priority;
}
} else {
if ((a._info.getMetaCount() == 0) ^ (b._info.getMetaCount() == 0)) {
@@ -164,7 +211,11 @@ OperationTargetResolverImpl::getAllInstances(OperationType type, const document:
BucketInstanceList instances;
if (type == PUT) {
instances.populate(id, _distributor_bucket_space, _bucketDatabase);
- instances.sort(InstanceOrder());
+ if (_symmetric_replica_selection) {
+ instances.sort(ActiveReplicaSymmetricInstanceOrder());
+ } else {
+ instances.sort(LegacyInstanceOrder());
+ }
instances.removeNodeDuplicates();
instances.extendToEnoughCopies(_distributor_bucket_space, _bucketDatabase,
_bucketDatabase.getAppropriateBucket(_minUsedBucketBits, id), id);
diff --git a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h
index 9f367a89cba..6ab38928200 100644
--- a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h
+++ b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h
@@ -15,15 +15,17 @@ struct BucketInstance : public vespalib::AsciiPrintable {
document::BucketId _bucket;
api::BucketInfo _info;
lib::Node _node;
- uint16_t _idealLocationPriority;
- bool _trusted;
- bool _exist;
+ uint16_t _ideal_location_priority;
+ uint16_t _db_entry_order;
+ bool _trusted; // TODO remove
+ bool _exists;
BucketInstance() noexcept
- : _idealLocationPriority(0xffff), _trusted(false), _exist(false) {}
+ : _ideal_location_priority(0xffff), _db_entry_order(0xffff), _trusted(false), _exists(false)
+ {}
BucketInstance(const document::BucketId& id, const api::BucketInfo& info,
- lib::Node node, uint16_t idealLocationPriority, bool trusted,
- bool exist) noexcept;
+ lib::Node node, uint16_t ideal_location_priority,
+ uint16_t db_entry_order, bool trusted, bool exist) noexcept;
void print(vespalib::asciistream& out, const PrintProperties&) const override;
};
@@ -83,6 +85,7 @@ class OperationTargetResolverImpl : public OperationTargetResolver {
uint32_t _minUsedBucketBits;
uint16_t _redundancy;
document::BucketSpace _bucketSpace;
+ bool _symmetric_replica_selection;
public:
OperationTargetResolverImpl(const DistributorBucketSpace& distributor_bucket_space,
@@ -94,9 +97,14 @@ public:
_bucketDatabase(bucketDatabase),
_minUsedBucketBits(minUsedBucketBits),
_redundancy(redundancy),
- _bucketSpace(bucketSpace)
+ _bucketSpace(bucketSpace),
+ _symmetric_replica_selection(true)
{}
+ void use_symmetric_replica_selection(bool symmetry) noexcept {
+ _symmetric_replica_selection = symmetry;
+ }
+
BucketInstanceList getAllInstances(OperationType type, const document::BucketId& id);
BucketInstanceList getInstances(OperationType type, const document::BucketId& id) {
BucketInstanceList result(getAllInstances(type, id));