diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2021-12-07 14:54:17 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahooinc.com> | 2021-12-07 15:04:28 +0000 |
commit | 85ced6e73b0b220dedebdb7f4b54bdceeb7f3fa2 (patch) | |
tree | 79d4bbc6eae895a735a4e843380962fc649c9a6b /storage | |
parent | e2818c4593f6964e57bdbd332ebe0481d39bb0d8 (diff) |
Prevent orphaned bucket replicas caused by indeterminate CreateBucket replies
Previously we'd implicitly assume a failed CreateBucket reply meant the
bucket replica was not created, but this does not hold in the general case.
A failure may just as well be due to connection failures etc between the
distributor and content node. To tell for sure, we now send an explicit
RequestBucketInfo to the node in the case of CreateBucket failures. If
it _was_ created, the replica will be reintroduced into the bucket DB.
We still implicitly delete the bucket replica from the DB to avoid
transiently routing client write load to a bucket that may likely not
exist.
Diffstat (limited to 'storage')
-rw-r--r-- | storage/src/tests/distributor/putoperationtest.cpp | 32 | ||||
-rw-r--r-- | storage/src/vespa/storage/distributor/persistencemessagetracker.cpp | 7 |
2 files changed, 36 insertions, 3 deletions
diff --git a/storage/src/tests/distributor/putoperationtest.cpp b/storage/src/tests/distributor/putoperationtest.cpp index a047fb7d79c..b02395717e0 100644 --- a/storage/src/tests/distributor/putoperationtest.cpp +++ b/storage/src/tests/distributor/putoperationtest.cpp @@ -51,9 +51,8 @@ public: document::BucketId createAndSendSampleDocument(vespalib::duration timeout); void sendReply(int idx = -1, - api::ReturnCode::Result result - = api::ReturnCode::OK, - api::BucketInfo info = api::BucketInfo(1,2,3,4,5)) + api::ReturnCode::Result result = api::ReturnCode::OK, + api::BucketInfo info = api::BucketInfo(1,2,3,4,5)) { ASSERT_FALSE(_sender.commands().empty()); if (idx == -1) { @@ -152,6 +151,33 @@ TEST_F(PutOperationTest, bucket_database_gets_special_entry_when_CreateBucket_se ASSERT_EQ("Create bucket => 0,Put => 0", _sender.getCommands(true)); } +TEST_F(PutOperationTest, failed_CreateBucket_removes_replica_from_db_and_sends_RequestBucketInfo) { + setup_stripe(2, 2, "distributor:1 storage:2"); + + auto doc = createDummyDocument("test", "test"); + sendPut(createPut(doc)); + + ASSERT_EQ("Create bucket => 1,Create bucket => 0,Put => 1,Put => 0", _sender.getCommands(true)); + + // Simulate timeouts on node 1. Replica existence is in a Schrödinger's cat state until we send + // a RequestBucketInfo to the node and open the box to find out for sure. + sendReply(0, api::ReturnCode::TIMEOUT, api::BucketInfo()); // CreateBucket + sendReply(2, api::ReturnCode::TIMEOUT, api::BucketInfo()); // Put + // Pretend everything went fine on node 0 + sendReply(1); // CreateBucket + sendReply(3); // Put + + ASSERT_EQ("BucketId(0x4000000000008f09) : " + "node(idx=0,crc=0x1,docs=2/4,bytes=3/5,trusted=true,active=false,ready=false)", + dumpBucket(operation_context().make_split_bit_constrained_bucket_id(doc->getId()))); + + // TODO remove revert concept; does not make sense with Proton (since it's not a multi-version store and + // therefore does not have anything to revert back to) and is config-disabled by default for this provider. + ASSERT_EQ("RequestBucketInfoCommand(1 buckets, super bucket BucketId(0x4000000000008f09). ) => 1," + "Revert(BucketId(0x4000000000008f09)) => 0", + _sender.getCommands(true, true, 4)); +} + TEST_F(PutOperationTest, send_inline_split_before_put_if_bucket_too_large) { setup_stripe(1, 1, "storage:1 distributor:1"); auto cfg = make_config(); diff --git a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp index 8cacbb0bf5a..45129f7be04 100644 --- a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp +++ b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp @@ -259,7 +259,14 @@ PersistenceMessageTrackerImpl::handleCreateBucketReply( && reply.getResult().getResult() != api::ReturnCode::EXISTS) { LOG(spam, "Create bucket reply failed, so deleting it from bucket db"); + // We don't know if the bucket exists at this point, so we remove it from the DB. + // If we get subsequent write load the bucket will be implicitly created again + // (which is an idempotent operation) and all is well. But since we don't know _if_ + // we'll get any further write load we send a RequestBucketInfo to bring the bucket + // back into the DB if it _was_ successfully created. We have to do the latter to + // avoid the risk of introducing an orphaned bucket replica on the content node. _op_ctx.remove_node_from_bucket_database(reply.getBucket(), node); + _op_ctx.recheck_bucket_info(node, reply.getBucket()); } } |