aboutsummaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2021-12-07 14:54:17 +0000
committerTor Brede Vekterli <vekterli@yahooinc.com>2021-12-07 15:04:28 +0000
commit85ced6e73b0b220dedebdb7f4b54bdceeb7f3fa2 (patch)
tree79d4bbc6eae895a735a4e843380962fc649c9a6b /storage
parente2818c4593f6964e57bdbd332ebe0481d39bb0d8 (diff)
Prevent orphaned bucket replicas caused by indeterminate CreateBucket replies
Previously we'd implicitly assume a failed CreateBucket reply meant the bucket replica was not created, but this does not hold in the general case. A failure may just as well be due to connection failures etc between the distributor and content node. To tell for sure, we now send an explicit RequestBucketInfo to the node in the case of CreateBucket failures. If it _was_ created, the replica will be reintroduced into the bucket DB. We still implicitly delete the bucket replica from the DB to avoid transiently routing client write load to a bucket that may likely not exist.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/putoperationtest.cpp32
-rw-r--r--storage/src/vespa/storage/distributor/persistencemessagetracker.cpp7
2 files changed, 36 insertions, 3 deletions
diff --git a/storage/src/tests/distributor/putoperationtest.cpp b/storage/src/tests/distributor/putoperationtest.cpp
index a047fb7d79c..b02395717e0 100644
--- a/storage/src/tests/distributor/putoperationtest.cpp
+++ b/storage/src/tests/distributor/putoperationtest.cpp
@@ -51,9 +51,8 @@ public:
document::BucketId createAndSendSampleDocument(vespalib::duration timeout);
void sendReply(int idx = -1,
- api::ReturnCode::Result result
- = api::ReturnCode::OK,
- api::BucketInfo info = api::BucketInfo(1,2,3,4,5))
+ api::ReturnCode::Result result = api::ReturnCode::OK,
+ api::BucketInfo info = api::BucketInfo(1,2,3,4,5))
{
ASSERT_FALSE(_sender.commands().empty());
if (idx == -1) {
@@ -152,6 +151,33 @@ TEST_F(PutOperationTest, bucket_database_gets_special_entry_when_CreateBucket_se
ASSERT_EQ("Create bucket => 0,Put => 0", _sender.getCommands(true));
}
+TEST_F(PutOperationTest, failed_CreateBucket_removes_replica_from_db_and_sends_RequestBucketInfo) {
+ setup_stripe(2, 2, "distributor:1 storage:2");
+
+ auto doc = createDummyDocument("test", "test");
+ sendPut(createPut(doc));
+
+ ASSERT_EQ("Create bucket => 1,Create bucket => 0,Put => 1,Put => 0", _sender.getCommands(true));
+
+ // Simulate timeouts on node 1. Replica existence is in a Schrödinger's cat state until we send
+ // a RequestBucketInfo to the node and open the box to find out for sure.
+ sendReply(0, api::ReturnCode::TIMEOUT, api::BucketInfo()); // CreateBucket
+ sendReply(2, api::ReturnCode::TIMEOUT, api::BucketInfo()); // Put
+ // Pretend everything went fine on node 0
+ sendReply(1); // CreateBucket
+ sendReply(3); // Put
+
+ ASSERT_EQ("BucketId(0x4000000000008f09) : "
+ "node(idx=0,crc=0x1,docs=2/4,bytes=3/5,trusted=true,active=false,ready=false)",
+ dumpBucket(operation_context().make_split_bit_constrained_bucket_id(doc->getId())));
+
+ // TODO remove revert concept; does not make sense with Proton (since it's not a multi-version store and
+ // therefore does not have anything to revert back to) and is config-disabled by default for this provider.
+ ASSERT_EQ("RequestBucketInfoCommand(1 buckets, super bucket BucketId(0x4000000000008f09). ) => 1,"
+ "Revert(BucketId(0x4000000000008f09)) => 0",
+ _sender.getCommands(true, true, 4));
+}
+
TEST_F(PutOperationTest, send_inline_split_before_put_if_bucket_too_large) {
setup_stripe(1, 1, "storage:1 distributor:1");
auto cfg = make_config();
diff --git a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp
index 8cacbb0bf5a..45129f7be04 100644
--- a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp
+++ b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp
@@ -259,7 +259,14 @@ PersistenceMessageTrackerImpl::handleCreateBucketReply(
&& reply.getResult().getResult() != api::ReturnCode::EXISTS)
{
LOG(spam, "Create bucket reply failed, so deleting it from bucket db");
+ // We don't know if the bucket exists at this point, so we remove it from the DB.
+ // If we get subsequent write load the bucket will be implicitly created again
+ // (which is an idempotent operation) and all is well. But since we don't know _if_
+ // we'll get any further write load we send a RequestBucketInfo to bring the bucket
+ // back into the DB if it _was_ successfully created. We have to do the latter to
+ // avoid the risk of introducing an orphaned bucket replica on the content node.
_op_ctx.remove_node_from_bucket_database(reply.getBucket(), node);
+ _op_ctx.recheck_bucket_info(node, reply.getBucket());
}
}