diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2020-05-25 14:13:03 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2020-05-26 14:07:50 +0000 |
commit | 8ad9c917da366e7faa8869b053db06d8c8393f8b (patch) | |
tree | ba7bb86fb4c12446fdb1f772762498b5f88a01a6 /storage | |
parent | a4c6471dcf5748a28d61ba10f1360548d88f352b (diff) |
Handle tombstones in GetOperation
If the newest document version is a tombstone, behave
as if the document was not found at all. Since we still
track replica consistency, this should work as expected
for multi-phase update operations as well.
Diffstat (limited to 'storage')
5 files changed, 106 insertions, 24 deletions
diff --git a/storage/src/tests/distributor/getoperationtest.cpp b/storage/src/tests/distributor/getoperationtest.cpp index f14b78094d1..d4d14314790 100644 --- a/storage/src/tests/distributor/getoperationtest.cpp +++ b/storage/src/tests/distributor/getoperationtest.cpp @@ -66,7 +66,9 @@ struct GetOperationTest : Test, DistributorTestUtil { void sendReply(uint32_t idx, api::ReturnCode::Result result, - std::string authorVal, uint32_t timestamp) + std::string authorVal, + uint32_t timestamp, + bool is_tombstone = false) { if (idx == LastCommand) { idx = _sender.commands().size() - 1; @@ -75,7 +77,8 @@ struct GetOperationTest : Test, DistributorTestUtil { std::shared_ptr<api::StorageCommand> msg2 = _sender.command(idx); ASSERT_EQ(api::MessageType::GET, msg2->getType()); - auto* tmp = static_cast<api::GetCommand*>(msg2.get()); + auto* tmp = dynamic_cast<api::GetCommand*>(msg2.get()); + assert(tmp != nullptr); document::Document::SP doc; if (!authorVal.empty()) { @@ -86,12 +89,16 @@ struct GetOperationTest : Test, DistributorTestUtil { document::StringFieldValue(authorVal)); } - auto reply = std::make_shared<api::GetReply>(*tmp, doc, timestamp); + auto reply = std::make_shared<api::GetReply>(*tmp, doc, timestamp, false, is_tombstone); reply->setResult(result); op->receive(_sender, reply); } + void reply_with_tombstone(uint32_t idx, uint32_t tombstone_ts) { + sendReply(idx, api::ReturnCode::OK, "", tombstone_ts, true); + } + void replyWithFailure() { sendReply(LastCommand, api::ReturnCode::IO_FAILURE, "", 0); } @@ -126,6 +133,13 @@ struct GetOperationTest : Test, DistributorTestUtil { return dynamic_cast<api::GetReply&>(msg).had_consistent_replicas(); } + bool last_reply_has_document() { + assert(!_sender.replies().empty()); + auto& msg = *_sender.replies().back(); + assert(msg.getType() == api::MessageType::GET_REPLY); + return (dynamic_cast<api::GetReply&>(msg).getDocument().get() != nullptr); + } + void setClusterState(const std::string& clusterState) { enableDistributorClusterState(clusterState); } @@ -138,8 +152,8 @@ GetOperationTest::~GetOperationTest() = default; namespace { -NewestReplica replica_of(api::Timestamp ts, const document::BucketId& bucket_id, uint16_t node) { - return NewestReplica::of(ts, bucket_id, node); +NewestReplica replica_of(api::Timestamp ts, const document::BucketId& bucket_id, uint16_t node, bool is_tombstone) { + return NewestReplica::of(ts, bucket_id, node, is_tombstone); } } @@ -161,7 +175,7 @@ TEST_F(GetOperationTest, simple) { EXPECT_FALSE(op->any_replicas_failed()); EXPECT_TRUE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 0), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 0, false), *op->newest_replica()); } TEST_F(GetOperationTest, ask_all_checksum_groups_if_inconsistent_even_if_trusted_replica_available) { @@ -182,7 +196,7 @@ TEST_F(GetOperationTest, ask_all_checksum_groups_if_inconsistent_even_if_trusted EXPECT_FALSE(op->any_replicas_failed()); EXPECT_FALSE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(2), bucketId, 0), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(2), bucketId, 0, false), *op->newest_replica()); } TEST_F(GetOperationTest, ask_all_nodes_if_bucket_is_inconsistent) { @@ -205,7 +219,7 @@ TEST_F(GetOperationTest, ask_all_nodes_if_bucket_is_inconsistent) { EXPECT_FALSE(op->any_replicas_failed()); EXPECT_FALSE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(2), bucketId, 1), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(2), bucketId, 1, false), *op->newest_replica()); } TEST_F(GetOperationTest, send_to_all_invalid_copies) { @@ -273,7 +287,7 @@ TEST_F(GetOperationTest, inconsistent_split) { EXPECT_FALSE(last_reply_had_consistent_replicas()); // Bucket with highest timestamp should be returned. In this case it's the one on node 0. ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(2), BucketId(16, 0x0593), 0), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(2), BucketId(16, 0x0593), 0, false), *op->newest_replica()); } TEST_F(GetOperationTest, multi_inconsistent_bucket_not_found) { @@ -317,7 +331,7 @@ TEST_F(GetOperationTest, multi_inconsistent_bucket_not_found_deleted) { EXPECT_FALSE(op->any_replicas_failed()); EXPECT_FALSE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(3), bucketId, 1), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(3), bucketId, 1, false), *op->newest_replica()); } TEST_F(GetOperationTest, multi_inconsistent_bucket) { @@ -367,7 +381,7 @@ TEST_F(GetOperationTest, multi_inconsistent_bucket_fail) { EXPECT_FALSE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); // First send to node 2 fails, second is to node 3 which returned the highest timestamp - EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 3), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 3, false), *op->newest_replica()); } TEST_F(GetOperationTest, return_not_found_when_bucket_not_in_db) { @@ -408,7 +422,7 @@ TEST_F(GetOperationTest, not_found) { // the caller may want to perform special logic if all replicas are in sync // but are missing the document. // FIXME make sure all callers are aware of this! - EXPECT_EQ(replica_of(api::Timestamp(0), bucketId, 0), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(0), bucketId, 0, false), *op->newest_replica()); } TEST_F(GetOperationTest, not_found_on_subset_of_replicas_marks_get_as_inconsistent) { @@ -453,7 +467,7 @@ TEST_F(GetOperationTest, resend_on_storage_failure) { // Replica had read failure, but they're still in sync. An immutable Get won't change that fact. EXPECT_TRUE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 2), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 2, false), *op->newest_replica()); } TEST_F(GetOperationTest, storage_failure_of_out_of_sync_replica_is_tracked_as_inconsistent) { @@ -470,7 +484,7 @@ TEST_F(GetOperationTest, storage_failure_of_out_of_sync_replica_is_tracked_as_in EXPECT_TRUE(op->any_replicas_failed()); EXPECT_FALSE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(3), bucketId, 2), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(3), bucketId, 2, false), *op->newest_replica()); } TEST_F(GetOperationTest, resend_on_storage_failure_all_fail) { @@ -519,7 +533,7 @@ TEST_F(GetOperationTest, send_to_ideal_copy_if_bucket_in_sync) { _sender.getLastReply()); EXPECT_TRUE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 1), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 1, false), *op->newest_replica()); } TEST_F(GetOperationTest, multiple_copies_with_failure_on_local_node) { @@ -550,7 +564,7 @@ TEST_F(GetOperationTest, multiple_copies_with_failure_on_local_node) { EXPECT_TRUE(op->any_replicas_failed()); EXPECT_TRUE(last_reply_had_consistent_replicas()); ASSERT_TRUE(op->newest_replica().has_value()); - EXPECT_EQ(replica_of(api::Timestamp(3), BucketId(16, 0x0593), 2), *op->newest_replica()); + EXPECT_EQ(replica_of(api::Timestamp(3), BucketId(16, 0x0593), 2, false), *op->newest_replica()); } TEST_F(GetOperationTest, can_get_documents_when_all_replica_nodes_retired) { @@ -580,4 +594,58 @@ TEST_F(GetOperationTest, can_send_gets_with_weak_internal_read_consistency) { do_test_read_consistency_is_propagated(api::InternalReadConsistency::Weak); } +TEST_F(GetOperationTest, replicas_considered_consistent_if_all_equal_tombstone_timestamps) { + setClusterState("distributor:1 storage:4"); + addNodesToBucketDB(bucketId, "0=100,2=100,1=200,3=200"); + sendGet(); + ASSERT_EQ("Get => 0,Get => 1", _sender.getCommands(true)); + + ASSERT_NO_FATAL_FAILURE(reply_with_tombstone(0, 100)); + ASSERT_NO_FATAL_FAILURE(reply_with_tombstone(1, 100)); + + ASSERT_EQ("GetReply(BucketId(0x0000000000000000), id:ns:text/html::uri, timestamp 0) ReturnCode(NONE)", + _sender.getLastReply()); + + EXPECT_FALSE(op->any_replicas_failed()); + EXPECT_TRUE(last_reply_had_consistent_replicas()); + EXPECT_FALSE(last_reply_has_document()); + EXPECT_EQ(replica_of(api::Timestamp(100), bucketId, 0, true), *op->newest_replica()); +} + +TEST_F(GetOperationTest, newer_tombstone_hides_older_document) { + setClusterState("distributor:1 storage:4"); + addNodesToBucketDB(bucketId, "0=100,2=100,1=200,3=200"); + sendGet(); + ASSERT_EQ("Get => 0,Get => 1", _sender.getCommands(true)); + + ASSERT_NO_FATAL_FAILURE(reply_with_tombstone(1, 200)); + ASSERT_NO_FATAL_FAILURE(sendReply(0, api::ReturnCode::OK, "newauthor", 100)); + + ASSERT_EQ("GetReply(BucketId(0x0000000000000000), id:ns:text/html::uri, timestamp 0) ReturnCode(NONE)", + _sender.getLastReply()); + + EXPECT_FALSE(op->any_replicas_failed()); + EXPECT_FALSE(last_reply_had_consistent_replicas()); + EXPECT_FALSE(last_reply_has_document()); + EXPECT_EQ(replica_of(api::Timestamp(200), bucketId, 1, true), *op->newest_replica()); +} + +TEST_F(GetOperationTest, older_tombstone_does_not_hide_newer_document) { + setClusterState("distributor:1 storage:4"); + addNodesToBucketDB(bucketId, "0=100,2=100,1=200,3=200"); + sendGet(); + ASSERT_EQ("Get => 0,Get => 1", _sender.getCommands(true)); + + ASSERT_NO_FATAL_FAILURE(reply_with_tombstone(1, 100)); + ASSERT_NO_FATAL_FAILURE(sendReply(0, api::ReturnCode::OK, "newauthor", 200)); + + ASSERT_EQ("GetReply(BucketId(0x0000000000000000), id:ns:text/html::uri, timestamp 200) ReturnCode(NONE)", + _sender.getLastReply()); + + EXPECT_FALSE(op->any_replicas_failed()); + EXPECT_FALSE(last_reply_had_consistent_replicas()); + EXPECT_TRUE(last_reply_has_document()); + EXPECT_EQ(replica_of(api::Timestamp(200), bucketId, 0, false), *op->newest_replica()); +} + } diff --git a/storage/src/vespa/storage/distributor/operations/external/getoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/getoperation.cpp index b59405a73cd..bbe477be9ef 100644 --- a/storage/src/vespa/storage/distributor/operations/external/getoperation.cpp +++ b/storage/src/vespa/storage/distributor/operations/external/getoperation.cpp @@ -168,8 +168,9 @@ GetOperation::onReceive(DistributorMessageSender& sender, const std::shared_ptr< if (!_newest_replica.has_value() || getreply->getLastModifiedTimestamp() > _newest_replica->timestamp) { _returnCode = getreply->getResult(); assert(response.second[i].to_node != UINT16_MAX); - _newest_replica = NewestReplica::of(getreply->getLastModifiedTimestamp(), bucket_id, send_state.to_node); - _doc = getreply->getDocument(); + _newest_replica = NewestReplica::of(getreply->getLastModifiedTimestamp(), bucket_id, + send_state.to_node, getreply->is_tombstone()); + _doc = getreply->getDocument(); // May be empty (tombstones). } } else { _any_replicas_failed = true; @@ -228,7 +229,12 @@ void GetOperation::sendReply(DistributorMessageSender& sender) { if (_msg.get()) { - const auto timestamp = _newest_replica.value_or(NewestReplica::make_empty()).timestamp; + const auto newest = _newest_replica.value_or(NewestReplica::make_empty()); + // If the newest entry is a tombstone (remove entry), the externally visible + // behavior is as if the document was not found. In this case _doc will also + // be empty. This means we also currently don't propagate tombstone status outside + // of this operation (except via the newest_replica() functionality). + const auto timestamp = (newest.is_tombstone ? api::Timestamp(0) : newest.timestamp); auto repl = std::make_shared<api::GetReply>(*_msg, _doc, timestamp, !_has_replica_inconsistency); repl->setResult(_returnCode); update_internal_metrics(); diff --git a/storage/src/vespa/storage/distributor/operations/external/newest_replica.cpp b/storage/src/vespa/storage/distributor/operations/external/newest_replica.cpp index 8ca3b9bf411..2520db3e57a 100644 --- a/storage/src/vespa/storage/distributor/operations/external/newest_replica.cpp +++ b/storage/src/vespa/storage/distributor/operations/external/newest_replica.cpp @@ -7,7 +7,9 @@ namespace storage::distributor { std::ostream& operator<<(std::ostream& os, const NewestReplica& nr) { os << "NewestReplica(timestamp " << nr.timestamp << ", bucket_id " << nr.bucket_id - << ", node " << nr.node << ')'; + << ", node " << nr.node + << ", is_tombstone " << nr.is_tombstone + << ')'; return os; } diff --git a/storage/src/vespa/storage/distributor/operations/external/newest_replica.h b/storage/src/vespa/storage/distributor/operations/external/newest_replica.h index 9eb9c1b8bd0..ec2b73e52ab 100644 --- a/storage/src/vespa/storage/distributor/operations/external/newest_replica.h +++ b/storage/src/vespa/storage/distributor/operations/external/newest_replica.h @@ -17,21 +17,24 @@ struct NewestReplica { api::Timestamp timestamp {0}; document::BucketId bucket_id; uint16_t node {UINT16_MAX}; + bool is_tombstone {false}; static NewestReplica of(api::Timestamp timestamp, const document::BucketId& bucket_id, - uint16_t node) noexcept { - return {timestamp, bucket_id, node}; + uint16_t node, + bool is_tombstone) noexcept { + return {timestamp, bucket_id, node, is_tombstone}; } static NewestReplica make_empty() { - return {api::Timestamp(0), document::BucketId(), 0}; + return {api::Timestamp(0), document::BucketId(), 0, false}; } bool operator==(const NewestReplica& rhs) const noexcept { return ((timestamp == rhs.timestamp) && (bucket_id == rhs.bucket_id) && - (node == rhs.node)); + (node == rhs.node) && + (is_tombstone == rhs.is_tombstone)); } bool operator!=(const NewestReplica& rhs) const noexcept { return !(*this == rhs); diff --git a/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp index 2e6fe4020c2..43cf43b02b6 100644 --- a/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp +++ b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp @@ -463,6 +463,9 @@ void TwoPhaseUpdateOperation::handle_safe_path_received_metadata_get( // Timestamps were not in sync, so we have to fetch the document from the highest // timestamped replica, apply the update to it and then explicitly Put the result // to all replicas. + // Note that this timestamp may be for a tombstone (remove) entry, in which case + // conditional create-if-missing behavior kicks in as usual. + // TODO avoid sending the Get at all if the newest replica is marked as a tombstone. _single_get_latency_timer.emplace(_manager.getClock()); document::Bucket bucket(_updateCmd->getBucket().getBucketSpace(), newest_replica->bucket_id); LOG(debug, "Update(%s): sending single payload Get to %s on node %u (had timestamp %" PRIu64 ")", |