From 2b9743be9c82fd6fc5087764f48f617e16b9f956 Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Tue, 13 Mar 2018 15:44:22 +0000 Subject: Missing replicas must be in a different checksum group than empty replicas Otherwise we risk merges with more than 16 nodes being scheduled for only the empty replicas and not actually include the missing replica nodes. This fixes #5313 --- storage/src/tests/distributor/mergelimitertest.cpp | 20 ++++++++++++++++++++ storage/src/tests/distributor/mergeoperationtest.cpp | 19 +++++++++++++++++++ .../operations/idealstate/mergeoperation.cpp | 6 +++--- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/storage/src/tests/distributor/mergelimitertest.cpp b/storage/src/tests/distributor/mergelimitertest.cpp index d6508ee82d2..0b40424594b 100644 --- a/storage/src/tests/distributor/mergelimitertest.cpp +++ b/storage/src/tests/distributor/mergelimitertest.cpp @@ -17,6 +17,7 @@ struct MergeLimiterTest : public CppUnit::TestFixture void non_source_only_replica_chosen_from_in_sync_group(); void non_source_only_replicas_preferred_when_replicas_not_in_sync(); void at_least_one_non_source_only_replica_chosen_when_all_trusted(); + void missing_replica_distinct_from_empty_replica(); CPPUNIT_TEST_SUITE(MergeLimiterTest); CPPUNIT_TEST(testKeepsAllBelowLimit); @@ -29,6 +30,7 @@ struct MergeLimiterTest : public CppUnit::TestFixture CPPUNIT_TEST(non_source_only_replica_chosen_from_in_sync_group); CPPUNIT_TEST(non_source_only_replicas_preferred_when_replicas_not_in_sync); CPPUNIT_TEST(at_least_one_non_source_only_replica_chosen_when_all_trusted); + CPPUNIT_TEST(missing_replica_distinct_from_empty_replica); CPPUNIT_TEST_SUITE_END(); }; @@ -52,6 +54,14 @@ namespace { _bucketDatabase.back()->setTrusted(true); return *this; } + NodeFactory& addMissing(int index) { + add(index, 0x1); // "Magic" checksum value implying invalid/recently created replica + return *this; + } + NodeFactory& addEmpty(int index) { + add(index, 0x0); + return *this; + } NodeFactory& setSourceOnly() { _nodes.back()._sourceOnly = true; return *this; @@ -208,4 +218,14 @@ void MergeLimiterTest::at_least_one_non_source_only_replica_chosen_when_all_trus ASSERT_LIMIT(3, nodes, "2,13s,1s"); } +void MergeLimiterTest::missing_replica_distinct_from_empty_replica() { + MergeLimiter::NodeArray nodes(NodeFactory() + .addEmpty(3) + .addEmpty(5) + .addMissing(1) + .addMissing(2)); + ASSERT_LIMIT(2, nodes, "5,2"); + ASSERT_LIMIT(3, nodes, "5,2,3"); +} + } // storage::distributor diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp index 468ddd47790..1556880d426 100644 --- a/storage/src/tests/distributor/mergeoperationtest.cpp +++ b/storage/src/tests/distributor/mergeoperationtest.cpp @@ -31,6 +31,7 @@ class MergeOperationTest : public CppUnit::TestFixture, CPPUNIT_TEST(onlyMarkRedundantRetiredReplicasAsSourceOnly); CPPUNIT_TEST(mark_post_merge_redundant_replicas_source_only); CPPUNIT_TEST(merge_operation_is_blocked_by_any_busy_target_node); + CPPUNIT_TEST(missing_replica_is_included_in_limited_node_list); CPPUNIT_TEST_SUITE_END(); std::unique_ptr _pendingTracker; @@ -45,6 +46,7 @@ protected: void onlyMarkRedundantRetiredReplicasAsSourceOnly(); void mark_post_merge_redundant_replicas_source_only(); void merge_operation_is_blocked_by_any_busy_target_node(); + void missing_replica_is_included_in_limited_node_list(); public: void setUp() override { @@ -504,5 +506,22 @@ void MergeOperationTest::merge_operation_is_blocked_by_any_busy_target_node() { CPPUNIT_ASSERT(op.isBlocked(*_pendingTracker)); } +void MergeOperationTest::missing_replica_is_included_in_limited_node_list() { + setupDistributor(Redundancy(4), NodeCount(4), "distributor:1 storage:4"); + getClock().setAbsoluteTimeInSeconds(10); + addNodesToBucketDB(document::BucketId(16, 1), "4=0/0/0/t,1=0/0/0t,2=0/0/0/t"); + const uint16_t max_merge_size = 2; + MergeOperation op(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)), toVector(0, 1, 2, 4)), max_merge_size); + op.setIdealStateManager(&getIdealStateManager()); + op.start(_sender, framework::MilliSecTime(0)); + + // Must include missing node 0 and not just 2 existing replicas + CPPUNIT_ASSERT_EQUAL( + std::string("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, " + "cluster state version: 0, nodes: [0, 4], chain: [], " + "reasons to start: ) => 0"), + _sender.getLastCommand(true)); +} + } // distributor } // storage diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp index 170ad8a9ac5..b61de2fa06b 100644 --- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp +++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp @@ -119,11 +119,11 @@ MergeOperation::onStart(DistributorMessageSender& sender) for (uint32_t i = 0; i < getNodes().size(); ++i) { const BucketCopy* copy = entry->getNode(getNodes()[i]); - if (copy == 0) { // New copies? - newCopies.push_back(std::make_unique(0, getNodes()[i], api::BucketInfo())); + if (copy == nullptr) { // New copies? + newCopies.emplace_back(std::make_unique(BucketCopy::recentlyCreatedCopy(0, getNodes()[i]))); copy = newCopies.back().get(); } - nodes.push_back(MergeMetaData(getNodes()[i], *copy)); + nodes.emplace_back(getNodes()[i], *copy); } _infoBefore = entry.getBucketInfo(); -- cgit v1.2.3 From b5ff64519674c8c06ed35903b653d8b0098d2720 Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Tue, 13 Mar 2018 16:39:23 +0000 Subject: Use more appropriate node indices in test --- storage/src/tests/distributor/mergeoperationtest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp index 1556880d426..672c1d06124 100644 --- a/storage/src/tests/distributor/mergeoperationtest.cpp +++ b/storage/src/tests/distributor/mergeoperationtest.cpp @@ -509,16 +509,16 @@ void MergeOperationTest::merge_operation_is_blocked_by_any_busy_target_node() { void MergeOperationTest::missing_replica_is_included_in_limited_node_list() { setupDistributor(Redundancy(4), NodeCount(4), "distributor:1 storage:4"); getClock().setAbsoluteTimeInSeconds(10); - addNodesToBucketDB(document::BucketId(16, 1), "4=0/0/0/t,1=0/0/0t,2=0/0/0/t"); + addNodesToBucketDB(document::BucketId(16, 1), "1=0/0/0/t,2=0/0/0/t,3=0/0/0/t"); const uint16_t max_merge_size = 2; - MergeOperation op(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)), toVector(0, 1, 2, 4)), max_merge_size); + MergeOperation op(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)), toVector(0, 1, 2, 3)), max_merge_size); op.setIdealStateManager(&getIdealStateManager()); op.start(_sender, framework::MilliSecTime(0)); // Must include missing node 0 and not just 2 existing replicas CPPUNIT_ASSERT_EQUAL( std::string("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, " - "cluster state version: 0, nodes: [0, 4], chain: [], " + "cluster state version: 0, nodes: [0, 1], chain: [], " "reasons to start: ) => 0"), _sender.getLastCommand(true)); } -- cgit v1.2.3