diff options
Diffstat (limited to 'storage')
3 files changed, 78 insertions, 96 deletions
diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp index 1e317e20b74..79c6acd5f4b 100644 --- a/storage/src/tests/distributor/mergeoperationtest.cpp +++ b/storage/src/tests/distributor/mergeoperationtest.cpp @@ -26,6 +26,7 @@ class MergeOperationTest : public CppUnit::TestFixture, CPPUNIT_TEST(testDoNotRemoveActiveSourceOnlyCopies); CPPUNIT_TEST(testMarkRedundantTrustedCopiesAsSourceOnly); CPPUNIT_TEST(onlyMarkRedundantRetiredReplicasAsSourceOnly); + CPPUNIT_TEST(mark_post_merge_redundant_replicas_source_only); CPPUNIT_TEST_SUITE_END(); std::unique_ptr<PendingMessageTracker> _pendingTracker; @@ -38,6 +39,7 @@ protected: void testDoNotRemoveActiveSourceOnlyCopies(); void testMarkRedundantTrustedCopiesAsSourceOnly(); void onlyMarkRedundantRetiredReplicasAsSourceOnly(); + void mark_post_merge_redundant_replicas_source_only(); public: void setUp() override { @@ -269,13 +271,13 @@ MergeOperationTest::testGenerateNodeList() std::string("3,5,7,6,8,0,9,2,1,4"), getNodeList("storage:10", 10, "9,8,7,6,5,4,3,2,1,0")); - // Trusted copies should not be source only. + // Trusted copies can be source-only if they are in the non-ideal node set. CPPUNIT_ASSERT_EQUAL( - std::string("3,5,7,6,8,0,9,1,2,4s"), + std::string("3,5,7,6,8,0,9,1s,2s,4s"), getNodeList("storage:10", 7, "0,1t,2t,3,4,5,6,7,8,9")); CPPUNIT_ASSERT_EQUAL( - std::string("3,5,7,6,8,0,9,2,1s,4s"), + std::string("3,5,7,6,8,0,9,1s,2s,4s"), getNodeList("storage:10", 7, "0,1,2t,3,4,5,6,7,8,9")); // Retired nodes are not in ideal state @@ -376,35 +378,31 @@ MergeOperationTest::testMarkRedundantTrustedCopiesAsSourceOnly() std::string("3,5,7,6s,8s"), getNodeList("storage:10", 3, "3t,5t,7t,6,8")); - // 3 redundancy, 4 trusted -> 1 source only trusted. - // We allow marking a trusted, non-ideal copy as source even when we don't - // have #redundancy trusted _ideal_ copies, as long as we're left with >= - // #redundancy trusted copies in total. + // Trusted-ness should not be taken into account when marking nodes as source-only. + // 2 out of 3 ideal replicas trusted. CPPUNIT_ASSERT_EQUAL( - std::string("3,5,7,6,8s"), + std::string("3,5,7,6s,8s"), getNodeList("storage:10", 3, "3t,5t,7,6t,8t")); - // Not sufficient number of trusted copies to mark any as source only. + // 1 out of 3 ideal replicas trusted. CPPUNIT_ASSERT_EQUAL( - std::string("3,5,7,6,8"), + std::string("3,5,7,6s,8s"), getNodeList("storage:10", 3, "3t,5,7,6t,8t")); - // Same as above, with all trusted copies being non-ideal. + // 0 out of 3 ideal replicas trusted. CPPUNIT_ASSERT_EQUAL( - std::string("3,5,7,6,8"), + std::string("3,5,7,6s,8s"), getNodeList("storage:10", 3, "3,5,7,6t,8t")); - // #redundancy of trusted, but none are ideal. Non-ideal trusted should - // not be marked as source only (though we can mark non-trusted non-ideal - // node as source only). - // Note the node reordering since trusted are added before the rest. + // #redundancy of trusted, but none are ideal. Non-ideal trusted may be + // marked as source only. CPPUNIT_ASSERT_EQUAL( - std::string("3,5,7,8,0,9,6s"), + std::string("3,5,7,6s,8s,0s,9s"), getNodeList("storage:10", 3, "3,5,7,6,8t,0t,9t")); - // But allow for removing excess trusted, non-ideal copies. + // Allow for removing excess trusted, non-ideal copies. CPPUNIT_ASSERT_EQUAL( - std::string("3,5,7,6,8,0,9s"), + std::string("3,5,7,6s,8s,0s,9s"), getNodeList("storage:10", 3, "3,5,7,6t,8t,0t,9t")); } @@ -417,8 +415,41 @@ MergeOperationTest::onlyMarkRedundantRetiredReplicasAsSourceOnly() // source-only nodes will have their replica removed after a successful // merge, which we cannot allow to happen here. CPPUNIT_ASSERT_EQUAL( - std::string("0,1,2s"), - getNodeList("storage:3 .0.s.:r .1.s:r .2.s:r", 2, "1,0,2")); + std::string("1,0,2s"), + getNodeList("storage:3 .0.s:r .1.s:r .2.s:r", 2, "1,0,2")); +} + +void MergeOperationTest::mark_post_merge_redundant_replicas_source_only() { + // Ideal state sequence is [3, 5, 7, 6, 8, 0, 9, 2, 1, 4] + + // Retired node 7 is not part of the #redundancy ideal state and should be moved + // to node 6. Once the merge is done we'll end up with too many replicas unless + // we allow marking the to-be-moved replica as source only. + CPPUNIT_ASSERT_EQUAL( + std::string("3,5,6,7s"), + getNodeList("storage:10 .7.s:r", 3, "3t,5t,7t,6")); + + // Should be allowed to mark as source only even if retired replica is the + // only trusted replica at the time the merge starts. + CPPUNIT_ASSERT_EQUAL( + std::string("3,5,6,7s"), + getNodeList("storage:10 .7.s:r", 3, "3,5,7t,6")); + + // This extends to multiple retired nodes. + CPPUNIT_ASSERT_EQUAL( + std::string("3,6,8,5s,7s"), + getNodeList("storage:10 .5.s:r .7.s:r", 3, "3t,5t,7t,6,8")); + + // If number of post-merge ideal nodes is lower than desired redundancy, don't + // mark any as source only. + CPPUNIT_ASSERT_EQUAL( + std::string("3,5,7,6"), + getNodeList("storage:10", 5, "3,5,7,6")); + + // Same applies to when post-merge ideal nodes is _equal_ to desired redundancy. + CPPUNIT_ASSERT_EQUAL( + std::string("3,5,7,6"), + getNodeList("storage:10", 4, "3,5,7,6")); } } // distributor diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp index 75c58ca4f87..0821408560b 100644 --- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp +++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp @@ -20,18 +20,13 @@ MergeOperation::getStatus() const void MergeOperation::addIdealNodes( - const lib::Distribution& distribution, - const lib::ClusterState& state, - const document::BucketId& bucketId, + const std::vector<uint16_t>& idealNodes, const std::vector<MergeMetaData>& nodes, std::vector<MergeMetaData>& result) { - std::vector<uint16_t> idealNodes( - distribution.getIdealStorageNodes(state, bucketId, "ui")); - - // Add all ideal nodes first + // Add all ideal nodes first. These are never marked source-only. for (uint32_t i = 0; i < idealNodes.size(); i++) { - const MergeMetaData* entry = 0; + const MergeMetaData* entry = nullptr; for (uint32_t j = 0; j < nodes.size(); j++) { if (idealNodes[i] == nodes[j]._nodeIndex) { entry = &nodes[j]; @@ -39,52 +34,13 @@ MergeOperation::addIdealNodes( } } - if (entry != 0) { + if (entry != nullptr) { result.push_back(*entry); result.back()._sourceOnly = false; } } } -uint16_t -MergeOperation::countTrusted(const std::vector<MergeMetaData>& nodes) -{ - uint16_t trusted = 0; - for (const auto& n : nodes) { - if (n.trusted()) { - ++trusted; - } - } - return trusted; -} - -void -MergeOperation::addTrustedNodesNotAlreadyAdded( - uint16_t redundancy, - const std::vector<MergeMetaData>& nodes, - std::vector<MergeMetaData>& result) -{ - uint16_t alreadyTrusted = countTrusted(result); - for (uint32_t i = 0; i < nodes.size(); i++) { - if (!nodes[i].trusted()) { - continue; - } - - bool found = false; - for (uint32_t j = 0; j < result.size(); j++) { - if (result[j]._nodeIndex == nodes[i]._nodeIndex) { - found = true; - } - } - - if (!found) { - result.push_back(nodes[i]); - result.back()._sourceOnly = (alreadyTrusted >= redundancy); - ++alreadyTrusted; - } - } -} - void MergeOperation::addCopiesNotAlreadyAdded( uint16_t redundancy, @@ -114,11 +70,19 @@ MergeOperation::generateSortedNodeList( MergeLimiter& limiter, std::vector<MergeMetaData>& nodes) { + std::vector<uint16_t> idealNodes( + distribution.getIdealStorageNodes(state, bucketId, "ui")); + std::vector<MergeMetaData> result; const uint16_t redundancy = distribution.getRedundancy(); - addIdealNodes(distribution, state, bucketId, nodes, result); - addTrustedNodesNotAlreadyAdded(redundancy, nodes, result); + addIdealNodes(idealNodes, nodes, result); addCopiesNotAlreadyAdded(redundancy, nodes, result); + // TODO optimization: when merge case is obviously a replica move (all existing N replicas + // are in sync and new replicas are empty), could prune away N-1 lowest indexed replicas + // from the node list. This would minimize the number of nodes involved in the merge without + // sacrificing the end result. Avoiding the lower indexed nodes would take pressure off the + // merge throttling "locks" and could potentially greatly speed up node retirement in the common + // case. Existing replica could also be marked as source-only if it's not in the ideal state. limiter.limitMergeToMaxNodes(result); result.swap(nodes); } @@ -154,7 +118,7 @@ MergeOperation::onStart(DistributorMessageSender& sender) for (uint32_t i = 0; i < getNodes().size(); ++i) { const BucketCopy* copy = entry->getNode(getNodes()[i]); if (copy == 0) { // New copies? - newCopies.push_back(std::unique_ptr<BucketCopy>(new BucketCopy(0, getNodes()[i], api::BucketInfo()))); + newCopies.push_back(std::make_unique<BucketCopy>(0, getNodes()[i], api::BucketInfo())); copy = newCopies.back().get(); } nodes.push_back(MergeMetaData(getNodes()[i], *copy)); @@ -172,12 +136,11 @@ MergeOperation::onStart(DistributorMessageSender& sender) } if (_mnodes.size() > 1) { - std::shared_ptr<api::MergeBucketCommand> msg( - new api::MergeBucketCommand( - getBucketId(), - _mnodes, - _manager->getDistributorComponent().getUniqueTimestamp(), - clusterState.getVersion())); + auto msg = std::make_shared<api::MergeBucketCommand>( + getBucketId(), + _mnodes, + _manager->getDistributorComponent().getUniqueTimestamp(), + clusterState.getVersion()); // Due to merge forwarding/chaining semantics, we must always send // the merge command to the lowest indexed storage node involved in @@ -192,10 +155,7 @@ MergeOperation::onStart(DistributorMessageSender& sender) msg->setTimeout(60 * 60 * 1000); setCommandMeta(*msg); - sender.sendToNode( - lib::NodeType::STORAGE, - _mnodes[0].index, - msg); + sender.sendToNode(lib::NodeType::STORAGE, _mnodes[0].index, msg); _sentMessageTime = _manager->getDistributorComponent().getClock().getTimeInSeconds(); } else { @@ -213,13 +173,11 @@ MergeOperation::sourceOnlyCopyChangedDuringMerge( { assert(currentState.valid()); for (size_t i = 0; i < _mnodes.size(); ++i) { - const BucketCopy* copyBefore( - _infoBefore.getNode(_mnodes[i].index)); + const BucketCopy* copyBefore(_infoBefore.getNode(_mnodes[i].index)); if (!copyBefore) { continue; } - const BucketCopy* copyAfter( - currentState->getNode(_mnodes[i].index)); + const BucketCopy* copyAfter(currentState->getNode(_mnodes[i].index)); if (!copyAfter) { LOG(debug, "Copy of %s on node %u removed during merge. Was %s", getBucketId().toString().c_str(), @@ -295,6 +253,8 @@ MergeOperation::deleteSourceOnlyNodes( done(); } } + // FIXME what about the else-case here...? done() is not invoked by caller in this branch. + // Not calling done() doesn't leak anything, but causes metric updates to be missed. } void diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h index 8fdf22c88f5..f50226dbd3c 100644 --- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h +++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h @@ -49,14 +49,7 @@ public: bool shouldBlockThisOperation(uint32_t messageType, uint8_t pri) const override ; private: static void addIdealNodes( - const lib::Distribution&, - const lib::ClusterState&, - const document::BucketId&, - const std::vector<MergeMetaData>& nodes, - std::vector<MergeMetaData>& result); - - static void addTrustedNodesNotAlreadyAdded( - uint16_t redundancy, + const std::vector<uint16_t>& idealNodes, const std::vector<MergeMetaData>& nodes, std::vector<MergeMetaData>& result); @@ -65,8 +58,6 @@ private: const std::vector<MergeMetaData>& nodes, std::vector<MergeMetaData>& result); - static uint16_t countTrusted(const std::vector<MergeMetaData>& nodes); - void deleteSourceOnlyNodes(const BucketDatabase::Entry& currentState, DistributorMessageSender& sender); }; |