aboutsummaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/mergeoperationtest.cpp73
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp90
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h11
3 files changed, 78 insertions, 96 deletions
diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp
index 1e317e20b74..79c6acd5f4b 100644
--- a/storage/src/tests/distributor/mergeoperationtest.cpp
+++ b/storage/src/tests/distributor/mergeoperationtest.cpp
@@ -26,6 +26,7 @@ class MergeOperationTest : public CppUnit::TestFixture,
CPPUNIT_TEST(testDoNotRemoveActiveSourceOnlyCopies);
CPPUNIT_TEST(testMarkRedundantTrustedCopiesAsSourceOnly);
CPPUNIT_TEST(onlyMarkRedundantRetiredReplicasAsSourceOnly);
+ CPPUNIT_TEST(mark_post_merge_redundant_replicas_source_only);
CPPUNIT_TEST_SUITE_END();
std::unique_ptr<PendingMessageTracker> _pendingTracker;
@@ -38,6 +39,7 @@ protected:
void testDoNotRemoveActiveSourceOnlyCopies();
void testMarkRedundantTrustedCopiesAsSourceOnly();
void onlyMarkRedundantRetiredReplicasAsSourceOnly();
+ void mark_post_merge_redundant_replicas_source_only();
public:
void setUp() override {
@@ -269,13 +271,13 @@ MergeOperationTest::testGenerateNodeList()
std::string("3,5,7,6,8,0,9,2,1,4"),
getNodeList("storage:10", 10, "9,8,7,6,5,4,3,2,1,0"));
- // Trusted copies should not be source only.
+ // Trusted copies can be source-only if they are in the non-ideal node set.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9,1,2,4s"),
+ std::string("3,5,7,6,8,0,9,1s,2s,4s"),
getNodeList("storage:10", 7, "0,1t,2t,3,4,5,6,7,8,9"));
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9,2,1s,4s"),
+ std::string("3,5,7,6,8,0,9,1s,2s,4s"),
getNodeList("storage:10", 7, "0,1,2t,3,4,5,6,7,8,9"));
// Retired nodes are not in ideal state
@@ -376,35 +378,31 @@ MergeOperationTest::testMarkRedundantTrustedCopiesAsSourceOnly()
std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5t,7t,6,8"));
- // 3 redundancy, 4 trusted -> 1 source only trusted.
- // We allow marking a trusted, non-ideal copy as source even when we don't
- // have #redundancy trusted _ideal_ copies, as long as we're left with >=
- // #redundancy trusted copies in total.
+ // Trusted-ness should not be taken into account when marking nodes as source-only.
+ // 2 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8s"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5t,7,6t,8t"));
- // Not sufficient number of trusted copies to mark any as source only.
+ // 1 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5,7,6t,8t"));
- // Same as above, with all trusted copies being non-ideal.
+ // 0 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3,5,7,6t,8t"));
- // #redundancy of trusted, but none are ideal. Non-ideal trusted should
- // not be marked as source only (though we can mark non-trusted non-ideal
- // node as source only).
- // Note the node reordering since trusted are added before the rest.
+ // #redundancy of trusted, but none are ideal. Non-ideal trusted may be
+ // marked as source only.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,8,0,9,6s"),
+ std::string("3,5,7,6s,8s,0s,9s"),
getNodeList("storage:10", 3, "3,5,7,6,8t,0t,9t"));
- // But allow for removing excess trusted, non-ideal copies.
+ // Allow for removing excess trusted, non-ideal copies.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9s"),
+ std::string("3,5,7,6s,8s,0s,9s"),
getNodeList("storage:10", 3, "3,5,7,6t,8t,0t,9t"));
}
@@ -417,8 +415,41 @@ MergeOperationTest::onlyMarkRedundantRetiredReplicasAsSourceOnly()
// source-only nodes will have their replica removed after a successful
// merge, which we cannot allow to happen here.
CPPUNIT_ASSERT_EQUAL(
- std::string("0,1,2s"),
- getNodeList("storage:3 .0.s.:r .1.s:r .2.s:r", 2, "1,0,2"));
+ std::string("1,0,2s"),
+ getNodeList("storage:3 .0.s:r .1.s:r .2.s:r", 2, "1,0,2"));
+}
+
+void MergeOperationTest::mark_post_merge_redundant_replicas_source_only() {
+ // Ideal state sequence is [3, 5, 7, 6, 8, 0, 9, 2, 1, 4]
+
+ // Retired node 7 is not part of the #redundancy ideal state and should be moved
+ // to node 6. Once the merge is done we'll end up with too many replicas unless
+ // we allow marking the to-be-moved replica as source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,6,7s"),
+ getNodeList("storage:10 .7.s:r", 3, "3t,5t,7t,6"));
+
+ // Should be allowed to mark as source only even if retired replica is the
+ // only trusted replica at the time the merge starts.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,6,7s"),
+ getNodeList("storage:10 .7.s:r", 3, "3,5,7t,6"));
+
+ // This extends to multiple retired nodes.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,6,8,5s,7s"),
+ getNodeList("storage:10 .5.s:r .7.s:r", 3, "3t,5t,7t,6,8"));
+
+ // If number of post-merge ideal nodes is lower than desired redundancy, don't
+ // mark any as source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6"),
+ getNodeList("storage:10", 5, "3,5,7,6"));
+
+ // Same applies to when post-merge ideal nodes is _equal_ to desired redundancy.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6"),
+ getNodeList("storage:10", 4, "3,5,7,6"));
}
} // distributor
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
index 75c58ca4f87..0821408560b 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
@@ -20,18 +20,13 @@ MergeOperation::getStatus() const
void
MergeOperation::addIdealNodes(
- const lib::Distribution& distribution,
- const lib::ClusterState& state,
- const document::BucketId& bucketId,
+ const std::vector<uint16_t>& idealNodes,
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result)
{
- std::vector<uint16_t> idealNodes(
- distribution.getIdealStorageNodes(state, bucketId, "ui"));
-
- // Add all ideal nodes first
+ // Add all ideal nodes first. These are never marked source-only.
for (uint32_t i = 0; i < idealNodes.size(); i++) {
- const MergeMetaData* entry = 0;
+ const MergeMetaData* entry = nullptr;
for (uint32_t j = 0; j < nodes.size(); j++) {
if (idealNodes[i] == nodes[j]._nodeIndex) {
entry = &nodes[j];
@@ -39,52 +34,13 @@ MergeOperation::addIdealNodes(
}
}
- if (entry != 0) {
+ if (entry != nullptr) {
result.push_back(*entry);
result.back()._sourceOnly = false;
}
}
}
-uint16_t
-MergeOperation::countTrusted(const std::vector<MergeMetaData>& nodes)
-{
- uint16_t trusted = 0;
- for (const auto& n : nodes) {
- if (n.trusted()) {
- ++trusted;
- }
- }
- return trusted;
-}
-
-void
-MergeOperation::addTrustedNodesNotAlreadyAdded(
- uint16_t redundancy,
- const std::vector<MergeMetaData>& nodes,
- std::vector<MergeMetaData>& result)
-{
- uint16_t alreadyTrusted = countTrusted(result);
- for (uint32_t i = 0; i < nodes.size(); i++) {
- if (!nodes[i].trusted()) {
- continue;
- }
-
- bool found = false;
- for (uint32_t j = 0; j < result.size(); j++) {
- if (result[j]._nodeIndex == nodes[i]._nodeIndex) {
- found = true;
- }
- }
-
- if (!found) {
- result.push_back(nodes[i]);
- result.back()._sourceOnly = (alreadyTrusted >= redundancy);
- ++alreadyTrusted;
- }
- }
-}
-
void
MergeOperation::addCopiesNotAlreadyAdded(
uint16_t redundancy,
@@ -114,11 +70,19 @@ MergeOperation::generateSortedNodeList(
MergeLimiter& limiter,
std::vector<MergeMetaData>& nodes)
{
+ std::vector<uint16_t> idealNodes(
+ distribution.getIdealStorageNodes(state, bucketId, "ui"));
+
std::vector<MergeMetaData> result;
const uint16_t redundancy = distribution.getRedundancy();
- addIdealNodes(distribution, state, bucketId, nodes, result);
- addTrustedNodesNotAlreadyAdded(redundancy, nodes, result);
+ addIdealNodes(idealNodes, nodes, result);
addCopiesNotAlreadyAdded(redundancy, nodes, result);
+ // TODO optimization: when merge case is obviously a replica move (all existing N replicas
+ // are in sync and new replicas are empty), could prune away N-1 lowest indexed replicas
+ // from the node list. This would minimize the number of nodes involved in the merge without
+ // sacrificing the end result. Avoiding the lower indexed nodes would take pressure off the
+ // merge throttling "locks" and could potentially greatly speed up node retirement in the common
+ // case. Existing replica could also be marked as source-only if it's not in the ideal state.
limiter.limitMergeToMaxNodes(result);
result.swap(nodes);
}
@@ -154,7 +118,7 @@ MergeOperation::onStart(DistributorMessageSender& sender)
for (uint32_t i = 0; i < getNodes().size(); ++i) {
const BucketCopy* copy = entry->getNode(getNodes()[i]);
if (copy == 0) { // New copies?
- newCopies.push_back(std::unique_ptr<BucketCopy>(new BucketCopy(0, getNodes()[i], api::BucketInfo())));
+ newCopies.push_back(std::make_unique<BucketCopy>(0, getNodes()[i], api::BucketInfo()));
copy = newCopies.back().get();
}
nodes.push_back(MergeMetaData(getNodes()[i], *copy));
@@ -172,12 +136,11 @@ MergeOperation::onStart(DistributorMessageSender& sender)
}
if (_mnodes.size() > 1) {
- std::shared_ptr<api::MergeBucketCommand> msg(
- new api::MergeBucketCommand(
- getBucketId(),
- _mnodes,
- _manager->getDistributorComponent().getUniqueTimestamp(),
- clusterState.getVersion()));
+ auto msg = std::make_shared<api::MergeBucketCommand>(
+ getBucketId(),
+ _mnodes,
+ _manager->getDistributorComponent().getUniqueTimestamp(),
+ clusterState.getVersion());
// Due to merge forwarding/chaining semantics, we must always send
// the merge command to the lowest indexed storage node involved in
@@ -192,10 +155,7 @@ MergeOperation::onStart(DistributorMessageSender& sender)
msg->setTimeout(60 * 60 * 1000);
setCommandMeta(*msg);
- sender.sendToNode(
- lib::NodeType::STORAGE,
- _mnodes[0].index,
- msg);
+ sender.sendToNode(lib::NodeType::STORAGE, _mnodes[0].index, msg);
_sentMessageTime = _manager->getDistributorComponent().getClock().getTimeInSeconds();
} else {
@@ -213,13 +173,11 @@ MergeOperation::sourceOnlyCopyChangedDuringMerge(
{
assert(currentState.valid());
for (size_t i = 0; i < _mnodes.size(); ++i) {
- const BucketCopy* copyBefore(
- _infoBefore.getNode(_mnodes[i].index));
+ const BucketCopy* copyBefore(_infoBefore.getNode(_mnodes[i].index));
if (!copyBefore) {
continue;
}
- const BucketCopy* copyAfter(
- currentState->getNode(_mnodes[i].index));
+ const BucketCopy* copyAfter(currentState->getNode(_mnodes[i].index));
if (!copyAfter) {
LOG(debug, "Copy of %s on node %u removed during merge. Was %s",
getBucketId().toString().c_str(),
@@ -295,6 +253,8 @@ MergeOperation::deleteSourceOnlyNodes(
done();
}
}
+ // FIXME what about the else-case here...? done() is not invoked by caller in this branch.
+ // Not calling done() doesn't leak anything, but causes metric updates to be missed.
}
void
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
index 8fdf22c88f5..f50226dbd3c 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
@@ -49,14 +49,7 @@ public:
bool shouldBlockThisOperation(uint32_t messageType, uint8_t pri) const override ;
private:
static void addIdealNodes(
- const lib::Distribution&,
- const lib::ClusterState&,
- const document::BucketId&,
- const std::vector<MergeMetaData>& nodes,
- std::vector<MergeMetaData>& result);
-
- static void addTrustedNodesNotAlreadyAdded(
- uint16_t redundancy,
+ const std::vector<uint16_t>& idealNodes,
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result);
@@ -65,8 +58,6 @@ private:
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result);
- static uint16_t countTrusted(const std::vector<MergeMetaData>& nodes);
-
void deleteSourceOnlyNodes(const BucketDatabase::Entry& currentState,
DistributorMessageSender& sender);
};