aboutsummaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahoo-inc.com>2017-05-02 11:05:34 +0000
committerTor Brede Vekterli <vekterli@yahoo-inc.com>2017-05-16 12:54:38 +0000
commit19b0a4b93d9270bbc63e72528866953ed5c43b00 (patch)
tree5d98218b28445ff4156385446c67366bffbb5830 /storage
parent1f6295e97ba29ffb5fd288771a7f4c5309198825 (diff)
Simplify source-only selection logic for replicas.
Don't take trusted flag into account as it's not to be trusted (the irony is not lost on anyone) when deciding what nodes to mark as source-only during a merge. Now we always mark non-ideal replicas that exceed the redundancy level as source-only to ensure they are removed when the merge is complete. This prevents former ideal replicas from not being marked source-only simply because they happen to be in sync with the now-ideal replicas. Still exist some edges where a source-only replica isn't removed, such as when it's marked active at the time of the merge's completion.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/mergeoperationtest.cpp73
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp90
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h11
3 files changed, 78 insertions, 96 deletions
diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp
index 1e317e20b74..79c6acd5f4b 100644
--- a/storage/src/tests/distributor/mergeoperationtest.cpp
+++ b/storage/src/tests/distributor/mergeoperationtest.cpp
@@ -26,6 +26,7 @@ class MergeOperationTest : public CppUnit::TestFixture,
CPPUNIT_TEST(testDoNotRemoveActiveSourceOnlyCopies);
CPPUNIT_TEST(testMarkRedundantTrustedCopiesAsSourceOnly);
CPPUNIT_TEST(onlyMarkRedundantRetiredReplicasAsSourceOnly);
+ CPPUNIT_TEST(mark_post_merge_redundant_replicas_source_only);
CPPUNIT_TEST_SUITE_END();
std::unique_ptr<PendingMessageTracker> _pendingTracker;
@@ -38,6 +39,7 @@ protected:
void testDoNotRemoveActiveSourceOnlyCopies();
void testMarkRedundantTrustedCopiesAsSourceOnly();
void onlyMarkRedundantRetiredReplicasAsSourceOnly();
+ void mark_post_merge_redundant_replicas_source_only();
public:
void setUp() override {
@@ -269,13 +271,13 @@ MergeOperationTest::testGenerateNodeList()
std::string("3,5,7,6,8,0,9,2,1,4"),
getNodeList("storage:10", 10, "9,8,7,6,5,4,3,2,1,0"));
- // Trusted copies should not be source only.
+ // Trusted copies can be source-only if they are in the non-ideal node set.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9,1,2,4s"),
+ std::string("3,5,7,6,8,0,9,1s,2s,4s"),
getNodeList("storage:10", 7, "0,1t,2t,3,4,5,6,7,8,9"));
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9,2,1s,4s"),
+ std::string("3,5,7,6,8,0,9,1s,2s,4s"),
getNodeList("storage:10", 7, "0,1,2t,3,4,5,6,7,8,9"));
// Retired nodes are not in ideal state
@@ -376,35 +378,31 @@ MergeOperationTest::testMarkRedundantTrustedCopiesAsSourceOnly()
std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5t,7t,6,8"));
- // 3 redundancy, 4 trusted -> 1 source only trusted.
- // We allow marking a trusted, non-ideal copy as source even when we don't
- // have #redundancy trusted _ideal_ copies, as long as we're left with >=
- // #redundancy trusted copies in total.
+ // Trusted-ness should not be taken into account when marking nodes as source-only.
+ // 2 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8s"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5t,7,6t,8t"));
- // Not sufficient number of trusted copies to mark any as source only.
+ // 1 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5,7,6t,8t"));
- // Same as above, with all trusted copies being non-ideal.
+ // 0 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3,5,7,6t,8t"));
- // #redundancy of trusted, but none are ideal. Non-ideal trusted should
- // not be marked as source only (though we can mark non-trusted non-ideal
- // node as source only).
- // Note the node reordering since trusted are added before the rest.
+ // #redundancy of trusted, but none are ideal. Non-ideal trusted may be
+ // marked as source only.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,8,0,9,6s"),
+ std::string("3,5,7,6s,8s,0s,9s"),
getNodeList("storage:10", 3, "3,5,7,6,8t,0t,9t"));
- // But allow for removing excess trusted, non-ideal copies.
+ // Allow for removing excess trusted, non-ideal copies.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9s"),
+ std::string("3,5,7,6s,8s,0s,9s"),
getNodeList("storage:10", 3, "3,5,7,6t,8t,0t,9t"));
}
@@ -417,8 +415,41 @@ MergeOperationTest::onlyMarkRedundantRetiredReplicasAsSourceOnly()
// source-only nodes will have their replica removed after a successful
// merge, which we cannot allow to happen here.
CPPUNIT_ASSERT_EQUAL(
- std::string("0,1,2s"),
- getNodeList("storage:3 .0.s.:r .1.s:r .2.s:r", 2, "1,0,2"));
+ std::string("1,0,2s"),
+ getNodeList("storage:3 .0.s:r .1.s:r .2.s:r", 2, "1,0,2"));
+}
+
+void MergeOperationTest::mark_post_merge_redundant_replicas_source_only() {
+ // Ideal state sequence is [3, 5, 7, 6, 8, 0, 9, 2, 1, 4]
+
+ // Retired node 7 is not part of the #redundancy ideal state and should be moved
+ // to node 6. Once the merge is done we'll end up with too many replicas unless
+ // we allow marking the to-be-moved replica as source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,6,7s"),
+ getNodeList("storage:10 .7.s:r", 3, "3t,5t,7t,6"));
+
+ // Should be allowed to mark as source only even if retired replica is the
+ // only trusted replica at the time the merge starts.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,6,7s"),
+ getNodeList("storage:10 .7.s:r", 3, "3,5,7t,6"));
+
+ // This extends to multiple retired nodes.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,6,8,5s,7s"),
+ getNodeList("storage:10 .5.s:r .7.s:r", 3, "3t,5t,7t,6,8"));
+
+ // If number of post-merge ideal nodes is lower than desired redundancy, don't
+ // mark any as source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6"),
+ getNodeList("storage:10", 5, "3,5,7,6"));
+
+ // Same applies to when post-merge ideal nodes is _equal_ to desired redundancy.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6"),
+ getNodeList("storage:10", 4, "3,5,7,6"));
}
} // distributor
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
index 75c58ca4f87..0821408560b 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
@@ -20,18 +20,13 @@ MergeOperation::getStatus() const
void
MergeOperation::addIdealNodes(
- const lib::Distribution& distribution,
- const lib::ClusterState& state,
- const document::BucketId& bucketId,
+ const std::vector<uint16_t>& idealNodes,
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result)
{
- std::vector<uint16_t> idealNodes(
- distribution.getIdealStorageNodes(state, bucketId, "ui"));
-
- // Add all ideal nodes first
+ // Add all ideal nodes first. These are never marked source-only.
for (uint32_t i = 0; i < idealNodes.size(); i++) {
- const MergeMetaData* entry = 0;
+ const MergeMetaData* entry = nullptr;
for (uint32_t j = 0; j < nodes.size(); j++) {
if (idealNodes[i] == nodes[j]._nodeIndex) {
entry = &nodes[j];
@@ -39,52 +34,13 @@ MergeOperation::addIdealNodes(
}
}
- if (entry != 0) {
+ if (entry != nullptr) {
result.push_back(*entry);
result.back()._sourceOnly = false;
}
}
}
-uint16_t
-MergeOperation::countTrusted(const std::vector<MergeMetaData>& nodes)
-{
- uint16_t trusted = 0;
- for (const auto& n : nodes) {
- if (n.trusted()) {
- ++trusted;
- }
- }
- return trusted;
-}
-
-void
-MergeOperation::addTrustedNodesNotAlreadyAdded(
- uint16_t redundancy,
- const std::vector<MergeMetaData>& nodes,
- std::vector<MergeMetaData>& result)
-{
- uint16_t alreadyTrusted = countTrusted(result);
- for (uint32_t i = 0; i < nodes.size(); i++) {
- if (!nodes[i].trusted()) {
- continue;
- }
-
- bool found = false;
- for (uint32_t j = 0; j < result.size(); j++) {
- if (result[j]._nodeIndex == nodes[i]._nodeIndex) {
- found = true;
- }
- }
-
- if (!found) {
- result.push_back(nodes[i]);
- result.back()._sourceOnly = (alreadyTrusted >= redundancy);
- ++alreadyTrusted;
- }
- }
-}
-
void
MergeOperation::addCopiesNotAlreadyAdded(
uint16_t redundancy,
@@ -114,11 +70,19 @@ MergeOperation::generateSortedNodeList(
MergeLimiter& limiter,
std::vector<MergeMetaData>& nodes)
{
+ std::vector<uint16_t> idealNodes(
+ distribution.getIdealStorageNodes(state, bucketId, "ui"));
+
std::vector<MergeMetaData> result;
const uint16_t redundancy = distribution.getRedundancy();
- addIdealNodes(distribution, state, bucketId, nodes, result);
- addTrustedNodesNotAlreadyAdded(redundancy, nodes, result);
+ addIdealNodes(idealNodes, nodes, result);
addCopiesNotAlreadyAdded(redundancy, nodes, result);
+ // TODO optimization: when merge case is obviously a replica move (all existing N replicas
+ // are in sync and new replicas are empty), could prune away N-1 lowest indexed replicas
+ // from the node list. This would minimize the number of nodes involved in the merge without
+ // sacrificing the end result. Avoiding the lower indexed nodes would take pressure off the
+ // merge throttling "locks" and could potentially greatly speed up node retirement in the common
+ // case. Existing replica could also be marked as source-only if it's not in the ideal state.
limiter.limitMergeToMaxNodes(result);
result.swap(nodes);
}
@@ -154,7 +118,7 @@ MergeOperation::onStart(DistributorMessageSender& sender)
for (uint32_t i = 0; i < getNodes().size(); ++i) {
const BucketCopy* copy = entry->getNode(getNodes()[i]);
if (copy == 0) { // New copies?
- newCopies.push_back(std::unique_ptr<BucketCopy>(new BucketCopy(0, getNodes()[i], api::BucketInfo())));
+ newCopies.push_back(std::make_unique<BucketCopy>(0, getNodes()[i], api::BucketInfo()));
copy = newCopies.back().get();
}
nodes.push_back(MergeMetaData(getNodes()[i], *copy));
@@ -172,12 +136,11 @@ MergeOperation::onStart(DistributorMessageSender& sender)
}
if (_mnodes.size() > 1) {
- std::shared_ptr<api::MergeBucketCommand> msg(
- new api::MergeBucketCommand(
- getBucketId(),
- _mnodes,
- _manager->getDistributorComponent().getUniqueTimestamp(),
- clusterState.getVersion()));
+ auto msg = std::make_shared<api::MergeBucketCommand>(
+ getBucketId(),
+ _mnodes,
+ _manager->getDistributorComponent().getUniqueTimestamp(),
+ clusterState.getVersion());
// Due to merge forwarding/chaining semantics, we must always send
// the merge command to the lowest indexed storage node involved in
@@ -192,10 +155,7 @@ MergeOperation::onStart(DistributorMessageSender& sender)
msg->setTimeout(60 * 60 * 1000);
setCommandMeta(*msg);
- sender.sendToNode(
- lib::NodeType::STORAGE,
- _mnodes[0].index,
- msg);
+ sender.sendToNode(lib::NodeType::STORAGE, _mnodes[0].index, msg);
_sentMessageTime = _manager->getDistributorComponent().getClock().getTimeInSeconds();
} else {
@@ -213,13 +173,11 @@ MergeOperation::sourceOnlyCopyChangedDuringMerge(
{
assert(currentState.valid());
for (size_t i = 0; i < _mnodes.size(); ++i) {
- const BucketCopy* copyBefore(
- _infoBefore.getNode(_mnodes[i].index));
+ const BucketCopy* copyBefore(_infoBefore.getNode(_mnodes[i].index));
if (!copyBefore) {
continue;
}
- const BucketCopy* copyAfter(
- currentState->getNode(_mnodes[i].index));
+ const BucketCopy* copyAfter(currentState->getNode(_mnodes[i].index));
if (!copyAfter) {
LOG(debug, "Copy of %s on node %u removed during merge. Was %s",
getBucketId().toString().c_str(),
@@ -295,6 +253,8 @@ MergeOperation::deleteSourceOnlyNodes(
done();
}
}
+ // FIXME what about the else-case here...? done() is not invoked by caller in this branch.
+ // Not calling done() doesn't leak anything, but causes metric updates to be missed.
}
void
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
index 8fdf22c88f5..f50226dbd3c 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
@@ -49,14 +49,7 @@ public:
bool shouldBlockThisOperation(uint32_t messageType, uint8_t pri) const override ;
private:
static void addIdealNodes(
- const lib::Distribution&,
- const lib::ClusterState&,
- const document::BucketId&,
- const std::vector<MergeMetaData>& nodes,
- std::vector<MergeMetaData>& result);
-
- static void addTrustedNodesNotAlreadyAdded(
- uint16_t redundancy,
+ const std::vector<uint16_t>& idealNodes,
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result);
@@ -65,8 +58,6 @@ private:
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result);
- static uint16_t countTrusted(const std::vector<MergeMetaData>& nodes);
-
void deleteSourceOnlyNodes(const BucketDatabase::Entry& currentState,
DistributorMessageSender& sender);
};