summaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahoo-inc.com>2017-05-29 10:46:26 +0200
committerGitHub <noreply@github.com>2017-05-29 10:46:26 +0200
commit61e613620a5934273cd069b4d46635757feba8bc (patch)
treea6536b623bdfab23ab8a449b217888225a3b24b4 /storage
parenta6ebf71d87365ce49077934d2f3e0c7675a5cf8e (diff)
parent8b270a58ec72feffe5f151ce69957e88fad327dd (diff)
Merge pull request #2478 from yahoo/vekterli/remove-special-handling-of-trusted-replicas-for-merge-source-only-labeling-rebased-fixed
Remove special handling of trusted replicas for merge source only labeling (rebased/fixed)
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/mergelimitertest.cpp69
-rw-r--r--storage/src/tests/distributor/mergeoperationtest.cpp73
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp104
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h6
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h1
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp90
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h11
7 files changed, 203 insertions, 151 deletions
diff --git a/storage/src/tests/distributor/mergelimitertest.cpp b/storage/src/tests/distributor/mergelimitertest.cpp
index 57b3690dc24..04dfa1b4e7b 100644
--- a/storage/src/tests/distributor/mergelimitertest.cpp
+++ b/storage/src/tests/distributor/mergelimitertest.cpp
@@ -3,8 +3,7 @@
#include <vespa/storage/distributor/operations/idealstate/mergelimiter.h>
#include <vespa/vdstestlib/cppunit/macros.h>
-namespace storage {
-namespace distributor {
+namespace storage::distributor {
struct MergeLimiterTest : public CppUnit::TestFixture
{
@@ -14,6 +13,10 @@ struct MergeLimiterTest : public CppUnit::TestFixture
void testAllUntrustedLessThanMaxVariants();
void testAllUntrustedMoreThanMaxVariants();
void testSourceOnlyLast();
+ void limited_set_cannot_be_just_source_only();
+ void non_source_only_replica_chosen_from_in_sync_group();
+ void non_source_only_replicas_preferred_when_replicas_not_in_sync();
+ void at_least_one_non_source_only_replica_chosen_when_all_trusted();
CPPUNIT_TEST_SUITE(MergeLimiterTest);
CPPUNIT_TEST(testKeepsAllBelowLimit);
@@ -22,6 +25,10 @@ struct MergeLimiterTest : public CppUnit::TestFixture
CPPUNIT_TEST(testAllUntrustedLessThanMaxVariants);
CPPUNIT_TEST(testAllUntrustedMoreThanMaxVariants);
CPPUNIT_TEST(testSourceOnlyLast);
+ CPPUNIT_TEST(limited_set_cannot_be_just_source_only);
+ CPPUNIT_TEST(non_source_only_replica_chosen_from_in_sync_group);
+ CPPUNIT_TEST(non_source_only_replicas_preferred_when_replicas_not_in_sync);
+ CPPUNIT_TEST(at_least_one_non_source_only_replica_chosen_when_all_trusted);
CPPUNIT_TEST_SUITE_END();
};
@@ -56,12 +63,13 @@ namespace {
#define ASSERT_LIMIT(maxNodes, nodes, result) \
{ \
MergeLimiter limiter(maxNodes); \
- limiter.limitMergeToMaxNodes(nodes); \
+ auto nodesCopy = nodes; \
+ limiter.limitMergeToMaxNodes(nodesCopy); \
std::ostringstream actual; \
- for (uint32_t i=0; i<nodes.size(); ++i) { \
+ for (uint32_t i = 0; i < nodesCopy.size(); ++i) { \
if (i != 0) actual << ","; \
- actual << nodes[i]._nodeIndex; \
- if (nodes[i]._sourceOnly) actual << 's'; \
+ actual << nodesCopy[i]._nodeIndex; \
+ if (nodesCopy[i]._sourceOnly) actual << 's'; \
} \
CPPUNIT_ASSERT_EQUAL(std::string(result), actual.str()); \
}
@@ -153,8 +161,51 @@ MergeLimiterTest::testSourceOnlyLast()
.add(13, 0x9)
.add(1, 0x7)
.add(4, 0x5));
- ASSERT_LIMIT(4, nodes, "13,1,2s,5s");
+ ASSERT_LIMIT(4, nodes, "9,3,5s,2s");
}
-} // distributor
-} // storage
+void MergeLimiterTest::limited_set_cannot_be_just_source_only() {
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .addTrusted(9, 0x6)
+ .addTrusted(2, 0x6)
+ .addTrusted(13, 0x6).setSourceOnly()
+ .add(1, 0x7).setSourceOnly());
+ ASSERT_LIMIT(2, nodes, "2,13s");
+ ASSERT_LIMIT(3, nodes, "2,13s,1s");
+}
+
+void MergeLimiterTest::non_source_only_replica_chosen_from_in_sync_group() {
+ // nodes 9, 2, 13 are all in sync. Merge limiter will currently by default
+ // pop the _last_ node of an in-sync replica "group" when outputting a limited
+ // set. Unless we special-case source-only replicas here, we'd end up with an
+ // output set of "13s,1s", i.e. all source-only.
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .add(9, 0x6)
+ .add(2, 0x6)
+ .add(13, 0x6).setSourceOnly()
+ .add(1, 0x7).setSourceOnly());
+ ASSERT_LIMIT(2, nodes, "2,13s");
+ ASSERT_LIMIT(3, nodes, "2,13s,1s");
+}
+
+void MergeLimiterTest::non_source_only_replicas_preferred_when_replicas_not_in_sync() {
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .add(9, 0x4)
+ .add(2, 0x5)
+ .add(13, 0x6).setSourceOnly()
+ .add(1, 0x7).setSourceOnly());
+ ASSERT_LIMIT(2, nodes, "9,2");
+ ASSERT_LIMIT(3, nodes, "9,2,13s");
+}
+
+void MergeLimiterTest::at_least_one_non_source_only_replica_chosen_when_all_trusted() {
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .addTrusted(9, 0x6)
+ .addTrusted(2, 0x6)
+ .addTrusted(13, 0x6).setSourceOnly()
+ .addTrusted(1, 0x6).setSourceOnly());
+ ASSERT_LIMIT(2, nodes, "2,13s");
+ ASSERT_LIMIT(3, nodes, "2,13s,1s");
+}
+
+} // storage::distributor
diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp
index 1e317e20b74..79c6acd5f4b 100644
--- a/storage/src/tests/distributor/mergeoperationtest.cpp
+++ b/storage/src/tests/distributor/mergeoperationtest.cpp
@@ -26,6 +26,7 @@ class MergeOperationTest : public CppUnit::TestFixture,
CPPUNIT_TEST(testDoNotRemoveActiveSourceOnlyCopies);
CPPUNIT_TEST(testMarkRedundantTrustedCopiesAsSourceOnly);
CPPUNIT_TEST(onlyMarkRedundantRetiredReplicasAsSourceOnly);
+ CPPUNIT_TEST(mark_post_merge_redundant_replicas_source_only);
CPPUNIT_TEST_SUITE_END();
std::unique_ptr<PendingMessageTracker> _pendingTracker;
@@ -38,6 +39,7 @@ protected:
void testDoNotRemoveActiveSourceOnlyCopies();
void testMarkRedundantTrustedCopiesAsSourceOnly();
void onlyMarkRedundantRetiredReplicasAsSourceOnly();
+ void mark_post_merge_redundant_replicas_source_only();
public:
void setUp() override {
@@ -269,13 +271,13 @@ MergeOperationTest::testGenerateNodeList()
std::string("3,5,7,6,8,0,9,2,1,4"),
getNodeList("storage:10", 10, "9,8,7,6,5,4,3,2,1,0"));
- // Trusted copies should not be source only.
+ // Trusted copies can be source-only if they are in the non-ideal node set.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9,1,2,4s"),
+ std::string("3,5,7,6,8,0,9,1s,2s,4s"),
getNodeList("storage:10", 7, "0,1t,2t,3,4,5,6,7,8,9"));
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9,2,1s,4s"),
+ std::string("3,5,7,6,8,0,9,1s,2s,4s"),
getNodeList("storage:10", 7, "0,1,2t,3,4,5,6,7,8,9"));
// Retired nodes are not in ideal state
@@ -376,35 +378,31 @@ MergeOperationTest::testMarkRedundantTrustedCopiesAsSourceOnly()
std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5t,7t,6,8"));
- // 3 redundancy, 4 trusted -> 1 source only trusted.
- // We allow marking a trusted, non-ideal copy as source even when we don't
- // have #redundancy trusted _ideal_ copies, as long as we're left with >=
- // #redundancy trusted copies in total.
+ // Trusted-ness should not be taken into account when marking nodes as source-only.
+ // 2 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8s"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5t,7,6t,8t"));
- // Not sufficient number of trusted copies to mark any as source only.
+ // 1 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3t,5,7,6t,8t"));
- // Same as above, with all trusted copies being non-ideal.
+ // 0 out of 3 ideal replicas trusted.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8"),
+ std::string("3,5,7,6s,8s"),
getNodeList("storage:10", 3, "3,5,7,6t,8t"));
- // #redundancy of trusted, but none are ideal. Non-ideal trusted should
- // not be marked as source only (though we can mark non-trusted non-ideal
- // node as source only).
- // Note the node reordering since trusted are added before the rest.
+ // #redundancy of trusted, but none are ideal. Non-ideal trusted may be
+ // marked as source only.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,8,0,9,6s"),
+ std::string("3,5,7,6s,8s,0s,9s"),
getNodeList("storage:10", 3, "3,5,7,6,8t,0t,9t"));
- // But allow for removing excess trusted, non-ideal copies.
+ // Allow for removing excess trusted, non-ideal copies.
CPPUNIT_ASSERT_EQUAL(
- std::string("3,5,7,6,8,0,9s"),
+ std::string("3,5,7,6s,8s,0s,9s"),
getNodeList("storage:10", 3, "3,5,7,6t,8t,0t,9t"));
}
@@ -417,8 +415,41 @@ MergeOperationTest::onlyMarkRedundantRetiredReplicasAsSourceOnly()
// source-only nodes will have their replica removed after a successful
// merge, which we cannot allow to happen here.
CPPUNIT_ASSERT_EQUAL(
- std::string("0,1,2s"),
- getNodeList("storage:3 .0.s.:r .1.s:r .2.s:r", 2, "1,0,2"));
+ std::string("1,0,2s"),
+ getNodeList("storage:3 .0.s:r .1.s:r .2.s:r", 2, "1,0,2"));
+}
+
+void MergeOperationTest::mark_post_merge_redundant_replicas_source_only() {
+ // Ideal state sequence is [3, 5, 7, 6, 8, 0, 9, 2, 1, 4]
+
+ // Retired node 7 is not part of the #redundancy ideal state and should be moved
+ // to node 6. Once the merge is done we'll end up with too many replicas unless
+ // we allow marking the to-be-moved replica as source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,6,7s"),
+ getNodeList("storage:10 .7.s:r", 3, "3t,5t,7t,6"));
+
+ // Should be allowed to mark as source only even if retired replica is the
+ // only trusted replica at the time the merge starts.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,6,7s"),
+ getNodeList("storage:10 .7.s:r", 3, "3,5,7t,6"));
+
+ // This extends to multiple retired nodes.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,6,8,5s,7s"),
+ getNodeList("storage:10 .5.s:r .7.s:r", 3, "3t,5t,7t,6,8"));
+
+ // If number of post-merge ideal nodes is lower than desired redundancy, don't
+ // mark any as source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6"),
+ getNodeList("storage:10", 5, "3,5,7,6"));
+
+ // Same applies to when post-merge ideal nodes is _equal_ to desired redundancy.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6"),
+ getNodeList("storage:10", 4, "3,5,7,6"));
}
} // distributor
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp
index 618ba9ba884..d708b902454 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp
@@ -1,20 +1,32 @@
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/storage/distributor/operations/idealstate/mergelimiter.h>
+#include <cassert>
#include <vespa/log/log.h>
-
LOG_SETUP(".distributor.operations.merge.limiter");
-namespace storage {
-namespace distributor {
+namespace storage::distributor {
MergeLimiter::MergeLimiter(uint16_t maxNodes)
: _maxNodes(maxNodes)
{
+ assert(maxNodes > 1);
LOG(spam, "Limiter initialized with %u nodes.", uint32_t(maxNodes));
}
+// TODO replace this overly complicated set of heuristics with something simpler.
+// Suggestion:
+// 1. Find non-source only replica with highest meta entry count. Emit it and remove from set.
+// This tries to maintain a "seed" replica that can hopefully let the remaining replicas
+// converge to the complete document entry set as quickly as possible.
+// 2. Create mapping from checksum -> replica set.
+// 3. Circularly loop through mapping and emit+remove the first replica in each mapping's set.
+// Distributing the merge across replica checksum groups is a heuristic to fetch as many
+// distinct document entries in one merge operation as possible, as these are all known to
+// be pairwise divergent from each other.
+// 3.1 Once merge limit is reached, break
+// 4. Do a stable sort on the emitted list such that source only replicas are last in the sequence.
namespace {
class EqualCopies {
uint32_t _checksum;
@@ -28,15 +40,24 @@ namespace {
{
}
- bool hasTrusted() const { return (_trustedCopies > 0); }
- uint32_t trustedCount() const { return _trustedCopies; }
- uint32_t size() const { return _copies.size(); }
- bool operator==(const MergeMetaData& mmd) const {
+ bool hasTrusted() const noexcept { return (_trustedCopies > 0); }
+ uint32_t trustedCount() const noexcept { return _trustedCopies; }
+ uint32_t size() const noexcept { return static_cast<uint32_t>(_copies.size()); }
+ bool operator==(const MergeMetaData& mmd) const noexcept {
return (_checksum == mmd.checksum());
}
void add(const MergeMetaData& mmd) {
- if (_copies.empty()) _checksum = mmd.checksum();
- if (mmd.trusted()) ++_trustedCopies;
+ if (_copies.empty()) {
+ _checksum = mmd.checksum();
+ }
+ // Don't treat source only replicas as trusted from the perspective of
+ // picking replica groups. "Trusted" in the context of the merge limiter
+ // logic _in practice_ means "may be output as the sole non-source only node
+ // in the resulting node set", which obviously doesn't work if it is in fact
+ // source only to begin with...
+ if (mmd.trusted() && !mmd.source_only()) {
+ ++_trustedCopies;
+ }
_copies.push_back(mmd);
}
MergeMetaData extractNext() {
@@ -56,36 +77,22 @@ namespace {
: _trustedCopies(0)
{
_groups.reserve(a.size());
- for (uint32_t i=0, n=a.size(); i<n; ++i) {
+ for (uint32_t i = 0, n = static_cast<uint32_t>(a.size()); i < n; ++i) {
add(a[i]);
- if (a[i].trusted()) {
+ if (a[i].trusted() && !a[i].source_only()) {
++_trustedCopies;
}
}
}
- EqualCopies& getMajority() {
- EqualCopies* candidate = 0;
- uint32_t size = 0;
- for (uint32_t i=0, n=_groups.size(); i<n; ++i) {
- if (_groups[i].size() > size) {
- candidate = &_groups[i];
- size = candidate->size();
- }
- }
- assert(candidate != 0);
- return *candidate;
- }
-
- bool hasTrusted() const { return (_trustedCopies > 0); }
- uint32_t trustedCount() const { return _trustedCopies; }
+ bool hasTrusted() const noexcept { return (_trustedCopies > 0); }
Statistics extractGroupsWithTrustedCopies() {
std::vector<EqualCopies> remaining;
Statistics trusted;
remaining.reserve(_groups.size());
trusted._groups.reserve(_groups.size());
- for (uint32_t i=0, n=_groups.size(); i<n; ++i) {
+ for (uint32_t i = 0, n = static_cast<uint32_t>(_groups.size()); i < n; ++i) {
if (_groups[i].hasTrusted()) {
trusted._groups.push_back(_groups[i]);
trusted._trustedCopies += _groups[i].trustedCount();
@@ -110,7 +117,7 @@ namespace {
void removeGroup(uint32_t groupIndex) {
std::vector<EqualCopies> remaining;
remaining.reserve(_groups.size()-1);
- for (uint32_t i=0, n=_groups.size(); i<n; ++i) {
+ for (uint32_t i = 0, n = static_cast<uint32_t>(_groups.size()); i < n; ++i) {
if (i != groupIndex) {
remaining.push_back(_groups[i]);
}
@@ -120,10 +127,16 @@ namespace {
private:
void add(const MergeMetaData& mmd) {
- for (uint32_t i=0; i<_groups.size(); ++i) {
- if (_groups[i] == mmd) {
- _groups[i].add(mmd);
- return;
+ // Treat source only replicas as their own distinct "groups" with regards
+ // to picking replicas for being part of the merge. This way, we avoid
+ // accidentally picking a trusted source only replica as our one trusted
+ // replica that will be part of the merge.
+ if (!mmd.source_only()) {
+ for (uint32_t i = 0; i < _groups.size(); ++i) {
+ if (_groups[i] == mmd) {
+ _groups[i].add(mmd);
+ return;
+ }
}
}
_groups.push_back(EqualCopies());
@@ -131,13 +144,15 @@ namespace {
}
};
- // Add up to max nodes, where different variants exist, prefer having
- // some of each.
+ // Add up to max nodes, where different variants exist, prefer having
+ // some of each.
void addNodes(uint32_t max, Statistics& stats,
MergeLimiter::NodeArray& result)
{
+ // FIXME redesign! `last` will unsigned over/underflow in extractNext, which
+ // is not a very pretty solution, to say the least.
uint32_t last = -1;
- for (uint32_t i=0; i<max; ++i) {
+ for (uint32_t i = 0; i < max; ++i) {
MergeMetaData data;
if (!stats.extractNext(data, last)) return;
result.push_back(data);
@@ -152,16 +167,22 @@ namespace {
};
}
+// FIXME the only reason why this code doesn't end up accidentally picking
+// just source-only replicas as the output node set today is due to an implicit
+// guarantee that the input to this function always has source-only replicas
+// listed _last_ in the sequence.
void
MergeLimiter::limitMergeToMaxNodes(NodeArray& nodes)
{
- // If not above max anyhow, we need not do anything
- if (nodes.size() <= _maxNodes) return;
- // Gather some statistics to base decision on what we are going to do on
+ // If not above max anyhow, we need not do anything
+ if (nodes.size() <= _maxNodes) {
+ return;
+ }
+ // Gather some statistics to base decision on what we are going to do on
Statistics stats(nodes);
NodeArray result;
- // If we have trusted copies, these should be complete. Pick one of them
- // and merge with as many untrusted copies as possible
+ // If we have trusted copies, these are likely to be complete. Pick one of them
+ // and merge with as many untrusted copies as possible
if (stats.hasTrusted()) {
Statistics trusted(stats.extractGroupsWithTrustedCopies());
addNodes(_maxNodes - 1, stats, result);
@@ -173,5 +194,4 @@ MergeLimiter::limitMergeToMaxNodes(NodeArray& nodes)
result.swap(nodes);
}
-} // distributor
-} // storage
+} // storage::distributor
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h
index acde58e6061..ef86fc07810 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h
@@ -5,8 +5,7 @@
#include <vespa/storage/distributor/operations/idealstate/mergemetadata.h>
#include <vector>
-namespace storage {
-namespace distributor {
+namespace storage::distributor {
class MergeLimiter {
uint16_t _maxNodes;
@@ -19,5 +18,4 @@ public:
void limitMergeToMaxNodes(NodeArray&);
};
-} // distributor
-} // storage
+} // storage::distributor
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h
index 8c1cb02bcda..4c4225587bc 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h
@@ -25,6 +25,7 @@ struct MergeMetaData {
assert(_copy != 0);
return _copy->getChecksum();
}
+ bool source_only() const noexcept { return _sourceOnly; }
};
vespalib::asciistream& operator<<(vespalib::asciistream& out, const MergeMetaData& e);
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
index 75c58ca4f87..0821408560b 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
@@ -20,18 +20,13 @@ MergeOperation::getStatus() const
void
MergeOperation::addIdealNodes(
- const lib::Distribution& distribution,
- const lib::ClusterState& state,
- const document::BucketId& bucketId,
+ const std::vector<uint16_t>& idealNodes,
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result)
{
- std::vector<uint16_t> idealNodes(
- distribution.getIdealStorageNodes(state, bucketId, "ui"));
-
- // Add all ideal nodes first
+ // Add all ideal nodes first. These are never marked source-only.
for (uint32_t i = 0; i < idealNodes.size(); i++) {
- const MergeMetaData* entry = 0;
+ const MergeMetaData* entry = nullptr;
for (uint32_t j = 0; j < nodes.size(); j++) {
if (idealNodes[i] == nodes[j]._nodeIndex) {
entry = &nodes[j];
@@ -39,52 +34,13 @@ MergeOperation::addIdealNodes(
}
}
- if (entry != 0) {
+ if (entry != nullptr) {
result.push_back(*entry);
result.back()._sourceOnly = false;
}
}
}
-uint16_t
-MergeOperation::countTrusted(const std::vector<MergeMetaData>& nodes)
-{
- uint16_t trusted = 0;
- for (const auto& n : nodes) {
- if (n.trusted()) {
- ++trusted;
- }
- }
- return trusted;
-}
-
-void
-MergeOperation::addTrustedNodesNotAlreadyAdded(
- uint16_t redundancy,
- const std::vector<MergeMetaData>& nodes,
- std::vector<MergeMetaData>& result)
-{
- uint16_t alreadyTrusted = countTrusted(result);
- for (uint32_t i = 0; i < nodes.size(); i++) {
- if (!nodes[i].trusted()) {
- continue;
- }
-
- bool found = false;
- for (uint32_t j = 0; j < result.size(); j++) {
- if (result[j]._nodeIndex == nodes[i]._nodeIndex) {
- found = true;
- }
- }
-
- if (!found) {
- result.push_back(nodes[i]);
- result.back()._sourceOnly = (alreadyTrusted >= redundancy);
- ++alreadyTrusted;
- }
- }
-}
-
void
MergeOperation::addCopiesNotAlreadyAdded(
uint16_t redundancy,
@@ -114,11 +70,19 @@ MergeOperation::generateSortedNodeList(
MergeLimiter& limiter,
std::vector<MergeMetaData>& nodes)
{
+ std::vector<uint16_t> idealNodes(
+ distribution.getIdealStorageNodes(state, bucketId, "ui"));
+
std::vector<MergeMetaData> result;
const uint16_t redundancy = distribution.getRedundancy();
- addIdealNodes(distribution, state, bucketId, nodes, result);
- addTrustedNodesNotAlreadyAdded(redundancy, nodes, result);
+ addIdealNodes(idealNodes, nodes, result);
addCopiesNotAlreadyAdded(redundancy, nodes, result);
+ // TODO optimization: when merge case is obviously a replica move (all existing N replicas
+ // are in sync and new replicas are empty), could prune away N-1 lowest indexed replicas
+ // from the node list. This would minimize the number of nodes involved in the merge without
+ // sacrificing the end result. Avoiding the lower indexed nodes would take pressure off the
+ // merge throttling "locks" and could potentially greatly speed up node retirement in the common
+ // case. Existing replica could also be marked as source-only if it's not in the ideal state.
limiter.limitMergeToMaxNodes(result);
result.swap(nodes);
}
@@ -154,7 +118,7 @@ MergeOperation::onStart(DistributorMessageSender& sender)
for (uint32_t i = 0; i < getNodes().size(); ++i) {
const BucketCopy* copy = entry->getNode(getNodes()[i]);
if (copy == 0) { // New copies?
- newCopies.push_back(std::unique_ptr<BucketCopy>(new BucketCopy(0, getNodes()[i], api::BucketInfo())));
+ newCopies.push_back(std::make_unique<BucketCopy>(0, getNodes()[i], api::BucketInfo()));
copy = newCopies.back().get();
}
nodes.push_back(MergeMetaData(getNodes()[i], *copy));
@@ -172,12 +136,11 @@ MergeOperation::onStart(DistributorMessageSender& sender)
}
if (_mnodes.size() > 1) {
- std::shared_ptr<api::MergeBucketCommand> msg(
- new api::MergeBucketCommand(
- getBucketId(),
- _mnodes,
- _manager->getDistributorComponent().getUniqueTimestamp(),
- clusterState.getVersion()));
+ auto msg = std::make_shared<api::MergeBucketCommand>(
+ getBucketId(),
+ _mnodes,
+ _manager->getDistributorComponent().getUniqueTimestamp(),
+ clusterState.getVersion());
// Due to merge forwarding/chaining semantics, we must always send
// the merge command to the lowest indexed storage node involved in
@@ -192,10 +155,7 @@ MergeOperation::onStart(DistributorMessageSender& sender)
msg->setTimeout(60 * 60 * 1000);
setCommandMeta(*msg);
- sender.sendToNode(
- lib::NodeType::STORAGE,
- _mnodes[0].index,
- msg);
+ sender.sendToNode(lib::NodeType::STORAGE, _mnodes[0].index, msg);
_sentMessageTime = _manager->getDistributorComponent().getClock().getTimeInSeconds();
} else {
@@ -213,13 +173,11 @@ MergeOperation::sourceOnlyCopyChangedDuringMerge(
{
assert(currentState.valid());
for (size_t i = 0; i < _mnodes.size(); ++i) {
- const BucketCopy* copyBefore(
- _infoBefore.getNode(_mnodes[i].index));
+ const BucketCopy* copyBefore(_infoBefore.getNode(_mnodes[i].index));
if (!copyBefore) {
continue;
}
- const BucketCopy* copyAfter(
- currentState->getNode(_mnodes[i].index));
+ const BucketCopy* copyAfter(currentState->getNode(_mnodes[i].index));
if (!copyAfter) {
LOG(debug, "Copy of %s on node %u removed during merge. Was %s",
getBucketId().toString().c_str(),
@@ -295,6 +253,8 @@ MergeOperation::deleteSourceOnlyNodes(
done();
}
}
+ // FIXME what about the else-case here...? done() is not invoked by caller in this branch.
+ // Not calling done() doesn't leak anything, but causes metric updates to be missed.
}
void
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
index 8fdf22c88f5..f50226dbd3c 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
@@ -49,14 +49,7 @@ public:
bool shouldBlockThisOperation(uint32_t messageType, uint8_t pri) const override ;
private:
static void addIdealNodes(
- const lib::Distribution&,
- const lib::ClusterState&,
- const document::BucketId&,
- const std::vector<MergeMetaData>& nodes,
- std::vector<MergeMetaData>& result);
-
- static void addTrustedNodesNotAlreadyAdded(
- uint16_t redundancy,
+ const std::vector<uint16_t>& idealNodes,
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result);
@@ -65,8 +58,6 @@ private:
const std::vector<MergeMetaData>& nodes,
std::vector<MergeMetaData>& result);
- static uint16_t countTrusted(const std::vector<MergeMetaData>& nodes);
-
void deleteSourceOnlyNodes(const BucketDatabase::Entry& currentState,
DistributorMessageSender& sender);
};