summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@oath.com>2018-03-13 15:44:22 +0000
committerTor Brede Vekterli <vekterli@oath.com>2018-03-13 15:48:04 +0000
commit2b9743be9c82fd6fc5087764f48f617e16b9f956 (patch)
tree27f268f1a7609fa1e9c08d2e0668c8d308230263
parent6efe23abadd6a6c2ed26ae30cee0c87e1c320b1c (diff)
Missing replicas must be in a different checksum group than empty replicas
Otherwise we risk merges with more than 16 nodes being scheduled for only the empty replicas and not actually include the missing replica nodes. This fixes #5313
-rw-r--r--storage/src/tests/distributor/mergelimitertest.cpp20
-rw-r--r--storage/src/tests/distributor/mergeoperationtest.cpp19
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp6
3 files changed, 42 insertions, 3 deletions
diff --git a/storage/src/tests/distributor/mergelimitertest.cpp b/storage/src/tests/distributor/mergelimitertest.cpp
index d6508ee82d2..0b40424594b 100644
--- a/storage/src/tests/distributor/mergelimitertest.cpp
+++ b/storage/src/tests/distributor/mergelimitertest.cpp
@@ -17,6 +17,7 @@ struct MergeLimiterTest : public CppUnit::TestFixture
void non_source_only_replica_chosen_from_in_sync_group();
void non_source_only_replicas_preferred_when_replicas_not_in_sync();
void at_least_one_non_source_only_replica_chosen_when_all_trusted();
+ void missing_replica_distinct_from_empty_replica();
CPPUNIT_TEST_SUITE(MergeLimiterTest);
CPPUNIT_TEST(testKeepsAllBelowLimit);
@@ -29,6 +30,7 @@ struct MergeLimiterTest : public CppUnit::TestFixture
CPPUNIT_TEST(non_source_only_replica_chosen_from_in_sync_group);
CPPUNIT_TEST(non_source_only_replicas_preferred_when_replicas_not_in_sync);
CPPUNIT_TEST(at_least_one_non_source_only_replica_chosen_when_all_trusted);
+ CPPUNIT_TEST(missing_replica_distinct_from_empty_replica);
CPPUNIT_TEST_SUITE_END();
};
@@ -52,6 +54,14 @@ namespace {
_bucketDatabase.back()->setTrusted(true);
return *this;
}
+ NodeFactory& addMissing(int index) {
+ add(index, 0x1); // "Magic" checksum value implying invalid/recently created replica
+ return *this;
+ }
+ NodeFactory& addEmpty(int index) {
+ add(index, 0x0);
+ return *this;
+ }
NodeFactory& setSourceOnly() {
_nodes.back()._sourceOnly = true;
return *this;
@@ -208,4 +218,14 @@ void MergeLimiterTest::at_least_one_non_source_only_replica_chosen_when_all_trus
ASSERT_LIMIT(3, nodes, "2,13s,1s");
}
+void MergeLimiterTest::missing_replica_distinct_from_empty_replica() {
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .addEmpty(3)
+ .addEmpty(5)
+ .addMissing(1)
+ .addMissing(2));
+ ASSERT_LIMIT(2, nodes, "5,2");
+ ASSERT_LIMIT(3, nodes, "5,2,3");
+}
+
} // storage::distributor
diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp
index 468ddd47790..1556880d426 100644
--- a/storage/src/tests/distributor/mergeoperationtest.cpp
+++ b/storage/src/tests/distributor/mergeoperationtest.cpp
@@ -31,6 +31,7 @@ class MergeOperationTest : public CppUnit::TestFixture,
CPPUNIT_TEST(onlyMarkRedundantRetiredReplicasAsSourceOnly);
CPPUNIT_TEST(mark_post_merge_redundant_replicas_source_only);
CPPUNIT_TEST(merge_operation_is_blocked_by_any_busy_target_node);
+ CPPUNIT_TEST(missing_replica_is_included_in_limited_node_list);
CPPUNIT_TEST_SUITE_END();
std::unique_ptr<PendingMessageTracker> _pendingTracker;
@@ -45,6 +46,7 @@ protected:
void onlyMarkRedundantRetiredReplicasAsSourceOnly();
void mark_post_merge_redundant_replicas_source_only();
void merge_operation_is_blocked_by_any_busy_target_node();
+ void missing_replica_is_included_in_limited_node_list();
public:
void setUp() override {
@@ -504,5 +506,22 @@ void MergeOperationTest::merge_operation_is_blocked_by_any_busy_target_node() {
CPPUNIT_ASSERT(op.isBlocked(*_pendingTracker));
}
+void MergeOperationTest::missing_replica_is_included_in_limited_node_list() {
+ setupDistributor(Redundancy(4), NodeCount(4), "distributor:1 storage:4");
+ getClock().setAbsoluteTimeInSeconds(10);
+ addNodesToBucketDB(document::BucketId(16, 1), "4=0/0/0/t,1=0/0/0t,2=0/0/0/t");
+ const uint16_t max_merge_size = 2;
+ MergeOperation op(BucketAndNodes(makeDocumentBucket(document::BucketId(16, 1)), toVector<uint16_t>(0, 1, 2, 4)), max_merge_size);
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ // Must include missing node 0 and not just 2 existing replicas
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, "
+ "cluster state version: 0, nodes: [0, 4], chain: [], "
+ "reasons to start: ) => 0"),
+ _sender.getLastCommand(true));
+}
+
} // distributor
} // storage
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
index 170ad8a9ac5..b61de2fa06b 100644
--- a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
@@ -119,11 +119,11 @@ MergeOperation::onStart(DistributorMessageSender& sender)
for (uint32_t i = 0; i < getNodes().size(); ++i) {
const BucketCopy* copy = entry->getNode(getNodes()[i]);
- if (copy == 0) { // New copies?
- newCopies.push_back(std::make_unique<BucketCopy>(0, getNodes()[i], api::BucketInfo()));
+ if (copy == nullptr) { // New copies?
+ newCopies.emplace_back(std::make_unique<BucketCopy>(BucketCopy::recentlyCreatedCopy(0, getNodes()[i])));
copy = newCopies.back().get();
}
- nodes.push_back(MergeMetaData(getNodes()[i], *copy));
+ nodes.emplace_back(getNodes()[i], *copy);
}
_infoBefore = entry.getBucketInfo();