summaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2021-02-24 16:52:50 +0100
committerGitHub <noreply@github.com>2021-02-24 16:52:50 +0100
commitdb0fcba270059f327d66256f02c4e105d1d45d08 (patch)
tree0e3a6a1175f97fb116c02a74a06591ea37d3517c /storage
parent6f1196bb5a7d705b6a9580b1ba301b2db4b0a40b (diff)
parent26aac047c332f5027f47bd0e997633eeeb8884ea (diff)
Merge pull request #16658 from vespa-engine/vekterli/prefer-ready-replicas-with-more-documents
Prefer activating ready replicas with more documents
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/distributor/distributortest.cpp2
-rw-r--r--storage/src/tests/distributor/idealstatemanagertest.cpp9
-rw-r--r--storage/src/tests/distributor/statecheckerstest.cpp110
-rw-r--r--storage/src/vespa/storage/distributor/activecopy.cpp55
-rw-r--r--storage/src/vespa/storage/distributor/activecopy.h4
-rw-r--r--storage/src/vespa/storage/distributor/distributor.cpp19
6 files changed, 102 insertions, 97 deletions
diff --git a/storage/src/tests/distributor/distributortest.cpp b/storage/src/tests/distributor/distributortest.cpp
index c06919a7489..564cd2bc876 100644
--- a/storage/src/tests/distributor/distributortest.cpp
+++ b/storage/src/tests/distributor/distributortest.cpp
@@ -435,7 +435,7 @@ TEST_F(DistributorTest, metric_update_hook_updates_pending_maintenance_metrics)
getConfig().setMaxPendingMaintenanceOps(1);
// 1 bucket must be merged, 1 must be split, 1 should be activated.
- addNodesToBucketDB(document::BucketId(16, 1), "0=1/1/1/t/a,1=2/2/2");
+ addNodesToBucketDB(document::BucketId(16, 1), "0=2/2/2/t/a,1=1/1/1");
addNodesToBucketDB(document::BucketId(16, 2),
"0=100/10000000/200000/t/a,1=100/10000000/200000/t");
addNodesToBucketDB(document::BucketId(16, 3),
diff --git a/storage/src/tests/distributor/idealstatemanagertest.cpp b/storage/src/tests/distributor/idealstatemanagertest.cpp
index 51284307fa8..cae6bf9f226 100644
--- a/storage/src/tests/distributor/idealstatemanagertest.cpp
+++ b/storage/src/tests/distributor/idealstatemanagertest.cpp
@@ -138,12 +138,13 @@ TEST_F(IdealStateManagerTest, clear_active_on_node_down) {
tick();
}
- EXPECT_EQ("setbucketstate to [0] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000001)) (pri 100)\n"
- "setbucketstate to [0] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000002)) (pri 100)\n"
- "setbucketstate to [0] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000003)) (pri 100)\n",
+ // Node 2 gets activated for each bucket as it has the most documents.
+ EXPECT_EQ("setbucketstate to [2] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000001)) (pri 100)\n"
+ "setbucketstate to [2] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000002)) (pri 100)\n"
+ "setbucketstate to [2] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000003)) (pri 100)\n",
_distributor->getActiveIdealStateOperations());
- setSystemState(lib::ClusterState("distributor:1 storage:3 .0.s:d"));
+ setSystemState(lib::ClusterState("distributor:1 storage:3 .2.s:d"));
EXPECT_EQ("", _distributor->getActiveIdealStateOperations());
EXPECT_EQ(0, _distributor->getPendingMessageTracker()
diff --git a/storage/src/tests/distributor/statecheckerstest.cpp b/storage/src/tests/distributor/statecheckerstest.cpp
index eda083dcc22..314e5b27e25 100644
--- a/storage/src/tests/distributor/statecheckerstest.cpp
+++ b/storage/src/tests/distributor/statecheckerstest.cpp
@@ -959,45 +959,38 @@ TEST_F(StateCheckersTest, bucket_state) {
testBucketState(""));
// Node 1 is in ideal state
- EXPECT_EQ("[Setting node 1 as active:"
- " copy is ideal state priority 0] (pri 90)",
+ EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 0] (pri 90)",
testBucketState("1=2/3/4", 2, true));
// Node 3 is in ideal state
- EXPECT_EQ("[Setting node 3 as active:"
- " copy is ideal state priority 1]",
+ EXPECT_EQ("[Setting node 3 as active: copy has 3 docs and ideal state priority 1]",
testBucketState("3=2/3/4"));
- // No trusted nodes, but node 1 is first in ideal state.
+ // No ready replicas. Node 1 is first in ideal state but node 2 has
+ // more docs and should remain active.
// Also check bad case where more than 1 node is set as active just
// to ensure we can get out of that situation if it should ever happen.
- // Nothing done with node 3 since is't not active and shouldn't be.
- EXPECT_EQ("[Setting node 1 as active:"
- " copy is ideal state priority 0]"
- "[Setting node 0 as inactive]"
- "[Setting node 2 as inactive] (pri 120)",
+ // Nothing done with node 3 since it's not active and shouldn't be.
+ EXPECT_EQ("[Setting node 0 as inactive] (pri 90)",
testBucketState("0=3/4/5/u/a,1=3,2=4/5/6/u/a,3=3", 2, true));
// Test setting active when only node available is not contained
// within the resolved ideal state.
- EXPECT_EQ("[Setting node 0 as active: first available copy]",
+ EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]",
testBucketState("0=2/3/4"));
- // A trusted ideal state copy should be set active rather than a non-trusted
- // ideal state copy
- EXPECT_EQ("[Setting node 3 as active:"
- " copy is trusted and ideal state priority 1]"
+ // A replica with more documents should be preferred over one with fewer.
+ EXPECT_EQ("[Setting node 3 as active: copy has 6 docs and ideal state priority 1]"
"[Setting node 1 as inactive]",
testBucketState("1=2/3/4/u/a,3=5/6/7/t"));
- // None of the ideal state copies are trusted but a non-ideal copy is.
- // The trusted copy should be active.
- EXPECT_EQ("[Setting node 2 as active: copy is trusted]",
+ // Replica 2 has most documents and should be activated
+ EXPECT_EQ("[Setting node 2 as active: copy has 9 docs]",
testBucketState("1=2/3/4,3=5/6/7/,2=8/9/10/t"));
// Make sure bucket db ordering does not matter
- EXPECT_EQ("[Setting node 2 as active: copy is trusted]",
- testBucketState("2=8/9/10/t,1=2/3/4,3=5/6/7"));
+ EXPECT_EQ("[Setting node 2 as active: copy has 9 docs]",
+ testBucketState("1=2/3/4,3=5/6/7,2=8/9/10/t"));
// If copy is already active, we shouldn't generate operations
EXPECT_EQ("NO OPERATIONS GENERATED",
@@ -1016,26 +1009,26 @@ TEST_F(StateCheckersTest, bucket_state) {
EXPECT_EQ("NO OPERATIONS GENERATED",
testBucketState("1=0/0/1,3=0/0/1"));
- // Ready preferred over trusted & ideal state
+ // Ready preferred over ideal state
EXPECT_EQ("NO OPERATIONS GENERATED",
testBucketState("2=8/9/10/t/i/u,1=2/3/4/u/a/r,3=5/6/7"));
- EXPECT_EQ("[Setting node 2 as active: copy is ready]"
+ EXPECT_EQ("[Setting node 2 as active: copy is ready with 9 docs]"
"[Setting node 1 as inactive]",
testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/u,3=5/6/7/u/i/u"));
// Prefer in ideal state if multiple copies ready
- EXPECT_EQ("[Setting node 3 as active: copy is ready]"
+ EXPECT_EQ("[Setting node 3 as active: copy is ready, has 9 docs and ideal state priority 1]"
"[Setting node 1 as inactive]",
- testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/u,3=5/6/7/u/i/r"));
+ testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/u,3=8/9/10/u/i/r"));
- // Prefer ideal state if all ready but no trusted
- EXPECT_EQ("[Setting node 1 as active: copy is ready]",
- testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/i/r,3=5/6/7/u/i/r"));
+ // Prefer ideal state if all ready
+ EXPECT_EQ("[Setting node 1 as active: copy is ready, has 9 docs and ideal state priority 0]",
+ testBucketState("2=8/9/10/u/i/r,1=8/9/10/u/i/r,3=8/9/10/u/i/r"));
- // Prefer trusted over ideal state
- EXPECT_EQ("[Setting node 2 as active: copy is ready and trusted]"
+ // Ready with more documents is preferred over ideal state or trusted
+ EXPECT_EQ("[Setting node 2 as active: copy is ready with 9 docs]"
"[Setting node 1 as inactive]",
- testBucketState("2=8/9/10/t/i/r,1=2/3/4/u/a/r,3=5/6/7"));
+ testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/r,3=5/6/7/u/i/r"));
}
/**
@@ -1050,7 +1043,7 @@ TEST_F(StateCheckersTest, do_not_activate_non_ready_copies_when_ideal_node_in_ma
EXPECT_EQ("NO OPERATIONS GENERATED",
testBucketState("2=8/9/10/t/i/u,3=5/6/7"));
// But we should activate another copy iff there's another ready copy.
- EXPECT_EQ("[Setting node 2 as active: copy is ready]",
+ EXPECT_EQ("[Setting node 2 as active: copy is ready with 9 docs]",
testBucketState("2=8/9/10/u/i/r,3=5/6/7/u/i/u"));
}
@@ -1151,18 +1144,14 @@ TEST_F(StateCheckersTest, bucket_state_per_group) {
}
// Node 1 and 8 is is ideal state
- EXPECT_EQ("[Setting node 1 as active: "
- "copy is trusted and ideal state priority 4]"
- "[Setting node 6 as active: "
- "copy is trusted and ideal state priority 0] (pri 90)",
+ EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 4]"
+ "[Setting node 6 as active: copy has 3 docs and ideal state priority 0] (pri 90)",
testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, "
"5=2/3/4/t, 6=2/3/4/t, 8=2/3/4/t", true));
// Data differ between groups
- EXPECT_EQ("[Setting node 1 as active: "
- "copy is trusted and ideal state priority 4]"
- "[Setting node 6 as active: "
- "copy is ideal state priority 0] (pri 90)",
+ EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 4]"
+ "[Setting node 6 as active: copy has 6 docs and ideal state priority 0] (pri 90)",
testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, "
"5=5/6/7, 6=5/6/7, 8=5/6/7", true));
@@ -1176,12 +1165,9 @@ TEST_F(StateCheckersTest, bucket_state_per_group) {
true));
// Node 1 and 8 is is ideal state
- EXPECT_EQ("[Setting node 1 as active: "
- "copy is trusted and ideal state priority 4]"
- "[Setting node 6 as active: "
- "copy is trusted and ideal state priority 0]"
- "[Setting node 9 as active: "
- "copy is trusted and ideal state priority 2] (pri 90)",
+ EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 4]"
+ "[Setting node 6 as active: copy has 3 docs and ideal state priority 0]"
+ "[Setting node 9 as active: copy has 3 docs and ideal state priority 2] (pri 90)",
testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, "
"5=2/3/4/t, 6=2/3/4/t, 8=2/3/4/t, "
"9=2/3/4/t, 10=2/3/4/t, 11=2/3/4/t",
@@ -1196,31 +1182,31 @@ TEST_F(StateCheckersTest, do_not_activate_replicas_that_are_out_of_sync_with_maj
getConfig().set_max_activation_inhibited_out_of_sync_groups(3);
// 5 is out of sync with 0 and 9 and will NOT be activated.
- EXPECT_EQ("[Setting node 0 as active: first available copy]"
- "[Setting node 9 as active: copy is ideal state priority 2]",
+ EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]"
+ "[Setting node 9 as active: copy has 3 docs and ideal state priority 2]",
testBucketStatePerGroup("0=2/3/4, 5=3/4/5, 9=2/3/4"));
// We also try the other indices:...
// 0 out of sync, 5 and 9 in sync (one hopes..!)
- EXPECT_EQ("[Setting node 5 as active: first available copy]"
- "[Setting node 9 as active: copy is ideal state priority 2]",
+ EXPECT_EQ("[Setting node 5 as active: copy has 3 docs]"
+ "[Setting node 9 as active: copy has 3 docs and ideal state priority 2]",
testBucketStatePerGroup("0=4/5/6, 5=2/3/4, 9=2/3/4"));
// 9 out of sync, 0 and 5 in sync
- EXPECT_EQ("[Setting node 0 as active: first available copy]"
- "[Setting node 5 as active: first available copy]",
+ EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]"
+ "[Setting node 5 as active: copy has 3 docs]",
testBucketStatePerGroup("0=2/3/4, 5=2/3/4, 9=5/3/4"));
// If there's no majority, we activate everything because there's really nothing
// better we can do.
- EXPECT_EQ("[Setting node 0 as active: first available copy]"
- "[Setting node 5 as active: first available copy]"
- "[Setting node 9 as active: copy is ideal state priority 2]",
+ EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]"
+ "[Setting node 5 as active: copy has 6 docs]"
+ "[Setting node 9 as active: copy has 9 docs and ideal state priority 2]",
testBucketStatePerGroup("0=2/3/4, 5=5/6/7, 9=8/9/10"));
// However, if a replica is _already_ active, we will not deactivate it.
- EXPECT_EQ("[Setting node 0 as active: first available copy]"
- "[Setting node 9 as active: copy is ideal state priority 2]",
+ EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]"
+ "[Setting node 9 as active: copy has 3 docs and ideal state priority 2]",
testBucketStatePerGroup("0=2/3/4, 5=3/4/5/u/a, 9=2/3/4"));
}
@@ -1234,8 +1220,8 @@ TEST_F(StateCheckersTest, replica_activation_inhibition_can_be_limited_to_max_n_
// group 0, 1 out of sync in group 1 and 1 out of sync in group 2. Unless we have
// mechanisms in place to limit the number of affected groups, both groups 1 and 2 would
// be inhibited for activation. Since we limit to 1, only group 1 should be affected.
- EXPECT_EQ("[Setting node 1 as active: copy is ideal state priority 4]"
- "[Setting node 9 as active: copy is ideal state priority 2]",
+ EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 4]"
+ "[Setting node 9 as active: copy has 6 docs and ideal state priority 2]",
testBucketStatePerGroup("0=2/3/4, 1=2/3/4, 3=2/3/4, 5=3/4/5, 9=5/6/7"));
}
@@ -1246,9 +1232,9 @@ TEST_F(StateCheckersTest, activate_replicas_that_are_out_of_sync_with_majority_i
getConfig().set_max_activation_inhibited_out_of_sync_groups(0);
// 5 is out of sync with 0 and 9 but will still be activated since the config is false.
- EXPECT_EQ("[Setting node 0 as active: first available copy]"
- "[Setting node 5 as active: first available copy]"
- "[Setting node 9 as active: copy is ideal state priority 2]",
+ EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]"
+ "[Setting node 5 as active: copy has 4 docs]"
+ "[Setting node 9 as active: copy has 3 docs and ideal state priority 2]",
testBucketStatePerGroup("0=2/3/4, 5=3/4/5, 9=2/3/4"));
}
@@ -1257,7 +1243,7 @@ TEST_F(StateCheckersTest, allow_activation_of_retired_nodes) {
// we still want to be able to shuffle bucket activations around in order
// to preserve coverage.
setupDistributor(2, 2, "distributor:1 storage:2 .0.s:r .1.s:r");
- EXPECT_EQ("[Setting node 1 as active: copy is trusted]"
+ EXPECT_EQ("[Setting node 1 as active: copy has 6 docs]"
"[Setting node 0 as inactive]",
testBucketState("0=2/3/4/u/a,1=5/6/7/t"));
}
diff --git a/storage/src/vespa/storage/distributor/activecopy.cpp b/storage/src/vespa/storage/distributor/activecopy.cpp
index e5bfe489d26..e1e88116921 100644
--- a/storage/src/vespa/storage/distributor/activecopy.cpp
+++ b/storage/src/vespa/storage/distributor/activecopy.cpp
@@ -27,9 +27,9 @@ ActiveCopy::ActiveCopy(uint16_t node, const BucketDatabase::Entry& e, const std:
_ideal(0xffff)
{
const BucketCopy* copy = e->getNode(node);
- assert(copy != 0);
+ assert(copy != nullptr);
+ _doc_count = copy->getDocumentCount();
_ready = copy->ready();
- _trusted = copy->trusted();
_active = copy->active();
for (uint32_t i=0; i<idealState.size(); ++i) {
if (idealState[i] == node) {
@@ -41,20 +41,27 @@ ActiveCopy::ActiveCopy(uint16_t node, const BucketDatabase::Entry& e, const std:
vespalib::string
ActiveCopy::getReason() const {
- if (_ready && _trusted && _ideal < 0xffff) {
+ if (_ready && (_doc_count > 0) && (_ideal < 0xffff)) {
vespalib::asciistream ost;
- ost << "copy is ready, trusted and ideal state priority " << _ideal;
+ ost << "copy is ready, has " << _doc_count
+ << " docs and ideal state priority " << _ideal;
+ return ost.str();
+ } else if (_ready && (_doc_count > 0)) {
+ vespalib::asciistream ost;
+ ost << "copy is ready with " << _doc_count << " docs";
return ost.str();
- } else if (_ready && _trusted) {
- return "copy is ready and trusted";
} else if (_ready) {
return "copy is ready";
- } else if (_trusted && _ideal < 0xffff) {
+ } else if ((_doc_count > 0) && (_ideal < 0xffff)) {
+ vespalib::asciistream ost;
+ ost << "copy has " << _doc_count << " docs and ideal state priority " << _ideal;
+ return ost.str();
+ } else if (_doc_count > 0) {
vespalib::asciistream ost;
- ost << "copy is trusted and ideal state priority " << _ideal;
+ ost << "copy has " << _doc_count << " docs";
return ost.str();
- } else if (_trusted) {
- return "copy is trusted";
+ } else if (_active) {
+ return "copy is already active";
} else if (_ideal < 0xffff) {
vespalib::asciistream ost;
ost << "copy is ideal state priority " << _ideal;
@@ -67,9 +74,15 @@ ActiveCopy::getReason() const {
std::ostream&
operator<<(std::ostream& out, const ActiveCopy & e) {
out << "Entry(Node " << e._nodeIndex;
- if (e._ready) out << ", ready";
- if (e._trusted) out << ", trusted";
- if (e._ideal < 0xffff) out << ", ideal pri " << e._ideal;
+ if (e._ready) {
+ out << ", ready";
+ }
+ if (e._doc_count > 0) {
+ out << ", doc_count " << e._doc_count;
+ }
+ if (e._ideal < 0xffff) {
+ out << ", ideal pri " << e._ideal;
+ }
out << ")";
return out;
}
@@ -78,10 +91,18 @@ namespace {
struct ActiveStateOrder {
bool operator()(const ActiveCopy & e1, const ActiveCopy & e2) {
- if (e1._ready != e2._ready) return e1._ready;
- if (e1._trusted != e2._trusted) return e1._trusted;
- if (e1._ideal != e2._ideal) return e1._ideal < e2._ideal;
- if (e1._active != e2._active) return e1._active;
+ if (e1._ready != e2._ready) {
+ return e1._ready;
+ }
+ if (e1._doc_count != e2._doc_count) {
+ return e1._doc_count > e2._doc_count;
+ }
+ if (e1._ideal != e2._ideal) {
+ return e1._ideal < e2._ideal;
+ }
+ if (e1._active != e2._active) {
+ return e1._active;
+ }
return e1._nodeIndex < e2._nodeIndex;
}
};
diff --git a/storage/src/vespa/storage/distributor/activecopy.h b/storage/src/vespa/storage/distributor/activecopy.h
index d22e7072cc7..7ad3649d738 100644
--- a/storage/src/vespa/storage/distributor/activecopy.h
+++ b/storage/src/vespa/storage/distributor/activecopy.h
@@ -10,7 +10,7 @@ namespace storage::distributor {
class ActiveList;
struct ActiveCopy {
- ActiveCopy() : _nodeIndex(-1), _ideal(-1), _ready(false), _trusted(false), _active(false) { }
+ ActiveCopy() : _nodeIndex(-1), _ideal(-1), _doc_count(0), _ready(false), _active(false) { }
ActiveCopy(uint16_t node, const BucketDatabase::Entry& e, const std::vector<uint16_t>& idealState);
vespalib::string getReason() const;
@@ -23,8 +23,8 @@ struct ActiveCopy {
uint16_t _nodeIndex;
uint16_t _ideal;
+ uint32_t _doc_count;
bool _ready;
- bool _trusted;
bool _active;
};
diff --git a/storage/src/vespa/storage/distributor/distributor.cpp b/storage/src/vespa/storage/distributor/distributor.cpp
index 2b24eb1195a..cb4336d9f1e 100644
--- a/storage/src/vespa/storage/distributor/distributor.cpp
+++ b/storage/src/vespa/storage/distributor/distributor.cpp
@@ -17,6 +17,7 @@
#include <vespa/storageframework/generic/status/xmlstatusreporter.h>
#include <vespa/vdslib/distribution/distribution.h>
#include <vespa/vespalib/util/memoryusage.h>
+#include <algorithm>
#include <vespa/log/log.h>
LOG_SETUP(".distributor-main");
@@ -373,17 +374,13 @@ Distributor::enableClusterStateBundle(const lib::ClusterStateBundle& state)
enterRecoveryMode();
// Clear all active messages on nodes that are down.
- for (uint16_t i = 0; i < baselineState.getNodeCount(lib::NodeType::STORAGE); ++i) {
- if (!baselineState.getNodeState(lib::Node(lib::NodeType::STORAGE, i)).getState()
- .oneOf(getStorageNodeUpStates()))
- {
- std::vector<uint64_t> msgIds(
- _pendingMessageTracker.clearMessagesForNode(i));
-
- LOG(debug,
- "Node %d is down, clearing %d pending maintenance operations",
- (int)i,
- (int)msgIds.size());
+ const uint16_t old_node_count = oldState.getBaselineClusterState()->getNodeCount(lib::NodeType::STORAGE);
+ const uint16_t new_node_count = baselineState.getNodeCount(lib::NodeType::STORAGE);
+ for (uint16_t i = 0; i < std::max(old_node_count, new_node_count); ++i) {
+ const auto& node_state = baselineState.getNodeState(lib::Node(lib::NodeType::STORAGE, i)).getState();
+ if (!node_state.oneOf(getStorageNodeUpStates())) {
+ std::vector<uint64_t> msgIds = _pendingMessageTracker.clearMessagesForNode(i);
+ LOG(debug, "Node %u is down, clearing %zu pending maintenance operations", i, msgIds.size());
for (uint32_t j = 0; j < msgIds.size(); ++j) {
_maintenanceOperationOwner.erase(msgIds[j]);