diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2021-02-24 16:52:50 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-24 16:52:50 +0100 |
commit | db0fcba270059f327d66256f02c4e105d1d45d08 (patch) | |
tree | 0e3a6a1175f97fb116c02a74a06591ea37d3517c /storage | |
parent | 6f1196bb5a7d705b6a9580b1ba301b2db4b0a40b (diff) | |
parent | 26aac047c332f5027f47bd0e997633eeeb8884ea (diff) |
Merge pull request #16658 from vespa-engine/vekterli/prefer-ready-replicas-with-more-documents
Prefer activating ready replicas with more documents
Diffstat (limited to 'storage')
6 files changed, 102 insertions, 97 deletions
diff --git a/storage/src/tests/distributor/distributortest.cpp b/storage/src/tests/distributor/distributortest.cpp index c06919a7489..564cd2bc876 100644 --- a/storage/src/tests/distributor/distributortest.cpp +++ b/storage/src/tests/distributor/distributortest.cpp @@ -435,7 +435,7 @@ TEST_F(DistributorTest, metric_update_hook_updates_pending_maintenance_metrics) getConfig().setMaxPendingMaintenanceOps(1); // 1 bucket must be merged, 1 must be split, 1 should be activated. - addNodesToBucketDB(document::BucketId(16, 1), "0=1/1/1/t/a,1=2/2/2"); + addNodesToBucketDB(document::BucketId(16, 1), "0=2/2/2/t/a,1=1/1/1"); addNodesToBucketDB(document::BucketId(16, 2), "0=100/10000000/200000/t/a,1=100/10000000/200000/t"); addNodesToBucketDB(document::BucketId(16, 3), diff --git a/storage/src/tests/distributor/idealstatemanagertest.cpp b/storage/src/tests/distributor/idealstatemanagertest.cpp index 51284307fa8..cae6bf9f226 100644 --- a/storage/src/tests/distributor/idealstatemanagertest.cpp +++ b/storage/src/tests/distributor/idealstatemanagertest.cpp @@ -138,12 +138,13 @@ TEST_F(IdealStateManagerTest, clear_active_on_node_down) { tick(); } - EXPECT_EQ("setbucketstate to [0] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000001)) (pri 100)\n" - "setbucketstate to [0] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000002)) (pri 100)\n" - "setbucketstate to [0] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000003)) (pri 100)\n", + // Node 2 gets activated for each bucket as it has the most documents. + EXPECT_EQ("setbucketstate to [2] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000001)) (pri 100)\n" + "setbucketstate to [2] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000002)) (pri 100)\n" + "setbucketstate to [2] Bucket(BucketSpace(0x0000000000000001), BucketId(0x4000000000000003)) (pri 100)\n", _distributor->getActiveIdealStateOperations()); - setSystemState(lib::ClusterState("distributor:1 storage:3 .0.s:d")); + setSystemState(lib::ClusterState("distributor:1 storage:3 .2.s:d")); EXPECT_EQ("", _distributor->getActiveIdealStateOperations()); EXPECT_EQ(0, _distributor->getPendingMessageTracker() diff --git a/storage/src/tests/distributor/statecheckerstest.cpp b/storage/src/tests/distributor/statecheckerstest.cpp index eda083dcc22..314e5b27e25 100644 --- a/storage/src/tests/distributor/statecheckerstest.cpp +++ b/storage/src/tests/distributor/statecheckerstest.cpp @@ -959,45 +959,38 @@ TEST_F(StateCheckersTest, bucket_state) { testBucketState("")); // Node 1 is in ideal state - EXPECT_EQ("[Setting node 1 as active:" - " copy is ideal state priority 0] (pri 90)", + EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 0] (pri 90)", testBucketState("1=2/3/4", 2, true)); // Node 3 is in ideal state - EXPECT_EQ("[Setting node 3 as active:" - " copy is ideal state priority 1]", + EXPECT_EQ("[Setting node 3 as active: copy has 3 docs and ideal state priority 1]", testBucketState("3=2/3/4")); - // No trusted nodes, but node 1 is first in ideal state. + // No ready replicas. Node 1 is first in ideal state but node 2 has + // more docs and should remain active. // Also check bad case where more than 1 node is set as active just // to ensure we can get out of that situation if it should ever happen. - // Nothing done with node 3 since is't not active and shouldn't be. - EXPECT_EQ("[Setting node 1 as active:" - " copy is ideal state priority 0]" - "[Setting node 0 as inactive]" - "[Setting node 2 as inactive] (pri 120)", + // Nothing done with node 3 since it's not active and shouldn't be. + EXPECT_EQ("[Setting node 0 as inactive] (pri 90)", testBucketState("0=3/4/5/u/a,1=3,2=4/5/6/u/a,3=3", 2, true)); // Test setting active when only node available is not contained // within the resolved ideal state. - EXPECT_EQ("[Setting node 0 as active: first available copy]", + EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]", testBucketState("0=2/3/4")); - // A trusted ideal state copy should be set active rather than a non-trusted - // ideal state copy - EXPECT_EQ("[Setting node 3 as active:" - " copy is trusted and ideal state priority 1]" + // A replica with more documents should be preferred over one with fewer. + EXPECT_EQ("[Setting node 3 as active: copy has 6 docs and ideal state priority 1]" "[Setting node 1 as inactive]", testBucketState("1=2/3/4/u/a,3=5/6/7/t")); - // None of the ideal state copies are trusted but a non-ideal copy is. - // The trusted copy should be active. - EXPECT_EQ("[Setting node 2 as active: copy is trusted]", + // Replica 2 has most documents and should be activated + EXPECT_EQ("[Setting node 2 as active: copy has 9 docs]", testBucketState("1=2/3/4,3=5/6/7/,2=8/9/10/t")); // Make sure bucket db ordering does not matter - EXPECT_EQ("[Setting node 2 as active: copy is trusted]", - testBucketState("2=8/9/10/t,1=2/3/4,3=5/6/7")); + EXPECT_EQ("[Setting node 2 as active: copy has 9 docs]", + testBucketState("1=2/3/4,3=5/6/7,2=8/9/10/t")); // If copy is already active, we shouldn't generate operations EXPECT_EQ("NO OPERATIONS GENERATED", @@ -1016,26 +1009,26 @@ TEST_F(StateCheckersTest, bucket_state) { EXPECT_EQ("NO OPERATIONS GENERATED", testBucketState("1=0/0/1,3=0/0/1")); - // Ready preferred over trusted & ideal state + // Ready preferred over ideal state EXPECT_EQ("NO OPERATIONS GENERATED", testBucketState("2=8/9/10/t/i/u,1=2/3/4/u/a/r,3=5/6/7")); - EXPECT_EQ("[Setting node 2 as active: copy is ready]" + EXPECT_EQ("[Setting node 2 as active: copy is ready with 9 docs]" "[Setting node 1 as inactive]", testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/u,3=5/6/7/u/i/u")); // Prefer in ideal state if multiple copies ready - EXPECT_EQ("[Setting node 3 as active: copy is ready]" + EXPECT_EQ("[Setting node 3 as active: copy is ready, has 9 docs and ideal state priority 1]" "[Setting node 1 as inactive]", - testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/u,3=5/6/7/u/i/r")); + testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/u,3=8/9/10/u/i/r")); - // Prefer ideal state if all ready but no trusted - EXPECT_EQ("[Setting node 1 as active: copy is ready]", - testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/i/r,3=5/6/7/u/i/r")); + // Prefer ideal state if all ready + EXPECT_EQ("[Setting node 1 as active: copy is ready, has 9 docs and ideal state priority 0]", + testBucketState("2=8/9/10/u/i/r,1=8/9/10/u/i/r,3=8/9/10/u/i/r")); - // Prefer trusted over ideal state - EXPECT_EQ("[Setting node 2 as active: copy is ready and trusted]" + // Ready with more documents is preferred over ideal state or trusted + EXPECT_EQ("[Setting node 2 as active: copy is ready with 9 docs]" "[Setting node 1 as inactive]", - testBucketState("2=8/9/10/t/i/r,1=2/3/4/u/a/r,3=5/6/7")); + testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/r,3=5/6/7/u/i/r")); } /** @@ -1050,7 +1043,7 @@ TEST_F(StateCheckersTest, do_not_activate_non_ready_copies_when_ideal_node_in_ma EXPECT_EQ("NO OPERATIONS GENERATED", testBucketState("2=8/9/10/t/i/u,3=5/6/7")); // But we should activate another copy iff there's another ready copy. - EXPECT_EQ("[Setting node 2 as active: copy is ready]", + EXPECT_EQ("[Setting node 2 as active: copy is ready with 9 docs]", testBucketState("2=8/9/10/u/i/r,3=5/6/7/u/i/u")); } @@ -1151,18 +1144,14 @@ TEST_F(StateCheckersTest, bucket_state_per_group) { } // Node 1 and 8 is is ideal state - EXPECT_EQ("[Setting node 1 as active: " - "copy is trusted and ideal state priority 4]" - "[Setting node 6 as active: " - "copy is trusted and ideal state priority 0] (pri 90)", + EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 4]" + "[Setting node 6 as active: copy has 3 docs and ideal state priority 0] (pri 90)", testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, " "5=2/3/4/t, 6=2/3/4/t, 8=2/3/4/t", true)); // Data differ between groups - EXPECT_EQ("[Setting node 1 as active: " - "copy is trusted and ideal state priority 4]" - "[Setting node 6 as active: " - "copy is ideal state priority 0] (pri 90)", + EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 4]" + "[Setting node 6 as active: copy has 6 docs and ideal state priority 0] (pri 90)", testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, " "5=5/6/7, 6=5/6/7, 8=5/6/7", true)); @@ -1176,12 +1165,9 @@ TEST_F(StateCheckersTest, bucket_state_per_group) { true)); // Node 1 and 8 is is ideal state - EXPECT_EQ("[Setting node 1 as active: " - "copy is trusted and ideal state priority 4]" - "[Setting node 6 as active: " - "copy is trusted and ideal state priority 0]" - "[Setting node 9 as active: " - "copy is trusted and ideal state priority 2] (pri 90)", + EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 4]" + "[Setting node 6 as active: copy has 3 docs and ideal state priority 0]" + "[Setting node 9 as active: copy has 3 docs and ideal state priority 2] (pri 90)", testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, " "5=2/3/4/t, 6=2/3/4/t, 8=2/3/4/t, " "9=2/3/4/t, 10=2/3/4/t, 11=2/3/4/t", @@ -1196,31 +1182,31 @@ TEST_F(StateCheckersTest, do_not_activate_replicas_that_are_out_of_sync_with_maj getConfig().set_max_activation_inhibited_out_of_sync_groups(3); // 5 is out of sync with 0 and 9 and will NOT be activated. - EXPECT_EQ("[Setting node 0 as active: first available copy]" - "[Setting node 9 as active: copy is ideal state priority 2]", + EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]" + "[Setting node 9 as active: copy has 3 docs and ideal state priority 2]", testBucketStatePerGroup("0=2/3/4, 5=3/4/5, 9=2/3/4")); // We also try the other indices:... // 0 out of sync, 5 and 9 in sync (one hopes..!) - EXPECT_EQ("[Setting node 5 as active: first available copy]" - "[Setting node 9 as active: copy is ideal state priority 2]", + EXPECT_EQ("[Setting node 5 as active: copy has 3 docs]" + "[Setting node 9 as active: copy has 3 docs and ideal state priority 2]", testBucketStatePerGroup("0=4/5/6, 5=2/3/4, 9=2/3/4")); // 9 out of sync, 0 and 5 in sync - EXPECT_EQ("[Setting node 0 as active: first available copy]" - "[Setting node 5 as active: first available copy]", + EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]" + "[Setting node 5 as active: copy has 3 docs]", testBucketStatePerGroup("0=2/3/4, 5=2/3/4, 9=5/3/4")); // If there's no majority, we activate everything because there's really nothing // better we can do. - EXPECT_EQ("[Setting node 0 as active: first available copy]" - "[Setting node 5 as active: first available copy]" - "[Setting node 9 as active: copy is ideal state priority 2]", + EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]" + "[Setting node 5 as active: copy has 6 docs]" + "[Setting node 9 as active: copy has 9 docs and ideal state priority 2]", testBucketStatePerGroup("0=2/3/4, 5=5/6/7, 9=8/9/10")); // However, if a replica is _already_ active, we will not deactivate it. - EXPECT_EQ("[Setting node 0 as active: first available copy]" - "[Setting node 9 as active: copy is ideal state priority 2]", + EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]" + "[Setting node 9 as active: copy has 3 docs and ideal state priority 2]", testBucketStatePerGroup("0=2/3/4, 5=3/4/5/u/a, 9=2/3/4")); } @@ -1234,8 +1220,8 @@ TEST_F(StateCheckersTest, replica_activation_inhibition_can_be_limited_to_max_n_ // group 0, 1 out of sync in group 1 and 1 out of sync in group 2. Unless we have // mechanisms in place to limit the number of affected groups, both groups 1 and 2 would // be inhibited for activation. Since we limit to 1, only group 1 should be affected. - EXPECT_EQ("[Setting node 1 as active: copy is ideal state priority 4]" - "[Setting node 9 as active: copy is ideal state priority 2]", + EXPECT_EQ("[Setting node 1 as active: copy has 3 docs and ideal state priority 4]" + "[Setting node 9 as active: copy has 6 docs and ideal state priority 2]", testBucketStatePerGroup("0=2/3/4, 1=2/3/4, 3=2/3/4, 5=3/4/5, 9=5/6/7")); } @@ -1246,9 +1232,9 @@ TEST_F(StateCheckersTest, activate_replicas_that_are_out_of_sync_with_majority_i getConfig().set_max_activation_inhibited_out_of_sync_groups(0); // 5 is out of sync with 0 and 9 but will still be activated since the config is false. - EXPECT_EQ("[Setting node 0 as active: first available copy]" - "[Setting node 5 as active: first available copy]" - "[Setting node 9 as active: copy is ideal state priority 2]", + EXPECT_EQ("[Setting node 0 as active: copy has 3 docs]" + "[Setting node 5 as active: copy has 4 docs]" + "[Setting node 9 as active: copy has 3 docs and ideal state priority 2]", testBucketStatePerGroup("0=2/3/4, 5=3/4/5, 9=2/3/4")); } @@ -1257,7 +1243,7 @@ TEST_F(StateCheckersTest, allow_activation_of_retired_nodes) { // we still want to be able to shuffle bucket activations around in order // to preserve coverage. setupDistributor(2, 2, "distributor:1 storage:2 .0.s:r .1.s:r"); - EXPECT_EQ("[Setting node 1 as active: copy is trusted]" + EXPECT_EQ("[Setting node 1 as active: copy has 6 docs]" "[Setting node 0 as inactive]", testBucketState("0=2/3/4/u/a,1=5/6/7/t")); } diff --git a/storage/src/vespa/storage/distributor/activecopy.cpp b/storage/src/vespa/storage/distributor/activecopy.cpp index e5bfe489d26..e1e88116921 100644 --- a/storage/src/vespa/storage/distributor/activecopy.cpp +++ b/storage/src/vespa/storage/distributor/activecopy.cpp @@ -27,9 +27,9 @@ ActiveCopy::ActiveCopy(uint16_t node, const BucketDatabase::Entry& e, const std: _ideal(0xffff) { const BucketCopy* copy = e->getNode(node); - assert(copy != 0); + assert(copy != nullptr); + _doc_count = copy->getDocumentCount(); _ready = copy->ready(); - _trusted = copy->trusted(); _active = copy->active(); for (uint32_t i=0; i<idealState.size(); ++i) { if (idealState[i] == node) { @@ -41,20 +41,27 @@ ActiveCopy::ActiveCopy(uint16_t node, const BucketDatabase::Entry& e, const std: vespalib::string ActiveCopy::getReason() const { - if (_ready && _trusted && _ideal < 0xffff) { + if (_ready && (_doc_count > 0) && (_ideal < 0xffff)) { vespalib::asciistream ost; - ost << "copy is ready, trusted and ideal state priority " << _ideal; + ost << "copy is ready, has " << _doc_count + << " docs and ideal state priority " << _ideal; + return ost.str(); + } else if (_ready && (_doc_count > 0)) { + vespalib::asciistream ost; + ost << "copy is ready with " << _doc_count << " docs"; return ost.str(); - } else if (_ready && _trusted) { - return "copy is ready and trusted"; } else if (_ready) { return "copy is ready"; - } else if (_trusted && _ideal < 0xffff) { + } else if ((_doc_count > 0) && (_ideal < 0xffff)) { + vespalib::asciistream ost; + ost << "copy has " << _doc_count << " docs and ideal state priority " << _ideal; + return ost.str(); + } else if (_doc_count > 0) { vespalib::asciistream ost; - ost << "copy is trusted and ideal state priority " << _ideal; + ost << "copy has " << _doc_count << " docs"; return ost.str(); - } else if (_trusted) { - return "copy is trusted"; + } else if (_active) { + return "copy is already active"; } else if (_ideal < 0xffff) { vespalib::asciistream ost; ost << "copy is ideal state priority " << _ideal; @@ -67,9 +74,15 @@ ActiveCopy::getReason() const { std::ostream& operator<<(std::ostream& out, const ActiveCopy & e) { out << "Entry(Node " << e._nodeIndex; - if (e._ready) out << ", ready"; - if (e._trusted) out << ", trusted"; - if (e._ideal < 0xffff) out << ", ideal pri " << e._ideal; + if (e._ready) { + out << ", ready"; + } + if (e._doc_count > 0) { + out << ", doc_count " << e._doc_count; + } + if (e._ideal < 0xffff) { + out << ", ideal pri " << e._ideal; + } out << ")"; return out; } @@ -78,10 +91,18 @@ namespace { struct ActiveStateOrder { bool operator()(const ActiveCopy & e1, const ActiveCopy & e2) { - if (e1._ready != e2._ready) return e1._ready; - if (e1._trusted != e2._trusted) return e1._trusted; - if (e1._ideal != e2._ideal) return e1._ideal < e2._ideal; - if (e1._active != e2._active) return e1._active; + if (e1._ready != e2._ready) { + return e1._ready; + } + if (e1._doc_count != e2._doc_count) { + return e1._doc_count > e2._doc_count; + } + if (e1._ideal != e2._ideal) { + return e1._ideal < e2._ideal; + } + if (e1._active != e2._active) { + return e1._active; + } return e1._nodeIndex < e2._nodeIndex; } }; diff --git a/storage/src/vespa/storage/distributor/activecopy.h b/storage/src/vespa/storage/distributor/activecopy.h index d22e7072cc7..7ad3649d738 100644 --- a/storage/src/vespa/storage/distributor/activecopy.h +++ b/storage/src/vespa/storage/distributor/activecopy.h @@ -10,7 +10,7 @@ namespace storage::distributor { class ActiveList; struct ActiveCopy { - ActiveCopy() : _nodeIndex(-1), _ideal(-1), _ready(false), _trusted(false), _active(false) { } + ActiveCopy() : _nodeIndex(-1), _ideal(-1), _doc_count(0), _ready(false), _active(false) { } ActiveCopy(uint16_t node, const BucketDatabase::Entry& e, const std::vector<uint16_t>& idealState); vespalib::string getReason() const; @@ -23,8 +23,8 @@ struct ActiveCopy { uint16_t _nodeIndex; uint16_t _ideal; + uint32_t _doc_count; bool _ready; - bool _trusted; bool _active; }; diff --git a/storage/src/vespa/storage/distributor/distributor.cpp b/storage/src/vespa/storage/distributor/distributor.cpp index 2b24eb1195a..cb4336d9f1e 100644 --- a/storage/src/vespa/storage/distributor/distributor.cpp +++ b/storage/src/vespa/storage/distributor/distributor.cpp @@ -17,6 +17,7 @@ #include <vespa/storageframework/generic/status/xmlstatusreporter.h> #include <vespa/vdslib/distribution/distribution.h> #include <vespa/vespalib/util/memoryusage.h> +#include <algorithm> #include <vespa/log/log.h> LOG_SETUP(".distributor-main"); @@ -373,17 +374,13 @@ Distributor::enableClusterStateBundle(const lib::ClusterStateBundle& state) enterRecoveryMode(); // Clear all active messages on nodes that are down. - for (uint16_t i = 0; i < baselineState.getNodeCount(lib::NodeType::STORAGE); ++i) { - if (!baselineState.getNodeState(lib::Node(lib::NodeType::STORAGE, i)).getState() - .oneOf(getStorageNodeUpStates())) - { - std::vector<uint64_t> msgIds( - _pendingMessageTracker.clearMessagesForNode(i)); - - LOG(debug, - "Node %d is down, clearing %d pending maintenance operations", - (int)i, - (int)msgIds.size()); + const uint16_t old_node_count = oldState.getBaselineClusterState()->getNodeCount(lib::NodeType::STORAGE); + const uint16_t new_node_count = baselineState.getNodeCount(lib::NodeType::STORAGE); + for (uint16_t i = 0; i < std::max(old_node_count, new_node_count); ++i) { + const auto& node_state = baselineState.getNodeState(lib::Node(lib::NodeType::STORAGE, i)).getState(); + if (!node_state.oneOf(getStorageNodeUpStates())) { + std::vector<uint64_t> msgIds = _pendingMessageTracker.clearMessagesForNode(i); + LOG(debug, "Node %u is down, clearing %zu pending maintenance operations", i, msgIds.size()); for (uint32_t j = 0; j < msgIds.size(); ++j) { _maintenanceOperationOwner.erase(msgIds[j]); |