diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-02-19 13:25:21 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-02-22 15:55:07 +0000 |
commit | f5414eadef7354893f64032d7ed1779777875502 (patch) | |
tree | bf0efa90f7c3f72206b40f8c12375ba7a6e97eed /storage | |
parent | 8bd7b6534fab28d507629fca1c109fff65585c40 (diff) |
Fail client ops gracefully when distributor is marked down
Previously, clients would only receive `ABORTED` when the distributor was
marked down by orchestration. This would simply cause the client to resend
until either the `StoragePolicy` would discard the cluster state entirely
and retry against a working distributor, or the operations would time out.
Now they will receive a `WrongDistributionReply` that shall immediately update
the `StoragePolicy` to avoid sending to the distributor that has been
marked down.
Also add a separate metric for number of operations aborted by `Bouncer`.
This fixes #8448.
Diffstat (limited to 'storage')
6 files changed, 70 insertions, 36 deletions
diff --git a/storage/src/tests/common/teststorageapp.cpp b/storage/src/tests/common/teststorageapp.cpp index 8fd80d47bb3..a720cd191e4 100644 --- a/storage/src/tests/common/teststorageapp.cpp +++ b/storage/src/tests/common/teststorageapp.cpp @@ -248,8 +248,7 @@ TestDistributorApp::configure(vespalib::stringref id) TestDistributorApp::TestDistributorApp(vespalib::stringref configId) : TestStorageApp( - StorageComponentRegisterImpl::UP( - new DistributorComponentRegisterImpl), + std::make_unique<DistributorComponentRegisterImpl>(), lib::NodeType::DISTRIBUTOR, getIndexFromConfig(configId), configId), _compReg(dynamic_cast<DistributorComponentRegisterImpl&>( TestStorageApp::getComponentRegister())), @@ -263,7 +262,7 @@ TestDistributorApp::TestDistributorApp(vespalib::stringref configId) TestDistributorApp::TestDistributorApp(NodeIndex index, vespalib::stringref configId) : TestStorageApp( - StorageComponentRegisterImpl::UP(new StorageComponentRegisterImpl), + std::make_unique<DistributorComponentRegisterImpl>(), lib::NodeType::DISTRIBUTOR, index, configId), _compReg(dynamic_cast<DistributorComponentRegisterImpl&>( TestStorageApp::getComponentRegister())), diff --git a/storage/src/tests/storageserver/bouncertest.cpp b/storage/src/tests/storageserver/bouncertest.cpp index be926722497..27c13a3707e 100644 --- a/storage/src/tests/storageserver/bouncertest.cpp +++ b/storage/src/tests/storageserver/bouncertest.cpp @@ -19,7 +19,7 @@ using document::test::makeDocumentBucket; namespace storage { struct BouncerTest : public CppUnit::TestFixture { - std::unique_ptr<TestServiceLayerApp> _node; + std::unique_ptr<TestStorageApp> _node; std::unique_ptr<DummyStorageLink> _upper; Bouncer* _manager; DummyStorageLink* _lower; @@ -29,6 +29,8 @@ struct BouncerTest : public CppUnit::TestFixture { void setUp() override; void tearDown() override; + void setUpAsNode(const lib::NodeType& type); + void testFutureTimestamp(); void testAllowNotifyBucketChangeEvenWhenDistributorDown(); void rejectLowerPrioritizedFeedMessagesWhenConfigured(); @@ -40,6 +42,7 @@ struct BouncerTest : public CppUnit::TestFixture { void internalOperationsAreNotRejected(); void outOfBoundsConfigValuesThrowException(); void abort_request_when_derived_bucket_space_node_state_is_marked_down(); + void client_operations_are_allowed_through_on_cluster_state_down_distributor(); CPPUNIT_TEST_SUITE(BouncerTest); CPPUNIT_TEST(testFutureTimestamp); @@ -53,6 +56,7 @@ struct BouncerTest : public CppUnit::TestFixture { CPPUNIT_TEST(internalOperationsAreNotRejected); CPPUNIT_TEST(outOfBoundsConfigValuesThrowException); CPPUNIT_TEST(abort_request_when_derived_bucket_space_node_state_is_marked_down); + CPPUNIT_TEST(client_operations_are_allowed_through_on_cluster_state_down_distributor); CPPUNIT_TEST_SUITE_END(); using Priority = api::StorageMessage::Priority; @@ -81,39 +85,42 @@ CPPUNIT_TEST_SUITE_REGISTRATION(BouncerTest); BouncerTest::BouncerTest() : _node(), _upper(), - _manager(0), - _lower(0) + _manager(nullptr), + _lower(nullptr) { } -void -BouncerTest::setUp() { - try{ - vdstestlib::DirConfig config(getStandardConfig(true)); - _node.reset(new TestServiceLayerApp( - DiskCount(1), NodeIndex(2), config.getConfigId())); - _upper.reset(new DummyStorageLink()); - _manager = new Bouncer(_node->getComponentRegister(), - config.getConfigId()); - _lower = new DummyStorageLink(); - _upper->push_back(std::unique_ptr<StorageLink>(_manager)); - _upper->push_back(std::unique_ptr<StorageLink>(_lower)); - _upper->open(); - } catch (std::exception& e) { - std::cerr << "Failed to static initialize objects: " << e.what() - << "\n"; +void BouncerTest::setUpAsNode(const lib::NodeType& type) { + vdstestlib::DirConfig config(getStandardConfig(type == lib::NodeType::STORAGE)); + if (type == lib::NodeType::STORAGE) { + _node.reset(new TestServiceLayerApp(DiskCount(1), NodeIndex(2), config.getConfigId())); + } else { + _node.reset(new TestDistributorApp(NodeIndex(2), config.getConfigId())); } + _upper.reset(new DummyStorageLink()); + _manager = new Bouncer(_node->getComponentRegister(), config.getConfigId()); + _lower = new DummyStorageLink(); + _upper->push_back(std::unique_ptr<StorageLink>(_manager)); + _upper->push_back(std::unique_ptr<StorageLink>(_lower)); + _upper->open(); _node->getClock().setAbsoluteTimeInSeconds(10); } void +BouncerTest::setUp() { + setUpAsNode(lib::NodeType::STORAGE); +} + +void BouncerTest::tearDown() { - _manager = 0; - _lower = 0; - _upper->close(); - _upper->flush(); - _upper.reset(0); - _node.reset(0); + _manager = nullptr; + _lower = nullptr; + if (_upper) { + _upper->close(); + _upper->flush(); + _upper.reset(); + } + _node.reset(); } std::shared_ptr<api::StorageCommand> @@ -334,13 +341,31 @@ makeClusterStateBundle(const vespalib::string &baselineState, const std::map<doc void BouncerTest::abort_request_when_derived_bucket_space_node_state_is_marked_down() { + CPPUNIT_ASSERT_EQUAL(uint64_t(0), _manager->metrics().unavailable_node_aborts.getValue()); + auto state = makeClusterStateBundle("distributor:3 storage:3", {{ document::FixedBucketSpaces::default_space(), "distributor:3 storage:3 .2.s:d" }}); _node->getNodeStateUpdater().setClusterStateBundle(state); _upper->sendDown(createDummyFeedMessage(11 * 1000000, document::FixedBucketSpaces::default_space())); assertMessageBouncedWithAbort(); + CPPUNIT_ASSERT_EQUAL(uint64_t(1), _manager->metrics().unavailable_node_aborts.getValue()); + _upper->reset(); _upper->sendDown(createDummyFeedMessage(11 * 1000000, document::FixedBucketSpaces::global_space())); assertMessageNotBounced(); + CPPUNIT_ASSERT_EQUAL(uint64_t(1), _manager->metrics().unavailable_node_aborts.getValue()); +} + +void BouncerTest::client_operations_are_allowed_through_on_cluster_state_down_distributor() { + tearDown(); + setUpAsNode(lib::NodeType::DISTRIBUTOR); + + // Distributor states never vary across bucket spaces, so not necessary to test with + // anything except baseline state here. + auto state = makeClusterStateBundle("distributor:3 .2.s:d storage:3", {}); + _node->getNodeStateUpdater().setClusterStateBundle(state); + _upper->sendDown(createDummyFeedMessage(11 * 1000000, document::FixedBucketSpaces::default_space())); + assertMessageNotBounced(); + CPPUNIT_ASSERT_EQUAL(uint64_t(0), _manager->metrics().unavailable_node_aborts.getValue()); } } // storage diff --git a/storage/src/vespa/storage/storageserver/bouncer.cpp b/storage/src/vespa/storage/storageserver/bouncer.cpp index 82cf64423c6..0541c7322f1 100644 --- a/storage/src/vespa/storage/storageserver/bouncer.cpp +++ b/storage/src/vespa/storage/storageserver/bouncer.cpp @@ -118,6 +118,7 @@ Bouncer::abortCommandForUnavailableNode(api::StorageMessage& msg, << " when node is in state " << state.toString(true); append_node_identity(ost); reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, ost.str())); + _metrics->unavailable_node_aborts.inc(); sendUp(reply); } @@ -158,6 +159,10 @@ Bouncer::clusterIsUp() const return (*_clusterState == lib::State::UP); } +bool Bouncer::isDistributor() const { + return (_component.getNodeType() == lib::NodeType::DISTRIBUTOR); +} + uint64_t Bouncer::extractMutationTimestampIfAny(const api::StorageMessage& msg) { @@ -243,12 +248,11 @@ Bouncer::onDown(const std::shared_ptr<api::StorageMessage>& msg) int feedPriorityLowerBound; { vespalib::LockGuard lock(_lock); - state = &getDerivedNodeState(msg->getBucket().getBucketSpace()).getState(); - maxClockSkewInSeconds = _config->maxClockSkewSeconds; + state = &getDerivedNodeState(msg->getBucket().getBucketSpace()).getState(); + maxClockSkewInSeconds = _config->maxClockSkewSeconds; abortLoadWhenClusterDown = _config->stopExternalLoadWhenClusterDown; - isInAvailableState = state->oneOf( - _config->stopAllLoadWhenNodestateNotIn.c_str()); - feedPriorityLowerBound = _config->feedRejectionPriorityThreshold; + isInAvailableState = state->oneOf(_config->stopAllLoadWhenNodestateNotIn.c_str()); + feedPriorityLowerBound = _config->feedRejectionPriorityThreshold; } // Special case for messages storage nodes are expected to get during // initializing. Request bucket info will be queued so storage can @@ -258,13 +262,14 @@ Bouncer::onDown(const std::shared_ptr<api::StorageMessage>& msg) { return false; } - if (!isInAvailableState) { + const bool externalLoad = isExternalLoad(type); + if (!isInAvailableState && !(isDistributor() && externalLoad)) { abortCommandForUnavailableNode(*msg, *state); return true; } // Allow all internal load to go through at this point - if (!isExternalLoad(type)) { + if (!externalLoad) { return false; } if (priorityRejectionIsEnabled(feedPriorityLowerBound) diff --git a/storage/src/vespa/storage/storageserver/bouncer.h b/storage/src/vespa/storage/storageserver/bouncer.h index 258b0b18a32..19c8e084c7d 100644 --- a/storage/src/vespa/storage/storageserver/bouncer.h +++ b/storage/src/vespa/storage/storageserver/bouncer.h @@ -70,6 +70,8 @@ private: bool clusterIsUp() const; + bool isDistributor() const; + bool isExternalLoad(const api::MessageType&) const noexcept; bool isExternalWriteOperation(const api::MessageType&) const noexcept; diff --git a/storage/src/vespa/storage/storageserver/bouncer_metrics.cpp b/storage/src/vespa/storage/storageserver/bouncer_metrics.cpp index c0fac35263e..5f3ba5a236d 100644 --- a/storage/src/vespa/storage/storageserver/bouncer_metrics.cpp +++ b/storage/src/vespa/storage/storageserver/bouncer_metrics.cpp @@ -7,7 +7,9 @@ namespace storage { BouncerMetrics::BouncerMetrics() : MetricSet("bouncer", {}, "Metrics for Bouncer component", nullptr), clock_skew_aborts("clock_skew_aborts", {}, "Number of client operations that were aborted due to " - "clock skew between sender and receiver exceeding acceptable range", this) + "clock skew between sender and receiver exceeding acceptable range", this), + unavailable_node_aborts("unavailable_node_aborts", {}, "Number of operations that were aborted due " + "to the node (or target bucket space) being unavailable", this) { } diff --git a/storage/src/vespa/storage/storageserver/bouncer_metrics.h b/storage/src/vespa/storage/storageserver/bouncer_metrics.h index 9beca6c73b7..9842bed1c6f 100644 --- a/storage/src/vespa/storage/storageserver/bouncer_metrics.h +++ b/storage/src/vespa/storage/storageserver/bouncer_metrics.h @@ -8,6 +8,7 @@ namespace storage { struct BouncerMetrics : metrics::MetricSet { metrics::LongCountMetric clock_skew_aborts; + metrics::LongCountMetric unavailable_node_aborts; BouncerMetrics(); ~BouncerMetrics() override; |