diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-04-29 09:46:04 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2020-04-29 09:46:04 +0000 |
commit | 7cc0f99900706932ac98c29af66c609199387785 (patch) | |
tree | ec102d1b5084ba716a0d3efc2447bfcd14e8212a /searchcore | |
parent | 6d108bacdf3c92ebbf8a84b70ff619201ff1ba53 (diff) |
Add tracking of remove operations rate and use this to consider blocking lid space compaction.
During a period with a high rate of remove operations, there is no use running lid space compaction
as this will interfere with the remove operations, increasing latency of those.
Moving a document as part of lid space compaction is a costly operation (similar to putting the document in the first place)
and it typically uses both the index and attribute writer thread pools.
Diffstat (limited to 'searchcore')
12 files changed, 147 insertions, 38 deletions
diff --git a/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_compaction_test.cpp b/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_compaction_test.cpp index a97a2380f25..e679c028d1b 100644 --- a/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_compaction_test.cpp +++ b/searchcore/src/tests/proton/documentdb/lid_space_compaction/lid_space_compaction_test.cpp @@ -30,7 +30,8 @@ constexpr uint32_t SUBDB_ID = 2; constexpr vespalib::duration JOB_DELAY = 1s; constexpr uint32_t ALLOWED_LID_BLOAT = 1; constexpr double ALLOWED_LID_BLOAT_FACTOR = 0.3; -constexpr double REMOVE_BATCH_BLOCK_RATE = 1.0 / 20.0; +constexpr double REMOVE_BATCH_BLOCK_RATE = 1.0 / 21.0; +constexpr double REMOVE_BLOCK_RATE = 1.0 / 20.0; constexpr uint32_t MAX_DOCS_TO_SCAN = 100; constexpr double RESOURCE_LIMIT_FACTOR = 1.0; constexpr uint32_t MAX_OUTSTANDING_MOVE_OPS = 10; @@ -88,14 +89,24 @@ struct MyHandler : public ILidSpaceCompactionHandler { MyHandler(bool storeMoveDoneContexts = false); ~MyHandler(); void clearMoveDoneContexts() { _moveDoneContexts.clear(); } - void run_remove_batch_ops() { + void run_remove_ops(bool remove_batch) { // This ensures to max out the threshold time in the operation rate tracker. - _op_listener->notify_remove_batch(); - _op_listener->notify_remove_batch(); - _op_listener->notify_remove_batch(); + if (remove_batch) { + _op_listener->notify_remove_batch(); + _op_listener->notify_remove_batch(); + _op_listener->notify_remove_batch(); + } else { + _op_listener->notify_remove(); + _op_listener->notify_remove(); + _op_listener->notify_remove(); + } } - void stop_remove_batch_ops() { - _rm_listener->reset_remove_batch_tracker(); + void stop_remove_ops(bool remove_batch) { + if (remove_batch) { + _rm_listener->get_remove_batch_tracker().reset(vespalib::steady_clock::now()); + } else { + _rm_listener->get_remove_tracker().reset(vespalib::steady_clock::now()); + } } virtual vespalib::string getName() const override { return "myhandler"; @@ -279,6 +290,7 @@ struct JobTestBase : public ::testing::Test { _job = std::make_unique<LidSpaceCompactionJob>(DocumentDBLidSpaceCompactionConfig(interval, allowedLidBloat, allowedLidBloatFactor, REMOVE_BATCH_BLOCK_RATE, + REMOVE_BLOCK_RATE, false, maxDocsToScan), *_handler, _storer, _frozenHandler, _diskMemUsageNotifier, BlockableMaintenanceJobConfig(resourceLimitFactor, maxOutstandingMoveOps), @@ -630,40 +642,78 @@ TEST_F(JobTest, job_is_re_enabled_when_node_is_no_longer_retired) assertOneDocumentCompacted(); } -TEST_F(JobTest, job_is_disabled_while_remove_batch_is_ongoing) +class JobDisabledByRemoveOpsTest : public JobTest { +public: + JobDisabledByRemoveOpsTest() : JobTest() {} + + void job_is_disabled_while_remove_ops_are_ongoing(bool remove_batch) { + setupOneDocumentToCompact(); + _handler->run_remove_ops(remove_batch); + EXPECT_TRUE(run()); // job is disabled + assertNoWorkDone(); + } + + void job_becomes_disabled_if_remove_ops_starts(bool remove_batch) { + setupThreeDocumentsToCompact(); + EXPECT_FALSE(run()); // job executed as normal (with more work to do) + assertJobContext(2, 9, 1, 0, 0); + + _handler->run_remove_ops(remove_batch); + EXPECT_TRUE(run()); // job is disabled + assertJobContext(2, 9, 1, 0, 0); + } + + void job_is_re_enabled_when_remove_ops_are_no_longer_ongoing(bool remove_batch) { + job_becomes_disabled_if_remove_ops_starts(remove_batch); + + _handler->stop_remove_ops(remove_batch); + EXPECT_FALSE(run()); // job executed as normal (with more work to do) + assertJobContext(3, 8, 2, 0, 0); + } +}; + +TEST_F(JobDisabledByRemoveOpsTest, config_is_propagated_to_remove_operations_rate_tracker) { - setupOneDocumentToCompact(); - _handler->run_remove_batch_ops(); - EXPECT_TRUE(run()); // job is disabled - assertNoWorkDone(); + auto& remove_batch_tracker = _handler->_rm_listener->get_remove_batch_tracker(); + EXPECT_EQ(vespalib::from_s(21.0), remove_batch_tracker.get_time_budget_per_op()); + EXPECT_EQ(vespalib::from_s(21.0), remove_batch_tracker.get_time_budget_window()); + + auto& remove_tracker = _handler->_rm_listener->get_remove_tracker(); + EXPECT_EQ(vespalib::from_s(20.0), remove_tracker.get_time_budget_per_op()); + EXPECT_EQ(vespalib::from_s(20.0), remove_tracker.get_time_budget_window()); } -TEST_F(JobTest, job_becomes_disabled_if_remove_batch_starts) +TEST_F(JobDisabledByRemoveOpsTest, job_is_disabled_while_remove_batch_is_ongoing) { - setupThreeDocumentsToCompact(); - EXPECT_FALSE(run()); // job executed as normal (with more work to do) - assertJobContext(2, 9, 1, 0, 0); + job_is_disabled_while_remove_ops_are_ongoing(true); +} - _handler->run_remove_batch_ops(); - EXPECT_TRUE(run()); // job is disabled - assertJobContext(2, 9, 1, 0, 0); +TEST_F(JobDisabledByRemoveOpsTest, job_becomes_disabled_if_remove_batch_starts) +{ + job_becomes_disabled_if_remove_ops_starts(true); } -TEST_F(JobTest, job_is_re_enabled_when_remove_batch_is_no_longer_ongoing) +TEST_F(JobDisabledByRemoveOpsTest, job_is_re_enabled_when_remove_batch_is_no_longer_ongoing) { - setupThreeDocumentsToCompact(); - EXPECT_FALSE(run()); // job executed as normal (with more work to do) - assertJobContext(2, 9, 1, 0, 0); + job_is_re_enabled_when_remove_ops_are_no_longer_ongoing(true); +} - _handler->run_remove_batch_ops(); - EXPECT_TRUE(run()); // job is disabled - assertJobContext(2, 9, 1, 0, 0); +TEST_F(JobDisabledByRemoveOpsTest, job_is_disabled_while_removes_are_ongoing) +{ + job_is_disabled_while_remove_ops_are_ongoing(false); +} - _handler->stop_remove_batch_ops(); - EXPECT_FALSE(run()); // job executed as normal (with more work to do) - assertJobContext(3, 8, 2, 0, 0); +TEST_F(JobDisabledByRemoveOpsTest, job_becomes_disabled_if_removes_start) +{ + job_becomes_disabled_if_remove_ops_starts(false); } +TEST_F(JobDisabledByRemoveOpsTest, job_is_re_enabled_when_removes_are_no_longer_ongoing) +{ + job_is_re_enabled_when_remove_ops_are_no_longer_ongoing(false); +} + + struct MaxOutstandingJobTest : public JobTest { std::unique_ptr<MyCountJobRunner> runner; MaxOutstandingJobTest() diff --git a/searchcore/src/tests/proton/documentmetastore/documentmetastore_test.cpp b/searchcore/src/tests/proton/documentmetastore/documentmetastore_test.cpp index 8935143a7be..49b84c09040 100644 --- a/searchcore/src/tests/proton/documentmetastore/documentmetastore_test.cpp +++ b/searchcore/src/tests/proton/documentmetastore/documentmetastore_test.cpp @@ -2088,12 +2088,15 @@ TEST(DocumentMetaStoreTest, multiple_lids_can_be_removed_with_removeBatch) class MockOperationListener : public documentmetastore::OperationListener { public: size_t remove_batch_cnt; + size_t remove_cnt; MockOperationListener() - : remove_batch_cnt(0) + : remove_batch_cnt(0), + remove_cnt(0) { } void notify_remove_batch() override { ++remove_batch_cnt; } + void notify_remove() override { ++remove_cnt; } }; TEST(DocumentMetaStoreTest, call_to_remove_batch_is_notified) @@ -2108,6 +2111,18 @@ TEST(DocumentMetaStoreTest, call_to_remove_batch_is_notified) EXPECT_EQ(1, listener->remove_batch_cnt); } +TEST(DocumentMetaStoreTest, call_to_remove_is_notified) +{ + DocumentMetaStore dms(createBucketDB()); + auto listener = std::make_shared<MockOperationListener>(); + dms.set_operation_listener(listener); + dms.constructFreeList(); + addLid(dms, 1); + + dms.remove(1); + EXPECT_EQ(1, listener->remove_cnt); +} + } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchcore/src/vespa/searchcore/config/proton.def b/searchcore/src/vespa/searchcore/config/proton.def index 644564c0e2e..5d84516d100 100644 --- a/searchcore/src/vespa/searchcore/config/proton.def +++ b/searchcore/src/vespa/searchcore/config/proton.def @@ -391,6 +391,13 @@ lidspacecompaction.removebatchblockdelay double default=2.0 ## lid space compaction do not interfere, but instead is applied after deleting of buckets is complete. lidspacecompaction.removebatchblockrate double default=0.5 +## The rate (ops / second) of remove operations for when to block lid space compaction. +## +## When considering compaction, if the current observed rate of remove operations +## is higher than the given block rate, the lid space compaction job is blocked. +## It is considered again at the next regular interval (see above). +lidspacecompaction.removeblockrate double default=100000.0 + ## This is the maximum value visibilitydelay you can have. ## A to higher value here will cost more memory while not improving too much. maxvisibilitydelay double default=1.0 diff --git a/searchcore/src/vespa/searchcore/proton/documentmetastore/documentmetastore.cpp b/searchcore/src/vespa/searchcore/proton/documentmetastore/documentmetastore.cpp index 3f8ede91faa..b43efa41d0b 100644 --- a/searchcore/src/vespa/searchcore/proton/documentmetastore/documentmetastore.cpp +++ b/searchcore/src/vespa/searchcore/proton/documentmetastore/documentmetastore.cpp @@ -622,6 +622,9 @@ DocumentMetaStore::remove(DocId lid) BucketDBOwner::Guard bucketGuard = _bucketDB->takeGuard(); bool result = remove(lid, bucketGuard); incGeneration(); + if (result && _op_listener) { + _op_listener->notify_remove(); + } return result; } diff --git a/searchcore/src/vespa/searchcore/proton/documentmetastore/operation_listener.h b/searchcore/src/vespa/searchcore/proton/documentmetastore/operation_listener.h index 3b974554551..259f1a40c9d 100644 --- a/searchcore/src/vespa/searchcore/proton/documentmetastore/operation_listener.h +++ b/searchcore/src/vespa/searchcore/proton/documentmetastore/operation_listener.h @@ -14,6 +14,7 @@ public: using SP = std::shared_ptr<OperationListener>; virtual ~OperationListener() {} virtual void notify_remove_batch() = 0; + virtual void notify_remove() = 0; }; } diff --git a/searchcore/src/vespa/searchcore/proton/server/document_db_maintenance_config.cpp b/searchcore/src/vespa/searchcore/proton/server/document_db_maintenance_config.cpp index 6bd1888ad06..a4e8b6752b0 100644 --- a/searchcore/src/vespa/searchcore/proton/server/document_db_maintenance_config.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/document_db_maintenance_config.cpp @@ -54,6 +54,7 @@ DocumentDBLidSpaceCompactionConfig::DocumentDBLidSpaceCompactionConfig() _allowedLidBloat(1000000000), _allowedLidBloatFactor(1.0), _remove_batch_block_rate(0.5), + _remove_block_rate(100000), _disabled(false), _maxDocsToScan(10000) { @@ -63,6 +64,7 @@ DocumentDBLidSpaceCompactionConfig::DocumentDBLidSpaceCompactionConfig(vespalib: uint32_t allowedLidBloat, double allowedLidBloatFactor, double remove_batch_block_rate, + double remove_block_rate, bool disabled, uint32_t maxDocsToScan) : _delay(std::min(MAX_DELAY_SEC, interval)), @@ -70,6 +72,7 @@ DocumentDBLidSpaceCompactionConfig::DocumentDBLidSpaceCompactionConfig(vespalib: _allowedLidBloat(allowedLidBloat), _allowedLidBloatFactor(allowedLidBloatFactor), _remove_batch_block_rate(remove_batch_block_rate), + _remove_block_rate(remove_block_rate), _disabled(disabled), _maxDocsToScan(maxDocsToScan) { diff --git a/searchcore/src/vespa/searchcore/proton/server/document_db_maintenance_config.h b/searchcore/src/vespa/searchcore/proton/server/document_db_maintenance_config.h index 1370d855b31..4c0485baec6 100644 --- a/searchcore/src/vespa/searchcore/proton/server/document_db_maintenance_config.h +++ b/searchcore/src/vespa/searchcore/proton/server/document_db_maintenance_config.h @@ -48,6 +48,7 @@ private: uint32_t _allowedLidBloat; double _allowedLidBloatFactor; double _remove_batch_block_rate; + double _remove_block_rate; bool _disabled; uint32_t _maxDocsToScan; @@ -57,6 +58,7 @@ public: uint32_t allowedLidBloat, double allowwedLidBloatFactor, double remove_batch_block_rate, + double remove_block_rate, bool disabled, uint32_t maxDocsToScan = 10000); @@ -67,6 +69,7 @@ public: uint32_t getAllowedLidBloat() const { return _allowedLidBloat; } double getAllowedLidBloatFactor() const { return _allowedLidBloatFactor; } double get_remove_batch_block_rate() const { return _remove_batch_block_rate; } + double get_remove_block_rate() const { return _remove_block_rate; } bool isDisabled() const { return _disabled; } uint32_t getMaxDocsToScan() const { return _maxDocsToScan; } }; diff --git a/searchcore/src/vespa/searchcore/proton/server/documentdbconfigmanager.cpp b/searchcore/src/vespa/searchcore/proton/server/documentdbconfigmanager.cpp index 9bf8ce71244..d1d88434bd6 100644 --- a/searchcore/src/vespa/searchcore/proton/server/documentdbconfigmanager.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/documentdbconfigmanager.cpp @@ -135,6 +135,7 @@ buildMaintenanceConfig(const BootstrapConfig::SP &bootstrapConfig, proton.lidspacecompaction.allowedlidbloat, proton.lidspacecompaction.allowedlidbloatfactor, proton.lidspacecompaction.removebatchblockrate, + proton.lidspacecompaction.removeblockrate, isDocumentTypeGlobal), AttributeUsageFilterConfig( proton.writefilter.attribute.enumstorelimit, diff --git a/searchcore/src/vespa/searchcore/proton/server/lid_space_compaction_job.cpp b/searchcore/src/vespa/searchcore/proton/server/lid_space_compaction_job.cpp index 62893055a0a..bd53810e14b 100644 --- a/searchcore/src/vespa/searchcore/proton/server/lid_space_compaction_job.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/lid_space_compaction_job.cpp @@ -95,6 +95,12 @@ LidSpaceCompactionJob::remove_batch_is_ongoing() const return _ops_rate_tracker->remove_batch_above_threshold(); } +bool +LidSpaceCompactionJob::remove_is_ongoing() const +{ + return _ops_rate_tracker->remove_above_threshold(); +} + LidSpaceCompactionJob::LidSpaceCompactionJob(const DocumentDBLidSpaceCompactionConfig &config, ILidSpaceCompactionHandler &handler, IOperationStorer &opStorer, @@ -114,7 +120,8 @@ LidSpaceCompactionJob::LidSpaceCompactionJob(const DocumentDBLidSpaceCompactionC _shouldCompactLidSpace(false), _diskMemUsageNotifier(diskMemUsageNotifier), _clusterStateChangedNotifier(clusterStateChangedNotifier), - _ops_rate_tracker(std::make_shared<RemoveOperationsRateTracker>(config.get_remove_batch_block_rate())) + _ops_rate_tracker(std::make_shared<RemoveOperationsRateTracker>(config.get_remove_batch_block_rate(), + config.get_remove_block_rate())) { _diskMemUsageNotifier.addDiskMemUsageListener(this); _clusterStateChangedNotifier.addClusterStateChangedHandler(this); @@ -142,6 +149,11 @@ LidSpaceCompactionJob::run() LOG(info, "run(): Lid space compaction is disabled while remove batch (delete buckets) is ongoing"); return true; } + if (remove_is_ongoing()) { + // Note that we don't set the job as blocked as the decision to un-block it is not driven externally. + LOG(info, "run(): Lid space compaction is disabled while remove operations are ongoing"); + return true; + } if (_scanItr) { return scanDocuments(stats); } else if (_shouldCompactLidSpace) { diff --git a/searchcore/src/vespa/searchcore/proton/server/lid_space_compaction_job.h b/searchcore/src/vespa/searchcore/proton/server/lid_space_compaction_job.h index 44c30987a2f..9171fe7472e 100644 --- a/searchcore/src/vespa/searchcore/proton/server/lid_space_compaction_job.h +++ b/searchcore/src/vespa/searchcore/proton/server/lid_space_compaction_job.h @@ -48,6 +48,7 @@ private: void refreshRunnable(); void refreshAndConsiderRunnable(); bool remove_batch_is_ongoing() const; + bool remove_is_ongoing() const; public: LidSpaceCompactionJob(const DocumentDBLidSpaceCompactionConfig &config, diff --git a/searchcore/src/vespa/searchcore/proton/server/remove_operations_rate_tracker.cpp b/searchcore/src/vespa/searchcore/proton/server/remove_operations_rate_tracker.cpp index eae59fbe175..22294d33aba 100644 --- a/searchcore/src/vespa/searchcore/proton/server/remove_operations_rate_tracker.cpp +++ b/searchcore/src/vespa/searchcore/proton/server/remove_operations_rate_tracker.cpp @@ -5,8 +5,10 @@ namespace proton { -RemoveOperationsRateTracker::RemoveOperationsRateTracker(double remove_batch_rate_threshold) - : _remove_batch_tracker(remove_batch_rate_threshold) +RemoveOperationsRateTracker::RemoveOperationsRateTracker(double remove_batch_rate_threshold, + double remove_rate_threshold) + : _remove_batch_tracker(remove_batch_rate_threshold), + _remove_tracker(remove_rate_threshold) { } @@ -16,16 +18,22 @@ RemoveOperationsRateTracker::notify_remove_batch() _remove_batch_tracker.observe(vespalib::steady_clock::now()); } +void +RemoveOperationsRateTracker::notify_remove() +{ + _remove_tracker.observe(vespalib::steady_clock::now()); +} + bool RemoveOperationsRateTracker::remove_batch_above_threshold() const { return _remove_batch_tracker.above_threshold(vespalib::steady_clock::now()); } -void -RemoveOperationsRateTracker::reset_remove_batch_tracker() +bool +RemoveOperationsRateTracker::remove_above_threshold() const { - _remove_batch_tracker.reset(vespalib::steady_clock::now()); + return _remove_tracker.above_threshold(vespalib::steady_clock::now()); } } diff --git a/searchcore/src/vespa/searchcore/proton/server/remove_operations_rate_tracker.h b/searchcore/src/vespa/searchcore/proton/server/remove_operations_rate_tracker.h index a66ed5b9b54..b8c0315b9e9 100644 --- a/searchcore/src/vespa/searchcore/proton/server/remove_operations_rate_tracker.h +++ b/searchcore/src/vespa/searchcore/proton/server/remove_operations_rate_tracker.h @@ -15,16 +15,21 @@ namespace proton { class RemoveOperationsRateTracker : public documentmetastore::OperationListener { private: OperationRateTracker _remove_batch_tracker; + OperationRateTracker _remove_tracker; public: - RemoveOperationsRateTracker(double remove_batch_rate_threshold); + RemoveOperationsRateTracker(double remove_batch_rate_threshold, + double remove_rate_threshold); void notify_remove_batch() override; + void notify_remove() override; bool remove_batch_above_threshold() const; + bool remove_above_threshold() const; // Should only be used for testing - void reset_remove_batch_tracker(); + OperationRateTracker& get_remove_batch_tracker() { return _remove_batch_tracker; } + OperationRateTracker& get_remove_tracker() { return _remove_tracker; } }; } |