summaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2022-08-30 10:31:26 +0000
committerTor Brede Vekterli <vekterli@yahooinc.com>2022-08-30 10:33:07 +0000
commit8d19ccd64e7df0c795318ffc38f37b21a34e4fe4 (patch)
tree13f5a78bd9ffda54ad44f3ab323509cdd758133b /storage
parentc7bcf3cb739b2a9665212da031cf45cb201d8126 (diff)
Force content node-internal bucket DB metric update during startup
After initialization, the node will immediately start communicating with the cluster controller, exchanging host info. This host info contains a subset snapshot of the active metrics, which includes the total bucket count, doc count etc. It is critical that we must never report back host info _prior_ to having run at least one full sweep of the bucket database, lest we risk transiently reporting zero buckets held by the content node. Doing so could cause orchestration logic to perform operations based on erroneous assumptions. To avoid this, we explicitly force a full DB sweep and metric update prior to reporting the node as up. Since this function is called prior to the CommunicationManager thread being started, any CC health pings should also always happen after this init step.
Diffstat (limited to 'storage')
-rw-r--r--storage/src/tests/persistence/filestorage/filestormanagertest.cpp1
-rw-r--r--storage/src/vespa/storage/bucketdb/bucketmanager.h2
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp2
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestormanager.h5
-rw-r--r--storage/src/vespa/storage/storageserver/servicelayernode.cpp18
-rw-r--r--storage/src/vespa/storage/storageserver/servicelayernode.h2
6 files changed, 29 insertions, 1 deletions
diff --git a/storage/src/tests/persistence/filestorage/filestormanagertest.cpp b/storage/src/tests/persistence/filestorage/filestormanagertest.cpp
index 4c1b1662f68..a5f5075a4d8 100644
--- a/storage/src/tests/persistence/filestorage/filestormanagertest.cpp
+++ b/storage/src/tests/persistence/filestorage/filestormanagertest.cpp
@@ -1966,6 +1966,7 @@ TEST_F(FileStorManagerTest, bucket_db_is_populated_from_provider_when_initialize
getDummyPersistence().set_fake_bucket_set(buckets);
c.manager->initialize_bucket_databases_from_provider();
+ c.manager->complete_internal_initialization();
std::vector<std::pair<document::BucketId, api::BucketInfo>> from_db;
auto populate_from_db = [&from_db](uint64_t key, auto& entry) {
diff --git a/storage/src/vespa/storage/bucketdb/bucketmanager.h b/storage/src/vespa/storage/bucketdb/bucketmanager.h
index 124acf1864c..8bd892e0f09 100644
--- a/storage/src/vespa/storage/bucketdb/bucketmanager.h
+++ b/storage/src/vespa/storage/bucketdb/bucketmanager.h
@@ -103,6 +103,8 @@ public:
/** Get info for given bucket (Used for whitebox testing) */
StorBucketDatabase::Entry getBucketInfo(const document::Bucket &id) const;
+ void force_db_sweep_and_metric_update() { updateMetrics(true); }
+
private:
friend struct BucketManagerTest;
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp b/storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp
index c7ee0ab97e0..63fec9f037f 100644
--- a/storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp
+++ b/storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp
@@ -1041,7 +1041,9 @@ void FileStorManager::initialize_bucket_databases_from_provider() {
const double elapsed = start_time.getElapsedTimeAsDouble();
LOG(info, "Completed listing of %zu buckets in %.2g milliseconds", bucket_count, elapsed);
_metrics->bucket_db_init_latency.addValue(elapsed);
+}
+void FileStorManager::complete_internal_initialization() {
update_reported_state_after_db_init();
_init_handler.notifyDoneInitializing();
}
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestormanager.h b/storage/src/vespa/storage/persistence/filestorage/filestormanager.h
index 83f1826c498..6820ed6f451 100644
--- a/storage/src/vespa/storage/persistence/filestorage/filestormanager.h
+++ b/storage/src/vespa/storage/persistence/filestorage/filestormanager.h
@@ -110,7 +110,12 @@ public:
// By ensuring that this function is called prior to chain opening, this invariant
// shall be upheld since no RPC/MessageBus endpoints have been made available
// yet at that point in time.
+ // Must always be called _before_ complete_internal_initialization()
void initialize_bucket_databases_from_provider();
+ // Tag node internally as having completed initialization. Updates reported state
+ // (although this will not be communicated out of the process until the
+ // CommunicationManager thread has been fired up).
+ void complete_internal_initialization();
const FileStorMetrics& get_metrics() const { return *_metrics; }
diff --git a/storage/src/vespa/storage/storageserver/servicelayernode.cpp b/storage/src/vespa/storage/storageserver/servicelayernode.cpp
index 1cb2334803e..b76e2cca02a 100644
--- a/storage/src/vespa/storage/storageserver/servicelayernode.cpp
+++ b/storage/src/vespa/storage/storageserver/servicelayernode.cpp
@@ -33,6 +33,7 @@ ServiceLayerNode::ServiceLayerNode(const config::ConfigUri & configUri, ServiceL
_context(context),
_persistenceProvider(persistenceProvider),
_externalVisitors(externalVisitors),
+ _bucket_manager(nullptr),
_fileStorManager(nullptr),
_init_has_been_called(false)
{
@@ -160,7 +161,9 @@ ServiceLayerNode::createChain(IStorageChainBuilder &builder)
auto merge_throttler = merge_throttler_up.get();
builder.add(std::move(merge_throttler_up));
builder.add(std::make_unique<ChangedBucketOwnershipHandler>(_configUri, compReg));
- builder.add(std::make_unique<BucketManager>(_configUri, _context.getComponentRegister()));
+ auto bucket_manager = std::make_unique<BucketManager>(_configUri, _context.getComponentRegister());
+ _bucket_manager = bucket_manager.get();
+ builder.add(std::move(bucket_manager));
builder.add(std::make_unique<VisitorManager>(_configUri, _context.getComponentRegister(), static_cast<VisitorMessageSessionFactory &>(*this), _externalVisitors));
builder.add(std::make_unique<ModifiedBucketChecker>(
_context.getComponentRegister(), _persistenceProvider, _configUri));
@@ -186,7 +189,20 @@ ServiceLayerNode::pause()
void ServiceLayerNode::perform_post_chain_creation_init_steps() {
assert(_fileStorManager);
+ assert(_bucket_manager);
+ // After initialization, the node will immediately start communicating with the cluster
+ // controller, exchanging host info. This host info contains a subset snapshot of the active
+ // metrics, which includes the total bucket count, doc count etc. It is critical that
+ // we must never report back host info _prior_ to having run at least one full sweep of
+ // the bucket database, lest we risk transiently reporting zero buckets held by the
+ // content node. Doing so could cause orchestration logic to perform operations based
+ // on erroneous assumptions.
+ // To avoid this, we explicitly force a full DB sweep and metric update prior to reporting
+ // the node as up. Since this function is called prior to the CommunicationManager thread
+ // being started, any CC health pings should also always happen after this init step.
_fileStorManager->initialize_bucket_databases_from_provider();
+ _bucket_manager->force_db_sweep_and_metric_update();
+ _fileStorManager->complete_internal_initialization();
}
} // storage
diff --git a/storage/src/vespa/storage/storageserver/servicelayernode.h b/storage/src/vespa/storage/storageserver/servicelayernode.h
index f0189f943ab..4b0e0f73408 100644
--- a/storage/src/vespa/storage/storageserver/servicelayernode.h
+++ b/storage/src/vespa/storage/storageserver/servicelayernode.h
@@ -18,6 +18,7 @@ namespace storage {
namespace spi { struct PersistenceProvider; }
+class BucketManager;
class FileStorManager;
class ServiceLayerNode
@@ -31,6 +32,7 @@ class ServiceLayerNode
// FIXME: Should probably use the fetcher in StorageNode
std::unique_ptr<config::ConfigFetcher> _configFetcher;
+ BucketManager* _bucket_manager;
FileStorManager* _fileStorManager;
bool _init_has_been_called;