aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2020-07-03 13:19:12 +0200
committerGitHub <noreply@github.com>2020-07-03 13:19:12 +0200
commitbff64708d76b80ec125fa44c3c70df93b97ff22a (patch)
tree4afad579347d0f81e1b2e353f31acf174e3e9079
parent4279238c4fa15c1a1bb69830d626798f3a54924e (diff)
parent902fcbb77170f5799c60ae6b6be4710aa29dde10 (diff)
Merge pull request #13786 from vespa-engine/vekterli/unify-content-node-btree-databases
Unify content node and distributor B-tree databases
-rw-r--r--storage/src/tests/distributor/bucketdatabasetest.cpp4
-rw-r--r--storage/src/vespa/storage/bucketdb/abstract_bucket_map.h1
-rw-r--r--storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp527
-rw-r--r--storage/src/vespa/storage/bucketdb/btree_bucket_database.h65
-rw-r--r--storage/src/vespa/storage/bucketdb/btree_lockable_map.h2
-rw-r--r--storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp6
-rw-r--r--storage/src/vespa/storage/bucketdb/bucketdatabase.h119
-rw-r--r--storage/src/vespa/storage/bucketdb/db_merger.h111
-rw-r--r--storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp13
-rw-r--r--storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h130
-rw-r--r--storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp74
-rw-r--r--storage/src/vespa/storage/bucketdb/lockablemap.h2
-rw-r--r--storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp8
-rw-r--r--storage/src/vespa/storage/bucketdb/mapbucketdatabase.h4
-rw-r--r--storage/src/vespa/storage/bucketdb/read_guard.h46
-rw-r--r--storage/src/vespa/storage/distributor/bucketdbupdater.cpp4
-rw-r--r--storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp14
17 files changed, 365 insertions, 765 deletions
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp
index a0354c8ad4e..0b832699364 100644
--- a/storage/src/tests/distributor/bucketdatabasetest.cpp
+++ b/storage/src/tests/distributor/bucketdatabasetest.cpp
@@ -610,7 +610,7 @@ struct InsertBeforeBucketMergingProcessor : BucketDatabase::MergingProcessor {
Result merge(Merger& m) override {
if (m.bucket_id() == _before_bucket) {
// Assumes _before_bucket is > the inserted bucket
- m.insert_before_current(BucketDatabase::Entry(document::BucketId(16, 2), BI(2)));
+ m.insert_before_current(document::BucketId(16, 2), BucketDatabase::Entry(document::BucketId(16, 2), BI(2)));
}
return Result::KeepUnchanged;
}
@@ -622,7 +622,7 @@ struct InsertAtEndMergingProcessor : BucketDatabase::MergingProcessor {
}
void insert_remaining_at_end(TrailingInserter& inserter) override {
- inserter.insert_at_end(BucketDatabase::Entry(document::BucketId(16, 3), BI(3)));
+ inserter.insert_at_end(document::BucketId(16, 3), BucketDatabase::Entry(document::BucketId(16, 3), BI(3)));
}
};
diff --git a/storage/src/vespa/storage/bucketdb/abstract_bucket_map.h b/storage/src/vespa/storage/bucketdb/abstract_bucket_map.h
index 6c669beab1c..a6855532e30 100644
--- a/storage/src/vespa/storage/bucketdb/abstract_bucket_map.h
+++ b/storage/src/vespa/storage/bucketdb/abstract_bucket_map.h
@@ -151,7 +151,6 @@ public:
static constexpr uint32_t DEFAULT_CHUNK_SIZE = 1000;
-
/**
* Iterate over the entire database contents, holding the global database
* mutex for `chunkSize` processed entries at a time, yielding the current
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
index 42bd3a247bb..9634d6d0953 100644
--- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
@@ -1,22 +1,10 @@
// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "btree_bucket_database.h"
-#include <vespa/vespalib/btree/btreebuilder.h>
-#include <vespa/vespalib/btree/btreenodeallocator.hpp>
-#include <vespa/vespalib/btree/btreenode.hpp>
-#include <vespa/vespalib/btree/btreenodestore.hpp>
-#include <vespa/vespalib/btree/btreeiterator.hpp>
-#include <vespa/vespalib/btree/btreeroot.hpp>
-#include <vespa/vespalib/btree/btreebuilder.hpp>
-#include <vespa/vespalib/btree/btree.hpp>
-#include <vespa/vespalib/btree/btreestore.hpp>
+#include "generic_btree_bucket_database.hpp"
#include <vespa/vespalib/datastore/array_store.hpp>
#include <iostream>
-// TODO remove once this impl uses the generic bucket B-tree code!
-#include "generic_btree_bucket_database.h"
-#include <vespa/vespalib/datastore/datastore.h>
-
/*
* Buckets in our tree are represented by their 64-bit numeric key, in what's known as
* "reversed bit order with appended used-bits" form. I.e. a bucket ID (16, 0xcafe), which
@@ -74,232 +62,53 @@ uint64_t value_from(uint32_t gc_timestamp, EntryRef ref) {
return ((uint64_t(gc_timestamp) << 32u) | ref.ref());
}
-// TODO dedupe and unify common code
-uint8_t
-getMinDiffBits(uint16_t minBits, const document::BucketId& a, const document::BucketId& b) {
- for (uint32_t i = minBits; i <= std::min(a.getUsedBits(), b.getUsedBits()); i++) {
- document::BucketId a1(i, a.getRawId());
- document::BucketId b1(i, b.getRawId());
- if (b1.getId() != a1.getId()) {
- return i;
- }
- }
- return minBits;
}
-uint8_t next_parent_bit_seek_level(uint8_t minBits, const document::BucketId& a, const document::BucketId& b) {
- const uint8_t min_used = std::min(a.getUsedBits(), b.getUsedBits());
- assert(min_used >= minBits); // Always monotonically descending towards leaves
- for (uint32_t i = minBits; i <= min_used; i++) {
- document::BucketId a1(i, a.getRawId());
- document::BucketId b1(i, b.getRawId());
- if (b1.getId() != a1.getId()) {
- return i;
- }
- }
- // The bit prefix is equal, which means that one node is a parent of the other. In this
- // case we have to force the seek to continue from the next level in the tree.
- return std::max(min_used, minBits) + 1;
-}
+struct BTreeBucketDatabase::ReplicaValueTraits {
+ using ValueType = Entry;
+ using ConstValueRef = ConstEntryRef;
+ using DataStoreType = vespalib::datastore::ArrayStore<BucketCopy>;
-// TODO getMinDiffBits is hoisted from lockablemap.cpp, could probably be rewritten in terms of xor and MSB bit scan instr
-/*
- * 63 -------- ... -> 0
- * a: 1101111111 ... 0010
- * b: 1101110010 ... 0011
- * a ^ b: 0000001101 ... 0001
- * ^- diff bit = 57
- *
- * 63 - vespalib::Optimized::msbIdx(a ^ b) ==> 6
- *
- * what if a == b? special case? not a problem if we can prove this never happens.
- */
+ static ValueType make_invalid_value() {
+ return Entry::createInvalid();
+ }
+ static uint64_t wrap_and_store_value(DataStoreType& store, const Entry& entry) noexcept {
+ auto replicas_ref = store.add(entry.getBucketInfo().getRawNodes());
+ return value_from(entry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref);
+ }
+ static void remove_by_wrapped_value(DataStoreType& store, uint64_t value) noexcept {
+ store.remove(entry_ref_from_value(value));
+ }
+ static ValueType unwrap_from_key_value(const DataStoreType& store, uint64_t key, uint64_t value) {
+ const auto replicas_ref = store.get(entry_ref_from_value(value));
+ const auto bucket = BucketId(BucketId::keyToBucketId(key));
+ return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref);
+ }
+ static ConstValueRef unwrap_const_ref_from_key_value(const DataStoreType& store, uint64_t key, uint64_t value) {
+ const auto replicas_ref = store.get(entry_ref_from_value(value));
+ const auto bucket = BucketId(BucketId::keyToBucketId(key));
+ return const_entry_ref_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref);
+ }
+};
-}
+template class bucketdb::GenericBTreeBucketDatabase<BTreeBucketDatabase::ReplicaValueTraits>;
BTreeBucketDatabase::BTreeBucketDatabase()
- : _tree(),
- _store(make_default_array_store_config<ReplicaStore>()),
- _generation_handler()
+ : _impl(std::make_unique<ImplType>(make_default_array_store_config<ReplicaValueTraits::DataStoreType>()))
{
}
BTreeBucketDatabase::~BTreeBucketDatabase() = default;
-void BTreeBucketDatabase::commit_tree_changes() {
- // TODO break up and refactor
- // TODO verify semantics and usage
- // TODO make BTree wrapping API which abstracts away all this stuff via reader/writer interfaces
- _tree.getAllocator().freeze();
-
- auto current_gen = _generation_handler.getCurrentGeneration();
- _store.transferHoldLists(current_gen);
- _tree.getAllocator().transferHoldLists(current_gen);
-
- _generation_handler.incGeneration();
-
- auto used_gen = _generation_handler.getFirstUsedGeneration();
- _store.trimHoldLists(used_gen);
- _tree.getAllocator().trimHoldLists(used_gen);
-}
-
-Entry BTreeBucketDatabase::entry_from_value(uint64_t bucket_key, uint64_t value) const {
- const auto replicas_ref = _store.get(entry_ref_from_value(value));
- const auto bucket = BucketId(BucketId::keyToBucketId(bucket_key));
- return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref);
-}
-
-Entry BTreeBucketDatabase::entry_from_iterator(const BTree::ConstIterator& iter) const {
- if (!iter.valid()) {
- return Entry::createInvalid();
- }
- const auto value = iter.getData();
- std::atomic_thread_fence(std::memory_order_acquire);
- return entry_from_value(iter.getKey(), value);
-}
-
-ConstEntryRef BTreeBucketDatabase::const_entry_ref_from_iterator(const BTree::ConstIterator& iter) const {
- if (!iter.valid()) {
- return ConstEntryRef::createInvalid();
- }
- const auto value = iter.getData();
- std::atomic_thread_fence(std::memory_order_acquire);
- const auto replicas_ref = _store.get(entry_ref_from_value(value));
- const auto bucket = BucketId(BucketId::keyToBucketId(iter.getKey()));
- return const_entry_ref_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref);
-}
-
-BucketId BTreeBucketDatabase::bucket_from_valid_iterator(const BTree::ConstIterator& iter) const {
- return BucketId(BucketId::keyToBucketId(iter.getKey()));
-}
-
Entry BTreeBucketDatabase::get(const BucketId& bucket) const {
- return entry_from_iterator(_tree.find(bucket.toKey()));
+ return _impl->get(bucket);
}
void BTreeBucketDatabase::remove(const BucketId& bucket) {
- auto iter = _tree.find(bucket.toKey());
- if (!iter.valid()) {
- return;
- }
- const auto value = iter.getData();
- _store.remove(entry_ref_from_value(value));
- _tree.remove(iter);
- commit_tree_changes();
+ _impl->remove(bucket);
}
-/*
- * Finding the complete set of parents of a given bucket is not obvious how to
- * do efficiently, as we only know that the parents are ordered before their
- * children, but we do not a-priori know if any exist at all. The Judy DB impl
- * does O(b) explicit point lookups (where b is the number of used bits in the
- * bucket), starting at the leaf bit and working towards the root. To avoid
- * having to re-create iterators and perform a full tree search every time, we
- * turn this on its head and start from the root, progressing towards the leaf.
- * This allows us to reuse a single iterator and to continue seeking forwards
- * from its current position.
- *
- * To speed up the process of converging on the target bucket without needing
- * to check many unrelated subtrees, we let the underlying B-tree automatically
- * aggregate the min/max range of the used-bits of all contained bucket keys.
- * If we e.g. know that the minimum number of used bits in the DB is 16, we can
- * immediately seek to this level in the tree instead of working our way down
- * one bit at a time. By definition, no parents can exist above this level.
- * This is a very important optimization, as bucket trees are usually very well
- * balanced due to randomized distribution of data (combined with a cluster-wide
- * minimum tree level imposed by distribution bits). It is common that the minimum
- * number of used bits == max number of used bits, i.e. a totally even split.
- * This means that for a system without inconsistently split buckets (i.e. no
- * parents) we're highly likely to converge on the target bucket in a single seek.
- *
- * Algorithm:
- *
- * Core invariant: every subsequent iterator seek performed in this algorithm
- * is for a key that is strictly higher than the one the iterator is currently at.
- *
- * 1. Lbound seek to the lowest key that is known to exclude all already visited
- * parents. On the first iteration we use a bit count equal to the minimum number
- * of key used-bits in the entire DB, allowing us to potentially skip most subtrees.
- * 2. If the current node's key is greater than that of the requested bucket's key,
- * we've either descended to--or beyond--it in its own subtree or we've entered
- * a disjoint subtree. Since we know that all parents must sort before any given
- * child bucket, no more parents may be found at this point. Algorithm terminates.
- * 3. As the main body of the loop is entered, we know one of following must hold:
- * 3.1 The current node is an explicitly present parent of our bucket.
- * 3.2 The current node is contained in a left subtree branch of a parent that
- * does not have a bucket explicitly present in the tree. It cannot be in
- * a right subtree of any parent, as that would imply the node is ordered
- * _after_ our own bucket in an in-order traversal, which would contradict
- * the check in step 2 above.
- * 4. If the current node contains the requested bucket, we're at a parent
- * node of the bucket; add it to the result set.
- * If this is _not_ the case, we're in a different subtree. Example: the
- * requested bucket has a key whose MSB is 1 but the first bucket in the
- * tree has a key with an MSB of 0. Either way we need to update our search
- * key to home in on the target subtree where more parents may be found;
- * 5. Update the seek key to find the next possible parent. To ensure this key is
- * strictly greater than the iterator's current key we find the largest shared
- * prefix of bits in common between the current node's key and the requested
- * bucket's key. The prefix length + 1 is then the depth in the tree at which the
- * two subtrees branch off and diverge.
- * The new key is then the MSB prefix length + 1 requested bucket's key with a
- * matching number of used-bits set. Forward lbound-seek the iterator to this key.
- * `--> TODO elaborate on prefix semantics when they are equal wrt. min used bits
- * 6. Iff iterator is still valid, go to step 2
- *
- * This algorithm is able to skip through large parts of the tree in a sparsely populated
- * tree, but the number of seeks will trend towards O(b - min_bits) as with the legacy
- * implementation when a tree is densely populated (where `b` is the used-bits count of the
- * most specific node in the tree for the target bucket, and min_bits is the minimum number
- * of used-bits for any key in the database). This because all logical inner nodes in the tree
- * will have subtrees under them. Even in the worst case we should be more efficient than the
- * legacy Judy-based implementation since we've cut any dense search space in half for each
- * invocation of seek() on the iterator.
- */
-BTreeBucketDatabase::BTree::ConstIterator
-BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view,
- const document::BucketId& bucket,
- std::vector<Entry>& entries) const
-{
- const uint64_t bucket_key = bucket.toKey();
- if (frozen_view.empty()) {
- return frozen_view.begin(); // Will be invalid.
- }
- const auto min_db_bits = frozen_view.getAggregated().getMin();
- assert(min_db_bits >= static_cast<int32_t>(BucketId::minNumBits));
- assert(min_db_bits <= static_cast<int32_t>(BucketId::maxNumBits));
- // Start at the lowest possible tree level no parents can exist above,
- // descending towards the bucket itself.
- // Note: important to use getId() rather than getRawId(), as min_db_bits may be
- // greater than the used bits of the queried bucket. If we used the raw ID, we'd
- // end up looking at undefined bits.
- const auto first_key = BucketId(min_db_bits, bucket.getId()).toKey();
- auto iter = frozen_view.lowerBound(first_key);
- // Try skipping as many levels of the tree as possible as we go.
- uint32_t bits = min_db_bits;
- while (iter.valid() && (iter.getKey() < bucket_key)) {
- auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
- if (candidate.contains(bucket)) {
- assert(candidate.getUsedBits() >= bits);
- entries.emplace_back(entry_from_iterator(iter));
- }
- bits = next_parent_bit_seek_level(bits, candidate, bucket);
- const auto parent_key = BucketId(bits, bucket.getRawId()).toKey();
- assert(parent_key > iter.getKey());
- iter.seek(parent_key);
- }
- return iter;
-}
-
-void BTreeBucketDatabase::find_parents_and_self_internal(const BTree::FrozenView& frozen_view,
- const document::BucketId& bucket,
- std::vector<Entry>& entries) const
-{
- auto iter = find_parents_internal(frozen_view, bucket, entries);
- if (iter.valid() && iter.getKey() == bucket.toKey()) {
- entries.emplace_back(entry_from_iterator(iter));
- }
-}
+using bucketdb::ByValue;
/*
* Note: due to legacy API reasons, iff the requested bucket itself exists in the
@@ -309,212 +118,53 @@ void BTreeBucketDatabase::find_parents_and_self_internal(const BTree::FrozenView
void BTreeBucketDatabase::getParents(const BucketId& bucket,
std::vector<Entry>& entries) const
{
- auto view = _tree.getFrozenView();
- find_parents_and_self_internal(view, bucket, entries);
+ _impl->find_parents_and_self<ByValue>(bucket, [&entries]([[maybe_unused]] uint64_t key, Entry entry){
+ entries.emplace_back(std::move(entry));
+ });
}
void BTreeBucketDatabase::getAll(const BucketId& bucket,
std::vector<Entry>& entries) const
{
- auto view = _tree.getFrozenView();
- auto iter = find_parents_internal(view, bucket, entries);
- // `iter` is already pointing at, or beyond, one of the bucket's subtrees.
- for (; iter.valid(); ++iter) {
- auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
- if (bucket.contains(candidate)) {
- entries.emplace_back(entry_from_iterator(iter));
- } else {
- break;
- }
- }
+ _impl->find_parents_self_and_children<ByValue>(bucket, [&entries]([[maybe_unused]] uint64_t key, Entry entry){
+ entries.emplace_back(std::move(entry));
+ });
}
void BTreeBucketDatabase::update(const Entry& newEntry) {
assert(newEntry.valid());
- auto replicas_ref = _store.add(newEntry.getBucketInfo().getRawNodes());
- const auto new_value = value_from(newEntry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref);
- const auto bucket_key = newEntry.getBucketId().toKey();
- auto iter = _tree.lowerBound(bucket_key);
- if (iter.valid() && (iter.getKey() == bucket_key)) {
- _store.remove(entry_ref_from_value(iter.getData()));
- // In-place update of value; does not require tree structure modification
- std::atomic_thread_fence(std::memory_order_release); // Must ensure visibility when new array ref is observed
- iter.writeData(new_value);
- } else {
- _tree.insert(iter, bucket_key, new_value);
- }
- commit_tree_changes(); // TODO does publishing a new root imply an implicit memory fence?
+ _impl->update(newEntry.getBucketId(), newEntry);
}
// TODO need snapshot read with guarding
// FIXME semantics of for-each in judy and bit tree DBs differ, former expects lbound, latter ubound..!
// FIXME but bit-tree code says "lowerBound" in impl and "after" in declaration???
void BTreeBucketDatabase::forEach(EntryProcessor& proc, const BucketId& after) const {
- for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) {
- if (!proc.process(const_entry_ref_from_iterator(iter))) {
+ for (auto iter = _impl->upper_bound(after.toKey()); iter.valid(); ++iter) {
+ if (!proc.process(_impl->const_value_ref_from_valid_iterator(iter))) {
break;
}
}
}
-struct BTreeBuilderMerger final : BucketDatabase::Merger {
- BTreeBucketDatabase& _db;
- BTreeBucketDatabase::BTree::Builder& _builder;
- uint64_t _current_key;
- uint64_t _current_value;
- Entry _cached_entry;
- bool _valid_cached_entry;
-
- BTreeBuilderMerger(BTreeBucketDatabase& db,
- BTreeBucketDatabase::BTree::Builder& builder)
- : _db(db),
- _builder(builder),
- _current_key(0),
- _current_value(0),
- _cached_entry(),
- _valid_cached_entry(false)
- {}
- ~BTreeBuilderMerger() override = default;
-
- uint64_t bucket_key() const noexcept override {
- return _current_key;
- }
- BucketId bucket_id() const noexcept override {
- return BucketId(BucketId::keyToBucketId(_current_key));
- }
- Entry& current_entry() override {
- if (!_valid_cached_entry) {
- _cached_entry = _db.entry_from_value(_current_key, _current_value);
- _valid_cached_entry = true;
- }
- return _cached_entry;
- }
- void insert_before_current(const Entry& e) override {
- const uint64_t bucket_key = e.getBucketId().toKey();
- assert(bucket_key < _current_key);
-
- auto replicas_ref = _db._store.add(e.getBucketInfo().getRawNodes());
- const auto new_value = value_from(e.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref);
-
- _builder.insert(bucket_key, new_value);
- }
-
- void update_iteration_state(uint64_t key, uint64_t value) {
- _current_key = key;
- _current_value = value;
- _valid_cached_entry = false;
- }
-};
-
-struct BTreeTrailingInserter final : BucketDatabase::TrailingInserter {
- BTreeBucketDatabase& _db;
- BTreeBucketDatabase::BTree::Builder& _builder;
-
- BTreeTrailingInserter(BTreeBucketDatabase& db,
- BTreeBucketDatabase::BTree::Builder& builder)
- : _db(db),
- _builder(builder)
- {}
-
- ~BTreeTrailingInserter() override = default;
-
- void insert_at_end(const Entry& e) override {
- const uint64_t bucket_key = e.getBucketId().toKey();
- const auto replicas_ref = _db._store.add(e.getBucketInfo().getRawNodes());
- const auto new_value = value_from(e.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref);
- _builder.insert(bucket_key, new_value);
- }
-};
-
-// TODO lbound arg?
void BTreeBucketDatabase::merge(MergingProcessor& proc) {
- BTreeBucketDatabase::BTree::Builder builder(_tree.getAllocator());
- BTreeBuilderMerger merger(*this, builder);
-
- // TODO for_each instead?
- for (auto iter = _tree.begin(); iter.valid(); ++iter) {
- const uint64_t key = iter.getKey();
- const uint64_t value = iter.getData();
- merger.update_iteration_state(key, value);
-
- auto result = proc.merge(merger);
-
- if (result == MergingProcessor::Result::KeepUnchanged) {
- builder.insert(key, value); // Reuse array store ref with no changes
- } else if (result == MergingProcessor::Result::Update) {
- assert(merger._valid_cached_entry); // Must actually have been touched
- assert(merger._cached_entry.valid());
- _store.remove(entry_ref_from_value(value));
- auto new_replicas_ref = _store.add(merger._cached_entry.getBucketInfo().getRawNodes());
- const auto new_value = value_from(merger._cached_entry.getBucketInfo().getLastGarbageCollectionTime(), new_replicas_ref);
- builder.insert(key, new_value);
- } else if (result == MergingProcessor::Result::Skip) {
- _store.remove(entry_ref_from_value(value));
- } else {
- abort();
- }
- }
- BTreeTrailingInserter inserter(*this, builder);
- proc.insert_remaining_at_end(inserter);
-
- _tree.assign(builder);
- commit_tree_changes();
+ _impl->merge(proc);
}
Entry BTreeBucketDatabase::upperBound(const BucketId& bucket) const {
- return entry_from_iterator(_tree.upperBound(bucket.toKey()));
+ return _impl->entry_from_iterator(_impl->upper_bound(bucket.toKey()));
}
uint64_t BTreeBucketDatabase::size() const {
- return _tree.size();
+ return _impl->size();
}
void BTreeBucketDatabase::clear() {
- _tree.clear();
- commit_tree_changes();
+ _impl->clear();
}
-/*
- * Returns the bucket ID which, based on the buckets already existing in the DB,
- * is the most specific location in the tree in which it should reside. This may
- * or may not be a bucket that already exists.
- *
- * Example: if there is a single bucket (1, 1) in the tree, a query for (1, 1) or
- * (1, 3) will return (1, 1) as that is the most specific leaf in that subtree.
- * A query for (1, 0) will return (1, 0) even though this doesn't currently exist,
- * as there is no existing bucket that can contain the queried bucket. It is up to
- * the caller to create this bucket according to its needs.
- *
- * Usually this function will be called with an ID whose used-bits is at max (58), in
- * order to find a leaf bucket to route an incoming document operation to.
- *
- * TODO rename this function, it's very much _not_ obvious what an "appropriate" bucket is..!
- * TODO this should be possible to do concurrently
- */
BucketId BTreeBucketDatabase::getAppropriateBucket(uint16_t minBits, const BucketId& bid) {
- // The bucket tree is ordered in such a way that it represents a
- // natural in-order traversal of all buckets, with inner nodes being
- // visited before leaf nodes. This means that a lower bound seek will
- // never return a parent of a seeked bucket. The iterator will be pointing
- // to a bucket that is either the actual bucket given as the argument to
- // lowerBound() or the next in-order bucket (or end() if none exists).
- auto iter = _tree.lowerBound(bid.toKey());
- if (iter.valid()) {
- // Find the first level in the tree where the paths through the bucket tree
- // diverge for the target bucket and the current bucket.
- minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid);
- }
- // TODO is it better to copy original iterator and do begin() on the copy?
- auto first_iter = _tree.begin();
- // Original iterator might be in a different subtree than that of our
- // target bucket. If possible, rewind one node to discover any parent or
- // leftmost sibling of our node. If there's no such node, we'll still
- // discover the greatest equal bit prefix.
- if (iter != first_iter) {
- --iter;
- minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid);
- }
- return BucketId(minBits, bid.getRawId());
+ return _impl->getAppropriateBucket(minBits, bid);
}
/*
@@ -527,24 +177,7 @@ BucketId BTreeBucketDatabase::getAppropriateBucket(uint16_t minBits, const Bucke
*/
// TODO rename/clarify to indicate this is child _subtrees_, not explicit child _buckets_!
uint32_t BTreeBucketDatabase::childCount(const BucketId& bucket) const {
- assert(bucket.getUsedBits() < BucketId::maxNumBits);
- BucketId lhs_bucket(bucket.getUsedBits() + 1, bucket.getId());
- BucketId rhs_bucket(bucket.getUsedBits() + 1, (1ULL << bucket.getUsedBits()) | bucket.getId());
-
- auto iter = _tree.lowerBound(lhs_bucket.toKey());
- if (!iter.valid()) {
- return 0;
- }
- if (lhs_bucket.contains(bucket_from_valid_iterator(iter))) {
- iter.seek(rhs_bucket.toKey());
- if (!iter.valid()) {
- return 1; // lhs subtree only
- }
- return (rhs_bucket.contains(bucket_from_valid_iterator(iter)) ? 2 : 1);
- } else if (rhs_bucket.contains(bucket_from_valid_iterator(iter))) {
- return 1; // rhs subtree only
- }
- return 0;
+ return _impl->child_subtree_count(bucket);
}
void BTreeBucketDatabase::print(std::ostream& out, bool verbose,
@@ -556,15 +189,22 @@ void BTreeBucketDatabase::print(std::ostream& out, bool verbose,
}
vespalib::MemoryUsage BTreeBucketDatabase::memory_usage() const noexcept {
- auto mem_usage = _tree.getMemoryUsage();
- mem_usage.merge(_store.getMemoryUsage());
- return mem_usage;
+ return _impl->memory_usage();
}
+class BTreeBucketDatabase::ReadGuardImpl final : public bucketdb::ReadGuard<Entry> {
+ ImplType::ReadSnapshot _snapshot;
+public:
+ explicit ReadGuardImpl(const BTreeBucketDatabase& db);
+ ~ReadGuardImpl() override;
+
+ void find_parents_and_self(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const override;
+ [[nodiscard]] uint64_t generation() const noexcept override;
+};
+
BTreeBucketDatabase::ReadGuardImpl::ReadGuardImpl(const BTreeBucketDatabase& db)
- : _db(&db),
- _guard(_db->_generation_handler.takeGuard()),
- _frozen_view(_db->_tree.getFrozenView())
+ : _snapshot(*db._impl)
{}
BTreeBucketDatabase::ReadGuardImpl::~ReadGuardImpl() = default;
@@ -572,52 +212,17 @@ BTreeBucketDatabase::ReadGuardImpl::~ReadGuardImpl() = default;
void BTreeBucketDatabase::ReadGuardImpl::find_parents_and_self(const document::BucketId& bucket,
std::vector<Entry>& entries) const
{
- _db->find_parents_and_self_internal(_frozen_view, bucket, entries);
+ _snapshot.find_parents_and_self<ByValue>(bucket, [&entries]([[maybe_unused]] uint64_t key, Entry entry){
+ entries.emplace_back(std::move(entry));
+ });
}
uint64_t BTreeBucketDatabase::ReadGuardImpl::generation() const noexcept {
- return _guard.getGeneration();
+ return _snapshot.generation();
}
-// TODO replace existing distributor DB code with generic impl.
-// This is to ensure the generic implementation compiles with an ArrayStore backing in
-// the meantime.
-struct BTreeBucketDatabase2 {
- struct ReplicaValueTraits {
- using ValueType = Entry;
- using ConstValueRef = ConstEntryRef;
- using DataStoreType = vespalib::datastore::ArrayStore<BucketCopy>;
-
- static ValueType make_invalid_value() {
- return Entry::createInvalid();
- }
- static uint64_t wrap_and_store_value(DataStoreType& store, const Entry& entry) noexcept {
- auto replicas_ref = store.add(entry.getBucketInfo().getRawNodes());
- return value_from(entry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref);
- }
- static void remove_by_wrapped_value(DataStoreType& store, uint64_t value) noexcept {
- store.remove(entry_ref_from_value(value));
- }
- static ValueType unwrap_from_key_value(const DataStoreType& store, uint64_t key, uint64_t value) {
- const auto replicas_ref = store.get(entry_ref_from_value(value));
- const auto bucket = BucketId(BucketId::keyToBucketId(key));
- return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref);
- }
- static ConstValueRef unwrap_const_ref_from_key_value(const DataStoreType& store, uint64_t key, uint64_t value) {
- const auto replicas_ref = store.get(entry_ref_from_value(value));
- const auto bucket = BucketId(BucketId::keyToBucketId(key));
- return const_entry_ref_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref);
- }
- };
-
- using BTreeImpl = bucketdb::GenericBTreeBucketDatabase<ReplicaValueTraits>;
- BTreeImpl _impl;
-
- BTreeBucketDatabase2()
- : _impl(make_default_array_store_config<ReplicaValueTraits::DataStoreType>())
- {}
-};
-
-template class bucketdb::GenericBTreeBucketDatabase<BTreeBucketDatabase2::ReplicaValueTraits>;
+std::unique_ptr<bucketdb::ReadGuard<Entry>> BTreeBucketDatabase::acquire_read_guard() const {
+ return std::make_unique<ReadGuardImpl>(*this);
+}
}
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
index 0dfa2b07b8a..122c6eeb0fb 100644
--- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
@@ -3,13 +3,14 @@
#pragma once
#include "bucketdatabase.h"
-#include <vespa/vespalib/btree/btree.h>
-#include <vespa/vespalib/btree/minmaxaggregated.h>
-#include <vespa/vespalib/btree/minmaxaggrcalc.h>
-#include <vespa/vespalib/datastore/array_store.h>
+#include <memory>
namespace storage {
+namespace bucketdb {
+template <typename DataStoreTraitsT> class GenericBTreeBucketDatabase;
+}
+
/*
* Bucket database implementation built around lock-free single-writer/multiple-readers B+tree.
*
@@ -25,27 +26,9 @@ namespace storage {
*/
// TODO create and use a new DB interface with better bulk loading, snapshot and iteration support
class BTreeBucketDatabase : public BucketDatabase {
-
- struct KeyUsedBitsMinMaxAggrCalc : vespalib::btree::MinMaxAggrCalc {
- constexpr static bool aggregate_over_values() { return false; }
- constexpr static int32_t getVal(uint64_t key) noexcept {
- static_assert(document::BucketId::CountBits == 6u);
- return static_cast<int32_t>(key & 0b11'1111U); // 6 LSB of key contains used-bits
- }
- };
-
- // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref>
- using BTree = vespalib::btree::BTree<uint64_t, uint64_t,
- vespalib::btree::MinMaxAggregated,
- std::less<>,
- vespalib::btree::BTreeDefaultTraits,
- KeyUsedBitsMinMaxAggrCalc>;
- using ReplicaStore = vespalib::datastore::ArrayStore<BucketCopy>;
- using GenerationHandler = vespalib::GenerationHandler;
-
- BTree _tree;
- ReplicaStore _store;
- GenerationHandler _generation_handler;
+ struct ReplicaValueTraits;
+ using ImplType = bucketdb::GenericBTreeBucketDatabase<ReplicaValueTraits>;
+ std::unique_ptr<ImplType> _impl;
public:
BTreeBucketDatabase();
~BTreeBucketDatabase() override;
@@ -72,38 +55,10 @@ public:
const std::string& indent) const override;
private:
- Entry entry_from_value(uint64_t bucket_key, uint64_t value) const;
- Entry entry_from_iterator(const BTree::ConstIterator& iter) const;
- ConstEntryRef const_entry_ref_from_iterator(const BTree::ConstIterator& iter) const;
- document::BucketId bucket_from_valid_iterator(const BTree::ConstIterator& iter) const;
- void commit_tree_changes();
- BTree::ConstIterator find_parents_internal(const BTree::FrozenView& frozen_view,
- const document::BucketId& bucket,
- std::vector<Entry>& entries) const;
- void find_parents_and_self_internal(const BTree::FrozenView& frozen_view,
- const document::BucketId& bucket,
- std::vector<Entry>& entries) const;
-
- class ReadGuardImpl : public ReadGuard {
- const BTreeBucketDatabase* _db;
- GenerationHandler::Guard _guard;
- BTree::FrozenView _frozen_view;
- public:
- explicit ReadGuardImpl(const BTreeBucketDatabase& db);
- ~ReadGuardImpl() override;
-
- void find_parents_and_self(const document::BucketId& bucket,
- std::vector<Entry>& entries) const override;
- uint64_t generation() const noexcept override;
- };
-
+ class ReadGuardImpl;
friend class ReadGuardImpl;
- friend struct BTreeBuilderMerger;
- friend struct BTreeTrailingInserter;
public:
- std::unique_ptr<ReadGuard> acquire_read_guard() const override {
- return std::make_unique<ReadGuardImpl>(*this);
- }
+ std::unique_ptr<bucketdb::ReadGuard<Entry>> acquire_read_guard() const override;
vespalib::MemoryUsage memory_usage() const noexcept override;
};
diff --git a/storage/src/vespa/storage/bucketdb/btree_lockable_map.h b/storage/src/vespa/storage/bucketdb/btree_lockable_map.h
index 136baefb615..3d58b85b063 100644
--- a/storage/src/vespa/storage/bucketdb/btree_lockable_map.h
+++ b/storage/src/vespa/storage/bucketdb/btree_lockable_map.h
@@ -24,7 +24,7 @@ template <typename DataStoreTraitsT> class GenericBTreeBucketDatabase;
* Identical global and per-bucket locking semantics as LockableMap.
*/
template <typename T>
-class BTreeLockableMap : public AbstractBucketMap<T> {
+class BTreeLockableMap final : public AbstractBucketMap<T> {
struct ValueTraits;
public:
using ParentType = AbstractBucketMap<T>;
diff --git a/storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp b/storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp
index 9c7228ae21d..18eae405dc6 100644
--- a/storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp
+++ b/storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp
@@ -439,7 +439,7 @@ BTreeLockableMap<T>::getContained(const BucketId& bucket,
std::map<BucketId, WrappedEntry> results;
std::vector<BucketId::Type> keys;
- _impl->find_parents_and_self(bucket, [&keys](uint64_t key, [[maybe_unused]]const auto& value){
+ _impl->template find_parents_and_self<ByConstRef>(bucket, [&keys](uint64_t key, [[maybe_unused]]const auto& value){
keys.emplace_back(key);
});
@@ -454,7 +454,7 @@ template <typename T>
void BTreeLockableMap<T>::getAllWithoutLocking(const BucketId& bucket,
std::vector<BucketId::Type>& keys)
{
- _impl->find_parents_self_and_children(bucket, [&keys](uint64_t key, [[maybe_unused]]const auto& value){
+ _impl->template find_parents_self_and_children<ByConstRef>(bucket, [&keys](uint64_t key, [[maybe_unused]]const auto& value){
keys.emplace_back(key);
});
}
@@ -480,7 +480,7 @@ template <typename T>
bool BTreeLockableMap<T>::isConsistent(const BTreeLockableMap::WrappedEntry& entry) {
std::lock_guard guard(_lock);
uint64_t n_buckets = 0;
- _impl->find_parents_self_and_children(entry.getBucketId(),
+ _impl->template find_parents_self_and_children<ByConstRef>(entry.getBucketId(),
[&n_buckets]([[maybe_unused]] uint64_t key, [[maybe_unused]] const auto& value) {
++n_buckets;
});
diff --git a/storage/src/vespa/storage/bucketdb/bucketdatabase.h b/storage/src/vespa/storage/bucketdb/bucketdatabase.h
index 2dbcdd194ef..1ee165a739c 100644
--- a/storage/src/vespa/storage/bucketdb/bucketdatabase.h
+++ b/storage/src/vespa/storage/bucketdb/bucketdatabase.h
@@ -4,6 +4,8 @@
*/
#pragma once
+#include "db_merger.h"
+#include "read_guard.h"
#include <vespa/vespalib/util/printable.h>
#include <vespa/storage/bucketdb/bucketinfo.h>
#include <vespa/document/bucket/bucketid.h>
@@ -84,104 +86,9 @@ public:
EntryProcessor&,
const document::BucketId& after = document::BucketId()) const = 0;
- /**
- * Database implementation-specific interface for appending entries
- * during a merge() operation.
- */
- struct TrailingInserter {
- virtual ~TrailingInserter() = default;
- /**
- * Insert a new database entry at the end of the current bucket space.
- *
- * Precondition: the entry's bucket ID must sort after all entries that
- * have already been iterated over or inserted via insert_at_end().
- */
- virtual void insert_at_end(const Entry&) = 0;
- };
-
- /**
- * Database implementation-specific interface for accessing bucket
- * entries and prepending entries during a merge() operation.
- */
- struct Merger {
- virtual ~Merger() = default;
-
- // TODO this should ideally be separated into read/write functions, but this
- // will suffice for now to avoid too many changes.
-
- /**
- * Bucket key/ID of the currently iterated entry. Unless the information stored
- * in the DB Entry is needed, using one of these methods should be preferred to
- * getting the bucket ID via current_entry(). The underlying DB is expected to
- * have cheap access to the ID but _may_ have expensive access to the entry itself.
- */
- virtual uint64_t bucket_key() const noexcept = 0;
- virtual document::BucketId bucket_id() const noexcept = 0;
- /**
- * Returns a mutable representation of the currently iterated database
- * entry. If changes are made to this object, Result::Update must be
- * returned from merge(). Otherwise, mutation visibility is undefined.
- */
- virtual Entry& current_entry() = 0;
- /**
- * Insert a new entry into the bucket database that is ordered before the
- * currently iterated entry.
- *
- * Preconditions:
- * - The entry's bucket ID must sort _before_ the currently iterated
- * entry's bucket ID, in "reversed bits" bucket key order.
- * - The entry's bucket ID must sort _after_ any entries previously
- * inserted with insert_before_current().
- * - The entry's bucket ID must not be the same as a bucket that was
- * already iterated over as part of the DB merge() call or inserted
- * via a previous call to insert_before_current().
- * Such buckets must be handled by explicitly updating the provided
- * entry for the iterated bucket and returning Result::Update.
- */
- virtual void insert_before_current(const Entry&) = 0;
- };
-
- /**
- * Interface to be implemented by callers that wish to receive callbacks
- * during a bucket merge() operation.
- */
- struct MergingProcessor {
- // See merge() for semantics on enum values.
- enum class Result {
- Update,
- KeepUnchanged,
- Skip
- };
-
- virtual ~MergingProcessor() = default;
- /**
- * Invoked for each existing bucket in the database, in bucket key order.
- * The provided Merge instance may be used to access the current entry
- * and prepend entries to the DB.
- *
- * Return value semantics:
- * - Result::Update:
- * when merge() returns, the changes made to the current entry will
- * become visible in the bucket database.
- * - Result::KeepUnchanged:
- * when merge() returns, the entry will remain in the same state as
- * it was when merge() was originally called.
- * - Result::Skip:
- * when merge() returns, the entry will no longer be part of the DB.
- * Any entries added via insert_before_current() _will_ be present.
- *
- */
- virtual Result merge(Merger&) = 0;
- /**
- * Invoked once after all existing buckets have been iterated over.
- * The provided TrailingInserter instance may be used to append
- * an arbitrary number of entries to the database.
- *
- * This is used to handle elements remaining at the end of a linear
- * merge operation.
- */
- virtual void insert_remaining_at_end(TrailingInserter&) {}
- };
+ using TrailingInserter = bucketdb::TrailingInserter<Entry>;
+ using Merger = bucketdb::Merger<Entry>;
+ using MergingProcessor = bucketdb::MergingProcessor<Entry>;
/**
* Iterate over the bucket database in bucket key order, allowing an arbitrary
@@ -234,22 +141,10 @@ public:
virtual uint32_t childCount(const document::BucketId&) const = 0;
- struct ReadGuard {
- ReadGuard() = default;
- virtual ~ReadGuard() = default;
-
- ReadGuard(ReadGuard&&) = delete;
- ReadGuard& operator=(ReadGuard&&) = delete;
- ReadGuard(const ReadGuard&) = delete;
- ReadGuard& operator=(const ReadGuard&) = delete;
-
- virtual void find_parents_and_self(const document::BucketId& bucket,
- std::vector<Entry>& entries) const = 0;
- virtual uint64_t generation() const noexcept = 0;
- };
+ using ReadGuard = bucketdb::ReadGuard<Entry>;
virtual std::unique_ptr<ReadGuard> acquire_read_guard() const {
- return std::unique_ptr<ReadGuard>();
+ return std::unique_ptr<bucketdb::ReadGuard<Entry>>();
}
[[nodiscard]] virtual vespalib::MemoryUsage memory_usage() const noexcept = 0;
diff --git a/storage/src/vespa/storage/bucketdb/db_merger.h b/storage/src/vespa/storage/bucketdb/db_merger.h
new file mode 100644
index 00000000000..4bda7ecff4d
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/db_merger.h
@@ -0,0 +1,111 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+
+namespace storage::bucketdb {
+
+/**
+ * Database implementation-specific interface for appending entries
+ * during a merge() operation.
+ */
+template <typename ValueT>
+struct TrailingInserter {
+ virtual ~TrailingInserter() = default;
+ /**
+ * Insert a new database entry at the end of the current bucket space.
+ *
+ * Precondition: the bucket ID must sort after all entries that
+ * have already been iterated over or inserted via insert_at_end().
+ */
+ virtual void insert_at_end(const document::BucketId& bucket_id, const ValueT&) = 0;
+};
+
+/**
+ * Database implementation-specific interface for accessing bucket
+ * entries and prepending entries during a merge() operation.
+ */
+template <typename ValueT>
+struct Merger {
+ virtual ~Merger() = default;
+
+ // TODO this should ideally be separated into read/write functions, but this
+ // will suffice for now to avoid too many changes.
+
+ /**
+ * Bucket key/ID of the currently iterated entry. Unless the information stored
+ * in the DB Entry is needed, using one of these methods should be preferred to
+ * getting the bucket ID via current_entry(). The underlying DB is expected to
+ * have cheap access to the ID but _may_ have expensive access to the entry itself.
+ */
+ [[nodiscard]] virtual uint64_t bucket_key() const noexcept = 0;
+ [[nodiscard]] virtual document::BucketId bucket_id() const noexcept = 0;
+ /**
+ * Returns a mutable representation of the currently iterated database
+ * entry. If changes are made to this object, Result::Update must be
+ * returned from merge(). Otherwise, mutation visibility is undefined.
+ */
+ [[nodiscard]] virtual ValueT& current_entry() = 0;
+ /**
+ * Insert a new entry into the bucket database that is ordered before the
+ * currently iterated entry.
+ *
+ * Preconditions:
+ * - The bucket ID must sort _before_ the currently iterated
+ * entry's bucket ID, in "reversed bits" bucket key order.
+ * - The bucket ID must sort _after_ any entries previously
+ * inserted with insert_before_current().
+ * - The bucket ID must not be the same as a bucket that was
+ * already iterated over as part of the DB merge() call or inserted
+ * via a previous call to insert_before_current().
+ * Such buckets must be handled by explicitly updating the provided
+ * entry for the iterated bucket and returning Result::Update.
+ */
+ virtual void insert_before_current(const document::BucketId& bucket_id, const ValueT&) = 0;
+};
+
+/**
+ * Interface to be implemented by callers that wish to receive callbacks
+ * during a bucket merge() operation.
+ */
+template <typename ValueT>
+struct MergingProcessor {
+ // See merge() for semantics on enum values.
+ enum class Result {
+ Update,
+ KeepUnchanged,
+ Skip
+ };
+
+ virtual ~MergingProcessor() = default;
+ /**
+ * Invoked for each existing bucket in the database, in bucket key order.
+ * The provided Merge instance may be used to access the current entry
+ * and prepend entries to the DB.
+ *
+ * Return value semantics:
+ * - Result::Update:
+ * when merge() returns, the changes made to the current entry will
+ * become visible in the bucket database.
+ * - Result::KeepUnchanged:
+ * when merge() returns, the entry will remain in the same state as
+ * it was when merge() was originally called.
+ * - Result::Skip:
+ * when merge() returns, the entry will no longer be part of the DB.
+ * Any entries added via insert_before_current() _will_ be present.
+ *
+ */
+ virtual Result merge(Merger<ValueT>&) = 0;
+ /**
+ * Invoked once after all existing buckets have been iterated over.
+ * The provided TrailingInserter instance may be used to append
+ * an arbitrary number of entries to the database.
+ *
+ * This is used to handle elements remaining at the end of a linear
+ * merge operation.
+ */
+ virtual void insert_remaining_at_end(TrailingInserter<ValueT>&) {}
+};
+
+
+}
diff --git a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp
index bcc471cc903..b4b8c6e54b9 100644
--- a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp
+++ b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp
@@ -5,6 +5,19 @@ namespace storage::bucketdb {
using document::BucketId;
+// TODO getMinDiffBits is hoisted from lockablemap.cpp, could probably be rewritten in terms of xor and MSB bit scan instr
+/*
+ * 63 -------- ... -> 0
+ * a: 1101111111 ... 0010
+ * b: 1101110010 ... 0011
+ * a ^ b: 0000001101 ... 0001
+ * ^- diff bit = 57
+ *
+ * 63 - vespalib::Optimized::msbIdx(a ^ b) ==> 6
+ *
+ * what if a == b? special case? not a problem if we can prove this never happens.
+ */
+
// TODO dedupe and unify common code
uint8_t
getMinDiffBits(uint16_t minBits, const BucketId& a, const BucketId& b) {
diff --git a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h
index 15de7f3525b..8bc7a3379b3 100644
--- a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h
+++ b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h
@@ -1,6 +1,7 @@
// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
+#include "db_merger.h"
#include <vespa/document/bucket/bucketid.h>
#include <vespa/vespalib/btree/btree.h>
#include <vespa/vespalib/btree/minmaxaggregated.h>
@@ -8,108 +9,6 @@
namespace storage::bucketdb {
-/**
- * Database implementation-specific interface for appending entries
- * during a merge() operation.
- */
-template <typename ValueT>
-struct TrailingInserter {
- virtual ~TrailingInserter() = default;
- /**
- * Insert a new database entry at the end of the current bucket space.
- *
- * Precondition: the bucket ID must sort after all entries that
- * have already been iterated over or inserted via insert_at_end().
- */
- virtual void insert_at_end(const document::BucketId& bucket_id, const ValueT&) = 0;
-};
-
-/**
- * Database implementation-specific interface for accessing bucket
- * entries and prepending entries during a merge() operation.
- */
-template <typename ValueT>
-struct Merger {
- virtual ~Merger() = default;
-
- // TODO this should ideally be separated into read/write functions, but this
- // will suffice for now to avoid too many changes.
-
- /**
- * Bucket key/ID of the currently iterated entry. Unless the information stored
- * in the DB Entry is needed, using one of these methods should be preferred to
- * getting the bucket ID via current_entry(). The underlying DB is expected to
- * have cheap access to the ID but _may_ have expensive access to the entry itself.
- */
- [[nodiscard]] virtual uint64_t bucket_key() const noexcept = 0;
- [[nodiscard]] virtual document::BucketId bucket_id() const noexcept = 0;
- /**
- * Returns a mutable representation of the currently iterated database
- * entry. If changes are made to this object, Result::Update must be
- * returned from merge(). Otherwise, mutation visibility is undefined.
- */
- [[nodiscard]] virtual ValueT& current_entry() = 0;
- /**
- * Insert a new entry into the bucket database that is ordered before the
- * currently iterated entry.
- *
- * Preconditions:
- * - The bucket ID must sort _before_ the currently iterated
- * entry's bucket ID, in "reversed bits" bucket key order.
- * - The bucket ID must sort _after_ any entries previously
- * inserted with insert_before_current().
- * - The bucket ID must not be the same as a bucket that was
- * already iterated over as part of the DB merge() call or inserted
- * via a previous call to insert_before_current().
- * Such buckets must be handled by explicitly updating the provided
- * entry for the iterated bucket and returning Result::Update.
- */
- virtual void insert_before_current(const document::BucketId& bucket_id, const ValueT&) = 0;
-};
-
-/**
- * Interface to be implemented by callers that wish to receive callbacks
- * during a bucket merge() operation.
- */
-template <typename ValueT>
-struct MergingProcessor {
- // See merge() for semantics on enum values.
- enum class Result {
- Update,
- KeepUnchanged,
- Skip
- };
-
- virtual ~MergingProcessor() = default;
- /**
- * Invoked for each existing bucket in the database, in bucket key order.
- * The provided Merge instance may be used to access the current entry
- * and prepend entries to the DB.
- *
- * Return value semantics:
- * - Result::Update:
- * when merge() returns, the changes made to the current entry will
- * become visible in the bucket database.
- * - Result::KeepUnchanged:
- * when merge() returns, the entry will remain in the same state as
- * it was when merge() was originally called.
- * - Result::Skip:
- * when merge() returns, the entry will no longer be part of the DB.
- * Any entries added via insert_before_current() _will_ be present.
- *
- */
- virtual Result merge(Merger<ValueT>&) = 0;
- /**
- * Invoked once after all existing buckets have been iterated over.
- * The provided TrailingInserter instance may be used to append
- * an arbitrary number of entries to the database.
- *
- * This is used to handle elements remaining at the end of a linear
- * merge operation.
- */
- virtual void insert_remaining_at_end(TrailingInserter<ValueT>&) {}
-};
-
/*
* Bucket database implementation built around lock-free single-writer/multiple-readers B+tree.
*
@@ -186,6 +85,7 @@ public:
BTreeConstIterator find(uint64_t key) const noexcept;
BTreeConstIterator lower_bound(uint64_t key) const noexcept;
+ BTreeConstIterator upper_bound(uint64_t key) const noexcept;
BTreeConstIterator begin() const noexcept;
void clear() noexcept;
@@ -202,11 +102,11 @@ public:
bool update(const document::BucketId& bucket, const ValueType& new_entry);
bool update_by_raw_key(uint64_t bucket_key, const ValueType& new_entry);
- template <typename Func>
+ template <typename IterValueExtractor, typename Func>
void find_parents_and_self(const document::BucketId& bucket,
Func func) const;
- template <typename Func>
+ template <typename IterValueExtractor, typename Func>
void find_parents_self_and_children(const document::BucketId& bucket,
Func func) const;
@@ -220,13 +120,31 @@ public:
DataStoreType& store() noexcept { return _store; }
void merge(MergingProcessor<ValueType>& proc);
+
+ friend class ReadSnapshot;
+ // See ReadGuard class comments for semantics.
+ class ReadSnapshot {
+ const GenericBTreeBucketDatabase* _db;
+ vespalib::GenerationHandler::Guard _guard;
+ typename BTree::FrozenView _frozen_view;
+ public:
+ explicit ReadSnapshot(const GenericBTreeBucketDatabase& db);
+ ~ReadSnapshot();
+
+ ReadSnapshot(const ReadSnapshot&) = delete;
+ ReadSnapshot& operator=(const ReadSnapshot&) = delete;
+
+ template <typename IterValueExtractor, typename Func>
+ void find_parents_and_self(const document::BucketId& bucket, Func func) const;
+ [[nodiscard]] uint64_t generation() const noexcept;
+ };
private:
// Functor is called for each found element in key order, with raw u64 keys and values.
- template <typename Func>
+ template <typename IterValueExtractor, typename Func>
BTreeConstIterator find_parents_internal(const typename BTree::FrozenView& frozen_view,
const document::BucketId& bucket,
Func func) const;
- template <typename Func>
+ template <typename IterValueExtractor, typename Func>
void find_parents_and_self_internal(const typename BTree::FrozenView& frozen_view,
const document::BucketId& bucket,
Func func) const;
diff --git a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp
index 4b1b507d95a..adaf402e4d1 100644
--- a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp
+++ b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp
@@ -2,6 +2,15 @@
#pragma once
#include "generic_btree_bucket_database.h"
+#include <vespa/vespalib/btree/btreebuilder.h>
+#include <vespa/vespalib/btree/btreenodeallocator.hpp>
+#include <vespa/vespalib/btree/btreenode.hpp>
+#include <vespa/vespalib/btree/btreenodestore.hpp>
+#include <vespa/vespalib/btree/btreeiterator.hpp>
+#include <vespa/vespalib/btree/btreeroot.hpp>
+#include <vespa/vespalib/btree/btreebuilder.hpp>
+#include <vespa/vespalib/btree/btree.hpp>
+#include <vespa/vespalib/btree/btreestore.hpp>
namespace storage::bucketdb {
@@ -80,6 +89,12 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::lower_bound(uint64_t key) const no
template <typename DataStoreTraitsT>
typename GenericBTreeBucketDatabase<DataStoreTraitsT>::BTreeConstIterator
+GenericBTreeBucketDatabase<DataStoreTraitsT>::upper_bound(uint64_t key) const noexcept {
+ return _tree.upperBound(key);
+}
+
+template <typename DataStoreTraitsT>
+typename GenericBTreeBucketDatabase<DataStoreTraitsT>::BTreeConstIterator
GenericBTreeBucketDatabase<DataStoreTraitsT>::find(uint64_t key) const noexcept {
return _tree.find(key);
}
@@ -90,6 +105,20 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::begin() const noexcept {
return _tree.begin();
}
+struct ByValue {
+ template <typename DB, typename Iter>
+ static typename DB::ValueType apply(const DB& db, const Iter& iter) {
+ return db.entry_from_iterator(iter);
+ };
+};
+
+struct ByConstRef {
+ template <typename DB, typename Iter>
+ static typename DB::ConstValueRef apply(const DB& db, const Iter& iter) {
+ return db.const_value_ref_from_valid_iterator(iter);
+ };
+};
+
/*
* Finding the complete set of parents of a given bucket is not obvious how to
* do efficiently, as we only know that the parents are ordered before their
@@ -159,7 +188,7 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::begin() const noexcept {
* invocation of seek() on the iterator.
*/
template <typename DataStoreTraitsT>
-template <typename Func>
+template <typename IterValueExtractor, typename Func>
typename GenericBTreeBucketDatabase<DataStoreTraitsT>::BTreeConstIterator
GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_internal(
const typename BTree::FrozenView& frozen_view,
@@ -186,7 +215,7 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_internal(
auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
if (candidate.contains(bucket)) {
assert(candidate.getUsedBits() >= bits);
- func(iter.getKey(), const_value_ref_from_valid_iterator(iter));
+ func(iter.getKey(), IterValueExtractor::apply(*this, iter));
}
bits = next_parent_bit_seek_level(bits, candidate, bucket);
const auto parent_key = BucketId(bits, bucket.getRawId()).toKey();
@@ -197,41 +226,41 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_internal(
}
template <typename DataStoreTraitsT>
-template <typename Func>
+template <typename IterValueExtractor, typename Func>
void GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_and_self_internal(
const typename BTree::FrozenView& frozen_view,
const BucketId& bucket,
Func func) const
{
- auto iter = find_parents_internal(frozen_view, bucket, func);
+ auto iter = find_parents_internal<IterValueExtractor>(frozen_view, bucket, func);
if (iter.valid() && iter.getKey() == bucket.toKey()) {
- func(iter.getKey(), entry_from_iterator(iter));
+ func(iter.getKey(), IterValueExtractor::apply(*this, iter));
}
}
template <typename DataStoreTraitsT>
-template <typename Func>
+template <typename IterValueExtractor, typename Func>
void GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_and_self(
const document::BucketId& bucket,
Func func) const
{
auto view = _tree.getFrozenView();
- find_parents_and_self_internal(view, bucket, std::move(func));
+ find_parents_and_self_internal<IterValueExtractor>(view, bucket, std::move(func));
}
template <typename DataStoreTraitsT>
-template <typename Func>
+template <typename IterValueExtractor, typename Func>
void GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_self_and_children(
const BucketId& bucket,
Func func) const
{
auto view = _tree.getFrozenView();
- auto iter = find_parents_internal(view, bucket, func);
+ auto iter = find_parents_internal<IterValueExtractor>(view, bucket, func);
// `iter` is already pointing at, or beyond, one of the bucket's subtrees.
for (; iter.valid(); ++iter) {
auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
if (bucket.contains(candidate)) {
- func(iter.getKey(), entry_from_iterator(iter));
+ func(iter.getKey(), IterValueExtractor::apply(*this, iter));
} else {
break;
}
@@ -490,5 +519,30 @@ void GenericBTreeBucketDatabase<DataStoreTraitsT>::merge(MergingProcessor<ValueT
commit_tree_changes();
}
+template <typename DataStoreTraitsT>
+GenericBTreeBucketDatabase<DataStoreTraitsT>::ReadSnapshot::ReadSnapshot(
+ const GenericBTreeBucketDatabase<DataStoreTraitsT>& db)
+ : _db(&db),
+ _guard(_db->_generation_handler.takeGuard()),
+ _frozen_view(_db->_tree.getFrozenView())
+{
+}
+
+template <typename DataStoreTraitsT>
+GenericBTreeBucketDatabase<DataStoreTraitsT>::ReadSnapshot::~ReadSnapshot() = default;
+
+template <typename DataStoreTraitsT>
+template <typename IterValueExtractor, typename Func>
+void GenericBTreeBucketDatabase<DataStoreTraitsT>::ReadSnapshot::find_parents_and_self(
+ const BucketId& bucket,
+ Func func) const
+{
+ _db->find_parents_and_self_internal<IterValueExtractor>(_frozen_view, bucket, std::move(func));
+}
+
+template <typename DataStoreTraitsT>
+uint64_t GenericBTreeBucketDatabase<DataStoreTraitsT>::ReadSnapshot::generation() const noexcept {
+ return _guard.getGeneration();
+}
}
diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.h b/storage/src/vespa/storage/bucketdb/lockablemap.h
index 83ecac0a94f..66619a4f7e8 100644
--- a/storage/src/vespa/storage/bucketdb/lockablemap.h
+++ b/storage/src/vespa/storage/bucketdb/lockablemap.h
@@ -28,7 +28,7 @@
namespace storage {
template <typename Map>
-class LockableMap
+class LockableMap final
: public bucketdb::AbstractBucketMap<typename Map::mapped_type>
{
public:
diff --git a/storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp b/storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp
index edb808da294..7556b80b29c 100644
--- a/storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp
+++ b/storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp
@@ -414,7 +414,8 @@ struct MapDbMerger final : BucketDatabase::Merger {
BucketDatabase::Entry& current_entry() override {
return _current_entry;
}
- void insert_before_current(const BucketDatabase::Entry& e) override {
+ void insert_before_current([[maybe_unused]] const document::BucketId& bucket_id,
+ const BucketDatabase::Entry& e) override {
_to_insert.emplace_back(e); // TODO movable
}
};
@@ -423,7 +424,8 @@ struct MapDbTrailingInserter final : BucketDatabase::TrailingInserter {
MapBucketDatabase& _db;
explicit MapDbTrailingInserter(MapBucketDatabase& db) : _db(db) {}
- void insert_at_end(const BucketDatabase::Entry& e) override {
+ void insert_at_end([[maybe_unused]] const document::BucketId& bucket_id,
+ const BucketDatabase::Entry& e) override {
_db.update(e);
}
};
@@ -584,7 +586,7 @@ MapBucketDatabase::print(std::ostream& out, bool verbose,
out << ')';
}
-std::unique_ptr<BucketDatabase::ReadGuard> MapBucketDatabase::acquire_read_guard() const {
+std::unique_ptr<bucketdb::ReadGuard<BucketDatabase::Entry>> MapBucketDatabase::acquire_read_guard() const {
return std::make_unique<ReadGuardImpl>(*this);
}
diff --git a/storage/src/vespa/storage/bucketdb/mapbucketdatabase.h b/storage/src/vespa/storage/bucketdb/mapbucketdatabase.h
index e41b797a321..e8d5688068f 100644
--- a/storage/src/vespa/storage/bucketdb/mapbucketdatabase.h
+++ b/storage/src/vespa/storage/bucketdb/mapbucketdatabase.h
@@ -29,7 +29,7 @@ public:
document::BucketId getAppropriateBucket(uint16_t minBits, const document::BucketId& bid) override;
void print(std::ostream& out, bool verbose, const std::string& indent) const override;
- std::unique_ptr<ReadGuard> acquire_read_guard() const override;
+ std::unique_ptr<bucketdb::ReadGuard<Entry>> acquire_read_guard() const override;
vespalib::MemoryUsage memory_usage() const noexcept override;
private:
struct E {
@@ -66,7 +66,7 @@ private:
uint32_t childCountImpl(int index, uint8_t bitCount, const document::BucketId& b) const;
// NOT thread-safe for concurrent reads!
- class ReadGuardImpl : public ReadGuard {
+ class ReadGuardImpl final : public bucketdb::ReadGuard<Entry> {
const MapBucketDatabase* _db;
public:
explicit ReadGuardImpl(const MapBucketDatabase& db) : _db(&db) {}
diff --git a/storage/src/vespa/storage/bucketdb/read_guard.h b/storage/src/vespa/storage/bucketdb/read_guard.h
new file mode 100644
index 00000000000..cf37bafb0dd
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/read_guard.h
@@ -0,0 +1,46 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vector>
+
+namespace storage::bucketdb {
+
+/*
+ * Read guard for accessing the bucket tree of an underlying bucket database
+ * in a thread-safe, read-only manner.
+ *
+ * Important: If the underlying database is _not_ backed by a B-tree, the
+ * read guard does _not_ provide a stable view of the bucket key set when
+ * iterating, as that is not possible without locking the entire DB.
+ *
+ * If the guard is created by a B-tree DB, the following properties hold:
+ * - The set of bucket keys that can be iterated over is stable for the lifetime
+ * of the read guard.
+ * - The bucket _values_ may change during the lifetime of the read guard,
+ * but the reader will always observe a fully consistent value as if it were
+ * written atomically.
+ *
+ * Do not hold read guards for longer than absolutely necessary, as they cause
+ * memory to be retained by the backing DB until released.
+ */
+
+template <typename ValueT>
+class ReadGuard {
+public:
+ ReadGuard() = default;
+ virtual ~ReadGuard() = default;
+
+ ReadGuard(ReadGuard&&) = delete;
+ ReadGuard& operator=(ReadGuard&&) = delete;
+ ReadGuard(const ReadGuard&) = delete;
+ ReadGuard& operator=(const ReadGuard&) = delete;
+
+ virtual void find_parents_and_self(const document::BucketId& bucket,
+ std::vector<ValueT>& entries) const = 0;
+ // If the underlying guard represents a snapshot, returns its monotonically
+ // increasing generation. Otherwise returns 0.
+ [[nodiscard]] virtual uint64_t generation() const noexcept = 0;
+};
+
+}
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
index 35af71898e6..057b106f775 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
@@ -181,7 +181,7 @@ public:
if (key_at_cursor >= key_to_insert) {
break;
}
- m.insert_before_current(*_current);
+ m.insert_before_current(_current->getBucketId(), *_current);
++_current;
}
if ((_current != _last) && (key_at_cursor == key_to_insert)) {
@@ -201,7 +201,7 @@ public:
void insert_remaining_at_end(BucketDatabase::TrailingInserter& inserter) override {
for (; _current != _last; ++_current) {
- inserter.insert_at_end(*_current);
+ inserter.insert_at_end(_current->getBucketId(), *_current);
}
}
};
diff --git a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
index 13f9c21eed0..6983e3594af 100644
--- a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
+++ b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
@@ -193,11 +193,12 @@ void PendingBucketSpaceDbTransition::insert_remaining_at_end(BucketDatabase::Tra
void
PendingBucketSpaceDbTransition::addToMerger(BucketDatabase::Merger& merger, const Range& range)
{
+ const auto bucket_id = _entries[range.first].bucket_id();
LOG(spam, "Adding new bucket %s with %d copies",
- _entries[range.first].bucket_id().toString().c_str(),
+ bucket_id.toString().c_str(),
range.second - range.first);
- BucketDatabase::Entry e(_entries[range.first].bucket_id(), BucketInfo());
+ BucketDatabase::Entry e(bucket_id, BucketInfo());
insertInfo(e, range);
if (e->getLastGarbageCollectionTime() == 0) {
e->setLastGarbageCollectionTime(
@@ -205,18 +206,19 @@ PendingBucketSpaceDbTransition::addToMerger(BucketDatabase::Merger& merger, cons
.getSeconds().getTime());
}
e.getBucketInfo().updateTrusted();
- merger.insert_before_current(e);
+ merger.insert_before_current(bucket_id, e);
}
void
PendingBucketSpaceDbTransition::addToInserter(BucketDatabase::TrailingInserter& inserter, const Range& range)
{
// TODO dedupe
+ const auto bucket_id = _entries[range.first].bucket_id();
LOG(spam, "Adding new bucket %s with %d copies",
- _entries[range.first].bucket_id().toString().c_str(),
+ bucket_id.toString().c_str(),
range.second - range.first);
- BucketDatabase::Entry e(_entries[range.first].bucket_id(), BucketInfo());
+ BucketDatabase::Entry e(bucket_id, BucketInfo());
insertInfo(e, range);
if (e->getLastGarbageCollectionTime() == 0) {
e->setLastGarbageCollectionTime(
@@ -224,7 +226,7 @@ PendingBucketSpaceDbTransition::addToInserter(BucketDatabase::TrailingInserter&
.getSeconds().getTime());
}
e.getBucketInfo().updateTrusted();
- inserter.insert_at_end(e);
+ inserter.insert_at_end(bucket_id, e);
}
void