diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2020-07-03 13:19:12 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-03 13:19:12 +0200 |
commit | bff64708d76b80ec125fa44c3c70df93b97ff22a (patch) | |
tree | 4afad579347d0f81e1b2e353f31acf174e3e9079 | |
parent | 4279238c4fa15c1a1bb69830d626798f3a54924e (diff) | |
parent | 902fcbb77170f5799c60ae6b6be4710aa29dde10 (diff) |
Merge pull request #13786 from vespa-engine/vekterli/unify-content-node-btree-databases
Unify content node and distributor B-tree databases
17 files changed, 365 insertions, 765 deletions
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp index a0354c8ad4e..0b832699364 100644 --- a/storage/src/tests/distributor/bucketdatabasetest.cpp +++ b/storage/src/tests/distributor/bucketdatabasetest.cpp @@ -610,7 +610,7 @@ struct InsertBeforeBucketMergingProcessor : BucketDatabase::MergingProcessor { Result merge(Merger& m) override { if (m.bucket_id() == _before_bucket) { // Assumes _before_bucket is > the inserted bucket - m.insert_before_current(BucketDatabase::Entry(document::BucketId(16, 2), BI(2))); + m.insert_before_current(document::BucketId(16, 2), BucketDatabase::Entry(document::BucketId(16, 2), BI(2))); } return Result::KeepUnchanged; } @@ -622,7 +622,7 @@ struct InsertAtEndMergingProcessor : BucketDatabase::MergingProcessor { } void insert_remaining_at_end(TrailingInserter& inserter) override { - inserter.insert_at_end(BucketDatabase::Entry(document::BucketId(16, 3), BI(3))); + inserter.insert_at_end(document::BucketId(16, 3), BucketDatabase::Entry(document::BucketId(16, 3), BI(3))); } }; diff --git a/storage/src/vespa/storage/bucketdb/abstract_bucket_map.h b/storage/src/vespa/storage/bucketdb/abstract_bucket_map.h index 6c669beab1c..a6855532e30 100644 --- a/storage/src/vespa/storage/bucketdb/abstract_bucket_map.h +++ b/storage/src/vespa/storage/bucketdb/abstract_bucket_map.h @@ -151,7 +151,6 @@ public: static constexpr uint32_t DEFAULT_CHUNK_SIZE = 1000; - /** * Iterate over the entire database contents, holding the global database * mutex for `chunkSize` processed entries at a time, yielding the current diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp index 42bd3a247bb..9634d6d0953 100644 --- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp @@ -1,22 +1,10 @@ // Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "btree_bucket_database.h" -#include <vespa/vespalib/btree/btreebuilder.h> -#include <vespa/vespalib/btree/btreenodeallocator.hpp> -#include <vespa/vespalib/btree/btreenode.hpp> -#include <vespa/vespalib/btree/btreenodestore.hpp> -#include <vespa/vespalib/btree/btreeiterator.hpp> -#include <vespa/vespalib/btree/btreeroot.hpp> -#include <vespa/vespalib/btree/btreebuilder.hpp> -#include <vespa/vespalib/btree/btree.hpp> -#include <vespa/vespalib/btree/btreestore.hpp> +#include "generic_btree_bucket_database.hpp" #include <vespa/vespalib/datastore/array_store.hpp> #include <iostream> -// TODO remove once this impl uses the generic bucket B-tree code! -#include "generic_btree_bucket_database.h" -#include <vespa/vespalib/datastore/datastore.h> - /* * Buckets in our tree are represented by their 64-bit numeric key, in what's known as * "reversed bit order with appended used-bits" form. I.e. a bucket ID (16, 0xcafe), which @@ -74,232 +62,53 @@ uint64_t value_from(uint32_t gc_timestamp, EntryRef ref) { return ((uint64_t(gc_timestamp) << 32u) | ref.ref()); } -// TODO dedupe and unify common code -uint8_t -getMinDiffBits(uint16_t minBits, const document::BucketId& a, const document::BucketId& b) { - for (uint32_t i = minBits; i <= std::min(a.getUsedBits(), b.getUsedBits()); i++) { - document::BucketId a1(i, a.getRawId()); - document::BucketId b1(i, b.getRawId()); - if (b1.getId() != a1.getId()) { - return i; - } - } - return minBits; } -uint8_t next_parent_bit_seek_level(uint8_t minBits, const document::BucketId& a, const document::BucketId& b) { - const uint8_t min_used = std::min(a.getUsedBits(), b.getUsedBits()); - assert(min_used >= minBits); // Always monotonically descending towards leaves - for (uint32_t i = minBits; i <= min_used; i++) { - document::BucketId a1(i, a.getRawId()); - document::BucketId b1(i, b.getRawId()); - if (b1.getId() != a1.getId()) { - return i; - } - } - // The bit prefix is equal, which means that one node is a parent of the other. In this - // case we have to force the seek to continue from the next level in the tree. - return std::max(min_used, minBits) + 1; -} +struct BTreeBucketDatabase::ReplicaValueTraits { + using ValueType = Entry; + using ConstValueRef = ConstEntryRef; + using DataStoreType = vespalib::datastore::ArrayStore<BucketCopy>; -// TODO getMinDiffBits is hoisted from lockablemap.cpp, could probably be rewritten in terms of xor and MSB bit scan instr -/* - * 63 -------- ... -> 0 - * a: 1101111111 ... 0010 - * b: 1101110010 ... 0011 - * a ^ b: 0000001101 ... 0001 - * ^- diff bit = 57 - * - * 63 - vespalib::Optimized::msbIdx(a ^ b) ==> 6 - * - * what if a == b? special case? not a problem if we can prove this never happens. - */ + static ValueType make_invalid_value() { + return Entry::createInvalid(); + } + static uint64_t wrap_and_store_value(DataStoreType& store, const Entry& entry) noexcept { + auto replicas_ref = store.add(entry.getBucketInfo().getRawNodes()); + return value_from(entry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref); + } + static void remove_by_wrapped_value(DataStoreType& store, uint64_t value) noexcept { + store.remove(entry_ref_from_value(value)); + } + static ValueType unwrap_from_key_value(const DataStoreType& store, uint64_t key, uint64_t value) { + const auto replicas_ref = store.get(entry_ref_from_value(value)); + const auto bucket = BucketId(BucketId::keyToBucketId(key)); + return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref); + } + static ConstValueRef unwrap_const_ref_from_key_value(const DataStoreType& store, uint64_t key, uint64_t value) { + const auto replicas_ref = store.get(entry_ref_from_value(value)); + const auto bucket = BucketId(BucketId::keyToBucketId(key)); + return const_entry_ref_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref); + } +}; -} +template class bucketdb::GenericBTreeBucketDatabase<BTreeBucketDatabase::ReplicaValueTraits>; BTreeBucketDatabase::BTreeBucketDatabase() - : _tree(), - _store(make_default_array_store_config<ReplicaStore>()), - _generation_handler() + : _impl(std::make_unique<ImplType>(make_default_array_store_config<ReplicaValueTraits::DataStoreType>())) { } BTreeBucketDatabase::~BTreeBucketDatabase() = default; -void BTreeBucketDatabase::commit_tree_changes() { - // TODO break up and refactor - // TODO verify semantics and usage - // TODO make BTree wrapping API which abstracts away all this stuff via reader/writer interfaces - _tree.getAllocator().freeze(); - - auto current_gen = _generation_handler.getCurrentGeneration(); - _store.transferHoldLists(current_gen); - _tree.getAllocator().transferHoldLists(current_gen); - - _generation_handler.incGeneration(); - - auto used_gen = _generation_handler.getFirstUsedGeneration(); - _store.trimHoldLists(used_gen); - _tree.getAllocator().trimHoldLists(used_gen); -} - -Entry BTreeBucketDatabase::entry_from_value(uint64_t bucket_key, uint64_t value) const { - const auto replicas_ref = _store.get(entry_ref_from_value(value)); - const auto bucket = BucketId(BucketId::keyToBucketId(bucket_key)); - return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref); -} - -Entry BTreeBucketDatabase::entry_from_iterator(const BTree::ConstIterator& iter) const { - if (!iter.valid()) { - return Entry::createInvalid(); - } - const auto value = iter.getData(); - std::atomic_thread_fence(std::memory_order_acquire); - return entry_from_value(iter.getKey(), value); -} - -ConstEntryRef BTreeBucketDatabase::const_entry_ref_from_iterator(const BTree::ConstIterator& iter) const { - if (!iter.valid()) { - return ConstEntryRef::createInvalid(); - } - const auto value = iter.getData(); - std::atomic_thread_fence(std::memory_order_acquire); - const auto replicas_ref = _store.get(entry_ref_from_value(value)); - const auto bucket = BucketId(BucketId::keyToBucketId(iter.getKey())); - return const_entry_ref_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref); -} - -BucketId BTreeBucketDatabase::bucket_from_valid_iterator(const BTree::ConstIterator& iter) const { - return BucketId(BucketId::keyToBucketId(iter.getKey())); -} - Entry BTreeBucketDatabase::get(const BucketId& bucket) const { - return entry_from_iterator(_tree.find(bucket.toKey())); + return _impl->get(bucket); } void BTreeBucketDatabase::remove(const BucketId& bucket) { - auto iter = _tree.find(bucket.toKey()); - if (!iter.valid()) { - return; - } - const auto value = iter.getData(); - _store.remove(entry_ref_from_value(value)); - _tree.remove(iter); - commit_tree_changes(); + _impl->remove(bucket); } -/* - * Finding the complete set of parents of a given bucket is not obvious how to - * do efficiently, as we only know that the parents are ordered before their - * children, but we do not a-priori know if any exist at all. The Judy DB impl - * does O(b) explicit point lookups (where b is the number of used bits in the - * bucket), starting at the leaf bit and working towards the root. To avoid - * having to re-create iterators and perform a full tree search every time, we - * turn this on its head and start from the root, progressing towards the leaf. - * This allows us to reuse a single iterator and to continue seeking forwards - * from its current position. - * - * To speed up the process of converging on the target bucket without needing - * to check many unrelated subtrees, we let the underlying B-tree automatically - * aggregate the min/max range of the used-bits of all contained bucket keys. - * If we e.g. know that the minimum number of used bits in the DB is 16, we can - * immediately seek to this level in the tree instead of working our way down - * one bit at a time. By definition, no parents can exist above this level. - * This is a very important optimization, as bucket trees are usually very well - * balanced due to randomized distribution of data (combined with a cluster-wide - * minimum tree level imposed by distribution bits). It is common that the minimum - * number of used bits == max number of used bits, i.e. a totally even split. - * This means that for a system without inconsistently split buckets (i.e. no - * parents) we're highly likely to converge on the target bucket in a single seek. - * - * Algorithm: - * - * Core invariant: every subsequent iterator seek performed in this algorithm - * is for a key that is strictly higher than the one the iterator is currently at. - * - * 1. Lbound seek to the lowest key that is known to exclude all already visited - * parents. On the first iteration we use a bit count equal to the minimum number - * of key used-bits in the entire DB, allowing us to potentially skip most subtrees. - * 2. If the current node's key is greater than that of the requested bucket's key, - * we've either descended to--or beyond--it in its own subtree or we've entered - * a disjoint subtree. Since we know that all parents must sort before any given - * child bucket, no more parents may be found at this point. Algorithm terminates. - * 3. As the main body of the loop is entered, we know one of following must hold: - * 3.1 The current node is an explicitly present parent of our bucket. - * 3.2 The current node is contained in a left subtree branch of a parent that - * does not have a bucket explicitly present in the tree. It cannot be in - * a right subtree of any parent, as that would imply the node is ordered - * _after_ our own bucket in an in-order traversal, which would contradict - * the check in step 2 above. - * 4. If the current node contains the requested bucket, we're at a parent - * node of the bucket; add it to the result set. - * If this is _not_ the case, we're in a different subtree. Example: the - * requested bucket has a key whose MSB is 1 but the first bucket in the - * tree has a key with an MSB of 0. Either way we need to update our search - * key to home in on the target subtree where more parents may be found; - * 5. Update the seek key to find the next possible parent. To ensure this key is - * strictly greater than the iterator's current key we find the largest shared - * prefix of bits in common between the current node's key and the requested - * bucket's key. The prefix length + 1 is then the depth in the tree at which the - * two subtrees branch off and diverge. - * The new key is then the MSB prefix length + 1 requested bucket's key with a - * matching number of used-bits set. Forward lbound-seek the iterator to this key. - * `--> TODO elaborate on prefix semantics when they are equal wrt. min used bits - * 6. Iff iterator is still valid, go to step 2 - * - * This algorithm is able to skip through large parts of the tree in a sparsely populated - * tree, but the number of seeks will trend towards O(b - min_bits) as with the legacy - * implementation when a tree is densely populated (where `b` is the used-bits count of the - * most specific node in the tree for the target bucket, and min_bits is the minimum number - * of used-bits for any key in the database). This because all logical inner nodes in the tree - * will have subtrees under them. Even in the worst case we should be more efficient than the - * legacy Judy-based implementation since we've cut any dense search space in half for each - * invocation of seek() on the iterator. - */ -BTreeBucketDatabase::BTree::ConstIterator -BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view, - const document::BucketId& bucket, - std::vector<Entry>& entries) const -{ - const uint64_t bucket_key = bucket.toKey(); - if (frozen_view.empty()) { - return frozen_view.begin(); // Will be invalid. - } - const auto min_db_bits = frozen_view.getAggregated().getMin(); - assert(min_db_bits >= static_cast<int32_t>(BucketId::minNumBits)); - assert(min_db_bits <= static_cast<int32_t>(BucketId::maxNumBits)); - // Start at the lowest possible tree level no parents can exist above, - // descending towards the bucket itself. - // Note: important to use getId() rather than getRawId(), as min_db_bits may be - // greater than the used bits of the queried bucket. If we used the raw ID, we'd - // end up looking at undefined bits. - const auto first_key = BucketId(min_db_bits, bucket.getId()).toKey(); - auto iter = frozen_view.lowerBound(first_key); - // Try skipping as many levels of the tree as possible as we go. - uint32_t bits = min_db_bits; - while (iter.valid() && (iter.getKey() < bucket_key)) { - auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); - if (candidate.contains(bucket)) { - assert(candidate.getUsedBits() >= bits); - entries.emplace_back(entry_from_iterator(iter)); - } - bits = next_parent_bit_seek_level(bits, candidate, bucket); - const auto parent_key = BucketId(bits, bucket.getRawId()).toKey(); - assert(parent_key > iter.getKey()); - iter.seek(parent_key); - } - return iter; -} - -void BTreeBucketDatabase::find_parents_and_self_internal(const BTree::FrozenView& frozen_view, - const document::BucketId& bucket, - std::vector<Entry>& entries) const -{ - auto iter = find_parents_internal(frozen_view, bucket, entries); - if (iter.valid() && iter.getKey() == bucket.toKey()) { - entries.emplace_back(entry_from_iterator(iter)); - } -} +using bucketdb::ByValue; /* * Note: due to legacy API reasons, iff the requested bucket itself exists in the @@ -309,212 +118,53 @@ void BTreeBucketDatabase::find_parents_and_self_internal(const BTree::FrozenView void BTreeBucketDatabase::getParents(const BucketId& bucket, std::vector<Entry>& entries) const { - auto view = _tree.getFrozenView(); - find_parents_and_self_internal(view, bucket, entries); + _impl->find_parents_and_self<ByValue>(bucket, [&entries]([[maybe_unused]] uint64_t key, Entry entry){ + entries.emplace_back(std::move(entry)); + }); } void BTreeBucketDatabase::getAll(const BucketId& bucket, std::vector<Entry>& entries) const { - auto view = _tree.getFrozenView(); - auto iter = find_parents_internal(view, bucket, entries); - // `iter` is already pointing at, or beyond, one of the bucket's subtrees. - for (; iter.valid(); ++iter) { - auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); - if (bucket.contains(candidate)) { - entries.emplace_back(entry_from_iterator(iter)); - } else { - break; - } - } + _impl->find_parents_self_and_children<ByValue>(bucket, [&entries]([[maybe_unused]] uint64_t key, Entry entry){ + entries.emplace_back(std::move(entry)); + }); } void BTreeBucketDatabase::update(const Entry& newEntry) { assert(newEntry.valid()); - auto replicas_ref = _store.add(newEntry.getBucketInfo().getRawNodes()); - const auto new_value = value_from(newEntry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref); - const auto bucket_key = newEntry.getBucketId().toKey(); - auto iter = _tree.lowerBound(bucket_key); - if (iter.valid() && (iter.getKey() == bucket_key)) { - _store.remove(entry_ref_from_value(iter.getData())); - // In-place update of value; does not require tree structure modification - std::atomic_thread_fence(std::memory_order_release); // Must ensure visibility when new array ref is observed - iter.writeData(new_value); - } else { - _tree.insert(iter, bucket_key, new_value); - } - commit_tree_changes(); // TODO does publishing a new root imply an implicit memory fence? + _impl->update(newEntry.getBucketId(), newEntry); } // TODO need snapshot read with guarding // FIXME semantics of for-each in judy and bit tree DBs differ, former expects lbound, latter ubound..! // FIXME but bit-tree code says "lowerBound" in impl and "after" in declaration??? void BTreeBucketDatabase::forEach(EntryProcessor& proc, const BucketId& after) const { - for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) { - if (!proc.process(const_entry_ref_from_iterator(iter))) { + for (auto iter = _impl->upper_bound(after.toKey()); iter.valid(); ++iter) { + if (!proc.process(_impl->const_value_ref_from_valid_iterator(iter))) { break; } } } -struct BTreeBuilderMerger final : BucketDatabase::Merger { - BTreeBucketDatabase& _db; - BTreeBucketDatabase::BTree::Builder& _builder; - uint64_t _current_key; - uint64_t _current_value; - Entry _cached_entry; - bool _valid_cached_entry; - - BTreeBuilderMerger(BTreeBucketDatabase& db, - BTreeBucketDatabase::BTree::Builder& builder) - : _db(db), - _builder(builder), - _current_key(0), - _current_value(0), - _cached_entry(), - _valid_cached_entry(false) - {} - ~BTreeBuilderMerger() override = default; - - uint64_t bucket_key() const noexcept override { - return _current_key; - } - BucketId bucket_id() const noexcept override { - return BucketId(BucketId::keyToBucketId(_current_key)); - } - Entry& current_entry() override { - if (!_valid_cached_entry) { - _cached_entry = _db.entry_from_value(_current_key, _current_value); - _valid_cached_entry = true; - } - return _cached_entry; - } - void insert_before_current(const Entry& e) override { - const uint64_t bucket_key = e.getBucketId().toKey(); - assert(bucket_key < _current_key); - - auto replicas_ref = _db._store.add(e.getBucketInfo().getRawNodes()); - const auto new_value = value_from(e.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref); - - _builder.insert(bucket_key, new_value); - } - - void update_iteration_state(uint64_t key, uint64_t value) { - _current_key = key; - _current_value = value; - _valid_cached_entry = false; - } -}; - -struct BTreeTrailingInserter final : BucketDatabase::TrailingInserter { - BTreeBucketDatabase& _db; - BTreeBucketDatabase::BTree::Builder& _builder; - - BTreeTrailingInserter(BTreeBucketDatabase& db, - BTreeBucketDatabase::BTree::Builder& builder) - : _db(db), - _builder(builder) - {} - - ~BTreeTrailingInserter() override = default; - - void insert_at_end(const Entry& e) override { - const uint64_t bucket_key = e.getBucketId().toKey(); - const auto replicas_ref = _db._store.add(e.getBucketInfo().getRawNodes()); - const auto new_value = value_from(e.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref); - _builder.insert(bucket_key, new_value); - } -}; - -// TODO lbound arg? void BTreeBucketDatabase::merge(MergingProcessor& proc) { - BTreeBucketDatabase::BTree::Builder builder(_tree.getAllocator()); - BTreeBuilderMerger merger(*this, builder); - - // TODO for_each instead? - for (auto iter = _tree.begin(); iter.valid(); ++iter) { - const uint64_t key = iter.getKey(); - const uint64_t value = iter.getData(); - merger.update_iteration_state(key, value); - - auto result = proc.merge(merger); - - if (result == MergingProcessor::Result::KeepUnchanged) { - builder.insert(key, value); // Reuse array store ref with no changes - } else if (result == MergingProcessor::Result::Update) { - assert(merger._valid_cached_entry); // Must actually have been touched - assert(merger._cached_entry.valid()); - _store.remove(entry_ref_from_value(value)); - auto new_replicas_ref = _store.add(merger._cached_entry.getBucketInfo().getRawNodes()); - const auto new_value = value_from(merger._cached_entry.getBucketInfo().getLastGarbageCollectionTime(), new_replicas_ref); - builder.insert(key, new_value); - } else if (result == MergingProcessor::Result::Skip) { - _store.remove(entry_ref_from_value(value)); - } else { - abort(); - } - } - BTreeTrailingInserter inserter(*this, builder); - proc.insert_remaining_at_end(inserter); - - _tree.assign(builder); - commit_tree_changes(); + _impl->merge(proc); } Entry BTreeBucketDatabase::upperBound(const BucketId& bucket) const { - return entry_from_iterator(_tree.upperBound(bucket.toKey())); + return _impl->entry_from_iterator(_impl->upper_bound(bucket.toKey())); } uint64_t BTreeBucketDatabase::size() const { - return _tree.size(); + return _impl->size(); } void BTreeBucketDatabase::clear() { - _tree.clear(); - commit_tree_changes(); + _impl->clear(); } -/* - * Returns the bucket ID which, based on the buckets already existing in the DB, - * is the most specific location in the tree in which it should reside. This may - * or may not be a bucket that already exists. - * - * Example: if there is a single bucket (1, 1) in the tree, a query for (1, 1) or - * (1, 3) will return (1, 1) as that is the most specific leaf in that subtree. - * A query for (1, 0) will return (1, 0) even though this doesn't currently exist, - * as there is no existing bucket that can contain the queried bucket. It is up to - * the caller to create this bucket according to its needs. - * - * Usually this function will be called with an ID whose used-bits is at max (58), in - * order to find a leaf bucket to route an incoming document operation to. - * - * TODO rename this function, it's very much _not_ obvious what an "appropriate" bucket is..! - * TODO this should be possible to do concurrently - */ BucketId BTreeBucketDatabase::getAppropriateBucket(uint16_t minBits, const BucketId& bid) { - // The bucket tree is ordered in such a way that it represents a - // natural in-order traversal of all buckets, with inner nodes being - // visited before leaf nodes. This means that a lower bound seek will - // never return a parent of a seeked bucket. The iterator will be pointing - // to a bucket that is either the actual bucket given as the argument to - // lowerBound() or the next in-order bucket (or end() if none exists). - auto iter = _tree.lowerBound(bid.toKey()); - if (iter.valid()) { - // Find the first level in the tree where the paths through the bucket tree - // diverge for the target bucket and the current bucket. - minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid); - } - // TODO is it better to copy original iterator and do begin() on the copy? - auto first_iter = _tree.begin(); - // Original iterator might be in a different subtree than that of our - // target bucket. If possible, rewind one node to discover any parent or - // leftmost sibling of our node. If there's no such node, we'll still - // discover the greatest equal bit prefix. - if (iter != first_iter) { - --iter; - minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid); - } - return BucketId(minBits, bid.getRawId()); + return _impl->getAppropriateBucket(minBits, bid); } /* @@ -527,24 +177,7 @@ BucketId BTreeBucketDatabase::getAppropriateBucket(uint16_t minBits, const Bucke */ // TODO rename/clarify to indicate this is child _subtrees_, not explicit child _buckets_! uint32_t BTreeBucketDatabase::childCount(const BucketId& bucket) const { - assert(bucket.getUsedBits() < BucketId::maxNumBits); - BucketId lhs_bucket(bucket.getUsedBits() + 1, bucket.getId()); - BucketId rhs_bucket(bucket.getUsedBits() + 1, (1ULL << bucket.getUsedBits()) | bucket.getId()); - - auto iter = _tree.lowerBound(lhs_bucket.toKey()); - if (!iter.valid()) { - return 0; - } - if (lhs_bucket.contains(bucket_from_valid_iterator(iter))) { - iter.seek(rhs_bucket.toKey()); - if (!iter.valid()) { - return 1; // lhs subtree only - } - return (rhs_bucket.contains(bucket_from_valid_iterator(iter)) ? 2 : 1); - } else if (rhs_bucket.contains(bucket_from_valid_iterator(iter))) { - return 1; // rhs subtree only - } - return 0; + return _impl->child_subtree_count(bucket); } void BTreeBucketDatabase::print(std::ostream& out, bool verbose, @@ -556,15 +189,22 @@ void BTreeBucketDatabase::print(std::ostream& out, bool verbose, } vespalib::MemoryUsage BTreeBucketDatabase::memory_usage() const noexcept { - auto mem_usage = _tree.getMemoryUsage(); - mem_usage.merge(_store.getMemoryUsage()); - return mem_usage; + return _impl->memory_usage(); } +class BTreeBucketDatabase::ReadGuardImpl final : public bucketdb::ReadGuard<Entry> { + ImplType::ReadSnapshot _snapshot; +public: + explicit ReadGuardImpl(const BTreeBucketDatabase& db); + ~ReadGuardImpl() override; + + void find_parents_and_self(const document::BucketId& bucket, + std::vector<Entry>& entries) const override; + [[nodiscard]] uint64_t generation() const noexcept override; +}; + BTreeBucketDatabase::ReadGuardImpl::ReadGuardImpl(const BTreeBucketDatabase& db) - : _db(&db), - _guard(_db->_generation_handler.takeGuard()), - _frozen_view(_db->_tree.getFrozenView()) + : _snapshot(*db._impl) {} BTreeBucketDatabase::ReadGuardImpl::~ReadGuardImpl() = default; @@ -572,52 +212,17 @@ BTreeBucketDatabase::ReadGuardImpl::~ReadGuardImpl() = default; void BTreeBucketDatabase::ReadGuardImpl::find_parents_and_self(const document::BucketId& bucket, std::vector<Entry>& entries) const { - _db->find_parents_and_self_internal(_frozen_view, bucket, entries); + _snapshot.find_parents_and_self<ByValue>(bucket, [&entries]([[maybe_unused]] uint64_t key, Entry entry){ + entries.emplace_back(std::move(entry)); + }); } uint64_t BTreeBucketDatabase::ReadGuardImpl::generation() const noexcept { - return _guard.getGeneration(); + return _snapshot.generation(); } -// TODO replace existing distributor DB code with generic impl. -// This is to ensure the generic implementation compiles with an ArrayStore backing in -// the meantime. -struct BTreeBucketDatabase2 { - struct ReplicaValueTraits { - using ValueType = Entry; - using ConstValueRef = ConstEntryRef; - using DataStoreType = vespalib::datastore::ArrayStore<BucketCopy>; - - static ValueType make_invalid_value() { - return Entry::createInvalid(); - } - static uint64_t wrap_and_store_value(DataStoreType& store, const Entry& entry) noexcept { - auto replicas_ref = store.add(entry.getBucketInfo().getRawNodes()); - return value_from(entry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref); - } - static void remove_by_wrapped_value(DataStoreType& store, uint64_t value) noexcept { - store.remove(entry_ref_from_value(value)); - } - static ValueType unwrap_from_key_value(const DataStoreType& store, uint64_t key, uint64_t value) { - const auto replicas_ref = store.get(entry_ref_from_value(value)); - const auto bucket = BucketId(BucketId::keyToBucketId(key)); - return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref); - } - static ConstValueRef unwrap_const_ref_from_key_value(const DataStoreType& store, uint64_t key, uint64_t value) { - const auto replicas_ref = store.get(entry_ref_from_value(value)); - const auto bucket = BucketId(BucketId::keyToBucketId(key)); - return const_entry_ref_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref); - } - }; - - using BTreeImpl = bucketdb::GenericBTreeBucketDatabase<ReplicaValueTraits>; - BTreeImpl _impl; - - BTreeBucketDatabase2() - : _impl(make_default_array_store_config<ReplicaValueTraits::DataStoreType>()) - {} -}; - -template class bucketdb::GenericBTreeBucketDatabase<BTreeBucketDatabase2::ReplicaValueTraits>; +std::unique_ptr<bucketdb::ReadGuard<Entry>> BTreeBucketDatabase::acquire_read_guard() const { + return std::make_unique<ReadGuardImpl>(*this); +} } diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h index 0dfa2b07b8a..122c6eeb0fb 100644 --- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h @@ -3,13 +3,14 @@ #pragma once #include "bucketdatabase.h" -#include <vespa/vespalib/btree/btree.h> -#include <vespa/vespalib/btree/minmaxaggregated.h> -#include <vespa/vespalib/btree/minmaxaggrcalc.h> -#include <vespa/vespalib/datastore/array_store.h> +#include <memory> namespace storage { +namespace bucketdb { +template <typename DataStoreTraitsT> class GenericBTreeBucketDatabase; +} + /* * Bucket database implementation built around lock-free single-writer/multiple-readers B+tree. * @@ -25,27 +26,9 @@ namespace storage { */ // TODO create and use a new DB interface with better bulk loading, snapshot and iteration support class BTreeBucketDatabase : public BucketDatabase { - - struct KeyUsedBitsMinMaxAggrCalc : vespalib::btree::MinMaxAggrCalc { - constexpr static bool aggregate_over_values() { return false; } - constexpr static int32_t getVal(uint64_t key) noexcept { - static_assert(document::BucketId::CountBits == 6u); - return static_cast<int32_t>(key & 0b11'1111U); // 6 LSB of key contains used-bits - } - }; - - // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref> - using BTree = vespalib::btree::BTree<uint64_t, uint64_t, - vespalib::btree::MinMaxAggregated, - std::less<>, - vespalib::btree::BTreeDefaultTraits, - KeyUsedBitsMinMaxAggrCalc>; - using ReplicaStore = vespalib::datastore::ArrayStore<BucketCopy>; - using GenerationHandler = vespalib::GenerationHandler; - - BTree _tree; - ReplicaStore _store; - GenerationHandler _generation_handler; + struct ReplicaValueTraits; + using ImplType = bucketdb::GenericBTreeBucketDatabase<ReplicaValueTraits>; + std::unique_ptr<ImplType> _impl; public: BTreeBucketDatabase(); ~BTreeBucketDatabase() override; @@ -72,38 +55,10 @@ public: const std::string& indent) const override; private: - Entry entry_from_value(uint64_t bucket_key, uint64_t value) const; - Entry entry_from_iterator(const BTree::ConstIterator& iter) const; - ConstEntryRef const_entry_ref_from_iterator(const BTree::ConstIterator& iter) const; - document::BucketId bucket_from_valid_iterator(const BTree::ConstIterator& iter) const; - void commit_tree_changes(); - BTree::ConstIterator find_parents_internal(const BTree::FrozenView& frozen_view, - const document::BucketId& bucket, - std::vector<Entry>& entries) const; - void find_parents_and_self_internal(const BTree::FrozenView& frozen_view, - const document::BucketId& bucket, - std::vector<Entry>& entries) const; - - class ReadGuardImpl : public ReadGuard { - const BTreeBucketDatabase* _db; - GenerationHandler::Guard _guard; - BTree::FrozenView _frozen_view; - public: - explicit ReadGuardImpl(const BTreeBucketDatabase& db); - ~ReadGuardImpl() override; - - void find_parents_and_self(const document::BucketId& bucket, - std::vector<Entry>& entries) const override; - uint64_t generation() const noexcept override; - }; - + class ReadGuardImpl; friend class ReadGuardImpl; - friend struct BTreeBuilderMerger; - friend struct BTreeTrailingInserter; public: - std::unique_ptr<ReadGuard> acquire_read_guard() const override { - return std::make_unique<ReadGuardImpl>(*this); - } + std::unique_ptr<bucketdb::ReadGuard<Entry>> acquire_read_guard() const override; vespalib::MemoryUsage memory_usage() const noexcept override; }; diff --git a/storage/src/vespa/storage/bucketdb/btree_lockable_map.h b/storage/src/vespa/storage/bucketdb/btree_lockable_map.h index 136baefb615..3d58b85b063 100644 --- a/storage/src/vespa/storage/bucketdb/btree_lockable_map.h +++ b/storage/src/vespa/storage/bucketdb/btree_lockable_map.h @@ -24,7 +24,7 @@ template <typename DataStoreTraitsT> class GenericBTreeBucketDatabase; * Identical global and per-bucket locking semantics as LockableMap. */ template <typename T> -class BTreeLockableMap : public AbstractBucketMap<T> { +class BTreeLockableMap final : public AbstractBucketMap<T> { struct ValueTraits; public: using ParentType = AbstractBucketMap<T>; diff --git a/storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp b/storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp index 9c7228ae21d..18eae405dc6 100644 --- a/storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp +++ b/storage/src/vespa/storage/bucketdb/btree_lockable_map.hpp @@ -439,7 +439,7 @@ BTreeLockableMap<T>::getContained(const BucketId& bucket, std::map<BucketId, WrappedEntry> results; std::vector<BucketId::Type> keys; - _impl->find_parents_and_self(bucket, [&keys](uint64_t key, [[maybe_unused]]const auto& value){ + _impl->template find_parents_and_self<ByConstRef>(bucket, [&keys](uint64_t key, [[maybe_unused]]const auto& value){ keys.emplace_back(key); }); @@ -454,7 +454,7 @@ template <typename T> void BTreeLockableMap<T>::getAllWithoutLocking(const BucketId& bucket, std::vector<BucketId::Type>& keys) { - _impl->find_parents_self_and_children(bucket, [&keys](uint64_t key, [[maybe_unused]]const auto& value){ + _impl->template find_parents_self_and_children<ByConstRef>(bucket, [&keys](uint64_t key, [[maybe_unused]]const auto& value){ keys.emplace_back(key); }); } @@ -480,7 +480,7 @@ template <typename T> bool BTreeLockableMap<T>::isConsistent(const BTreeLockableMap::WrappedEntry& entry) { std::lock_guard guard(_lock); uint64_t n_buckets = 0; - _impl->find_parents_self_and_children(entry.getBucketId(), + _impl->template find_parents_self_and_children<ByConstRef>(entry.getBucketId(), [&n_buckets]([[maybe_unused]] uint64_t key, [[maybe_unused]] const auto& value) { ++n_buckets; }); diff --git a/storage/src/vespa/storage/bucketdb/bucketdatabase.h b/storage/src/vespa/storage/bucketdb/bucketdatabase.h index 2dbcdd194ef..1ee165a739c 100644 --- a/storage/src/vespa/storage/bucketdb/bucketdatabase.h +++ b/storage/src/vespa/storage/bucketdb/bucketdatabase.h @@ -4,6 +4,8 @@ */ #pragma once +#include "db_merger.h" +#include "read_guard.h" #include <vespa/vespalib/util/printable.h> #include <vespa/storage/bucketdb/bucketinfo.h> #include <vespa/document/bucket/bucketid.h> @@ -84,104 +86,9 @@ public: EntryProcessor&, const document::BucketId& after = document::BucketId()) const = 0; - /** - * Database implementation-specific interface for appending entries - * during a merge() operation. - */ - struct TrailingInserter { - virtual ~TrailingInserter() = default; - /** - * Insert a new database entry at the end of the current bucket space. - * - * Precondition: the entry's bucket ID must sort after all entries that - * have already been iterated over or inserted via insert_at_end(). - */ - virtual void insert_at_end(const Entry&) = 0; - }; - - /** - * Database implementation-specific interface for accessing bucket - * entries and prepending entries during a merge() operation. - */ - struct Merger { - virtual ~Merger() = default; - - // TODO this should ideally be separated into read/write functions, but this - // will suffice for now to avoid too many changes. - - /** - * Bucket key/ID of the currently iterated entry. Unless the information stored - * in the DB Entry is needed, using one of these methods should be preferred to - * getting the bucket ID via current_entry(). The underlying DB is expected to - * have cheap access to the ID but _may_ have expensive access to the entry itself. - */ - virtual uint64_t bucket_key() const noexcept = 0; - virtual document::BucketId bucket_id() const noexcept = 0; - /** - * Returns a mutable representation of the currently iterated database - * entry. If changes are made to this object, Result::Update must be - * returned from merge(). Otherwise, mutation visibility is undefined. - */ - virtual Entry& current_entry() = 0; - /** - * Insert a new entry into the bucket database that is ordered before the - * currently iterated entry. - * - * Preconditions: - * - The entry's bucket ID must sort _before_ the currently iterated - * entry's bucket ID, in "reversed bits" bucket key order. - * - The entry's bucket ID must sort _after_ any entries previously - * inserted with insert_before_current(). - * - The entry's bucket ID must not be the same as a bucket that was - * already iterated over as part of the DB merge() call or inserted - * via a previous call to insert_before_current(). - * Such buckets must be handled by explicitly updating the provided - * entry for the iterated bucket and returning Result::Update. - */ - virtual void insert_before_current(const Entry&) = 0; - }; - - /** - * Interface to be implemented by callers that wish to receive callbacks - * during a bucket merge() operation. - */ - struct MergingProcessor { - // See merge() for semantics on enum values. - enum class Result { - Update, - KeepUnchanged, - Skip - }; - - virtual ~MergingProcessor() = default; - /** - * Invoked for each existing bucket in the database, in bucket key order. - * The provided Merge instance may be used to access the current entry - * and prepend entries to the DB. - * - * Return value semantics: - * - Result::Update: - * when merge() returns, the changes made to the current entry will - * become visible in the bucket database. - * - Result::KeepUnchanged: - * when merge() returns, the entry will remain in the same state as - * it was when merge() was originally called. - * - Result::Skip: - * when merge() returns, the entry will no longer be part of the DB. - * Any entries added via insert_before_current() _will_ be present. - * - */ - virtual Result merge(Merger&) = 0; - /** - * Invoked once after all existing buckets have been iterated over. - * The provided TrailingInserter instance may be used to append - * an arbitrary number of entries to the database. - * - * This is used to handle elements remaining at the end of a linear - * merge operation. - */ - virtual void insert_remaining_at_end(TrailingInserter&) {} - }; + using TrailingInserter = bucketdb::TrailingInserter<Entry>; + using Merger = bucketdb::Merger<Entry>; + using MergingProcessor = bucketdb::MergingProcessor<Entry>; /** * Iterate over the bucket database in bucket key order, allowing an arbitrary @@ -234,22 +141,10 @@ public: virtual uint32_t childCount(const document::BucketId&) const = 0; - struct ReadGuard { - ReadGuard() = default; - virtual ~ReadGuard() = default; - - ReadGuard(ReadGuard&&) = delete; - ReadGuard& operator=(ReadGuard&&) = delete; - ReadGuard(const ReadGuard&) = delete; - ReadGuard& operator=(const ReadGuard&) = delete; - - virtual void find_parents_and_self(const document::BucketId& bucket, - std::vector<Entry>& entries) const = 0; - virtual uint64_t generation() const noexcept = 0; - }; + using ReadGuard = bucketdb::ReadGuard<Entry>; virtual std::unique_ptr<ReadGuard> acquire_read_guard() const { - return std::unique_ptr<ReadGuard>(); + return std::unique_ptr<bucketdb::ReadGuard<Entry>>(); } [[nodiscard]] virtual vespalib::MemoryUsage memory_usage() const noexcept = 0; diff --git a/storage/src/vespa/storage/bucketdb/db_merger.h b/storage/src/vespa/storage/bucketdb/db_merger.h new file mode 100644 index 00000000000..4bda7ecff4d --- /dev/null +++ b/storage/src/vespa/storage/bucketdb/db_merger.h @@ -0,0 +1,111 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/document/bucket/bucketid.h> + +namespace storage::bucketdb { + +/** + * Database implementation-specific interface for appending entries + * during a merge() operation. + */ +template <typename ValueT> +struct TrailingInserter { + virtual ~TrailingInserter() = default; + /** + * Insert a new database entry at the end of the current bucket space. + * + * Precondition: the bucket ID must sort after all entries that + * have already been iterated over or inserted via insert_at_end(). + */ + virtual void insert_at_end(const document::BucketId& bucket_id, const ValueT&) = 0; +}; + +/** + * Database implementation-specific interface for accessing bucket + * entries and prepending entries during a merge() operation. + */ +template <typename ValueT> +struct Merger { + virtual ~Merger() = default; + + // TODO this should ideally be separated into read/write functions, but this + // will suffice for now to avoid too many changes. + + /** + * Bucket key/ID of the currently iterated entry. Unless the information stored + * in the DB Entry is needed, using one of these methods should be preferred to + * getting the bucket ID via current_entry(). The underlying DB is expected to + * have cheap access to the ID but _may_ have expensive access to the entry itself. + */ + [[nodiscard]] virtual uint64_t bucket_key() const noexcept = 0; + [[nodiscard]] virtual document::BucketId bucket_id() const noexcept = 0; + /** + * Returns a mutable representation of the currently iterated database + * entry. If changes are made to this object, Result::Update must be + * returned from merge(). Otherwise, mutation visibility is undefined. + */ + [[nodiscard]] virtual ValueT& current_entry() = 0; + /** + * Insert a new entry into the bucket database that is ordered before the + * currently iterated entry. + * + * Preconditions: + * - The bucket ID must sort _before_ the currently iterated + * entry's bucket ID, in "reversed bits" bucket key order. + * - The bucket ID must sort _after_ any entries previously + * inserted with insert_before_current(). + * - The bucket ID must not be the same as a bucket that was + * already iterated over as part of the DB merge() call or inserted + * via a previous call to insert_before_current(). + * Such buckets must be handled by explicitly updating the provided + * entry for the iterated bucket and returning Result::Update. + */ + virtual void insert_before_current(const document::BucketId& bucket_id, const ValueT&) = 0; +}; + +/** + * Interface to be implemented by callers that wish to receive callbacks + * during a bucket merge() operation. + */ +template <typename ValueT> +struct MergingProcessor { + // See merge() for semantics on enum values. + enum class Result { + Update, + KeepUnchanged, + Skip + }; + + virtual ~MergingProcessor() = default; + /** + * Invoked for each existing bucket in the database, in bucket key order. + * The provided Merge instance may be used to access the current entry + * and prepend entries to the DB. + * + * Return value semantics: + * - Result::Update: + * when merge() returns, the changes made to the current entry will + * become visible in the bucket database. + * - Result::KeepUnchanged: + * when merge() returns, the entry will remain in the same state as + * it was when merge() was originally called. + * - Result::Skip: + * when merge() returns, the entry will no longer be part of the DB. + * Any entries added via insert_before_current() _will_ be present. + * + */ + virtual Result merge(Merger<ValueT>&) = 0; + /** + * Invoked once after all existing buckets have been iterated over. + * The provided TrailingInserter instance may be used to append + * an arbitrary number of entries to the database. + * + * This is used to handle elements remaining at the end of a linear + * merge operation. + */ + virtual void insert_remaining_at_end(TrailingInserter<ValueT>&) {} +}; + + +} diff --git a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp index bcc471cc903..b4b8c6e54b9 100644 --- a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp +++ b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.cpp @@ -5,6 +5,19 @@ namespace storage::bucketdb { using document::BucketId; +// TODO getMinDiffBits is hoisted from lockablemap.cpp, could probably be rewritten in terms of xor and MSB bit scan instr +/* + * 63 -------- ... -> 0 + * a: 1101111111 ... 0010 + * b: 1101110010 ... 0011 + * a ^ b: 0000001101 ... 0001 + * ^- diff bit = 57 + * + * 63 - vespalib::Optimized::msbIdx(a ^ b) ==> 6 + * + * what if a == b? special case? not a problem if we can prove this never happens. + */ + // TODO dedupe and unify common code uint8_t getMinDiffBits(uint16_t minBits, const BucketId& a, const BucketId& b) { diff --git a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h index 15de7f3525b..8bc7a3379b3 100644 --- a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h +++ b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.h @@ -1,6 +1,7 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once +#include "db_merger.h" #include <vespa/document/bucket/bucketid.h> #include <vespa/vespalib/btree/btree.h> #include <vespa/vespalib/btree/minmaxaggregated.h> @@ -8,108 +9,6 @@ namespace storage::bucketdb { -/** - * Database implementation-specific interface for appending entries - * during a merge() operation. - */ -template <typename ValueT> -struct TrailingInserter { - virtual ~TrailingInserter() = default; - /** - * Insert a new database entry at the end of the current bucket space. - * - * Precondition: the bucket ID must sort after all entries that - * have already been iterated over or inserted via insert_at_end(). - */ - virtual void insert_at_end(const document::BucketId& bucket_id, const ValueT&) = 0; -}; - -/** - * Database implementation-specific interface for accessing bucket - * entries and prepending entries during a merge() operation. - */ -template <typename ValueT> -struct Merger { - virtual ~Merger() = default; - - // TODO this should ideally be separated into read/write functions, but this - // will suffice for now to avoid too many changes. - - /** - * Bucket key/ID of the currently iterated entry. Unless the information stored - * in the DB Entry is needed, using one of these methods should be preferred to - * getting the bucket ID via current_entry(). The underlying DB is expected to - * have cheap access to the ID but _may_ have expensive access to the entry itself. - */ - [[nodiscard]] virtual uint64_t bucket_key() const noexcept = 0; - [[nodiscard]] virtual document::BucketId bucket_id() const noexcept = 0; - /** - * Returns a mutable representation of the currently iterated database - * entry. If changes are made to this object, Result::Update must be - * returned from merge(). Otherwise, mutation visibility is undefined. - */ - [[nodiscard]] virtual ValueT& current_entry() = 0; - /** - * Insert a new entry into the bucket database that is ordered before the - * currently iterated entry. - * - * Preconditions: - * - The bucket ID must sort _before_ the currently iterated - * entry's bucket ID, in "reversed bits" bucket key order. - * - The bucket ID must sort _after_ any entries previously - * inserted with insert_before_current(). - * - The bucket ID must not be the same as a bucket that was - * already iterated over as part of the DB merge() call or inserted - * via a previous call to insert_before_current(). - * Such buckets must be handled by explicitly updating the provided - * entry for the iterated bucket and returning Result::Update. - */ - virtual void insert_before_current(const document::BucketId& bucket_id, const ValueT&) = 0; -}; - -/** - * Interface to be implemented by callers that wish to receive callbacks - * during a bucket merge() operation. - */ -template <typename ValueT> -struct MergingProcessor { - // See merge() for semantics on enum values. - enum class Result { - Update, - KeepUnchanged, - Skip - }; - - virtual ~MergingProcessor() = default; - /** - * Invoked for each existing bucket in the database, in bucket key order. - * The provided Merge instance may be used to access the current entry - * and prepend entries to the DB. - * - * Return value semantics: - * - Result::Update: - * when merge() returns, the changes made to the current entry will - * become visible in the bucket database. - * - Result::KeepUnchanged: - * when merge() returns, the entry will remain in the same state as - * it was when merge() was originally called. - * - Result::Skip: - * when merge() returns, the entry will no longer be part of the DB. - * Any entries added via insert_before_current() _will_ be present. - * - */ - virtual Result merge(Merger<ValueT>&) = 0; - /** - * Invoked once after all existing buckets have been iterated over. - * The provided TrailingInserter instance may be used to append - * an arbitrary number of entries to the database. - * - * This is used to handle elements remaining at the end of a linear - * merge operation. - */ - virtual void insert_remaining_at_end(TrailingInserter<ValueT>&) {} -}; - /* * Bucket database implementation built around lock-free single-writer/multiple-readers B+tree. * @@ -186,6 +85,7 @@ public: BTreeConstIterator find(uint64_t key) const noexcept; BTreeConstIterator lower_bound(uint64_t key) const noexcept; + BTreeConstIterator upper_bound(uint64_t key) const noexcept; BTreeConstIterator begin() const noexcept; void clear() noexcept; @@ -202,11 +102,11 @@ public: bool update(const document::BucketId& bucket, const ValueType& new_entry); bool update_by_raw_key(uint64_t bucket_key, const ValueType& new_entry); - template <typename Func> + template <typename IterValueExtractor, typename Func> void find_parents_and_self(const document::BucketId& bucket, Func func) const; - template <typename Func> + template <typename IterValueExtractor, typename Func> void find_parents_self_and_children(const document::BucketId& bucket, Func func) const; @@ -220,13 +120,31 @@ public: DataStoreType& store() noexcept { return _store; } void merge(MergingProcessor<ValueType>& proc); + + friend class ReadSnapshot; + // See ReadGuard class comments for semantics. + class ReadSnapshot { + const GenericBTreeBucketDatabase* _db; + vespalib::GenerationHandler::Guard _guard; + typename BTree::FrozenView _frozen_view; + public: + explicit ReadSnapshot(const GenericBTreeBucketDatabase& db); + ~ReadSnapshot(); + + ReadSnapshot(const ReadSnapshot&) = delete; + ReadSnapshot& operator=(const ReadSnapshot&) = delete; + + template <typename IterValueExtractor, typename Func> + void find_parents_and_self(const document::BucketId& bucket, Func func) const; + [[nodiscard]] uint64_t generation() const noexcept; + }; private: // Functor is called for each found element in key order, with raw u64 keys and values. - template <typename Func> + template <typename IterValueExtractor, typename Func> BTreeConstIterator find_parents_internal(const typename BTree::FrozenView& frozen_view, const document::BucketId& bucket, Func func) const; - template <typename Func> + template <typename IterValueExtractor, typename Func> void find_parents_and_self_internal(const typename BTree::FrozenView& frozen_view, const document::BucketId& bucket, Func func) const; diff --git a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp index 4b1b507d95a..adaf402e4d1 100644 --- a/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp +++ b/storage/src/vespa/storage/bucketdb/generic_btree_bucket_database.hpp @@ -2,6 +2,15 @@ #pragma once #include "generic_btree_bucket_database.h" +#include <vespa/vespalib/btree/btreebuilder.h> +#include <vespa/vespalib/btree/btreenodeallocator.hpp> +#include <vespa/vespalib/btree/btreenode.hpp> +#include <vespa/vespalib/btree/btreenodestore.hpp> +#include <vespa/vespalib/btree/btreeiterator.hpp> +#include <vespa/vespalib/btree/btreeroot.hpp> +#include <vespa/vespalib/btree/btreebuilder.hpp> +#include <vespa/vespalib/btree/btree.hpp> +#include <vespa/vespalib/btree/btreestore.hpp> namespace storage::bucketdb { @@ -80,6 +89,12 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::lower_bound(uint64_t key) const no template <typename DataStoreTraitsT> typename GenericBTreeBucketDatabase<DataStoreTraitsT>::BTreeConstIterator +GenericBTreeBucketDatabase<DataStoreTraitsT>::upper_bound(uint64_t key) const noexcept { + return _tree.upperBound(key); +} + +template <typename DataStoreTraitsT> +typename GenericBTreeBucketDatabase<DataStoreTraitsT>::BTreeConstIterator GenericBTreeBucketDatabase<DataStoreTraitsT>::find(uint64_t key) const noexcept { return _tree.find(key); } @@ -90,6 +105,20 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::begin() const noexcept { return _tree.begin(); } +struct ByValue { + template <typename DB, typename Iter> + static typename DB::ValueType apply(const DB& db, const Iter& iter) { + return db.entry_from_iterator(iter); + }; +}; + +struct ByConstRef { + template <typename DB, typename Iter> + static typename DB::ConstValueRef apply(const DB& db, const Iter& iter) { + return db.const_value_ref_from_valid_iterator(iter); + }; +}; + /* * Finding the complete set of parents of a given bucket is not obvious how to * do efficiently, as we only know that the parents are ordered before their @@ -159,7 +188,7 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::begin() const noexcept { * invocation of seek() on the iterator. */ template <typename DataStoreTraitsT> -template <typename Func> +template <typename IterValueExtractor, typename Func> typename GenericBTreeBucketDatabase<DataStoreTraitsT>::BTreeConstIterator GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_internal( const typename BTree::FrozenView& frozen_view, @@ -186,7 +215,7 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_internal( auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); if (candidate.contains(bucket)) { assert(candidate.getUsedBits() >= bits); - func(iter.getKey(), const_value_ref_from_valid_iterator(iter)); + func(iter.getKey(), IterValueExtractor::apply(*this, iter)); } bits = next_parent_bit_seek_level(bits, candidate, bucket); const auto parent_key = BucketId(bits, bucket.getRawId()).toKey(); @@ -197,41 +226,41 @@ GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_internal( } template <typename DataStoreTraitsT> -template <typename Func> +template <typename IterValueExtractor, typename Func> void GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_and_self_internal( const typename BTree::FrozenView& frozen_view, const BucketId& bucket, Func func) const { - auto iter = find_parents_internal(frozen_view, bucket, func); + auto iter = find_parents_internal<IterValueExtractor>(frozen_view, bucket, func); if (iter.valid() && iter.getKey() == bucket.toKey()) { - func(iter.getKey(), entry_from_iterator(iter)); + func(iter.getKey(), IterValueExtractor::apply(*this, iter)); } } template <typename DataStoreTraitsT> -template <typename Func> +template <typename IterValueExtractor, typename Func> void GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_and_self( const document::BucketId& bucket, Func func) const { auto view = _tree.getFrozenView(); - find_parents_and_self_internal(view, bucket, std::move(func)); + find_parents_and_self_internal<IterValueExtractor>(view, bucket, std::move(func)); } template <typename DataStoreTraitsT> -template <typename Func> +template <typename IterValueExtractor, typename Func> void GenericBTreeBucketDatabase<DataStoreTraitsT>::find_parents_self_and_children( const BucketId& bucket, Func func) const { auto view = _tree.getFrozenView(); - auto iter = find_parents_internal(view, bucket, func); + auto iter = find_parents_internal<IterValueExtractor>(view, bucket, func); // `iter` is already pointing at, or beyond, one of the bucket's subtrees. for (; iter.valid(); ++iter) { auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); if (bucket.contains(candidate)) { - func(iter.getKey(), entry_from_iterator(iter)); + func(iter.getKey(), IterValueExtractor::apply(*this, iter)); } else { break; } @@ -490,5 +519,30 @@ void GenericBTreeBucketDatabase<DataStoreTraitsT>::merge(MergingProcessor<ValueT commit_tree_changes(); } +template <typename DataStoreTraitsT> +GenericBTreeBucketDatabase<DataStoreTraitsT>::ReadSnapshot::ReadSnapshot( + const GenericBTreeBucketDatabase<DataStoreTraitsT>& db) + : _db(&db), + _guard(_db->_generation_handler.takeGuard()), + _frozen_view(_db->_tree.getFrozenView()) +{ +} + +template <typename DataStoreTraitsT> +GenericBTreeBucketDatabase<DataStoreTraitsT>::ReadSnapshot::~ReadSnapshot() = default; + +template <typename DataStoreTraitsT> +template <typename IterValueExtractor, typename Func> +void GenericBTreeBucketDatabase<DataStoreTraitsT>::ReadSnapshot::find_parents_and_self( + const BucketId& bucket, + Func func) const +{ + _db->find_parents_and_self_internal<IterValueExtractor>(_frozen_view, bucket, std::move(func)); +} + +template <typename DataStoreTraitsT> +uint64_t GenericBTreeBucketDatabase<DataStoreTraitsT>::ReadSnapshot::generation() const noexcept { + return _guard.getGeneration(); +} } diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.h b/storage/src/vespa/storage/bucketdb/lockablemap.h index 83ecac0a94f..66619a4f7e8 100644 --- a/storage/src/vespa/storage/bucketdb/lockablemap.h +++ b/storage/src/vespa/storage/bucketdb/lockablemap.h @@ -28,7 +28,7 @@ namespace storage { template <typename Map> -class LockableMap +class LockableMap final : public bucketdb::AbstractBucketMap<typename Map::mapped_type> { public: diff --git a/storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp b/storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp index edb808da294..7556b80b29c 100644 --- a/storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp +++ b/storage/src/vespa/storage/bucketdb/mapbucketdatabase.cpp @@ -414,7 +414,8 @@ struct MapDbMerger final : BucketDatabase::Merger { BucketDatabase::Entry& current_entry() override { return _current_entry; } - void insert_before_current(const BucketDatabase::Entry& e) override { + void insert_before_current([[maybe_unused]] const document::BucketId& bucket_id, + const BucketDatabase::Entry& e) override { _to_insert.emplace_back(e); // TODO movable } }; @@ -423,7 +424,8 @@ struct MapDbTrailingInserter final : BucketDatabase::TrailingInserter { MapBucketDatabase& _db; explicit MapDbTrailingInserter(MapBucketDatabase& db) : _db(db) {} - void insert_at_end(const BucketDatabase::Entry& e) override { + void insert_at_end([[maybe_unused]] const document::BucketId& bucket_id, + const BucketDatabase::Entry& e) override { _db.update(e); } }; @@ -584,7 +586,7 @@ MapBucketDatabase::print(std::ostream& out, bool verbose, out << ')'; } -std::unique_ptr<BucketDatabase::ReadGuard> MapBucketDatabase::acquire_read_guard() const { +std::unique_ptr<bucketdb::ReadGuard<BucketDatabase::Entry>> MapBucketDatabase::acquire_read_guard() const { return std::make_unique<ReadGuardImpl>(*this); } diff --git a/storage/src/vespa/storage/bucketdb/mapbucketdatabase.h b/storage/src/vespa/storage/bucketdb/mapbucketdatabase.h index e41b797a321..e8d5688068f 100644 --- a/storage/src/vespa/storage/bucketdb/mapbucketdatabase.h +++ b/storage/src/vespa/storage/bucketdb/mapbucketdatabase.h @@ -29,7 +29,7 @@ public: document::BucketId getAppropriateBucket(uint16_t minBits, const document::BucketId& bid) override; void print(std::ostream& out, bool verbose, const std::string& indent) const override; - std::unique_ptr<ReadGuard> acquire_read_guard() const override; + std::unique_ptr<bucketdb::ReadGuard<Entry>> acquire_read_guard() const override; vespalib::MemoryUsage memory_usage() const noexcept override; private: struct E { @@ -66,7 +66,7 @@ private: uint32_t childCountImpl(int index, uint8_t bitCount, const document::BucketId& b) const; // NOT thread-safe for concurrent reads! - class ReadGuardImpl : public ReadGuard { + class ReadGuardImpl final : public bucketdb::ReadGuard<Entry> { const MapBucketDatabase* _db; public: explicit ReadGuardImpl(const MapBucketDatabase& db) : _db(&db) {} diff --git a/storage/src/vespa/storage/bucketdb/read_guard.h b/storage/src/vespa/storage/bucketdb/read_guard.h new file mode 100644 index 00000000000..cf37bafb0dd --- /dev/null +++ b/storage/src/vespa/storage/bucketdb/read_guard.h @@ -0,0 +1,46 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/document/bucket/bucketid.h> +#include <vector> + +namespace storage::bucketdb { + +/* + * Read guard for accessing the bucket tree of an underlying bucket database + * in a thread-safe, read-only manner. + * + * Important: If the underlying database is _not_ backed by a B-tree, the + * read guard does _not_ provide a stable view of the bucket key set when + * iterating, as that is not possible without locking the entire DB. + * + * If the guard is created by a B-tree DB, the following properties hold: + * - The set of bucket keys that can be iterated over is stable for the lifetime + * of the read guard. + * - The bucket _values_ may change during the lifetime of the read guard, + * but the reader will always observe a fully consistent value as if it were + * written atomically. + * + * Do not hold read guards for longer than absolutely necessary, as they cause + * memory to be retained by the backing DB until released. + */ + +template <typename ValueT> +class ReadGuard { +public: + ReadGuard() = default; + virtual ~ReadGuard() = default; + + ReadGuard(ReadGuard&&) = delete; + ReadGuard& operator=(ReadGuard&&) = delete; + ReadGuard(const ReadGuard&) = delete; + ReadGuard& operator=(const ReadGuard&) = delete; + + virtual void find_parents_and_self(const document::BucketId& bucket, + std::vector<ValueT>& entries) const = 0; + // If the underlying guard represents a snapshot, returns its monotonically + // increasing generation. Otherwise returns 0. + [[nodiscard]] virtual uint64_t generation() const noexcept = 0; +}; + +} diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp index 35af71898e6..057b106f775 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp @@ -181,7 +181,7 @@ public: if (key_at_cursor >= key_to_insert) { break; } - m.insert_before_current(*_current); + m.insert_before_current(_current->getBucketId(), *_current); ++_current; } if ((_current != _last) && (key_at_cursor == key_to_insert)) { @@ -201,7 +201,7 @@ public: void insert_remaining_at_end(BucketDatabase::TrailingInserter& inserter) override { for (; _current != _last; ++_current) { - inserter.insert_at_end(*_current); + inserter.insert_at_end(_current->getBucketId(), *_current); } } }; diff --git a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp index 13f9c21eed0..6983e3594af 100644 --- a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp +++ b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp @@ -193,11 +193,12 @@ void PendingBucketSpaceDbTransition::insert_remaining_at_end(BucketDatabase::Tra void PendingBucketSpaceDbTransition::addToMerger(BucketDatabase::Merger& merger, const Range& range) { + const auto bucket_id = _entries[range.first].bucket_id(); LOG(spam, "Adding new bucket %s with %d copies", - _entries[range.first].bucket_id().toString().c_str(), + bucket_id.toString().c_str(), range.second - range.first); - BucketDatabase::Entry e(_entries[range.first].bucket_id(), BucketInfo()); + BucketDatabase::Entry e(bucket_id, BucketInfo()); insertInfo(e, range); if (e->getLastGarbageCollectionTime() == 0) { e->setLastGarbageCollectionTime( @@ -205,18 +206,19 @@ PendingBucketSpaceDbTransition::addToMerger(BucketDatabase::Merger& merger, cons .getSeconds().getTime()); } e.getBucketInfo().updateTrusted(); - merger.insert_before_current(e); + merger.insert_before_current(bucket_id, e); } void PendingBucketSpaceDbTransition::addToInserter(BucketDatabase::TrailingInserter& inserter, const Range& range) { // TODO dedupe + const auto bucket_id = _entries[range.first].bucket_id(); LOG(spam, "Adding new bucket %s with %d copies", - _entries[range.first].bucket_id().toString().c_str(), + bucket_id.toString().c_str(), range.second - range.first); - BucketDatabase::Entry e(_entries[range.first].bucket_id(), BucketInfo()); + BucketDatabase::Entry e(bucket_id, BucketInfo()); insertInfo(e, range); if (e->getLastGarbageCollectionTime() == 0) { e->setLastGarbageCollectionTime( @@ -224,7 +226,7 @@ PendingBucketSpaceDbTransition::addToInserter(BucketDatabase::TrailingInserter& .getSeconds().getTime()); } e.getBucketInfo().updateTrusted(); - inserter.insert_at_end(e); + inserter.insert_at_end(bucket_id, e); } void |