diff options
Diffstat (limited to 'storage')
3 files changed, 79 insertions, 12 deletions
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp index 1d5355df92f..56e996abf99 100644 --- a/storage/src/tests/distributor/bucketdatabasetest.cpp +++ b/storage/src/tests/distributor/bucketdatabasetest.cpp @@ -707,4 +707,37 @@ TEST_P(BucketDatabaseTest, DISABLED_benchmark_const_iteration) { db().toString(false).c_str(), elapsed); } +TEST_P(BucketDatabaseTest, DISABLED_benchmark_find_parents) { + constexpr uint32_t superbuckets = 1u << 16u; + constexpr uint32_t sub_buckets = 14; + constexpr uint32_t n_buckets = superbuckets * sub_buckets; + + std::vector<uint64_t> bucket_keys; + bucket_keys.reserve(n_buckets); + + for (uint32_t sb = 0; sb < superbuckets; ++sb) { + for (uint64_t i = 0; i < sub_buckets; ++i) { + document::BucketId bucket(48, (i << 32ULL) | sb); // TODO eval with different bit counts + bucket_keys.emplace_back(bucket.toKey()); + } + } + fprintf(stderr, "Inserting %zu buckets into DB\n", bucket_keys.size()); + std::sort(bucket_keys.begin(), bucket_keys.end()); + for (uint64_t k : bucket_keys) { + db().update(BucketDatabase::Entry(BucketId(BucketId::keyToBucketId(k)), BI3(0, 1, 2))); + } + + fprintf(stderr, "Invoking getParents() %zu times\n", bucket_keys.size()); + auto elapsed = vespalib::BenchmarkTimer::benchmark([&] { + std::vector<BucketDatabase::Entry> entries; + for (uint64_t k : bucket_keys) { + db().getParents(BucketId(BucketId::keyToBucketId(k)), entries); + assert(entries.size() == 1); + entries.clear(); + } + }, 30); + fprintf(stderr, "Looking up all buckets in %s takes %g seconds\n", + db().toString(false).c_str(), elapsed); +} + } diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp index 85f5c8c5be9..2adb3049853 100644 --- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp @@ -194,13 +194,27 @@ void BTreeBucketDatabase::remove(const BucketId& bucket) { * This allows us to reuse a single iterator and to continue seeking forwards * from its current position. * + * To speed up the process of converging on the target bucket without needing + * to check many unrelated subtrees, we let the underlying B-tree automatically + * aggregate the min/max range of the used-bits of all contained bucket keys. + * If we e.g. know that the minimum number of used bits in the DB is 16, we can + * immediately seek to this level in the tree instead of working our way down + * one bit at a time. By definition, no parents can exist above this level. + * This is a very important optimization, as bucket trees are usually very well + * balanced due to randomized distribution of data (combined with a cluster-wide + * minimum tree level imposed by distribution bits). It is common that the minimum + * number of used bits == max number of used bits, i.e. a totally even split. + * This means that for a system without inconsistently split buckets (i.e. no + * parents) we're highly likely to converge on the target bucket in a single seek. + * * Algorithm: * * Core invariant: every subsequent iterator seek performed in this algorithm * is for a key that is strictly higher than the one the iterator is currently at. * * 1. Lbound seek to the lowest key that is known to exclude all already visited - * parents. On the first iteration this is zero, i.e. the first in-order node. + * parents. On the first iteration we use a bit count equal to the minimum number + * of key used-bits in the entire DB, allowing us to potentially skip most subtrees. * 2. If the current node's key is greater than that of the requested bucket's key, * we've either descended to--or beyond--it in its own subtree or we've entered * a disjoint subtree. Since we know that all parents must sort before any given @@ -229,14 +243,13 @@ void BTreeBucketDatabase::remove(const BucketId& bucket) { * 6. Iff iterator is still valid, go to step 2 * * This algorithm is able to skip through large parts of the tree in a sparsely populated - * tree, but the number of seeks will trend towards O(b) as with the legacy implementation - * when a tree is densely populated. This because all logical inner nodes in the tree will - * have subtrees under them. Even in the worst case we should be more efficient than the - * legacy implementation since we've cut any dense search space in half for each invocation - * of seek() on the iterator - * - * TODO use min-aggregation to infer used bit range in the tree. This should allow us to - * skip scanning through many disjoint subtrees. + * tree, but the number of seeks will trend towards O(b - min_bits) as with the legacy + * implementation when a tree is densely populated (where `b` is the used-bits count of the + * most specific node in the tree for the target bucket, and min_bits is the minimum number + * of used-bits for any key in the database). This because all logical inner nodes in the tree + * will have subtrees under them. Even in the worst case we should be more efficient than the + * legacy Judy-based implementation since we've cut any dense search space in half for each + * invocation of seek() on the iterator. */ BTreeBucketDatabase::BTree::ConstIterator BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view, @@ -245,9 +258,15 @@ BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view, { const uint64_t bucket_key = bucket.toKey(); auto iter = frozen_view.begin(); - // Start at the root level, descending towards the bucket itself. + if (!iter.valid()) { + return iter; + } + assert(iter.getAggregated().getMin() >= static_cast<int32_t>(BucketId::minNumBits)); + assert(iter.getAggregated().getMax() <= static_cast<int32_t>(BucketId::maxNumBits)); + // Start at the lowest possible tree level no parents can exist above, + // descending towards the bucket itself. // Try skipping as many levels of the tree as possible as we go. - uint32_t bits = 1; + uint32_t bits = iter.getAggregated().getMin(); while (iter.valid() && (iter.getKey() < bucket_key)) { auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); if (candidate.contains(bucket)) { diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h index 8898b0c395a..75736e59092 100644 --- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h @@ -4,6 +4,8 @@ #include "bucketdatabase.h" #include <vespa/vespalib/btree/btree.h> +#include <vespa/vespalib/btree/minmaxaggregated.h> +#include <vespa/vespalib/btree/minmaxaggrcalc.h> #include <vespa/vespalib/datastore/array_store.h> namespace storage { @@ -23,8 +25,21 @@ namespace storage { */ // TODO create and use a new DB interface with better bulk loading, snapshot and iteration support class BTreeBucketDatabase : public BucketDatabase { + + struct KeyUsedBitsMinMaxAggrCalc : search::btree::MinMaxAggrCalc { + constexpr static bool aggregate_over_values() { return false; } + constexpr static int32_t getVal(uint64_t key) noexcept { + static_assert(document::BucketId::CountBits == 6u); + return static_cast<int32_t>(key & 0b11'1111U); // 6 LSB of key contains used-bits + } + }; + // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref> - using BTree = search::btree::BTree<uint64_t, uint64_t>; + using BTree = search::btree::BTree<uint64_t, uint64_t, + search::btree::MinMaxAggregated, + std::less<>, + search::btree::BTreeDefaultTraits, + KeyUsedBitsMinMaxAggrCalc>; using ReplicaStore = search::datastore::ArrayStore<BucketCopy>; using GenerationHandler = vespalib::GenerationHandler; |