diff options
Diffstat (limited to 'storage')
3 files changed, 146 insertions, 49 deletions
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp index 1d5355df92f..a0354c8ad4e 100644 --- a/storage/src/tests/distributor/bucketdatabasetest.cpp +++ b/storage/src/tests/distributor/bucketdatabasetest.cpp @@ -202,66 +202,84 @@ TEST_P(BucketDatabaseTest, find_parents) { // test that it is so for now to avoid breaking the world. EXPECT_EQ( std::string("0"), - doFindParents(toVector(document::BucketId(17, 0xcafe)), - document::BucketId(17, 0xcafe))); + doFindParents({BucketId(17, 0xcafe)}, + BucketId(17, 0xcafe))); EXPECT_EQ( std::string("1,2"), - doFindParents(toVector(document::BucketId(1, 0x0), - document::BucketId(1, 0x1), - document::BucketId(2, 0x1)), - document::BucketId(16, 0x1))); + doFindParents({BucketId(1, 0x0), + BucketId(1, 0x1), + BucketId(2, 0x1)}, + BucketId(16, 0x1))); EXPECT_EQ( std::string("2"), - doFindParents(toVector(document::BucketId(17, 0x0ffff), - document::BucketId(18, 0x1ffff), - document::BucketId(18, 0x3ffff)), - document::BucketId(22, 0xfffff))); + doFindParents({BucketId(17, 0x0ffff), + BucketId(18, 0x1ffff), + BucketId(18, 0x3ffff)}, + BucketId(22, 0xfffff))); EXPECT_EQ( std::string("0,2,3"), - doFindParents(toVector(document::BucketId(16, 0x0ffff), - document::BucketId(17, 0x0ffff), - document::BucketId(17, 0x1ffff), - document::BucketId(19, 0xfffff)), - document::BucketId(22, 0xfffff))); + doFindParents({BucketId(16, 0x0ffff), + BucketId(17, 0x0ffff), + BucketId(17, 0x1ffff), + BucketId(19, 0xfffff)}, + BucketId(22, 0xfffff))); EXPECT_EQ( std::string("0,1,2,3"), - doFindParents(toVector(document::BucketId(16, 0x0ffff), - document::BucketId(17, 0x0ffff), - document::BucketId(18, 0x0ffff), - document::BucketId(19, 0x0ffff)), - document::BucketId(20, 0x0ffff))); + doFindParents({BucketId(16, 0x0ffff), + BucketId(17, 0x0ffff), + BucketId(18, 0x0ffff), + BucketId(19, 0x0ffff)}, + BucketId(20, 0x0ffff))); EXPECT_EQ( std::string("0,2,3"), - doFindParents(toVector(document::BucketId(16, 0x0ffff), - document::BucketId(17, 0x0ffff), - document::BucketId(17, 0x1ffff), - document::BucketId(18, 0x1ffff)), - document::BucketId(22, 0x1ffff))); + doFindParents({BucketId(16, 0x0ffff), + BucketId(17, 0x0ffff), + BucketId(17, 0x1ffff), + BucketId(18, 0x1ffff)}, + BucketId(22, 0x1ffff))); EXPECT_EQ( std::string("0"), - doFindParents(toVector(document::BucketId(16, 0x0ffff), - document::BucketId(17, 0x0ffff)), - document::BucketId(22, 0x1ffff))); + doFindParents({BucketId(16, 0x0ffff), + BucketId(17, 0x0ffff)}, + BucketId(22, 0x1ffff))); EXPECT_EQ( // ticket 3121525 std::string("0"), - doFindParents(toVector(document::BucketId(16, 0x0ffff), - document::BucketId(17, 0x0ffff), - document::BucketId(19, 0x1ffff)), - document::BucketId(18, 0x1ffff))); + doFindParents({BucketId(16, 0x0ffff), + BucketId(17, 0x0ffff), + BucketId(19, 0x1ffff)}, + BucketId(18, 0x1ffff))); EXPECT_EQ( // ticket 3121525 std::string("0"), - doFindParents(toVector(document::BucketId(16, 0x0ffff), - document::BucketId(17, 0x0ffff), - document::BucketId(19, 0x5ffff)), - document::BucketId(18, 0x1ffff))); + doFindParents({BucketId(16, 0x0ffff), + BucketId(17, 0x0ffff), + BucketId(19, 0x5ffff)}, + BucketId(18, 0x1ffff))); + + // Queried bucket is itself a parent of buckets in the DB, not a child. + EXPECT_EQ(std::string(""), + doFindParents({BucketId(16, 0x0ffff), + BucketId(17, 0x0ffff), + BucketId(19, 0x5ffff)}, + BucketId(15, 0x0ffff))); + + // Queried bucket has lower used bits than any buckets in the DB, and there + // are buckets in an unrelated leftmost subtree. + EXPECT_EQ(std::string(""), + doFindParents({BucketId(16, 0x0000)}, + BucketId(8, 0xffff))); + + // Similar as above test, but with subtree ordering reversed. + EXPECT_EQ(std::string(""), + doFindParents({BucketId(16, 0xffff)}, + BucketId(8, 0x0000))); } std::string @@ -380,6 +398,13 @@ TEST_P(BucketDatabaseTest, find_all) { document::BucketId(18, 0x1ffff))); } +TEST_P(BucketDatabaseTest, bucket_resolving_does_not_consider_unused_bits_in_id) { + EXPECT_EQ("0,1", + doFindAll({BucketId(0x840000003a7455d7), + BucketId(0x840000013a7455d7)}, + BucketId(0x8247fe133a7455d7))); // Raw bucket ID from group hash +} + document::BucketId BucketDatabaseTest::doCreate(const std::vector<document::BucketId>& ids, uint32_t minBits, @@ -707,4 +732,37 @@ TEST_P(BucketDatabaseTest, DISABLED_benchmark_const_iteration) { db().toString(false).c_str(), elapsed); } +TEST_P(BucketDatabaseTest, DISABLED_benchmark_find_parents) { + constexpr uint32_t superbuckets = 1u << 16u; + constexpr uint32_t sub_buckets = 14; + constexpr uint32_t n_buckets = superbuckets * sub_buckets; + + std::vector<uint64_t> bucket_keys; + bucket_keys.reserve(n_buckets); + + for (uint32_t sb = 0; sb < superbuckets; ++sb) { + for (uint64_t i = 0; i < sub_buckets; ++i) { + document::BucketId bucket(48, (i << 32ULL) | sb); // TODO eval with different bit counts + bucket_keys.emplace_back(bucket.toKey()); + } + } + fprintf(stderr, "Inserting %zu buckets into DB\n", bucket_keys.size()); + std::sort(bucket_keys.begin(), bucket_keys.end()); + for (uint64_t k : bucket_keys) { + db().update(BucketDatabase::Entry(BucketId(BucketId::keyToBucketId(k)), BI3(0, 1, 2))); + } + + fprintf(stderr, "Invoking getParents() %zu times\n", bucket_keys.size()); + auto elapsed = vespalib::BenchmarkTimer::benchmark([&] { + std::vector<BucketDatabase::Entry> entries; + for (uint64_t k : bucket_keys) { + db().getParents(BucketId(BucketId::keyToBucketId(k)), entries); + assert(entries.size() == 1); + entries.clear(); + } + }, 30); + fprintf(stderr, "Looking up all buckets in %s takes %g seconds\n", + db().toString(false).c_str(), elapsed); +} + } diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp index 85f5c8c5be9..fa2c915acc1 100644 --- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp @@ -194,13 +194,27 @@ void BTreeBucketDatabase::remove(const BucketId& bucket) { * This allows us to reuse a single iterator and to continue seeking forwards * from its current position. * + * To speed up the process of converging on the target bucket without needing + * to check many unrelated subtrees, we let the underlying B-tree automatically + * aggregate the min/max range of the used-bits of all contained bucket keys. + * If we e.g. know that the minimum number of used bits in the DB is 16, we can + * immediately seek to this level in the tree instead of working our way down + * one bit at a time. By definition, no parents can exist above this level. + * This is a very important optimization, as bucket trees are usually very well + * balanced due to randomized distribution of data (combined with a cluster-wide + * minimum tree level imposed by distribution bits). It is common that the minimum + * number of used bits == max number of used bits, i.e. a totally even split. + * This means that for a system without inconsistently split buckets (i.e. no + * parents) we're highly likely to converge on the target bucket in a single seek. + * * Algorithm: * * Core invariant: every subsequent iterator seek performed in this algorithm * is for a key that is strictly higher than the one the iterator is currently at. * * 1. Lbound seek to the lowest key that is known to exclude all already visited - * parents. On the first iteration this is zero, i.e. the first in-order node. + * parents. On the first iteration we use a bit count equal to the minimum number + * of key used-bits in the entire DB, allowing us to potentially skip most subtrees. * 2. If the current node's key is greater than that of the requested bucket's key, * we've either descended to--or beyond--it in its own subtree or we've entered * a disjoint subtree. Since we know that all parents must sort before any given @@ -229,14 +243,13 @@ void BTreeBucketDatabase::remove(const BucketId& bucket) { * 6. Iff iterator is still valid, go to step 2 * * This algorithm is able to skip through large parts of the tree in a sparsely populated - * tree, but the number of seeks will trend towards O(b) as with the legacy implementation - * when a tree is densely populated. This because all logical inner nodes in the tree will - * have subtrees under them. Even in the worst case we should be more efficient than the - * legacy implementation since we've cut any dense search space in half for each invocation - * of seek() on the iterator - * - * TODO use min-aggregation to infer used bit range in the tree. This should allow us to - * skip scanning through many disjoint subtrees. + * tree, but the number of seeks will trend towards O(b - min_bits) as with the legacy + * implementation when a tree is densely populated (where `b` is the used-bits count of the + * most specific node in the tree for the target bucket, and min_bits is the minimum number + * of used-bits for any key in the database). This because all logical inner nodes in the tree + * will have subtrees under them. Even in the worst case we should be more efficient than the + * legacy Judy-based implementation since we've cut any dense search space in half for each + * invocation of seek() on the iterator. */ BTreeBucketDatabase::BTree::ConstIterator BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view, @@ -244,10 +257,21 @@ BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view, std::vector<Entry>& entries) const { const uint64_t bucket_key = bucket.toKey(); - auto iter = frozen_view.begin(); - // Start at the root level, descending towards the bucket itself. + if (frozen_view.empty()) { + return frozen_view.begin(); // Will be invalid. + } + const auto min_db_bits = frozen_view.getAggregated().getMin(); + assert(min_db_bits >= static_cast<int32_t>(BucketId::minNumBits)); + assert(min_db_bits <= static_cast<int32_t>(BucketId::maxNumBits)); + // Start at the lowest possible tree level no parents can exist above, + // descending towards the bucket itself. + // Note: important to use getId() rather than getRawId(), as min_db_bits may be + // greater than the used bits of the queried bucket. If we used the raw ID, we'd + // end up looking at undefined bits. + const auto first_key = BucketId(min_db_bits, bucket.getId()).toKey(); + auto iter = frozen_view.lowerBound(first_key); // Try skipping as many levels of the tree as possible as we go. - uint32_t bits = 1; + uint32_t bits = min_db_bits; while (iter.valid() && (iter.getKey() < bucket_key)) { auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); if (candidate.contains(bucket)) { diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h index 8898b0c395a..75736e59092 100644 --- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h @@ -4,6 +4,8 @@ #include "bucketdatabase.h" #include <vespa/vespalib/btree/btree.h> +#include <vespa/vespalib/btree/minmaxaggregated.h> +#include <vespa/vespalib/btree/minmaxaggrcalc.h> #include <vespa/vespalib/datastore/array_store.h> namespace storage { @@ -23,8 +25,21 @@ namespace storage { */ // TODO create and use a new DB interface with better bulk loading, snapshot and iteration support class BTreeBucketDatabase : public BucketDatabase { + + struct KeyUsedBitsMinMaxAggrCalc : search::btree::MinMaxAggrCalc { + constexpr static bool aggregate_over_values() { return false; } + constexpr static int32_t getVal(uint64_t key) noexcept { + static_assert(document::BucketId::CountBits == 6u); + return static_cast<int32_t>(key & 0b11'1111U); // 6 LSB of key contains used-bits + } + }; + // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref> - using BTree = search::btree::BTree<uint64_t, uint64_t>; + using BTree = search::btree::BTree<uint64_t, uint64_t, + search::btree::MinMaxAggregated, + std::less<>, + search::btree::BTreeDefaultTraits, + KeyUsedBitsMinMaxAggrCalc>; using ReplicaStore = search::datastore::ArrayStore<BucketCopy>; using GenerationHandler = vespalib::GenerationHandler; |