Optimize B-tree bucket DB lookup with used-bits aggregation

By tracking the minimum used bits count across all buckets in the database we can immediately start seeking at that implicit level in the tree, as we know no parent buckets can exist above that level. Local synthetic benchmarking shows the following results with a DB size of 917504 buckets and performing getParents for all buckets in sequence: Before optimization: - B-tree DB: 0.593321 seconds - Legacy DB: 0.227947 seconds After optimization: - B-tree DB: 0.213738 seconds - Legacy DB: (unchanged)
author: Tor Brede Vekterli <vekterli@verizonmedia.com> 2020-04-24 11:13:48 +0000
committer: Tor Brede Vekterli <vekterli@verizonmedia.com> 2020-04-24 11:13:48 +0000
commit: 27a1f926c60f7684d5ec17988dc17681b0c73d30 (patch)
tree: 371e0c725264f466a12264fef3c124f784e68114 /storage
parent: 24eaf1642c012eb2f9f76a39102e87fb82874321 (diff)
3 files changed, 79 insertions, 12 deletions
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp
index 1d5355df92f..56e996abf99 100644
--- a/storage/src/tests/distributor/bucketdatabasetest.cpp
+++ b/storage/src/tests/distributor/bucketdatabasetest.cpp
@@ -707,4 +707,37 @@ TEST_P(BucketDatabaseTest, DISABLED_benchmark_const_iteration) {
             db().toString(false).c_str(), elapsed);
 }
 
+TEST_P(BucketDatabaseTest, DISABLED_benchmark_find_parents) {
+    constexpr uint32_t superbuckets = 1u << 16u;
+    constexpr uint32_t sub_buckets = 14;
+    constexpr uint32_t n_buckets = superbuckets * sub_buckets;
+
+    std::vector<uint64_t> bucket_keys;
+    bucket_keys.reserve(n_buckets);
+
+    for (uint32_t sb = 0; sb < superbuckets; ++sb) {
+        for (uint64_t i = 0; i < sub_buckets; ++i) {
+            document::BucketId bucket(48, (i << 32ULL) | sb); // TODO eval with different bit counts
+            bucket_keys.emplace_back(bucket.toKey());
+        }
+    }
+    fprintf(stderr, "Inserting %zu buckets into DB\n", bucket_keys.size());
+    std::sort(bucket_keys.begin(), bucket_keys.end());
+    for (uint64_t k : bucket_keys) {
+        db().update(BucketDatabase::Entry(BucketId(BucketId::keyToBucketId(k)), BI3(0, 1, 2)));
+    }
+
+    fprintf(stderr, "Invoking getParents() %zu times\n", bucket_keys.size());
+    auto elapsed = vespalib::BenchmarkTimer::benchmark([&] {
+        std::vector<BucketDatabase::Entry> entries;
+        for (uint64_t k : bucket_keys) {
+            db().getParents(BucketId(BucketId::keyToBucketId(k)), entries);
+            assert(entries.size() == 1);
+            entries.clear();
+        }
+    }, 30);
+    fprintf(stderr, "Looking up all buckets in %s takes %g seconds\n",
+            db().toString(false).c_str(), elapsed);
+}
+
 }
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
index 85f5c8c5be9..2adb3049853 100644
--- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
@@ -194,13 +194,27 @@ void BTreeBucketDatabase::remove(const BucketId& bucket) {
  * This allows us to reuse a single iterator and to continue seeking forwards
  * from its current position.
  *
+ * To speed up the process of converging on the target bucket without needing
+ * to check many unrelated subtrees, we let the underlying B-tree automatically
+ * aggregate the min/max range of the used-bits of all contained bucket keys.
+ * If we e.g. know that the minimum number of used bits in the DB is 16, we can
+ * immediately seek to this level in the tree instead of working our way down
+ * one bit at a time. By definition, no parents can exist above this level.
+ * This is a very important optimization, as bucket trees are usually very well
+ * balanced due to randomized distribution of data (combined with a cluster-wide
+ * minimum tree level imposed by distribution bits). It is common that the minimum
+ * number of used bits == max number of used bits, i.e. a totally even split.
+ * This means that for a system without inconsistently split buckets (i.e. no
+ * parents) we're highly likely to converge on the target bucket in a single seek.
+ *
  * Algorithm:
  *
  *   Core invariant: every subsequent iterator seek performed in this algorithm
  *   is for a key that is strictly higher than the one the iterator is currently at.
  *
  *   1. Lbound seek to the lowest key that is known to exclude all already visited
- *      parents. On the first iteration this is zero, i.e. the first in-order node.
+ *      parents. On the first iteration we use a bit count equal to the minimum number
+ *      of key used-bits in the entire DB, allowing us to potentially skip most subtrees.
  *   2. If the current node's key is greater than that of the requested bucket's key,
  *      we've either descended to--or beyond--it in its own subtree or we've entered
  *      a disjoint subtree. Since we know that all parents must sort before any given
@@ -229,14 +243,13 @@ void BTreeBucketDatabase::remove(const BucketId& bucket) {
  *   6. Iff iterator is still valid, go to step 2
  *
  * This algorithm is able to skip through large parts of the tree in a sparsely populated
- * tree, but the number of seeks will trend towards O(b) as with the legacy implementation
- * when a tree is densely populated. This because all logical inner nodes in the tree will
- * have subtrees under them. Even in the worst case we should be more efficient than the
- * legacy implementation since we've cut any dense search space in half for each invocation
- * of seek() on the iterator
- *
- * TODO use min-aggregation to infer used bit range in the tree. This should allow us to
- * skip scanning through many disjoint subtrees.
+ * tree, but the number of seeks will trend towards O(b - min_bits) as with the legacy
+ * implementation when a tree is densely populated (where `b` is the used-bits count of the
+ * most specific node in the tree for the target bucket, and min_bits is the minimum number
+ * of used-bits for any key in the database). This because all logical inner nodes in the tree
+ * will have subtrees under them. Even in the worst case we should be more efficient than the
+ * legacy Judy-based implementation since we've cut any dense search space in half for each
+ * invocation of seek() on the iterator.
  */
 BTreeBucketDatabase::BTree::ConstIterator
 BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view,
@@ -245,9 +258,15 @@ BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view,
 {
     const uint64_t bucket_key = bucket.toKey();
     auto iter = frozen_view.begin();
-    // Start at the root level, descending towards the bucket itself.
+    if (!iter.valid()) {
+        return iter;
+    }
+    assert(iter.getAggregated().getMin() >= static_cast<int32_t>(BucketId::minNumBits));
+    assert(iter.getAggregated().getMax() <= static_cast<int32_t>(BucketId::maxNumBits));
+    // Start at the lowest possible tree level no parents can exist above,
+    // descending towards the bucket itself.
     // Try skipping as many levels of the tree as possible as we go.
-    uint32_t bits = 1;
+    uint32_t bits = iter.getAggregated().getMin();
     while (iter.valid() && (iter.getKey() < bucket_key)) {
         auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
         if (candidate.contains(bucket)) {
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
index 8898b0c395a..75736e59092 100644
--- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
@@ -4,6 +4,8 @@
 
 #include "bucketdatabase.h"
 #include <vespa/vespalib/btree/btree.h>
+#include <vespa/vespalib/btree/minmaxaggregated.h>
+#include <vespa/vespalib/btree/minmaxaggrcalc.h>
 #include <vespa/vespalib/datastore/array_store.h>
 
 namespace storage {
@@ -23,8 +25,21 @@ namespace storage {
  */
 // TODO create and use a new DB interface with better bulk loading, snapshot and iteration support
 class BTreeBucketDatabase : public BucketDatabase {
+
+    struct KeyUsedBitsMinMaxAggrCalc : search::btree::MinMaxAggrCalc {
+        constexpr static bool aggregate_over_values() { return false; }
+        constexpr static int32_t getVal(uint64_t key) noexcept {
+            static_assert(document::BucketId::CountBits == 6u);
+            return static_cast<int32_t>(key & 0b11'1111U); // 6 LSB of key contains used-bits
+        }
+    };
+
     // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref>
-    using BTree        = search::btree::BTree<uint64_t, uint64_t>;
+    using BTree = search::btree::BTree<uint64_t, uint64_t,
+                                       search::btree::MinMaxAggregated,
+                                       std::less<>,
+                                       search::btree::BTreeDefaultTraits,
+                                       KeyUsedBitsMinMaxAggrCalc>;
     using ReplicaStore = search::datastore::ArrayStore<BucketCopy>;
     using GenerationHandler = vespalib::GenerationHandler;
author	Tor Brede Vekterli <vekterli@verizonmedia.com>	2020-04-24 11:13:48 +0000
committer	Tor Brede Vekterli <vekterli@verizonmedia.com>	2020-04-24 11:13:48 +0000
commit	27a1f926c60f7684d5ec17988dc17681b0c73d30 (patch)
tree	371e0c725264f466a12264fef3c124f784e68114 /storage
parent	24eaf1642c012eb2f9f76a39102e87fb82874321 (diff)