3 files changed, 146 insertions, 49 deletions
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp
index 1d5355df92f..a0354c8ad4e 100644
--- a/storage/src/tests/distributor/bucketdatabasetest.cpp
+++ b/storage/src/tests/distributor/bucketdatabasetest.cpp
@@ -202,66 +202,84 @@ TEST_P(BucketDatabaseTest, find_parents) {
     // test that it is so for now to avoid breaking the world.
     EXPECT_EQ(
             std::string("0"),
-            doFindParents(toVector(document::BucketId(17, 0xcafe)),
-                          document::BucketId(17, 0xcafe)));
+            doFindParents({BucketId(17, 0xcafe)},
+                          BucketId(17, 0xcafe)));
 
     EXPECT_EQ(
             std::string("1,2"),
-            doFindParents(toVector(document::BucketId(1, 0x0),
-                                   document::BucketId(1, 0x1),
-                                   document::BucketId(2, 0x1)),
-                          document::BucketId(16, 0x1)));
+            doFindParents({BucketId(1, 0x0),
+                           BucketId(1, 0x1),
+                           BucketId(2, 0x1)},
+                          BucketId(16, 0x1)));
 
     EXPECT_EQ(
             std::string("2"),
-            doFindParents(toVector(document::BucketId(17, 0x0ffff),
-                                   document::BucketId(18, 0x1ffff),
-                                   document::BucketId(18, 0x3ffff)),
-                          document::BucketId(22, 0xfffff)));
+            doFindParents({BucketId(17, 0x0ffff),
+                           BucketId(18, 0x1ffff),
+                           BucketId(18, 0x3ffff)},
+                          BucketId(22, 0xfffff)));
 
     EXPECT_EQ(
             std::string("0,2,3"),
-            doFindParents(toVector(document::BucketId(16, 0x0ffff),
-                                   document::BucketId(17, 0x0ffff),
-                                   document::BucketId(17, 0x1ffff),
-                                   document::BucketId(19, 0xfffff)),
-                          document::BucketId(22, 0xfffff)));
+            doFindParents({BucketId(16, 0x0ffff),
+                           BucketId(17, 0x0ffff),
+                           BucketId(17, 0x1ffff),
+                           BucketId(19, 0xfffff)},
+                          BucketId(22, 0xfffff)));
 
     EXPECT_EQ(
             std::string("0,1,2,3"),
-            doFindParents(toVector(document::BucketId(16, 0x0ffff),
-                                   document::BucketId(17, 0x0ffff),
-                                   document::BucketId(18, 0x0ffff),
-                                   document::BucketId(19, 0x0ffff)),
-                          document::BucketId(20, 0x0ffff)));
+            doFindParents({BucketId(16, 0x0ffff),
+                           BucketId(17, 0x0ffff),
+                           BucketId(18, 0x0ffff),
+                           BucketId(19, 0x0ffff)},
+                          BucketId(20, 0x0ffff)));
 
     EXPECT_EQ(
             std::string("0,2,3"),
-            doFindParents(toVector(document::BucketId(16, 0x0ffff),
-                                   document::BucketId(17, 0x0ffff),
-                                   document::BucketId(17, 0x1ffff),
-                                   document::BucketId(18, 0x1ffff)),
-                          document::BucketId(22, 0x1ffff)));
+            doFindParents({BucketId(16, 0x0ffff),
+                           BucketId(17, 0x0ffff),
+                           BucketId(17, 0x1ffff),
+                           BucketId(18, 0x1ffff)},
+                          BucketId(22, 0x1ffff)));
 
     EXPECT_EQ(
             std::string("0"),
-            doFindParents(toVector(document::BucketId(16, 0x0ffff),
-                                   document::BucketId(17, 0x0ffff)),
-                          document::BucketId(22, 0x1ffff)));
+            doFindParents({BucketId(16, 0x0ffff),
+                           BucketId(17, 0x0ffff)},
+                          BucketId(22, 0x1ffff)));
 
     EXPECT_EQ( // ticket 3121525
             std::string("0"),
-            doFindParents(toVector(document::BucketId(16, 0x0ffff),
-                                   document::BucketId(17, 0x0ffff),
-                                   document::BucketId(19, 0x1ffff)),
-                          document::BucketId(18, 0x1ffff)));
+            doFindParents({BucketId(16, 0x0ffff),
+                           BucketId(17, 0x0ffff),
+                           BucketId(19, 0x1ffff)},
+                          BucketId(18, 0x1ffff)));
 
     EXPECT_EQ( // ticket 3121525
             std::string("0"),
-            doFindParents(toVector(document::BucketId(16, 0x0ffff),
-                                   document::BucketId(17, 0x0ffff),
-                                   document::BucketId(19, 0x5ffff)),
-                          document::BucketId(18, 0x1ffff)));
+            doFindParents({BucketId(16, 0x0ffff),
+                           BucketId(17, 0x0ffff),
+                           BucketId(19, 0x5ffff)},
+                          BucketId(18, 0x1ffff)));
+
+    // Queried bucket is itself a parent of buckets in the DB, not a child.
+    EXPECT_EQ(std::string(""),
+              doFindParents({BucketId(16, 0x0ffff),
+                             BucketId(17, 0x0ffff),
+                             BucketId(19, 0x5ffff)},
+                            BucketId(15, 0x0ffff)));
+
+    // Queried bucket has lower used bits than any buckets in the DB, and there
+    // are buckets in an unrelated leftmost subtree.
+    EXPECT_EQ(std::string(""),
+              doFindParents({BucketId(16, 0x0000)},
+                            BucketId(8, 0xffff)));
+
+    // Similar as above test, but with subtree ordering reversed.
+    EXPECT_EQ(std::string(""),
+              doFindParents({BucketId(16, 0xffff)},
+                            BucketId(8, 0x0000)));
 }
 
 std::string
@@ -380,6 +398,13 @@ TEST_P(BucketDatabaseTest, find_all) {
                       document::BucketId(18, 0x1ffff)));
 }
 
+TEST_P(BucketDatabaseTest, bucket_resolving_does_not_consider_unused_bits_in_id) {
+    EXPECT_EQ("0,1",
+              doFindAll({BucketId(0x840000003a7455d7),
+                         BucketId(0x840000013a7455d7)},
+                        BucketId(0x8247fe133a7455d7))); // Raw bucket ID from group hash
+}
+
 document::BucketId
 BucketDatabaseTest::doCreate(const std::vector<document::BucketId>& ids,
                              uint32_t minBits,
@@ -707,4 +732,37 @@ TEST_P(BucketDatabaseTest, DISABLED_benchmark_const_iteration) {
             db().toString(false).c_str(), elapsed);
 }
 
+TEST_P(BucketDatabaseTest, DISABLED_benchmark_find_parents) {
+    constexpr uint32_t superbuckets = 1u << 16u;
+    constexpr uint32_t sub_buckets = 14;
+    constexpr uint32_t n_buckets = superbuckets * sub_buckets;
+
+    std::vector<uint64_t> bucket_keys;
+    bucket_keys.reserve(n_buckets);
+
+    for (uint32_t sb = 0; sb < superbuckets; ++sb) {
+        for (uint64_t i = 0; i < sub_buckets; ++i) {
+            document::BucketId bucket(48, (i << 32ULL) | sb); // TODO eval with different bit counts
+            bucket_keys.emplace_back(bucket.toKey());
+        }
+    }
+    fprintf(stderr, "Inserting %zu buckets into DB\n", bucket_keys.size());
+    std::sort(bucket_keys.begin(), bucket_keys.end());
+    for (uint64_t k : bucket_keys) {
+        db().update(BucketDatabase::Entry(BucketId(BucketId::keyToBucketId(k)), BI3(0, 1, 2)));
+    }
+
+    fprintf(stderr, "Invoking getParents() %zu times\n", bucket_keys.size());
+    auto elapsed = vespalib::BenchmarkTimer::benchmark([&] {
+        std::vector<BucketDatabase::Entry> entries;
+        for (uint64_t k : bucket_keys) {
+            db().getParents(BucketId(BucketId::keyToBucketId(k)), entries);
+            assert(entries.size() == 1);
+            entries.clear();
+        }
+    }, 30);
+    fprintf(stderr, "Looking up all buckets in %s takes %g seconds\n",
+            db().toString(false).c_str(), elapsed);
+}
+
 }
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
index 85f5c8c5be9..fa2c915acc1 100644
--- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
@@ -194,13 +194,27 @@ void BTreeBucketDatabase::remove(const BucketId& bucket) {
  * This allows us to reuse a single iterator and to continue seeking forwards
  * from its current position.
  *
+ * To speed up the process of converging on the target bucket without needing
+ * to check many unrelated subtrees, we let the underlying B-tree automatically
+ * aggregate the min/max range of the used-bits of all contained bucket keys.
+ * If we e.g. know that the minimum number of used bits in the DB is 16, we can
+ * immediately seek to this level in the tree instead of working our way down
+ * one bit at a time. By definition, no parents can exist above this level.
+ * This is a very important optimization, as bucket trees are usually very well
+ * balanced due to randomized distribution of data (combined with a cluster-wide
+ * minimum tree level imposed by distribution bits). It is common that the minimum
+ * number of used bits == max number of used bits, i.e. a totally even split.
+ * This means that for a system without inconsistently split buckets (i.e. no
+ * parents) we're highly likely to converge on the target bucket in a single seek.
+ *
  * Algorithm:
  *
  *   Core invariant: every subsequent iterator seek performed in this algorithm
  *   is for a key that is strictly higher than the one the iterator is currently at.
  *
  *   1. Lbound seek to the lowest key that is known to exclude all already visited
- *      parents. On the first iteration this is zero, i.e. the first in-order node.
+ *      parents. On the first iteration we use a bit count equal to the minimum number
+ *      of key used-bits in the entire DB, allowing us to potentially skip most subtrees.
  *   2. If the current node's key is greater than that of the requested bucket's key,
  *      we've either descended to--or beyond--it in its own subtree or we've entered
  *      a disjoint subtree. Since we know that all parents must sort before any given
@@ -229,14 +243,13 @@ void BTreeBucketDatabase::remove(const BucketId& bucket) {
  *   6. Iff iterator is still valid, go to step 2
  *
  * This algorithm is able to skip through large parts of the tree in a sparsely populated
- * tree, but the number of seeks will trend towards O(b) as with the legacy implementation
- * when a tree is densely populated. This because all logical inner nodes in the tree will
- * have subtrees under them. Even in the worst case we should be more efficient than the
- * legacy implementation since we've cut any dense search space in half for each invocation
- * of seek() on the iterator
- *
- * TODO use min-aggregation to infer used bit range in the tree. This should allow us to
- * skip scanning through many disjoint subtrees.
+ * tree, but the number of seeks will trend towards O(b - min_bits) as with the legacy
+ * implementation when a tree is densely populated (where `b` is the used-bits count of the
+ * most specific node in the tree for the target bucket, and min_bits is the minimum number
+ * of used-bits for any key in the database). This because all logical inner nodes in the tree
+ * will have subtrees under them. Even in the worst case we should be more efficient than the
+ * legacy Judy-based implementation since we've cut any dense search space in half for each
+ * invocation of seek() on the iterator.
  */
 BTreeBucketDatabase::BTree::ConstIterator
 BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view,
@@ -244,10 +257,21 @@ BTreeBucketDatabase::find_parents_internal(const BTree::FrozenView& frozen_view,
                                            std::vector<Entry>& entries) const
 {
     const uint64_t bucket_key = bucket.toKey();
-    auto iter = frozen_view.begin();
-    // Start at the root level, descending towards the bucket itself.
+    if (frozen_view.empty()) {
+        return frozen_view.begin(); // Will be invalid.
+    }
+    const auto min_db_bits = frozen_view.getAggregated().getMin();
+    assert(min_db_bits >= static_cast<int32_t>(BucketId::minNumBits));
+    assert(min_db_bits <= static_cast<int32_t>(BucketId::maxNumBits));
+    // Start at the lowest possible tree level no parents can exist above,
+    // descending towards the bucket itself.
+    // Note: important to use getId() rather than getRawId(), as min_db_bits may be
+    // greater than the used bits of the queried bucket. If we used the raw ID, we'd
+    // end up looking at undefined bits.
+    const auto first_key = BucketId(min_db_bits, bucket.getId()).toKey();
+    auto iter = frozen_view.lowerBound(first_key);
     // Try skipping as many levels of the tree as possible as we go.
-    uint32_t bits = 1;
+    uint32_t bits = min_db_bits;
     while (iter.valid() && (iter.getKey() < bucket_key)) {
         auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
         if (candidate.contains(bucket)) {
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
index 8898b0c395a..75736e59092 100644
--- a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
@@ -4,6 +4,8 @@
 
 #include "bucketdatabase.h"
 #include <vespa/vespalib/btree/btree.h>
+#include <vespa/vespalib/btree/minmaxaggregated.h>
+#include <vespa/vespalib/btree/minmaxaggrcalc.h>
 #include <vespa/vespalib/datastore/array_store.h>
 
 namespace storage {
@@ -23,8 +25,21 @@ namespace storage {
  */
 // TODO create and use a new DB interface with better bulk loading, snapshot and iteration support
 class BTreeBucketDatabase : public BucketDatabase {
+
+    struct KeyUsedBitsMinMaxAggrCalc : search::btree::MinMaxAggrCalc {
+        constexpr static bool aggregate_over_values() { return false; }
+        constexpr static int32_t getVal(uint64_t key) noexcept {
+            static_assert(document::BucketId::CountBits == 6u);
+            return static_cast<int32_t>(key & 0b11'1111U); // 6 LSB of key contains used-bits
+        }
+    };
+
     // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref>
-    using BTree        = search::btree::BTree<uint64_t, uint64_t>;
+    using BTree = search::btree::BTree<uint64_t, uint64_t,
+                                       search::btree::MinMaxAggregated,
+                                       std::less<>,
+                                       search::btree::BTreeDefaultTraits,
+                                       KeyUsedBitsMinMaxAggrCalc>;
     using ReplicaStore = search::datastore::ArrayStore<BucketCopy>;
     using GenerationHandler = vespalib::GenerationHandler;