author    Tor Brede Vekterli <vekterli@verizonmedia.com>  2019-04-29 09:04:35 +0000
committer Tor Brede Vekterli <vekterli@verizonmedia.com>  2019-05-09 11:21:32 +0000
commit    aa9fa2f49f3a61d2978b9748712f3cadd902fd7b (patch)
tree      c81b737f428206ad33a114853606a0d9207acea6 /storage
parent    c3667718a63a8703bf62833dcb92b7ad5422d0cc (diff)
Add initial B+tree distributor bucket database
Still uses the legacy `BucketDatabase` API, which is not optimized for bulk loading or updating. Focus for this iteration is functional correctness rather than API redesign. The legacy DB is still the one wired in for all production logic.

Unit tests have been expanded to cover discovered edge cases that were not properly tested for. Also move the distributor bucket DB tests to GTest, using a value-parameterized test fixture instead of the ad-hoc CppUnit approach.
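As a rough sketch of what the value-parameterized approach looks like (a generic GTest illustration with made-up fixture and test names, not the exact fixture added in this commit), a single test suite takes the database implementation as its parameter and is instantiated once per implementation:

    #include <vespa/storage/bucketdb/btree_bucket_database.h>
    #include <vespa/vespalib/gtest/gtest.h>

    namespace storage {

    // Hypothetical fixture: every TEST_P below runs once per instantiated DB implementation.
    struct DbSketchTest : ::testing::TestWithParam<std::shared_ptr<BucketDatabase>> {
        BucketDatabase& db() { return *GetParam(); }
        void SetUp() override { db().clear(); } // parameters are shared, so reset state per test
    };

    TEST_P(DbSketchTest, update_inserts_new_entry) {
        db().update(BucketDatabase::Entry(document::BucketId(16, 1), BucketInfo()));
        EXPECT_EQ(uint64_t(1), db().size());
    }

    INSTANTIATE_TEST_CASE_P(BTreeDatabase, DbSketchTest,
                            ::testing::Values(std::make_shared<BTreeBucketDatabase>()));

    }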
Diffstat (limited to 'storage')
-rw-r--r--  storage/CMakeLists.txt                                                        |   1
-rw-r--r--  storage/src/tests/distributor/CMakeLists.txt                                  |   5
-rw-r--r--  storage/src/tests/distributor/btree_bucket_database_test.cpp                  |  12
-rw-r--r--  storage/src/tests/distributor/bucketdatabasetest.cpp                          | 219
-rw-r--r--  storage/src/tests/distributor/bucketdatabasetest.h                            |  47
-rw-r--r--  storage/src/tests/distributor/bucketdbupdatertest.cpp                         |   3
-rw-r--r--  storage/src/tests/distributor/mapbucketdatabasetest.cpp                       |  18
-rw-r--r--  storage/src/vespa/storage/bucketdb/CMakeLists.txt                             |   1
-rw-r--r--  storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp                  | 405
-rw-r--r--  storage/src/vespa/storage/bucketdb/btree_bucket_database.h                    |  70
-rw-r--r--  storage/src/vespa/storage/bucketdb/bucketdatabase.h                           |   6
-rw-r--r--  storage/src/vespa/storage/bucketdb/bucketinfo.cpp                             |  17
-rw-r--r--  storage/src/vespa/storage/bucketdb/bucketinfo.h                               |  10
-rw-r--r--  storage/src/vespa/storage/bucketdb/lockablemap.h                              |   2
-rw-r--r--  storage/src/vespa/storage/bucketdb/lockablemap.hpp                            |   2
-rw-r--r--  storage/src/vespa/storage/distributor/bucketdbupdater.cpp                    |   4
-rw-r--r--  storage/src/vespa/storage/distributor/distributor_bucket_space.cpp           |  13
-rw-r--r--  storage/src/vespa/storage/distributor/distributor_bucket_space.h             |  11
-rw-r--r--  storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp |   4
19 files changed, 670 insertions(+), 180 deletions(-)
diff --git a/storage/CMakeLists.txt b/storage/CMakeLists.txt
index f77c11eb350..6b697716e1d 100644
--- a/storage/CMakeLists.txt
+++ b/storage/CMakeLists.txt
@@ -16,6 +16,7 @@ vespa_define_module(
vdslib
persistence
storageframework
+ searchlib
EXTERNAL_DEPENDS
Judy
diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt
index f7a65000f6c..fdccf9b1394 100644
--- a/storage/src/tests/distributor/CMakeLists.txt
+++ b/storage/src/tests/distributor/CMakeLists.txt
@@ -2,7 +2,6 @@
vespa_add_library(storage_testdistributor TEST
SOURCES
blockingoperationstartertest.cpp
- bucketdatabasetest.cpp
bucketdbmetricupdatertest.cpp
bucketgctimecalculatortest.cpp
bucketstateoperationtest.cpp
@@ -15,7 +14,6 @@ vespa_add_library(storage_testdistributor TEST
idealstatemanagertest.cpp
joinbuckettest.cpp
maintenanceschedulertest.cpp
- mapbucketdatabasetest.cpp
mergelimitertest.cpp
mergeoperationtest.cpp
messagesenderstub.cpp
@@ -48,7 +46,10 @@ vespa_add_library(storage_testdistributor TEST
vespa_add_library(storage_gtestdistributor TEST
SOURCES
+ btree_bucket_database_test.cpp
+ bucketdatabasetest.cpp
bucketdbupdatertest.cpp
+ mapbucketdatabasetest.cpp
# Fixture etc. dupes with non-gtest runner :
distributortestutil.cpp
bucket_db_prune_elision_test.cpp
diff --git a/storage/src/tests/distributor/btree_bucket_database_test.cpp b/storage/src/tests/distributor/btree_bucket_database_test.cpp
new file mode 100644
index 00000000000..c253a758f98
--- /dev/null
+++ b/storage/src/tests/distributor/btree_bucket_database_test.cpp
@@ -0,0 +1,12 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/bucketdb/btree_bucket_database.h>
+#include <tests/distributor/bucketdatabasetest.h>
+
+namespace storage::distributor {
+
+INSTANTIATE_TEST_CASE_P(BTreeDatabase, BucketDatabaseTest,
+ ::testing::Values(std::make_shared<BTreeBucketDatabase>()));
+
+}
+
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp
index 55106ce5c28..92e0c534e31 100644
--- a/storage/src/tests/distributor/bucketdatabasetest.cpp
+++ b/storage/src/tests/distributor/bucketdatabasetest.cpp
@@ -1,6 +1,5 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "bucketdatabasetest.h"
-#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
#include <iomanip>
#include <algorithm>
@@ -8,9 +7,7 @@ namespace storage::distributor {
using document::BucketId;
-void
-BucketDatabaseTest::setUp()
-{
+void BucketDatabaseTest::SetUp() {
db().clear();
}
@@ -26,44 +23,42 @@ namespace {
}
}
-void
-BucketDatabaseTest::testClear() {
+TEST_P(BucketDatabaseTest, testClear) {
db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
db().clear();
- CPPUNIT_ASSERT_EQUAL(uint64_t(0), db().size());
+ EXPECT_EQ(uint64_t(0), db().size());
}
-void
-BucketDatabaseTest::testUpdateGetAndRemove() {
+TEST_P(BucketDatabaseTest, testUpdateGetAndRemove) {
// Do some insertions
- CPPUNIT_ASSERT_EQUAL(0, (int)db().size());
+ EXPECT_EQ(0, db().size());
db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3)));
- CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+ EXPECT_EQ(3, db().size());
db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(4)));
- CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+ EXPECT_EQ(3, db().size());
// Access some elements
- CPPUNIT_ASSERT_EQUAL(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo());
- CPPUNIT_ASSERT_EQUAL(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo());
- CPPUNIT_ASSERT_EQUAL(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo());
+ EXPECT_EQ(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo());
+ EXPECT_EQ(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo());
+ EXPECT_EQ(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo());
// Do removes
db().remove(document::BucketId(16, 12));
- CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+ EXPECT_EQ(3, db().size());
db().remove(document::BucketId(16, 11));
- CPPUNIT_ASSERT_EQUAL(2, (int)db().size());
+ EXPECT_EQ(2, db().size());
db().remove(document::BucketId(16, 16));
db().remove(document::BucketId(16, 42));
- CPPUNIT_ASSERT_EQUAL(0, (int)db().size());
+ EXPECT_EQ(0, db().size());
}
namespace {
@@ -83,8 +78,7 @@ struct ModifyProcessor : public BucketDatabase::MutableEntryProcessor
}
};
-struct ListAllProcessor : public BucketDatabase::EntryProcessor
-{
+struct ListAllProcessor : public BucketDatabase::EntryProcessor {
std::ostringstream ost;
bool process(const BucketDatabase::Entry& e) override {
@@ -93,8 +87,7 @@ struct ListAllProcessor : public BucketDatabase::EntryProcessor
}
};
-struct DummyProcessor : public BucketDatabase::EntryProcessor
-{
+struct DummyProcessor : public BucketDatabase::EntryProcessor {
std::ostringstream ost;
bool process(const BucketDatabase::Entry&) override {
@@ -103,8 +96,7 @@ struct DummyProcessor : public BucketDatabase::EntryProcessor
};
-struct StoppingProcessor : public BucketDatabase::EntryProcessor
-{
+struct StoppingProcessor : public BucketDatabase::EntryProcessor {
std::ostringstream ost;
bool process(const BucketDatabase::Entry& e) override {
@@ -120,8 +112,7 @@ struct StoppingProcessor : public BucketDatabase::EntryProcessor
}
-void
-BucketDatabaseTest::testIterating() {
+TEST_P(BucketDatabaseTest, testIterating) {
// Do some insertions
db().update(BucketDatabase::Entry(document::BucketId(16, 0x10), BI(1)));
db().update(BucketDatabase::Entry(document::BucketId(16, 0x0b), BI(2)));
@@ -131,7 +122,7 @@ BucketDatabaseTest::testIterating() {
ListAllProcessor proc;
db().forEach(proc, document::BucketId());
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(
"BucketId(0x4000000000000010) : "
"node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"
@@ -146,7 +137,7 @@ BucketDatabaseTest::testIterating() {
ListAllProcessor proc;
db().forEach(proc, document::BucketId(16, 0x2a));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(
"BucketId(0x400000000000000b) : "
"node(idx=2,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"),
@@ -157,7 +148,7 @@ BucketDatabaseTest::testIterating() {
StoppingProcessor proc;
db().forEach(proc, document::BucketId());
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(
"BucketId(0x4000000000000010) : "
"node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"
@@ -173,7 +164,7 @@ BucketDatabaseTest::testIterating() {
ListAllProcessor proc;
db().forEach(proc);
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(
"BucketId(0x4000000000000010) : "
"node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"
@@ -213,18 +204,33 @@ BucketDatabaseTest::doFindParents(const std::vector<document::BucketId>& ids,
return ost.str();
}
-void
-BucketDatabaseTest::testFindParents() {
+TEST_P(BucketDatabaseTest, testFindParents) {
// test what parents in the DB (specified in vector) are parents of the
// specified bucket. Result is a list of indexes into the vector.
- CPPUNIT_ASSERT_EQUAL(
+
+ // The way the legacy API works is that a bucket is considered as being in
+ // the set of its parents... This is rather weird, but at least explicitly
+ // test that it is so for now to avoid breaking the world.
+ EXPECT_EQ(
+ std::string("0"),
+ doFindParents(toVector(document::BucketId(17, 0xcafe)),
+ document::BucketId(17, 0xcafe)));
+
+ EXPECT_EQ(
+ std::string("1,2"),
+ doFindParents(toVector(document::BucketId(1, 0x0),
+ document::BucketId(1, 0x1),
+ document::BucketId(2, 0x1)),
+ document::BucketId(16, 0x1)));
+
+ EXPECT_EQ(
std::string("2"),
doFindParents(toVector(document::BucketId(17, 0x0ffff),
document::BucketId(18, 0x1ffff),
document::BucketId(18, 0x3ffff)),
document::BucketId(22, 0xfffff)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("0,2,3"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff),
@@ -232,7 +238,15 @@ BucketDatabaseTest::testFindParents() {
document::BucketId(19, 0xfffff)),
document::BucketId(22, 0xfffff)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
+ std::string("0,1,2,3"),
+ doFindParents(toVector(document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff),
+ document::BucketId(18, 0x0ffff),
+ document::BucketId(19, 0x0ffff)),
+ document::BucketId(20, 0x0ffff)));
+
+ EXPECT_EQ(
std::string("0,2,3"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff),
@@ -240,20 +254,20 @@ BucketDatabaseTest::testFindParents() {
document::BucketId(18, 0x1ffff)),
document::BucketId(22, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("0"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff)),
document::BucketId(22, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL( // ticket 3121525
+ EXPECT_EQ( // ticket 3121525
std::string("0"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff),
document::BucketId(19, 0x1ffff)),
document::BucketId(18, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL( // ticket 3121525
+ EXPECT_EQ( // ticket 3121525
std::string("0"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff),
@@ -276,8 +290,11 @@ BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& ids,
std::ostringstream ost;
for (uint32_t i = 0; i < ids.size(); ++i) {
- if (std::find(entries.begin(), entries.end(),
- BucketDatabase::Entry(ids[i], BI(i))) != entries.end()) {
+ auto wanted = BucketDatabase::Entry(ids[i], BI(i));
+ for (const auto& e : entries) {
+ if (!(e == wanted)) {
+ continue;
+ }
if (!ost.str().empty()) {
ost << ",";
}
@@ -288,11 +305,9 @@ BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& ids,
return ost.str();
}
-void
-BucketDatabaseTest::testFindAll()
-{
+TEST_P(BucketDatabaseTest, testFindAll) {
std::vector<document::BucketId> buckets;
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(""),
doFindAll(buckets, document::BucketId(18, 0x1ffff)));
@@ -306,15 +321,20 @@ BucketDatabaseTest::testFindAll()
buckets.push_back(document::BucketId(20, 0xceaaa));
buckets.push_back(document::BucketId(17, 0x1ffff));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
+ std::string("0"),
+ doFindAll(toVector(document::BucketId(16, 1234)),
+ document::BucketId(16, 1234)));
+
+ EXPECT_EQ(
std::string("0,4,5,6"),
doFindAll(buckets, document::BucketId(17, 0x1aaaa)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("8"),
doFindAll(buckets, document::BucketId(16, 0xffff)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("0,1"),
doFindAll(toVector(document::BucketId(17, 0x00001),
document::BucketId(17, 0x10001)),
@@ -322,7 +342,7 @@ BucketDatabaseTest::testFindAll()
document::BucketId id(33, 0x1053c7089); // Bit 32 is set, but unused.
id.setUsedBits(32);
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("1,2"),
doFindAll(toVector(document::BucketId(24, 0x000dc7089),
document::BucketId(33, 0x0053c7089),
@@ -330,7 +350,7 @@ BucketDatabaseTest::testFindAll()
document::BucketId(24, 0x000bc7089)),
id));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ EXPECT_EQ( // Inconsistent split
std::string("0,1,2"),
doFindAll(toVector(
document::BucketId(16, 0x00001), // contains 2-3
@@ -338,7 +358,7 @@ BucketDatabaseTest::testFindAll()
document::BucketId(17, 0x10001)),
document::BucketId(16, 0x00001)));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ EXPECT_EQ( // Inconsistent split
std::string("1,2"),
doFindAll(toVector(
document::BucketId(17, 0x10000),
@@ -347,14 +367,14 @@ BucketDatabaseTest::testFindAll()
document::BucketId(17, 0x1ffff)),
document::BucketId(32, 0x027228034)));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ EXPECT_EQ( // Inconsistent split
std::string("0"),
doFindAll(toVector(
document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff)),
document::BucketId(22, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ EXPECT_EQ( // Inconsistent split
std::string("0,2"),
doFindAll(toVector(
document::BucketId(16, 0x0ffff),
@@ -362,7 +382,7 @@ BucketDatabaseTest::testFindAll()
document::BucketId(19, 0x1ffff)),
document::BucketId(18, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split, ticket 3121525
+ EXPECT_EQ( // Inconsistent split, ticket 3121525
std::string("0,2"),
doFindAll(toVector(
document::BucketId(16, 0x0ffff),
@@ -386,36 +406,36 @@ BucketDatabaseTest::doCreate(const std::vector<document::BucketId>& ids,
return entry.getBucketId();
}
-void
-BucketDatabaseTest::testCreateAppropriateBucket() {
+// TODO rewrite in terms of bucket getter, not creator
+TEST_P(BucketDatabaseTest, testCreateAppropriateBucket) {
// Use min split bits when no relevant bucket exist.
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
document::BucketId(36,0x0000004d2),
doCreate(toVector(document::BucketId(58, 0x43d6c878000004d2ull)), 36,
document::BucketId(58, 0x423bf1e0000004d2ull)));
- // New bucket has bits in common with existing bucket.
- // Create bucket with min amount of bits while not being overlapping
- CPPUNIT_ASSERT_EQUAL(
+ // New bucket has bits in common with existing bucket.
+ // Create bucket with min amount of bits while not being overlapping
+ EXPECT_EQ(
document::BucketId(34,0x0000004d2),
doCreate(toVector(document::BucketId(58, 0xeaf77782000004d2)),
16,
document::BucketId(58, 0x00000000000004d2)));
- // Create sibling of existing bucket with most LSB bits in common.
- CPPUNIT_ASSERT_EQUAL(
+ // Create sibling of existing bucket with most LSB bits in common.
+ EXPECT_EQ(
document::BucketId(40, 0x0000004d2),
doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2),
document::BucketId(58, 0xeaf77782000004d2)),
16,
document::BucketId(58, 0x00000000000004d2)));
- // Create sibling of existing bucket with most LSB bits in common.
- CPPUNIT_ASSERT_EQUAL(
+ // Create sibling of existing bucket with most LSB bits in common.
+ EXPECT_EQ(
document::BucketId(25, 0x0010004d2),
doCreate(toVector(document::BucketId(16, 0x00000000000004d1),
document::BucketId(40, 0x00000000000004d2)),
16,
document::BucketId(58, 0x00000000010004d2)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
document::BucketId(36, 0x10000004000004d2),
doCreate(toVector(document::BucketId(0x8c000000000004d2),
document::BucketId(0xeb54b3ac000004d2),
@@ -423,42 +443,38 @@ BucketDatabaseTest::testCreateAppropriateBucket() {
document::BucketId(0x84000001000004d2)),
16,
document::BucketId(58, 0x1944a44000004d2)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
document::BucketId(25, 0x0010004d2),
doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2),
document::BucketId(40, 0x00000000000004d1)),
16,
document::BucketId(58,0x00000000010004d2)));
- // Test empty bucket database case. (Use min split bits)
+ // Test empty bucket database case. (Use min split bits)
std::vector<document::BucketId> buckets;
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
document::BucketId(16, 0x0000004d2ull),
doCreate(buckets, 16,
document::BucketId(58, 0x00000000010004d2)));
}
-void
-BucketDatabaseTest::testGetNext()
-{
- db().clear();
+TEST_P(BucketDatabaseTest, testGetNext) {
db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3)));
- CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 16),
- db().getNext(document::BucketId()).getBucketId());
+ EXPECT_EQ(document::BucketId(16, 16),
+ db().getNext(document::BucketId()).getBucketId());
- CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 42),
- db().getNext(document::BucketId(16, 16)).getBucketId());
+ EXPECT_EQ(document::BucketId(16, 42),
+ db().getNext(document::BucketId(16, 16)).getBucketId());
- CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 11),
- db().getNext(document::BucketId(16, 42)).getBucketId());
+ EXPECT_EQ(document::BucketId(16, 11),
+ db().getNext(document::BucketId(16, 42)).getBucketId());
}
void
BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f)
{
- db().clear();
// Tree is rooted at the LSB bit, so the following buckets are in iteration
// order based on the reverse of their "normal" bitstring:
// 0010:3
@@ -473,30 +489,28 @@ BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f)
db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(3)));
// 0000:0 (default constructed) has ubound of 0010:3
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), BucketId()));
+ EXPECT_EQ(BucketId(3, 4), f(db(), BucketId()));
// 0011:4 has ubound of 1000:3
- CPPUNIT_ASSERT_EQUAL(document::BucketId(3, 1), f(db(), BucketId(4, 12)));
+ EXPECT_EQ(document::BucketId(3, 1), f(db(), BucketId(4, 12)));
// 1000:1 has ubound of 1000:3
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), BucketId(1, 0)));
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 1), f(db(), BucketId(3, 4)));
- CPPUNIT_ASSERT_EQUAL(BucketId(4, 9), f(db(), BucketId(3, 1)));
- CPPUNIT_ASSERT_EQUAL(BucketId(5, 9), f(db(), BucketId(4, 9)));
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(5, 9)));
+ EXPECT_EQ(BucketId(3, 4), f(db(), BucketId(1, 0)));
+ EXPECT_EQ(BucketId(3, 1), f(db(), BucketId(3, 4)));
+ EXPECT_EQ(BucketId(4, 9), f(db(), BucketId(3, 1)));
+ EXPECT_EQ(BucketId(5, 9), f(db(), BucketId(4, 9)));
+ EXPECT_EQ(BucketId(3, 3), f(db(), BucketId(5, 9)));
// 100101:6 does not exist, should also return 1100:3
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(6, 41)));
+ EXPECT_EQ(BucketId(3, 3), f(db(), BucketId(6, 41)));
// Test extremes.
db().clear();
db().update(BucketDatabase::Entry(document::BucketId(8, 0), BI(2)));
db().update(BucketDatabase::Entry(document::BucketId(8, 0xff), BI(2)));
- CPPUNIT_ASSERT_EQUAL(BucketId(8, 0), f(db(), BucketId()));
- CPPUNIT_ASSERT_EQUAL(BucketId(8, 0xff), f(db(), BucketId(8, 0)));
+ EXPECT_EQ(BucketId(8, 0), f(db(), BucketId()));
+ EXPECT_EQ(BucketId(8, 0xff), f(db(), BucketId(8, 0)));
}
-void
-BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket()
-{
+TEST_P(BucketDatabaseTest, testUpperBoundReturnsNextInOrderGreaterBucket) {
doTestUpperBound([](const BucketDatabase& bucketDb,
const document::BucketId& id)
{
@@ -504,9 +518,7 @@ BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket()
});
}
-void
-BucketDatabaseTest::testGetNextReturnsUpperBoundBucket()
-{
+TEST_P(BucketDatabaseTest, testGetNextReturnsUpperBoundBucket) {
// getNext() would generally be implemented in terms of upperBound(), but
// make sure it conforms to the same contract in case this changes.
doTestUpperBound([](const BucketDatabase& bucketDb,
@@ -516,31 +528,28 @@ BucketDatabaseTest::testGetNextReturnsUpperBoundBucket()
});
}
-void
-BucketDatabaseTest::testChildCount()
-{
- db().clear();
+TEST_P(BucketDatabaseTest, testChildCount) {
// Empty tree; inserts cannot create inconsistencies.
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
+ EXPECT_EQ(0u, db().childCount(BucketId(3, 1)));
// Same bucket; cannot be inconsistent with itself.
db().update(BucketDatabase::Entry(document::BucketId(3, 1), BI(1)));
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
+ EXPECT_EQ(0u, db().childCount(BucketId(3, 1)));
// (2, 1) has one subtree.
- CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 1)));
+ EXPECT_EQ(1u, db().childCount(BucketId(2, 1)));
// Bucket exists in another subtree from (1, 1); inconsistency would
// result if we tried inserting it.
db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(2)));
- CPPUNIT_ASSERT_EQUAL(2u, db().childCount(BucketId(1, 1)));
+ EXPECT_EQ(2u, db().childCount(BucketId(1, 1)));
// Inner node with 1 subtree.
- CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 3)));
+ EXPECT_EQ(1u, db().childCount(BucketId(2, 3)));
// Leaves have no subtrees.
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 5)));
+ EXPECT_EQ(0u, db().childCount(BucketId(3, 1)));
+ EXPECT_EQ(0u, db().childCount(BucketId(3, 5)));
}
}
diff --git a/storage/src/tests/distributor/bucketdatabasetest.h b/storage/src/tests/distributor/bucketdatabasetest.h
index b7b32118fee..9ebc23bdb16 100644
--- a/storage/src/tests/distributor/bucketdatabasetest.h
+++ b/storage/src/tests/distributor/bucketdatabasetest.h
@@ -3,40 +3,13 @@
#include <vespa/storage/bucketdb/bucketdatabase.h>
#include <vespa/storage/storageutil/utils.h>
-#include <vespa/vdstestlib/cppunit/macros.h>
-#include <vespa/vespalib/util/document_runnable.h>
-#include <cppunit/extensions/HelperMacros.h>
-
-#define SETUP_DATABASE_TESTS() \
- CPPUNIT_TEST(testUpdateGetAndRemove); \
- CPPUNIT_TEST(testClear); \
- CPPUNIT_TEST(testIterating); \
- CPPUNIT_TEST(testFindParents); \
- CPPUNIT_TEST(testFindAll); \
- CPPUNIT_TEST(testCreateAppropriateBucket); \
- CPPUNIT_TEST(testGetNext); \
- CPPUNIT_TEST(testGetNextReturnsUpperBoundBucket); \
- CPPUNIT_TEST(testUpperBoundReturnsNextInOrderGreaterBucket); \
- CPPUNIT_TEST(testChildCount);
-
-namespace storage {
-namespace distributor {
-
-struct BucketDatabaseTest : public CppUnit::TestFixture {
- void setUp() override ;
-
- void testUpdateGetAndRemove();
- void testClear();
- void testIterating();
- void testFindParents();
- void testFindAll();
- void testCreateAppropriateBucket();
- void testGetNext();
- void testGetNextReturnsUpperBoundBucket();
- void testUpperBoundReturnsNextInOrderGreaterBucket();
- void testChildCount();
-
- void testBenchmark();
+#include <vespa/vespalib/gtest/gtest.h>
+#include <functional>
+
+namespace storage::distributor {
+
+struct BucketDatabaseTest : public ::testing::TestWithParam<std::shared_ptr<BucketDatabase>> {
+ void SetUp() override ;
std::string doFindParents(const std::vector<document::BucketId>& ids,
const document::BucketId& searchId);
@@ -46,9 +19,8 @@ struct BucketDatabaseTest : public CppUnit::TestFixture {
uint32_t minBits,
const document::BucketId& wantedId);
- virtual BucketDatabase& db() = 0;
+ BucketDatabase& db() noexcept { return *GetParam(); }
-private:
using UBoundFunc = std::function<
document::BucketId(const BucketDatabase&,
const document::BucketId&)>;
@@ -57,6 +29,3 @@ private:
};
}
-
-}
-
diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp
index 51797e90b3e..11c9f18b3ef 100644
--- a/storage/src/tests/distributor/bucketdbupdatertest.cpp
+++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp
@@ -15,12 +15,11 @@
#include <vespa/storage/distributor/simpleclusterinformation.h>
#include <vespa/storage/distributor/distributor.h>
#include <vespa/storage/distributor/distributor_bucket_space.h>
+#include <vespa/vespalib/gtest/gtest.h>
#include <vespa/vespalib/text/stringtokenizer.h>
#include <sstream>
#include <iomanip>
-#include <gtest/gtest.h>
-
using namespace storage::api;
using namespace storage::lib;
using document::test::makeDocumentBucket;
diff --git a/storage/src/tests/distributor/mapbucketdatabasetest.cpp b/storage/src/tests/distributor/mapbucketdatabasetest.cpp
index f2e521f1bf8..0ae4a49530e 100644
--- a/storage/src/tests/distributor/mapbucketdatabasetest.cpp
+++ b/storage/src/tests/distributor/mapbucketdatabasetest.cpp
@@ -1,21 +1,11 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vdstestlib/cppunit/macros.h>
+
#include <vespa/storage/bucketdb/mapbucketdatabase.h>
#include <tests/distributor/bucketdatabasetest.h>
-namespace storage {
-namespace distributor {
-
-struct MapBucketDatabaseTest : public BucketDatabaseTest {
- MapBucketDatabase _db;
- BucketDatabase& db() override { return _db; };
+namespace storage::distributor {
- CPPUNIT_TEST_SUITE(MapBucketDatabaseTest);
- SETUP_DATABASE_TESTS();
- CPPUNIT_TEST_SUITE_END();
-};
+INSTANTIATE_TEST_CASE_P(MapDatabase, BucketDatabaseTest,
+ ::testing::Values(std::make_shared<MapBucketDatabase>()));
-CPPUNIT_TEST_SUITE_REGISTRATION(MapBucketDatabaseTest);
-
-}
}
diff --git a/storage/src/vespa/storage/bucketdb/CMakeLists.txt b/storage/src/vespa/storage/bucketdb/CMakeLists.txt
index 8200884de17..a99d16d9f0f 100644
--- a/storage/src/vespa/storage/bucketdb/CMakeLists.txt
+++ b/storage/src/vespa/storage/bucketdb/CMakeLists.txt
@@ -1,6 +1,7 @@
# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_library(storage_bucketdb OBJECT
SOURCES
+ btree_bucket_database.cpp
bucketcopy.cpp
bucketdatabase.cpp
bucketinfo.cpp
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
new file mode 100644
index 00000000000..27c17b207ea
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
@@ -0,0 +1,405 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "btree_bucket_database.h"
+#include <vespa/searchlib/btree/btreebuilder.h>
+#include <vespa/searchlib/btree/btreenodeallocator.hpp>
+#include <vespa/searchlib/btree/btreenode.hpp>
+#include <vespa/searchlib/btree/btreenodestore.hpp>
+#include <vespa/searchlib/btree/btreeiterator.hpp>
+#include <vespa/searchlib/btree/btreeroot.hpp>
+#include <vespa/searchlib/btree/btreebuilder.hpp>
+#include <vespa/searchlib/btree/btree.hpp>
+#include <vespa/searchlib/btree/btreestore.hpp>
+#include <vespa/searchlib/datastore/array_store.hpp>
+#include <iostream>
+
+/*
+ * Buckets in our tree are represented by their 64-bit numeric key, in what's known as
+ * "reversed bit order with appended used-bits" form. I.e. a bucket ID (16, 0xcafe), which
+ * in its canonical representation has 16 (the used-bits) in its 6 MSBs and 0xcafe in its
+ * LSBs, is transformed into 0x7f53000000000010. This key is logically comprised of two parts:
+ *  - the reversed bucket ID itself (0xcafe -> 0x7f53), with all trailing zeroes for unset bits
+ * - the _non-reversed_ used-bits appended as the LSBs
+ *
+ * This particular transformation gives us keys with the following invariants:
+ * - all distinct bucket IDs map to exactly 1 key
+ * - buckets with the same ID but different used-bits are ordered in such a way that buckets
+ * with higher used-bits sort after buckets with lower used-bits
+ * - the key ordering represents an implicit in-order traversal of the binary bucket tree
+ * - consequently, all parent buckets are ordered before their child buckets
+ *
+ * The in-order traversal invariant is fundamental to many of the algorithms that operate
+ * on the bucket tree.
+ */
+
+namespace storage {
+
+using Entry = BucketDatabase::Entry;
+using search::datastore::EntryRef;
+using vespalib::ConstArrayRef;
+using document::BucketId;
+
+BTreeBucketDatabase::BTreeBucketDatabase()
+ : _tree(),
+ _store(ReplicaStore::optimizedConfigForHugePage(1023, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE,
+ 4 * 1024, 8 * 1024, 0.2)),
+ _generation_handler()
+{
+}
+
+BTreeBucketDatabase::~BTreeBucketDatabase() = default;
+
+namespace {
+
+Entry entry_from_replica_array_ref(const BucketId& id, uint32_t gc_timestamp, ConstArrayRef<BucketCopy> replicas) {
+ return Entry(id, BucketInfo(gc_timestamp, std::vector<BucketCopy>(replicas.begin(), replicas.end())));
+}
+
+EntryRef entry_ref_from_value(uint64_t value) {
+ return EntryRef(value & 0xffffffffULL);
+}
+
+uint32_t gc_timestamp_from_value(uint64_t value) {
+ return (value >> 32u);
+}
+
+uint64_t value_from(uint32_t gc_timestamp, EntryRef ref) {
+ return ((uint64_t(gc_timestamp) << 32u) | ref.ref());
+}
+
+// TODO dedupe and unify common code
+uint8_t
+getMinDiffBits(uint16_t minBits, const document::BucketId& a, const document::BucketId& b) {
+ for (uint32_t i = minBits; i <= std::min(a.getUsedBits(), b.getUsedBits()); i++) {
+ document::BucketId a1(i, a.getRawId());
+ document::BucketId b1(i, b.getRawId());
+ if (b1.getId() != a1.getId()) {
+ return i;
+ }
+ }
+ return minBits;
+}
+
+uint8_t next_parent_bit_seek_level(uint8_t minBits, const document::BucketId& a, const document::BucketId& b) {
+ const uint8_t min_used = std::min(a.getUsedBits(), b.getUsedBits());
+ assert(min_used >= minBits); // Always monotonically descending towards leaves
+ for (uint32_t i = minBits; i <= min_used; i++) {
+ document::BucketId a1(i, a.getRawId());
+ document::BucketId b1(i, b.getRawId());
+ if (b1.getId() != a1.getId()) {
+ return i;
+ }
+ }
+ // The bit prefix is equal, which means that one node is a parent of the other. In this
+ // case we have to force the seek to continue from the next level in the tree.
+ return std::max(min_used, minBits) + 1;
+}
+
+// TODO getMinDiffBits is hoisted from lockablemap.cpp, could probably be rewritten in terms of xor and MSB bit scan instr
+/*
+ * 63 -------- ... -> 0
+ * a: 1101111111 ... 0010
+ * b: 1101110010 ... 0011
+ * a ^ b: 0000001101 ... 0001
+ * ^- diff bit = 57
+ *
+ * 63 - vespalib::Optimized::msbIdx(a ^ b) ==> 6
+ *
+ * what if a == b? special case? not a problem if we can prove this never happens.
+ */
+
+}
+
+void BTreeBucketDatabase::commit_tree_changes() {
+ // TODO break up and refactor
+ // TODO verify semantics and usage
+ // TODO make BTree wrapping API which abstracts away all this stuff via reader/writer interfaces
+ _tree.getAllocator().freeze();
+
+ auto current_gen = _generation_handler.getCurrentGeneration();
+ _store.transferHoldLists(current_gen);
+ _tree.getAllocator().transferHoldLists(current_gen);
+
+ _generation_handler.incGeneration();
+
+ auto used_gen = _generation_handler.getFirstUsedGeneration();
+ _store.trimHoldLists(used_gen);
+ _tree.getAllocator().trimHoldLists(used_gen);
+}
+
+Entry BTreeBucketDatabase::entry_from_iterator(const BTree::ConstIterator& iter) const {
+ if (!iter.valid()) {
+ return Entry::createInvalid();
+ }
+ const auto value = iter.getData();
+ const auto replicas_ref = _store.get(entry_ref_from_value(value));
+ const auto bucket = BucketId(BucketId::keyToBucketId(iter.getKey()));
+ return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref);
+}
+
+BucketId BTreeBucketDatabase::bucket_from_valid_iterator(const BTree::ConstIterator& iter) const {
+ return BucketId(BucketId::keyToBucketId(iter.getKey()));
+}
+
+Entry BTreeBucketDatabase::get(const BucketId& bucket) const {
+ return entry_from_iterator(_tree.find(bucket.toKey()));
+}
+
+void BTreeBucketDatabase::remove(const BucketId& bucket) {
+ auto iter = _tree.find(bucket.toKey());
+ if (!iter.valid()) {
+ return;
+ }
+ const auto value = iter.getData();
+ _store.remove(entry_ref_from_value(value));
+ _tree.remove(iter);
+ commit_tree_changes();
+}
+
+/*
+ * Finding the complete set of parents of a given bucket is not obvious how to
+ * do efficiently, as we only know that the parents are ordered before their
+ * children, but we do not a-priori know if any exist at all. The Judy DB impl
+ * does O(b) explicit point lookups (where b is the number of used bits in the
+ * bucket), starting at the leaf bit and working towards the root. To avoid
+ * having to re-create iterators and perform a full tree search every time, we
+ * turn this on its head and start from the root, progressing towards the leaf.
+ * This allows us to reuse a single iterator and to continue seeking forwards
+ * from its current position.
+ *
+ * Algorithm:
+ *
+ * Core invariant: every subsequent iterator seek performed in this algorithm
+ * is for a key that is strictly higher than the one the iterator is currently at.
+ *
+ * 1. Lbound seek to the lowest key that is known to exclude all already visited
+ * parents. On the first iteration this is zero, i.e. the first in-order node.
+ * 2. If the current node's key is greater than that of the requested bucket's key,
+ * we've either descended to--or beyond--it in its own subtree or we've entered
+ * a disjoint subtree. Since we know that all parents must sort before any given
+ * child bucket, no more parents may be found at this point. Algorithm terminates.
+ * 3. As the main body of the loop is entered, we know one of following must hold:
+ * 3.1 The current node is an explicitly present parent of our bucket.
+ * 3.2 The current node is contained in a left subtree branch of a parent that
+ * does not have a bucket explicitly present in the tree. It cannot be in
+ * a right subtree of any parent, as that would imply the node is ordered
+ * _after_ our own bucket in an in-order traversal, which would contradict
+ * the check in step 2 above.
+ * 4. If the current node contains the requested bucket, we're at a parent
+ * node of the bucket; add it to the result set.
+ * If this is _not_ the case, we're in a different subtree. Example: the
+ * requested bucket has a key whose MSB is 1 but the first bucket in the
+ * tree has a key with an MSB of 0. Either way we need to update our search
+ * key to home in on the target subtree where more parents may be found;
+ * 5. Update the seek key to find the next possible parent. To ensure this key is
+ * strictly greater than the iterator's current key we find the largest shared
+ * prefix of bits in common between the current node's key and the requested
+ * bucket's key. The prefix length + 1 is then the depth in the tree at which the
+ * two subtrees branch off and diverge.
+ * The new key is then built from the (prefix length + 1) MSBs of the requested bucket's
+ * key, with a matching number of used-bits set. Forward lbound-seek the iterator to this key.
+ * `--> TODO elaborate on prefix semantics when they are equal wrt. min used bits
+ * 6. Iff iterator is still valid, go to step 2
+ *
+ * This algorithm is able to skip through large parts of a sparsely populated tree, but
+ * the number of seeks will trend towards O(b) as with the legacy implementation when a
+ * tree is densely populated. This is because all logical inner nodes in the tree will
+ * have subtrees under them. Even in the worst case we should be more efficient than the
+ * legacy implementation, since we've cut any dense search space in half for each invocation
+ * of seek() on the iterator.
+ *
+ * TODO use min-aggregation to infer used bit range in the tree. This should allow us to
+ * skip scanning through many disjoint subtrees.
+ */
+BTreeBucketDatabase::BTree::ConstIterator
+BTreeBucketDatabase::find_parents_internal(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const
+{
+ const uint64_t bucket_key = bucket.toKey();
+ uint64_t parent_key = 0;
+ auto iter = _tree.begin();
+ // Start at the root level, descending towards the bucket itself.
+ // Try skipping as many levels of the tree as possible as we go.
+ uint32_t bits = 1;
+ while (iter.valid() && (iter.getKey() < bucket_key)) {
+ auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
+ if (candidate.contains(bucket)) {
+ assert(candidate.getUsedBits() >= bits);
+ entries.emplace_back(entry_from_iterator(iter));
+ }
+ bits = next_parent_bit_seek_level(bits, candidate, bucket);
+ parent_key = BucketId(bits, bucket.getRawId()).toKey();
+ assert(parent_key > iter.getKey());
+ iter.seek(parent_key);
+ }
+ return iter; // FIXME clang warns here due to copying...
+}
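A worked trace of the seek sequence (keys shown as produced by the sketch near the top of this file; the bucket set mirrors the new testFindParents case where (1, 0x0), (1, 0x1) and (2, 0x1) are in the DB and the query bucket is (16, 0x1)):

    // In-order keys in the tree:
    //   (1, 0x0) -> 0x0000000000000001
    //   (1, 0x1) -> 0x8000000000000001
    //   (2, 0x1) -> 0x8000000000000002
    // Query key: (16, 0x1) -> 0x8000000000000010
    //
    // begin()               -> (1, 0x0): disjoint subtree, not a parent; seek level stays 1
    // seek(key of (1, 0x1)) -> (1, 0x1): contains (16, 0x1), emitted;   seek level becomes 2
    // seek(key of (2, 0x1)) -> (2, 0x1): contains (16, 0x1), emitted;   seek level becomes 3
    // seek(key of (3, 0x1)) -> past the last key; iterator invalid, loop terminates
    //
    // Result: parents {(1, 0x1), (2, 0x1)}, i.e. the "1,2" expected by the test.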
+
+/*
+ * Note: due to legacy API reasons, iff the requested bucket itself exists in the
+ * tree, it will be returned in the result set. I.e. it returns all the nodes on
+ * the path from _and including_ itself towards the root.
+ */
+void BTreeBucketDatabase::getParents(const BucketId& bucket,
+ std::vector<Entry>& entries) const
+{
+ auto iter = find_parents_internal(bucket, entries);
+ if (iter.valid() && iter.getKey() == bucket.toKey()) {
+ entries.emplace_back(entry_from_iterator(iter));
+ }
+}
+
+void BTreeBucketDatabase::getAll(const BucketId& bucket,
+ std::vector<Entry>& entries) const
+{
+ auto iter = find_parents_internal(bucket, entries);
+ // `iter` is already pointing at, or beyond, one of the bucket's subtrees.
+ for (; iter.valid(); ++iter) {
+ auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
+ if (bucket.contains(candidate)) {
+ entries.emplace_back(entry_from_iterator(iter));
+ } else {
+ break;
+ }
+ }
+}
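A short note on the resulting getAll() semantics, mirroring the inconsistent-split case in testFindAll above:

    // With (16, 0x00001), (17, 0x00001) and (17, 0x10001) in the DB,
    // getAll(BucketId(16, 0x00001), entries) returns all three entries.
    // The parent scan contributes nothing here (there are no buckets strictly above
    // (16, 0x00001)); the bucket itself and both of its children are picked up by the
    // forward subtree scan, since bucket.contains(candidate) holds for each of them.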
+
+void BTreeBucketDatabase::update(const Entry& newEntry) {
+ assert(newEntry.valid());
+ auto replicas_ref = _store.add(newEntry.getBucketInfo().getRawNodes());
+ const auto new_value = value_from(newEntry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref);
+ const auto bucket_key = newEntry.getBucketId().toKey();
+ auto iter = _tree.lowerBound(bucket_key);
+ if (iter.valid() && (iter.getKey() == bucket_key)) {
+ _store.remove(entry_ref_from_value(iter.getData()));
+ // In-place update of value; does not require tree structure modification
+ std::atomic_thread_fence(std::memory_order_release); // Must ensure visibility when new array ref is observed
+ iter.writeData(new_value);
+ } else {
+ _tree.insert(iter, bucket_key, new_value);
+ }
+ commit_tree_changes(); // TODO does publishing a new root imply an implicit memory fence?
+}
+
+// TODO need snapshot read with guarding
+// FIXME semantics of for-each in judy and bit tree DBs differ, former expects lbound, latter ubound..!
+// FIXME but bit-tree code says "lowerBound" in impl and "after" in declaration???
+void BTreeBucketDatabase::forEach(EntryProcessor& proc, const BucketId& after) const {
+ for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) {
+ if (!proc.process(entry_from_iterator(iter))) {
+ break;
+ }
+ }
+}
+
+void BTreeBucketDatabase::forEach(MutableEntryProcessor& proc, const BucketId& after) {
+ for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) {
+ // FIXME this is a horrible API which currently has to update every time since we don't
+ // know from the function call itself whether something was in fact updated behind the scenes..!
+ auto entry = entry_from_iterator(iter);
+ bool should_continue = proc.process(entry);
+ // TODO optimize this joyful mess :D
+ update(entry);
+ if (!should_continue) {
+ break;
+ }
+ }
+ //commit_tree_changes(); TODO should be done as bulk op!
+}
+
+Entry BTreeBucketDatabase::upperBound(const BucketId& bucket) const {
+ return entry_from_iterator(_tree.upperBound(bucket.toKey()));
+
+}
+
+uint64_t BTreeBucketDatabase::size() const {
+ return _tree.size();
+}
+
+void BTreeBucketDatabase::clear() {
+ _tree.clear();
+ commit_tree_changes();
+}
+
+/*
+ * Returns the bucket ID which, based on the buckets already existing in the DB,
+ * is the most specific location in the tree in which it should reside. This may
+ * or may not be a bucket that already exists.
+ *
+ * Example: if there is a single bucket (1, 1) in the tree, a query for (1, 1) or
+ * (1, 3) will return (1, 1) as that is the most specific leaf in that subtree.
+ * A query for (1, 0) will return (1, 0) even though this doesn't currently exist,
+ * as there is no existing bucket that can contain the queried bucket. It is up to
+ * the caller to create this bucket according to its needs.
+ *
+ * Usually this function will be called with an ID whose used-bits is at max (58), in
+ * order to find a leaf bucket to route an incoming document operation to.
+ *
+ * TODO rename this function, it's very much _not_ obvious what an "appropriate" bucket is..!
+ * TODO this should be possible to do concurrently
+ */
+BucketId BTreeBucketDatabase::getAppropriateBucket(uint16_t minBits, const BucketId& bid) {
+ // The bucket tree is ordered in such a way that it represents a
+ // natural in-order traversal of all buckets, with inner nodes being
+ // visited before leaf nodes. This means that a lower bound seek will
+ // never return a parent of a seeked bucket. The iterator will be pointing
+ // to a bucket that is either the actual bucket given as the argument to
+ // lowerBound() or the next in-order bucket (or end() if none exists).
+ auto iter = _tree.lowerBound(bid.toKey());
+ if (iter.valid()) {
+ // Find the first level in the tree where the paths through the bucket tree
+ // diverge for the target bucket and the current bucket.
+ minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid);
+ }
+ // TODO is it better to copy original iterator and do begin() on the copy?
+ auto first_iter = _tree.begin();
+ // Original iterator might be in a different subtree than that of our
+ // target bucket. If possible, rewind one node to discover any parent or
+ // leftmost sibling of our node. If there's no such node, we'll still
+ // discover the greatest equal bit prefix.
+ if (iter != first_iter) {
+ --iter;
+ minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid);
+ }
+ return BucketId(minBits, bid.getRawId());
+}
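For a concrete example of this behaviour, taken from the existing testCreateAppropriateBucket case in bucketdatabasetest.cpp:

    // With only (58, 0x43d6c878000004d2) in the DB and minBits = 36, querying for
    // (58, 0x423bf1e0000004d2) yields (36, 0x0000004d2): the two raw IDs already
    // differ within their first 36 bits (at bit 35), so the minimum split depth of
    // 36 is enough to keep the returned bucket disjoint from the stored one.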
+
+/*
+ * Enumerate the number of child subtrees under `bucket`. The value returned is in the
+ * range [0, 2] regardless of how many subtrees are present further down in the tree.
+ *
+ * Finding this number is reasonably straightforward; we construct two buckets that
+ * represent the key ranges for the left and right subtrees under `bucket` and check
+ * if there are any ranges in the tree's keyspace that are contained in these.
+ */
+// TODO rename/clarify to indicate this is child _subtrees_, not explicit child _buckets_!
+uint32_t BTreeBucketDatabase::childCount(const BucketId& bucket) const {
+ assert(bucket.getUsedBits() < BucketId::maxNumBits);
+ BucketId lhs_bucket(bucket.getUsedBits() + 1, bucket.getId());
+ BucketId rhs_bucket(bucket.getUsedBits() + 1, (1ULL << bucket.getUsedBits()) | bucket.getId());
+
+ auto iter = _tree.lowerBound(lhs_bucket.toKey());
+ if (!iter.valid()) {
+ return 0;
+ }
+ if (lhs_bucket.contains(bucket_from_valid_iterator(iter))) {
+ iter.seek(rhs_bucket.toKey());
+ if (!iter.valid()) {
+ return 1; // lhs subtree only
+ }
+ return (rhs_bucket.contains(bucket_from_valid_iterator(iter)) ? 2 : 1);
+ } else if (rhs_bucket.contains(bucket_from_valid_iterator(iter))) {
+ return 1; // rhs subtree only
+ }
+ return 0;
+}
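A concrete example, mirroring testChildCount in bucketdatabasetest.cpp:

    // With (3, 0x1) and (3, 0x3) in the DB, childCount(BucketId(1, 0x1)) == 2:
    // the left subtree probe (2, 0x1) matches (3, 0x1), and the right subtree
    // probe (2, 0x3) matches (3, 0x3).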
+
+void BTreeBucketDatabase::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ out << "BTreeBucketDatabase(" << size() << " buckets)";
+ (void)verbose;
+ (void)indent;
+}
+
+}
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
new file mode 100644
index 00000000000..91d3a1cde5b
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
@@ -0,0 +1,70 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "bucketdatabase.h"
+#include <vespa/searchlib/btree/btree.h>
+#include <vespa/searchlib/datastore/array_store.h>
+
+namespace storage {
+
+/*
+ * Bucket database implementation built around lock-free single-writer/multiple-readers B+tree.
+ *
+ * Since a distributor must be able to handle multiple replicas for a given bucket, these
+ * are handled via an ArrayStore indirection. A distributor bucket DB also includes state
+ * for the _entire bucket_ itself, not just the replicas; last timestamp of bucket GC. Since
+ * this is an uint32 we cheekily mangle it into the value, i.e. each bucket key maps to a
+ * composite key of (gc_timestamp_u32 << 32) | array_ref_u32.
+ *
+ * This is _not_ yet production ready, for several reasons:
+ * - Underlying ArrayStore does not have freelists enabled for replica entry reuse
+ * - Current API design for mutable forEach requires O(n) tree structure mutations instead
+ * of changing the tree in bulk and reusing ArrayStore refs et al. Needs a redesign.
+ *
+ * Also note that the DB is currently _not_ thread safe, as read snapshotting is not yet defined
+ * or exposed.
+ */
+// TODO create and use a new DB interface with better bulk loading, snapshot and iteration support
+class BTreeBucketDatabase : public BucketDatabase {
+ // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref>
+ using BTree = search::btree::BTree<uint64_t, uint64_t>;
+ using ReplicaStore = search::datastore::ArrayStore<BucketCopy>;
+ using GenerationHandler = vespalib::GenerationHandler;
+
+ BTree _tree;
+ ReplicaStore _store;
+ GenerationHandler _generation_handler;
+public:
+ BTreeBucketDatabase();
+ ~BTreeBucketDatabase() override;
+
+ // Ye olde bucket DB API:
+ Entry get(const document::BucketId& bucket) const override;
+ void remove(const document::BucketId& bucket) override;
+ void getParents(const document::BucketId& childBucket,
+ std::vector<Entry>& entries) const override;
+ void getAll(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const override;
+ void update(const Entry& newEntry) override;
+ void forEach(EntryProcessor&, const document::BucketId& after) const override;
+ void forEach(MutableEntryProcessor&, const document::BucketId& after) override;
+ Entry upperBound(const document::BucketId& value) const override;
+ uint64_t size() const override;
+ void clear() override;
+ document::BucketId getAppropriateBucket(
+ uint16_t minBits,
+ const document::BucketId& bid) override;
+ uint32_t childCount(const document::BucketId&) const override;
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const override;
+
+private:
+ Entry entry_from_iterator(const BTree::ConstIterator& iter) const;
+ document::BucketId bucket_from_valid_iterator(const BTree::ConstIterator& iter) const;
+ void commit_tree_changes();
+ BTree::ConstIterator find_parents_internal(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const;
+};
+
+}
diff --git a/storage/src/vespa/storage/bucketdb/bucketdatabase.h b/storage/src/vespa/storage/bucketdb/bucketdatabase.h
index d939ac48a03..a5a70c488ca 100644
--- a/storage/src/vespa/storage/bucketdb/bucketdatabase.h
+++ b/storage/src/vespa/storage/bucketdb/bucketdatabase.h
@@ -19,8 +19,8 @@ public:
public:
Entry() : _bucketId(0) {} // Invalid entry
- Entry(const document::BucketId& bId, const BucketInfo& bucketInfo)
- : _bucketId(bId), _info(bucketInfo) {}
+ Entry(const document::BucketId& bId, BucketInfo bucketInfo)
+ : _bucketId(bId), _info(std::move(bucketInfo)) {}
explicit Entry(const document::BucketId& bId) : _bucketId(bId) {}
bool operator==(const Entry& other) const {
@@ -54,7 +54,7 @@ public:
virtual void remove(const document::BucketId& bucket) = 0;
/**
- * Puts all entries that are can contain the given bucket id
+ * Puts all entries that may contain the given bucket id
* into the given entry vector, including itself if found.
*/
virtual void getParents(const document::BucketId& childBucket,
diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.cpp b/storage/src/vespa/storage/bucketdb/bucketinfo.cpp
index bbc59de410a..82fc6686c5f 100644
--- a/storage/src/vespa/storage/bucketdb/bucketinfo.cpp
+++ b/storage/src/vespa/storage/bucketdb/bucketinfo.cpp
@@ -6,10 +6,21 @@
namespace storage {
BucketInfo::BucketInfo()
- : _lastGarbageCollection(0)
-{ }
+ : _lastGarbageCollection(0),
+ _nodes()
+{}
-BucketInfo::~BucketInfo() { }
+BucketInfo::BucketInfo(uint32_t lastGarbageCollection, std::vector<BucketCopy> nodes)
+ : _lastGarbageCollection(lastGarbageCollection),
+ _nodes(std::move(nodes))
+{}
+
+BucketInfo::~BucketInfo() = default;
+
+BucketInfo::BucketInfo(const BucketInfo&) = default;
+BucketInfo& BucketInfo::operator=(const BucketInfo&) = default;
+BucketInfo::BucketInfo(BucketInfo&&) noexcept = default;
+BucketInfo& BucketInfo::operator=(BucketInfo&&) noexcept = default;
std::string
BucketInfo::toString() const {
diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.h b/storage/src/vespa/storage/bucketdb/bucketinfo.h
index ace82561b2a..8949c0f10da 100644
--- a/storage/src/vespa/storage/bucketdb/bucketinfo.h
+++ b/storage/src/vespa/storage/bucketdb/bucketinfo.h
@@ -22,8 +22,14 @@ private:
public:
BucketInfo();
+ BucketInfo(uint32_t lastGarbageCollection, std::vector<BucketCopy> nodes);
~BucketInfo();
+ BucketInfo(const BucketInfo&);
+ BucketInfo& operator=(const BucketInfo&);
+ BucketInfo(BucketInfo&&) noexcept;
+ BucketInfo& operator=(BucketInfo&&) noexcept;
+
/**
* @return Returns the last time when this bucket was "garbage collected".
*/
@@ -136,6 +142,10 @@ public:
return _nodes[idx];
}
+ const std::vector<BucketCopy>& getRawNodes() const noexcept {
+ return _nodes;
+ }
+
void clearTrusted(uint16_t nodeIdx) {
getNodeInternal(nodeIdx)->clearTrusted();
}
diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.h b/storage/src/vespa/storage/bucketdb/lockablemap.h
index 89be92d5921..2b2c2cbe7a8 100644
--- a/storage/src/vespa/storage/bucketdb/lockablemap.h
+++ b/storage/src/vespa/storage/bucketdb/lockablemap.h
@@ -305,7 +305,7 @@ private:
* Find the given list of keys in the map and add them to the map of
* results, locking them in the process.
*/
- void addAndLockResults(const std::vector<BucketId::Type> keys,
+ void addAndLockResults(const std::vector<BucketId::Type>& keys,
const char* clientId,
std::map<BucketId, WrappedEntry>& results,
std::unique_lock<std::mutex> &guard);
diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.hpp b/storage/src/vespa/storage/bucketdb/lockablemap.hpp
index 6d700cd6049..440a76b92fd 100644
--- a/storage/src/vespa/storage/bucketdb/lockablemap.hpp
+++ b/storage/src/vespa/storage/bucketdb/lockablemap.hpp
@@ -549,7 +549,7 @@ LockableMap<Map>::getAllContaining(const BucketId& bucket,
template<typename Map>
void
LockableMap<Map>::addAndLockResults(
- const std::vector<BucketId::Type> keys,
+ const std::vector<BucketId::Type>& keys,
const char* clientId,
std::map<BucketId, WrappedEntry>& results,
std::unique_lock<std::mutex> &guard)
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
index db2842b70f2..d5c16bc7f5b 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
@@ -822,7 +822,7 @@ BucketDBUpdater::NodeRemover::setCopiesInEntry(
e->addNodes(copies, order);
- LOG(debug, "Changed %s", e->toString().c_str());
+ LOG(spam, "Changed %s", e->toString().c_str());
LOG_BUCKET_OPERATION_NO_LOCK(
e.getBucketId(),
vespalib::make_string("updated bucketdb entry to %s",
@@ -834,7 +834,7 @@ BucketDBUpdater::NodeRemover::removeEmptyBucket(const document::BucketId& bucket
{
_removedBuckets.push_back(bucketId);
- LOG(debug,
+ LOG(spam,
"After system state change %s, bucket %s now has no copies.",
_oldState.getTextualDifference(_state).c_str(),
bucketId.toString().c_str());
diff --git a/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp b/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp
index dcf7792860e..f013ce43048 100644
--- a/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp
+++ b/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp
@@ -1,13 +1,24 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "distributor_bucket_space.h"
+#include <vespa/storage/bucketdb/mapbucketdatabase.h>
+#include <vespa/storage/bucketdb/btree_bucket_database.h>
#include <vespa/vdslib/state/clusterstate.h>
#include <vespa/vdslib/distribution/distribution.h>
namespace storage::distributor {
+namespace {
+
+std::unique_ptr<BucketDatabase> make_default_db_impl() {
+ //return std::make_unique<BTreeBucketDatabase>();
+ return std::make_unique<MapBucketDatabase>();
+}
+
+}
+
DistributorBucketSpace::DistributorBucketSpace()
- : _bucketDatabase(),
+ : _bucketDatabase(make_default_db_impl()),
_clusterState(),
_distribution()
{
diff --git a/storage/src/vespa/storage/distributor/distributor_bucket_space.h b/storage/src/vespa/storage/distributor/distributor_bucket_space.h
index 23d00c6ae43..effb0dc3e17 100644
--- a/storage/src/vespa/storage/distributor/distributor_bucket_space.h
+++ b/storage/src/vespa/storage/distributor/distributor_bucket_space.h
@@ -1,9 +1,12 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
-#include <vespa/storage/bucketdb/mapbucketdatabase.h>
#include <memory>
+namespace storage {
+class BucketDatabase;
+}
+
namespace storage::lib {
class ClusterState;
class Distribution;
@@ -23,7 +26,7 @@ namespace storage::distributor {
* bucket spaces.
*/
class DistributorBucketSpace {
- MapBucketDatabase _bucketDatabase;
+ std::unique_ptr<BucketDatabase> _bucketDatabase;
std::shared_ptr<const lib::ClusterState> _clusterState;
std::shared_ptr<const lib::Distribution> _distribution;
public:
@@ -36,10 +39,10 @@ public:
DistributorBucketSpace& operator=(DistributorBucketSpace&&) = delete;
BucketDatabase& getBucketDatabase() noexcept {
- return _bucketDatabase;
+ return *_bucketDatabase;
}
const BucketDatabase& getBucketDatabase() const noexcept {
- return _bucketDatabase;
+ return *_bucketDatabase;
}
void setClusterState(std::shared_ptr<const lib::ClusterState> clusterState);
diff --git a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
index c295be19a0b..4208e8b33f9 100644
--- a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
+++ b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
@@ -49,9 +49,7 @@ PendingBucketSpaceDbTransition::PendingBucketSpaceDbTransition(const PendingClus
}
}
-PendingBucketSpaceDbTransition::~PendingBucketSpaceDbTransition()
-{
-}
+PendingBucketSpaceDbTransition::~PendingBucketSpaceDbTransition() = default;
PendingBucketSpaceDbTransition::Range
PendingBucketSpaceDbTransition::skipAllForSameBucket()