diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-05-13 10:31:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-05-13 10:31:30 +0200 |
commit | 555008f9bbb282a5f620a3d4e63e757c08ed883d (patch) | |
tree | 76339729385d16dc854c77acd476850003545b3d /storage | |
parent | 866ce6e27e52c8e4ddb34d4cb8e9879c5dfd0a11 (diff) | |
parent | aa9fa2f49f3a61d2978b9748712f3cadd902fd7b (diff) |
Merge pull request #9341 from vespa-engine/vekterli/add-distributor-btree-bucket-database-foundations
Add initial B+tree distributor bucket database
Diffstat (limited to 'storage')
19 files changed, 670 insertions, 180 deletions
diff --git a/storage/CMakeLists.txt b/storage/CMakeLists.txt index f77c11eb350..6b697716e1d 100644 --- a/storage/CMakeLists.txt +++ b/storage/CMakeLists.txt @@ -16,6 +16,7 @@ vespa_define_module( vdslib persistence storageframework + searchlib EXTERNAL_DEPENDS Judy diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt index f7a65000f6c..fdccf9b1394 100644 --- a/storage/src/tests/distributor/CMakeLists.txt +++ b/storage/src/tests/distributor/CMakeLists.txt @@ -2,7 +2,6 @@ vespa_add_library(storage_testdistributor TEST SOURCES blockingoperationstartertest.cpp - bucketdatabasetest.cpp bucketdbmetricupdatertest.cpp bucketgctimecalculatortest.cpp bucketstateoperationtest.cpp @@ -15,7 +14,6 @@ vespa_add_library(storage_testdistributor TEST idealstatemanagertest.cpp joinbuckettest.cpp maintenanceschedulertest.cpp - mapbucketdatabasetest.cpp mergelimitertest.cpp mergeoperationtest.cpp messagesenderstub.cpp @@ -48,7 +46,10 @@ vespa_add_library(storage_testdistributor TEST vespa_add_library(storage_gtestdistributor TEST SOURCES + btree_bucket_database_test.cpp + bucketdatabasetest.cpp bucketdbupdatertest.cpp + mapbucketdatabasetest.cpp # Fixture etc. dupes with non-gtest runner : distributortestutil.cpp bucket_db_prune_elision_test.cpp diff --git a/storage/src/tests/distributor/btree_bucket_database_test.cpp b/storage/src/tests/distributor/btree_bucket_database_test.cpp new file mode 100644 index 00000000000..c253a758f98 --- /dev/null +++ b/storage/src/tests/distributor/btree_bucket_database_test.cpp @@ -0,0 +1,12 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/storage/bucketdb/btree_bucket_database.h> +#include <tests/distributor/bucketdatabasetest.h> + +namespace storage::distributor { + +INSTANTIATE_TEST_CASE_P(BTreeDatabase, BucketDatabaseTest, + ::testing::Values(std::make_shared<BTreeBucketDatabase>())); + +} + diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp index 55106ce5c28..92e0c534e31 100644 --- a/storage/src/tests/distributor/bucketdatabasetest.cpp +++ b/storage/src/tests/distributor/bucketdatabasetest.cpp @@ -1,6 +1,5 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "bucketdatabasetest.h" -#include <vespa/storageframework/defaultimplementation/clock/realclock.h> #include <iomanip> #include <algorithm> @@ -8,9 +7,7 @@ namespace storage::distributor { using document::BucketId; -void -BucketDatabaseTest::setUp() -{ +void BucketDatabaseTest::SetUp() { db().clear(); } @@ -26,44 +23,42 @@ namespace { } } -void -BucketDatabaseTest::testClear() { +TEST_P(BucketDatabaseTest, testClear) { db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1))); db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2))); db().clear(); - CPPUNIT_ASSERT_EQUAL(uint64_t(0), db().size()); + EXPECT_EQ(uint64_t(0), db().size()); } -void -BucketDatabaseTest::testUpdateGetAndRemove() { +TEST_P(BucketDatabaseTest, testUpdateGetAndRemove) { // Do some insertions - CPPUNIT_ASSERT_EQUAL(0, (int)db().size()); + EXPECT_EQ(0, db().size()); db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1))); db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2))); db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3))); - CPPUNIT_ASSERT_EQUAL(3, (int)db().size()); + EXPECT_EQ(3, db().size()); db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(4))); - CPPUNIT_ASSERT_EQUAL(3, (int)db().size()); + EXPECT_EQ(3, 
db().size()); // Access some elements - CPPUNIT_ASSERT_EQUAL(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo()); - CPPUNIT_ASSERT_EQUAL(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo()); - CPPUNIT_ASSERT_EQUAL(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo()); + EXPECT_EQ(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo()); + EXPECT_EQ(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo()); + EXPECT_EQ(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo()); // Do removes db().remove(document::BucketId(16, 12)); - CPPUNIT_ASSERT_EQUAL(3, (int)db().size()); + EXPECT_EQ(3, db().size()); db().remove(document::BucketId(16, 11)); - CPPUNIT_ASSERT_EQUAL(2, (int)db().size()); + EXPECT_EQ(2, db().size()); db().remove(document::BucketId(16, 16)); db().remove(document::BucketId(16, 42)); - CPPUNIT_ASSERT_EQUAL(0, (int)db().size()); + EXPECT_EQ(0, db().size()); } namespace { @@ -83,8 +78,7 @@ struct ModifyProcessor : public BucketDatabase::MutableEntryProcessor } }; -struct ListAllProcessor : public BucketDatabase::EntryProcessor -{ +struct ListAllProcessor : public BucketDatabase::EntryProcessor { std::ostringstream ost; bool process(const BucketDatabase::Entry& e) override { @@ -93,8 +87,7 @@ struct ListAllProcessor : public BucketDatabase::EntryProcessor } }; -struct DummyProcessor : public BucketDatabase::EntryProcessor -{ +struct DummyProcessor : public BucketDatabase::EntryProcessor { std::ostringstream ost; bool process(const BucketDatabase::Entry&) override { @@ -103,8 +96,7 @@ struct DummyProcessor : public BucketDatabase::EntryProcessor }; -struct StoppingProcessor : public BucketDatabase::EntryProcessor -{ +struct StoppingProcessor : public BucketDatabase::EntryProcessor { std::ostringstream ost; bool process(const BucketDatabase::Entry& e) override { @@ -120,8 +112,7 @@ struct StoppingProcessor : public BucketDatabase::EntryProcessor } -void -BucketDatabaseTest::testIterating() { +TEST_P(BucketDatabaseTest, 
testIterating) { // Do some insertions db().update(BucketDatabase::Entry(document::BucketId(16, 0x10), BI(1))); db().update(BucketDatabase::Entry(document::BucketId(16, 0x0b), BI(2))); @@ -131,7 +122,7 @@ BucketDatabaseTest::testIterating() { ListAllProcessor proc; db().forEach(proc, document::BucketId()); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string( "BucketId(0x4000000000000010) : " "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n" @@ -146,7 +137,7 @@ BucketDatabaseTest::testIterating() { ListAllProcessor proc; db().forEach(proc, document::BucketId(16, 0x2a)); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string( "BucketId(0x400000000000000b) : " "node(idx=2,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"), @@ -157,7 +148,7 @@ BucketDatabaseTest::testIterating() { StoppingProcessor proc; db().forEach(proc, document::BucketId()); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string( "BucketId(0x4000000000000010) : " "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n" @@ -173,7 +164,7 @@ BucketDatabaseTest::testIterating() { ListAllProcessor proc; db().forEach(proc); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string( "BucketId(0x4000000000000010) : " "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n" @@ -213,18 +204,33 @@ BucketDatabaseTest::doFindParents(const std::vector<document::BucketId>& ids, return ost.str(); } -void -BucketDatabaseTest::testFindParents() { +TEST_P(BucketDatabaseTest, testFindParents) { // test what parents in the DB (specified in vector) are parents of the // specified bucket. Result is a list of indexes into the vector. - CPPUNIT_ASSERT_EQUAL( + + // The way the legacy API works is that a bucket is considered as being in + // the set of its parents... This is rather weird, but at least explicitly + // test that it is so for now to avoid breaking the world. 
+ EXPECT_EQ( + std::string("0"), + doFindParents(toVector(document::BucketId(17, 0xcafe)), + document::BucketId(17, 0xcafe))); + + EXPECT_EQ( + std::string("1,2"), + doFindParents(toVector(document::BucketId(1, 0x0), + document::BucketId(1, 0x1), + document::BucketId(2, 0x1)), + document::BucketId(16, 0x1))); + + EXPECT_EQ( std::string("2"), doFindParents(toVector(document::BucketId(17, 0x0ffff), document::BucketId(18, 0x1ffff), document::BucketId(18, 0x3ffff)), document::BucketId(22, 0xfffff))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("0,2,3"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff), @@ -232,7 +238,15 @@ BucketDatabaseTest::testFindParents() { document::BucketId(19, 0xfffff)), document::BucketId(22, 0xfffff))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( + std::string("0,1,2,3"), + doFindParents(toVector(document::BucketId(16, 0x0ffff), + document::BucketId(17, 0x0ffff), + document::BucketId(18, 0x0ffff), + document::BucketId(19, 0x0ffff)), + document::BucketId(20, 0x0ffff))); + + EXPECT_EQ( std::string("0,2,3"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff), @@ -240,20 +254,20 @@ BucketDatabaseTest::testFindParents() { document::BucketId(18, 0x1ffff)), document::BucketId(22, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("0"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff)), document::BucketId(22, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( // ticket 3121525 + EXPECT_EQ( // ticket 3121525 std::string("0"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff), document::BucketId(19, 0x1ffff)), document::BucketId(18, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( // ticket 3121525 + EXPECT_EQ( // ticket 3121525 std::string("0"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff), @@ -276,8 +290,11 @@ BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& 
ids, std::ostringstream ost; for (uint32_t i = 0; i < ids.size(); ++i) { - if (std::find(entries.begin(), entries.end(), - BucketDatabase::Entry(ids[i], BI(i))) != entries.end()) { + auto wanted = BucketDatabase::Entry(ids[i], BI(i)); + for (const auto& e : entries) { + if (!(e == wanted)) { + continue; + } if (!ost.str().empty()) { ost << ","; } @@ -288,11 +305,9 @@ BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& ids, return ost.str(); } -void -BucketDatabaseTest::testFindAll() -{ +TEST_P(BucketDatabaseTest, testFindAll) { std::vector<document::BucketId> buckets; - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string(""), doFindAll(buckets, document::BucketId(18, 0x1ffff))); @@ -306,15 +321,20 @@ BucketDatabaseTest::testFindAll() buckets.push_back(document::BucketId(20, 0xceaaa)); buckets.push_back(document::BucketId(17, 0x1ffff)); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( + std::string("0"), + doFindAll(toVector(document::BucketId(16, 1234)), + document::BucketId(16, 1234))); + + EXPECT_EQ( std::string("0,4,5,6"), doFindAll(buckets, document::BucketId(17, 0x1aaaa))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("8"), doFindAll(buckets, document::BucketId(16, 0xffff))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("0,1"), doFindAll(toVector(document::BucketId(17, 0x00001), document::BucketId(17, 0x10001)), @@ -322,7 +342,7 @@ BucketDatabaseTest::testFindAll() document::BucketId id(33, 0x1053c7089); // Bit 32 is set, but unused. 
id.setUsedBits(32); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("1,2"), doFindAll(toVector(document::BucketId(24, 0x000dc7089), document::BucketId(33, 0x0053c7089), @@ -330,7 +350,7 @@ BucketDatabaseTest::testFindAll() document::BucketId(24, 0x000bc7089)), id)); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split + EXPECT_EQ( // Inconsistent split std::string("0,1,2"), doFindAll(toVector( document::BucketId(16, 0x00001), // contains 2-3 @@ -338,7 +358,7 @@ BucketDatabaseTest::testFindAll() document::BucketId(17, 0x10001)), document::BucketId(16, 0x00001))); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split + EXPECT_EQ( // Inconsistent split std::string("1,2"), doFindAll(toVector( document::BucketId(17, 0x10000), @@ -347,14 +367,14 @@ BucketDatabaseTest::testFindAll() document::BucketId(17, 0x1ffff)), document::BucketId(32, 0x027228034))); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split + EXPECT_EQ( // Inconsistent split std::string("0"), doFindAll(toVector( document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff)), document::BucketId(22, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split + EXPECT_EQ( // Inconsistent split std::string("0,2"), doFindAll(toVector( document::BucketId(16, 0x0ffff), @@ -362,7 +382,7 @@ BucketDatabaseTest::testFindAll() document::BucketId(19, 0x1ffff)), document::BucketId(18, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split, ticket 3121525 + EXPECT_EQ( // Inconsistent split, ticket 3121525 std::string("0,2"), doFindAll(toVector( document::BucketId(16, 0x0ffff), @@ -386,36 +406,36 @@ BucketDatabaseTest::doCreate(const std::vector<document::BucketId>& ids, return entry.getBucketId(); } -void -BucketDatabaseTest::testCreateAppropriateBucket() { +// TODO rewrite in terms of bucket getter, not creator +TEST_P(BucketDatabaseTest, testCreateAppropriateBucket) { // Use min split bits when no relevant bucket exist. 
- CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( document::BucketId(36,0x0000004d2), doCreate(toVector(document::BucketId(58, 0x43d6c878000004d2ull)), 36, document::BucketId(58, 0x423bf1e0000004d2ull))); - // New bucket has bits in common with existing bucket. - // Create bucket with min amount of bits while not being overlapping - CPPUNIT_ASSERT_EQUAL( + // New bucket has bits in common with existing bucket. + // Create bucket with min amount of bits while not being overlapping + EXPECT_EQ( document::BucketId(34,0x0000004d2), doCreate(toVector(document::BucketId(58, 0xeaf77782000004d2)), 16, document::BucketId(58, 0x00000000000004d2))); - // Create sibling of existing bucket with most LSB bits in common. - CPPUNIT_ASSERT_EQUAL( + // Create sibling of existing bucket with most LSB bits in common. + EXPECT_EQ( document::BucketId(40, 0x0000004d2), doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2), document::BucketId(58, 0xeaf77782000004d2)), 16, document::BucketId(58, 0x00000000000004d2))); - // Create sibling of existing bucket with most LSB bits in common. - CPPUNIT_ASSERT_EQUAL( + // Create sibling of existing bucket with most LSB bits in common. + EXPECT_EQ( document::BucketId(25, 0x0010004d2), doCreate(toVector(document::BucketId(16, 0x00000000000004d1), document::BucketId(40, 0x00000000000004d2)), 16, document::BucketId(58, 0x00000000010004d2))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( document::BucketId(36, 0x10000004000004d2), doCreate(toVector(document::BucketId(0x8c000000000004d2), document::BucketId(0xeb54b3ac000004d2), @@ -423,42 +443,38 @@ BucketDatabaseTest::testCreateAppropriateBucket() { document::BucketId(0x84000001000004d2)), 16, document::BucketId(58, 0x1944a44000004d2))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( document::BucketId(25, 0x0010004d2), doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2), document::BucketId(40, 0x00000000000004d1)), 16, document::BucketId(58,0x00000000010004d2))); - // Test empty bucket database case. 
(Use min split bits) + // Test empty bucket database case. (Use min split bits) std::vector<document::BucketId> buckets; - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( document::BucketId(16, 0x0000004d2ull), doCreate(buckets, 16, document::BucketId(58, 0x00000000010004d2))); } -void -BucketDatabaseTest::testGetNext() -{ - db().clear(); +TEST_P(BucketDatabaseTest, testGetNext) { db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1))); db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2))); db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3))); - CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 16), - db().getNext(document::BucketId()).getBucketId()); + EXPECT_EQ(document::BucketId(16, 16), + db().getNext(document::BucketId()).getBucketId()); - CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 42), - db().getNext(document::BucketId(16, 16)).getBucketId()); + EXPECT_EQ(document::BucketId(16, 42), + db().getNext(document::BucketId(16, 16)).getBucketId()); - CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 11), - db().getNext(document::BucketId(16, 42)).getBucketId()); + EXPECT_EQ(document::BucketId(16, 11), + db().getNext(document::BucketId(16, 42)).getBucketId()); } void BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f) { - db().clear(); // Tree is rooted at the LSB bit, so the following buckets are in iteration // order based on the reverse of their "normal" bitstring: // 0010:3 @@ -473,30 +489,28 @@ BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f) db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(3))); // 0000:0 (default constructed) has ubound of 0010:3 - CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), BucketId())); + EXPECT_EQ(BucketId(3, 4), f(db(), BucketId())); // 0011:4 has ubound of 1000:3 - CPPUNIT_ASSERT_EQUAL(document::BucketId(3, 1), f(db(), BucketId(4, 12))); + EXPECT_EQ(document::BucketId(3, 1), f(db(), BucketId(4, 12))); // 1000:1 has ubound of 1000:3 - CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), 
BucketId(1, 0))); - CPPUNIT_ASSERT_EQUAL(BucketId(3, 1), f(db(), BucketId(3, 4))); - CPPUNIT_ASSERT_EQUAL(BucketId(4, 9), f(db(), BucketId(3, 1))); - CPPUNIT_ASSERT_EQUAL(BucketId(5, 9), f(db(), BucketId(4, 9))); - CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(5, 9))); + EXPECT_EQ(BucketId(3, 4), f(db(), BucketId(1, 0))); + EXPECT_EQ(BucketId(3, 1), f(db(), BucketId(3, 4))); + EXPECT_EQ(BucketId(4, 9), f(db(), BucketId(3, 1))); + EXPECT_EQ(BucketId(5, 9), f(db(), BucketId(4, 9))); + EXPECT_EQ(BucketId(3, 3), f(db(), BucketId(5, 9))); // 100101:6 does not exist, should also return 1100:3 - CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(6, 41))); + EXPECT_EQ(BucketId(3, 3), f(db(), BucketId(6, 41))); // Test extremes. db().clear(); db().update(BucketDatabase::Entry(document::BucketId(8, 0), BI(2))); db().update(BucketDatabase::Entry(document::BucketId(8, 0xff), BI(2))); - CPPUNIT_ASSERT_EQUAL(BucketId(8, 0), f(db(), BucketId())); - CPPUNIT_ASSERT_EQUAL(BucketId(8, 0xff), f(db(), BucketId(8, 0))); + EXPECT_EQ(BucketId(8, 0), f(db(), BucketId())); + EXPECT_EQ(BucketId(8, 0xff), f(db(), BucketId(8, 0))); } -void -BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket() -{ +TEST_P(BucketDatabaseTest, testUpperBoundReturnsNextInOrderGreaterBucket) { doTestUpperBound([](const BucketDatabase& bucketDb, const document::BucketId& id) { @@ -504,9 +518,7 @@ BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket() }); } -void -BucketDatabaseTest::testGetNextReturnsUpperBoundBucket() -{ +TEST_P(BucketDatabaseTest, testGetNextReturnsUpperBoundBucket) { // getNext() would generally be implemented in terms of upperBound(), but // make sure it conforms to the same contract in case this changes. 
doTestUpperBound([](const BucketDatabase& bucketDb, @@ -516,31 +528,28 @@ BucketDatabaseTest::testGetNextReturnsUpperBoundBucket() }); } -void -BucketDatabaseTest::testChildCount() -{ - db().clear(); +TEST_P(BucketDatabaseTest, testChildCount) { // Empty tree; inserts cannot create inconsistencies. - CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1))); + EXPECT_EQ(0u, db().childCount(BucketId(3, 1))); // Same bucket; cannot be inconsistent with itself. db().update(BucketDatabase::Entry(document::BucketId(3, 1), BI(1))); - CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1))); + EXPECT_EQ(0u, db().childCount(BucketId(3, 1))); // (2, 1) has one subtree. - CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 1))); + EXPECT_EQ(1u, db().childCount(BucketId(2, 1))); // Bucket exists in another subtree from (1, 1); inconsistency would // result if we tried inserting it. db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(2))); - CPPUNIT_ASSERT_EQUAL(2u, db().childCount(BucketId(1, 1))); + EXPECT_EQ(2u, db().childCount(BucketId(1, 1))); // Inner node with 1 subtree. - CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 3))); + EXPECT_EQ(1u, db().childCount(BucketId(2, 3))); // Leaves have no subtrees. 
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1))); - CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 5))); + EXPECT_EQ(0u, db().childCount(BucketId(3, 1))); + EXPECT_EQ(0u, db().childCount(BucketId(3, 5))); } } diff --git a/storage/src/tests/distributor/bucketdatabasetest.h b/storage/src/tests/distributor/bucketdatabasetest.h index b7b32118fee..9ebc23bdb16 100644 --- a/storage/src/tests/distributor/bucketdatabasetest.h +++ b/storage/src/tests/distributor/bucketdatabasetest.h @@ -3,40 +3,13 @@ #include <vespa/storage/bucketdb/bucketdatabase.h> #include <vespa/storage/storageutil/utils.h> -#include <vespa/vdstestlib/cppunit/macros.h> -#include <vespa/vespalib/util/document_runnable.h> -#include <cppunit/extensions/HelperMacros.h> - -#define SETUP_DATABASE_TESTS() \ - CPPUNIT_TEST(testUpdateGetAndRemove); \ - CPPUNIT_TEST(testClear); \ - CPPUNIT_TEST(testIterating); \ - CPPUNIT_TEST(testFindParents); \ - CPPUNIT_TEST(testFindAll); \ - CPPUNIT_TEST(testCreateAppropriateBucket); \ - CPPUNIT_TEST(testGetNext); \ - CPPUNIT_TEST(testGetNextReturnsUpperBoundBucket); \ - CPPUNIT_TEST(testUpperBoundReturnsNextInOrderGreaterBucket); \ - CPPUNIT_TEST(testChildCount); - -namespace storage { -namespace distributor { - -struct BucketDatabaseTest : public CppUnit::TestFixture { - void setUp() override ; - - void testUpdateGetAndRemove(); - void testClear(); - void testIterating(); - void testFindParents(); - void testFindAll(); - void testCreateAppropriateBucket(); - void testGetNext(); - void testGetNextReturnsUpperBoundBucket(); - void testUpperBoundReturnsNextInOrderGreaterBucket(); - void testChildCount(); - - void testBenchmark(); +#include <vespa/vespalib/gtest/gtest.h> +#include <functional> + +namespace storage::distributor { + +struct BucketDatabaseTest : public ::testing::TestWithParam<std::shared_ptr<BucketDatabase>> { + void SetUp() override ; std::string doFindParents(const std::vector<document::BucketId>& ids, const document::BucketId& searchId); @@ 
-46,9 +19,8 @@ struct BucketDatabaseTest : public CppUnit::TestFixture { uint32_t minBits, const document::BucketId& wantedId); - virtual BucketDatabase& db() = 0; + BucketDatabase& db() noexcept { return *GetParam(); } -private: using UBoundFunc = std::function< document::BucketId(const BucketDatabase&, const document::BucketId&)>; @@ -57,6 +29,3 @@ private: }; } - -} - diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp index 51797e90b3e..11c9f18b3ef 100644 --- a/storage/src/tests/distributor/bucketdbupdatertest.cpp +++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp @@ -15,12 +15,11 @@ #include <vespa/storage/distributor/simpleclusterinformation.h> #include <vespa/storage/distributor/distributor.h> #include <vespa/storage/distributor/distributor_bucket_space.h> +#include <vespa/vespalib/gtest/gtest.h> #include <vespa/vespalib/text/stringtokenizer.h> #include <sstream> #include <iomanip> -#include <gtest/gtest.h> - using namespace storage::api; using namespace storage::lib; using document::test::makeDocumentBucket; diff --git a/storage/src/tests/distributor/mapbucketdatabasetest.cpp b/storage/src/tests/distributor/mapbucketdatabasetest.cpp index f2e521f1bf8..0ae4a49530e 100644 --- a/storage/src/tests/distributor/mapbucketdatabasetest.cpp +++ b/storage/src/tests/distributor/mapbucketdatabasetest.cpp @@ -1,21 +1,11 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-#include <vespa/vdstestlib/cppunit/macros.h> + #include <vespa/storage/bucketdb/mapbucketdatabase.h> #include <tests/distributor/bucketdatabasetest.h> -namespace storage { -namespace distributor { - -struct MapBucketDatabaseTest : public BucketDatabaseTest { - MapBucketDatabase _db; - BucketDatabase& db() override { return _db; }; +namespace storage::distributor { - CPPUNIT_TEST_SUITE(MapBucketDatabaseTest); - SETUP_DATABASE_TESTS(); - CPPUNIT_TEST_SUITE_END(); -}; +INSTANTIATE_TEST_CASE_P(MapDatabase, BucketDatabaseTest, + ::testing::Values(std::make_shared<MapBucketDatabase>())); -CPPUNIT_TEST_SUITE_REGISTRATION(MapBucketDatabaseTest); - -} } diff --git a/storage/src/vespa/storage/bucketdb/CMakeLists.txt b/storage/src/vespa/storage/bucketdb/CMakeLists.txt index 8200884de17..a99d16d9f0f 100644 --- a/storage/src/vespa/storage/bucketdb/CMakeLists.txt +++ b/storage/src/vespa/storage/bucketdb/CMakeLists.txt @@ -1,6 +1,7 @@ # Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(storage_bucketdb OBJECT SOURCES + btree_bucket_database.cpp bucketcopy.cpp bucketdatabase.cpp bucketinfo.cpp diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp new file mode 100644 index 00000000000..27c17b207ea --- /dev/null +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp @@ -0,0 +1,405 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "btree_bucket_database.h" +#include <vespa/searchlib/btree/btreebuilder.h> +#include <vespa/searchlib/btree/btreenodeallocator.hpp> +#include <vespa/searchlib/btree/btreenode.hpp> +#include <vespa/searchlib/btree/btreenodestore.hpp> +#include <vespa/searchlib/btree/btreeiterator.hpp> +#include <vespa/searchlib/btree/btreeroot.hpp> +#include <vespa/searchlib/btree/btreebuilder.hpp> +#include <vespa/searchlib/btree/btree.hpp> +#include <vespa/searchlib/btree/btreestore.hpp> +#include <vespa/searchlib/datastore/array_store.hpp> +#include <iostream> + +/* + * Buckets in our tree are represented by their 64-bit numeric key, in what's known as + * "reversed bit order with appended used-bits" form. I.e. a bucket ID (16, 0xcafe), which + * in its canonical representation has 16 (the used-bits) in its 6 MSBs and 0xcafe in its + * LSBs is transformed into 0x7f53000000000010. This key is logically comprised of two parts: + * - the reversed bucket ID itself (0xcafe - 0x7f53) with all trailing zeroes for unset bits + * - the _non-reversed_ used-bits appended as the LSBs + * + * This particular transformation gives us keys with the following invariants: + * - all distinct bucket IDs map to exactly 1 key + * - buckets with the same ID but different used-bits are ordered in such a way that buckets + * with higher used-bits sort after buckets with lower used-bits + * - the key ordering represents an implicit in-order traversal of the binary bucket tree + * - consequently, all parent buckets are ordered before their child buckets + * + * The in-order traversal invariant is fundamental to many of the algorithms that operate + * on the bucket tree. 
+ */ + +namespace storage { + +using Entry = BucketDatabase::Entry; +using search::datastore::EntryRef; +using vespalib::ConstArrayRef; +using document::BucketId; + +BTreeBucketDatabase::BTreeBucketDatabase() + : _tree(), + _store(ReplicaStore::optimizedConfigForHugePage(1023, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, + 4 * 1024, 8 * 1024, 0.2)), + _generation_handler() +{ +} + +BTreeBucketDatabase::~BTreeBucketDatabase() = default; + +namespace { + +Entry entry_from_replica_array_ref(const BucketId& id, uint32_t gc_timestamp, ConstArrayRef<BucketCopy> replicas) { + return Entry(id, BucketInfo(gc_timestamp, std::vector<BucketCopy>(replicas.begin(), replicas.end()))); +} + +EntryRef entry_ref_from_value(uint64_t value) { + return EntryRef(value & 0xffffffffULL); +} + +uint32_t gc_timestamp_from_value(uint64_t value) { + return (value >> 32u); +} + +uint64_t value_from(uint32_t gc_timestamp, EntryRef ref) { + return ((uint64_t(gc_timestamp) << 32u) | ref.ref()); +} + +// TODO dedupe and unify common code +uint8_t +getMinDiffBits(uint16_t minBits, const document::BucketId& a, const document::BucketId& b) { + for (uint32_t i = minBits; i <= std::min(a.getUsedBits(), b.getUsedBits()); i++) { + document::BucketId a1(i, a.getRawId()); + document::BucketId b1(i, b.getRawId()); + if (b1.getId() != a1.getId()) { + return i; + } + } + return minBits; +} + +uint8_t next_parent_bit_seek_level(uint8_t minBits, const document::BucketId& a, const document::BucketId& b) { + const uint8_t min_used = std::min(a.getUsedBits(), b.getUsedBits()); + assert(min_used >= minBits); // Always monotonically descending towards leaves + for (uint32_t i = minBits; i <= min_used; i++) { + document::BucketId a1(i, a.getRawId()); + document::BucketId b1(i, b.getRawId()); + if (b1.getId() != a1.getId()) { + return i; + } + } + // The bit prefix is equal, which means that one node is a parent of the other. In this + // case we have to force the seek to continue from the next level in the tree. 
+ return std::max(min_used, minBits) + 1; +} + +// TODO getMinDiffBits is hoisted from lockablemap.cpp, could probably be rewritten in terms of xor and MSB bit scan instr +/* + * 63 -------- ... -> 0 + * a: 1101111111 ... 0010 + * b: 1101110010 ... 0011 + * a ^ b: 0000001101 ... 0001 + * ^- diff bit = 57 + * + * 63 - vespalib::Optimized::msbIdx(a ^ b) ==> 6 + * + * what if a == b? special case? not a problem if we can prove this never happens. + */ + +} + +void BTreeBucketDatabase::commit_tree_changes() { + // TODO break up and refactor + // TODO verify semantics and usage + // TODO make BTree wrapping API which abstracts away all this stuff via reader/writer interfaces + _tree.getAllocator().freeze(); + + auto current_gen = _generation_handler.getCurrentGeneration(); + _store.transferHoldLists(current_gen); + _tree.getAllocator().transferHoldLists(current_gen); + + _generation_handler.incGeneration(); + + auto used_gen = _generation_handler.getFirstUsedGeneration(); + _store.trimHoldLists(used_gen); + _tree.getAllocator().trimHoldLists(used_gen); +} + +Entry BTreeBucketDatabase::entry_from_iterator(const BTree::ConstIterator& iter) const { + if (!iter.valid()) { + return Entry::createInvalid(); + } + const auto value = iter.getData(); + const auto replicas_ref = _store.get(entry_ref_from_value(value)); + const auto bucket = BucketId(BucketId::keyToBucketId(iter.getKey())); + return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref); +} + +BucketId BTreeBucketDatabase::bucket_from_valid_iterator(const BTree::ConstIterator& iter) const { + return BucketId(BucketId::keyToBucketId(iter.getKey())); +} + +Entry BTreeBucketDatabase::get(const BucketId& bucket) const { + return entry_from_iterator(_tree.find(bucket.toKey())); +} + +void BTreeBucketDatabase::remove(const BucketId& bucket) { + auto iter = _tree.find(bucket.toKey()); + if (!iter.valid()) { + return; + } + const auto value = iter.getData(); + 
_store.remove(entry_ref_from_value(value)); + _tree.remove(iter); + commit_tree_changes(); +} + +/* + * Finding the complete set of parents of a given bucket is not obvious how to + * do efficiently, as we only know that the parents are ordered before their + * children, but we do not a-priori know if any exist at all. The Judy DB impl + * does O(b) explicit point lookups (where b is the number of used bits in the + * bucket), starting at the leaf bit and working towards the root. To avoid + * having to re-create iterators and perform a full tree search every time, we + * turn this on its head and start from the root, progressing towards the leaf. + * This allows us to reuse a single iterator and to continue seeking forwards + * from its current position. + * + * Algorithm: + * + * Core invariant: every subsequent iterator seek performed in this algorithm + * is for a key that is strictly higher than the one the iterator is currently at. + * + * 1. Lbound seek to the lowest key that is known to exclude all already visited + * parents. On the first iteration this is zero, i.e. the first in-order node. + * 2. If the current node's key is greater than that of the requested bucket's key, + * we've either descended to--or beyond--it in its own subtree or we've entered + * a disjoint subtree. Since we know that all parents must sort before any given + * child bucket, no more parents may be found at this point. Algorithm terminates. + * 3. As the main body of the loop is entered, we know one of following must hold: + * 3.1 The current node is an explicitly present parent of our bucket. + * 3.2 The current node is contained in a left subtree branch of a parent that + * does not have a bucket explicitly present in the tree. It cannot be in + * a right subtree of any parent, as that would imply the node is ordered + * _after_ our own bucket in an in-order traversal, which would contradict + * the check in step 2 above. + * 4. 
If the current node contains the requested bucket, we're at a parent
 + * node of the bucket; add it to the result set.
 + * If this is _not_ the case, we're in a different subtree. Example: the
 + * requested bucket has a key whose MSB is 1 but the first bucket in the
 + * tree has a key with an MSB of 0. Either way we need to update our search
 + * key to home in on the target subtree where more parents may be found;
 + * 5. Update the seek key to find the next possible parent. To ensure this key is
 + * strictly greater than the iterator's current key we find the largest shared
 + * prefix of bits in common between the current node's key and the requested
 + * bucket's key. The prefix length + 1 is then the depth in the tree at which the
 + * two subtrees branch off and diverge.
 + * The new key is then the requested bucket's key with its used-bits count set
 + * to the shared prefix length + 1. Forward lbound-seek the iterator to this key.
 + * `--> TODO elaborate on prefix semantics when they are equal wrt. min used bits
 + * 6. Iff iterator is still valid, go to step 2
 + *
 + * This algorithm is able to skip through large parts of the tree in a sparsely populated
 + * tree, but the number of seeks will trend towards O(b) as with the legacy implementation
 + * when a tree is densely populated. This is because all logical inner nodes in the tree will
 + * have subtrees under them. Even in the worst case we should be more efficient than the
 + * legacy implementation since we've cut any dense search space in half for each invocation
 + * of seek() on the iterator.
 + *
 + * TODO use min-aggregation to infer used bit range in the tree. This should allow us to
 + * skip scanning through many disjoint subtrees. 
+ */ +BTreeBucketDatabase::BTree::ConstIterator +BTreeBucketDatabase::find_parents_internal(const document::BucketId& bucket, + std::vector<Entry>& entries) const +{ + const uint64_t bucket_key = bucket.toKey(); + uint64_t parent_key = 0; + auto iter = _tree.begin(); + // Start at the root level, descending towards the bucket itself. + // Try skipping as many levels of the tree as possible as we go. + uint32_t bits = 1; + while (iter.valid() && (iter.getKey() < bucket_key)) { + auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); + if (candidate.contains(bucket)) { + assert(candidate.getUsedBits() >= bits); + entries.emplace_back(entry_from_iterator(iter)); + } + bits = next_parent_bit_seek_level(bits, candidate, bucket); + parent_key = BucketId(bits, bucket.getRawId()).toKey(); + assert(parent_key > iter.getKey()); + iter.seek(parent_key); + } + return iter; // FIXME clang warns here due to copying... +} + +/* + * Note: due to legacy API reasons, iff the requested bucket itself exists in the + * tree, it will be returned in the result set. I.e. it returns all the nodes on + * the path from _and including_ itself towards the root. + */ +void BTreeBucketDatabase::getParents(const BucketId& bucket, + std::vector<Entry>& entries) const +{ + auto iter = find_parents_internal(bucket, entries); + if (iter.valid() && iter.getKey() == bucket.toKey()) { + entries.emplace_back(entry_from_iterator(iter)); + } +} + +void BTreeBucketDatabase::getAll(const BucketId& bucket, + std::vector<Entry>& entries) const +{ + auto iter = find_parents_internal(bucket, entries); + // `iter` is already pointing at, or beyond, one of the bucket's subtrees. 
+ for (; iter.valid(); ++iter) { + auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); + if (bucket.contains(candidate)) { + entries.emplace_back(entry_from_iterator(iter)); + } else { + break; + } + } +} + +void BTreeBucketDatabase::update(const Entry& newEntry) { + assert(newEntry.valid()); + auto replicas_ref = _store.add(newEntry.getBucketInfo().getRawNodes()); + const auto new_value = value_from(newEntry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref); + const auto bucket_key = newEntry.getBucketId().toKey(); + auto iter = _tree.lowerBound(bucket_key); + if (iter.valid() && (iter.getKey() == bucket_key)) { + _store.remove(entry_ref_from_value(iter.getData())); + // In-place update of value; does not require tree structure modification + std::atomic_thread_fence(std::memory_order_release); // Must ensure visibility when new array ref is observed + iter.writeData(new_value); + } else { + _tree.insert(iter, bucket_key, new_value); + } + commit_tree_changes(); // TODO does publishing a new root imply an implicit memory fence? +} + +// TODO need snapshot read with guarding +// FIXME semantics of for-each in judy and bit tree DBs differ, former expects lbound, latter ubound..! +// FIXME but bit-tree code says "lowerBound" in impl and "after" in declaration??? +void BTreeBucketDatabase::forEach(EntryProcessor& proc, const BucketId& after) const { + for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) { + if (!proc.process(entry_from_iterator(iter))) { + break; + } + } +} + +void BTreeBucketDatabase::forEach(MutableEntryProcessor& proc, const BucketId& after) { + for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) { + // FIXME this is a horrible API which currently has to update every time since we don't + // know from the function call itself whether something was in fact updated behind the scenes..! 
+ auto entry = entry_from_iterator(iter); + bool should_continue = proc.process(entry); + // TODO optimize this joyful mess :D + update(entry); + if (!should_continue) { + break; + } + } + //commit_tree_changes(); TODO should be done as bulk op! +} + +Entry BTreeBucketDatabase::upperBound(const BucketId& bucket) const { + return entry_from_iterator(_tree.upperBound(bucket.toKey())); + +} + +uint64_t BTreeBucketDatabase::size() const { + return _tree.size(); +} + +void BTreeBucketDatabase::clear() { + _tree.clear(); + commit_tree_changes(); +} + +/* + * Returns the bucket ID which, based on the buckets already existing in the DB, + * is the most specific location in the tree in which it should reside. This may + * or may not be a bucket that already exists. + * + * Example: if there is a single bucket (1, 1) in the tree, a query for (1, 1) or + * (1, 3) will return (1, 1) as that is the most specific leaf in that subtree. + * A query for (1, 0) will return (1, 0) even though this doesn't currently exist, + * as there is no existing bucket that can contain the queried bucket. It is up to + * the caller to create this bucket according to its needs. + * + * Usually this function will be called with an ID whose used-bits is at max (58), in + * order to find a leaf bucket to route an incoming document operation to. + * + * TODO rename this function, it's very much _not_ obvious what an "appropriate" bucket is..! + * TODO this should be possible to do concurrently + */ +BucketId BTreeBucketDatabase::getAppropriateBucket(uint16_t minBits, const BucketId& bid) { + // The bucket tree is ordered in such a way that it represents a + // natural in-order traversal of all buckets, with inner nodes being + // visited before leaf nodes. This means that a lower bound seek will + // never return a parent of a seeked bucket. 
The iterator will be pointing + // to a bucket that is either the actual bucket given as the argument to + // lowerBound() or the next in-order bucket (or end() if none exists). + auto iter = _tree.lowerBound(bid.toKey()); + if (iter.valid()) { + // Find the first level in the tree where the paths through the bucket tree + // diverge for the target bucket and the current bucket. + minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid); + } + // TODO is it better to copy original iterator and do begin() on the copy? + auto first_iter = _tree.begin(); + // Original iterator might be in a different subtree than that of our + // target bucket. If possible, rewind one node to discover any parent or + // leftmost sibling of our node. If there's no such node, we'll still + // discover the greatest equal bit prefix. + if (iter != first_iter) { + --iter; + minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid); + } + return BucketId(minBits, bid.getRawId()); +} + +/* + * Enumerate the number of child subtrees under `bucket`. The value returned is in the + * range [0, 2] regardless of how many subtrees are present further down in the tree. + * + * Finding this number is reasonably straight forward; we construct two buckets that + * represent the key ranges for the left and right subtrees under `bucket` and check + * if there are any ranges in the tree's keyspace that are contained in these. + */ +// TODO rename/clarify to indicate this is child _subtrees_, not explicit child _buckets_! 
+uint32_t BTreeBucketDatabase::childCount(const BucketId& bucket) const { + assert(bucket.getUsedBits() < BucketId::maxNumBits); + BucketId lhs_bucket(bucket.getUsedBits() + 1, bucket.getId()); + BucketId rhs_bucket(bucket.getUsedBits() + 1, (1ULL << bucket.getUsedBits()) | bucket.getId()); + + auto iter = _tree.lowerBound(lhs_bucket.toKey()); + if (!iter.valid()) { + return 0; + } + if (lhs_bucket.contains(bucket_from_valid_iterator(iter))) { + iter.seek(rhs_bucket.toKey()); + if (!iter.valid()) { + return 1; // lhs subtree only + } + return (rhs_bucket.contains(bucket_from_valid_iterator(iter)) ? 2 : 1); + } else if (rhs_bucket.contains(bucket_from_valid_iterator(iter))) { + return 1; // rhs subtree only + } + return 0; +} + +void BTreeBucketDatabase::print(std::ostream& out, bool verbose, + const std::string& indent) const +{ + out << "BTreeBucketDatabase(" << size() << " buckets)"; + (void)verbose; + (void)indent; +} + +} diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h new file mode 100644 index 00000000000..91d3a1cde5b --- /dev/null +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h @@ -0,0 +1,70 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "bucketdatabase.h" +#include <vespa/searchlib/btree/btree.h> +#include <vespa/searchlib/datastore/array_store.h> + +namespace storage { + +/* + * Bucket database implementation built around lock-free single-writer/multiple-readers B+tree. + * + * Since a distributor must be able to handle multiple replicas for a given bucket, these + * are handled via an ArrayStore indirection. A distributor bucket DB also includes state + * for the _entire bucket_ itself, not just the replicas; last timestamp of bucket GC. Since + * this is an uint32 we cheekily mangle it into the value, i.e. 
each bucket key maps to a + * composite key of (gc_timestamp_u32 << 32) | array_ref_u32. + * + * This is _not_ yet production ready, for several reasons: + * - Underlying ArrayStore does not have freelists enabled for replica entry reuse + * - Current API design for mutable forEach requires O(n) tree structure mutations instead + * of changing the tree in bulk and reusing ArrayStore refs et al. Needs a redesign. + * + * Also note that the DB is currently _not_ thread safe, as read snapshotting is not yet defined + * or exposed. + */ +// TODO create and use a new DB interface with better bulk loading, snapshot and iteration support +class BTreeBucketDatabase : public BucketDatabase { + // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref> + using BTree = search::btree::BTree<uint64_t, uint64_t>; + using ReplicaStore = search::datastore::ArrayStore<BucketCopy>; + using GenerationHandler = vespalib::GenerationHandler; + + BTree _tree; + ReplicaStore _store; + GenerationHandler _generation_handler; +public: + BTreeBucketDatabase(); + ~BTreeBucketDatabase() override; + + // Ye olde bucket DB API: + Entry get(const document::BucketId& bucket) const override; + void remove(const document::BucketId& bucket) override; + void getParents(const document::BucketId& childBucket, + std::vector<Entry>& entries) const override; + void getAll(const document::BucketId& bucket, + std::vector<Entry>& entries) const override; + void update(const Entry& newEntry) override; + void forEach(EntryProcessor&, const document::BucketId& after) const override; + void forEach(MutableEntryProcessor&, const document::BucketId& after) override; + Entry upperBound(const document::BucketId& value) const override; + uint64_t size() const override; + void clear() override; + document::BucketId getAppropriateBucket( + uint16_t minBits, + const document::BucketId& bid) override; + uint32_t childCount(const document::BucketId&) const override; + void print(std::ostream& out, 
bool verbose, + const std::string& indent) const override; + +private: + Entry entry_from_iterator(const BTree::ConstIterator& iter) const; + document::BucketId bucket_from_valid_iterator(const BTree::ConstIterator& iter) const; + void commit_tree_changes(); + BTree::ConstIterator find_parents_internal(const document::BucketId& bucket, + std::vector<Entry>& entries) const; +}; + +} diff --git a/storage/src/vespa/storage/bucketdb/bucketdatabase.h b/storage/src/vespa/storage/bucketdb/bucketdatabase.h index d939ac48a03..a5a70c488ca 100644 --- a/storage/src/vespa/storage/bucketdb/bucketdatabase.h +++ b/storage/src/vespa/storage/bucketdb/bucketdatabase.h @@ -19,8 +19,8 @@ public: public: Entry() : _bucketId(0) {} // Invalid entry - Entry(const document::BucketId& bId, const BucketInfo& bucketInfo) - : _bucketId(bId), _info(bucketInfo) {} + Entry(const document::BucketId& bId, BucketInfo bucketInfo) + : _bucketId(bId), _info(std::move(bucketInfo)) {} explicit Entry(const document::BucketId& bId) : _bucketId(bId) {} bool operator==(const Entry& other) const { @@ -54,7 +54,7 @@ public: virtual void remove(const document::BucketId& bucket) = 0; /** - * Puts all entries that are can contain the given bucket id + * Puts all entries that may contain the given bucket id * into the given entry vector, including itself if found. 
*/ virtual void getParents(const document::BucketId& childBucket, diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.cpp b/storage/src/vespa/storage/bucketdb/bucketinfo.cpp index bbc59de410a..82fc6686c5f 100644 --- a/storage/src/vespa/storage/bucketdb/bucketinfo.cpp +++ b/storage/src/vespa/storage/bucketdb/bucketinfo.cpp @@ -6,10 +6,21 @@ namespace storage { BucketInfo::BucketInfo() - : _lastGarbageCollection(0) -{ } + : _lastGarbageCollection(0), + _nodes() +{} -BucketInfo::~BucketInfo() { } +BucketInfo::BucketInfo(uint32_t lastGarbageCollection, std::vector<BucketCopy> nodes) + : _lastGarbageCollection(lastGarbageCollection), + _nodes(std::move(nodes)) +{} + +BucketInfo::~BucketInfo() = default; + +BucketInfo::BucketInfo(const BucketInfo&) = default; +BucketInfo& BucketInfo::operator=(const BucketInfo&) = default; +BucketInfo::BucketInfo(BucketInfo&&) noexcept = default; +BucketInfo& BucketInfo::operator=(BucketInfo&&) noexcept = default; std::string BucketInfo::toString() const { diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.h b/storage/src/vespa/storage/bucketdb/bucketinfo.h index ace82561b2a..8949c0f10da 100644 --- a/storage/src/vespa/storage/bucketdb/bucketinfo.h +++ b/storage/src/vespa/storage/bucketdb/bucketinfo.h @@ -22,8 +22,14 @@ private: public: BucketInfo(); + BucketInfo(uint32_t lastGarbageCollection, std::vector<BucketCopy> nodes); ~BucketInfo(); + BucketInfo(const BucketInfo&); + BucketInfo& operator=(const BucketInfo&); + BucketInfo(BucketInfo&&) noexcept; + BucketInfo& operator=(BucketInfo&&) noexcept; + /** * @return Returns the last time when this bucket was "garbage collected". 
*/ @@ -136,6 +142,10 @@ public: return _nodes[idx]; } + const std::vector<BucketCopy>& getRawNodes() const noexcept { + return _nodes; + } + void clearTrusted(uint16_t nodeIdx) { getNodeInternal(nodeIdx)->clearTrusted(); } diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.h b/storage/src/vespa/storage/bucketdb/lockablemap.h index 89be92d5921..2b2c2cbe7a8 100644 --- a/storage/src/vespa/storage/bucketdb/lockablemap.h +++ b/storage/src/vespa/storage/bucketdb/lockablemap.h @@ -305,7 +305,7 @@ private: * Find the given list of keys in the map and add them to the map of * results, locking them in the process. */ - void addAndLockResults(const std::vector<BucketId::Type> keys, + void addAndLockResults(const std::vector<BucketId::Type>& keys, const char* clientId, std::map<BucketId, WrappedEntry>& results, std::unique_lock<std::mutex> &guard); diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.hpp b/storage/src/vespa/storage/bucketdb/lockablemap.hpp index 6d700cd6049..440a76b92fd 100644 --- a/storage/src/vespa/storage/bucketdb/lockablemap.hpp +++ b/storage/src/vespa/storage/bucketdb/lockablemap.hpp @@ -549,7 +549,7 @@ LockableMap<Map>::getAllContaining(const BucketId& bucket, template<typename Map> void LockableMap<Map>::addAndLockResults( - const std::vector<BucketId::Type> keys, + const std::vector<BucketId::Type>& keys, const char* clientId, std::map<BucketId, WrappedEntry>& results, std::unique_lock<std::mutex> &guard) diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp index db2842b70f2..d5c16bc7f5b 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp @@ -822,7 +822,7 @@ BucketDBUpdater::NodeRemover::setCopiesInEntry( e->addNodes(copies, order); - LOG(debug, "Changed %s", e->toString().c_str()); + LOG(spam, "Changed %s", e->toString().c_str()); LOG_BUCKET_OPERATION_NO_LOCK( e.getBucketId(), 
vespalib::make_string("updated bucketdb entry to %s", @@ -834,7 +834,7 @@ BucketDBUpdater::NodeRemover::removeEmptyBucket(const document::BucketId& bucket { _removedBuckets.push_back(bucketId); - LOG(debug, + LOG(spam, "After system state change %s, bucket %s now has no copies.", _oldState.getTextualDifference(_state).c_str(), bucketId.toString().c_str()); diff --git a/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp b/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp index dcf7792860e..f013ce43048 100644 --- a/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp +++ b/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp @@ -1,13 +1,24 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "distributor_bucket_space.h" +#include <vespa/storage/bucketdb/mapbucketdatabase.h> +#include <vespa/storage/bucketdb/btree_bucket_database.h> #include <vespa/vdslib/state/clusterstate.h> #include <vespa/vdslib/distribution/distribution.h> namespace storage::distributor { +namespace { + +std::unique_ptr<BucketDatabase> make_default_db_impl() { + //return std::make_unique<BTreeBucketDatabase>(); + return std::make_unique<MapBucketDatabase>(); +} + +} + DistributorBucketSpace::DistributorBucketSpace() - : _bucketDatabase(), + : _bucketDatabase(make_default_db_impl()), _clusterState(), _distribution() { diff --git a/storage/src/vespa/storage/distributor/distributor_bucket_space.h b/storage/src/vespa/storage/distributor/distributor_bucket_space.h index 23d00c6ae43..effb0dc3e17 100644 --- a/storage/src/vespa/storage/distributor/distributor_bucket_space.h +++ b/storage/src/vespa/storage/distributor/distributor_bucket_space.h @@ -1,9 +1,12 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#pragma once -#include <vespa/storage/bucketdb/mapbucketdatabase.h> #include <memory> +namespace storage { +class BucketDatabase; +} + namespace storage::lib { class ClusterState; class Distribution; @@ -23,7 +26,7 @@ namespace storage::distributor { * bucket spaces. */ class DistributorBucketSpace { - MapBucketDatabase _bucketDatabase; + std::unique_ptr<BucketDatabase> _bucketDatabase; std::shared_ptr<const lib::ClusterState> _clusterState; std::shared_ptr<const lib::Distribution> _distribution; public: @@ -36,10 +39,10 @@ public: DistributorBucketSpace& operator=(DistributorBucketSpace&&) = delete; BucketDatabase& getBucketDatabase() noexcept { - return _bucketDatabase; + return *_bucketDatabase; } const BucketDatabase& getBucketDatabase() const noexcept { - return _bucketDatabase; + return *_bucketDatabase; } void setClusterState(std::shared_ptr<const lib::ClusterState> clusterState); diff --git a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp index c295be19a0b..4208e8b33f9 100644 --- a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp +++ b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp @@ -49,9 +49,7 @@ PendingBucketSpaceDbTransition::PendingBucketSpaceDbTransition(const PendingClus } } -PendingBucketSpaceDbTransition::~PendingBucketSpaceDbTransition() -{ -} +PendingBucketSpaceDbTransition::~PendingBucketSpaceDbTransition() = default; PendingBucketSpaceDbTransition::Range PendingBucketSpaceDbTransition::skipAllForSameBucket() |