summaryrefslogtreecommitdiffstats
path: root/storage
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2019-05-13 10:31:30 +0200
committerGitHub <noreply@github.com>2019-05-13 10:31:30 +0200
commit555008f9bbb282a5f620a3d4e63e757c08ed883d (patch)
tree76339729385d16dc854c77acd476850003545b3d /storage
parent866ce6e27e52c8e4ddb34d4cb8e9879c5dfd0a11 (diff)
parentaa9fa2f49f3a61d2978b9748712f3cadd902fd7b (diff)
Merge pull request #9341 from vespa-engine/vekterli/add-distributor-btree-bucket-database-foundations
Add initial B+tree distributor bucket database
Diffstat (limited to 'storage')
-rw-r--r--storage/CMakeLists.txt1
-rw-r--r--storage/src/tests/distributor/CMakeLists.txt5
-rw-r--r--storage/src/tests/distributor/btree_bucket_database_test.cpp12
-rw-r--r--storage/src/tests/distributor/bucketdatabasetest.cpp219
-rw-r--r--storage/src/tests/distributor/bucketdatabasetest.h47
-rw-r--r--storage/src/tests/distributor/bucketdbupdatertest.cpp3
-rw-r--r--storage/src/tests/distributor/mapbucketdatabasetest.cpp18
-rw-r--r--storage/src/vespa/storage/bucketdb/CMakeLists.txt1
-rw-r--r--storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp405
-rw-r--r--storage/src/vespa/storage/bucketdb/btree_bucket_database.h70
-rw-r--r--storage/src/vespa/storage/bucketdb/bucketdatabase.h6
-rw-r--r--storage/src/vespa/storage/bucketdb/bucketinfo.cpp17
-rw-r--r--storage/src/vespa/storage/bucketdb/bucketinfo.h10
-rw-r--r--storage/src/vespa/storage/bucketdb/lockablemap.h2
-rw-r--r--storage/src/vespa/storage/bucketdb/lockablemap.hpp2
-rw-r--r--storage/src/vespa/storage/distributor/bucketdbupdater.cpp4
-rw-r--r--storage/src/vespa/storage/distributor/distributor_bucket_space.cpp13
-rw-r--r--storage/src/vespa/storage/distributor/distributor_bucket_space.h11
-rw-r--r--storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp4
19 files changed, 670 insertions, 180 deletions
diff --git a/storage/CMakeLists.txt b/storage/CMakeLists.txt
index f77c11eb350..6b697716e1d 100644
--- a/storage/CMakeLists.txt
+++ b/storage/CMakeLists.txt
@@ -16,6 +16,7 @@ vespa_define_module(
vdslib
persistence
storageframework
+ searchlib
EXTERNAL_DEPENDS
Judy
diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt
index f7a65000f6c..fdccf9b1394 100644
--- a/storage/src/tests/distributor/CMakeLists.txt
+++ b/storage/src/tests/distributor/CMakeLists.txt
@@ -2,7 +2,6 @@
vespa_add_library(storage_testdistributor TEST
SOURCES
blockingoperationstartertest.cpp
- bucketdatabasetest.cpp
bucketdbmetricupdatertest.cpp
bucketgctimecalculatortest.cpp
bucketstateoperationtest.cpp
@@ -15,7 +14,6 @@ vespa_add_library(storage_testdistributor TEST
idealstatemanagertest.cpp
joinbuckettest.cpp
maintenanceschedulertest.cpp
- mapbucketdatabasetest.cpp
mergelimitertest.cpp
mergeoperationtest.cpp
messagesenderstub.cpp
@@ -48,7 +46,10 @@ vespa_add_library(storage_testdistributor TEST
vespa_add_library(storage_gtestdistributor TEST
SOURCES
+ btree_bucket_database_test.cpp
+ bucketdatabasetest.cpp
bucketdbupdatertest.cpp
+ mapbucketdatabasetest.cpp
# Fixture etc. dupes with non-gtest runner :
distributortestutil.cpp
bucket_db_prune_elision_test.cpp
diff --git a/storage/src/tests/distributor/btree_bucket_database_test.cpp b/storage/src/tests/distributor/btree_bucket_database_test.cpp
new file mode 100644
index 00000000000..c253a758f98
--- /dev/null
+++ b/storage/src/tests/distributor/btree_bucket_database_test.cpp
@@ -0,0 +1,12 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/bucketdb/btree_bucket_database.h>
+#include <tests/distributor/bucketdatabasetest.h>
+
+namespace storage::distributor {
+
+INSTANTIATE_TEST_CASE_P(BTreeDatabase, BucketDatabaseTest,
+ ::testing::Values(std::make_shared<BTreeBucketDatabase>()));
+
+}
+
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp
index 55106ce5c28..92e0c534e31 100644
--- a/storage/src/tests/distributor/bucketdatabasetest.cpp
+++ b/storage/src/tests/distributor/bucketdatabasetest.cpp
@@ -1,6 +1,5 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "bucketdatabasetest.h"
-#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
#include <iomanip>
#include <algorithm>
@@ -8,9 +7,7 @@ namespace storage::distributor {
using document::BucketId;
-void
-BucketDatabaseTest::setUp()
-{
+void BucketDatabaseTest::SetUp() {
db().clear();
}
@@ -26,44 +23,42 @@ namespace {
}
}
-void
-BucketDatabaseTest::testClear() {
+TEST_P(BucketDatabaseTest, testClear) {
db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
db().clear();
- CPPUNIT_ASSERT_EQUAL(uint64_t(0), db().size());
+ EXPECT_EQ(uint64_t(0), db().size());
}
-void
-BucketDatabaseTest::testUpdateGetAndRemove() {
+TEST_P(BucketDatabaseTest, testUpdateGetAndRemove) {
// Do some insertions
- CPPUNIT_ASSERT_EQUAL(0, (int)db().size());
+ EXPECT_EQ(0, db().size());
db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3)));
- CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+ EXPECT_EQ(3, db().size());
db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(4)));
- CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+ EXPECT_EQ(3, db().size());
// Access some elements
- CPPUNIT_ASSERT_EQUAL(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo());
- CPPUNIT_ASSERT_EQUAL(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo());
- CPPUNIT_ASSERT_EQUAL(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo());
+ EXPECT_EQ(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo());
+ EXPECT_EQ(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo());
+ EXPECT_EQ(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo());
// Do removes
db().remove(document::BucketId(16, 12));
- CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+ EXPECT_EQ(3, db().size());
db().remove(document::BucketId(16, 11));
- CPPUNIT_ASSERT_EQUAL(2, (int)db().size());
+ EXPECT_EQ(2, db().size());
db().remove(document::BucketId(16, 16));
db().remove(document::BucketId(16, 42));
- CPPUNIT_ASSERT_EQUAL(0, (int)db().size());
+ EXPECT_EQ(0, db().size());
}
namespace {
@@ -83,8 +78,7 @@ struct ModifyProcessor : public BucketDatabase::MutableEntryProcessor
}
};
-struct ListAllProcessor : public BucketDatabase::EntryProcessor
-{
+struct ListAllProcessor : public BucketDatabase::EntryProcessor {
std::ostringstream ost;
bool process(const BucketDatabase::Entry& e) override {
@@ -93,8 +87,7 @@ struct ListAllProcessor : public BucketDatabase::EntryProcessor
}
};
-struct DummyProcessor : public BucketDatabase::EntryProcessor
-{
+struct DummyProcessor : public BucketDatabase::EntryProcessor {
std::ostringstream ost;
bool process(const BucketDatabase::Entry&) override {
@@ -103,8 +96,7 @@ struct DummyProcessor : public BucketDatabase::EntryProcessor
};
-struct StoppingProcessor : public BucketDatabase::EntryProcessor
-{
+struct StoppingProcessor : public BucketDatabase::EntryProcessor {
std::ostringstream ost;
bool process(const BucketDatabase::Entry& e) override {
@@ -120,8 +112,7 @@ struct StoppingProcessor : public BucketDatabase::EntryProcessor
}
-void
-BucketDatabaseTest::testIterating() {
+TEST_P(BucketDatabaseTest, testIterating) {
// Do some insertions
db().update(BucketDatabase::Entry(document::BucketId(16, 0x10), BI(1)));
db().update(BucketDatabase::Entry(document::BucketId(16, 0x0b), BI(2)));
@@ -131,7 +122,7 @@ BucketDatabaseTest::testIterating() {
ListAllProcessor proc;
db().forEach(proc, document::BucketId());
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(
"BucketId(0x4000000000000010) : "
"node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"
@@ -146,7 +137,7 @@ BucketDatabaseTest::testIterating() {
ListAllProcessor proc;
db().forEach(proc, document::BucketId(16, 0x2a));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(
"BucketId(0x400000000000000b) : "
"node(idx=2,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"),
@@ -157,7 +148,7 @@ BucketDatabaseTest::testIterating() {
StoppingProcessor proc;
db().forEach(proc, document::BucketId());
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(
"BucketId(0x4000000000000010) : "
"node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"
@@ -173,7 +164,7 @@ BucketDatabaseTest::testIterating() {
ListAllProcessor proc;
db().forEach(proc);
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(
"BucketId(0x4000000000000010) : "
"node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"
@@ -213,18 +204,33 @@ BucketDatabaseTest::doFindParents(const std::vector<document::BucketId>& ids,
return ost.str();
}
-void
-BucketDatabaseTest::testFindParents() {
+TEST_P(BucketDatabaseTest, testFindParents) {
// test what parents in the DB (specified in vector) are parents of the
// specified bucket. Result is a list of indexes into the vector.
- CPPUNIT_ASSERT_EQUAL(
+
+ // The way the legacy API works is that a bucket is considered as being in
+ // the set of its parents... This is rather weird, but at least explicitly
+ // test that it is so for now to avoid breaking the world.
+ EXPECT_EQ(
+ std::string("0"),
+ doFindParents(toVector(document::BucketId(17, 0xcafe)),
+ document::BucketId(17, 0xcafe)));
+
+ EXPECT_EQ(
+ std::string("1,2"),
+ doFindParents(toVector(document::BucketId(1, 0x0),
+ document::BucketId(1, 0x1),
+ document::BucketId(2, 0x1)),
+ document::BucketId(16, 0x1)));
+
+ EXPECT_EQ(
std::string("2"),
doFindParents(toVector(document::BucketId(17, 0x0ffff),
document::BucketId(18, 0x1ffff),
document::BucketId(18, 0x3ffff)),
document::BucketId(22, 0xfffff)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("0,2,3"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff),
@@ -232,7 +238,15 @@ BucketDatabaseTest::testFindParents() {
document::BucketId(19, 0xfffff)),
document::BucketId(22, 0xfffff)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
+ std::string("0,1,2,3"),
+ doFindParents(toVector(document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff),
+ document::BucketId(18, 0x0ffff),
+ document::BucketId(19, 0x0ffff)),
+ document::BucketId(20, 0x0ffff)));
+
+ EXPECT_EQ(
std::string("0,2,3"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff),
@@ -240,20 +254,20 @@ BucketDatabaseTest::testFindParents() {
document::BucketId(18, 0x1ffff)),
document::BucketId(22, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("0"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff)),
document::BucketId(22, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL( // ticket 3121525
+ EXPECT_EQ( // ticket 3121525
std::string("0"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff),
document::BucketId(19, 0x1ffff)),
document::BucketId(18, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL( // ticket 3121525
+ EXPECT_EQ( // ticket 3121525
std::string("0"),
doFindParents(toVector(document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff),
@@ -276,8 +290,11 @@ BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& ids,
std::ostringstream ost;
for (uint32_t i = 0; i < ids.size(); ++i) {
- if (std::find(entries.begin(), entries.end(),
- BucketDatabase::Entry(ids[i], BI(i))) != entries.end()) {
+ auto wanted = BucketDatabase::Entry(ids[i], BI(i));
+ for (const auto& e : entries) {
+ if (!(e == wanted)) {
+ continue;
+ }
if (!ost.str().empty()) {
ost << ",";
}
@@ -288,11 +305,9 @@ BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& ids,
return ost.str();
}
-void
-BucketDatabaseTest::testFindAll()
-{
+TEST_P(BucketDatabaseTest, testFindAll) {
std::vector<document::BucketId> buckets;
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string(""),
doFindAll(buckets, document::BucketId(18, 0x1ffff)));
@@ -306,15 +321,20 @@ BucketDatabaseTest::testFindAll()
buckets.push_back(document::BucketId(20, 0xceaaa));
buckets.push_back(document::BucketId(17, 0x1ffff));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
+ std::string("0"),
+ doFindAll(toVector(document::BucketId(16, 1234)),
+ document::BucketId(16, 1234)));
+
+ EXPECT_EQ(
std::string("0,4,5,6"),
doFindAll(buckets, document::BucketId(17, 0x1aaaa)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("8"),
doFindAll(buckets, document::BucketId(16, 0xffff)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("0,1"),
doFindAll(toVector(document::BucketId(17, 0x00001),
document::BucketId(17, 0x10001)),
@@ -322,7 +342,7 @@ BucketDatabaseTest::testFindAll()
document::BucketId id(33, 0x1053c7089); // Bit 32 is set, but unused.
id.setUsedBits(32);
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
std::string("1,2"),
doFindAll(toVector(document::BucketId(24, 0x000dc7089),
document::BucketId(33, 0x0053c7089),
@@ -330,7 +350,7 @@ BucketDatabaseTest::testFindAll()
document::BucketId(24, 0x000bc7089)),
id));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ EXPECT_EQ( // Inconsistent split
std::string("0,1,2"),
doFindAll(toVector(
document::BucketId(16, 0x00001), // contains 2-3
@@ -338,7 +358,7 @@ BucketDatabaseTest::testFindAll()
document::BucketId(17, 0x10001)),
document::BucketId(16, 0x00001)));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ EXPECT_EQ( // Inconsistent split
std::string("1,2"),
doFindAll(toVector(
document::BucketId(17, 0x10000),
@@ -347,14 +367,14 @@ BucketDatabaseTest::testFindAll()
document::BucketId(17, 0x1ffff)),
document::BucketId(32, 0x027228034)));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ EXPECT_EQ( // Inconsistent split
std::string("0"),
doFindAll(toVector(
document::BucketId(16, 0x0ffff),
document::BucketId(17, 0x0ffff)),
document::BucketId(22, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ EXPECT_EQ( // Inconsistent split
std::string("0,2"),
doFindAll(toVector(
document::BucketId(16, 0x0ffff),
@@ -362,7 +382,7 @@ BucketDatabaseTest::testFindAll()
document::BucketId(19, 0x1ffff)),
document::BucketId(18, 0x1ffff)));
- CPPUNIT_ASSERT_EQUAL( // Inconsistent split, ticket 3121525
+ EXPECT_EQ( // Inconsistent split, ticket 3121525
std::string("0,2"),
doFindAll(toVector(
document::BucketId(16, 0x0ffff),
@@ -386,36 +406,36 @@ BucketDatabaseTest::doCreate(const std::vector<document::BucketId>& ids,
return entry.getBucketId();
}
-void
-BucketDatabaseTest::testCreateAppropriateBucket() {
+// TODO rewrite in terms of bucket getter, not creator
+TEST_P(BucketDatabaseTest, testCreateAppropriateBucket) {
// Use min split bits when no relevant bucket exist.
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
document::BucketId(36,0x0000004d2),
doCreate(toVector(document::BucketId(58, 0x43d6c878000004d2ull)), 36,
document::BucketId(58, 0x423bf1e0000004d2ull)));
- // New bucket has bits in common with existing bucket.
- // Create bucket with min amount of bits while not being overlapping
- CPPUNIT_ASSERT_EQUAL(
+ // New bucket has bits in common with existing bucket.
+ // Create bucket with min amount of bits while not being overlapping
+ EXPECT_EQ(
document::BucketId(34,0x0000004d2),
doCreate(toVector(document::BucketId(58, 0xeaf77782000004d2)),
16,
document::BucketId(58, 0x00000000000004d2)));
- // Create sibling of existing bucket with most LSB bits in common.
- CPPUNIT_ASSERT_EQUAL(
+ // Create sibling of existing bucket with most LSB bits in common.
+ EXPECT_EQ(
document::BucketId(40, 0x0000004d2),
doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2),
document::BucketId(58, 0xeaf77782000004d2)),
16,
document::BucketId(58, 0x00000000000004d2)));
- // Create sibling of existing bucket with most LSB bits in common.
- CPPUNIT_ASSERT_EQUAL(
+ // Create sibling of existing bucket with most LSB bits in common.
+ EXPECT_EQ(
document::BucketId(25, 0x0010004d2),
doCreate(toVector(document::BucketId(16, 0x00000000000004d1),
document::BucketId(40, 0x00000000000004d2)),
16,
document::BucketId(58, 0x00000000010004d2)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
document::BucketId(36, 0x10000004000004d2),
doCreate(toVector(document::BucketId(0x8c000000000004d2),
document::BucketId(0xeb54b3ac000004d2),
@@ -423,42 +443,38 @@ BucketDatabaseTest::testCreateAppropriateBucket() {
document::BucketId(0x84000001000004d2)),
16,
document::BucketId(58, 0x1944a44000004d2)));
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
document::BucketId(25, 0x0010004d2),
doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2),
document::BucketId(40, 0x00000000000004d1)),
16,
document::BucketId(58,0x00000000010004d2)));
- // Test empty bucket database case. (Use min split bits)
+ // Test empty bucket database case. (Use min split bits)
std::vector<document::BucketId> buckets;
- CPPUNIT_ASSERT_EQUAL(
+ EXPECT_EQ(
document::BucketId(16, 0x0000004d2ull),
doCreate(buckets, 16,
document::BucketId(58, 0x00000000010004d2)));
}
-void
-BucketDatabaseTest::testGetNext()
-{
- db().clear();
+TEST_P(BucketDatabaseTest, testGetNext) {
db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3)));
- CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 16),
- db().getNext(document::BucketId()).getBucketId());
+ EXPECT_EQ(document::BucketId(16, 16),
+ db().getNext(document::BucketId()).getBucketId());
- CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 42),
- db().getNext(document::BucketId(16, 16)).getBucketId());
+ EXPECT_EQ(document::BucketId(16, 42),
+ db().getNext(document::BucketId(16, 16)).getBucketId());
- CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 11),
- db().getNext(document::BucketId(16, 42)).getBucketId());
+ EXPECT_EQ(document::BucketId(16, 11),
+ db().getNext(document::BucketId(16, 42)).getBucketId());
}
void
BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f)
{
- db().clear();
// Tree is rooted at the LSB bit, so the following buckets are in iteration
// order based on the reverse of their "normal" bitstring:
// 0010:3
@@ -473,30 +489,28 @@ BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f)
db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(3)));
// 0000:0 (default constructed) has ubound of 0010:3
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), BucketId()));
+ EXPECT_EQ(BucketId(3, 4), f(db(), BucketId()));
// 0011:4 has ubound of 1000:3
- CPPUNIT_ASSERT_EQUAL(document::BucketId(3, 1), f(db(), BucketId(4, 12)));
+ EXPECT_EQ(document::BucketId(3, 1), f(db(), BucketId(4, 12)));
// 1000:1 has ubound of 1000:3
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), BucketId(1, 0)));
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 1), f(db(), BucketId(3, 4)));
- CPPUNIT_ASSERT_EQUAL(BucketId(4, 9), f(db(), BucketId(3, 1)));
- CPPUNIT_ASSERT_EQUAL(BucketId(5, 9), f(db(), BucketId(4, 9)));
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(5, 9)));
+ EXPECT_EQ(BucketId(3, 4), f(db(), BucketId(1, 0)));
+ EXPECT_EQ(BucketId(3, 1), f(db(), BucketId(3, 4)));
+ EXPECT_EQ(BucketId(4, 9), f(db(), BucketId(3, 1)));
+ EXPECT_EQ(BucketId(5, 9), f(db(), BucketId(4, 9)));
+ EXPECT_EQ(BucketId(3, 3), f(db(), BucketId(5, 9)));
// 100101:6 does not exist, should also return 1100:3
- CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(6, 41)));
+ EXPECT_EQ(BucketId(3, 3), f(db(), BucketId(6, 41)));
// Test extremes.
db().clear();
db().update(BucketDatabase::Entry(document::BucketId(8, 0), BI(2)));
db().update(BucketDatabase::Entry(document::BucketId(8, 0xff), BI(2)));
- CPPUNIT_ASSERT_EQUAL(BucketId(8, 0), f(db(), BucketId()));
- CPPUNIT_ASSERT_EQUAL(BucketId(8, 0xff), f(db(), BucketId(8, 0)));
+ EXPECT_EQ(BucketId(8, 0), f(db(), BucketId()));
+ EXPECT_EQ(BucketId(8, 0xff), f(db(), BucketId(8, 0)));
}
-void
-BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket()
-{
+TEST_P(BucketDatabaseTest, testUpperBoundReturnsNextInOrderGreaterBucket) {
doTestUpperBound([](const BucketDatabase& bucketDb,
const document::BucketId& id)
{
@@ -504,9 +518,7 @@ BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket()
});
}
-void
-BucketDatabaseTest::testGetNextReturnsUpperBoundBucket()
-{
+TEST_P(BucketDatabaseTest, testGetNextReturnsUpperBoundBucket) {
// getNext() would generally be implemented in terms of upperBound(), but
// make sure it conforms to the same contract in case this changes.
doTestUpperBound([](const BucketDatabase& bucketDb,
@@ -516,31 +528,28 @@ BucketDatabaseTest::testGetNextReturnsUpperBoundBucket()
});
}
-void
-BucketDatabaseTest::testChildCount()
-{
- db().clear();
+TEST_P(BucketDatabaseTest, testChildCount) {
// Empty tree; inserts cannot create inconsistencies.
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
+ EXPECT_EQ(0u, db().childCount(BucketId(3, 1)));
// Same bucket; cannot be inconsistent with itself.
db().update(BucketDatabase::Entry(document::BucketId(3, 1), BI(1)));
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
+ EXPECT_EQ(0u, db().childCount(BucketId(3, 1)));
// (2, 1) has one subtree.
- CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 1)));
+ EXPECT_EQ(1u, db().childCount(BucketId(2, 1)));
// Bucket exists in another subtree from (1, 1); inconsistency would
// result if we tried inserting it.
db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(2)));
- CPPUNIT_ASSERT_EQUAL(2u, db().childCount(BucketId(1, 1)));
+ EXPECT_EQ(2u, db().childCount(BucketId(1, 1)));
// Inner node with 1 subtree.
- CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 3)));
+ EXPECT_EQ(1u, db().childCount(BucketId(2, 3)));
// Leaves have no subtrees.
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 5)));
+ EXPECT_EQ(0u, db().childCount(BucketId(3, 1)));
+ EXPECT_EQ(0u, db().childCount(BucketId(3, 5)));
}
}
diff --git a/storage/src/tests/distributor/bucketdatabasetest.h b/storage/src/tests/distributor/bucketdatabasetest.h
index b7b32118fee..9ebc23bdb16 100644
--- a/storage/src/tests/distributor/bucketdatabasetest.h
+++ b/storage/src/tests/distributor/bucketdatabasetest.h
@@ -3,40 +3,13 @@
#include <vespa/storage/bucketdb/bucketdatabase.h>
#include <vespa/storage/storageutil/utils.h>
-#include <vespa/vdstestlib/cppunit/macros.h>
-#include <vespa/vespalib/util/document_runnable.h>
-#include <cppunit/extensions/HelperMacros.h>
-
-#define SETUP_DATABASE_TESTS() \
- CPPUNIT_TEST(testUpdateGetAndRemove); \
- CPPUNIT_TEST(testClear); \
- CPPUNIT_TEST(testIterating); \
- CPPUNIT_TEST(testFindParents); \
- CPPUNIT_TEST(testFindAll); \
- CPPUNIT_TEST(testCreateAppropriateBucket); \
- CPPUNIT_TEST(testGetNext); \
- CPPUNIT_TEST(testGetNextReturnsUpperBoundBucket); \
- CPPUNIT_TEST(testUpperBoundReturnsNextInOrderGreaterBucket); \
- CPPUNIT_TEST(testChildCount);
-
-namespace storage {
-namespace distributor {
-
-struct BucketDatabaseTest : public CppUnit::TestFixture {
- void setUp() override ;
-
- void testUpdateGetAndRemove();
- void testClear();
- void testIterating();
- void testFindParents();
- void testFindAll();
- void testCreateAppropriateBucket();
- void testGetNext();
- void testGetNextReturnsUpperBoundBucket();
- void testUpperBoundReturnsNextInOrderGreaterBucket();
- void testChildCount();
-
- void testBenchmark();
+#include <vespa/vespalib/gtest/gtest.h>
+#include <functional>
+
+namespace storage::distributor {
+
+struct BucketDatabaseTest : public ::testing::TestWithParam<std::shared_ptr<BucketDatabase>> {
+ void SetUp() override ;
std::string doFindParents(const std::vector<document::BucketId>& ids,
const document::BucketId& searchId);
@@ -46,9 +19,8 @@ struct BucketDatabaseTest : public CppUnit::TestFixture {
uint32_t minBits,
const document::BucketId& wantedId);
- virtual BucketDatabase& db() = 0;
+ BucketDatabase& db() noexcept { return *GetParam(); }
-private:
using UBoundFunc = std::function<
document::BucketId(const BucketDatabase&,
const document::BucketId&)>;
@@ -57,6 +29,3 @@ private:
};
}
-
-}
-
diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp
index 51797e90b3e..11c9f18b3ef 100644
--- a/storage/src/tests/distributor/bucketdbupdatertest.cpp
+++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp
@@ -15,12 +15,11 @@
#include <vespa/storage/distributor/simpleclusterinformation.h>
#include <vespa/storage/distributor/distributor.h>
#include <vespa/storage/distributor/distributor_bucket_space.h>
+#include <vespa/vespalib/gtest/gtest.h>
#include <vespa/vespalib/text/stringtokenizer.h>
#include <sstream>
#include <iomanip>
-#include <gtest/gtest.h>
-
using namespace storage::api;
using namespace storage::lib;
using document::test::makeDocumentBucket;
diff --git a/storage/src/tests/distributor/mapbucketdatabasetest.cpp b/storage/src/tests/distributor/mapbucketdatabasetest.cpp
index f2e521f1bf8..0ae4a49530e 100644
--- a/storage/src/tests/distributor/mapbucketdatabasetest.cpp
+++ b/storage/src/tests/distributor/mapbucketdatabasetest.cpp
@@ -1,21 +1,11 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vdstestlib/cppunit/macros.h>
+
#include <vespa/storage/bucketdb/mapbucketdatabase.h>
#include <tests/distributor/bucketdatabasetest.h>
-namespace storage {
-namespace distributor {
-
-struct MapBucketDatabaseTest : public BucketDatabaseTest {
- MapBucketDatabase _db;
- BucketDatabase& db() override { return _db; };
+namespace storage::distributor {
- CPPUNIT_TEST_SUITE(MapBucketDatabaseTest);
- SETUP_DATABASE_TESTS();
- CPPUNIT_TEST_SUITE_END();
-};
+INSTANTIATE_TEST_CASE_P(MapDatabase, BucketDatabaseTest,
+ ::testing::Values(std::make_shared<MapBucketDatabase>()));
-CPPUNIT_TEST_SUITE_REGISTRATION(MapBucketDatabaseTest);
-
-}
}
diff --git a/storage/src/vespa/storage/bucketdb/CMakeLists.txt b/storage/src/vespa/storage/bucketdb/CMakeLists.txt
index 8200884de17..a99d16d9f0f 100644
--- a/storage/src/vespa/storage/bucketdb/CMakeLists.txt
+++ b/storage/src/vespa/storage/bucketdb/CMakeLists.txt
@@ -1,6 +1,7 @@
# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_library(storage_bucketdb OBJECT
SOURCES
+ btree_bucket_database.cpp
bucketcopy.cpp
bucketdatabase.cpp
bucketinfo.cpp
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
new file mode 100644
index 00000000000..27c17b207ea
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp
@@ -0,0 +1,405 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "btree_bucket_database.h"
+#include <vespa/searchlib/btree/btreebuilder.h>
+#include <vespa/searchlib/btree/btreenodeallocator.hpp>
+#include <vespa/searchlib/btree/btreenode.hpp>
+#include <vespa/searchlib/btree/btreenodestore.hpp>
+#include <vespa/searchlib/btree/btreeiterator.hpp>
+#include <vespa/searchlib/btree/btreeroot.hpp>
+#include <vespa/searchlib/btree/btreebuilder.hpp>
+#include <vespa/searchlib/btree/btree.hpp>
+#include <vespa/searchlib/btree/btreestore.hpp>
+#include <vespa/searchlib/datastore/array_store.hpp>
+#include <iostream>
+
+/*
+ * Buckets in our tree are represented by their 64-bit numeric key, in what's known as
+ * "reversed bit order with appended used-bits" form. I.e. a bucket ID (16, 0xcafe), which
+ * in its canonical representation has 16 (the used-bits) in its 6 MSBs and 0xcafe in its
+ * LSBs is transformed into 0x7f53000000000010. This key is logically comprised of two parts:
+ * - the reversed bucket ID itself (0xcafe - 0x7f53) with all trailing zeroes for unset bits
+ * - the _non-reversed_ used-bits appended as the LSBs
+ *
+ * This particular transformation gives us keys with the following invariants:
+ * - all distinct bucket IDs map to exactly 1 key
+ * - buckets with the same ID but different used-bits are ordered in such a way that buckets
+ * with higher used-bits sort after buckets with lower used-bits
+ * - the key ordering represents an implicit in-order traversal of the binary bucket tree
+ * - consequently, all parent buckets are ordered before their child buckets
+ *
+ * The in-order traversal invariant is fundamental to many of the algorithms that operate
+ * on the bucket tree.
+ */
+
+namespace storage {
+
+using Entry = BucketDatabase::Entry;
+using search::datastore::EntryRef;
+using vespalib::ConstArrayRef;
+using document::BucketId;
+
+BTreeBucketDatabase::BTreeBucketDatabase()
+ : _tree(),
+ _store(ReplicaStore::optimizedConfigForHugePage(1023, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE,
+ 4 * 1024, 8 * 1024, 0.2)),
+ _generation_handler()
+{
+}
+
+BTreeBucketDatabase::~BTreeBucketDatabase() = default;
+
+namespace {
+
+Entry entry_from_replica_array_ref(const BucketId& id, uint32_t gc_timestamp, ConstArrayRef<BucketCopy> replicas) {
+ return Entry(id, BucketInfo(gc_timestamp, std::vector<BucketCopy>(replicas.begin(), replicas.end())));
+}
+
+EntryRef entry_ref_from_value(uint64_t value) {
+ return EntryRef(value & 0xffffffffULL);
+}
+
+uint32_t gc_timestamp_from_value(uint64_t value) {
+ return (value >> 32u);
+}
+
+uint64_t value_from(uint32_t gc_timestamp, EntryRef ref) {
+ return ((uint64_t(gc_timestamp) << 32u) | ref.ref());
+}
+
+// TODO dedupe and unify common code
+uint8_t
+getMinDiffBits(uint16_t minBits, const document::BucketId& a, const document::BucketId& b) {
+ for (uint32_t i = minBits; i <= std::min(a.getUsedBits(), b.getUsedBits()); i++) {
+ document::BucketId a1(i, a.getRawId());
+ document::BucketId b1(i, b.getRawId());
+ if (b1.getId() != a1.getId()) {
+ return i;
+ }
+ }
+ return minBits;
+}
+
+uint8_t next_parent_bit_seek_level(uint8_t minBits, const document::BucketId& a, const document::BucketId& b) {
+ const uint8_t min_used = std::min(a.getUsedBits(), b.getUsedBits());
+ assert(min_used >= minBits); // Always monotonically descending towards leaves
+ for (uint32_t i = minBits; i <= min_used; i++) {
+ document::BucketId a1(i, a.getRawId());
+ document::BucketId b1(i, b.getRawId());
+ if (b1.getId() != a1.getId()) {
+ return i;
+ }
+ }
+ // The bit prefix is equal, which means that one node is a parent of the other. In this
+ // case we have to force the seek to continue from the next level in the tree.
+ return std::max(min_used, minBits) + 1;
+}
+
+// TODO getMinDiffBits is hoisted from lockablemap.cpp, could probably be rewritten in terms of xor and MSB bit scan instr
+/*
+ * 63 -------- ... -> 0
+ * a: 1101111111 ... 0010
+ * b: 1101110010 ... 0011
+ * a ^ b: 0000001101 ... 0001
+ * ^- diff bit = 57
+ *
+ * 63 - vespalib::Optimized::msbIdx(a ^ b) ==> 6
+ *
+ * what if a == b? special case? not a problem if we can prove this never happens.
+ */
+
+}
+
+void BTreeBucketDatabase::commit_tree_changes() {
+ // TODO break up and refactor
+ // TODO verify semantics and usage
+ // TODO make BTree wrapping API which abstracts away all this stuff via reader/writer interfaces
+ _tree.getAllocator().freeze();
+
+ auto current_gen = _generation_handler.getCurrentGeneration();
+ _store.transferHoldLists(current_gen);
+ _tree.getAllocator().transferHoldLists(current_gen);
+
+ _generation_handler.incGeneration();
+
+ auto used_gen = _generation_handler.getFirstUsedGeneration();
+ _store.trimHoldLists(used_gen);
+ _tree.getAllocator().trimHoldLists(used_gen);
+}
+
+Entry BTreeBucketDatabase::entry_from_iterator(const BTree::ConstIterator& iter) const {
+ if (!iter.valid()) {
+ return Entry::createInvalid();
+ }
+ const auto value = iter.getData();
+ const auto replicas_ref = _store.get(entry_ref_from_value(value));
+ const auto bucket = BucketId(BucketId::keyToBucketId(iter.getKey()));
+ return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref);
+}
+
+BucketId BTreeBucketDatabase::bucket_from_valid_iterator(const BTree::ConstIterator& iter) const {
+ return BucketId(BucketId::keyToBucketId(iter.getKey()));
+}
+
+Entry BTreeBucketDatabase::get(const BucketId& bucket) const {
+ return entry_from_iterator(_tree.find(bucket.toKey()));
+}
+
+void BTreeBucketDatabase::remove(const BucketId& bucket) {
+ auto iter = _tree.find(bucket.toKey());
+ if (!iter.valid()) {
+ return;
+ }
+ const auto value = iter.getData();
+ _store.remove(entry_ref_from_value(value));
+ _tree.remove(iter);
+ commit_tree_changes();
+}
+
+/*
+ * It is not obvious how to efficiently find the complete set of parents of a given
+ * bucket, as we only know that the parents are ordered before their
+ * children, but we do not a-priori know if any exist at all. The Judy DB impl
+ * does O(b) explicit point lookups (where b is the number of used bits in the
+ * bucket), starting at the leaf bit and working towards the root. To avoid
+ * having to re-create iterators and perform a full tree search every time, we
+ * turn this on its head and start from the root, progressing towards the leaf.
+ * This allows us to reuse a single iterator and to continue seeking forwards
+ * from its current position.
+ *
+ * Algorithm:
+ *
+ * Core invariant: every subsequent iterator seek performed in this algorithm
+ * is for a key that is strictly higher than the one the iterator is currently at.
+ *
+ * 1. Lbound seek to the lowest key that is known to exclude all already visited
+ * parents. On the first iteration this is zero, i.e. the first in-order node.
+ * 2. If the current node's key is greater than that of the requested bucket's key,
+ * we've either descended to--or beyond--it in its own subtree or we've entered
+ * a disjoint subtree. Since we know that all parents must sort before any given
+ * child bucket, no more parents may be found at this point. Algorithm terminates.
+ * 3. As the main body of the loop is entered, we know one of following must hold:
+ * 3.1 The current node is an explicitly present parent of our bucket.
+ * 3.2 The current node is contained in a left subtree branch of a parent that
+ * does not have a bucket explicitly present in the tree. It cannot be in
+ * a right subtree of any parent, as that would imply the node is ordered
+ * _after_ our own bucket in an in-order traversal, which would contradict
+ * the check in step 2 above.
+ * 4. If the current node contains the requested bucket, we're at a parent
+ * node of the bucket; add it to the result set.
+ * If this is _not_ the case, we're in a different subtree. Example: the
+ * requested bucket has a key whose MSB is 1 but the first bucket in the
+ * tree has a key with an MSB of 0. Either way we need to update our search
+ * key to home in on the target subtree where more parents may be found;
+ * 5. Update the seek key to find the next possible parent. To ensure this key is
+ * strictly greater than the iterator's current key we find the largest shared
+ * prefix of bits in common between the current node's key and the requested
+ * bucket's key. The prefix length + 1 is then the depth in the tree at which the
+ * two subtrees branch off and diverge.
+ * The new key is then the requested bucket's key with its used-bits count set to
+ * the shared prefix length + 1. Forward lbound-seek the iterator to this key.
+ * `--> TODO elaborate on prefix semantics when they are equal wrt. min used bits
+ * 6. Iff iterator is still valid, go to step 2
+ *
+ * This algorithm is able to skip through large parts of a sparsely populated
+ * tree, but the number of seeks will trend towards O(b) as with the legacy implementation
+ * when a tree is densely populated. This is because all logical inner nodes in the tree will
+ * have subtrees under them. Even in the worst case we should be more efficient than the
+ * legacy implementation since we've cut any dense search space in half for each invocation
+ * of seek() on the iterator.
+ *
+ * TODO use min-aggregation to infer used bit range in the tree. This should allow us to
+ * skip scanning through many disjoint subtrees.
+ */
+BTreeBucketDatabase::BTree::ConstIterator
+BTreeBucketDatabase::find_parents_internal(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const
+{
+ const uint64_t bucket_key = bucket.toKey();
+ uint64_t parent_key = 0;
+ auto iter = _tree.begin();
+ // Start at the root level, descending towards the bucket itself.
+ // Try skipping as many levels of the tree as possible as we go.
+ uint32_t bits = 1;
+ while (iter.valid() && (iter.getKey() < bucket_key)) {
+ auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
+ if (candidate.contains(bucket)) {
+ assert(candidate.getUsedBits() >= bits);
+ entries.emplace_back(entry_from_iterator(iter));
+ }
+ bits = next_parent_bit_seek_level(bits, candidate, bucket);
+ parent_key = BucketId(bits, bucket.getRawId()).toKey();
+ assert(parent_key > iter.getKey());
+ iter.seek(parent_key);
+ }
+ return iter; // FIXME clang warns here due to copying...
+}
+
+/*
+ * Note: due to legacy API reasons, iff the requested bucket itself exists in the
+ * tree, it will be returned in the result set. I.e. it returns all the nodes on
+ * the path from _and including_ itself towards the root.
+ */
+void BTreeBucketDatabase::getParents(const BucketId& bucket,
+ std::vector<Entry>& entries) const
+{
+ auto iter = find_parents_internal(bucket, entries);
+ if (iter.valid() && iter.getKey() == bucket.toKey()) {
+ entries.emplace_back(entry_from_iterator(iter));
+ }
+}
+
+void BTreeBucketDatabase::getAll(const BucketId& bucket,
+ std::vector<Entry>& entries) const
+{
+ auto iter = find_parents_internal(bucket, entries);
+ // `iter` is already pointing at, or beyond, one of the bucket's subtrees.
+ for (; iter.valid(); ++iter) {
+ auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey()));
+ if (bucket.contains(candidate)) {
+ entries.emplace_back(entry_from_iterator(iter));
+ } else {
+ break;
+ }
+ }
+}
+
+void BTreeBucketDatabase::update(const Entry& newEntry) {
+ assert(newEntry.valid());
+ auto replicas_ref = _store.add(newEntry.getBucketInfo().getRawNodes());
+ const auto new_value = value_from(newEntry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref);
+ const auto bucket_key = newEntry.getBucketId().toKey();
+ auto iter = _tree.lowerBound(bucket_key);
+ if (iter.valid() && (iter.getKey() == bucket_key)) {
+ _store.remove(entry_ref_from_value(iter.getData()));
+ // In-place update of value; does not require tree structure modification
+ std::atomic_thread_fence(std::memory_order_release); // Must ensure visibility when new array ref is observed
+ iter.writeData(new_value);
+ } else {
+ _tree.insert(iter, bucket_key, new_value);
+ }
+ commit_tree_changes(); // TODO does publishing a new root imply an implicit memory fence?
+}
+
+// TODO need snapshot read with guarding
+// FIXME semantics of for-each in judy and bit tree DBs differ, former expects lbound, latter ubound..!
+// FIXME but bit-tree code says "lowerBound" in impl and "after" in declaration???
+void BTreeBucketDatabase::forEach(EntryProcessor& proc, const BucketId& after) const {
+ for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) {
+ if (!proc.process(entry_from_iterator(iter))) {
+ break;
+ }
+ }
+}
+
+void BTreeBucketDatabase::forEach(MutableEntryProcessor& proc, const BucketId& after) {
+ for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) {
+ // FIXME this is a horrible API which currently has to update every time since we don't
+ // know from the function call itself whether something was in fact updated behind the scenes..!
+ auto entry = entry_from_iterator(iter);
+ bool should_continue = proc.process(entry);
+ // TODO optimize this joyful mess :D
+ update(entry);
+ if (!should_continue) {
+ break;
+ }
+ }
+ //commit_tree_changes(); TODO should be done as bulk op!
+}
+
+Entry BTreeBucketDatabase::upperBound(const BucketId& bucket) const {
+ return entry_from_iterator(_tree.upperBound(bucket.toKey()));
+
+}
+
+uint64_t BTreeBucketDatabase::size() const {
+ return _tree.size();
+}
+
+void BTreeBucketDatabase::clear() {
+ _tree.clear();
+ commit_tree_changes();
+}
+
+/*
+ * Returns the bucket ID which, based on the buckets already existing in the DB,
+ * is the most specific location in the tree in which it should reside. This may
+ * or may not be a bucket that already exists.
+ *
+ * Example: if there is a single bucket (1, 1) in the tree, a query for (1, 1) or
+ * (1, 3) will return (1, 1) as that is the most specific leaf in that subtree.
+ * A query for (1, 0) will return (1, 0) even though this doesn't currently exist,
+ * as there is no existing bucket that can contain the queried bucket. It is up to
+ * the caller to create this bucket according to its needs.
+ *
+ * Usually this function will be called with an ID whose used-bits is at max (58), in
+ * order to find a leaf bucket to route an incoming document operation to.
+ *
+ * TODO rename this function, it's very much _not_ obvious what an "appropriate" bucket is..!
+ * TODO this should be possible to do concurrently
+ */
+BucketId BTreeBucketDatabase::getAppropriateBucket(uint16_t minBits, const BucketId& bid) {
+ // The bucket tree is ordered in such a way that it represents a
+ // natural in-order traversal of all buckets, with inner nodes being
+ // visited before leaf nodes. This means that a lower bound seek will
+ // never return a parent of a seeked bucket. The iterator will be pointing
+ // to a bucket that is either the actual bucket given as the argument to
+ // lowerBound() or the next in-order bucket (or end() if none exists).
+ auto iter = _tree.lowerBound(bid.toKey());
+ if (iter.valid()) {
+ // Find the first level in the tree where the paths through the bucket tree
+ // diverge for the target bucket and the current bucket.
+ minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid);
+ }
+ // TODO is it better to copy original iterator and do begin() on the copy?
+ auto first_iter = _tree.begin();
+ // Original iterator might be in a different subtree than that of our
+ // target bucket. If possible, rewind one node to discover any parent or
+ // leftmost sibling of our node. If there's no such node, we'll still
+ // discover the greatest equal bit prefix.
+ if (iter != first_iter) {
+ --iter;
+ minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid);
+ }
+ return BucketId(minBits, bid.getRawId());
+}
+
+/*
+ * Enumerate the number of child subtrees under `bucket`. The value returned is in the
+ * range [0, 2] regardless of how many subtrees are present further down in the tree.
+ *
+ * Finding this number is reasonably straightforward; we construct two buckets that
+ * represent the key ranges for the left and right subtrees under `bucket` and check
+ * if there are any ranges in the tree's keyspace that are contained in these.
+ */
+// TODO rename/clarify to indicate this is child _subtrees_, not explicit child _buckets_!
+uint32_t BTreeBucketDatabase::childCount(const BucketId& bucket) const {
+ assert(bucket.getUsedBits() < BucketId::maxNumBits);
+ BucketId lhs_bucket(bucket.getUsedBits() + 1, bucket.getId());
+ BucketId rhs_bucket(bucket.getUsedBits() + 1, (1ULL << bucket.getUsedBits()) | bucket.getId());
+
+ auto iter = _tree.lowerBound(lhs_bucket.toKey());
+ if (!iter.valid()) {
+ return 0;
+ }
+ if (lhs_bucket.contains(bucket_from_valid_iterator(iter))) {
+ iter.seek(rhs_bucket.toKey());
+ if (!iter.valid()) {
+ return 1; // lhs subtree only
+ }
+ return (rhs_bucket.contains(bucket_from_valid_iterator(iter)) ? 2 : 1);
+ } else if (rhs_bucket.contains(bucket_from_valid_iterator(iter))) {
+ return 1; // rhs subtree only
+ }
+ return 0;
+}
+
+void BTreeBucketDatabase::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ out << "BTreeBucketDatabase(" << size() << " buckets)";
+ (void)verbose;
+ (void)indent;
+}
+
+}
diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
new file mode 100644
index 00000000000..91d3a1cde5b
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h
@@ -0,0 +1,70 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "bucketdatabase.h"
+#include <vespa/searchlib/btree/btree.h>
+#include <vespa/searchlib/datastore/array_store.h>
+
+namespace storage {
+
+/*
+ * Bucket database implementation built around lock-free single-writer/multiple-readers B+tree.
+ *
+ * Since a distributor must be able to handle multiple replicas for a given bucket, these
+ * are handled via an ArrayStore indirection. A distributor bucket DB also includes state
+ * for the _entire bucket_ itself, not just the replicas; last timestamp of bucket GC. Since
+ * this is an uint32 we cheekily mangle it into the value, i.e. each bucket key maps to a
+ * composite key of (gc_timestamp_u32 << 32) | array_ref_u32.
+ *
+ * This is _not_ yet production ready, for several reasons:
+ * - Underlying ArrayStore does not have freelists enabled for replica entry reuse
+ * - Current API design for mutable forEach requires O(n) tree structure mutations instead
+ * of changing the tree in bulk and reusing ArrayStore refs et al. Needs a redesign.
+ *
+ * Also note that the DB is currently _not_ thread safe, as read snapshotting is not yet defined
+ * or exposed.
+ */
+// TODO create and use a new DB interface with better bulk loading, snapshot and iteration support
+class BTreeBucketDatabase : public BucketDatabase {
+ // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref>
+ using BTree = search::btree::BTree<uint64_t, uint64_t>;
+ using ReplicaStore = search::datastore::ArrayStore<BucketCopy>;
+ using GenerationHandler = vespalib::GenerationHandler;
+
+ BTree _tree;
+ ReplicaStore _store;
+ GenerationHandler _generation_handler;
+public:
+ BTreeBucketDatabase();
+ ~BTreeBucketDatabase() override;
+
+ // Ye olde bucket DB API:
+ Entry get(const document::BucketId& bucket) const override;
+ void remove(const document::BucketId& bucket) override;
+ void getParents(const document::BucketId& childBucket,
+ std::vector<Entry>& entries) const override;
+ void getAll(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const override;
+ void update(const Entry& newEntry) override;
+ void forEach(EntryProcessor&, const document::BucketId& after) const override;
+ void forEach(MutableEntryProcessor&, const document::BucketId& after) override;
+ Entry upperBound(const document::BucketId& value) const override;
+ uint64_t size() const override;
+ void clear() override;
+ document::BucketId getAppropriateBucket(
+ uint16_t minBits,
+ const document::BucketId& bid) override;
+ uint32_t childCount(const document::BucketId&) const override;
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const override;
+
+private:
+ Entry entry_from_iterator(const BTree::ConstIterator& iter) const;
+ document::BucketId bucket_from_valid_iterator(const BTree::ConstIterator& iter) const;
+ void commit_tree_changes();
+ BTree::ConstIterator find_parents_internal(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const;
+};
+
+}
diff --git a/storage/src/vespa/storage/bucketdb/bucketdatabase.h b/storage/src/vespa/storage/bucketdb/bucketdatabase.h
index d939ac48a03..a5a70c488ca 100644
--- a/storage/src/vespa/storage/bucketdb/bucketdatabase.h
+++ b/storage/src/vespa/storage/bucketdb/bucketdatabase.h
@@ -19,8 +19,8 @@ public:
public:
Entry() : _bucketId(0) {} // Invalid entry
- Entry(const document::BucketId& bId, const BucketInfo& bucketInfo)
- : _bucketId(bId), _info(bucketInfo) {}
+ Entry(const document::BucketId& bId, BucketInfo bucketInfo)
+ : _bucketId(bId), _info(std::move(bucketInfo)) {}
explicit Entry(const document::BucketId& bId) : _bucketId(bId) {}
bool operator==(const Entry& other) const {
@@ -54,7 +54,7 @@ public:
virtual void remove(const document::BucketId& bucket) = 0;
/**
- * Puts all entries that are can contain the given bucket id
+ * Puts all entries that may contain the given bucket id
* into the given entry vector, including itself if found.
*/
virtual void getParents(const document::BucketId& childBucket,
diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.cpp b/storage/src/vespa/storage/bucketdb/bucketinfo.cpp
index bbc59de410a..82fc6686c5f 100644
--- a/storage/src/vespa/storage/bucketdb/bucketinfo.cpp
+++ b/storage/src/vespa/storage/bucketdb/bucketinfo.cpp
@@ -6,10 +6,21 @@
namespace storage {
BucketInfo::BucketInfo()
- : _lastGarbageCollection(0)
-{ }
+ : _lastGarbageCollection(0),
+ _nodes()
+{}
-BucketInfo::~BucketInfo() { }
+BucketInfo::BucketInfo(uint32_t lastGarbageCollection, std::vector<BucketCopy> nodes)
+ : _lastGarbageCollection(lastGarbageCollection),
+ _nodes(std::move(nodes))
+{}
+
+BucketInfo::~BucketInfo() = default;
+
+BucketInfo::BucketInfo(const BucketInfo&) = default;
+BucketInfo& BucketInfo::operator=(const BucketInfo&) = default;
+BucketInfo::BucketInfo(BucketInfo&&) noexcept = default;
+BucketInfo& BucketInfo::operator=(BucketInfo&&) noexcept = default;
std::string
BucketInfo::toString() const {
diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.h b/storage/src/vespa/storage/bucketdb/bucketinfo.h
index ace82561b2a..8949c0f10da 100644
--- a/storage/src/vespa/storage/bucketdb/bucketinfo.h
+++ b/storage/src/vespa/storage/bucketdb/bucketinfo.h
@@ -22,8 +22,14 @@ private:
public:
BucketInfo();
+ BucketInfo(uint32_t lastGarbageCollection, std::vector<BucketCopy> nodes);
~BucketInfo();
+ BucketInfo(const BucketInfo&);
+ BucketInfo& operator=(const BucketInfo&);
+ BucketInfo(BucketInfo&&) noexcept;
+ BucketInfo& operator=(BucketInfo&&) noexcept;
+
/**
* @return Returns the last time when this bucket was "garbage collected".
*/
@@ -136,6 +142,10 @@ public:
return _nodes[idx];
}
+ const std::vector<BucketCopy>& getRawNodes() const noexcept {
+ return _nodes;
+ }
+
void clearTrusted(uint16_t nodeIdx) {
getNodeInternal(nodeIdx)->clearTrusted();
}
diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.h b/storage/src/vespa/storage/bucketdb/lockablemap.h
index 89be92d5921..2b2c2cbe7a8 100644
--- a/storage/src/vespa/storage/bucketdb/lockablemap.h
+++ b/storage/src/vespa/storage/bucketdb/lockablemap.h
@@ -305,7 +305,7 @@ private:
* Find the given list of keys in the map and add them to the map of
* results, locking them in the process.
*/
- void addAndLockResults(const std::vector<BucketId::Type> keys,
+ void addAndLockResults(const std::vector<BucketId::Type>& keys,
const char* clientId,
std::map<BucketId, WrappedEntry>& results,
std::unique_lock<std::mutex> &guard);
diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.hpp b/storage/src/vespa/storage/bucketdb/lockablemap.hpp
index 6d700cd6049..440a76b92fd 100644
--- a/storage/src/vespa/storage/bucketdb/lockablemap.hpp
+++ b/storage/src/vespa/storage/bucketdb/lockablemap.hpp
@@ -549,7 +549,7 @@ LockableMap<Map>::getAllContaining(const BucketId& bucket,
template<typename Map>
void
LockableMap<Map>::addAndLockResults(
- const std::vector<BucketId::Type> keys,
+ const std::vector<BucketId::Type>& keys,
const char* clientId,
std::map<BucketId, WrappedEntry>& results,
std::unique_lock<std::mutex> &guard)
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
index db2842b70f2..d5c16bc7f5b 100644
--- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
@@ -822,7 +822,7 @@ BucketDBUpdater::NodeRemover::setCopiesInEntry(
e->addNodes(copies, order);
- LOG(debug, "Changed %s", e->toString().c_str());
+ LOG(spam, "Changed %s", e->toString().c_str());
LOG_BUCKET_OPERATION_NO_LOCK(
e.getBucketId(),
vespalib::make_string("updated bucketdb entry to %s",
@@ -834,7 +834,7 @@ BucketDBUpdater::NodeRemover::removeEmptyBucket(const document::BucketId& bucket
{
_removedBuckets.push_back(bucketId);
- LOG(debug,
+ LOG(spam,
"After system state change %s, bucket %s now has no copies.",
_oldState.getTextualDifference(_state).c_str(),
bucketId.toString().c_str());
diff --git a/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp b/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp
index dcf7792860e..f013ce43048 100644
--- a/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp
+++ b/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp
@@ -1,13 +1,24 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "distributor_bucket_space.h"
+#include <vespa/storage/bucketdb/mapbucketdatabase.h>
+#include <vespa/storage/bucketdb/btree_bucket_database.h>
#include <vespa/vdslib/state/clusterstate.h>
#include <vespa/vdslib/distribution/distribution.h>
namespace storage::distributor {
+namespace {
+
+std::unique_ptr<BucketDatabase> make_default_db_impl() {
+ //return std::make_unique<BTreeBucketDatabase>();
+ return std::make_unique<MapBucketDatabase>();
+}
+
+}
+
DistributorBucketSpace::DistributorBucketSpace()
- : _bucketDatabase(),
+ : _bucketDatabase(make_default_db_impl()),
_clusterState(),
_distribution()
{
diff --git a/storage/src/vespa/storage/distributor/distributor_bucket_space.h b/storage/src/vespa/storage/distributor/distributor_bucket_space.h
index 23d00c6ae43..effb0dc3e17 100644
--- a/storage/src/vespa/storage/distributor/distributor_bucket_space.h
+++ b/storage/src/vespa/storage/distributor/distributor_bucket_space.h
@@ -1,9 +1,12 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
-#include <vespa/storage/bucketdb/mapbucketdatabase.h>
#include <memory>
+namespace storage {
+class BucketDatabase;
+}
+
namespace storage::lib {
class ClusterState;
class Distribution;
@@ -23,7 +26,7 @@ namespace storage::distributor {
* bucket spaces.
*/
class DistributorBucketSpace {
- MapBucketDatabase _bucketDatabase;
+ std::unique_ptr<BucketDatabase> _bucketDatabase;
std::shared_ptr<const lib::ClusterState> _clusterState;
std::shared_ptr<const lib::Distribution> _distribution;
public:
@@ -36,10 +39,10 @@ public:
DistributorBucketSpace& operator=(DistributorBucketSpace&&) = delete;
BucketDatabase& getBucketDatabase() noexcept {
- return _bucketDatabase;
+ return *_bucketDatabase;
}
const BucketDatabase& getBucketDatabase() const noexcept {
- return _bucketDatabase;
+ return *_bucketDatabase;
}
void setClusterState(std::shared_ptr<const lib::ClusterState> clusterState);
diff --git a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
index c295be19a0b..4208e8b33f9 100644
--- a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
+++ b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp
@@ -49,9 +49,7 @@ PendingBucketSpaceDbTransition::PendingBucketSpaceDbTransition(const PendingClus
}
}
-PendingBucketSpaceDbTransition::~PendingBucketSpaceDbTransition()
-{
-}
+PendingBucketSpaceDbTransition::~PendingBucketSpaceDbTransition() = default;
PendingBucketSpaceDbTransition::Range
PendingBucketSpaceDbTransition::skipAllForSameBucket()