diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2019-05-13 10:31:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-05-13 10:31:30 +0200 |
commit | 555008f9bbb282a5f620a3d4e63e757c08ed883d (patch) | |
tree | 76339729385d16dc854c77acd476850003545b3d /storage | |
parent | 866ce6e27e52c8e4ddb34d4cb8e9879c5dfd0a11 (diff) | |
parent | aa9fa2f49f3a61d2978b9748712f3cadd902fd7b (diff) |
Merge pull request #9341 from vespa-engine/vekterli/add-distributor-btree-bucket-database-foundations
Add initial B+tree distributor bucket database
Diffstat (limited to 'storage')
19 files changed, 670 insertions, 180 deletions
diff --git a/storage/CMakeLists.txt b/storage/CMakeLists.txt index f77c11eb350..6b697716e1d 100644 --- a/storage/CMakeLists.txt +++ b/storage/CMakeLists.txt @@ -16,6 +16,7 @@ vespa_define_module( vdslib persistence storageframework + searchlib EXTERNAL_DEPENDS Judy diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt index f7a65000f6c..fdccf9b1394 100644 --- a/storage/src/tests/distributor/CMakeLists.txt +++ b/storage/src/tests/distributor/CMakeLists.txt @@ -2,7 +2,6 @@ vespa_add_library(storage_testdistributor TEST SOURCES blockingoperationstartertest.cpp - bucketdatabasetest.cpp bucketdbmetricupdatertest.cpp bucketgctimecalculatortest.cpp bucketstateoperationtest.cpp @@ -15,7 +14,6 @@ vespa_add_library(storage_testdistributor TEST idealstatemanagertest.cpp joinbuckettest.cpp maintenanceschedulertest.cpp - mapbucketdatabasetest.cpp mergelimitertest.cpp mergeoperationtest.cpp messagesenderstub.cpp @@ -48,7 +46,10 @@ vespa_add_library(storage_testdistributor TEST vespa_add_library(storage_gtestdistributor TEST SOURCES + btree_bucket_database_test.cpp + bucketdatabasetest.cpp bucketdbupdatertest.cpp + mapbucketdatabasetest.cpp # Fixture etc. dupes with non-gtest runner : distributortestutil.cpp bucket_db_prune_elision_test.cpp diff --git a/storage/src/tests/distributor/btree_bucket_database_test.cpp b/storage/src/tests/distributor/btree_bucket_database_test.cpp new file mode 100644 index 00000000000..c253a758f98 --- /dev/null +++ b/storage/src/tests/distributor/btree_bucket_database_test.cpp @@ -0,0 +1,12 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/storage/bucketdb/btree_bucket_database.h> +#include <tests/distributor/bucketdatabasetest.h> + +namespace storage::distributor { + +INSTANTIATE_TEST_CASE_P(BTreeDatabase, BucketDatabaseTest, + ::testing::Values(std::make_shared<BTreeBucketDatabase>())); + +} + diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp index 55106ce5c28..92e0c534e31 100644 --- a/storage/src/tests/distributor/bucketdatabasetest.cpp +++ b/storage/src/tests/distributor/bucketdatabasetest.cpp @@ -1,6 +1,5 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "bucketdatabasetest.h" -#include <vespa/storageframework/defaultimplementation/clock/realclock.h> #include <iomanip> #include <algorithm> @@ -8,9 +7,7 @@ namespace storage::distributor { using document::BucketId; -void -BucketDatabaseTest::setUp() -{ +void BucketDatabaseTest::SetUp() { db().clear(); } @@ -26,44 +23,42 @@ namespace { } } -void -BucketDatabaseTest::testClear() { +TEST_P(BucketDatabaseTest, testClear) { db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1))); db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2))); db().clear(); - CPPUNIT_ASSERT_EQUAL(uint64_t(0), db().size()); + EXPECT_EQ(uint64_t(0), db().size()); } -void -BucketDatabaseTest::testUpdateGetAndRemove() { +TEST_P(BucketDatabaseTest, testUpdateGetAndRemove) { // Do some insertions - CPPUNIT_ASSERT_EQUAL(0, (int)db().size()); + EXPECT_EQ(0, db().size()); db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1))); db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2))); db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3))); - CPPUNIT_ASSERT_EQUAL(3, (int)db().size()); + EXPECT_EQ(3, db().size()); db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(4))); - CPPUNIT_ASSERT_EQUAL(3, (int)db().size()); + EXPECT_EQ(3, 
db().size()); // Access some elements - CPPUNIT_ASSERT_EQUAL(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo()); - CPPUNIT_ASSERT_EQUAL(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo()); - CPPUNIT_ASSERT_EQUAL(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo()); + EXPECT_EQ(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo()); + EXPECT_EQ(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo()); + EXPECT_EQ(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo()); // Do removes db().remove(document::BucketId(16, 12)); - CPPUNIT_ASSERT_EQUAL(3, (int)db().size()); + EXPECT_EQ(3, db().size()); db().remove(document::BucketId(16, 11)); - CPPUNIT_ASSERT_EQUAL(2, (int)db().size()); + EXPECT_EQ(2, db().size()); db().remove(document::BucketId(16, 16)); db().remove(document::BucketId(16, 42)); - CPPUNIT_ASSERT_EQUAL(0, (int)db().size()); + EXPECT_EQ(0, db().size()); } namespace { @@ -83,8 +78,7 @@ struct ModifyProcessor : public BucketDatabase::MutableEntryProcessor } }; -struct ListAllProcessor : public BucketDatabase::EntryProcessor -{ +struct ListAllProcessor : public BucketDatabase::EntryProcessor { std::ostringstream ost; bool process(const BucketDatabase::Entry& e) override { @@ -93,8 +87,7 @@ struct ListAllProcessor : public BucketDatabase::EntryProcessor } }; -struct DummyProcessor : public BucketDatabase::EntryProcessor -{ +struct DummyProcessor : public BucketDatabase::EntryProcessor { std::ostringstream ost; bool process(const BucketDatabase::Entry&) override { @@ -103,8 +96,7 @@ struct DummyProcessor : public BucketDatabase::EntryProcessor }; -struct StoppingProcessor : public BucketDatabase::EntryProcessor -{ +struct StoppingProcessor : public BucketDatabase::EntryProcessor { std::ostringstream ost; bool process(const BucketDatabase::Entry& e) override { @@ -120,8 +112,7 @@ struct StoppingProcessor : public BucketDatabase::EntryProcessor } -void -BucketDatabaseTest::testIterating() { +TEST_P(BucketDatabaseTest, 
testIterating) { // Do some insertions db().update(BucketDatabase::Entry(document::BucketId(16, 0x10), BI(1))); db().update(BucketDatabase::Entry(document::BucketId(16, 0x0b), BI(2))); @@ -131,7 +122,7 @@ BucketDatabaseTest::testIterating() { ListAllProcessor proc; db().forEach(proc, document::BucketId()); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string( "BucketId(0x4000000000000010) : " "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n" @@ -146,7 +137,7 @@ BucketDatabaseTest::testIterating() { ListAllProcessor proc; db().forEach(proc, document::BucketId(16, 0x2a)); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string( "BucketId(0x400000000000000b) : " "node(idx=2,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n"), @@ -157,7 +148,7 @@ BucketDatabaseTest::testIterating() { StoppingProcessor proc; db().forEach(proc, document::BucketId()); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string( "BucketId(0x4000000000000010) : " "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n" @@ -173,7 +164,7 @@ BucketDatabaseTest::testIterating() { ListAllProcessor proc; db().forEach(proc); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string( "BucketId(0x4000000000000010) : " "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false,ready=false)\n" @@ -213,18 +204,33 @@ BucketDatabaseTest::doFindParents(const std::vector<document::BucketId>& ids, return ost.str(); } -void -BucketDatabaseTest::testFindParents() { +TEST_P(BucketDatabaseTest, testFindParents) { // test what parents in the DB (specified in vector) are parents of the // specified bucket. Result is a list of indexes into the vector. - CPPUNIT_ASSERT_EQUAL( + + // The way the legacy API works is that a bucket is considered as being in + // the set of its parents... This is rather weird, but at least explicitly + // test that it is so for now to avoid breaking the world. 
+ EXPECT_EQ( + std::string("0"), + doFindParents(toVector(document::BucketId(17, 0xcafe)), + document::BucketId(17, 0xcafe))); + + EXPECT_EQ( + std::string("1,2"), + doFindParents(toVector(document::BucketId(1, 0x0), + document::BucketId(1, 0x1), + document::BucketId(2, 0x1)), + document::BucketId(16, 0x1))); + + EXPECT_EQ( std::string("2"), doFindParents(toVector(document::BucketId(17, 0x0ffff), document::BucketId(18, 0x1ffff), document::BucketId(18, 0x3ffff)), document::BucketId(22, 0xfffff))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("0,2,3"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff), @@ -232,7 +238,15 @@ BucketDatabaseTest::testFindParents() { document::BucketId(19, 0xfffff)), document::BucketId(22, 0xfffff))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( + std::string("0,1,2,3"), + doFindParents(toVector(document::BucketId(16, 0x0ffff), + document::BucketId(17, 0x0ffff), + document::BucketId(18, 0x0ffff), + document::BucketId(19, 0x0ffff)), + document::BucketId(20, 0x0ffff))); + + EXPECT_EQ( std::string("0,2,3"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff), @@ -240,20 +254,20 @@ BucketDatabaseTest::testFindParents() { document::BucketId(18, 0x1ffff)), document::BucketId(22, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("0"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff)), document::BucketId(22, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( // ticket 3121525 + EXPECT_EQ( // ticket 3121525 std::string("0"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff), document::BucketId(19, 0x1ffff)), document::BucketId(18, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( // ticket 3121525 + EXPECT_EQ( // ticket 3121525 std::string("0"), doFindParents(toVector(document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff), @@ -276,8 +290,11 @@ BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& 
ids, std::ostringstream ost; for (uint32_t i = 0; i < ids.size(); ++i) { - if (std::find(entries.begin(), entries.end(), - BucketDatabase::Entry(ids[i], BI(i))) != entries.end()) { + auto wanted = BucketDatabase::Entry(ids[i], BI(i)); + for (const auto& e : entries) { + if (!(e == wanted)) { + continue; + } if (!ost.str().empty()) { ost << ","; } @@ -288,11 +305,9 @@ BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& ids, return ost.str(); } -void -BucketDatabaseTest::testFindAll() -{ +TEST_P(BucketDatabaseTest, testFindAll) { std::vector<document::BucketId> buckets; - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string(""), doFindAll(buckets, document::BucketId(18, 0x1ffff))); @@ -306,15 +321,20 @@ BucketDatabaseTest::testFindAll() buckets.push_back(document::BucketId(20, 0xceaaa)); buckets.push_back(document::BucketId(17, 0x1ffff)); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( + std::string("0"), + doFindAll(toVector(document::BucketId(16, 1234)), + document::BucketId(16, 1234))); + + EXPECT_EQ( std::string("0,4,5,6"), doFindAll(buckets, document::BucketId(17, 0x1aaaa))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("8"), doFindAll(buckets, document::BucketId(16, 0xffff))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("0,1"), doFindAll(toVector(document::BucketId(17, 0x00001), document::BucketId(17, 0x10001)), @@ -322,7 +342,7 @@ BucketDatabaseTest::testFindAll() document::BucketId id(33, 0x1053c7089); // Bit 32 is set, but unused. 
id.setUsedBits(32); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( std::string("1,2"), doFindAll(toVector(document::BucketId(24, 0x000dc7089), document::BucketId(33, 0x0053c7089), @@ -330,7 +350,7 @@ BucketDatabaseTest::testFindAll() document::BucketId(24, 0x000bc7089)), id)); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split + EXPECT_EQ( // Inconsistent split std::string("0,1,2"), doFindAll(toVector( document::BucketId(16, 0x00001), // contains 2-3 @@ -338,7 +358,7 @@ BucketDatabaseTest::testFindAll() document::BucketId(17, 0x10001)), document::BucketId(16, 0x00001))); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split + EXPECT_EQ( // Inconsistent split std::string("1,2"), doFindAll(toVector( document::BucketId(17, 0x10000), @@ -347,14 +367,14 @@ BucketDatabaseTest::testFindAll() document::BucketId(17, 0x1ffff)), document::BucketId(32, 0x027228034))); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split + EXPECT_EQ( // Inconsistent split std::string("0"), doFindAll(toVector( document::BucketId(16, 0x0ffff), document::BucketId(17, 0x0ffff)), document::BucketId(22, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split + EXPECT_EQ( // Inconsistent split std::string("0,2"), doFindAll(toVector( document::BucketId(16, 0x0ffff), @@ -362,7 +382,7 @@ BucketDatabaseTest::testFindAll() document::BucketId(19, 0x1ffff)), document::BucketId(18, 0x1ffff))); - CPPUNIT_ASSERT_EQUAL( // Inconsistent split, ticket 3121525 + EXPECT_EQ( // Inconsistent split, ticket 3121525 std::string("0,2"), doFindAll(toVector( document::BucketId(16, 0x0ffff), @@ -386,36 +406,36 @@ BucketDatabaseTest::doCreate(const std::vector<document::BucketId>& ids, return entry.getBucketId(); } -void -BucketDatabaseTest::testCreateAppropriateBucket() { +// TODO rewrite in terms of bucket getter, not creator +TEST_P(BucketDatabaseTest, testCreateAppropriateBucket) { // Use min split bits when no relevant bucket exist. 
- CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( document::BucketId(36,0x0000004d2), doCreate(toVector(document::BucketId(58, 0x43d6c878000004d2ull)), 36, document::BucketId(58, 0x423bf1e0000004d2ull))); - // New bucket has bits in common with existing bucket. - // Create bucket with min amount of bits while not being overlapping - CPPUNIT_ASSERT_EQUAL( + // New bucket has bits in common with existing bucket. + // Create bucket with min amount of bits while not being overlapping + EXPECT_EQ( document::BucketId(34,0x0000004d2), doCreate(toVector(document::BucketId(58, 0xeaf77782000004d2)), 16, document::BucketId(58, 0x00000000000004d2))); - // Create sibling of existing bucket with most LSB bits in common. - CPPUNIT_ASSERT_EQUAL( + // Create sibling of existing bucket with most LSB bits in common. + EXPECT_EQ( document::BucketId(40, 0x0000004d2), doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2), document::BucketId(58, 0xeaf77782000004d2)), 16, document::BucketId(58, 0x00000000000004d2))); - // Create sibling of existing bucket with most LSB bits in common. - CPPUNIT_ASSERT_EQUAL( + // Create sibling of existing bucket with most LSB bits in common. + EXPECT_EQ( document::BucketId(25, 0x0010004d2), doCreate(toVector(document::BucketId(16, 0x00000000000004d1), document::BucketId(40, 0x00000000000004d2)), 16, document::BucketId(58, 0x00000000010004d2))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( document::BucketId(36, 0x10000004000004d2), doCreate(toVector(document::BucketId(0x8c000000000004d2), document::BucketId(0xeb54b3ac000004d2), @@ -423,42 +443,38 @@ BucketDatabaseTest::testCreateAppropriateBucket() { document::BucketId(0x84000001000004d2)), 16, document::BucketId(58, 0x1944a44000004d2))); - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( document::BucketId(25, 0x0010004d2), doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2), document::BucketId(40, 0x00000000000004d1)), 16, document::BucketId(58,0x00000000010004d2))); - // Test empty bucket database case. 
(Use min split bits) + // Test empty bucket database case. (Use min split bits) std::vector<document::BucketId> buckets; - CPPUNIT_ASSERT_EQUAL( + EXPECT_EQ( document::BucketId(16, 0x0000004d2ull), doCreate(buckets, 16, document::BucketId(58, 0x00000000010004d2))); } -void -BucketDatabaseTest::testGetNext() -{ - db().clear(); +TEST_P(BucketDatabaseTest, testGetNext) { db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1))); db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2))); db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3))); - CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 16), - db().getNext(document::BucketId()).getBucketId()); + EXPECT_EQ(document::BucketId(16, 16), + db().getNext(document::BucketId()).getBucketId()); - CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 42), - db().getNext(document::BucketId(16, 16)).getBucketId()); + EXPECT_EQ(document::BucketId(16, 42), + db().getNext(document::BucketId(16, 16)).getBucketId()); - CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 11), - db().getNext(document::BucketId(16, 42)).getBucketId()); + EXPECT_EQ(document::BucketId(16, 11), + db().getNext(document::BucketId(16, 42)).getBucketId()); } void BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f) { - db().clear(); // Tree is rooted at the LSB bit, so the following buckets are in iteration // order based on the reverse of their "normal" bitstring: // 0010:3 @@ -473,30 +489,28 @@ BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f) db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(3))); // 0000:0 (default constructed) has ubound of 0010:3 - CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), BucketId())); + EXPECT_EQ(BucketId(3, 4), f(db(), BucketId())); // 0011:4 has ubound of 1000:3 - CPPUNIT_ASSERT_EQUAL(document::BucketId(3, 1), f(db(), BucketId(4, 12))); + EXPECT_EQ(document::BucketId(3, 1), f(db(), BucketId(4, 12))); // 1000:1 has ubound of 1000:3 - CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), 
BucketId(1, 0))); - CPPUNIT_ASSERT_EQUAL(BucketId(3, 1), f(db(), BucketId(3, 4))); - CPPUNIT_ASSERT_EQUAL(BucketId(4, 9), f(db(), BucketId(3, 1))); - CPPUNIT_ASSERT_EQUAL(BucketId(5, 9), f(db(), BucketId(4, 9))); - CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(5, 9))); + EXPECT_EQ(BucketId(3, 4), f(db(), BucketId(1, 0))); + EXPECT_EQ(BucketId(3, 1), f(db(), BucketId(3, 4))); + EXPECT_EQ(BucketId(4, 9), f(db(), BucketId(3, 1))); + EXPECT_EQ(BucketId(5, 9), f(db(), BucketId(4, 9))); + EXPECT_EQ(BucketId(3, 3), f(db(), BucketId(5, 9))); // 100101:6 does not exist, should also return 1100:3 - CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(6, 41))); + EXPECT_EQ(BucketId(3, 3), f(db(), BucketId(6, 41))); // Test extremes. db().clear(); db().update(BucketDatabase::Entry(document::BucketId(8, 0), BI(2))); db().update(BucketDatabase::Entry(document::BucketId(8, 0xff), BI(2))); - CPPUNIT_ASSERT_EQUAL(BucketId(8, 0), f(db(), BucketId())); - CPPUNIT_ASSERT_EQUAL(BucketId(8, 0xff), f(db(), BucketId(8, 0))); + EXPECT_EQ(BucketId(8, 0), f(db(), BucketId())); + EXPECT_EQ(BucketId(8, 0xff), f(db(), BucketId(8, 0))); } -void -BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket() -{ +TEST_P(BucketDatabaseTest, testUpperBoundReturnsNextInOrderGreaterBucket) { doTestUpperBound([](const BucketDatabase& bucketDb, const document::BucketId& id) { @@ -504,9 +518,7 @@ BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket() }); } -void -BucketDatabaseTest::testGetNextReturnsUpperBoundBucket() -{ +TEST_P(BucketDatabaseTest, testGetNextReturnsUpperBoundBucket) { // getNext() would generally be implemented in terms of upperBound(), but // make sure it conforms to the same contract in case this changes. 
doTestUpperBound([](const BucketDatabase& bucketDb, @@ -516,31 +528,28 @@ BucketDatabaseTest::testGetNextReturnsUpperBoundBucket() }); } -void -BucketDatabaseTest::testChildCount() -{ - db().clear(); +TEST_P(BucketDatabaseTest, testChildCount) { // Empty tree; inserts cannot create inconsistencies. - CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1))); + EXPECT_EQ(0u, db().childCount(BucketId(3, 1))); // Same bucket; cannot be inconsistent with itself. db().update(BucketDatabase::Entry(document::BucketId(3, 1), BI(1))); - CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1))); + EXPECT_EQ(0u, db().childCount(BucketId(3, 1))); // (2, 1) has one subtree. - CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 1))); + EXPECT_EQ(1u, db().childCount(BucketId(2, 1))); // Bucket exists in another subtree from (1, 1); inconsistency would // result if we tried inserting it. db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(2))); - CPPUNIT_ASSERT_EQUAL(2u, db().childCount(BucketId(1, 1))); + EXPECT_EQ(2u, db().childCount(BucketId(1, 1))); // Inner node with 1 subtree. - CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 3))); + EXPECT_EQ(1u, db().childCount(BucketId(2, 3))); // Leaves have no subtrees. 
- CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1))); - CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 5))); + EXPECT_EQ(0u, db().childCount(BucketId(3, 1))); + EXPECT_EQ(0u, db().childCount(BucketId(3, 5))); } } diff --git a/storage/src/tests/distributor/bucketdatabasetest.h b/storage/src/tests/distributor/bucketdatabasetest.h index b7b32118fee..9ebc23bdb16 100644 --- a/storage/src/tests/distributor/bucketdatabasetest.h +++ b/storage/src/tests/distributor/bucketdatabasetest.h @@ -3,40 +3,13 @@ #include <vespa/storage/bucketdb/bucketdatabase.h> #include <vespa/storage/storageutil/utils.h> -#include <vespa/vdstestlib/cppunit/macros.h> -#include <vespa/vespalib/util/document_runnable.h> -#include <cppunit/extensions/HelperMacros.h> - -#define SETUP_DATABASE_TESTS() \ - CPPUNIT_TEST(testUpdateGetAndRemove); \ - CPPUNIT_TEST(testClear); \ - CPPUNIT_TEST(testIterating); \ - CPPUNIT_TEST(testFindParents); \ - CPPUNIT_TEST(testFindAll); \ - CPPUNIT_TEST(testCreateAppropriateBucket); \ - CPPUNIT_TEST(testGetNext); \ - CPPUNIT_TEST(testGetNextReturnsUpperBoundBucket); \ - CPPUNIT_TEST(testUpperBoundReturnsNextInOrderGreaterBucket); \ - CPPUNIT_TEST(testChildCount); - -namespace storage { -namespace distributor { - -struct BucketDatabaseTest : public CppUnit::TestFixture { - void setUp() override ; - - void testUpdateGetAndRemove(); - void testClear(); - void testIterating(); - void testFindParents(); - void testFindAll(); - void testCreateAppropriateBucket(); - void testGetNext(); - void testGetNextReturnsUpperBoundBucket(); - void testUpperBoundReturnsNextInOrderGreaterBucket(); - void testChildCount(); - - void testBenchmark(); +#include <vespa/vespalib/gtest/gtest.h> +#include <functional> + +namespace storage::distributor { + +struct BucketDatabaseTest : public ::testing::TestWithParam<std::shared_ptr<BucketDatabase>> { + void SetUp() override ; std::string doFindParents(const std::vector<document::BucketId>& ids, const document::BucketId& searchId); @@ 
-46,9 +19,8 @@ struct BucketDatabaseTest : public CppUnit::TestFixture { uint32_t minBits, const document::BucketId& wantedId); - virtual BucketDatabase& db() = 0; + BucketDatabase& db() noexcept { return *GetParam(); } -private: using UBoundFunc = std::function< document::BucketId(const BucketDatabase&, const document::BucketId&)>; @@ -57,6 +29,3 @@ private: }; } - -} - diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp index 51797e90b3e..11c9f18b3ef 100644 --- a/storage/src/tests/distributor/bucketdbupdatertest.cpp +++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp @@ -15,12 +15,11 @@ #include <vespa/storage/distributor/simpleclusterinformation.h> #include <vespa/storage/distributor/distributor.h> #include <vespa/storage/distributor/distributor_bucket_space.h> +#include <vespa/vespalib/gtest/gtest.h> #include <vespa/vespalib/text/stringtokenizer.h> #include <sstream> #include <iomanip> -#include <gtest/gtest.h> - using namespace storage::api; using namespace storage::lib; using document::test::makeDocumentBucket; diff --git a/storage/src/tests/distributor/mapbucketdatabasetest.cpp b/storage/src/tests/distributor/mapbucketdatabasetest.cpp index f2e521f1bf8..0ae4a49530e 100644 --- a/storage/src/tests/distributor/mapbucketdatabasetest.cpp +++ b/storage/src/tests/distributor/mapbucketdatabasetest.cpp @@ -1,21 +1,11 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-#include <vespa/vdstestlib/cppunit/macros.h> + #include <vespa/storage/bucketdb/mapbucketdatabase.h> #include <tests/distributor/bucketdatabasetest.h> -namespace storage { -namespace distributor { - -struct MapBucketDatabaseTest : public BucketDatabaseTest { - MapBucketDatabase _db; - BucketDatabase& db() override { return _db; }; +namespace storage::distributor { - CPPUNIT_TEST_SUITE(MapBucketDatabaseTest); - SETUP_DATABASE_TESTS(); - CPPUNIT_TEST_SUITE_END(); -}; +INSTANTIATE_TEST_CASE_P(MapDatabase, BucketDatabaseTest, + ::testing::Values(std::make_shared<MapBucketDatabase>())); -CPPUNIT_TEST_SUITE_REGISTRATION(MapBucketDatabaseTest); - -} } diff --git a/storage/src/vespa/storage/bucketdb/CMakeLists.txt b/storage/src/vespa/storage/bucketdb/CMakeLists.txt index 8200884de17..a99d16d9f0f 100644 --- a/storage/src/vespa/storage/bucketdb/CMakeLists.txt +++ b/storage/src/vespa/storage/bucketdb/CMakeLists.txt @@ -1,6 +1,7 @@ # Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(storage_bucketdb OBJECT SOURCES + btree_bucket_database.cpp bucketcopy.cpp bucketdatabase.cpp bucketinfo.cpp diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp new file mode 100644 index 00000000000..27c17b207ea --- /dev/null +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.cpp @@ -0,0 +1,405 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "btree_bucket_database.h" +#include <vespa/searchlib/btree/btreebuilder.h> +#include <vespa/searchlib/btree/btreenodeallocator.hpp> +#include <vespa/searchlib/btree/btreenode.hpp> +#include <vespa/searchlib/btree/btreenodestore.hpp> +#include <vespa/searchlib/btree/btreeiterator.hpp> +#include <vespa/searchlib/btree/btreeroot.hpp> +#include <vespa/searchlib/btree/btreebuilder.hpp> +#include <vespa/searchlib/btree/btree.hpp> +#include <vespa/searchlib/btree/btreestore.hpp> +#include <vespa/searchlib/datastore/array_store.hpp> +#include <iostream> + +/* + * Buckets in our tree are represented by their 64-bit numeric key, in what's known as + * "reversed bit order with appended used-bits" form. I.e. a bucket ID (16, 0xcafe), which + * in its canonical representation has 16 (the used-bits) in its 6 MSBs and 0xcafe in its + * LSBs is transformed into 0x7f53000000000010. This key is logically comprised of two parts: + * - the reversed bucket ID itself (0xcafe - 0x7f53) with all trailing zeroes for unset bits + * - the _non-reversed_ used-bits appended as the LSBs + * + * This particular transformation gives us keys with the following invariants: + * - all distinct bucket IDs map to exactly 1 key + * - buckets with the same ID but different used-bits are ordered in such a way that buckets + * with higher used-bits sort after buckets with lower used-bits + * - the key ordering represents an implicit in-order traversal of the binary bucket tree + * - consequently, all parent buckets are ordered before their child buckets + * + * The in-order traversal invariant is fundamental to many of the algorithms that operate + * on the bucket tree. 
+ */ + +namespace storage { + +using Entry = BucketDatabase::Entry; +using search::datastore::EntryRef; +using vespalib::ConstArrayRef; +using document::BucketId; + +BTreeBucketDatabase::BTreeBucketDatabase() + : _tree(), + _store(ReplicaStore::optimizedConfigForHugePage(1023, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, + 4 * 1024, 8 * 1024, 0.2)), + _generation_handler() +{ +} + +BTreeBucketDatabase::~BTreeBucketDatabase() = default; + +namespace { + +Entry entry_from_replica_array_ref(const BucketId& id, uint32_t gc_timestamp, ConstArrayRef<BucketCopy> replicas) { + return Entry(id, BucketInfo(gc_timestamp, std::vector<BucketCopy>(replicas.begin(), replicas.end()))); +} + +EntryRef entry_ref_from_value(uint64_t value) { + return EntryRef(value & 0xffffffffULL); +} + +uint32_t gc_timestamp_from_value(uint64_t value) { + return (value >> 32u); +} + +uint64_t value_from(uint32_t gc_timestamp, EntryRef ref) { + return ((uint64_t(gc_timestamp) << 32u) | ref.ref()); +} + +// TODO dedupe and unify common code +uint8_t +getMinDiffBits(uint16_t minBits, const document::BucketId& a, const document::BucketId& b) { + for (uint32_t i = minBits; i <= std::min(a.getUsedBits(), b.getUsedBits()); i++) { + document::BucketId a1(i, a.getRawId()); + document::BucketId b1(i, b.getRawId()); + if (b1.getId() != a1.getId()) { + return i; + } + } + return minBits; +} + +uint8_t next_parent_bit_seek_level(uint8_t minBits, const document::BucketId& a, const document::BucketId& b) { + const uint8_t min_used = std::min(a.getUsedBits(), b.getUsedBits()); + assert(min_used >= minBits); // Always monotonically descending towards leaves + for (uint32_t i = minBits; i <= min_used; i++) { + document::BucketId a1(i, a.getRawId()); + document::BucketId b1(i, b.getRawId()); + if (b1.getId() != a1.getId()) { + return i; + } + } + // The bit prefix is equal, which means that one node is a parent of the other. In this + // case we have to force the seek to continue from the next level in the tree. 
+ return std::max(min_used, minBits) + 1; +} + +// TODO getMinDiffBits is hoisted from lockablemap.cpp, could probably be rewritten in terms of xor and MSB bit scan instr +/* + * 63 -------- ... -> 0 + * a: 1101111111 ... 0010 + * b: 1101110010 ... 0011 + * a ^ b: 0000001101 ... 0001 + * ^- diff bit = 57 + * + * 63 - vespalib::Optimized::msbIdx(a ^ b) ==> 6 + * + * what if a == b? special case? not a problem if we can prove this never happens. + */ + +} + +void BTreeBucketDatabase::commit_tree_changes() { + // TODO break up and refactor + // TODO verify semantics and usage + // TODO make BTree wrapping API which abstracts away all this stuff via reader/writer interfaces + _tree.getAllocator().freeze(); + + auto current_gen = _generation_handler.getCurrentGeneration(); + _store.transferHoldLists(current_gen); + _tree.getAllocator().transferHoldLists(current_gen); + + _generation_handler.incGeneration(); + + auto used_gen = _generation_handler.getFirstUsedGeneration(); + _store.trimHoldLists(used_gen); + _tree.getAllocator().trimHoldLists(used_gen); +} + +Entry BTreeBucketDatabase::entry_from_iterator(const BTree::ConstIterator& iter) const { + if (!iter.valid()) { + return Entry::createInvalid(); + } + const auto value = iter.getData(); + const auto replicas_ref = _store.get(entry_ref_from_value(value)); + const auto bucket = BucketId(BucketId::keyToBucketId(iter.getKey())); + return entry_from_replica_array_ref(bucket, gc_timestamp_from_value(value), replicas_ref); +} + +BucketId BTreeBucketDatabase::bucket_from_valid_iterator(const BTree::ConstIterator& iter) const { + return BucketId(BucketId::keyToBucketId(iter.getKey())); +} + +Entry BTreeBucketDatabase::get(const BucketId& bucket) const { + return entry_from_iterator(_tree.find(bucket.toKey())); +} + +void BTreeBucketDatabase::remove(const BucketId& bucket) { + auto iter = _tree.find(bucket.toKey()); + if (!iter.valid()) { + return; + } + const auto value = iter.getData(); + 
_store.remove(entry_ref_from_value(value)); + _tree.remove(iter); + commit_tree_changes(); +} + +/* + * Finding the complete set of parents of a given bucket is not obvious how to + * do efficiently, as we only know that the parents are ordered before their + * children, but we do not a-priori know if any exist at all. The Judy DB impl + * does O(b) explicit point lookups (where b is the number of used bits in the + * bucket), starting at the leaf bit and working towards the root. To avoid + * having to re-create iterators and perform a full tree search every time, we + * turn this on its head and start from the root, progressing towards the leaf. + * This allows us to reuse a single iterator and to continue seeking forwards + * from its current position. + * + * Algorithm: + * + * Core invariant: every subsequent iterator seek performed in this algorithm + * is for a key that is strictly higher than the one the iterator is currently at. + * + * 1. Lbound seek to the lowest key that is known to exclude all already visited + * parents. On the first iteration this is zero, i.e. the first in-order node. + * 2. If the current node's key is greater than that of the requested bucket's key, + * we've either descended to--or beyond--it in its own subtree or we've entered + * a disjoint subtree. Since we know that all parents must sort before any given + * child bucket, no more parents may be found at this point. Algorithm terminates. + * 3. As the main body of the loop is entered, we know one of following must hold: + * 3.1 The current node is an explicitly present parent of our bucket. + * 3.2 The current node is contained in a left subtree branch of a parent that + * does not have a bucket explicitly present in the tree. It cannot be in + * a right subtree of any parent, as that would imply the node is ordered + * _after_ our own bucket in an in-order traversal, which would contradict + * the check in step 2 above. + * 4. 
If the current node contains the requested bucket, we're at a parent
 + * node of the bucket; add it to the result set.
 + * If this is _not_ the case, we're in a different subtree. Example: the
 + * requested bucket has a key whose MSB is 1 but the first bucket in the
 + * tree has a key with an MSB of 0. Either way we need to update our search
 + * key to home in on the target subtree where more parents may be found;
 + * 5. Update the seek key to find the next possible parent. To ensure this key is
 + * strictly greater than the iterator's current key we find the largest shared
 + * prefix of bits in common between the current node's key and the requested
 + * bucket's key. The prefix length + 1 is then the depth in the tree at which the
 + * two subtrees branch off and diverge.
 + * The new key is then the requested bucket's key with its used-bits count set
 + * to the shared prefix length + 1. Forward lbound-seek the iterator to this key.
 + * `--> TODO elaborate on prefix semantics when they are equal wrt. min used bits
 + * 6. Iff iterator is still valid, go to step 2
 + *
 + * This algorithm is able to skip through large parts of the tree in a sparsely populated
 + * tree, but the number of seeks will trend towards O(b) as with the legacy implementation
 + * when a tree is densely populated. This is because all logical inner nodes in the tree will
 + * have subtrees under them. Even in the worst case we should be more efficient than the
 + * legacy implementation since we've cut any dense search space in half for each invocation
 + * of seek() on the iterator.
 + *
 + * TODO use min-aggregation to infer used bit range in the tree. This should allow us to
 + * skip scanning through many disjoint subtrees. 
+ */ +BTreeBucketDatabase::BTree::ConstIterator +BTreeBucketDatabase::find_parents_internal(const document::BucketId& bucket, + std::vector<Entry>& entries) const +{ + const uint64_t bucket_key = bucket.toKey(); + uint64_t parent_key = 0; + auto iter = _tree.begin(); + // Start at the root level, descending towards the bucket itself. + // Try skipping as many levels of the tree as possible as we go. + uint32_t bits = 1; + while (iter.valid() && (iter.getKey() < bucket_key)) { + auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); + if (candidate.contains(bucket)) { + assert(candidate.getUsedBits() >= bits); + entries.emplace_back(entry_from_iterator(iter)); + } + bits = next_parent_bit_seek_level(bits, candidate, bucket); + parent_key = BucketId(bits, bucket.getRawId()).toKey(); + assert(parent_key > iter.getKey()); + iter.seek(parent_key); + } + return iter; // FIXME clang warns here due to copying... +} + +/* + * Note: due to legacy API reasons, iff the requested bucket itself exists in the + * tree, it will be returned in the result set. I.e. it returns all the nodes on + * the path from _and including_ itself towards the root. + */ +void BTreeBucketDatabase::getParents(const BucketId& bucket, + std::vector<Entry>& entries) const +{ + auto iter = find_parents_internal(bucket, entries); + if (iter.valid() && iter.getKey() == bucket.toKey()) { + entries.emplace_back(entry_from_iterator(iter)); + } +} + +void BTreeBucketDatabase::getAll(const BucketId& bucket, + std::vector<Entry>& entries) const +{ + auto iter = find_parents_internal(bucket, entries); + // `iter` is already pointing at, or beyond, one of the bucket's subtrees. 
+ for (; iter.valid(); ++iter) { + auto candidate = BucketId(BucketId::keyToBucketId(iter.getKey())); + if (bucket.contains(candidate)) { + entries.emplace_back(entry_from_iterator(iter)); + } else { + break; + } + } +} + +void BTreeBucketDatabase::update(const Entry& newEntry) { + assert(newEntry.valid()); + auto replicas_ref = _store.add(newEntry.getBucketInfo().getRawNodes()); + const auto new_value = value_from(newEntry.getBucketInfo().getLastGarbageCollectionTime(), replicas_ref); + const auto bucket_key = newEntry.getBucketId().toKey(); + auto iter = _tree.lowerBound(bucket_key); + if (iter.valid() && (iter.getKey() == bucket_key)) { + _store.remove(entry_ref_from_value(iter.getData())); + // In-place update of value; does not require tree structure modification + std::atomic_thread_fence(std::memory_order_release); // Must ensure visibility when new array ref is observed + iter.writeData(new_value); + } else { + _tree.insert(iter, bucket_key, new_value); + } + commit_tree_changes(); // TODO does publishing a new root imply an implicit memory fence? +} + +// TODO need snapshot read with guarding +// FIXME semantics of for-each in judy and bit tree DBs differ, former expects lbound, latter ubound..! +// FIXME but bit-tree code says "lowerBound" in impl and "after" in declaration??? +void BTreeBucketDatabase::forEach(EntryProcessor& proc, const BucketId& after) const { + for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) { + if (!proc.process(entry_from_iterator(iter))) { + break; + } + } +} + +void BTreeBucketDatabase::forEach(MutableEntryProcessor& proc, const BucketId& after) { + for (auto iter = _tree.upperBound(after.toKey()); iter.valid(); ++iter) { + // FIXME this is a horrible API which currently has to update every time since we don't + // know from the function call itself whether something was in fact updated behind the scenes..! 
+ auto entry = entry_from_iterator(iter); + bool should_continue = proc.process(entry); + // TODO optimize this joyful mess :D + update(entry); + if (!should_continue) { + break; + } + } + //commit_tree_changes(); TODO should be done as bulk op! +} + +Entry BTreeBucketDatabase::upperBound(const BucketId& bucket) const { + return entry_from_iterator(_tree.upperBound(bucket.toKey())); + +} + +uint64_t BTreeBucketDatabase::size() const { + return _tree.size(); +} + +void BTreeBucketDatabase::clear() { + _tree.clear(); + commit_tree_changes(); +} + +/* + * Returns the bucket ID which, based on the buckets already existing in the DB, + * is the most specific location in the tree in which it should reside. This may + * or may not be a bucket that already exists. + * + * Example: if there is a single bucket (1, 1) in the tree, a query for (1, 1) or + * (1, 3) will return (1, 1) as that is the most specific leaf in that subtree. + * A query for (1, 0) will return (1, 0) even though this doesn't currently exist, + * as there is no existing bucket that can contain the queried bucket. It is up to + * the caller to create this bucket according to its needs. + * + * Usually this function will be called with an ID whose used-bits is at max (58), in + * order to find a leaf bucket to route an incoming document operation to. + * + * TODO rename this function, it's very much _not_ obvious what an "appropriate" bucket is..! + * TODO this should be possible to do concurrently + */ +BucketId BTreeBucketDatabase::getAppropriateBucket(uint16_t minBits, const BucketId& bid) { + // The bucket tree is ordered in such a way that it represents a + // natural in-order traversal of all buckets, with inner nodes being + // visited before leaf nodes. This means that a lower bound seek will + // never return a parent of a seeked bucket. 
The iterator will be pointing + // to a bucket that is either the actual bucket given as the argument to + // lowerBound() or the next in-order bucket (or end() if none exists). + auto iter = _tree.lowerBound(bid.toKey()); + if (iter.valid()) { + // Find the first level in the tree where the paths through the bucket tree + // diverge for the target bucket and the current bucket. + minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid); + } + // TODO is it better to copy original iterator and do begin() on the copy? + auto first_iter = _tree.begin(); + // Original iterator might be in a different subtree than that of our + // target bucket. If possible, rewind one node to discover any parent or + // leftmost sibling of our node. If there's no such node, we'll still + // discover the greatest equal bit prefix. + if (iter != first_iter) { + --iter; + minBits = getMinDiffBits(minBits, bucket_from_valid_iterator(iter), bid); + } + return BucketId(minBits, bid.getRawId()); +} + +/* + * Enumerate the number of child subtrees under `bucket`. The value returned is in the + * range [0, 2] regardless of how many subtrees are present further down in the tree. + * + * Finding this number is reasonably straight forward; we construct two buckets that + * represent the key ranges for the left and right subtrees under `bucket` and check + * if there are any ranges in the tree's keyspace that are contained in these. + */ +// TODO rename/clarify to indicate this is child _subtrees_, not explicit child _buckets_! 
+uint32_t BTreeBucketDatabase::childCount(const BucketId& bucket) const { + assert(bucket.getUsedBits() < BucketId::maxNumBits); + BucketId lhs_bucket(bucket.getUsedBits() + 1, bucket.getId()); + BucketId rhs_bucket(bucket.getUsedBits() + 1, (1ULL << bucket.getUsedBits()) | bucket.getId()); + + auto iter = _tree.lowerBound(lhs_bucket.toKey()); + if (!iter.valid()) { + return 0; + } + if (lhs_bucket.contains(bucket_from_valid_iterator(iter))) { + iter.seek(rhs_bucket.toKey()); + if (!iter.valid()) { + return 1; // lhs subtree only + } + return (rhs_bucket.contains(bucket_from_valid_iterator(iter)) ? 2 : 1); + } else if (rhs_bucket.contains(bucket_from_valid_iterator(iter))) { + return 1; // rhs subtree only + } + return 0; +} + +void BTreeBucketDatabase::print(std::ostream& out, bool verbose, + const std::string& indent) const +{ + out << "BTreeBucketDatabase(" << size() << " buckets)"; + (void)verbose; + (void)indent; +} + +} diff --git a/storage/src/vespa/storage/bucketdb/btree_bucket_database.h b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h new file mode 100644 index 00000000000..91d3a1cde5b --- /dev/null +++ b/storage/src/vespa/storage/bucketdb/btree_bucket_database.h @@ -0,0 +1,70 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "bucketdatabase.h" +#include <vespa/searchlib/btree/btree.h> +#include <vespa/searchlib/datastore/array_store.h> + +namespace storage { + +/* + * Bucket database implementation built around lock-free single-writer/multiple-readers B+tree. + * + * Since a distributor must be able to handle multiple replicas for a given bucket, these + * are handled via an ArrayStore indirection. A distributor bucket DB also includes state + * for the _entire bucket_ itself, not just the replicas; last timestamp of bucket GC. Since + * this is an uint32 we cheekily mangle it into the value, i.e. 
each bucket key maps to a + * composite key of (gc_timestamp_u32 << 32) | array_ref_u32. + * + * This is _not_ yet production ready, for several reasons: + * - Underlying ArrayStore does not have freelists enabled for replica entry reuse + * - Current API design for mutable forEach requires O(n) tree structure mutations instead + * of changing the tree in bulk and reusing ArrayStore refs et al. Needs a redesign. + * + * Also note that the DB is currently _not_ thread safe, as read snapshotting is not yet defined + * or exposed. + */ +// TODO create and use a new DB interface with better bulk loading, snapshot and iteration support +class BTreeBucketDatabase : public BucketDatabase { + // Mapping from u64: bucket key -> <MSB u32: gc timestamp, LSB u32: ArrayStore ref> + using BTree = search::btree::BTree<uint64_t, uint64_t>; + using ReplicaStore = search::datastore::ArrayStore<BucketCopy>; + using GenerationHandler = vespalib::GenerationHandler; + + BTree _tree; + ReplicaStore _store; + GenerationHandler _generation_handler; +public: + BTreeBucketDatabase(); + ~BTreeBucketDatabase() override; + + // Ye olde bucket DB API: + Entry get(const document::BucketId& bucket) const override; + void remove(const document::BucketId& bucket) override; + void getParents(const document::BucketId& childBucket, + std::vector<Entry>& entries) const override; + void getAll(const document::BucketId& bucket, + std::vector<Entry>& entries) const override; + void update(const Entry& newEntry) override; + void forEach(EntryProcessor&, const document::BucketId& after) const override; + void forEach(MutableEntryProcessor&, const document::BucketId& after) override; + Entry upperBound(const document::BucketId& value) const override; + uint64_t size() const override; + void clear() override; + document::BucketId getAppropriateBucket( + uint16_t minBits, + const document::BucketId& bid) override; + uint32_t childCount(const document::BucketId&) const override; + void print(std::ostream& out, 
bool verbose, + const std::string& indent) const override; + +private: + Entry entry_from_iterator(const BTree::ConstIterator& iter) const; + document::BucketId bucket_from_valid_iterator(const BTree::ConstIterator& iter) const; + void commit_tree_changes(); + BTree::ConstIterator find_parents_internal(const document::BucketId& bucket, + std::vector<Entry>& entries) const; +}; + +} diff --git a/storage/src/vespa/storage/bucketdb/bucketdatabase.h b/storage/src/vespa/storage/bucketdb/bucketdatabase.h index d939ac48a03..a5a70c488ca 100644 --- a/storage/src/vespa/storage/bucketdb/bucketdatabase.h +++ b/storage/src/vespa/storage/bucketdb/bucketdatabase.h @@ -19,8 +19,8 @@ public: public: Entry() : _bucketId(0) {} // Invalid entry - Entry(const document::BucketId& bId, const BucketInfo& bucketInfo) - : _bucketId(bId), _info(bucketInfo) {} + Entry(const document::BucketId& bId, BucketInfo bucketInfo) + : _bucketId(bId), _info(std::move(bucketInfo)) {} explicit Entry(const document::BucketId& bId) : _bucketId(bId) {} bool operator==(const Entry& other) const { @@ -54,7 +54,7 @@ public: virtual void remove(const document::BucketId& bucket) = 0; /** - * Puts all entries that are can contain the given bucket id + * Puts all entries that may contain the given bucket id * into the given entry vector, including itself if found. 
*/ virtual void getParents(const document::BucketId& childBucket, diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.cpp b/storage/src/vespa/storage/bucketdb/bucketinfo.cpp index bbc59de410a..82fc6686c5f 100644 --- a/storage/src/vespa/storage/bucketdb/bucketinfo.cpp +++ b/storage/src/vespa/storage/bucketdb/bucketinfo.cpp @@ -6,10 +6,21 @@ namespace storage { BucketInfo::BucketInfo() - : _lastGarbageCollection(0) -{ } + : _lastGarbageCollection(0), + _nodes() +{} -BucketInfo::~BucketInfo() { } +BucketInfo::BucketInfo(uint32_t lastGarbageCollection, std::vector<BucketCopy> nodes) + : _lastGarbageCollection(lastGarbageCollection), + _nodes(std::move(nodes)) +{} + +BucketInfo::~BucketInfo() = default; + +BucketInfo::BucketInfo(const BucketInfo&) = default; +BucketInfo& BucketInfo::operator=(const BucketInfo&) = default; +BucketInfo::BucketInfo(BucketInfo&&) noexcept = default; +BucketInfo& BucketInfo::operator=(BucketInfo&&) noexcept = default; std::string BucketInfo::toString() const { diff --git a/storage/src/vespa/storage/bucketdb/bucketinfo.h b/storage/src/vespa/storage/bucketdb/bucketinfo.h index ace82561b2a..8949c0f10da 100644 --- a/storage/src/vespa/storage/bucketdb/bucketinfo.h +++ b/storage/src/vespa/storage/bucketdb/bucketinfo.h @@ -22,8 +22,14 @@ private: public: BucketInfo(); + BucketInfo(uint32_t lastGarbageCollection, std::vector<BucketCopy> nodes); ~BucketInfo(); + BucketInfo(const BucketInfo&); + BucketInfo& operator=(const BucketInfo&); + BucketInfo(BucketInfo&&) noexcept; + BucketInfo& operator=(BucketInfo&&) noexcept; + /** * @return Returns the last time when this bucket was "garbage collected". 
*/ @@ -136,6 +142,10 @@ public: return _nodes[idx]; } + const std::vector<BucketCopy>& getRawNodes() const noexcept { + return _nodes; + } + void clearTrusted(uint16_t nodeIdx) { getNodeInternal(nodeIdx)->clearTrusted(); } diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.h b/storage/src/vespa/storage/bucketdb/lockablemap.h index 89be92d5921..2b2c2cbe7a8 100644 --- a/storage/src/vespa/storage/bucketdb/lockablemap.h +++ b/storage/src/vespa/storage/bucketdb/lockablemap.h @@ -305,7 +305,7 @@ private: * Find the given list of keys in the map and add them to the map of * results, locking them in the process. */ - void addAndLockResults(const std::vector<BucketId::Type> keys, + void addAndLockResults(const std::vector<BucketId::Type>& keys, const char* clientId, std::map<BucketId, WrappedEntry>& results, std::unique_lock<std::mutex> &guard); diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.hpp b/storage/src/vespa/storage/bucketdb/lockablemap.hpp index 6d700cd6049..440a76b92fd 100644 --- a/storage/src/vespa/storage/bucketdb/lockablemap.hpp +++ b/storage/src/vespa/storage/bucketdb/lockablemap.hpp @@ -549,7 +549,7 @@ LockableMap<Map>::getAllContaining(const BucketId& bucket, template<typename Map> void LockableMap<Map>::addAndLockResults( - const std::vector<BucketId::Type> keys, + const std::vector<BucketId::Type>& keys, const char* clientId, std::map<BucketId, WrappedEntry>& results, std::unique_lock<std::mutex> &guard) diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp index db2842b70f2..d5c16bc7f5b 100644 --- a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp +++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp @@ -822,7 +822,7 @@ BucketDBUpdater::NodeRemover::setCopiesInEntry( e->addNodes(copies, order); - LOG(debug, "Changed %s", e->toString().c_str()); + LOG(spam, "Changed %s", e->toString().c_str()); LOG_BUCKET_OPERATION_NO_LOCK( e.getBucketId(), 
vespalib::make_string("updated bucketdb entry to %s", @@ -834,7 +834,7 @@ BucketDBUpdater::NodeRemover::removeEmptyBucket(const document::BucketId& bucket { _removedBuckets.push_back(bucketId); - LOG(debug, + LOG(spam, "After system state change %s, bucket %s now has no copies.", _oldState.getTextualDifference(_state).c_str(), bucketId.toString().c_str()); diff --git a/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp b/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp index dcf7792860e..f013ce43048 100644 --- a/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp +++ b/storage/src/vespa/storage/distributor/distributor_bucket_space.cpp @@ -1,13 +1,24 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "distributor_bucket_space.h" +#include <vespa/storage/bucketdb/mapbucketdatabase.h> +#include <vespa/storage/bucketdb/btree_bucket_database.h> #include <vespa/vdslib/state/clusterstate.h> #include <vespa/vdslib/distribution/distribution.h> namespace storage::distributor { +namespace { + +std::unique_ptr<BucketDatabase> make_default_db_impl() { + //return std::make_unique<BTreeBucketDatabase>(); + return std::make_unique<MapBucketDatabase>(); +} + +} + DistributorBucketSpace::DistributorBucketSpace() - : _bucketDatabase(), + : _bucketDatabase(make_default_db_impl()), _clusterState(), _distribution() { diff --git a/storage/src/vespa/storage/distributor/distributor_bucket_space.h b/storage/src/vespa/storage/distributor/distributor_bucket_space.h index 23d00c6ae43..effb0dc3e17 100644 --- a/storage/src/vespa/storage/distributor/distributor_bucket_space.h +++ b/storage/src/vespa/storage/distributor/distributor_bucket_space.h @@ -1,9 +1,12 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#pragma once -#include <vespa/storage/bucketdb/mapbucketdatabase.h> #include <memory> +namespace storage { +class BucketDatabase; +} + namespace storage::lib { class ClusterState; class Distribution; @@ -23,7 +26,7 @@ namespace storage::distributor { * bucket spaces. */ class DistributorBucketSpace { - MapBucketDatabase _bucketDatabase; + std::unique_ptr<BucketDatabase> _bucketDatabase; std::shared_ptr<const lib::ClusterState> _clusterState; std::shared_ptr<const lib::Distribution> _distribution; public: @@ -36,10 +39,10 @@ public: DistributorBucketSpace& operator=(DistributorBucketSpace&&) = delete; BucketDatabase& getBucketDatabase() noexcept { - return _bucketDatabase; + return *_bucketDatabase; } const BucketDatabase& getBucketDatabase() const noexcept { - return _bucketDatabase; + return *_bucketDatabase; } void setClusterState(std::shared_ptr<const lib::ClusterState> clusterState); diff --git a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp index c295be19a0b..4208e8b33f9 100644 --- a/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp +++ b/storage/src/vespa/storage/distributor/pending_bucket_space_db_transition.cpp @@ -49,9 +49,7 @@ PendingBucketSpaceDbTransition::PendingBucketSpaceDbTransition(const PendingClus } } -PendingBucketSpaceDbTransition::~PendingBucketSpaceDbTransition() -{ -} +PendingBucketSpaceDbTransition::~PendingBucketSpaceDbTransition() = default; PendingBucketSpaceDbTransition::Range PendingBucketSpaceDbTransition::skipAllForSameBucket() |