diff options
author | Geir Storli <geirst@yahoo-inc.com> | 2017-05-10 14:30:29 +0000 |
---|---|---|
committer | Geir Storli <geirst@yahoo-inc.com> | 2017-05-12 14:01:24 +0000 |
commit | 6d253c5644128fa500dfb27ceb48a79a58522db8 (patch) | |
tree | 3104173b278121d2b0c4b5934c98604ba623562f /searchlib | |
parent | bc5e77f6f49107a3005ef03fcfb8131c2261ecc9 (diff) |
Support that lid space can be compacted and entries from old files skipped during load.
Diffstat (limited to 'searchlib')
3 files changed, 91 insertions, 20 deletions
diff --git a/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp b/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp
index 4f4ebab9687..f2a41e10819 100644
--- a/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp
+++ b/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp
@@ -789,8 +789,8 @@ struct Fixture {
         store.initFlush(serialNum);
         store.flush(serialNum);
     }
-    Fixture &write(uint32_t lid) {
-        vespalib::string data = genData(lid, 256);
+    Fixture &write(uint32_t lid, size_t numBytes = 1024) {
+        vespalib::string data = genData(lid, numBytes);
         store.write(nextSerialNum(), lid, data.c_str(), data.size());
         return *this;
     }
@@ -806,6 +806,9 @@ struct Fixture {
     void assertDocIdLimit(uint32_t expDocIdLimit) {
         EXPECT_EQUAL(expDocIdLimit, store.getDocIdLimit());
     }
+    void assertNumChunks(size_t numChunks) {
+        EXPECT_EQUAL(numChunks, store.getFileChunkStats().size());
+    }
     void assertDocIdLimitInFileChunks(const std::vector<uint32_t> expLimits) {
         std::vector<uint32_t> actLimits;
         for (const auto &stat : store.getFileChunkStats()) {
@@ -813,22 +816,45 @@ struct Fixture {
         }
         EXPECT_EQUAL(expLimits, actLimits);
     }
+    void assertContent(const std::set<uint32_t> &lids, uint32_t docIdLimit, size_t numBytesPerEntry = 1024) {
+        for (uint32_t lid = 0; lid < docIdLimit; ++lid) {
+            vespalib::DataBuffer buffer;
+            size_t size = store.read(lid, buffer);
+            if (lids.find(lid) != lids.end()) {
+                vespalib::string expData = genData(lid, numBytesPerEntry);
+                EXPECT_EQUAL(expData, vespalib::string(buffer.getData(), buffer.getDataLen()));
+                EXPECT_GREATER(size, 0u);
+            } else {
+                EXPECT_EQUAL("", vespalib::string(buffer.getData(), buffer.getDataLen()));
+                EXPECT_EQUAL(0u, size);
+            }
+        }
+    }
 };
 
-TEST_F("require that docIdLimit is updated when inserting entries", Fixture("tmp"))
+TEST("require that docIdLimit is updated when inserting entries")
 {
-    f.assertDocIdLimit(0);
-    f.write(10);
-    f.assertDocIdLimit(11);
-    f.write(9);
-    f.assertDocIdLimit(11);
-    f.write(11);
-    f.assertDocIdLimit(12);
+    {
+        Fixture f("tmp", false);
+        f.assertDocIdLimit(0);
+        f.write(10);
+        f.assertDocIdLimit(11);
+        f.write(9);
+        f.assertDocIdLimit(11);
+        f.write(11);
+        f.assertDocIdLimit(12);
+        f.assertNumChunks(1);
+        f.flush();
+    }
+    {
+        Fixture f("tmp");
+        f.assertDocIdLimit(12);
+    }
 }
 
-TEST("require that docIdLimit at idx file creation time is written to file header")
+TEST("require that docIdLimit at idx file creation time is written to idx file header")
 {
-    std::vector<uint32_t> expLimits = {std::numeric_limits<uint32_t>::max(),24,114,214};
+    std::vector<uint32_t> expLimits = {std::numeric_limits<uint32_t>::max(),14,104,204};
     {
         Fixture f("tmp", false);
         f.writeUntilNewChunk(10);
@@ -843,6 +869,34 @@ TEST("require that docIdLimit at idx file creation time is written to file heade
     }
 }
 
+TEST("require that lid space can be compacted and entries from old files skipped during load")
+{
+    {
+        Fixture f("tmp", false);
+        f.write(10);
+        f.writeUntilNewChunk(100);
+        f.write(20);
+        f.writeUntilNewChunk(200);
+        f.write(30);
+        TEST_DO(f.assertContent({10,100,101,102,20,200,201,202,30}, 203));
+
+        f.assertDocIdLimit(203);
+        f.store.compactLidSpace(100);
+        f.assertDocIdLimit(100);
+        TEST_DO(f.assertContent({10,20,30}, 203));
+
+        f.writeUntilNewChunk(31);
+        f.write(99);
+        f.write(300);
+        TEST_DO(f.assertContent({10,20,30,31,32,33,99,300}, 301));
+        f.assertDocIdLimitInFileChunks({std::numeric_limits<uint32_t>::max(),103,203,100});
+        f.flush();
+    }
+    {
+        Fixture f("tmp");
+        TEST_DO(f.assertContent({10,20,30,31,32,33,99,300}, 301));
+    }
+}
 TEST_MAIN()
 {
     DummyFileHeaderContext::setCreator("logdatastore_test");
diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp
index 4daa1417ea7..478606f437c 100644
--- a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp
+++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp
@@ -53,14 +53,15 @@ LogDataStore::LogDataStore(vespalib::ThreadExecutor &executor,
       _executor(executor),
       _initFlushSyncToken(0),
       _tlSyncer(tlSyncer),
-      _bucketizer(bucketizer)
+      _bucketizer(bucketizer),
+      _currentlyCompacting()
 {
     // Reserve space for 1TB summary in order to avoid locking.
     _fileChunks.reserve(LidInfo::getFileIdLimit());
     _holdFileChunks.resize(LidInfo::getFileIdLimit());
 
     preload();
-    updateLidMap();
+    updateLidMap(getLastFileChunkDocIdLimit());
     updateSerialNum();
 }
 
@@ -86,13 +87,16 @@ LogDataStore::~LogDataStore()
 }
 
 void
-LogDataStore::updateLidMap()
+LogDataStore::updateLidMap(uint32_t lastFileChunkDocIdLimit)
 {
     uint64_t lastSerialNum(0);
     LockGuard guard(_updateLock);
-    for (FileChunk::UP & fc : _fileChunks) {
-        fc->updateLidMap(guard, *this, lastSerialNum, std::numeric_limits<uint32_t>::max());
-        lastSerialNum = fc->getLastPersistedSerialNum();
+    for (size_t i = 0; i < _fileChunks.size(); ++i) {
+        FileChunk::UP &chunk = _fileChunks[i];
+        bool lastChunk = ((i + 1) == _fileChunks.size());
+        uint32_t docIdLimit = lastChunk ? std::numeric_limits<uint32_t>::max() : lastFileChunkDocIdLimit;
+        chunk->updateLidMap(guard, *this, lastSerialNum, docIdLimit);
+        lastSerialNum = chunk->getLastPersistedSerialNum();
     }
 }
 
@@ -770,6 +774,15 @@ LogDataStore::preload()
     _prevActive = _active.prev();
 }
 
+uint32_t
+LogDataStore::getLastFileChunkDocIdLimit()
+{
+    if (!_fileChunks.empty()) {
+        return _fileChunks.back()->getDocIdLimit();
+    }
+    return std::numeric_limits<uint32_t>::max();
+}
+
 LogDataStore::NameIdSet
 LogDataStore::eraseEmptyIdxFiles(const NameIdSet &partList)
 {
@@ -1105,7 +1118,10 @@ LogDataStore::getFileChunkStats() const
 void
 LogDataStore::compactLidSpace(uint32_t wantedDocLidLimit)
 {
-    (void) wantedDocLidLimit;
+    for (size_t i = wantedDocLidLimit; i < _lidInfo.size(); ++i) {
+        _lidInfo[i] = LidInfo();
+    }
+    setDocIdLimit(wantedDocLidLimit);
 }
 
 bool
diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.h b/searchlib/src/vespa/searchlib/docstore/logdatastore.h
index b9d0bea5eed..426d1c4b8ff 100644
--- a/searchlib/src/vespa/searchlib/docstore/logdatastore.h
+++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.h
@@ -217,8 +217,9 @@ private:
     typedef attribute::RcuVector<uint64_t> LidInfoVector;
     typedef std::vector<FileChunk::UP> FileChunkVector;
 
-    void updateLidMap();
+    void updateLidMap(uint32_t lastFileChunkDocIdLimit);
     void preload();
+    uint32_t getLastFileChunkDocIdLimit();
     void verifyModificationTime(const NameIdSet & partList);
     void eraseDanglingDatFiles(const NameIdSet &partList, const NameIdSet &datPartList);
 