diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2020-05-11 13:22:52 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2020-05-11 17:49:38 +0000 |
commit | 98d7c8f1c1bd36200db961c0b848a820014ab61f (patch) | |
tree | fd58c846e8796025b5eda6a3317fd87df434d70f /searchlib | |
parent | d4b7118d3632acdd9ca6518b3bfcb4dec5caefee (diff) |
Account and limit number of lids per file to reduce spkies during compaction.
This is to avoid ending up with a very long listof lids that are removed but not compacted away due to lidspace compaction being disabled when
removes and delets are too frequent.
Diffstat (limited to 'searchlib')
12 files changed, 97 insertions, 58 deletions
diff --git a/searchlib/src/tests/docstore/file_chunk/file_chunk_test.cpp b/searchlib/src/tests/docstore/file_chunk/file_chunk_test.cpp index 31c21723cd0..70c23dc4191 100644 --- a/searchlib/src/tests/docstore/file_chunk/file_chunk_test.cpp +++ b/searchlib/src/tests/docstore/file_chunk/file_chunk_test.cpp @@ -147,6 +147,7 @@ struct WriteFixture : public FixtureBase { void updateLidMap(uint32_t docIdLimit) { vespalib::LockGuard guard(updateLock); chunk.updateLidMap(guard, lidObserver, serialNum, docIdLimit); + serialNum = chunk.getSerialNum(); } }; @@ -180,6 +181,30 @@ TEST("require that docIdLimit is written to and read from idx file header") } } +TEST("require that numlids are updated") { + { + WriteFixture f("tmp", 1000, false); + f.updateLidMap(1000); + EXPECT_EQUAL(0u, f.chunk.getNumLids()); + f.append(1); + EXPECT_EQUAL(1u, f.chunk.getNumLids()); + f.append(2); + f.append(3); + EXPECT_EQUAL(3u, f.chunk.getNumLids()); + f.append(3); + EXPECT_EQUAL(4u, f.chunk.getNumLids()); + f.flush(); + } + { + WriteFixture f("tmp", 1000, true); + EXPECT_EQUAL(0u, f.chunk.getNumLids()); + f.updateLidMap(1000); + EXPECT_EQUAL(4u, f.chunk.getNumLids()); + f.append(7); + EXPECT_EQUAL(5u, f.chunk.getNumLids()); + } +} + template <typename FixtureType> void assertUpdateLidMap(FixtureType &f) diff --git a/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp b/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp index c383358db9e..70e15611456 100644 --- a/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp +++ b/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp @@ -202,15 +202,24 @@ TEST("test that DirectIOPadding works accordng to spec") { } #endif -TEST("testGrowing") { - FastOS_File::EmptyAndRemoveDirectory("growing"); - EXPECT_TRUE(FastOS_File::MakeDirectory("growing")); - LogDataStore::Config config; //(100000, 0.1, 3.0, 0.2, 8, true, CompressionConfig::LZ4, - // WriteableFileChunk::Config(CompressionConfig(CompressionConfig::LZ4, 9, 60), 1000)); - config.setMaxFileSize(100000).setMaxDiskBloatFactor(0.1).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2) - .compactCompression({CompressionConfig::LZ4}) - .setFileConfig({{CompressionConfig::LZ4, 9, 60}, 1000}); - vespalib::ThreadStackExecutor executor(8, 128*1024); +class TmpDirectory { +public: + TmpDirectory(const vespalib::string & dir) : _dir(dir) + { + FastOS_File::EmptyAndRemoveDirectory(_dir.c_str()); + ASSERT_TRUE(FastOS_File::MakeDirectory(_dir.c_str())); + } + ~TmpDirectory() { + FastOS_File::EmptyAndRemoveDirectory(_dir.c_str()); + } + const vespalib::string & getDir() const { return _dir; } +private: + vespalib::string _dir; +}; + +void verifyGrowing(const LogDataStore::Config & config, uint32_t minFiles, uint32_t maxFiles) { + TmpDirectory tmpDir("growing"); + vespalib::ThreadStackExecutor executor(4, 128*1024); DummyFileHeaderContext fileHeaderContext; MyTlSyncer tlSyncer; { @@ -243,30 +252,32 @@ TEST("testGrowing") { datastore.compact(30000); datastore.remove(31000, 0); checkStats(datastore, 31000, 30000); + EXPECT_LESS_EQUAL(minFiles, datastore.getAllActiveFiles().size()); + EXPECT_GREATER_EQUAL(maxFiles, datastore.getAllActiveFiles().size()); } { LogDataStore datastore(executor, "growing", config, GrowStrategy(), TuneFileSummary(), fileHeaderContext, tlSyncer, nullptr); checkStats(datastore, 30000, 30000); + EXPECT_LESS_EQUAL(minFiles, datastore.getAllActiveFiles().size()); + EXPECT_GREATER_EQUAL(maxFiles, datastore.getAllActiveFiles().size()); } - - FastOS_File::EmptyAndRemoveDirectory("growing"); +} +TEST("testGrowingChunkedBySize") { + LogDataStore::Config config; + config.setMaxFileSize(100000).setMaxDiskBloatFactor(0.1).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2) + .compactCompression({CompressionConfig::LZ4}) + .setFileConfig({{CompressionConfig::LZ4, 9, 60}, 1000}); + verifyGrowing(config,60, 120); } -class TmpDirectory { -public: - TmpDirectory(const vespalib::string & dir) : _dir(dir) - { - FastOS_File::EmptyAndRemoveDirectory(_dir.c_str()); - ASSERT_TRUE(FastOS_File::MakeDirectory(_dir.c_str())); - } - ~TmpDirectory() { - FastOS_File::EmptyAndRemoveDirectory(_dir.c_str()); - } - const vespalib::string & getDir() const { return _dir; } -private: - vespalib::string _dir; -}; +TEST("testGrowingChunkedByNumLids") { + LogDataStore::Config config; + config.setMaxNumLids(1000).setMaxDiskBloatFactor(0.1).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2) + .compactCompression({CompressionConfig::LZ4}) + .setFileConfig({{CompressionConfig::LZ4, 9, 60}, 1000}); + verifyGrowing(config,10, 10); +} void fetchAndTest(IDataStore & datastore, uint32_t lid, const void *a, size_t sz) { @@ -357,8 +368,7 @@ private: LogDataStore _datastore; }; -VisitStore::~VisitStore() { -} +VisitStore::~VisitStore() =default; TEST("test visit cache does not cache empty ones and is able to access some backing store.") { const char * A7 = "aAaAaAa"; diff --git a/searchlib/src/vespa/searchlib/docstore/chunk.cpp b/searchlib/src/vespa/searchlib/docstore/chunk.cpp index 847cd3623c3..f60bf69d908 100644 --- a/searchlib/src/vespa/searchlib/docstore/chunk.cpp +++ b/searchlib/src/vespa/searchlib/docstore/chunk.cpp @@ -57,7 +57,7 @@ Chunk::pack(uint64_t lastSerial, vespalib::DataBuffer & compressed, const Compre Chunk::Chunk(uint32_t id, const Config & config) : _id(id), _lastSerial(static_cast<uint64_t>(-1l)), - _format(new ChunkFormatV2(config.getMaxBytes())) + _format(std::make_unique<ChunkFormatV2>(config.getMaxBytes())) { _lids.reserve(4096/sizeof(Entry)); } diff --git a/searchlib/src/vespa/searchlib/docstore/chunkformat.cpp b/searchlib/src/vespa/searchlib/docstore/chunkformat.cpp index a5bed4c33ce..fbbdcff3c5d 100644 --- a/searchlib/src/vespa/searchlib/docstore/chunkformat.cpp +++ b/searchlib/src/vespa/searchlib/docstore/chunkformat.cpp @@ -81,33 +81,26 @@ ChunkFormat::deserialize(const void * buffer, size_t len, bool skipcrc) uint32_t crc32(0); raw >> crc32; raw.rp(currPos); - ChunkFormat::UP format; if (version == ChunkFormatV1::VERSION) { if (skipcrc) { - format.reset(new ChunkFormatV1(raw)); + return std::make_unique<ChunkFormatV1>(raw); } else { - format.reset(new ChunkFormatV1(raw, crc32)); + return std::make_unique<ChunkFormatV1>(raw, crc32); } } else if (version == ChunkFormatV2::VERSION) { if (skipcrc) { - format.reset(new ChunkFormatV2(raw)); + return std::make_unique<ChunkFormatV2>(raw); } else { - format.reset(new ChunkFormatV2(raw, crc32)); + return std::make_unique<ChunkFormatV2>(raw, crc32); } } else { throw ChunkException(make_string("Unknown version %d", version), VESPA_STRLOC); } - return format; } -ChunkFormat::ChunkFormat() : - _dataBuf() -{ -} +ChunkFormat::ChunkFormat() = default; -ChunkFormat::~ChunkFormat() -{ -} +ChunkFormat::~ChunkFormat() = default; ChunkFormat::ChunkFormat(size_t maxSize) : _dataBuf(maxSize) diff --git a/searchlib/src/vespa/searchlib/docstore/filechunk.cpp b/searchlib/src/vespa/searchlib/docstore/filechunk.cpp index 84d64954e40..b4ef45187ee 100644 --- a/searchlib/src/vespa/searchlib/docstore/filechunk.cpp +++ b/searchlib/src/vespa/searchlib/docstore/filechunk.cpp @@ -78,9 +78,10 @@ FileChunk::FileChunk(FileId fileId, NameId nameId, const vespalib::string & base _dataFileName(createDatFileName(_name)), _idxFileName(createIdxFileName(_name)), _chunkInfo(), + _lastPersistedSerialNum(0), _dataHeaderLen(0u), _idxHeaderLen(0u), - _lastPersistedSerialNum(0), + _numLids(0), _docIdLimit(std::numeric_limits<uint32_t>::max()), _modificationTime() { @@ -228,6 +229,7 @@ FileChunk::updateLidMap(const LockGuard &guard, ISetLid &ds, uint64_t serialNum, globalBucketMap.recordLid(bucketId); } ds.setLid(guard, lidMeta.getLid(), LidInfo(getFileId().getId(), _chunkInfo.size(), lidMeta.size())); + _numLids++; } else { remove(lidMeta.getLid(), lidMeta.size()); } diff --git a/searchlib/src/vespa/searchlib/docstore/filechunk.h b/searchlib/src/vespa/searchlib/docstore/filechunk.h index 0f139f507c6..b68db801d60 100644 --- a/searchlib/src/vespa/searchlib/docstore/filechunk.h +++ b/searchlib/src/vespa/searchlib/docstore/filechunk.h @@ -154,6 +154,7 @@ public: FileId getFileId() const { return _fileId; } NameId getNameId() const { return _nameId; } + uint32_t getNumLids() const { return _numLids; } size_t getBloatCount() const { return _erasedCount; } size_t getAddedBytes() const { return _addedBytes; } size_t getErasedBytes() const { return _erasedBytes; } @@ -245,9 +246,10 @@ protected: vespalib::string _dataFileName; vespalib::string _idxFileName; ChunkInfoVector _chunkInfo; + uint64_t _lastPersistedSerialNum; uint32_t _dataHeaderLen; uint32_t _idxHeaderLen; - uint64_t _lastPersistedSerialNum; + uint32_t _numLids; uint32_t _docIdLimit; // Limit when the file was created. Stored in idx file header. vespalib::system_time _modificationTime; }; diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp index 91de6ba4276..ae0696d5824 100644 --- a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp +++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp @@ -16,6 +16,11 @@ LOG_SETUP(".searchlib.docstore.logdatastore"); namespace search { +namespace { + constexpr size_t DEFAULT_MAX_FILESIZE = 1000000000ul; + constexpr uint32_t DEFAULT_MAX_LIDS_PER_FILE = 32 * 1024 * 1024; +} + using vespalib::LockGuard; using vespalib::getLastErrorString; using vespalib::getErrorString; @@ -30,10 +35,11 @@ using docstore::BucketCompacter; using namespace std::literals; LogDataStore::Config::Config() - : _maxFileSize(1000000000ul), + : _maxFileSize(DEFAULT_MAX_FILESIZE), _maxDiskBloatFactor(0.2), _maxBucketSpread(2.5), _minFileSizeFactor(0.2), + _maxNumLids(DEFAULT_MAX_LIDS_PER_FILE), _skipCrcOnRead(false), _compactCompression(CompressionConfig::LZ4), _fileConfig() @@ -202,9 +208,9 @@ LogDataStore::requireSpace(LockGuard guard, WriteableFileChunk & active) { assert(active.getFileId() == getActiveFileId(guard)); size_t oldSz(active.getDiskFootprint()); - LOG(spam, "Checking file %s size %ld < %ld", - active.getName().c_str(), oldSz, _config.getMaxFileSize()); - if (oldSz > _config.getMaxFileSize()) { + LOG(spam, "Checking file %s size %ld < %ld AND #lids %u < %u", + active.getName().c_str(), oldSz, _config.getMaxFileSize(), active.getNumLids(), _config.getMaxNumLids()); + if ((oldSz > _config.getMaxFileSize()) || (active.getNumLids() >= _config.getMaxNumLids())) { FileId fileId = allocateFileId(guard); setNewFileChunk(guard, createWritableFile(fileId, active.getSerialNum())); setActive(guard, fileId); @@ -220,9 +226,9 @@ LogDataStore::requireSpace(LockGuard guard, WriteableFileChunk & active) active.flushPendingChunks(active.getSerialNum()); active.freeze(); // TODO: Delay create of new file - LOG(debug, "Closed file %s of size %ld due to maxsize of %ld reached. Bloat is %ld", - active.getName().c_str(), active.getDiskFootprint(), - _config.getMaxFileSize(), active.getDiskBloat()); + LOG(debug, "Closed file %s of size %ld and %u lids due to maxsize of %ld or maxlids %u reached. Bloat is %ld", + active.getName().c_str(), active.getDiskFootprint(), active.getNumLids(), + _config.getMaxFileSize(), _config.getMaxNumLids(), active.getDiskBloat()); } } diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.h b/searchlib/src/vespa/searchlib/docstore/logdatastore.h index 39b2c2b7100..7bd55599611 100644 --- a/searchlib/src/vespa/searchlib/docstore/logdatastore.h +++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.h @@ -40,6 +40,7 @@ public: Config(); Config & setMaxFileSize(size_t v) { _maxFileSize = v; return *this; } + Config & setMaxNumLids(size_t v) { _maxNumLids = v; return *this; } Config & setMaxDiskBloatFactor(double v) { _maxDiskBloatFactor = v; return *this; } Config & setMaxBucketSpread(double v) { _maxBucketSpread = v; return *this; } Config & setMinFileSizeFactor(double v) { _minFileSizeFactor = v; return *this; } @@ -51,6 +52,7 @@ public: double getMaxDiskBloatFactor() const { return _maxDiskBloatFactor; } double getMaxBucketSpread() const { return _maxBucketSpread; } double getMinFileSizeFactor() const { return _minFileSizeFactor; } + uint32_t getMaxNumLids() const { return _maxNumLids; } bool crcOnReadDisabled() const { return _skipCrcOnRead; } const CompressionConfig & compactCompression() const { return _compactCompression; } @@ -64,6 +66,7 @@ public: double _maxDiskBloatFactor; double _maxBucketSpread; double _minFileSizeFactor; + uint32_t _maxNumLids; bool _skipCrcOnRead; CompressionConfig _compactCompression; WriteableFileChunk::Config _fileConfig; diff --git a/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp b/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp index 13de5687ee0..5e595d0bb14 100644 --- a/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp +++ b/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp @@ -92,7 +92,7 @@ StoreByBucket::drain(IWrite & drainer) chunks.resize(_chunks.size()); for (const auto & it : _chunks) { ConstBufferRef buf(it.second); - chunks[it.first].reset(new Chunk(it.first, buf.data(), buf.size())); + chunks[it.first] = std::make_unique<Chunk>(it.first, buf.data(), buf.size()); } _chunks.clear(); for (auto & it : _where) { diff --git a/searchlib/src/vespa/searchlib/docstore/writeablefilechunk.cpp b/searchlib/src/vespa/searchlib/docstore/writeablefilechunk.cpp index d12a3a54a93..3517595d00a 100644 --- a/searchlib/src/vespa/searchlib/docstore/writeablefilechunk.cpp +++ b/searchlib/src/vespa/searchlib/docstore/writeablefilechunk.cpp @@ -708,6 +708,7 @@ WriteableFileChunk::append(uint64_t serialNum, uint32_t lid, const void * buffer assert(serialNum >= _serialNum); _serialNum = serialNum; _addedBytes += adjustSize(len); + _numLids++; size_t oldSz(_active->size()); LidMeta lm = _active->append(lid, buffer, len); setDiskFootprint(FileChunk::getDiskFootprint() - oldSz + _active->size()); diff --git a/searchlib/src/vespa/searchlib/docstore/writeablefilechunk.h b/searchlib/src/vespa/searchlib/docstore/writeablefilechunk.h index 6d870cae019..65dd822ac1f 100644 --- a/searchlib/src/vespa/searchlib/docstore/writeablefilechunk.h +++ b/searchlib/src/vespa/searchlib/docstore/writeablefilechunk.h @@ -47,7 +47,7 @@ public: uint32_t docIdLimit, const Config & config, const TuneFileSummary &tune, const common::FileHeaderContext &fileHeaderContext, const IBucketizer * bucketizer, bool crcOnReadDisabled); - ~WriteableFileChunk(); + ~WriteableFileChunk() override; ssize_t read(uint32_t lid, SubChunkId chunk, vespalib::DataBuffer & buffer) const override; void read(LidInfoWithLidV::const_iterator begin, size_t count, IBufferVisitor & visitor) const override; @@ -128,8 +128,8 @@ private: bool _writeTaskIsRunning; vespalib::Monitor _writeMonitor; ProcessedChunkQ _writeQ; - vespalib::Executor & _executor; - ProcessedChunkMap _orderedChunks; + vespalib::Executor & _executor; + ProcessedChunkMap _orderedChunks; BucketDensityComputer _bucketMap; }; diff --git a/searchlib/src/vespa/searchlib/test/directory_handler.h b/searchlib/src/vespa/searchlib/test/directory_handler.h index 72f9cf83d85..a0b8c68887e 100644 --- a/searchlib/src/vespa/searchlib/test/directory_handler.h +++ b/searchlib/src/vespa/searchlib/test/directory_handler.h @@ -5,8 +5,7 @@ #include <vespa/vespalib/io/fileutil.h> #include <vespa/vespalib/stllike/string.h> -namespace search { -namespace test { +namespace search::test { class DirectoryHandler { @@ -40,5 +39,3 @@ public: }; } -} - |