diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2021-12-08 15:08:45 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-08 15:08:45 +0100 |
commit | c879b4cffbf447f235ec868fde6b44ba3f3e0dc0 (patch) | |
tree | 140e18aec39bfbe0400b073806ee870b8ec32eed /searchlib | |
parent | 115fcf8a11bf3332e8cdd90a81dc6ccca728886f (diff) | |
parent | 65c857688005ac042809d034a27c9eab586eb4d3 (diff) |
Merge pull request #20412 from vespa-engine/balder/split-bloat-and-spread
Separate spread and bloat
Diffstat (limited to 'searchlib')
9 files changed, 66 insertions, 90 deletions
diff --git a/searchlib/src/tests/docstore/document_store/document_store_test.cpp b/searchlib/src/tests/docstore/document_store/document_store_test.cpp index 1a6b0a5a1c6..f2bec30a349 100644 --- a/searchlib/src/tests/docstore/document_store/document_store_test.cpp +++ b/searchlib/src/tests/docstore/document_store/document_store_test.cpp @@ -25,7 +25,7 @@ struct NullDataStore : IDataStore { size_t memoryMeta() const override { return 0; } size_t getDiskFootprint() const override { return 0; } size_t getDiskBloat() const override { return 0; } - size_t getMaxCompactGain() const override { return 0; } + size_t getMaxSpreadAsBloat() const override { return 0; } uint64_t lastSyncToken() const override { return 0; } uint64_t tentativeLastSyncToken() const override { return 0; } vespalib::system_time getLastFlushTime() const override { return vespalib::system_time(); } diff --git a/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp b/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp index 07652dfd336..378babb6ee1 100644 --- a/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp +++ b/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp @@ -236,7 +236,7 @@ void verifyGrowing(const LogDataStore::Config & config, uint32_t minFiles, uint3 datastore.remove(i + 20000, i); } datastore.flush(datastore.initFlush(lastSyncToken)); - datastore.compact(30000); + datastore.compactBloat(30000); datastore.remove(31000, 0); checkStats(datastore, 31000, 30000); EXPECT_LESS_EQUAL(minFiles, datastore.getAllActiveFiles().size()); @@ -252,7 +252,7 @@ void verifyGrowing(const LogDataStore::Config & config, uint32_t minFiles, uint3 } TEST("testGrowingChunkedBySize") { LogDataStore::Config config; - config.setMaxFileSize(100000).setMaxDiskBloatFactor(0.1).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2) + config.setMaxFileSize(100000).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2) .compactCompression({CompressionConfig::LZ4}) .setFileConfig({{CompressionConfig::LZ4, 9, 60}, 1000}); verifyGrowing(config, 40, 120); @@ -260,7 +260,7 @@ TEST("testGrowingChunkedBySize") { TEST("testGrowingChunkedByNumLids") { LogDataStore::Config config; - config.setMaxNumLids(1000).setMaxDiskBloatFactor(0.1).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2) + config.setMaxNumLids(1000).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2) .compactCompression({CompressionConfig::LZ4}) .setFileConfig({{CompressionConfig::LZ4, 9, 60}, 1000}); verifyGrowing(config,10, 10); @@ -679,7 +679,7 @@ TEST("testWriteRead") { EXPECT_LESS(0u, headerFootprint); EXPECT_EQUAL(datastore.getDiskFootprint(), headerFootprint); EXPECT_EQUAL(datastore.getDiskBloat(), 0ul); - EXPECT_EQUAL(datastore.getMaxCompactGain(), 0ul); + EXPECT_EQUAL(datastore.getMaxSpreadAsBloat(), 0ul); datastore.write(1, 0, a[0].c_str(), a[0].size()); fetchAndTest(datastore, 0, a[0].c_str(), a[0].size()); datastore.write(2, 0, a[1].c_str(), a[1].size()); @@ -701,7 +701,7 @@ TEST("testWriteRead") { EXPECT_EQUAL(datastore.getDiskFootprint(), 2711ul + headerFootprint); EXPECT_EQUAL(datastore.getDiskBloat(), 0ul); - EXPECT_EQUAL(datastore.getMaxCompactGain(), 0ul); + EXPECT_EQUAL(datastore.getMaxSpreadAsBloat(), 0ul); datastore.flush(datastore.initFlush(lastSyncToken)); } { @@ -715,7 +715,7 @@ TEST("testWriteRead") { EXPECT_LESS(0u, headerFootprint); EXPECT_EQUAL(4944ul + headerFootprint, datastore.getDiskFootprint()); EXPECT_EQUAL(0ul, datastore.getDiskBloat()); - EXPECT_EQUAL(0ul, datastore.getMaxCompactGain()); + EXPECT_EQUAL(0ul, datastore.getMaxSpreadAsBloat()); for(size_t i=0; i < 100; i++) { fetchAndTest(datastore, i, a[i%2].c_str(), a[i%2].size()); @@ -730,7 +730,7 @@ TEST("testWriteRead") { EXPECT_EQUAL(7594ul + headerFootprint, datastore.getDiskFootprint()); EXPECT_EQUAL(0ul, datastore.getDiskBloat()); - EXPECT_EQUAL(0ul, datastore.getMaxCompactGain()); + EXPECT_EQUAL(0ul, datastore.getMaxSpreadAsBloat()); } FastOS_File::EmptyAndRemoveDirectory("empty"); } @@ -1050,7 +1050,6 @@ TEST("require that config equality operator detects inequality") { using C = LogDataStore::Config; EXPECT_TRUE(C() == C()); EXPECT_FALSE(C() == C().setMaxFileSize(1)); - EXPECT_FALSE(C() == C().setMaxDiskBloatFactor(0.3)); EXPECT_FALSE(C() == C().setMaxBucketSpread(0.3)); EXPECT_FALSE(C() == C().setMinFileSizeFactor(0.3)); EXPECT_FALSE(C() == C().setFileConfig(WriteableFileChunk::Config({}, 70))); diff --git a/searchlib/src/vespa/searchlib/docstore/documentstore.cpp b/searchlib/src/vespa/searchlib/docstore/documentstore.cpp index 7aaee7180df..b4ff050c0f6 100644 --- a/searchlib/src/vespa/searchlib/docstore/documentstore.cpp +++ b/searchlib/src/vespa/searchlib/docstore/documentstore.cpp @@ -112,7 +112,6 @@ public: } -using VisitCache = docstore::VisitCache; using docstore::Value; bool @@ -239,7 +238,14 @@ DocumentStore::remove(uint64_t syncToken, DocumentIdT lid) } void -DocumentStore::compact(uint64_t syncToken) +DocumentStore::compactBloat(uint64_t syncToken) +{ + (void) syncToken; + // Most implementations does not offer compact. +} + +void +DocumentStore::compactSpread(uint64_t syncToken) { (void) syncToken; // Most implementations does not offer compact. diff --git a/searchlib/src/vespa/searchlib/docstore/documentstore.h b/searchlib/src/vespa/searchlib/docstore/documentstore.h index b6021d34bef..6402c16cd5e 100644 --- a/searchlib/src/vespa/searchlib/docstore/documentstore.h +++ b/searchlib/src/vespa/searchlib/docstore/documentstore.h @@ -72,7 +72,8 @@ public: void remove(uint64_t syncToken, DocumentIdT lid) override; void flush(uint64_t syncToken) override; uint64_t initFlush(uint64_t synctoken) override; - void compact(uint64_t syncToken) override; + void compactBloat(uint64_t syncToken) override; + void compactSpread(uint64_t syncToken) override; uint64_t lastSyncToken() const override; uint64_t tentativeLastSyncToken() const override; vespalib::system_time getLastFlushTime() const override; @@ -80,7 +81,7 @@ public: size_t memoryUsed() const override { return _backingStore.memoryUsed(); } size_t getDiskFootprint() const override { return _backingStore.getDiskFootprint(); } size_t getDiskBloat() const override { return _backingStore.getDiskBloat(); } - size_t getMaxCompactGain() const override { return _backingStore.getMaxCompactGain(); } + size_t getMaxSpreadAsBloat() const override { return _backingStore.getMaxSpreadAsBloat(); } CacheStats getCacheStats() const override; size_t memoryMeta() const override { return _backingStore.memoryMeta(); } const vespalib::string & getBaseDir() const override { return _backingStore.getBaseDir(); } diff --git a/searchlib/src/vespa/searchlib/docstore/idatastore.h b/searchlib/src/vespa/searchlib/docstore/idatastore.h index 82656ad7e69..fc0eae1d15e 100644 --- a/searchlib/src/vespa/searchlib/docstore/idatastore.h +++ b/searchlib/src/vespa/searchlib/docstore/idatastore.h @@ -121,7 +121,7 @@ public: * to avoid misuse we let the report a more conservative number here if necessary. * @return diskspace to be gained. */ - virtual size_t getMaxCompactGain() const = 0; + virtual size_t getMaxSpreadAsBloat() const = 0; /** diff --git a/searchlib/src/vespa/searchlib/docstore/idocumentstore.h b/searchlib/src/vespa/searchlib/docstore/idocumentstore.h index 0e73e4d7993..d84a5ad7e7e 100644 --- a/searchlib/src/vespa/searchlib/docstore/idocumentstore.h +++ b/searchlib/src/vespa/searchlib/docstore/idocumentstore.h @@ -100,7 +100,8 @@ public: /** * If possible compact the disk. **/ - virtual void compact(uint64_t syncToken) = 0; + virtual void compactBloat(uint64_t syncToken) = 0; + virtual void compactSpread(uint64_t syncToken) = 0; /** * The sync token used for the last successful flush() operation, @@ -153,12 +154,11 @@ public: virtual size_t getDiskBloat() const = 0; /** - * Calculates how much diskspace can be compacted during a flush. - * default is to return th ebloat limit, but as some targets have some internal limits - * to avoid misuse we let the report a more conservative number here if necessary. - * @return diskspace to be gained. + * Calculates the gain from keeping buckets close. It is converted to diskbloat + * so it can be prioritized accordingly. + * @return spread as disk bloat. */ - virtual size_t getMaxCompactGain() const = 0; + virtual size_t getMaxSpreadAsBloat() const = 0; /** * Returns statistics about the cache. diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp index fd25dd56235..6a9ae40cc93 100644 --- a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp +++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp @@ -36,7 +36,6 @@ using namespace std::literals; LogDataStore::Config::Config() : _maxFileSize(DEFAULT_MAX_FILESIZE), - _maxDiskBloatFactor(0.2), _maxBucketSpread(2.5), _minFileSizeFactor(0.2), _maxNumLids(DEFAULT_MAX_LIDS_PER_FILE), @@ -48,7 +47,6 @@ LogDataStore::Config::Config() bool LogDataStore::Config::operator == (const Config & rhs) const { return (_maxBucketSpread == rhs._maxBucketSpread) && - (_maxDiskBloatFactor == rhs._maxDiskBloatFactor) && (_maxFileSize == rhs._maxFileSize) && (_minFileSizeFactor == rhs._minFileSizeFactor) && (_skipCrcOnRead == rhs._skipCrcOnRead) && @@ -294,46 +292,14 @@ vespalib::string bloatMsg(size_t bloat, size_t usage) { } -void -LogDataStore::compact(uint64_t syncToken) -{ - uint64_t usage = getDiskFootprint(); - uint64_t bloat = getDiskBloat(); - LOG(debug, "%s", bloatMsg(bloat, usage).c_str()); - const bool doCompact = (_fileChunks.size() > 1); - if (doCompact) { - LOG(info, "%s. Will compact", bloatMsg(bloat, usage).c_str()); - compactWorst(_config.getMaxDiskBloatFactor(), _config.getMaxBucketSpread(), isTotalDiskBloatExceeded(usage, bloat)); - } - flushActiveAndWait(syncToken); - if (doCompact) { - usage = getDiskFootprint(); - bloat = getDiskBloat(); - LOG(info, "Done compacting. %s", bloatMsg(bloat, usage).c_str()); - } -} - -bool -LogDataStore::isTotalDiskBloatExceeded(size_t diskFootPrint, size_t bloat) const { - const size_t maxConfiguredDiskBloat = diskFootPrint * _config.getMaxDiskBloatFactor(); - return bloat > maxConfiguredDiskBloat; -} - size_t -LogDataStore::getMaxCompactGain() const +LogDataStore::getMaxSpreadAsBloat() const { - size_t bloat = getDiskBloat(); const size_t diskFootPrint = getDiskFootprint(); - if ( ! isTotalDiskBloatExceeded(diskFootPrint, bloat) ) { - bloat = 0; - } - const double maxSpread = getMaxBucketSpread(); - size_t spreadAsBloat = diskFootPrint * (1.0 - 1.0/maxSpread); - if ( maxSpread < _config.getMaxBucketSpread()) { - spreadAsBloat = 0; - } - return (bloat + spreadAsBloat); + return (maxSpread > _config.getMaxBucketSpread()) + ? diskFootPrint * (1.0 - 1.0/maxSpread) + : 0; } void @@ -380,40 +346,34 @@ LogDataStore::getMaxBucketSpread() const } std::pair<bool, LogDataStore::FileId> -LogDataStore::findNextToCompact(double bloatLimit, double spreadLimit, bool prioritizeDiskBloat) +LogDataStore::findNextToCompact(bool dueToBloat) { typedef std::multimap<double, FileId, std::greater<double>> CostMap; - CostMap worstBloat; - CostMap worstSpread; + CostMap worst; MonitorGuard guard(_updateLock); for (size_t i(0); i < _fileChunks.size(); i++) { const auto & fc(_fileChunks[i]); if (fc && fc->frozen() && (_currentlyCompacting.find(fc->getNameId()) == _currentlyCompacting.end())) { uint64_t usage = fc->getDiskFootprint(); - uint64_t bloat = fc->getDiskBloat(); - if (_bucketizer) { - worstSpread.emplace(fc->getBucketSpread(), FileId(i)); - } - if (usage > 0) { - double tmp(double(bloat)/usage); - worstBloat.emplace(tmp, FileId(i)); + if ( ! dueToBloat && _bucketizer) { + worst.emplace(fc->getBucketSpread(), FileId(i)); + } else if (dueToBloat && usage > 0) { + double tmp(double(fc->getDiskBloat())/usage); + worst.emplace(tmp, FileId(i)); } } } if (LOG_WOULD_LOG(debug)) { - for (const auto & it : worstBloat) { + for (const auto & it : worst) { const FileChunk & fc = *_fileChunks[it.second.getId()]; LOG(debug, "File '%s' has bloat '%2.2f' and bucket-spread '%1.4f numChunks=%d , numBuckets=%ld, numUniqueBuckets=%ld", fc.getName().c_str(), it.first * 100, fc.getBucketSpread(), fc.getNumChunks(), fc.getNumBuckets(), fc.getNumUniqueBuckets()); } } std::pair<bool, FileId> retval(false, FileId(-1)); - if ( ! worstBloat.empty() && (worstBloat.begin()->first > bloatLimit) && prioritizeDiskBloat) { - retval.first = true; - retval.second = worstBloat.begin()->second; - } else if ( ! worstSpread.empty() && (worstSpread.begin()->first > spreadLimit)) { + if ( ! worst.empty()) { retval.first = true; - retval.second = worstSpread.begin()->second; + retval.second = worst.begin()->second; } if (retval.first) { _currentlyCompacting.insert(_fileChunks[retval.second.getId()]->getNameId()); @@ -422,10 +382,24 @@ LogDataStore::findNextToCompact(double bloatLimit, double spreadLimit, bool prio } void -LogDataStore::compactWorst(double bloatLimit, double spreadLimit, bool prioritizeDiskBloat) { - auto worst = findNextToCompact(bloatLimit, spreadLimit, prioritizeDiskBloat); - if (worst.first) { - compactFile(worst.second); +LogDataStore::compactWorst(uint64_t syncToken, bool compactDiskBloat) { + uint64_t usage = getDiskFootprint(); + uint64_t bloat = getDiskBloat(); + const char * reason = compactDiskBloat ? "bloat" : "spread"; + LOG(debug, "%s", bloatMsg(bloat, usage).c_str()); + const bool doCompact = (_fileChunks.size() > 1); + if (doCompact) { + LOG(debug, "Will compact due to %s: %s", reason, bloatMsg(bloat, usage).c_str()); + auto worst = findNextToCompact(compactDiskBloat); + if (worst.first) { + compactFile(worst.second); + } + flushActiveAndWait(syncToken); + usage = getDiskFootprint(); + bloat = getDiskBloat(); + LOG(info, "Done compacting due to %s: %s", reason, bloatMsg(bloat, usage).c_str()); + } else { + flushActiveAndWait(syncToken); } } @@ -1001,7 +975,7 @@ LogDataStore::computeNumberOfSignificantBucketIdBits(const IBucketizer & bucketi while ((msb > 0) && (msbHistogram[msb - 1] == 0)) { msb--; } - LOG(info, "computeNumberOfSignificantBucketIdBits(file=%d) = %ld = %ld took %1.3f", fileId.getId(), msb, msbHistogram[msb-1], timer.min_time()); + LOG(debug, "computeNumberOfSignificantBucketIdBits(file=%d) = %ld = %ld took %1.3f", fileId.getId(), msb, msbHistogram[msb-1], timer.min_time()); return msb; } diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.h b/searchlib/src/vespa/searchlib/docstore/logdatastore.h index f43dc96fac9..62f87076759 100644 --- a/searchlib/src/vespa/searchlib/docstore/logdatastore.h +++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.h @@ -41,7 +41,6 @@ public: Config & setMaxFileSize(size_t v) { _maxFileSize = v; return *this; } Config & setMaxNumLids(size_t v) { _maxNumLids = v; return *this; } - Config & setMaxDiskBloatFactor(double v) { _maxDiskBloatFactor = v; return *this; } Config & setMaxBucketSpread(double v) { _maxBucketSpread = v; return *this; } Config & setMinFileSizeFactor(double v) { _minFileSizeFactor = v; return *this; } @@ -49,7 +48,6 @@ public: Config & setFileConfig(WriteableFileChunk::Config v) { _fileConfig = v; return *this; } size_t getMaxFileSize() const { return _maxFileSize; } - double getMaxDiskBloatFactor() const { return _maxDiskBloatFactor; } double getMaxBucketSpread() const { return _maxBucketSpread; } double getMinFileSizeFactor() const { return _minFileSizeFactor; } uint32_t getMaxNumLids() const { return _maxNumLids; } @@ -63,7 +61,6 @@ public: bool operator == (const Config &) const; private: size_t _maxFileSize; - double _maxDiskBloatFactor; double _maxBucketSpread; double _minFileSizeFactor; uint32_t _maxNumLids; @@ -109,9 +106,10 @@ public: size_t getDiskFootprint() const override; size_t getDiskHeaderFootprint() const override; size_t getDiskBloat() const override; - size_t getMaxCompactGain() const override; + size_t getMaxSpreadAsBloat() const override; - void compact(uint64_t syncToken); + void compactBloat(uint64_t syncToken) { compactWorst(syncToken, true); } + void compactSpread(uint64_t syncToken) { compactWorst(syncToken, false);} const Config & getConfig() const { return _config; } Config & getConfig() { return _config; } @@ -180,10 +178,9 @@ private: class WrapVisitorProgress; class FileChunkHolder; - // Implements ISetLid API void setLid(const ISetLid::unique_lock & guard, uint32_t lid, const LidInfo & lm) override; - void compactWorst(double bloatLimit, double spreadLimit, bool prioritizeDiskBloat); + void compactWorst(uint64_t syncToken, bool compactDiskBloat); void compactFile(FileId chunkId); typedef vespalib::RcuVector<uint64_t> LidInfoVector; @@ -199,8 +196,6 @@ private: NameIdSet eraseIncompleteCompactedFiles(NameIdSet partList); void internalFlushAll(); - bool isTotalDiskBloatExceeded(size_t diskFootPrint, size_t bloat) const; - NameIdSet scanDir(const vespalib::string &dir, const vespalib::string &suffix); FileId allocateFileId(const MonitorGuard & guard); void setNewFileChunk(const MonitorGuard & guard, FileChunk::UP fileChunk); @@ -245,7 +240,7 @@ private: return (_fileChunks.empty() ? 0 : _fileChunks.back()->getLastPersistedSerialNum()); } bool shouldCompactToActiveFile(size_t compactedSize) const; - std::pair<bool, FileId> findNextToCompact(double bloatLimit, double spreadLimit, bool prioritizeDiskBloat); + std::pair<bool, FileId> findNextToCompact(bool compactDiskBloat); void incGeneration(); bool canShrinkLidSpace(const MonitorGuard &guard) const; diff --git a/searchlib/src/vespa/searchlib/docstore/logdocumentstore.h b/searchlib/src/vespa/searchlib/docstore/logdocumentstore.h index de36155bedb..2931f8bce2d 100644 --- a/searchlib/src/vespa/searchlib/docstore/logdocumentstore.h +++ b/searchlib/src/vespa/searchlib/docstore/logdocumentstore.h @@ -51,7 +51,8 @@ public: ~LogDocumentStore() override; void reconfigure(const Config & config); private: - void compact(uint64_t syncToken) override { _backingStore.compact(syncToken); } + void compactBloat(uint64_t syncToken) override { _backingStore.compactBloat(syncToken); } + void compactSpread(uint64_t syncToken) override { _backingStore.compactSpread(syncToken); } LogDataStore _backingStore; }; |