aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2021-12-08 15:08:45 +0100
committerGitHub <noreply@github.com>2021-12-08 15:08:45 +0100
commitc879b4cffbf447f235ec868fde6b44ba3f3e0dc0 (patch)
tree140e18aec39bfbe0400b073806ee870b8ec32eed /searchlib
parent115fcf8a11bf3332e8cdd90a81dc6ccca728886f (diff)
parent65c857688005ac042809d034a27c9eab586eb4d3 (diff)
Merge pull request #20412 from vespa-engine/balder/split-bloat-and-spread
Separate spread and bloat
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/docstore/document_store/document_store_test.cpp2
-rw-r--r--searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp15
-rw-r--r--searchlib/src/vespa/searchlib/docstore/documentstore.cpp10
-rw-r--r--searchlib/src/vespa/searchlib/docstore/documentstore.h5
-rw-r--r--searchlib/src/vespa/searchlib/docstore/idatastore.h2
-rw-r--r--searchlib/src/vespa/searchlib/docstore/idocumentstore.h12
-rw-r--r--searchlib/src/vespa/searchlib/docstore/logdatastore.cpp92
-rw-r--r--searchlib/src/vespa/searchlib/docstore/logdatastore.h15
-rw-r--r--searchlib/src/vespa/searchlib/docstore/logdocumentstore.h3
9 files changed, 66 insertions, 90 deletions
diff --git a/searchlib/src/tests/docstore/document_store/document_store_test.cpp b/searchlib/src/tests/docstore/document_store/document_store_test.cpp
index 1a6b0a5a1c6..f2bec30a349 100644
--- a/searchlib/src/tests/docstore/document_store/document_store_test.cpp
+++ b/searchlib/src/tests/docstore/document_store/document_store_test.cpp
@@ -25,7 +25,7 @@ struct NullDataStore : IDataStore {
size_t memoryMeta() const override { return 0; }
size_t getDiskFootprint() const override { return 0; }
size_t getDiskBloat() const override { return 0; }
- size_t getMaxCompactGain() const override { return 0; }
+ size_t getMaxSpreadAsBloat() const override { return 0; }
uint64_t lastSyncToken() const override { return 0; }
uint64_t tentativeLastSyncToken() const override { return 0; }
vespalib::system_time getLastFlushTime() const override { return vespalib::system_time(); }
diff --git a/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp b/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp
index 07652dfd336..378babb6ee1 100644
--- a/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp
+++ b/searchlib/src/tests/docstore/logdatastore/logdatastore_test.cpp
@@ -236,7 +236,7 @@ void verifyGrowing(const LogDataStore::Config & config, uint32_t minFiles, uint3
datastore.remove(i + 20000, i);
}
datastore.flush(datastore.initFlush(lastSyncToken));
- datastore.compact(30000);
+ datastore.compactBloat(30000);
datastore.remove(31000, 0);
checkStats(datastore, 31000, 30000);
EXPECT_LESS_EQUAL(minFiles, datastore.getAllActiveFiles().size());
@@ -252,7 +252,7 @@ void verifyGrowing(const LogDataStore::Config & config, uint32_t minFiles, uint3
}
TEST("testGrowingChunkedBySize") {
LogDataStore::Config config;
- config.setMaxFileSize(100000).setMaxDiskBloatFactor(0.1).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2)
+ config.setMaxFileSize(100000).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2)
.compactCompression({CompressionConfig::LZ4})
.setFileConfig({{CompressionConfig::LZ4, 9, 60}, 1000});
verifyGrowing(config, 40, 120);
@@ -260,7 +260,7 @@ TEST("testGrowingChunkedBySize") {
TEST("testGrowingChunkedByNumLids") {
LogDataStore::Config config;
- config.setMaxNumLids(1000).setMaxDiskBloatFactor(0.1).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2)
+ config.setMaxNumLids(1000).setMaxBucketSpread(3.0).setMinFileSizeFactor(0.2)
.compactCompression({CompressionConfig::LZ4})
.setFileConfig({{CompressionConfig::LZ4, 9, 60}, 1000});
verifyGrowing(config,10, 10);
@@ -679,7 +679,7 @@ TEST("testWriteRead") {
EXPECT_LESS(0u, headerFootprint);
EXPECT_EQUAL(datastore.getDiskFootprint(), headerFootprint);
EXPECT_EQUAL(datastore.getDiskBloat(), 0ul);
- EXPECT_EQUAL(datastore.getMaxCompactGain(), 0ul);
+ EXPECT_EQUAL(datastore.getMaxSpreadAsBloat(), 0ul);
datastore.write(1, 0, a[0].c_str(), a[0].size());
fetchAndTest(datastore, 0, a[0].c_str(), a[0].size());
datastore.write(2, 0, a[1].c_str(), a[1].size());
@@ -701,7 +701,7 @@ TEST("testWriteRead") {
EXPECT_EQUAL(datastore.getDiskFootprint(),
2711ul + headerFootprint);
EXPECT_EQUAL(datastore.getDiskBloat(), 0ul);
- EXPECT_EQUAL(datastore.getMaxCompactGain(), 0ul);
+ EXPECT_EQUAL(datastore.getMaxSpreadAsBloat(), 0ul);
datastore.flush(datastore.initFlush(lastSyncToken));
}
{
@@ -715,7 +715,7 @@ TEST("testWriteRead") {
EXPECT_LESS(0u, headerFootprint);
EXPECT_EQUAL(4944ul + headerFootprint, datastore.getDiskFootprint());
EXPECT_EQUAL(0ul, datastore.getDiskBloat());
- EXPECT_EQUAL(0ul, datastore.getMaxCompactGain());
+ EXPECT_EQUAL(0ul, datastore.getMaxSpreadAsBloat());
for(size_t i=0; i < 100; i++) {
fetchAndTest(datastore, i, a[i%2].c_str(), a[i%2].size());
@@ -730,7 +730,7 @@ TEST("testWriteRead") {
EXPECT_EQUAL(7594ul + headerFootprint, datastore.getDiskFootprint());
EXPECT_EQUAL(0ul, datastore.getDiskBloat());
- EXPECT_EQUAL(0ul, datastore.getMaxCompactGain());
+ EXPECT_EQUAL(0ul, datastore.getMaxSpreadAsBloat());
}
FastOS_File::EmptyAndRemoveDirectory("empty");
}
@@ -1050,7 +1050,6 @@ TEST("require that config equality operator detects inequality") {
using C = LogDataStore::Config;
EXPECT_TRUE(C() == C());
EXPECT_FALSE(C() == C().setMaxFileSize(1));
- EXPECT_FALSE(C() == C().setMaxDiskBloatFactor(0.3));
EXPECT_FALSE(C() == C().setMaxBucketSpread(0.3));
EXPECT_FALSE(C() == C().setMinFileSizeFactor(0.3));
EXPECT_FALSE(C() == C().setFileConfig(WriteableFileChunk::Config({}, 70)));
diff --git a/searchlib/src/vespa/searchlib/docstore/documentstore.cpp b/searchlib/src/vespa/searchlib/docstore/documentstore.cpp
index 7aaee7180df..b4ff050c0f6 100644
--- a/searchlib/src/vespa/searchlib/docstore/documentstore.cpp
+++ b/searchlib/src/vespa/searchlib/docstore/documentstore.cpp
@@ -112,7 +112,6 @@ public:
}
-using VisitCache = docstore::VisitCache;
using docstore::Value;
bool
@@ -239,7 +238,14 @@ DocumentStore::remove(uint64_t syncToken, DocumentIdT lid)
}
void
-DocumentStore::compact(uint64_t syncToken)
+DocumentStore::compactBloat(uint64_t syncToken)
+{
+ (void) syncToken;
+ // Most implementations does not offer compact.
+}
+
+void
+DocumentStore::compactSpread(uint64_t syncToken)
{
(void) syncToken;
// Most implementations does not offer compact.
diff --git a/searchlib/src/vespa/searchlib/docstore/documentstore.h b/searchlib/src/vespa/searchlib/docstore/documentstore.h
index b6021d34bef..6402c16cd5e 100644
--- a/searchlib/src/vespa/searchlib/docstore/documentstore.h
+++ b/searchlib/src/vespa/searchlib/docstore/documentstore.h
@@ -72,7 +72,8 @@ public:
void remove(uint64_t syncToken, DocumentIdT lid) override;
void flush(uint64_t syncToken) override;
uint64_t initFlush(uint64_t synctoken) override;
- void compact(uint64_t syncToken) override;
+ void compactBloat(uint64_t syncToken) override;
+ void compactSpread(uint64_t syncToken) override;
uint64_t lastSyncToken() const override;
uint64_t tentativeLastSyncToken() const override;
vespalib::system_time getLastFlushTime() const override;
@@ -80,7 +81,7 @@ public:
size_t memoryUsed() const override { return _backingStore.memoryUsed(); }
size_t getDiskFootprint() const override { return _backingStore.getDiskFootprint(); }
size_t getDiskBloat() const override { return _backingStore.getDiskBloat(); }
- size_t getMaxCompactGain() const override { return _backingStore.getMaxCompactGain(); }
+ size_t getMaxSpreadAsBloat() const override { return _backingStore.getMaxSpreadAsBloat(); }
CacheStats getCacheStats() const override;
size_t memoryMeta() const override { return _backingStore.memoryMeta(); }
const vespalib::string & getBaseDir() const override { return _backingStore.getBaseDir(); }
diff --git a/searchlib/src/vespa/searchlib/docstore/idatastore.h b/searchlib/src/vespa/searchlib/docstore/idatastore.h
index 82656ad7e69..fc0eae1d15e 100644
--- a/searchlib/src/vespa/searchlib/docstore/idatastore.h
+++ b/searchlib/src/vespa/searchlib/docstore/idatastore.h
@@ -121,7 +121,7 @@ public:
* to avoid misuse we let the report a more conservative number here if necessary.
* @return diskspace to be gained.
*/
- virtual size_t getMaxCompactGain() const = 0;
+ virtual size_t getMaxSpreadAsBloat() const = 0;
/**
diff --git a/searchlib/src/vespa/searchlib/docstore/idocumentstore.h b/searchlib/src/vespa/searchlib/docstore/idocumentstore.h
index 0e73e4d7993..d84a5ad7e7e 100644
--- a/searchlib/src/vespa/searchlib/docstore/idocumentstore.h
+++ b/searchlib/src/vespa/searchlib/docstore/idocumentstore.h
@@ -100,7 +100,8 @@ public:
/**
* If possible compact the disk.
**/
- virtual void compact(uint64_t syncToken) = 0;
+ virtual void compactBloat(uint64_t syncToken) = 0;
+ virtual void compactSpread(uint64_t syncToken) = 0;
/**
* The sync token used for the last successful flush() operation,
@@ -153,12 +154,11 @@ public:
virtual size_t getDiskBloat() const = 0;
/**
- * Calculates how much diskspace can be compacted during a flush.
- * default is to return th ebloat limit, but as some targets have some internal limits
- * to avoid misuse we let the report a more conservative number here if necessary.
- * @return diskspace to be gained.
+ * Calculates the gain from keeping buckets close. It is converted to diskbloat
+ * so it can be prioritized accordingly.
+ * @return spread as disk bloat.
*/
- virtual size_t getMaxCompactGain() const = 0;
+ virtual size_t getMaxSpreadAsBloat() const = 0;
/**
* Returns statistics about the cache.
diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp
index fd25dd56235..6a9ae40cc93 100644
--- a/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp
+++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.cpp
@@ -36,7 +36,6 @@ using namespace std::literals;
LogDataStore::Config::Config()
: _maxFileSize(DEFAULT_MAX_FILESIZE),
- _maxDiskBloatFactor(0.2),
_maxBucketSpread(2.5),
_minFileSizeFactor(0.2),
_maxNumLids(DEFAULT_MAX_LIDS_PER_FILE),
@@ -48,7 +47,6 @@ LogDataStore::Config::Config()
bool
LogDataStore::Config::operator == (const Config & rhs) const {
return (_maxBucketSpread == rhs._maxBucketSpread) &&
- (_maxDiskBloatFactor == rhs._maxDiskBloatFactor) &&
(_maxFileSize == rhs._maxFileSize) &&
(_minFileSizeFactor == rhs._minFileSizeFactor) &&
(_skipCrcOnRead == rhs._skipCrcOnRead) &&
@@ -294,46 +292,14 @@ vespalib::string bloatMsg(size_t bloat, size_t usage) {
}
-void
-LogDataStore::compact(uint64_t syncToken)
-{
- uint64_t usage = getDiskFootprint();
- uint64_t bloat = getDiskBloat();
- LOG(debug, "%s", bloatMsg(bloat, usage).c_str());
- const bool doCompact = (_fileChunks.size() > 1);
- if (doCompact) {
- LOG(info, "%s. Will compact", bloatMsg(bloat, usage).c_str());
- compactWorst(_config.getMaxDiskBloatFactor(), _config.getMaxBucketSpread(), isTotalDiskBloatExceeded(usage, bloat));
- }
- flushActiveAndWait(syncToken);
- if (doCompact) {
- usage = getDiskFootprint();
- bloat = getDiskBloat();
- LOG(info, "Done compacting. %s", bloatMsg(bloat, usage).c_str());
- }
-}
-
-bool
-LogDataStore::isTotalDiskBloatExceeded(size_t diskFootPrint, size_t bloat) const {
- const size_t maxConfiguredDiskBloat = diskFootPrint * _config.getMaxDiskBloatFactor();
- return bloat > maxConfiguredDiskBloat;
-}
-
size_t
-LogDataStore::getMaxCompactGain() const
+LogDataStore::getMaxSpreadAsBloat() const
{
- size_t bloat = getDiskBloat();
const size_t diskFootPrint = getDiskFootprint();
- if ( ! isTotalDiskBloatExceeded(diskFootPrint, bloat) ) {
- bloat = 0;
- }
-
const double maxSpread = getMaxBucketSpread();
- size_t spreadAsBloat = diskFootPrint * (1.0 - 1.0/maxSpread);
- if ( maxSpread < _config.getMaxBucketSpread()) {
- spreadAsBloat = 0;
- }
- return (bloat + spreadAsBloat);
+ return (maxSpread > _config.getMaxBucketSpread())
+ ? diskFootPrint * (1.0 - 1.0/maxSpread)
+ : 0;
}
void
@@ -380,40 +346,34 @@ LogDataStore::getMaxBucketSpread() const
}
std::pair<bool, LogDataStore::FileId>
-LogDataStore::findNextToCompact(double bloatLimit, double spreadLimit, bool prioritizeDiskBloat)
+LogDataStore::findNextToCompact(bool dueToBloat)
{
typedef std::multimap<double, FileId, std::greater<double>> CostMap;
- CostMap worstBloat;
- CostMap worstSpread;
+ CostMap worst;
MonitorGuard guard(_updateLock);
for (size_t i(0); i < _fileChunks.size(); i++) {
const auto & fc(_fileChunks[i]);
if (fc && fc->frozen() && (_currentlyCompacting.find(fc->getNameId()) == _currentlyCompacting.end())) {
uint64_t usage = fc->getDiskFootprint();
- uint64_t bloat = fc->getDiskBloat();
- if (_bucketizer) {
- worstSpread.emplace(fc->getBucketSpread(), FileId(i));
- }
- if (usage > 0) {
- double tmp(double(bloat)/usage);
- worstBloat.emplace(tmp, FileId(i));
+ if ( ! dueToBloat && _bucketizer) {
+ worst.emplace(fc->getBucketSpread(), FileId(i));
+ } else if (dueToBloat && usage > 0) {
+ double tmp(double(fc->getDiskBloat())/usage);
+ worst.emplace(tmp, FileId(i));
}
}
}
if (LOG_WOULD_LOG(debug)) {
- for (const auto & it : worstBloat) {
+ for (const auto & it : worst) {
const FileChunk & fc = *_fileChunks[it.second.getId()];
LOG(debug, "File '%s' has bloat '%2.2f' and bucket-spread '%1.4f numChunks=%d , numBuckets=%ld, numUniqueBuckets=%ld",
fc.getName().c_str(), it.first * 100, fc.getBucketSpread(), fc.getNumChunks(), fc.getNumBuckets(), fc.getNumUniqueBuckets());
}
}
std::pair<bool, FileId> retval(false, FileId(-1));
- if ( ! worstBloat.empty() && (worstBloat.begin()->first > bloatLimit) && prioritizeDiskBloat) {
- retval.first = true;
- retval.second = worstBloat.begin()->second;
- } else if ( ! worstSpread.empty() && (worstSpread.begin()->first > spreadLimit)) {
+ if ( ! worst.empty()) {
retval.first = true;
- retval.second = worstSpread.begin()->second;
+ retval.second = worst.begin()->second;
}
if (retval.first) {
_currentlyCompacting.insert(_fileChunks[retval.second.getId()]->getNameId());
@@ -422,10 +382,24 @@ LogDataStore::findNextToCompact(double bloatLimit, double spreadLimit, bool prio
}
void
-LogDataStore::compactWorst(double bloatLimit, double spreadLimit, bool prioritizeDiskBloat) {
- auto worst = findNextToCompact(bloatLimit, spreadLimit, prioritizeDiskBloat);
- if (worst.first) {
- compactFile(worst.second);
+LogDataStore::compactWorst(uint64_t syncToken, bool compactDiskBloat) {
+ uint64_t usage = getDiskFootprint();
+ uint64_t bloat = getDiskBloat();
+ const char * reason = compactDiskBloat ? "bloat" : "spread";
+ LOG(debug, "%s", bloatMsg(bloat, usage).c_str());
+ const bool doCompact = (_fileChunks.size() > 1);
+ if (doCompact) {
+ LOG(debug, "Will compact due to %s: %s", reason, bloatMsg(bloat, usage).c_str());
+ auto worst = findNextToCompact(compactDiskBloat);
+ if (worst.first) {
+ compactFile(worst.second);
+ }
+ flushActiveAndWait(syncToken);
+ usage = getDiskFootprint();
+ bloat = getDiskBloat();
+ LOG(info, "Done compacting due to %s: %s", reason, bloatMsg(bloat, usage).c_str());
+ } else {
+ flushActiveAndWait(syncToken);
}
}
@@ -1001,7 +975,7 @@ LogDataStore::computeNumberOfSignificantBucketIdBits(const IBucketizer & bucketi
while ((msb > 0) && (msbHistogram[msb - 1] == 0)) {
msb--;
}
- LOG(info, "computeNumberOfSignificantBucketIdBits(file=%d) = %ld = %ld took %1.3f", fileId.getId(), msb, msbHistogram[msb-1], timer.min_time());
+ LOG(debug, "computeNumberOfSignificantBucketIdBits(file=%d) = %ld = %ld took %1.3f", fileId.getId(), msb, msbHistogram[msb-1], timer.min_time());
return msb;
}
diff --git a/searchlib/src/vespa/searchlib/docstore/logdatastore.h b/searchlib/src/vespa/searchlib/docstore/logdatastore.h
index f43dc96fac9..62f87076759 100644
--- a/searchlib/src/vespa/searchlib/docstore/logdatastore.h
+++ b/searchlib/src/vespa/searchlib/docstore/logdatastore.h
@@ -41,7 +41,6 @@ public:
Config & setMaxFileSize(size_t v) { _maxFileSize = v; return *this; }
Config & setMaxNumLids(size_t v) { _maxNumLids = v; return *this; }
- Config & setMaxDiskBloatFactor(double v) { _maxDiskBloatFactor = v; return *this; }
Config & setMaxBucketSpread(double v) { _maxBucketSpread = v; return *this; }
Config & setMinFileSizeFactor(double v) { _minFileSizeFactor = v; return *this; }
@@ -49,7 +48,6 @@ public:
Config & setFileConfig(WriteableFileChunk::Config v) { _fileConfig = v; return *this; }
size_t getMaxFileSize() const { return _maxFileSize; }
- double getMaxDiskBloatFactor() const { return _maxDiskBloatFactor; }
double getMaxBucketSpread() const { return _maxBucketSpread; }
double getMinFileSizeFactor() const { return _minFileSizeFactor; }
uint32_t getMaxNumLids() const { return _maxNumLids; }
@@ -63,7 +61,6 @@ public:
bool operator == (const Config &) const;
private:
size_t _maxFileSize;
- double _maxDiskBloatFactor;
double _maxBucketSpread;
double _minFileSizeFactor;
uint32_t _maxNumLids;
@@ -109,9 +106,10 @@ public:
size_t getDiskFootprint() const override;
size_t getDiskHeaderFootprint() const override;
size_t getDiskBloat() const override;
- size_t getMaxCompactGain() const override;
+ size_t getMaxSpreadAsBloat() const override;
- void compact(uint64_t syncToken);
+ void compactBloat(uint64_t syncToken) { compactWorst(syncToken, true); }
+ void compactSpread(uint64_t syncToken) { compactWorst(syncToken, false);}
const Config & getConfig() const { return _config; }
Config & getConfig() { return _config; }
@@ -180,10 +178,9 @@ private:
class WrapVisitorProgress;
class FileChunkHolder;
- // Implements ISetLid API
void setLid(const ISetLid::unique_lock & guard, uint32_t lid, const LidInfo & lm) override;
- void compactWorst(double bloatLimit, double spreadLimit, bool prioritizeDiskBloat);
+ void compactWorst(uint64_t syncToken, bool compactDiskBloat);
void compactFile(FileId chunkId);
typedef vespalib::RcuVector<uint64_t> LidInfoVector;
@@ -199,8 +196,6 @@ private:
NameIdSet eraseIncompleteCompactedFiles(NameIdSet partList);
void internalFlushAll();
- bool isTotalDiskBloatExceeded(size_t diskFootPrint, size_t bloat) const;
-
NameIdSet scanDir(const vespalib::string &dir, const vespalib::string &suffix);
FileId allocateFileId(const MonitorGuard & guard);
void setNewFileChunk(const MonitorGuard & guard, FileChunk::UP fileChunk);
@@ -245,7 +240,7 @@ private:
return (_fileChunks.empty() ? 0 : _fileChunks.back()->getLastPersistedSerialNum());
}
bool shouldCompactToActiveFile(size_t compactedSize) const;
- std::pair<bool, FileId> findNextToCompact(double bloatLimit, double spreadLimit, bool prioritizeDiskBloat);
+ std::pair<bool, FileId> findNextToCompact(bool compactDiskBloat);
void incGeneration();
bool canShrinkLidSpace(const MonitorGuard &guard) const;
diff --git a/searchlib/src/vespa/searchlib/docstore/logdocumentstore.h b/searchlib/src/vespa/searchlib/docstore/logdocumentstore.h
index de36155bedb..2931f8bce2d 100644
--- a/searchlib/src/vespa/searchlib/docstore/logdocumentstore.h
+++ b/searchlib/src/vespa/searchlib/docstore/logdocumentstore.h
@@ -51,7 +51,8 @@ public:
~LogDocumentStore() override;
void reconfigure(const Config & config);
private:
- void compact(uint64_t syncToken) override { _backingStore.compact(syncToken); }
+ void compactBloat(uint64_t syncToken) override { _backingStore.compactBloat(syncToken); }
+ void compactSpread(uint64_t syncToken) override { _backingStore.compactSpread(syncToken); }
LogDataStore _backingStore;
};