diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-10-04 20:07:32 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2023-10-04 20:07:32 +0000 |
commit | 643d4162ee887dfe642d7d2e632ed36c2d36f3d3 (patch) | |
tree | 140eca04a5c8c3d8d7d4776277792f48fcbd50cf /searchlib | |
parent | 584223f47c5a8bb16a4c070a1fb3e3d69cb752d6 (diff) |
- Instead of keeping a map of bucketId => lids, just append everything to a vector and sort when complete.
- This significantly improves memory usage during compaction: instead of many heap allocations, you now get fewer mmapped allocations that are dropped when done.
Diffstat (limited to 'searchlib')
4 files changed, 36 insertions, 21 deletions
diff --git a/searchlib/src/tests/docstore/store_by_bucket/store_by_bucket_test.cpp b/searchlib/src/tests/docstore/store_by_bucket/store_by_bucket_test.cpp index 13aa1880e8c..c7e0c59da12 100644 --- a/searchlib/src/tests/docstore/store_by_bucket/store_by_bucket_test.cpp +++ b/searchlib/src/tests/docstore/store_by_bucket/store_by_bucket_test.cpp @@ -7,7 +7,6 @@ #include <vespa/searchlib/docstore/storebybucket.h> #include <vespa/vespalib/stllike/asciistream.h> #include <vespa/vespalib/stllike/hash_set.h> -#include <vespa/vespalib/util/size_literals.h> #include <vespa/vespalib/util/threadstackexecutor.h> #include <vespa/log/log.h> @@ -79,6 +78,7 @@ TEST("require that StoreByBucket gives bucket by bucket and ordered within") for (size_t i(1000); i > 500; i--) { add(sbb, i); } + sbb.close(); EXPECT_EQUAL(32u, sbb.getBucketCount()); EXPECT_EQUAL(1000u, sbb.getLidCount()); VerifyBucketOrder vbo; diff --git a/searchlib/src/vespa/searchlib/docstore/compacter.cpp b/searchlib/src/vespa/searchlib/docstore/compacter.cpp index c886e52659f..1f817be1f25 100644 --- a/searchlib/src/vespa/searchlib/docstore/compacter.cpp +++ b/searchlib/src/vespa/searchlib/docstore/compacter.cpp @@ -79,7 +79,8 @@ BucketCompacter::close() size_t lidCount1(0); size_t bucketCount(0); size_t chunkCount(0); - for (const StoreByBucket & store : _tmpStore) { + for (StoreByBucket & store : _tmpStore) { + store.close(); lidCount1 += store.getLidCount(); bucketCount += store.getBucketCount(); chunkCount += store.getChunkCount(); diff --git a/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp b/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp index 6d3c39a51dc..14beccaac9a 100644 --- a/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp +++ b/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp @@ -42,9 +42,8 @@ StoreByBucket::add(BucketId bucketId, uint32_t chunkId, uint32_t lid, const void }); _executor.execute(CpuUsage::wrap(std::move(task), CpuUsage::Category::COMPACT)); } - Index 
idx(bucketId, _current->getId(), chunkId, lid); _current->append(lid, buffer, sz); - _where[bucketId.toKey()].push_back(idx); + _where.emplace_back(bucketId, _current->getId(), chunkId, lid); } Chunk::UP @@ -88,14 +87,34 @@ StoreByBucket::waitAllProcessed() { } void -StoreByBucket::drain(IWrite & drainer) -{ +StoreByBucket::close() { incChunksPosted(); auto task = makeLambdaTask([this, chunk=std::move(_current)]() mutable { closeChunk(std::move(chunk)); }); _executor.execute(CpuUsage::wrap(std::move(task), CpuUsage::Category::COMPACT)); waitAllProcessed(); + std::sort(_where.begin(), _where.end()); +} + +size_t +StoreByBucket::getBucketCount() const { + if (_where.empty()) return 0; + + size_t count = 0; + BucketId prev = _where.front()._bucketId; + for (const auto & lid : _where) { + if (lid._bucketId != prev) { + count++; + prev = lid._bucketId; + } + } + return count + 1; +} + +void +StoreByBucket::drain(IWrite & drainer) +{ std::vector<Chunk::UP> chunks; chunks.resize(_chunks.size()); for (const auto & it : _chunks) { @@ -103,12 +122,9 @@ StoreByBucket::drain(IWrite & drainer) chunks[it.first] = std::make_unique<Chunk>(it.first, buf.data(), buf.size()); } _chunks.clear(); - for (auto & it : _where) { - std::sort(it.second.begin(), it.second.end()); - for (Index idx : it.second) { - vespalib::ConstBufferRef data(chunks[idx._id]->getLid(idx._lid)); - drainer.write(idx._bucketId, idx._chunkId, idx._lid, data.c_str(), data.size()); - } + for (auto & idx : _where) { + vespalib::ConstBufferRef data(chunks[idx._id]->getLid(idx._lid)); + drainer.write(idx._bucketId, idx._chunkId, idx._lid, data.c_str(), data.size()); } } diff --git a/searchlib/src/vespa/searchlib/docstore/storebybucket.h b/searchlib/src/vespa/searchlib/docstore/storebybucket.h index dfe6199aa2e..b0930d4be39 100644 --- a/searchlib/src/vespa/searchlib/docstore/storebybucket.h +++ b/searchlib/src/vespa/searchlib/docstore/storebybucket.h @@ -7,7 +7,6 @@ #include <vespa/vespalib/data/memorydatastore.h> 
#include <vespa/vespalib/util/executor.h> #include <vespa/vespalib/stllike/hash_map.h> -#include <map> #include <condition_variable> namespace search::docstore { @@ -34,19 +33,18 @@ public: class IWrite { public: using BucketId=document::BucketId; - virtual ~IWrite() { } + virtual ~IWrite() = default; virtual void write(BucketId bucketId, uint32_t chunkId, uint32_t lid, const void *buffer, size_t sz) = 0; }; void add(document::BucketId bucketId, uint32_t chunkId, uint32_t lid, const void *buffer, size_t sz); + void close(); + /// close() must have been called prior to calling getBucketCount() or drain() void drain(IWrite & drain); + size_t getBucketCount() const; + size_t getChunkCount() const; - size_t getBucketCount() const { return _where.size(); } size_t getLidCount() const { - size_t lidCount(0); - for (const auto & it : _where) { - lidCount += it.second.size(); - } - return lidCount; + return _where.size(); } private: void incChunksPosted(); @@ -69,7 +67,7 @@ private: using IndexVector = std::vector<Index, vespalib::allocator_large<Index>>; uint64_t _chunkSerial; Chunk::UP _current; - std::map<uint64_t, IndexVector> _where; + IndexVector _where; MemoryDataStore & _backingMemory; Executor & _executor; std::unique_ptr<std::mutex> _lock; |