diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2023-10-04 20:07:32 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2023-10-04 20:07:32 +0000 |
commit | 643d4162ee887dfe642d7d2e632ed36c2d36f3d3 (patch) | |
tree | 140eca04a5c8c3d8d7d4776277792f48fcbd50cf /searchlib | |
parent | 584223f47c5a8bb16a4c070a1fb3e3d69cb752d6 (diff) |
- Instead of keeping a map of bucketId => lids, just append everything to a vector and sort when complete.
- This significantly improves memory usage during compaction: instead of many heap allocations, you now get fewer mmapped allocations that are dropped when done.
Diffstat (limited to 'searchlib')
4 files changed, 36 insertions, 21 deletions
diff --git a/searchlib/src/tests/docstore/store_by_bucket/store_by_bucket_test.cpp b/searchlib/src/tests/docstore/store_by_bucket/store_by_bucket_test.cpp index 13aa1880e8c..c7e0c59da12 100644 --- a/searchlib/src/tests/docstore/store_by_bucket/store_by_bucket_test.cpp +++ b/searchlib/src/tests/docstore/store_by_bucket/store_by_bucket_test.cpp @@ -7,7 +7,6 @@ #include <vespa/searchlib/docstore/storebybucket.h> #include <vespa/vespalib/stllike/asciistream.h> #include <vespa/vespalib/stllike/hash_set.h> -#include <vespa/vespalib/util/size_literals.h> #include <vespa/vespalib/util/threadstackexecutor.h> #include <vespa/log/log.h> @@ -79,6 +78,7 @@ TEST("require that StoreByBucket gives bucket by bucket and ordered within") for (size_t i(1000); i > 500; i--) { add(sbb, i); } + sbb.close(); EXPECT_EQUAL(32u, sbb.getBucketCount()); EXPECT_EQUAL(1000u, sbb.getLidCount()); VerifyBucketOrder vbo; diff --git a/searchlib/src/vespa/searchlib/docstore/compacter.cpp b/searchlib/src/vespa/searchlib/docstore/compacter.cpp index c886e52659f..1f817be1f25 100644 --- a/searchlib/src/vespa/searchlib/docstore/compacter.cpp +++ b/searchlib/src/vespa/searchlib/docstore/compacter.cpp @@ -79,7 +79,8 @@ BucketCompacter::close() size_t lidCount1(0); size_t bucketCount(0); size_t chunkCount(0); - for (const StoreByBucket & store : _tmpStore) { + for (StoreByBucket & store : _tmpStore) { + store.close(); lidCount1 += store.getLidCount(); bucketCount += store.getBucketCount(); chunkCount += store.getChunkCount(); diff --git a/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp b/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp index 6d3c39a51dc..14beccaac9a 100644 --- a/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp +++ b/searchlib/src/vespa/searchlib/docstore/storebybucket.cpp @@ -42,9 +42,8 @@ StoreByBucket::add(BucketId bucketId, uint32_t chunkId, uint32_t lid, const void }); _executor.execute(CpuUsage::wrap(std::move(task), CpuUsage::Category::COMPACT)); } - Index 
idx(bucketId, _current->getId(), chunkId, lid); _current->append(lid, buffer, sz); - _where[bucketId.toKey()].push_back(idx); + _where.emplace_back(bucketId, _current->getId(), chunkId, lid); } Chunk::UP @@ -88,14 +87,34 @@ StoreByBucket::waitAllProcessed() { } void -StoreByBucket::drain(IWrite & drainer) -{ +StoreByBucket::close() { incChunksPosted(); auto task = makeLambdaTask([this, chunk=std::move(_current)]() mutable { closeChunk(std::move(chunk)); }); _executor.execute(CpuUsage::wrap(std::move(task), CpuUsage::Category::COMPACT)); waitAllProcessed(); + std::sort(_where.begin(), _where.end()); +} + +size_t +StoreByBucket::getBucketCount() const { + if (_where.empty()) return 0; + + size_t count = 0; + BucketId prev = _where.front()._bucketId; + for (const auto & lid : _where) { + if (lid._bucketId != prev) { + count++; + prev = lid._bucketId; + } + } + return count + 1; +} + +void +StoreByBucket::drain(IWrite & drainer) +{ std::vector<Chunk::UP> chunks; chunks.resize(_chunks.size()); for (const auto & it : _chunks) { @@ -103,12 +122,9 @@ StoreByBucket::drain(IWrite & drainer) chunks[it.first] = std::make_unique<Chunk>(it.first, buf.data(), buf.size()); } _chunks.clear(); - for (auto & it : _where) { - std::sort(it.second.begin(), it.second.end()); - for (Index idx : it.second) { - vespalib::ConstBufferRef data(chunks[idx._id]->getLid(idx._lid)); - drainer.write(idx._bucketId, idx._chunkId, idx._lid, data.c_str(), data.size()); - } + for (auto & idx : _where) { + vespalib::ConstBufferRef data(chunks[idx._id]->getLid(idx._lid)); + drainer.write(idx._bucketId, idx._chunkId, idx._lid, data.c_str(), data.size()); } } diff --git a/searchlib/src/vespa/searchlib/docstore/storebybucket.h b/searchlib/src/vespa/searchlib/docstore/storebybucket.h index dfe6199aa2e..b0930d4be39 100644 --- a/searchlib/src/vespa/searchlib/docstore/storebybucket.h +++ b/searchlib/src/vespa/searchlib/docstore/storebybucket.h @@ -7,7 +7,6 @@ #include <vespa/vespalib/data/memorydatastore.h> 
#include <vespa/vespalib/util/executor.h> #include <vespa/vespalib/stllike/hash_map.h> -#include <map> #include <condition_variable> namespace search::docstore { @@ -34,19 +33,18 @@ public: class IWrite { public: using BucketId=document::BucketId; - virtual ~IWrite() { } + virtual ~IWrite() = default; virtual void write(BucketId bucketId, uint32_t chunkId, uint32_t lid, const void *buffer, size_t sz) = 0; }; void add(document::BucketId bucketId, uint32_t chunkId, uint32_t lid, const void *buffer, size_t sz); + void close(); + /// close() must have been called prior to calling getBucketCount() or drain() void drain(IWrite & drain); + size_t getBucketCount() const; + size_t getChunkCount() const; - size_t getBucketCount() const { return _where.size(); } size_t getLidCount() const { - size_t lidCount(0); - for (const auto & it : _where) { - lidCount += it.second.size(); - } - return lidCount; + return _where.size(); } private: void incChunksPosted(); @@ -69,7 +67,7 @@ private: using IndexVector = std::vector<Index, vespalib::allocator_large<Index>>; uint64_t _chunkSerial; Chunk::UP _current; - std::map<uint64_t, IndexVector> _where; + IndexVector _where; MemoryDataStore & _backingMemory; Executor & _executor; std::unique_ptr<std::mutex> _lock; |