Add class to track mapping from docid to [nodeid] used in hnsw index.

This class is to be used when supporting multiple vectors per document.
author: Geir Storli <geirst@yahooinc.com> 2022-11-14 14:05:27 +0000
committer: Geir Storli <geirst@yahooinc.com> 2022-11-14 14:07:02 +0000
commit: f89b667e4e496cd45b368d69eaae9f55524638e3 (patch)
tree: a98e4e01dfa72aae93ccf32c91875991bdf794d6
parent: 07ba692005ba4ab27d943c66f4d4ff1b36b86832 (diff)
10 files changed, 314 insertions, 16 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index 20df8cd3683..ae6469ae07a 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -221,6 +221,7 @@ vespa_define_module(
     src/tests/tensor/direct_tensor_store
     src/tests/tensor/distance_functions
     src/tests/tensor/hnsw_index
+    src/tests/tensor/hnsw_nodeid_mapping
     src/tests/tensor/hnsw_saver
     src/tests/tensor/tensor_buffer_operations
     src/tests/tensor/tensor_buffer_store
diff --git a/searchlib/src/tests/tensor/hnsw_nodeid_mapping/CMakeLists.txt b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/CMakeLists.txt
new file mode 100644
index 00000000000..7964bcd45a1
--- /dev/null
+++ b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_hnsw_nodeid_mapping_test_app TEST
+    SOURCES
+    hnsw_nodeid_mapping_test.cpp
+    DEPENDS
+    searchlib
+    GTest::GTest
+)
+vespa_add_test(NAME searchlib_hnsw_nodeid_mapping_test_app COMMAND searchlib_hnsw_nodeid_mapping_test_app)
+
diff --git a/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp
new file mode 100644
index 00000000000..a3e3112eaf4
--- /dev/null
+++ b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp
@@ -0,0 +1,91 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchlib/tensor/hnsw_nodeid_mapping.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace search::tensor;
+
+class HnswNodeidMappingTest : public ::testing::Test {
+public:
+    using NodeidVector = std::vector<uint32_t>;
+    HnswNodeidMapping mapping;
+
+    HnswNodeidMappingTest()
+        : mapping()
+    {
+        mapping.assign_generation(10);
+    }
+    void expect_allocate_get(const NodeidVector& exp_ids, uint32_t docid) {
+        auto ids = mapping.allocate_ids(docid, exp_ids.size());
+        EXPECT_EQ(exp_ids, NodeidVector(ids.begin(), ids.end()));
+        expect_get(exp_ids, docid);
+    }
+
+    void expect_get(const NodeidVector& exp_ids, uint32_t docid) {
+        auto ids = mapping.get_ids(docid);
+        EXPECT_EQ(exp_ids, NodeidVector(ids.begin(), ids.end()));
+    }
+};
+
+
+TEST_F(HnswNodeidMappingTest, allocate_and_get_nodeids)
+{
+    expect_allocate_get({}, 1);
+    expect_allocate_get({1}, 30);
+    expect_allocate_get({2, 3, 4}, 40);
+    expect_allocate_get({5, 6}, 50);
+    // Note that docid=2 has implicit no nodeids:
+    expect_get({}, 2);
+}
+
+TEST_F(HnswNodeidMappingTest, free_ids_clears_docid_entry_so_it_can_be_reused)
+{
+    expect_allocate_get({1, 2, 3}, 1);
+    mapping.free_ids(1);
+    expect_get({}, 1);
+
+    expect_allocate_get({4, 5}, 1);
+    mapping.free_ids(1);
+    expect_get({}, 1);
+}
+
+TEST_F(HnswNodeidMappingTest, free_ids_puts_nodeids_on_hold_list_and_then_free_list_for_reuse)
+{
+    expect_allocate_get({1, 2, 3}, 1);
+    expect_allocate_get({4, 5, 6}, 2);
+
+    mapping.free_ids(1); // {1, 2, 3} are inserted into hold list
+    mapping.assign_generation(11);
+
+    expect_allocate_get({7, 8}, 3); // Free list is NOT used
+    mapping.reclaim_memory(12); // {1, 2, 3} are moved to free list
+    expect_allocate_get({3, 2}, 4); // Free list is used
+
+    mapping.free_ids(2); // {4, 5, 6} are inserted into hold list
+    mapping.assign_generation(12);
+    mapping.free_ids(3); // {7, 8} are inserted into hold list
+    mapping.assign_generation(13);
+
+    mapping.reclaim_memory(13); // {4, 5, 6} are moved to free list
+    expect_allocate_get({6, 5}, 5); // Free list is used
+    expect_allocate_get({4, 1, 9}, 6); // Free list is first used, then new nodeid is allocated
+
+    mapping.reclaim_memory(14); // {7, 8} are moved to free list
+    expect_allocate_get({8, 7, 10}, 7); // Free list is first used, then new nodeid is allocated
+}
+
+TEST_F(HnswNodeidMappingTest, memory_usage_increases_when_allocating_nodeids)
+{
+    expect_allocate_get({1, 2}, 1);
+    auto a = mapping.memory_usage();
+    EXPECT_GT(a.allocatedBytes(), 0);
+    EXPECT_GT(a.usedBytes(), 0);
+    EXPECT_GE(a.allocatedBytes(), a.usedBytes());
+
+    expect_allocate_get({3, 4}, 2);
+    auto b = mapping.memory_usage();
+    EXPECT_GT(b.usedBytes(), a.usedBytes());
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
+
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index bb2df40c368..cd405f54b2e 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -19,6 +19,7 @@ vespa_add_library(searchlib_tensor OBJECT
     hnsw_graph.cpp
     hnsw_index.cpp
     hnsw_index_saver.cpp
+    hnsw_nodeid_mapping.cpp
     imported_tensor_attribute_vector.cpp
     imported_tensor_attribute_vector_read_guard.cpp
     inner_product_distance.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp
new file mode 100644
index 00000000000..c16024443ca
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp
@@ -0,0 +1,142 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "hnsw_nodeid_mapping.h"
+#include <vespa/vespalib/datastore/array_store.hpp>
+#include <vespa/vespalib/util/generation_hold_list.hpp>
+#include <vespa/vespalib/util/size_literals.h>
+#include <cassert>
+
+using vespalib::datastore::EntryRef;
+
+namespace {
+
+constexpr uint32_t max_small_array_type_id = 64;
+constexpr size_t small_page_size = 4_Ki;
+constexpr size_t min_num_arrays_for_new_buffer = 512_Ki;
+constexpr float alloc_grow_factor = 0.3;
+
+}
+
+namespace search::tensor {
+
+void
+HnswNodeidMapping::ensure_refs_size(uint32_t docid)
+{
+    if (docid >= _refs.size()) {
+        if (docid >= _refs.capacity()) {
+            _refs.reserve(_grow_strategy.calc_new_size(docid + 1));
+        }
+        _refs.resize(docid + 1);
+    }
+}
+
+uint32_t
+HnswNodeidMapping::allocate_id()
+{
+    if (_free_list.empty()) {
+        return _nodeid_limit++;
+    }
+    uint32_t id = _free_list.back();
+    _free_list.pop_back();
+    return id;
+}
+
+HnswNodeidMapping::HnswNodeidMapping()
+    : _refs(),
+      _grow_strategy(16, 1.0, 0, 0), // These are the same parameters as the default in rcuvector.h
+      _nodeid_limit(1), // Starting with nodeid=1 matches that we also start with docid=1.
+      _nodeids(NodeidStore::optimizedConfigForHugePage(max_small_array_type_id,
+                                                       vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE,
+                                                       small_page_size,
+                                                       min_num_arrays_for_new_buffer,
+                                                       alloc_grow_factor).enable_free_lists(true), {}),
+      _hold_list(),
+      _free_list()
+{
+    _refs.reserve(_grow_strategy.getInitialCapacity());
+}
+
+HnswNodeidMapping::~HnswNodeidMapping()
+{
+    _hold_list.reclaim_all();
+}
+
+vespalib::ConstArrayRef<uint32_t>
+HnswNodeidMapping::allocate_ids(uint32_t docid, uint32_t subspaces)
+{
+    ensure_refs_size(docid);
+    assert(!_refs[docid].valid());
+    if (subspaces == 0) {
+        return {};
+    }
+    EntryRef ref = _nodeids.allocate(subspaces);
+    auto nodeids = _nodeids.get_writable(ref);
+    for (auto& nodeid : nodeids) {
+        nodeid = allocate_id();
+    }
+    _refs[docid] = ref;
+    return nodeids;
+}
+
+vespalib::ConstArrayRef<uint32_t>
+HnswNodeidMapping::get_ids(uint32_t docid) const
+{
+    assert(docid < _refs.size());
+    return _nodeids.get(_refs[docid]);
+}
+
+void
+HnswNodeidMapping::free_ids(uint32_t docid)
+{
+    assert(docid < _refs.size());
+    EntryRef ref = _refs[docid];
+    assert(ref.valid());
+    auto nodeids = _nodeids.get(ref);
+    for (auto nodeid : nodeids) {
+        _hold_list.insert(nodeid);
+    }
+    _nodeids.remove(ref);
+    _refs[docid] = EntryRef();
+}
+
+void
+HnswNodeidMapping::assign_generation(generation_t current_gen)
+{
+    _nodeids.assign_generation(current_gen);
+    _hold_list.assign_generation(current_gen);
+}
+
+void
+HnswNodeidMapping::reclaim_memory(generation_t oldest_used_gen)
+{
+    _nodeids.reclaim_memory(oldest_used_gen);
+    _hold_list.reclaim(oldest_used_gen, [this](uint32_t nodeid) {
+        _free_list.push_back(nodeid);
+    });
+}
+
+namespace {
+
+vespalib::MemoryUsage
+get_refs_usage(const std::vector<EntryRef>& refs)
+{
+    vespalib::MemoryUsage result;
+    result.incAllocatedBytes(sizeof(EntryRef) * refs.capacity());
+    result.incUsedBytes(sizeof(EntryRef) * refs.size());
+    return result;
+}
+
+}
+
+vespalib::MemoryUsage
+HnswNodeidMapping::memory_usage() const
+{
+    vespalib::MemoryUsage result;
+    result.merge(get_refs_usage(_refs));
+    result.merge(_nodeids.getMemoryUsage());
+    // Note that the memory usage of the hold list and free list is not explicitly tracked
+    // as their content are covered by the memory usage reported from the NodeidStore (array store).
+    return result;
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h
new file mode 100644
index 00000000000..6ccc62aa0bb
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h
@@ -0,0 +1,60 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/datastore/array_store.h>
+#include <vespa/vespalib/datastore/entryref.h>
+#include <vespa/vespalib/util/arrayref.h>
+#include <vespa/vespalib/util/generation_hold_list.h>
+#include <vespa/vespalib/util/growstrategy.h>
+#include <vespa/vespalib/util/memoryusage.h>
+#include <cstdint>
+#include <vector>
+
+namespace search::tensor {
+
+/**
+ * Class used to keep track of the mapping from docid to array of nodeids.
+ * A nodeid is an identifier for a node in the HNSW graph that represents a single vector.
+ *
+ * The nodeids are allocated by this class.
+ * Nodeids that are freed are reused when no reader threads are accessing them (after a hold cycle).
+ *
+ * Note: Only the writer thread should use this class.
+ */
+class HnswNodeidMapping {
+private:
+    using generation_t = vespalib::GenerationHandler::generation_t;
+    using NodeidStore = vespalib::datastore::ArrayStore<uint32_t>;
+    using NodeidHoldList = vespalib::GenerationHoldList<uint32_t, false, true>;
+    using NodeidFreeList = std::vector<uint32_t>;
+
+    // Maps from docid to EntryRef used to get the array of nodeids from the NodeidStore.
+    std::vector<vespalib::datastore::EntryRef> _refs;
+    vespalib::GrowStrategy _grow_strategy;
+    uint32_t _nodeid_limit;
+    NodeidStore _nodeids;
+    NodeidHoldList _hold_list;
+    NodeidFreeList _free_list;
+
+    void ensure_refs_size(uint32_t docid);
+    uint32_t allocate_id();
+
+public:
+    HnswNodeidMapping();
+    ~HnswNodeidMapping();
+    vespalib::ConstArrayRef<uint32_t> allocate_ids(uint32_t docid, uint32_t subspaces);
+    vespalib::ConstArrayRef<uint32_t> get_ids(uint32_t docid) const;
+    void free_ids(uint32_t docid);
+
+    void assign_generation(generation_t current_gen);
+    void reclaim_memory(generation_t oldest_used_gen);
+    // TODO: Add support for compaction
+    vespalib::MemoryUsage memory_usage() const;
+};
+
+}
+
+namespace vespalib {
+    extern template class GenerationHoldList<uint32_t, false, true>;
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
index b66dc29be6e..acbf1c0d6b2 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
@@ -210,18 +210,6 @@ TensorAttribute::update_stat()
     return result;
 }
 
-vespalib::MemoryUsage
-TensorAttribute::memory_usage() const
-{
-    vespalib::MemoryUsage result = _refVector.getMemoryUsage();
-    result.merge(_tensorStore.getMemoryUsage());
-    result.mergeGenerationHeldBytes(getGenerationHolder().get_held_bytes());
-    if (_index) {
-        result.merge(_index->memory_usage());
-    }
-    return result;
-}
-
 void
 TensorAttribute::populate_state(vespalib::slime::Cursor& object) const
 {
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
index c25ac7833ef..f139a608706 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
@@ -39,7 +39,6 @@ protected:
     void internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor);
     void consider_remove_from_index(DocId docid);
     virtual vespalib::MemoryUsage update_stat();
-    virtual vespalib::MemoryUsage memory_usage() const;
     void populate_state(vespalib::slime::Cursor& object) const;
     void populate_address_space_usage(AddressSpaceUsage& usage) const override;
     EntryRef acquire_entry_ref(DocId doc_id) const noexcept { return _refVector.acquire_elem_ref(doc_id).load_acquire(); }
diff --git a/vespalib/src/vespa/vespalib/util/growstrategy.h b/vespalib/src/vespa/vespalib/util/growstrategy.h
index c46c0b45667..02e18e44925 100644
--- a/vespalib/src/vespa/vespalib/util/growstrategy.h
+++ b/vespalib/src/vespa/vespalib/util/growstrategy.h
@@ -2,6 +2,7 @@
 
 #pragma once
 
+#include <algorithm>
 #include <cstddef>
 
 namespace vespalib {
@@ -32,6 +33,12 @@ public:
     void         setInitialCapacity(size_t v) noexcept { _initialCapacity = v; }
     void         setGrowDelta(size_t v) noexcept { _growDelta = v; }
 
+    size_t calc_new_size(size_t base_size) const {
+        size_t delta = (base_size * getGrowFactor()) + getGrowDelta();
+        size_t new_size = base_size + std::max(delta, static_cast<size_t>(1));
+        return std::max(new_size, getMinimumCapacity());
+    }
+
     bool operator==(const GrowStrategy & rhs) const noexcept {
         return (_initialCapacity == rhs._initialCapacity &&
                 _minimumCapacity == rhs._minimumCapacity &&
diff --git a/vespalib/src/vespa/vespalib/util/rcuvector.hpp b/vespalib/src/vespa/vespalib/util/rcuvector.hpp
index eadda8ac1e9..291738fd0c5 100644
--- a/vespalib/src/vespa/vespalib/util/rcuvector.hpp
+++ b/vespalib/src/vespa/vespalib/util/rcuvector.hpp
@@ -19,10 +19,9 @@ RcuVectorHeld<T>::~RcuVectorHeld() = default;
 
 template <typename T>
 size_t RcuVectorBase<T>::calcNewSize(size_t baseSize) const {
-    size_t delta = (baseSize * _growStrategy.getGrowFactor()) + _growStrategy.getGrowDelta();
-    size_t newSize = baseSize + std::max(delta, static_cast<size_t>(1));
-    return std::max(newSize, _growStrategy.getMinimumCapacity());
+    return _growStrategy.calc_new_size(baseSize);
 }
+
 template <typename T>
 size_t RcuVectorBase<T>::calcNewSize() const {
     return calcNewSize(_data.capacity());
author	Geir Storli <geirst@yahooinc.com>	2022-11-14 14:05:27 +0000
committer	Geir Storli <geirst@yahooinc.com>	2022-11-14 14:07:02 +0000
commit	f89b667e4e496cd45b368d69eaae9f55524638e3 (patch)
tree	a98e4e01dfa72aae93ccf32c91875991bdf794d6
parent	07ba692005ba4ab27d943c66f4d4ff1b36b86832 (diff)