Update mapping from docid to nodeids when loading hnsw index.

author: Tor Egge <Tor.Egge@online.no> 2022-11-21 16:24:26 +0100
committer: Tor Egge <Tor.Egge@online.no> 2022-11-21 16:24:26 +0100
commit: 67689d16d23ecc4b1a2de76ca08cc172ccea7a0f (patch)
tree: 6aa0a28a78127f96b970884b7030facfcaecebae /searchlib/src
parent: 88a4c159d2fa483e6b1cbcfc7bc56667e3427828 (diff)
7 files changed, 185 insertions, 1 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
index 5be4ae9d28f..b86913caa16 100644
--- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
@@ -5,10 +5,13 @@
 #include <vespa/searchlib/tensor/distance_functions.h>
 #include <vespa/searchlib/tensor/doc_vector_access.h>
 #include <vespa/searchlib/tensor/hnsw_index.h>
+#include <vespa/searchlib/tensor/hnsw_index_loader.hpp>
+#include <vespa/searchlib/tensor/hnsw_index_saver.h>
 #include <vespa/searchlib/tensor/random_level_generator.h>
 #include <vespa/searchlib/tensor/inv_log_level_generator.h>
 #include <vespa/searchlib/tensor/subspace_type.h>
 #include <vespa/searchlib/tensor/vector_bundle.h>
+#include <vespa/searchlib/util/bufferwriter.h>
 #include <vespa/searchlib/queryeval/global_filter.h>
 #include <vespa/vespalib/datastore/compaction_spec.h>
 #include <vespa/vespalib/datastore/compaction_strategy.h>
@@ -27,12 +30,46 @@ using namespace search::tensor;
 using namespace vespalib::slime;
 using vespalib::Slime;
 using search::BitVector;
+using search::BufferWriter;
 using vespalib::eval::get_cell_type;
 using vespalib::eval::ValueType;
 using vespalib::datastore::CompactionSpec;
 using vespalib::datastore::CompactionStrategy;
 using search::queryeval::GlobalFilter;
 
+class VectorBufferWriter : public BufferWriter {
+private:
+    char tmp[1024];
+public:
+    std::vector<char> output;
+    VectorBufferWriter() {
+        setup(tmp, 1024);
+    }
+    ~VectorBufferWriter() {}
+    void flush() override {
+        for (size_t i = 0; i < usedLen(); ++i) {
+            output.push_back(tmp[i]);
+        }
+        rewind();
+    }
+};
+
+class VectorBufferReader {
+private:
+    const std::vector<char>& _data;
+    size_t _pos;
+
+public:
+    VectorBufferReader(const std::vector<char>& data) : _data(data), _pos(0) {}
+    uint32_t readHostOrder() {
+        uint32_t result = 0;
+        assert(_pos + sizeof(uint32_t) <= _data.size());
+        std::memcpy(&result, _data.data() + _pos, sizeof(uint32_t));
+        _pos += sizeof(uint32_t);
+        return result;
+    }
+};
+
 template <typename FloatType>
 class MyDocVectorAccess : public DocVectorAccess {
 private:
@@ -195,6 +232,44 @@ public:
 
     FloatVectors& get_vectors() { return vectors; }
 
+    uint32_t get_single_nodeid(uint32_t docid) {
+        auto& id_mapping = index->get_id_mapping();
+        auto nodeids = id_mapping.get_ids(docid);
+        EXPECT_EQ(1, nodeids.size());
+        return nodeids[0];
+    }
+
+    void make_savetest_index()
+    {
+        this->add_document(7);
+        this->add_document(4);
+    }
+
+    void check_savetest_index(const vespalib::string& label) {
+        SCOPED_TRACE(label);
+        auto nodeid_for_doc_7 = get_single_nodeid(7);
+        auto nodeid_for_doc_4 = get_single_nodeid(4);
+        EXPECT_EQ(is_single ? 7 : 1, nodeid_for_doc_7);
+        EXPECT_EQ(is_single ? 4 : 2, nodeid_for_doc_4);
+        this->expect_level_0(nodeid_for_doc_7, { nodeid_for_doc_4 });
+        this->expect_level_0(nodeid_for_doc_4, { nodeid_for_doc_7 });
+    }
+
+    std::vector<char> save_index() const {
+        HnswIndexSaver saver(index->get_graph());
+        VectorBufferWriter vector_writer;
+        saver.save(vector_writer);
+        return vector_writer.output;
+    }
+
+    void load_index(std::vector<char> data) {
+        auto& graph = index->get_graph();
+        HnswIndexLoader<VectorBufferReader, IndexType::index_type> loader(graph, std::make_unique<VectorBufferReader>(data));
+        while (loader.load_next()) {}
+        auto& id_mapping = index->get_id_mapping();
+        id_mapping.on_load(graph.node_refs.make_read_view(graph.node_refs.size()));
+    }
+
     static constexpr bool is_single = std::is_same_v<IndexType, HnswIndex<HnswIndexType::SINGLE>>;
 };
 
@@ -687,6 +762,17 @@ TYPED_TEST(HnswIndexTest, hnsw_graph_is_compacted)
     EXPECT_LT(mem_3.usedBytes(), mem_2.usedBytes());
 }
 
+TYPED_TEST(HnswIndexTest, hnsw_graph_can_be_saved_and_loaded)
+{
+    this->init(false);
+    this->make_savetest_index();
+    this->check_savetest_index("before save");
+    auto data = this->save_index();
+    this->init(false);
+    this->load_index(data);
+    this->check_savetest_index("after load");
+ }
+
 TEST(LevelGeneratorTest, gives_various_levels)
 {
     InvLogLevelGenerator generator(4);
diff --git a/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp
index a3e3112eaf4..ac8b21d6136 100644
--- a/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp
@@ -1,9 +1,11 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include <vespa/searchlib/tensor/hnsw_nodeid_mapping.h>
+#include <vespa/searchlib/tensor/hnsw_node.h>
 #include <vespa/vespalib/gtest/gtest.h>
 
 using namespace search::tensor;
+using vespalib::datastore::EntryRef;
 
 class HnswNodeidMappingTest : public ::testing::Test {
 public:
@@ -74,6 +76,24 @@ TEST_F(HnswNodeidMappingTest, free_ids_puts_nodeids_on_hold_list_and_then_free_l
     expect_allocate_get({8, 7, 10}, 7); // Free list is first used, then new nodeid is allocated
 }
 
+TEST_F(HnswNodeidMappingTest, on_load_populates_mapping)
+{
+    std::vector<HnswNode> nodes(10);
+    nodes[1].ref().store_relaxed(EntryRef(1));
+    nodes[1].store_docid(7);
+    nodes[1].store_subspace(0);
+    nodes[2].ref().store_relaxed(EntryRef(2));
+    nodes[2].store_docid(4);
+    nodes[2].store_subspace(0);
+    nodes[7].ref().store_relaxed(EntryRef(3));
+    nodes[7].store_docid(4);
+    nodes[7].store_subspace(1);
+    mapping.on_load(vespalib::ConstArrayRef(nodes.data(), nodes.size()));
+    expect_get({1}, 7);
+    expect_get({2, 7}, 4);
+    expect_allocate_get({3, 4, 5, 6, 8, 9}, 1);
+}
+
 TEST_F(HnswNodeidMappingTest, memory_usage_increases_when_allocating_nodeids)
 {
     expect_allocate_get({1, 2}, 1);
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h
index 0ec15a54374..f4f68ddac1e 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h
@@ -10,6 +10,8 @@
 
 namespace search::tensor {
 
+class HnswSimpleNode;
+
 /*
  * Class used to maintain mapping from docid to nodeid for dense tensors
  * (one node per document).
@@ -34,6 +36,7 @@ public:
     void free_ids(uint32_t docid) { (void) docid; }
     void assign_generation(generation_t current_gen) { (void) current_gen; };
     void reclaim_memory(generation_t oldest_used_gen) { (void) oldest_used_gen; };
+    void on_load(vespalib::ConstArrayRef<HnswSimpleNode> nodes) { (void) nodes; }
     vespalib::MemoryUsage memory_usage() const { return vespalib::MemoryUsage(); }
 };
 
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index a583f6f885c..bf38dc01f37 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -67,6 +67,7 @@ public:
         }
     }
 
+    static constexpr HnswIndexType index_type = type;
     using IdMapping = typename HnswIndexTraits<type>::IdMapping;
 protected:
     using GraphType = HnswGraph<type>;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h
index fa3286420a4..2e14f363bba 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h
@@ -19,7 +19,7 @@ class HnswNode {
     vespalib::datastore::AtomicValueWrapper<uint32_t> _subspace;
 
 public:
-    HnswNode()
+    HnswNode() noexcept
         : _ref(),
           _docid(),
           _subspace()
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp
index c16024443ca..a801e826a7d 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp
@@ -1,6 +1,7 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "hnsw_nodeid_mapping.h"
+#include "hnsw_node.h"
 #include <vespa/vespalib/datastore/array_store.hpp>
 #include <vespa/vespalib/util/generation_hold_list.hpp>
 #include <vespa/vespalib/util/size_literals.h>
@@ -115,6 +116,76 @@ HnswNodeidMapping::reclaim_memory(generation_t oldest_used_gen)
     });
 }
 
+void
+HnswNodeidMapping::on_load(vespalib::ConstArrayRef<HnswNode> nodes)
+{
+    if (nodes.empty()) {
+        return;
+    }
+    // Check that reserved nodeid is not used
+    assert(!nodes[0].ref().load_relaxed().valid());
+    // Detect histogram size
+    uint32_t max_docid = 0;
+    for (auto& node : nodes) {
+        if (node.ref().load_relaxed().valid()) {
+            max_docid = std::max(node.acquire_docid(), max_docid);
+        }
+    }
+    // Make histogram
+    std::vector<uint32_t> histogram(max_docid + 1);
+    for (auto& node : nodes) {
+        if (node.ref().load_relaxed().valid()) {
+            auto docid = node.acquire_docid();
+            auto subspace = node.acquire_subspace();
+            auto &num_subspaces = histogram[docid];
+            num_subspaces = std::max(num_subspaces, subspace + 1);
+        }
+    }
+    assert(histogram[0] == 0);
+    // Allocate mapping from docid to nodeids
+    ensure_refs_size(max_docid);
+    uint32_t docid = 0;
+    for (auto subspaces : histogram) {
+        if (subspaces > 0) {
+            auto ref = _nodeids.allocate(subspaces);
+            _refs[docid] = ref;
+            auto nodeids = _nodeids.get_writable(ref);
+            for (auto& nodeid : nodeids) {
+                nodeid = 0;
+            }
+        }
+        ++docid;
+    }
+    {
+        // Populate mapping from docid to nodeids and free list
+        uint32_t nodeid = 0;
+        for (auto& node : nodes) {
+            if (node.ref().load_relaxed().valid()) {
+                docid = node.acquire_docid();
+                auto subspace = node.acquire_subspace();
+                auto nodeids = _nodeids.get_writable(_refs[docid]);
+                assert(subspace < nodeids.size());
+                assert(nodeids[subspace] == 0);
+                nodeids[subspace] = nodeid;
+            } else if (nodeid > 0) {
+                _free_list.push_back(nodeid);
+            }
+            ++nodeid;
+        }
+    }
+    // All subspaces for a docid needs to have a nodeid
+    for (docid = 0; docid <= max_docid; ++docid) {
+        auto ref = _refs[docid];
+        if (ref.valid()) {
+            auto nodeids = _nodeids.get_writable(ref);
+            for (auto nodeid : nodeids) {
+                assert(nodeid != 0);
+            }
+        }
+    }
+    std::reverse(_free_list.begin(), _free_list.end());
+}
+
 namespace {
 
 vespalib::MemoryUsage
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h
index 6ccc62aa0bb..8abf6321832 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h
@@ -13,6 +13,8 @@
 
 namespace search::tensor {
 
+class HnswNode;
+
 /**
  * Class used to keep track of the mapping from docid to array of nodeids.
  * A nodeid is an identifier for a node in the HNSW graph that represents a single vector.
@@ -49,6 +51,7 @@ public:
 
     void assign_generation(generation_t current_gen);
     void reclaim_memory(generation_t oldest_used_gen);
+    void on_load(vespalib::ConstArrayRef<HnswNode> nodes);
     // TODO: Add support for compaction
     vespalib::MemoryUsage memory_usage() const;
 };
author	Tor Egge <Tor.Egge@online.no>	2022-11-21 16:24:26 +0100
committer	Tor Egge <Tor.Egge@online.no>	2022-11-21 16:24:26 +0100
commit	67689d16d23ecc4b1a2de76ca08cc172ccea7a0f (patch)
tree	6aa0a28a78127f96b970884b7030facfcaecebae /searchlib/src
parent	88a4c159d2fa483e6b1cbcfc7bc56667e3427828 (diff)