diff options
author | Tor Egge <Tor.Egge@online.no> | 2022-11-21 16:24:26 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2022-11-21 16:24:26 +0100 |
commit | 67689d16d23ecc4b1a2de76ca08cc172ccea7a0f (patch) | |
tree | 6aa0a28a78127f96b970884b7030facfcaecebae /searchlib/src | |
parent | 88a4c159d2fa483e6b1cbcfc7bc56667e3427828 (diff) |
Update mapping from docid to nodeids when loading hnsw index.
Diffstat (limited to 'searchlib/src')
7 files changed, 185 insertions, 1 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 5be4ae9d28f..b86913caa16 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -5,10 +5,13 @@ #include <vespa/searchlib/tensor/distance_functions.h> #include <vespa/searchlib/tensor/doc_vector_access.h> #include <vespa/searchlib/tensor/hnsw_index.h> +#include <vespa/searchlib/tensor/hnsw_index_loader.hpp> +#include <vespa/searchlib/tensor/hnsw_index_saver.h> #include <vespa/searchlib/tensor/random_level_generator.h> #include <vespa/searchlib/tensor/inv_log_level_generator.h> #include <vespa/searchlib/tensor/subspace_type.h> #include <vespa/searchlib/tensor/vector_bundle.h> +#include <vespa/searchlib/util/bufferwriter.h> #include <vespa/searchlib/queryeval/global_filter.h> #include <vespa/vespalib/datastore/compaction_spec.h> #include <vespa/vespalib/datastore/compaction_strategy.h> @@ -27,12 +30,46 @@ using namespace search::tensor; using namespace vespalib::slime; using vespalib::Slime; using search::BitVector; +using search::BufferWriter; using vespalib::eval::get_cell_type; using vespalib::eval::ValueType; using vespalib::datastore::CompactionSpec; using vespalib::datastore::CompactionStrategy; using search::queryeval::GlobalFilter; +class VectorBufferWriter : public BufferWriter { +private: + char tmp[1024]; +public: + std::vector<char> output; + VectorBufferWriter() { + setup(tmp, 1024); + } + ~VectorBufferWriter() {} + void flush() override { + for (size_t i = 0; i < usedLen(); ++i) { + output.push_back(tmp[i]); + } + rewind(); + } +}; + +class VectorBufferReader { +private: + const std::vector<char>& _data; + size_t _pos; + +public: + VectorBufferReader(const std::vector<char>& data) : _data(data), _pos(0) {} + uint32_t readHostOrder() { + uint32_t result = 0; + assert(_pos + sizeof(uint32_t) <= _data.size()); + std::memcpy(&result, _data.data() + _pos, sizeof(uint32_t)); + _pos += sizeof(uint32_t); + return result; + } +}; + template <typename FloatType> class MyDocVectorAccess : public DocVectorAccess { private: @@ -195,6 +232,44 @@ public: FloatVectors& get_vectors() { return vectors; } + uint32_t get_single_nodeid(uint32_t docid) { + auto& id_mapping = index->get_id_mapping(); + auto nodeids = id_mapping.get_ids(docid); + EXPECT_EQ(1, nodeids.size()); + return nodeids[0]; + } + + void make_savetest_index() + { + this->add_document(7); + this->add_document(4); + } + + void check_savetest_index(const vespalib::string& label) { + SCOPED_TRACE(label); + auto nodeid_for_doc_7 = get_single_nodeid(7); + auto nodeid_for_doc_4 = get_single_nodeid(4); + EXPECT_EQ(is_single ? 7 : 1, nodeid_for_doc_7); + EXPECT_EQ(is_single ? 4 : 2, nodeid_for_doc_4); + this->expect_level_0(nodeid_for_doc_7, { nodeid_for_doc_4 }); + this->expect_level_0(nodeid_for_doc_4, { nodeid_for_doc_7 }); + } + + std::vector<char> save_index() const { + HnswIndexSaver saver(index->get_graph()); + VectorBufferWriter vector_writer; + saver.save(vector_writer); + return vector_writer.output; + } + + void load_index(std::vector<char> data) { + auto& graph = index->get_graph(); + HnswIndexLoader<VectorBufferReader, IndexType::index_type> loader(graph, std::make_unique<VectorBufferReader>(data)); + while (loader.load_next()) {} + auto& id_mapping = index->get_id_mapping(); + id_mapping.on_load(graph.node_refs.make_read_view(graph.node_refs.size())); + } + static constexpr bool is_single = std::is_same_v<IndexType, HnswIndex<HnswIndexType::SINGLE>>; }; @@ -687,6 +762,17 @@ TYPED_TEST(HnswIndexTest, hnsw_graph_is_compacted) EXPECT_LT(mem_3.usedBytes(), mem_2.usedBytes()); } +TYPED_TEST(HnswIndexTest, hnsw_graph_can_be_saved_and_loaded) +{ + this->init(false); + this->make_savetest_index(); + this->check_savetest_index("before save"); + auto data = this->save_index(); + this->init(false); + this->load_index(data); + this->check_savetest_index("after load"); + } + TEST(LevelGeneratorTest, gives_various_levels) { InvLogLevelGenerator generator(4); diff --git a/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp index a3e3112eaf4..ac8b21d6136 100644 --- a/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp @@ -1,9 +1,11 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include <vespa/searchlib/tensor/hnsw_nodeid_mapping.h> +#include <vespa/searchlib/tensor/hnsw_node.h> #include <vespa/vespalib/gtest/gtest.h> using namespace search::tensor; +using vespalib::datastore::EntryRef; class HnswNodeidMappingTest : public ::testing::Test { public: @@ -74,6 +76,24 @@ TEST_F(HnswNodeidMappingTest, free_ids_puts_nodeids_on_hold_list_and_then_free_l expect_allocate_get({8, 7, 10}, 7); // Free list is first used, then new nodeid is allocated } +TEST_F(HnswNodeidMappingTest, on_load_populates_mapping) +{ + std::vector<HnswNode> nodes(10); + nodes[1].ref().store_relaxed(EntryRef(1)); + nodes[1].store_docid(7); + nodes[1].store_subspace(0); + nodes[2].ref().store_relaxed(EntryRef(2)); + nodes[2].store_docid(4); + nodes[2].store_subspace(0); + nodes[7].ref().store_relaxed(EntryRef(3)); + nodes[7].store_docid(4); + nodes[7].store_subspace(1); + mapping.on_load(vespalib::ConstArrayRef(nodes.data(), nodes.size())); + expect_get({1}, 7); + expect_get({2, 7}, 4); + expect_allocate_get({3, 4, 5, 6, 8, 9}, 1); +} + TEST_F(HnswNodeidMappingTest, memory_usage_increases_when_allocating_nodeids) { expect_allocate_get({1, 2}, 1); diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h index 0ec15a54374..f4f68ddac1e 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h @@ -10,6 +10,8 @@ namespace search::tensor { +class HnswSimpleNode; + /* * Class used to maintain mapping from docid to nodeid for dense tensors * (one node per document). @@ -34,6 +36,7 @@ public: void free_ids(uint32_t docid) { (void) docid; } void assign_generation(generation_t current_gen) { (void) current_gen; }; void reclaim_memory(generation_t oldest_used_gen) { (void) oldest_used_gen; }; + void on_load(vespalib::ConstArrayRef<HnswSimpleNode> nodes) { (void) nodes; } vespalib::MemoryUsage memory_usage() const { return vespalib::MemoryUsage(); } }; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index a583f6f885c..bf38dc01f37 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -67,6 +67,7 @@ public: } } + static constexpr HnswIndexType index_type = type; using IdMapping = typename HnswIndexTraits<type>::IdMapping; protected: using GraphType = HnswGraph<type>; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h index fa3286420a4..2e14f363bba 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h @@ -19,7 +19,7 @@ class HnswNode { vespalib::datastore::AtomicValueWrapper<uint32_t> _subspace; public: - HnswNode() + HnswNode() noexcept : _ref(), _docid(), _subspace() diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp index c16024443ca..a801e826a7d 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "hnsw_nodeid_mapping.h" +#include "hnsw_node.h" #include <vespa/vespalib/datastore/array_store.hpp> #include <vespa/vespalib/util/generation_hold_list.hpp> #include <vespa/vespalib/util/size_literals.h> @@ -115,6 +116,76 @@ HnswNodeidMapping::reclaim_memory(generation_t oldest_used_gen) }); } +void +HnswNodeidMapping::on_load(vespalib::ConstArrayRef<HnswNode> nodes) +{ + if (nodes.empty()) { + return; + } + // Check that reserved nodeid is not used + assert(!nodes[0].ref().load_relaxed().valid()); + // Detect histogram size + uint32_t max_docid = 0; + for (auto& node : nodes) { + if (node.ref().load_relaxed().valid()) { + max_docid = std::max(node.acquire_docid(), max_docid); + } + } + // Make histogram + std::vector<uint32_t> histogram(max_docid + 1); + for (auto& node : nodes) { + if (node.ref().load_relaxed().valid()) { + auto docid = node.acquire_docid(); + auto subspace = node.acquire_subspace(); + auto &num_subspaces = histogram[docid]; + num_subspaces = std::max(num_subspaces, subspace + 1); + } + } + assert(histogram[0] == 0); + // Allocate mapping from docid to nodeids + ensure_refs_size(max_docid); + uint32_t docid = 0; + for (auto subspaces : histogram) { + if (subspaces > 0) { + auto ref = _nodeids.allocate(subspaces); + _refs[docid] = ref; + auto nodeids = _nodeids.get_writable(ref); + for (auto& nodeid : nodeids) { + nodeid = 0; + } + } + ++docid; + } + { + // Populate mapping from docid to nodeids and free list + uint32_t nodeid = 0; + for (auto& node : nodes) { + if (node.ref().load_relaxed().valid()) { + docid = node.acquire_docid(); + auto subspace = node.acquire_subspace(); + auto nodeids = _nodeids.get_writable(_refs[docid]); + assert(subspace < nodeids.size()); + assert(nodeids[subspace] == 0); + nodeids[subspace] = nodeid; + } else if (nodeid > 0) { + _free_list.push_back(nodeid); + } + ++nodeid; + } + } + // All subspaces for a docid needs to have a nodeid + for (docid = 0; docid <= max_docid; ++docid) { + auto ref = _refs[docid]; + if (ref.valid()) { + auto nodeids = _nodeids.get_writable(ref); + for (auto nodeid : nodeids) { + assert(nodeid != 0); + } + } + } + std::reverse(_free_list.begin(), _free_list.end()); +} + namespace { vespalib::MemoryUsage diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h index 6ccc62aa0bb..8abf6321832 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h @@ -13,6 +13,8 @@ namespace search::tensor { +class HnswNode; + /** * Class used to keep track of the mapping from docid to array of nodeids. * A nodeid is an identifier for a node in the HNSW graph that represents a single vector. @@ -49,6 +51,7 @@ public: void assign_generation(generation_t current_gen); void reclaim_memory(generation_t oldest_used_gen); + void on_load(vespalib::ConstArrayRef<HnswNode> nodes); // TODO: Add support for compaction vespalib::MemoryUsage memory_usage() const; }; |