diff options
author | Tor Egge <Tor.Egge@online.no> | 2022-11-21 13:20:40 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2022-11-21 13:20:40 +0100 |
commit | 59e16c1bd293f9ec71e65f4574bf66aa2fb18544 (patch) | |
tree | e5c2f3712d430316268cfe0569b18d757529c9f1 /searchlib | |
parent | ea596771818b7f7ec5c2e0a7a8d4931c1be02e8d (diff) |
Adjust hnsw index save format for managed nodeid mapping.
Diffstat (limited to 'searchlib')
6 files changed, 170 insertions, 27 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp b/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp index d7666700e77..e2a96ec059c 100644 --- a/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp @@ -51,14 +51,59 @@ public: using V = std::vector<uint32_t>; template <HnswIndexType type> +uint32_t fake_docid(uint32_t nodeid); + +template <> +uint32_t fake_docid<HnswIndexType::SINGLE>(uint32_t nodeid) +{ + return nodeid; +} + +template <> +uint32_t fake_docid<HnswIndexType::MULTI>(uint32_t nodeid) +{ + return nodeid + 100; +} + +template <HnswIndexType type> +uint32_t fake_subspace(uint32_t nodeid); + +template <> +uint32_t fake_subspace<HnswIndexType::SINGLE>(uint32_t) +{ + return 0; +} + +template <> +uint32_t fake_subspace<HnswIndexType::MULTI>(uint32_t nodeid) +{ + return nodeid + 10; +} + +template <typename NodeType> +uint32_t fake_get_docid(const NodeType& node, uint32_t nodeid); + +template <> +uint32_t fake_get_docid<HnswSimpleNode>(const HnswSimpleNode &, uint32_t nodeid) +{ + return fake_docid<HnswIndexType::SINGLE>(nodeid); +} + +template <> +uint32_t fake_get_docid<HnswNode>(const HnswNode& node, uint32_t) +{ + return node.acquire_docid(); +} + +template <HnswIndexType type> void populate(HnswGraph<type> &graph) { // no 0 - graph.make_node(1, 1, 0, 1); - auto er = graph.make_node(2, 2, 0, 2); + graph.make_node(1, fake_docid<type>(1), fake_subspace<type>(1), 1); + auto er = graph.make_node(2, 102, 12, 2); // no 3 - graph.make_node(4, 4, 0, 2); - graph.make_node(5, 5, 0, 0); - graph.make_node(6, 6, 0, 1); + graph.make_node(4, fake_docid<type>(4), fake_subspace<type>(4), 2); + graph.make_node(5, fake_docid<type>(5), fake_subspace<type>(5), 0); + graph.make_node(6, fake_docid<type>(6), fake_subspace<type>(6), 1); graph.set_link_array(1, 0, V{2, 4, 6}); graph.set_link_array(2, 0, V{1, 4, 6}); @@ -73,7 +118,7 @@ template <HnswIndexType type> void modify(HnswGraph<type> &graph) { graph.remove_node(2); graph.remove_node(6); - graph.make_node(7, 7, 0, 2); + graph.make_node(7, fake_docid<type>(7), fake_subspace<type>(7), 2); graph.set_link_array(1, 0, V{7, 4}); graph.set_link_array(4, 0, V{7, 2}); @@ -85,10 +130,11 @@ void modify(HnswGraph<type> &graph) { } +template <typename GraphType> class CopyGraphTest : public ::testing::Test { public: - HnswGraph<HnswIndexType::SINGLE> original; - HnswGraph<HnswIndexType::SINGLE> copy; + GraphType original; + GraphType copy; void expect_empty_d(uint32_t nodeid) const { EXPECT_FALSE(copy.acquire_node_ref(nodeid).valid()); @@ -121,10 +167,16 @@ public: return vector_writer.output; } void load_copy(std::vector<char> data) { - HnswIndexLoader<VectorBufferReader, HnswIndexType::SINGLE> loader(copy, std::make_unique<VectorBufferReader>(data)); + HnswIndexLoader<VectorBufferReader, GraphType::index_type> loader(copy, std::make_unique<VectorBufferReader>(data)); while (loader.load_next()) {} } + void expect_docid_and_subspace(uint32_t nodeid) const { + auto& node = copy.node_refs.get_elem_ref(nodeid); + EXPECT_EQ(fake_docid<GraphType::index_type>(nodeid), fake_get_docid(node, nodeid)); + EXPECT_EQ(fake_subspace<GraphType::index_type>(nodeid), node.acquire_subspace()); + } + void expect_copy_as_populated() const { EXPECT_EQ(copy.size(), 7); auto entry = copy.get_entry_node(); @@ -142,27 +194,35 @@ public: expect_level_1(2, {4}); expect_level_1(4, {2}); + expect_docid_and_subspace(1); + expect_docid_and_subspace(2); + expect_docid_and_subspace(4); + expect_docid_and_subspace(6); } }; -TEST_F(CopyGraphTest, reconstructs_graph) +using GraphTestTypes = ::testing::Types<HnswGraph<HnswIndexType::SINGLE>, HnswGraph<HnswIndexType::MULTI>>; + +TYPED_TEST_SUITE(CopyGraphTest, GraphTestTypes); + +TYPED_TEST(CopyGraphTest, reconstructs_graph) { - populate(original); - auto data = save_original(); - load_copy(data); - expect_copy_as_populated(); + populate(this->original); + auto data = this->save_original(); + this->load_copy(data); + this->expect_copy_as_populated(); } -TEST_F(CopyGraphTest, later_changes_ignored) +TYPED_TEST(CopyGraphTest, later_changes_ignored) { - populate(original); - HnswIndexSaver saver(original); - modify(original); + populate(this->original); + HnswIndexSaver saver(this->original); + modify(this->original); VectorBufferWriter vector_writer; saver.save(vector_writer); auto data = vector_writer.output; - load_copy(data); - expect_copy_as_populated(); + this->load_copy(data); + this->expect_copy_as_populated(); } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h b/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h index 5b448ea27b7..4706f9cf7e8 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h @@ -27,6 +27,7 @@ struct HnswGraph { // This uses 12 bits for buffer id -> 4096 buffers. using LinkArrayEntryRefType = vespalib::datastore::EntryRefT<20>; + static constexpr HnswIndexType index_type = type; using NodeType = typename HnswIndexTraits<type>::NodeType; // Provides mapping from document id -> node reference. diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp index 1bf02edbefa..04e1fcc1792 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp @@ -40,10 +40,13 @@ bool HnswIndexLoader<ReaderType, type>::load_next() { assert(!_complete); + static constexpr bool identity_mapping = (type == HnswIndexType::SINGLE); if (_nodeid < _num_nodes) { uint32_t num_levels = next_int(); if (num_levels > 0) { - _graph.make_node(_nodeid, _nodeid, 0, num_levels); + uint32_t docid = identity_mapping ? _nodeid : next_int(); + uint32_t subspace = identity_mapping ? 0 : next_int(); + _graph.make_node(_nodeid, docid, subspace, num_levels); for (uint32_t level = 0; level < num_levels; ++level) { uint32_t num_links = next_int(); _link_array.clear(); diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp index ef6621d910b..370e2ddf92a 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp @@ -54,8 +54,9 @@ HnswIndexSaver<type>::HnswIndexSaver(const HnswGraph<type> &graph) _meta_data.refs.reserve(link_array_count); _meta_data.nodes.reserve(num_nodes+1); for (size_t i = 0; i < num_nodes; ++i) { - _meta_data.nodes.push_back(_meta_data.refs.size()); - auto node_ref = graph.get_node_ref(i); + auto& node = graph.node_refs.get_elem_ref(i); + _meta_data.nodes.emplace_back(_meta_data.refs.size(), node); + auto node_ref = node.ref().load_relaxed(); if (node_ref.valid()) { auto levels = graph.nodes.get(node_ref); for (const auto& links_ref : levels) { @@ -63,7 +64,7 @@ HnswIndexSaver<type>::HnswIndexSaver(const HnswGraph<type> &graph) } } } - _meta_data.nodes.push_back(_meta_data.refs.size()); + _meta_data.nodes.emplace_back(_meta_data.refs.size()); } template <HnswIndexType type> @@ -75,10 +76,19 @@ HnswIndexSaver<type>::save(BufferWriter& writer) const uint32_t num_nodes = _meta_data.nodes.size() - 1; writer.write(&num_nodes, sizeof(uint32_t)); for (uint32_t i(0); i < num_nodes; i++) { - uint32_t offset = _meta_data.nodes[i]; - uint32_t next_offset = _meta_data.nodes[i+1]; + auto& node = _meta_data.nodes[i]; + uint32_t offset = node.get_refs_offset(); + uint32_t next_offset = _meta_data.nodes[i+1].get_refs_offset(); uint32_t num_levels = next_offset - offset; writer.write(&num_levels, sizeof(uint32_t)); + if (num_levels > 0) { + if constexpr (!HnswIndexSaverMetaDataNode<type>::identity_mapping) { + uint32_t docid = node.get_docid(); + uint32_t subspace = node.get_subspace(); + writer.write(&docid, sizeof(uint32_t)); + writer.write(&subspace, sizeof(uint32_t)); + } + } for (; offset < next_offset; offset++) { auto links_ref = _meta_data.refs[offset]; if (links_ref.valid()) { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h index fa1ae6a5ed6..2884ee9b494 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h @@ -4,6 +4,7 @@ #include "nearest_neighbor_index_saver.h" #include "hnsw_graph.h" +#include "hnsw_index_saver_metadata_node.h" #include <vespa/vespalib/datastore/entryref.h> #include <vespa/vespalib/stllike/allocator.h> #include <vector> @@ -29,7 +30,7 @@ private: uint32_t entry_nodeid; int32_t entry_level; std::vector<EntryRef, vespalib::allocator_large<EntryRef>> refs; - std::vector<uint32_t, vespalib::allocator_large<uint32_t>> nodes; + std::vector<HnswIndexSaverMetaDataNode<type>, vespalib::allocator_large<HnswIndexSaverMetaDataNode<type>>> nodes; MetaData(); ~MetaData(); }; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver_metadata_node.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver_metadata_node.h new file mode 100644 index 00000000000..bf7fcbc06ed --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver_metadata_node.h @@ -0,0 +1,68 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "hnsw_index_type.h" +#include "hnsw_node.h" +#include "hnsw_simple_node.h" + +namespace search::tensor { + +/* + * Meta data for node during save of hnsw graph. + */ +template <HnswIndexType type> +class HnswIndexSaverMetaDataNode; + +/* + * Meta data for node during save of hnsw graph with one node per document and + * identity mapping between nodeid and docid. + */ +template <> +class HnswIndexSaverMetaDataNode<HnswIndexType::SINGLE> +{ + uint32_t _refs_offset; + +public: + HnswIndexSaverMetaDataNode(uint32_t refs_offset) noexcept + : _refs_offset(refs_offset) + { + } + HnswIndexSaverMetaDataNode(uint32_t refs_offset, const HnswSimpleNode&) noexcept + : _refs_offset(refs_offset) + { + } + uint32_t get_refs_offset() const noexcept { return _refs_offset; } + static constexpr bool identity_mapping = true; +}; + +/* + * Meta data for node during save of hnsw graph with multiple nodes per document and + * managed mapping between nodeid and docid. + */ +template <> +class HnswIndexSaverMetaDataNode<HnswIndexType::MULTI> +{ + uint32_t _refs_offset; + uint32_t _docid; + uint32_t _subspace; +public: + HnswIndexSaverMetaDataNode(uint32_t refs_offset) noexcept + : _refs_offset(refs_offset), + _docid(0), + _subspace(0) + { + } + HnswIndexSaverMetaDataNode(uint32_t refs_offset, const HnswNode& node) noexcept + : _refs_offset(refs_offset), + _docid(node.acquire_docid()), + _subspace(node.acquire_subspace()) + { + } + uint32_t get_refs_offset() const noexcept { return _refs_offset; } + uint32_t get_docid() const noexcept { return _docid; } + uint32_t get_subspace() const noexcept { return _subspace; } + static constexpr bool identity_mapping = false; +}; + +} |