aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2022-11-21 13:20:40 +0100
committerTor Egge <Tor.Egge@online.no>2022-11-21 13:20:40 +0100
commit59e16c1bd293f9ec71e65f4574bf66aa2fb18544 (patch)
treee5c2f3712d430316268cfe0569b18d757529c9f1 /searchlib
parentea596771818b7f7ec5c2e0a7a8d4931c1be02e8d (diff)
Adjust hnsw index save format for managed nodeid mapping.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp100
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_graph.h1
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp5
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp20
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_saver_metadata_node.h68
6 files changed, 170 insertions, 27 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp b/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp
index d7666700e77..e2a96ec059c 100644
--- a/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp
@@ -51,14 +51,59 @@ public:
using V = std::vector<uint32_t>;
template <HnswIndexType type>
+uint32_t fake_docid(uint32_t nodeid);
+
+template <>
+uint32_t fake_docid<HnswIndexType::SINGLE>(uint32_t nodeid)
+{
+ return nodeid;
+}
+
+template <>
+uint32_t fake_docid<HnswIndexType::MULTI>(uint32_t nodeid)
+{
+ return nodeid + 100;
+}
+
+template <HnswIndexType type>
+uint32_t fake_subspace(uint32_t nodeid);
+
+template <>
+uint32_t fake_subspace<HnswIndexType::SINGLE>(uint32_t)
+{
+ return 0;
+}
+
+template <>
+uint32_t fake_subspace<HnswIndexType::MULTI>(uint32_t nodeid)
+{
+ return nodeid + 10;
+}
+
+template <typename NodeType>
+uint32_t fake_get_docid(const NodeType& node, uint32_t nodeid);
+
+template <>
+uint32_t fake_get_docid<HnswSimpleNode>(const HnswSimpleNode &, uint32_t nodeid)
+{
+ return fake_docid<HnswIndexType::SINGLE>(nodeid);
+}
+
+template <>
+uint32_t fake_get_docid<HnswNode>(const HnswNode& node, uint32_t)
+{
+ return node.acquire_docid();
+}
+
+template <HnswIndexType type>
void populate(HnswGraph<type> &graph) {
// no 0
- graph.make_node(1, 1, 0, 1);
- auto er = graph.make_node(2, 2, 0, 2);
+ graph.make_node(1, fake_docid<type>(1), fake_subspace<type>(1), 1);
+ auto er = graph.make_node(2, 102, 12, 2);
// no 3
- graph.make_node(4, 4, 0, 2);
- graph.make_node(5, 5, 0, 0);
- graph.make_node(6, 6, 0, 1);
+ graph.make_node(4, fake_docid<type>(4), fake_subspace<type>(4), 2);
+ graph.make_node(5, fake_docid<type>(5), fake_subspace<type>(5), 0);
+ graph.make_node(6, fake_docid<type>(6), fake_subspace<type>(6), 1);
graph.set_link_array(1, 0, V{2, 4, 6});
graph.set_link_array(2, 0, V{1, 4, 6});
@@ -73,7 +118,7 @@ template <HnswIndexType type>
void modify(HnswGraph<type> &graph) {
graph.remove_node(2);
graph.remove_node(6);
- graph.make_node(7, 7, 0, 2);
+ graph.make_node(7, fake_docid<type>(7), fake_subspace<type>(7), 2);
graph.set_link_array(1, 0, V{7, 4});
graph.set_link_array(4, 0, V{7, 2});
@@ -85,10 +130,11 @@ void modify(HnswGraph<type> &graph) {
}
+template <typename GraphType>
class CopyGraphTest : public ::testing::Test {
public:
- HnswGraph<HnswIndexType::SINGLE> original;
- HnswGraph<HnswIndexType::SINGLE> copy;
+ GraphType original;
+ GraphType copy;
void expect_empty_d(uint32_t nodeid) const {
EXPECT_FALSE(copy.acquire_node_ref(nodeid).valid());
@@ -121,10 +167,16 @@ public:
return vector_writer.output;
}
void load_copy(std::vector<char> data) {
- HnswIndexLoader<VectorBufferReader, HnswIndexType::SINGLE> loader(copy, std::make_unique<VectorBufferReader>(data));
+ HnswIndexLoader<VectorBufferReader, GraphType::index_type> loader(copy, std::make_unique<VectorBufferReader>(data));
while (loader.load_next()) {}
}
+ void expect_docid_and_subspace(uint32_t nodeid) const {
+ auto& node = copy.node_refs.get_elem_ref(nodeid);
+ EXPECT_EQ(fake_docid<GraphType::index_type>(nodeid), fake_get_docid(node, nodeid));
+ EXPECT_EQ(fake_subspace<GraphType::index_type>(nodeid), node.acquire_subspace());
+ }
+
void expect_copy_as_populated() const {
EXPECT_EQ(copy.size(), 7);
auto entry = copy.get_entry_node();
@@ -142,27 +194,35 @@ public:
expect_level_1(2, {4});
expect_level_1(4, {2});
+ expect_docid_and_subspace(1);
+ expect_docid_and_subspace(2);
+ expect_docid_and_subspace(4);
+ expect_docid_and_subspace(6);
}
};
-TEST_F(CopyGraphTest, reconstructs_graph)
+using GraphTestTypes = ::testing::Types<HnswGraph<HnswIndexType::SINGLE>, HnswGraph<HnswIndexType::MULTI>>;
+
+TYPED_TEST_SUITE(CopyGraphTest, GraphTestTypes);
+
+TYPED_TEST(CopyGraphTest, reconstructs_graph)
{
- populate(original);
- auto data = save_original();
- load_copy(data);
- expect_copy_as_populated();
+ populate(this->original);
+ auto data = this->save_original();
+ this->load_copy(data);
+ this->expect_copy_as_populated();
}
-TEST_F(CopyGraphTest, later_changes_ignored)
+TYPED_TEST(CopyGraphTest, later_changes_ignored)
{
- populate(original);
- HnswIndexSaver saver(original);
- modify(original);
+ populate(this->original);
+ HnswIndexSaver saver(this->original);
+ modify(this->original);
VectorBufferWriter vector_writer;
saver.save(vector_writer);
auto data = vector_writer.output;
- load_copy(data);
- expect_copy_as_populated();
+ this->load_copy(data);
+ this->expect_copy_as_populated();
}
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h b/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h
index 5b448ea27b7..4706f9cf7e8 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h
@@ -27,6 +27,7 @@ struct HnswGraph {
// This uses 12 bits for buffer id -> 4096 buffers.
using LinkArrayEntryRefType = vespalib::datastore::EntryRefT<20>;
+ static constexpr HnswIndexType index_type = type;
using NodeType = typename HnswIndexTraits<type>::NodeType;
// Provides mapping from document id -> node reference.
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp
index 1bf02edbefa..04e1fcc1792 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp
@@ -40,10 +40,13 @@ bool
HnswIndexLoader<ReaderType, type>::load_next()
{
assert(!_complete);
+ static constexpr bool identity_mapping = (type == HnswIndexType::SINGLE);
if (_nodeid < _num_nodes) {
uint32_t num_levels = next_int();
if (num_levels > 0) {
- _graph.make_node(_nodeid, _nodeid, 0, num_levels);
+ uint32_t docid = identity_mapping ? _nodeid : next_int();
+ uint32_t subspace = identity_mapping ? 0 : next_int();
+ _graph.make_node(_nodeid, docid, subspace, num_levels);
for (uint32_t level = 0; level < num_levels; ++level) {
uint32_t num_links = next_int();
_link_array.clear();
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp
index ef6621d910b..370e2ddf92a 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.cpp
@@ -54,8 +54,9 @@ HnswIndexSaver<type>::HnswIndexSaver(const HnswGraph<type> &graph)
_meta_data.refs.reserve(link_array_count);
_meta_data.nodes.reserve(num_nodes+1);
for (size_t i = 0; i < num_nodes; ++i) {
- _meta_data.nodes.push_back(_meta_data.refs.size());
- auto node_ref = graph.get_node_ref(i);
+ auto& node = graph.node_refs.get_elem_ref(i);
+ _meta_data.nodes.emplace_back(_meta_data.refs.size(), node);
+ auto node_ref = node.ref().load_relaxed();
if (node_ref.valid()) {
auto levels = graph.nodes.get(node_ref);
for (const auto& links_ref : levels) {
@@ -63,7 +64,7 @@ HnswIndexSaver<type>::HnswIndexSaver(const HnswGraph<type> &graph)
}
}
}
- _meta_data.nodes.push_back(_meta_data.refs.size());
+ _meta_data.nodes.emplace_back(_meta_data.refs.size());
}
template <HnswIndexType type>
@@ -75,10 +76,19 @@ HnswIndexSaver<type>::save(BufferWriter& writer) const
uint32_t num_nodes = _meta_data.nodes.size() - 1;
writer.write(&num_nodes, sizeof(uint32_t));
for (uint32_t i(0); i < num_nodes; i++) {
- uint32_t offset = _meta_data.nodes[i];
- uint32_t next_offset = _meta_data.nodes[i+1];
+ auto& node = _meta_data.nodes[i];
+ uint32_t offset = node.get_refs_offset();
+ uint32_t next_offset = _meta_data.nodes[i+1].get_refs_offset();
uint32_t num_levels = next_offset - offset;
writer.write(&num_levels, sizeof(uint32_t));
+ if (num_levels > 0) {
+ if constexpr (!HnswIndexSaverMetaDataNode<type>::identity_mapping) {
+ uint32_t docid = node.get_docid();
+ uint32_t subspace = node.get_subspace();
+ writer.write(&docid, sizeof(uint32_t));
+ writer.write(&subspace, sizeof(uint32_t));
+ }
+ }
for (; offset < next_offset; offset++) {
auto links_ref = _meta_data.refs[offset];
if (links_ref.valid()) {
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h
index fa1ae6a5ed6..2884ee9b494 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver.h
@@ -4,6 +4,7 @@
#include "nearest_neighbor_index_saver.h"
#include "hnsw_graph.h"
+#include "hnsw_index_saver_metadata_node.h"
#include <vespa/vespalib/datastore/entryref.h>
#include <vespa/vespalib/stllike/allocator.h>
#include <vector>
@@ -29,7 +30,7 @@ private:
uint32_t entry_nodeid;
int32_t entry_level;
std::vector<EntryRef, vespalib::allocator_large<EntryRef>> refs;
- std::vector<uint32_t, vespalib::allocator_large<uint32_t>> nodes;
+ std::vector<HnswIndexSaverMetaDataNode<type>, vespalib::allocator_large<HnswIndexSaverMetaDataNode<type>>> nodes;
MetaData();
~MetaData();
};
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver_metadata_node.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver_metadata_node.h
new file mode 100644
index 00000000000..bf7fcbc06ed
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_saver_metadata_node.h
@@ -0,0 +1,68 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "hnsw_index_type.h"
+#include "hnsw_node.h"
+#include "hnsw_simple_node.h"
+
+namespace search::tensor {
+
+/*
+ * Meta data for node during save of hnsw graph.
+ */
+template <HnswIndexType type>
+class HnswIndexSaverMetaDataNode;
+
+/*
+ * Meta data for node during save of hnsw graph with one node per document and
+ * identity mapping between nodeid and docid.
+ */
+template <>
+class HnswIndexSaverMetaDataNode<HnswIndexType::SINGLE>
+{
+ uint32_t _refs_offset;
+
+public:
+ HnswIndexSaverMetaDataNode(uint32_t refs_offset) noexcept
+ : _refs_offset(refs_offset)
+ {
+ }
+ HnswIndexSaverMetaDataNode(uint32_t refs_offset, const HnswSimpleNode&) noexcept
+ : _refs_offset(refs_offset)
+ {
+ }
+ uint32_t get_refs_offset() const noexcept { return _refs_offset; }
+ static constexpr bool identity_mapping = true;
+};
+
+/*
+ * Meta data for node during save of hnsw graph with multiple nodes per document and
+ * managed mapping between nodeid and docid.
+ */
+template <>
+class HnswIndexSaverMetaDataNode<HnswIndexType::MULTI>
+{
+ uint32_t _refs_offset;
+ uint32_t _docid;
+ uint32_t _subspace;
+public:
+ HnswIndexSaverMetaDataNode(uint32_t refs_offset) noexcept
+ : _refs_offset(refs_offset),
+ _docid(0),
+ _subspace(0)
+ {
+ }
+ HnswIndexSaverMetaDataNode(uint32_t refs_offset, const HnswNode& node) noexcept
+ : _refs_offset(refs_offset),
+ _docid(node.acquire_docid()),
+ _subspace(node.acquire_subspace())
+ {
+ }
+ uint32_t get_refs_offset() const noexcept { return _refs_offset; }
+ uint32_t get_docid() const noexcept { return _docid; }
+ uint32_t get_subspace() const noexcept { return _subspace; }
+ static constexpr bool identity_mapping = false;
+};
+
+}