From 67689d16d23ecc4b1a2de76ca08cc172ccea7a0f Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Mon, 21 Nov 2022 16:24:26 +0100 Subject: Update mapping from docid to nodeids when loading hnsw index. --- .../tests/tensor/hnsw_index/hnsw_index_test.cpp | 86 ++++++++++++++++++++++ .../hnsw_nodeid_mapping_test.cpp | 20 +++++ .../vespa/searchlib/tensor/hnsw_identity_mapping.h | 3 + searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 1 + searchlib/src/vespa/searchlib/tensor/hnsw_node.h | 2 +- .../vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp | 71 ++++++++++++++++++ .../vespa/searchlib/tensor/hnsw_nodeid_mapping.h | 3 + 7 files changed, 185 insertions(+), 1 deletion(-) (limited to 'searchlib') diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 5be4ae9d28f..b86913caa16 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -5,10 +5,13 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include #include @@ -27,12 +30,46 @@ using namespace search::tensor; using namespace vespalib::slime; using vespalib::Slime; using search::BitVector; +using search::BufferWriter; using vespalib::eval::get_cell_type; using vespalib::eval::ValueType; using vespalib::datastore::CompactionSpec; using vespalib::datastore::CompactionStrategy; using search::queryeval::GlobalFilter; +class VectorBufferWriter : public BufferWriter { +private: + char tmp[1024]; +public: + std::vector output; + VectorBufferWriter() { + setup(tmp, 1024); + } + ~VectorBufferWriter() {} + void flush() override { + for (size_t i = 0; i < usedLen(); ++i) { + output.push_back(tmp[i]); + } + rewind(); + } +}; + +class VectorBufferReader { +private: + const std::vector& _data; + size_t _pos; + +public: + VectorBufferReader(const std::vector& data) : _data(data), _pos(0) {} + uint32_t readHostOrder() { + uint32_t result = 0; + assert(_pos + sizeof(uint32_t) <= _data.size()); + std::memcpy(&result, _data.data() + _pos, sizeof(uint32_t)); + _pos += sizeof(uint32_t); + return result; + } +}; + template class MyDocVectorAccess : public DocVectorAccess { private: @@ -195,6 +232,44 @@ public: FloatVectors& get_vectors() { return vectors; } + uint32_t get_single_nodeid(uint32_t docid) { + auto& id_mapping = index->get_id_mapping(); + auto nodeids = id_mapping.get_ids(docid); + EXPECT_EQ(1, nodeids.size()); + return nodeids[0]; + } + + void make_savetest_index() + { + this->add_document(7); + this->add_document(4); + } + + void check_savetest_index(const vespalib::string& label) { + SCOPED_TRACE(label); + auto nodeid_for_doc_7 = get_single_nodeid(7); + auto nodeid_for_doc_4 = get_single_nodeid(4); + EXPECT_EQ(is_single ? 7 : 1, nodeid_for_doc_7); + EXPECT_EQ(is_single ? 4 : 2, nodeid_for_doc_4); + this->expect_level_0(nodeid_for_doc_7, { nodeid_for_doc_4 }); + this->expect_level_0(nodeid_for_doc_4, { nodeid_for_doc_7 }); + } + + std::vector save_index() const { + HnswIndexSaver saver(index->get_graph()); + VectorBufferWriter vector_writer; + saver.save(vector_writer); + return vector_writer.output; + } + + void load_index(std::vector data) { + auto& graph = index->get_graph(); + HnswIndexLoader loader(graph, std::make_unique(data)); + while (loader.load_next()) {} + auto& id_mapping = index->get_id_mapping(); + id_mapping.on_load(graph.node_refs.make_read_view(graph.node_refs.size())); + } + static constexpr bool is_single = std::is_same_v>; }; @@ -687,6 +762,17 @@ TYPED_TEST(HnswIndexTest, hnsw_graph_is_compacted) EXPECT_LT(mem_3.usedBytes(), mem_2.usedBytes()); } +TYPED_TEST(HnswIndexTest, hnsw_graph_can_be_saved_and_loaded) +{ + this->init(false); + this->make_savetest_index(); + this->check_savetest_index("before save"); + auto data = this->save_index(); + this->init(false); + this->load_index(data); + this->check_savetest_index("after load"); + } + TEST(LevelGeneratorTest, gives_various_levels) { InvLogLevelGenerator generator(4); diff --git a/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp index a3e3112eaf4..ac8b21d6136 100644 --- a/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_nodeid_mapping/hnsw_nodeid_mapping_test.cpp @@ -1,9 +1,11 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include +#include #include using namespace search::tensor; +using vespalib::datastore::EntryRef; class HnswNodeidMappingTest : public ::testing::Test { public: @@ -74,6 +76,24 @@ TEST_F(HnswNodeidMappingTest, free_ids_puts_nodeids_on_hold_list_and_then_free_l expect_allocate_get({8, 7, 10}, 7); // Free list is first used, then new nodeid is allocated } +TEST_F(HnswNodeidMappingTest, on_load_populates_mapping) +{ + std::vector nodes(10); + nodes[1].ref().store_relaxed(EntryRef(1)); + nodes[1].store_docid(7); + nodes[1].store_subspace(0); + nodes[2].ref().store_relaxed(EntryRef(2)); + nodes[2].store_docid(4); + nodes[2].store_subspace(0); + nodes[7].ref().store_relaxed(EntryRef(3)); + nodes[7].store_docid(4); + nodes[7].store_subspace(1); + mapping.on_load(vespalib::ConstArrayRef(nodes.data(), nodes.size())); + expect_get({1}, 7); + expect_get({2, 7}, 4); + expect_allocate_get({3, 4, 5, 6, 8, 9}, 1); +} + TEST_F(HnswNodeidMappingTest, memory_usage_increases_when_allocating_nodeids) { expect_allocate_get({1, 2}, 1); diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h index 0ec15a54374..f4f68ddac1e 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_identity_mapping.h @@ -10,6 +10,8 @@ namespace search::tensor { +class HnswSimpleNode; + /* * Class used to maintain mapping from docid to nodeid for dense tensors * (one node per document). @@ -34,6 +36,7 @@ public: void free_ids(uint32_t docid) { (void) docid; } void assign_generation(generation_t current_gen) { (void) current_gen; }; void reclaim_memory(generation_t oldest_used_gen) { (void) oldest_used_gen; }; + void on_load(vespalib::ConstArrayRef nodes) { (void) nodes; } vespalib::MemoryUsage memory_usage() const { return vespalib::MemoryUsage(); } }; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index a583f6f885c..bf38dc01f37 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -67,6 +67,7 @@ public: } } + static constexpr HnswIndexType index_type = type; using IdMapping = typename HnswIndexTraits::IdMapping; protected: using GraphType = HnswGraph; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h index fa3286420a4..2e14f363bba 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h @@ -19,7 +19,7 @@ class HnswNode { vespalib::datastore::AtomicValueWrapper _subspace; public: - HnswNode() + HnswNode() noexcept : _ref(), _docid(), _subspace() diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp index c16024443ca..a801e826a7d 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "hnsw_nodeid_mapping.h" +#include "hnsw_node.h" #include #include #include @@ -115,6 +116,76 @@ HnswNodeidMapping::reclaim_memory(generation_t oldest_used_gen) }); } +void +HnswNodeidMapping::on_load(vespalib::ConstArrayRef nodes) +{ + if (nodes.empty()) { + return; + } + // Check that reserved nodeid is not used + assert(!nodes[0].ref().load_relaxed().valid()); + // Detect histogram size + uint32_t max_docid = 0; + for (auto& node : nodes) { + if (node.ref().load_relaxed().valid()) { + max_docid = std::max(node.acquire_docid(), max_docid); + } + } + // Make histogram + std::vector histogram(max_docid + 1); + for (auto& node : nodes) { + if (node.ref().load_relaxed().valid()) { + auto docid = node.acquire_docid(); + auto subspace = node.acquire_subspace(); + auto &num_subspaces = histogram[docid]; + num_subspaces = std::max(num_subspaces, subspace + 1); + } + } + assert(histogram[0] == 0); + // Allocate mapping from docid to nodeids + ensure_refs_size(max_docid); + uint32_t docid = 0; + for (auto subspaces : histogram) { + if (subspaces > 0) { + auto ref = _nodeids.allocate(subspaces); + _refs[docid] = ref; + auto nodeids = _nodeids.get_writable(ref); + for (auto& nodeid : nodeids) { + nodeid = 0; + } + } + ++docid; + } + { + // Populate mapping from docid to nodeids and free list + uint32_t nodeid = 0; + for (auto& node : nodes) { + if (node.ref().load_relaxed().valid()) { + docid = node.acquire_docid(); + auto subspace = node.acquire_subspace(); + auto nodeids = _nodeids.get_writable(_refs[docid]); + assert(subspace < nodeids.size()); + assert(nodeids[subspace] == 0); + nodeids[subspace] = nodeid; + } else if (nodeid > 0) { + _free_list.push_back(nodeid); + } + ++nodeid; + } + } + // All subspaces for a docid needs to have a nodeid + for (docid = 0; docid <= max_docid; ++docid) { + auto ref = _refs[docid]; + if (ref.valid()) { + auto nodeids = _nodeids.get_writable(ref); + for (auto nodeid : nodeids) { + assert(nodeid != 0); + } + } + } + std::reverse(_free_list.begin(), _free_list.end()); +} + namespace { vespalib::MemoryUsage diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h index 6ccc62aa0bb..8abf6321832 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h @@ -13,6 +13,8 @@ namespace search::tensor { +class HnswNode; + /** * Class used to keep track of the mapping from docid to array of nodeids. * A nodeid is an identifier for a node in the HNSW graph that represents a single vector. @@ -49,6 +51,7 @@ public: void assign_generation(generation_t current_gen); void reclaim_memory(generation_t oldest_used_gen); + void on_load(vespalib::ConstArrayRef nodes); // TODO: Add support for compaction vespalib::MemoryUsage memory_usage() const; }; -- cgit v1.2.3 From 3e6deec22291d2e4304be1e17547e9e147084340 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Mon, 21 Nov 2022 17:02:27 +0100 Subject: Factor out portions of HnswNodeidMapping::on_load() to private member functions. --- .../vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp | 85 ++++++++++++++-------- .../vespa/searchlib/tensor/hnsw_nodeid_mapping.h | 5 ++ 2 files changed, 61 insertions(+), 29 deletions(-) (limited to 'searchlib') diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp index a801e826a7d..2a8453d35ac 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp @@ -116,23 +116,23 @@ HnswNodeidMapping::reclaim_memory(generation_t oldest_used_gen) }); } -void -HnswNodeidMapping::on_load(vespalib::ConstArrayRef nodes) +uint32_t +HnswNodeidMapping::get_docid_limit(vespalib::ConstArrayRef nodes) { - if (nodes.empty()) { - return; - } - // Check that reserved nodeid is not used - assert(!nodes[0].ref().load_relaxed().valid()); - // Detect histogram size uint32_t max_docid = 0; for (auto& node : nodes) { if (node.ref().load_relaxed().valid()) { max_docid = std::max(node.acquire_docid(), max_docid); } } + return max_docid + 1; +} + +std::vector +HnswNodeidMapping::make_subspaces_histogram(vespalib::ConstArrayRef nodes, uint32_t docid_limit) +{ // Make histogram - std::vector histogram(max_docid + 1); + std::vector histogram(docid_limit); for (auto& node : nodes) { if (node.ref().load_relaxed().valid()) { auto docid = node.acquire_docid(); @@ -142,8 +142,14 @@ HnswNodeidMapping::on_load(vespalib::ConstArrayRef nodes) } } assert(histogram[0] == 0); - // Allocate mapping from docid to nodeids - ensure_refs_size(max_docid); + return histogram; +} + + +void +HnswNodeidMapping::allocate_docid_to_nodeids_mapping(std::vector histogram) +{ + ensure_refs_size(histogram.size() - 1); uint32_t docid = 0; for (auto subspaces : histogram) { if (subspaces > 0) { @@ -156,25 +162,32 @@ HnswNodeidMapping::on_load(vespalib::ConstArrayRef nodes) } ++docid; } - { - // Populate mapping from docid to nodeids and free list - uint32_t nodeid = 0; - for (auto& node : nodes) { - if (node.ref().load_relaxed().valid()) { - docid = node.acquire_docid(); - auto subspace = node.acquire_subspace(); - auto nodeids = _nodeids.get_writable(_refs[docid]); - assert(subspace < nodeids.size()); - assert(nodeids[subspace] == 0); - nodeids[subspace] = nodeid; - } else if (nodeid > 0) { - _free_list.push_back(nodeid); - } - ++nodeid; +} + +void +HnswNodeidMapping::populate_docid_to_nodeids_mapping_and_free_list(vespalib::ConstArrayRef nodes) +{ + uint32_t nodeid = 0; + for (auto& node : nodes) { + if (node.ref().load_relaxed().valid()) { + auto docid = node.acquire_docid(); + auto subspace = node.acquire_subspace(); + auto nodeids = _nodeids.get_writable(_refs[docid]); + assert(subspace < nodeids.size()); + assert(nodeids[subspace] == 0); + nodeids[subspace] = nodeid; + } else if (nodeid > 0) { + _free_list.push_back(nodeid); } + ++nodeid; } - // All subspaces for a docid needs to have a nodeid - for (docid = 0; docid <= max_docid; ++docid) { + std::reverse(_free_list.begin(), _free_list.end()); +} + +void +HnswNodeidMapping::assert_all_subspaces_have_valid_nodeid(uint32_t docid_limit) +{ + for (uint32_t docid = 0; docid < docid_limit; ++docid) { auto ref = _refs[docid]; if (ref.valid()) { auto nodeids = _nodeids.get_writable(ref); @@ -183,7 +196,21 @@ HnswNodeidMapping::on_load(vespalib::ConstArrayRef nodes) } } } - std::reverse(_free_list.begin(), _free_list.end()); +} + +void +HnswNodeidMapping::on_load(vespalib::ConstArrayRef nodes) +{ + if (nodes.empty()) { + return; + } + // Check that reserved nodeid is not used + assert(!nodes[0].ref().load_relaxed().valid()); + auto docid_limit = get_docid_limit(nodes); + auto histogram = make_subspaces_histogram(nodes, docid_limit); // Allocate mapping from docid to nodeids + allocate_docid_to_nodeids_mapping(std::move(histogram)); + populate_docid_to_nodeids_mapping_and_free_list(nodes); + assert_all_subspaces_have_valid_nodeid(docid_limit); } namespace { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h index 8abf6321832..153c0faebd7 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h @@ -41,6 +41,11 @@ private: void ensure_refs_size(uint32_t docid); uint32_t allocate_id(); + uint32_t get_docid_limit(vespalib::ConstArrayRef nodes); + std::vector make_subspaces_histogram(vespalib::ConstArrayRef nodes, uint32_t docid_limit); + void allocate_docid_to_nodeids_mapping(std::vector histogram); + void populate_docid_to_nodeids_mapping_and_free_list(vespalib::ConstArrayRef nodes); + void assert_all_subspaces_have_valid_nodeid(uint32_t docid_limit); public: HnswNodeidMapping(); -- cgit v1.2.3 From 25d902a8d776ff6e67ae15b188cbc96ca57c88e4 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 22 Nov 2022 11:20:36 +0100 Subject: Move shared test code to test library. --- .../src/tests/tensor/hnsw_index/CMakeLists.txt | 1 + .../tests/tensor/hnsw_index/hnsw_index_test.cpp | 38 +++------------------- .../src/tests/tensor/hnsw_saver/CMakeLists.txt | 1 + .../tensor/hnsw_saver/hnsw_save_load_test.cpp | 38 +++------------------- searchlib/src/vespa/searchlib/test/CMakeLists.txt | 1 + .../vespa/searchlib/test/vector_buffer_reader.h | 29 +++++++++++++++++ .../vespa/searchlib/test/vector_buffer_writer.cpp | 24 ++++++++++++++ .../vespa/searchlib/test/vector_buffer_writer.h | 21 ++++++++++++ 8 files changed, 85 insertions(+), 68 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/test/vector_buffer_reader.h create mode 100644 searchlib/src/vespa/searchlib/test/vector_buffer_writer.cpp create mode 100644 searchlib/src/vespa/searchlib/test/vector_buffer_writer.h (limited to 'searchlib') diff --git a/searchlib/src/tests/tensor/hnsw_index/CMakeLists.txt b/searchlib/src/tests/tensor/hnsw_index/CMakeLists.txt index a65d7071b5e..aec1d742700 100644 --- a/searchlib/src/tests/tensor/hnsw_index/CMakeLists.txt +++ b/searchlib/src/tests/tensor/hnsw_index/CMakeLists.txt @@ -3,6 +3,7 @@ vespa_add_executable(searchlib_hnsw_index_test_app TEST SOURCES hnsw_index_test.cpp DEPENDS + searchlib_test searchlib GTest::GTest ) diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index b86913caa16..13caad8b6d6 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -2,6 +2,8 @@ #include #include +#include +#include #include #include #include @@ -11,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -36,39 +37,8 @@ using vespalib::eval::ValueType; using vespalib::datastore::CompactionSpec; using vespalib::datastore::CompactionStrategy; using search::queryeval::GlobalFilter; - -class VectorBufferWriter : public BufferWriter { -private: - char tmp[1024]; -public: - std::vector output; - VectorBufferWriter() { - setup(tmp, 1024); - } - ~VectorBufferWriter() {} - void flush() override { - for (size_t i = 0; i < usedLen(); ++i) { - output.push_back(tmp[i]); - } - rewind(); - } -}; - -class VectorBufferReader { -private: - const std::vector& _data; - size_t _pos; - -public: - VectorBufferReader(const std::vector& data) : _data(data), _pos(0) {} - uint32_t readHostOrder() { - uint32_t result = 0; - assert(_pos + sizeof(uint32_t) <= _data.size()); - std::memcpy(&result, _data.data() + _pos, sizeof(uint32_t)); - _pos += sizeof(uint32_t); - return result; - } -}; +using search::test::VectorBufferReader; +using search::test::VectorBufferWriter; template class MyDocVectorAccess : public DocVectorAccess { diff --git a/searchlib/src/tests/tensor/hnsw_saver/CMakeLists.txt b/searchlib/src/tests/tensor/hnsw_saver/CMakeLists.txt index e603f890ac9..3e2be2dbb58 100644 --- a/searchlib/src/tests/tensor/hnsw_saver/CMakeLists.txt +++ b/searchlib/src/tests/tensor/hnsw_saver/CMakeLists.txt @@ -3,6 +3,7 @@ vespa_add_executable(searchlib_hnsw_save_load_test_app TEST SOURCES hnsw_save_load_test.cpp DEPENDS + searchlib_test searchlib GTest::GTest ) diff --git a/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp b/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp index e2a96ec059c..7495fa18c4d 100644 --- a/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp @@ -3,7 +3,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -14,39 +15,8 @@ LOG_SETUP("hnsw_save_load_test"); using namespace search::tensor; using search::BufferWriter; using search::fileutil::LoadedBuffer; - -class VectorBufferWriter : public BufferWriter { -private: - char tmp[1024]; -public: - std::vector output; - VectorBufferWriter() { - setup(tmp, 1024); - } - ~VectorBufferWriter() {} - void flush() override { - for (size_t i = 0; i < usedLen(); ++i) { - output.push_back(tmp[i]); - } - rewind(); - } -}; - -class VectorBufferReader { -private: - const std::vector& _data; - size_t _pos; - -public: - VectorBufferReader(const std::vector& data) : _data(data), _pos(0) {} - uint32_t readHostOrder() { - uint32_t result = 0; - assert(_pos + sizeof(uint32_t) <= _data.size()); - std::memcpy(&result, _data.data() + _pos, sizeof(uint32_t)); - _pos += sizeof(uint32_t); - return result; - } -}; +using search::test::VectorBufferReader; +using search::test::VectorBufferWriter; using V = std::vector; diff --git a/searchlib/src/vespa/searchlib/test/CMakeLists.txt b/searchlib/src/vespa/searchlib/test/CMakeLists.txt index 7decdb992e6..ac8bcb240e0 100644 --- a/searchlib/src/vespa/searchlib/test/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/test/CMakeLists.txt @@ -12,6 +12,7 @@ vespa_add_library(searchlib_test searchiteratorverifier.cpp schema_builder.cpp string_field_builder.cpp + vector_buffer_writer.cpp $ $ $ diff --git a/searchlib/src/vespa/searchlib/test/vector_buffer_reader.h b/searchlib/src/vespa/searchlib/test/vector_buffer_reader.h new file mode 100644 index 00000000000..b608e0c7259 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/vector_buffer_reader.h @@ -0,0 +1,29 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include +#include +#include +#include + +namespace search::test +{ + +class VectorBufferReader { +private: + const std::vector& _data; + size_t _pos; + +public: + VectorBufferReader(const std::vector& data) : _data(data), _pos(0) {} + uint32_t readHostOrder() { + uint32_t result = 0; + assert(_pos + sizeof(uint32_t) <= _data.size()); + std::memcpy(&result, _data.data() + _pos, sizeof(uint32_t)); + _pos += sizeof(uint32_t); + return result; + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/test/vector_buffer_writer.cpp b/searchlib/src/vespa/searchlib/test/vector_buffer_writer.cpp new file mode 100644 index 00000000000..59c42840c37 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/vector_buffer_writer.cpp @@ -0,0 +1,24 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "vector_buffer_writer.h" + +namespace search::test { + +VectorBufferWriter::VectorBufferWriter() + : BufferWriter() +{ + setup(tmp, 1024); +} + +VectorBufferWriter::~VectorBufferWriter() = default; + +void +VectorBufferWriter::flush() +{ + for (size_t i = 0; i < usedLen(); ++i) { + output.push_back(tmp[i]); + } + rewind(); +} + +} diff --git a/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h b/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h new file mode 100644 index 00000000000..22089e6d510 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h @@ -0,0 +1,21 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include +#include + +namespace search::test +{ + +class VectorBufferWriter : public BufferWriter { +private: + char tmp[1024]; +public: + std::vector output; + VectorBufferWriter(); + ~VectorBufferWriter(); + void flush() override; +}; + +} -- cgit v1.2.3 From 85d1e066e03030de91558746864dfd84ecd2f1bc Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 22 Nov 2022 12:03:59 +0100 Subject: Add override specifier for VectorBufferWriter destructor. --- searchlib/src/vespa/searchlib/test/vector_buffer_writer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'searchlib') diff --git a/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h b/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h index 22089e6d510..e3d88cb507d 100644 --- a/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h +++ b/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h @@ -14,7 +14,7 @@ private: public: std::vector output; VectorBufferWriter(); - ~VectorBufferWriter(); + ~VectorBufferWriter() override; void flush() override; }; -- cgit v1.2.3 From ab79bbac295e6f9d3fa9d6d069c6163f2085e05a Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 22 Nov 2022 13:20:02 +0100 Subject: Pass id mapper to HnswIndexLoader. --- .../tests/tensor/hnsw_index/hnsw_index_test.cpp | 5 ++--- .../tensor/hnsw_saver/hnsw_save_load_test.cpp | 26 ++++++++++++++++++---- .../src/vespa/searchlib/tensor/hnsw_index.cpp | 2 +- .../src/vespa/searchlib/tensor/hnsw_index_loader.h | 6 ++++- .../vespa/searchlib/tensor/hnsw_index_loader.hpp | 6 +++-- 5 files changed, 34 insertions(+), 11 deletions(-) (limited to 'searchlib') diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 13caad8b6d6..bea371c78a8 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -234,10 +234,9 @@ public: void load_index(std::vector data) { auto& graph = index->get_graph(); - HnswIndexLoader loader(graph, std::make_unique(data)); - while (loader.load_next()) {} auto& id_mapping = index->get_id_mapping(); - id_mapping.on_load(graph.node_refs.make_read_view(graph.node_refs.size())); + HnswIndexLoader loader(graph, id_mapping, std::make_unique(data)); + while (loader.load_next()) {} } static constexpr bool is_single = std::is_same_v>; diff --git a/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp b/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp index 7495fa18c4d..bf4abdd7cf8 100644 --- a/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_saver/hnsw_save_load_test.cpp @@ -1,8 +1,11 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include +#include #include #include +#include +#include #include #include #include @@ -32,7 +35,14 @@ uint32_t fake_docid(uint32_t nodeid) template <> uint32_t fake_docid(uint32_t nodeid) { - return nodeid + 100; + switch (nodeid) { + case 5: + return 104; + case 6: + return 104; + default: + return nodeid + 100; + } } template @@ -47,7 +57,14 @@ uint32_t fake_subspace(uint32_t) template <> uint32_t fake_subspace(uint32_t nodeid) { - return nodeid + 10; + switch (nodeid) { + case 5: + return 2; + case 6: + return 1; + default: + return 0; + } } template @@ -69,7 +86,7 @@ template void populate(HnswGraph &graph) { // no 0 graph.make_node(1, fake_docid(1), fake_subspace(1), 1); - auto er = graph.make_node(2, 102, 12, 2); + auto er = graph.make_node(2, fake_docid(2), fake_subspace(2), 2); // no 3 graph.make_node(4, fake_docid(4), fake_subspace(4), 2); graph.make_node(5, fake_docid(5), fake_subspace(5), 0); @@ -137,7 +154,8 @@ public: return vector_writer.output; } void load_copy(std::vector data) { - HnswIndexLoader loader(copy, std::make_unique(data)); + typename HnswIndexTraits::IdMapping id_mapping; + HnswIndexLoader loader(copy, id_mapping, std::make_unique(data)); while (loader.load_next()) {} } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index ce9f1ad9ca7..e9e52301f8e 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -777,7 +777,7 @@ HnswIndex::make_loader(FastOS_FileInterface& file) assert(get_entry_nodeid() == 0); // cannot load after index has data using ReaderType = FileReader; using LoaderType = HnswIndexLoader; - return std::make_unique(_graph, std::make_unique(&file)); + return std::make_unique(_graph, _id_mapping, std::make_unique(&file)); } struct NeighborsByDocId { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.h index efe15011776..721276ef0ab 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.h @@ -3,6 +3,7 @@ #pragma once #include "nearest_neighbor_index_loader.h" +#include "hnsw_index_traits.h" #include #include #include @@ -21,6 +22,8 @@ struct HnswGraph; template class HnswIndexLoader : public NearestNeighborIndexLoader { private: + using IdMapping = typename HnswIndexTraits::IdMapping; + HnswGraph& _graph; std::unique_ptr _reader; uint32_t _entry_nodeid; @@ -29,6 +32,7 @@ private: uint32_t _nodeid; std::vector _link_array; bool _complete; + IdMapping& _id_mapping; void init(); uint32_t next_int() { @@ -36,7 +40,7 @@ private: } public: - HnswIndexLoader(HnswGraph& graph, std::unique_ptr reader); + HnswIndexLoader(HnswGraph& graph, IdMapping& id_mapping, std::unique_ptr reader); virtual ~HnswIndexLoader(); bool load_next() override; }; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp index 04e1fcc1792..de9cc760fec 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_loader.hpp @@ -22,7 +22,7 @@ template HnswIndexLoader::~HnswIndexLoader() = default; template -HnswIndexLoader::HnswIndexLoader(HnswGraph& graph, std::unique_ptr reader) +HnswIndexLoader::HnswIndexLoader(HnswGraph& graph, IdMapping& id_mapping, std::unique_ptr reader) : _graph(graph), _reader(std::move(reader)), _entry_nodeid(0), @@ -30,7 +30,8 @@ HnswIndexLoader::HnswIndexLoader(HnswGraph& graph, std:: _num_nodes(0), _nodeid(0), _link_array(), - _complete(false) + _complete(false), + _id_mapping(id_mapping) { init(); } @@ -65,6 +66,7 @@ HnswIndexLoader::load_next() _graph.trim_node_refs_size(); auto entry_node_ref = _graph.get_node_ref(_entry_nodeid); _graph.set_entry_node({_entry_nodeid, entry_node_ref, _entry_level}); + _id_mapping.on_load(_graph.node_refs.make_read_view(_graph.node_refs.size())); _complete = true; return false; } -- cgit v1.2.3 From 0b78cd0b88bb85b76cbac495b67c8a039bbc3125 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 22 Nov 2022 14:12:35 +0100 Subject: Move get_docid_limit and make_subspaces_histogram member functions to an anonymous namespace. --- searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp | 7 +++++-- searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'searchlib') diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp index 2a8453d35ac..494bad79b8a 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.cpp @@ -116,8 +116,10 @@ HnswNodeidMapping::reclaim_memory(generation_t oldest_used_gen) }); } +namespace { + uint32_t -HnswNodeidMapping::get_docid_limit(vespalib::ConstArrayRef nodes) +get_docid_limit(vespalib::ConstArrayRef nodes) { uint32_t max_docid = 0; for (auto& node : nodes) { @@ -129,7 +131,7 @@ HnswNodeidMapping::get_docid_limit(vespalib::ConstArrayRef nodes) } std::vector -HnswNodeidMapping::make_subspaces_histogram(vespalib::ConstArrayRef nodes, uint32_t docid_limit) +make_subspaces_histogram(vespalib::ConstArrayRef nodes, uint32_t docid_limit) { // Make histogram std::vector histogram(docid_limit); @@ -145,6 +147,7 @@ HnswNodeidMapping::make_subspaces_histogram(vespalib::ConstArrayRef no return histogram; } +} void HnswNodeidMapping::allocate_docid_to_nodeids_mapping(std::vector histogram) diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h index 153c0faebd7..67213813c11 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_nodeid_mapping.h @@ -41,8 +41,6 @@ private: void ensure_refs_size(uint32_t docid); uint32_t allocate_id(); - uint32_t get_docid_limit(vespalib::ConstArrayRef nodes); - std::vector make_subspaces_histogram(vespalib::ConstArrayRef nodes, uint32_t docid_limit); void allocate_docid_to_nodeids_mapping(std::vector histogram); void populate_docid_to_nodeids_mapping_and_free_list(vespalib::ConstArrayRef nodes); void assert_all_subspaces_have_valid_nodeid(uint32_t docid_limit); -- cgit v1.2.3 From a96c097160f6a714c1851e4dd6141ab911c4a269 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 22 Nov 2022 14:13:33 +0100 Subject: Add class comments. --- searchlib/src/vespa/searchlib/test/vector_buffer_reader.h | 4 ++++ searchlib/src/vespa/searchlib/test/vector_buffer_writer.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'searchlib') diff --git a/searchlib/src/vespa/searchlib/test/vector_buffer_reader.h b/searchlib/src/vespa/searchlib/test/vector_buffer_reader.h index b608e0c7259..d9b1353270b 100644 --- a/searchlib/src/vespa/searchlib/test/vector_buffer_reader.h +++ b/searchlib/src/vespa/searchlib/test/vector_buffer_reader.h @@ -10,6 +10,10 @@ namespace search::test { +/* + * Class used by hnsw graph/index unit tests to load hnsw index from a + * vector. + */ class VectorBufferReader { private: const std::vector& _data; diff --git a/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h b/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h index e3d88cb507d..244c675c567 100644 --- a/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h +++ b/searchlib/src/vespa/searchlib/test/vector_buffer_writer.h @@ -8,6 +8,10 @@ namespace search::test { +/* + * Class used by hnsw graph/index unit tests to save hnsw index to a + * vector. + */ class VectorBufferWriter : public BufferWriter { private: char tmp[1024]; -- cgit v1.2.3