diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-02-06 10:15:51 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2020-02-06 10:15:51 +0000 |
commit | 96711356c10d90670bc71ccbfc82dd71d8b5b6c1 (patch) | |
tree | 6943257b2f14616cbcfee26a6a4e346fee243780 | |
parent | 6ccedd9cf7116e4b5b652bb72a9466ff3641d76a (diff) |
Add support for adding a new document in multiple levels in the graph.
6 files changed, 118 insertions, 38 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 4ff1aa95862..76548a025a0 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -34,8 +34,10 @@ public: } }; -class LevelZeroGenerator : public RandomLevelGenerator { - uint32_t max_level() override { return 0; } +struct LevelGenerator : public RandomLevelGenerator { + uint32_t level; + LevelGenerator() : level(0) {} + uint32_t max_level() override { return level; } }; using FloatVectors = MyDocVectorAccess<float>; @@ -45,7 +47,7 @@ using FloatIndexUP = std::unique_ptr<FloatIndex>; class HnswIndexTest : public ::testing::Test { public: FloatVectors vectors; - LevelZeroGenerator level_generator; + LevelGenerator level_generator; FloatIndexUP index; HnswIndexTest() @@ -59,13 +61,26 @@ public: } void init(bool heuristic_select_neighbors) { index = std::make_unique<FloatIndex>(vectors, level_generator, - HnswIndexBase::Config(2, 0, 10, heuristic_select_neighbors)); + HnswIndexBase::Config(2, 1, 10, heuristic_select_neighbors)); + } + void add_document(uint32_t docid, uint32_t max_level = 0) { + level_generator.level = max_level; + index->add_document(docid); + } + void expect_entry_point(uint32_t exp_docid, uint32_t exp_level) { + EXPECT_EQ(exp_docid, index->get_entry_docid()); + EXPECT_EQ(exp_level, index->get_entry_level()); } void expect_level_0(uint32_t docid, const HnswNode::LinkArray& exp_links) { auto node = index->get_node(docid); ASSERT_EQ(1, node.size()); EXPECT_EQ(exp_links, node.level(0)); } + void expect_levels(uint32_t docid, const HnswNode::LevelArray& exp_levels) { + auto act_node = index->get_node(docid); + ASSERT_EQ(exp_levels.size(), act_node.size()); + EXPECT_EQ(exp_levels, act_node.levels()); + } }; @@ -73,32 +88,32 @@ TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_simple_select_ne { init(false); - index->add_document(1); + add_document(1); expect_level_0(1, {}); - index->add_document(2); + add_document(2); expect_level_0(1, {2}); expect_level_0(2, {1}); - index->add_document(3); + add_document(3); expect_level_0(1, {2, 3}); expect_level_0(2, {1, 3}); expect_level_0(3, {1, 2}); - index->add_document(4); + add_document(4); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3}); expect_level_0(3, {1, 2, 4}); expect_level_0(4, {1, 3}); - index->add_document(5); + add_document(5); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3, 5}); expect_level_0(3, {1, 2, 4, 5}); expect_level_0(4, {1, 3}); expect_level_0(5, {2, 3}); - index->add_document(6); + add_document(6); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3, 5, 6}); expect_level_0(3, {1, 2, 4, 5}); @@ -106,7 +121,7 @@ TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_simple_select_ne expect_level_0(5, {2, 3, 6}); expect_level_0(6, {2, 5}); - index->add_document(7); + add_document(7); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3, 5, 6, 7}); expect_level_0(3, {1, 2, 4, 5, 7}); @@ -116,60 +131,69 @@ TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_simple_select_ne expect_level_0(7, {2, 3}); } -TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_heuristic_select_neighbors) +TEST_F(HnswIndexTest, 2d_vectors_inserted_in_hierarchic_graph_with_heuristic_select_neighbors) { init(true); - index->add_document(1); + add_document(1); + expect_entry_point(1, 0); expect_level_0(1, {}); - index->add_document(2); + add_document(2); + expect_entry_point(1, 0); expect_level_0(1, {2}); expect_level_0(2, {1}); - index->add_document(3); + // Doc 3 is also added to level 1 + add_document(3, 1); + expect_entry_point(3, 1); expect_level_0(1, {2, 3}); expect_level_0(2, {1, 3}); - expect_level_0(3, {1, 2}); + expect_levels(3, {{1, 2}, {}}); // Doc 4 is closest to 1 and they are linked. // Doc 4 is NOT linked to 3 as the distance between 4 and 3 is greater than the distance between 3 and 1. // Doc 3 is therefore reachable via 1. Same argument for why doc 4 is not linked to 2. - index->add_document(4); + add_document(4); + expect_entry_point(3, 1); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3}); - expect_level_0(3, {1, 2}); + expect_levels(3, {{1, 2}, {}}); expect_level_0(4, {1}); // Doc 5 is closest to 2 and they are linked. // The other docs are reachable via 2, and no other links are created. Same argument as with doc 4 above. - index->add_document(5); + add_document(5); + expect_entry_point(3, 1); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3, 5}); - expect_level_0(3, {1, 2}); + expect_levels(3, {{1, 2}, {}}); expect_level_0(4, {1}); expect_level_0(5, {2}); // Doc 6 is closest to 5 and they are linked. // Doc 6 is also linked to 2 as the distance between 6 and 2 is less than the distance between 2 and 5. - index->add_document(6); + // Doc 6 is also added to level 1 and 2, and linked to doc 3 in level 1. + add_document(6, 2); + expect_entry_point(6, 2); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3, 5, 6}); - expect_level_0(3, {1, 2}); + expect_levels(3, {{1, 2}, {6}}); expect_level_0(4, {1}); expect_level_0(5, {2, 6}); - expect_level_0(6, {2, 5}); + expect_levels(6, {{2, 5}, {3}, {}}); // Doc 7 is closest to 3 and they are linked. // Doc 7 is also linked to 6 as the distance between 7 and 6 is less than the distance between 6 and 3. // Docs 1, 2, 4 are reachable via 3. - index->add_document(7); + add_document(7); + expect_entry_point(6, 2); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3, 5, 6}); - expect_level_0(3, {1, 2, 7}); + expect_levels(3, {{1, 2, 7}, {6}}); expect_level_0(4, {1}); expect_level_0(5, {2, 6}); - expect_level_0(6, {2, 5, 7}); + expect_levels(6, {{2, 5, 7}, {3}, {}}); expect_level_0(7, {3, 6}); } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 107a0ef459a..19cf88f719e 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -29,6 +29,25 @@ HnswIndex<FloatType>::calc_distance(const Vector& lhs, uint32_t rhs_docid) const } template <typename FloatType> +HnswCandidate +HnswIndex<FloatType>::find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level) +{ + HnswCandidate nearest = entry_point; + bool keep_searching = true; + while (keep_searching) { + keep_searching = false; + for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) { + double dist = calc_distance(input, neighbor_docid); + if (dist < nearest.distance) { + nearest = HnswCandidate(neighbor_docid, dist); + keep_searching = true; + } + } + } + return nearest; +} + +template <typename FloatType> void HnswIndex<FloatType>::search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) { @@ -82,20 +101,38 @@ HnswIndex<FloatType>::add_document(uint32_t docid) _node_refs.ensure_size(docid + 1, EntryRef()); // A document cannot be added twice. assert(!_node_refs[docid].valid()); - make_node_for_document(docid); + int level = make_node_for_document(docid); if (_entry_docid == 0) { _entry_docid = docid; + _entry_level = level; return; } + + int search_level = _entry_level; double entry_dist = calc_distance(input, _entry_docid); + HnswCandidate entry_point(_entry_docid, entry_dist); + while (search_level > level) { + entry_point = find_nearest_in_layer(input, entry_point, search_level); + --search_level; + } + FurthestPriQ best_neighbors; - best_neighbors.emplace(_entry_docid, entry_dist); - // TODO: Add support for multiple levels. - // TODO: Rename to search_level? - search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, 0); - auto neighbors = select_neighbors(best_neighbors.peek(), _cfg.max_links_at_level_0()); - connect_new_node(docid, neighbors, 0); - // TODO: Shrink neighbors if needed + best_neighbors.push(entry_point); + search_level = std::min(level, static_cast<int>(_entry_level)); + + // Insert the added document in each level it should exist in. + while (search_level >= 0) { + // TODO: Rename to search_level? + search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level); + auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level)); + connect_new_node(docid, neighbors, search_level); + // TODO: Shrink neighbors if needed + --search_level; + } + if (static_cast<uint32_t>(level) > _entry_level) { + _entry_docid = docid; + _entry_level = level; + } } template <typename FloatType> diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index b6f063b1451..be6f507dbe1 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -33,6 +33,10 @@ private: double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override; double calc_distance(const Vector& lhs, uint32_t rhs_docid) const; + /** + * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. + */ + HnswCandidate find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level); void search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); public: diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp index 4406428bc2a..518ccaa0ed8 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp @@ -32,16 +32,24 @@ HnswIndexBase::make_default_link_store_config() small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); } -void +uint32_t +HnswIndexBase::max_links_for_level(uint32_t level) const +{ + return (level == 0) ? _cfg.max_links_at_level_0() : _cfg.max_links_at_hierarchic_levels(); +} + +uint32_t HnswIndexBase::make_node_for_document(uint32_t docid) { + uint32_t max_level = _level_generator.max_level(); // TODO: Add capping on num_levels - uint32_t num_levels = _level_generator.max_level() + 1; + uint32_t num_levels = max_level + 1; // Note: The level array instance lives as long as the document is present in the index. LevelArray levels(num_levels, EntryRef()); auto node_ref = _nodes.add(levels); // TODO: Add memory barrier? _node_refs[docid] = node_ref; + return max_level; } HnswIndexBase::LevelArrayRef @@ -147,7 +155,8 @@ HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerato _node_refs(), _nodes(make_default_node_store_config()), _links(make_default_link_store_config()), - _entry_docid(0) + _entry_docid(0), // Note that docid 0 is reserved and never used + _entry_level(0) { } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h index 2ea47c73529..4f5631426e0 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h @@ -84,11 +84,13 @@ protected: NodeStore _nodes; LinkStore _links; uint32_t _entry_docid; + uint32_t _entry_level; static search::datastore::ArrayStoreConfig make_default_node_store_config(); static search::datastore::ArrayStoreConfig make_default_link_store_config(); - void make_node_for_document(uint32_t docid); + uint32_t max_links_for_level(uint32_t level) const; + uint32_t make_node_for_document(uint32_t docid); LevelArrayRef get_level_array(uint32_t docid) const; LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const; void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links); @@ -114,6 +116,9 @@ public: // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) + uint32_t get_entry_docid() const { return _entry_docid; } + uint32_t get_entry_level() const { return _entry_level; } + // Should only be used by unit tests. HnswNode get_node(uint32_t docid) const; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h index c7e719e2985..8a6187ffbcc 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h @@ -24,6 +24,7 @@ public: HnswNode(const LevelArray& levels_in) : _levels(levels_in) {} bool empty() const { return _levels.empty(); } size_t size() const { return _levels.size(); } + const LevelArray& levels() const { return _levels; } const LinkArray& level(size_t idx) const { return _levels[idx]; } bool operator==(const HnswNode& rhs) { return _levels == rhs._levels; |