summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2020-02-06 10:15:51 +0000
committerGeir Storli <geirst@verizonmedia.com>2020-02-06 10:15:51 +0000
commit96711356c10d90670bc71ccbfc82dd71d8b5b6c1 (patch)
tree6943257b2f14616cbcfee26a6a4e346fee243780
parent6ccedd9cf7116e4b5b652bb72a9466ff3641d76a (diff)
Add support for adding a new document in multiple levels in the graph.
-rw-r--r--searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp76
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp53
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h4
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp15
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h7
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_node.h1
6 files changed, 118 insertions, 38 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
index 4ff1aa95862..76548a025a0 100644
--- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
@@ -34,8 +34,10 @@ public:
}
};
-class LevelZeroGenerator : public RandomLevelGenerator {
- uint32_t max_level() override { return 0; }
+struct LevelGenerator : public RandomLevelGenerator {
+ uint32_t level;
+ LevelGenerator() : level(0) {}
+ uint32_t max_level() override { return level; }
};
using FloatVectors = MyDocVectorAccess<float>;
@@ -45,7 +47,7 @@ using FloatIndexUP = std::unique_ptr<FloatIndex>;
class HnswIndexTest : public ::testing::Test {
public:
FloatVectors vectors;
- LevelZeroGenerator level_generator;
+ LevelGenerator level_generator;
FloatIndexUP index;
HnswIndexTest()
@@ -59,13 +61,26 @@ public:
}
void init(bool heuristic_select_neighbors) {
index = std::make_unique<FloatIndex>(vectors, level_generator,
- HnswIndexBase::Config(2, 0, 10, heuristic_select_neighbors));
+ HnswIndexBase::Config(2, 1, 10, heuristic_select_neighbors));
+ }
+ void add_document(uint32_t docid, uint32_t max_level = 0) {
+ level_generator.level = max_level;
+ index->add_document(docid);
+ }
+ void expect_entry_point(uint32_t exp_docid, uint32_t exp_level) {
+ EXPECT_EQ(exp_docid, index->get_entry_docid());
+ EXPECT_EQ(exp_level, index->get_entry_level());
}
void expect_level_0(uint32_t docid, const HnswNode::LinkArray& exp_links) {
auto node = index->get_node(docid);
ASSERT_EQ(1, node.size());
EXPECT_EQ(exp_links, node.level(0));
}
+ void expect_levels(uint32_t docid, const HnswNode::LevelArray& exp_levels) {
+ auto act_node = index->get_node(docid);
+ ASSERT_EQ(exp_levels.size(), act_node.size());
+ EXPECT_EQ(exp_levels, act_node.levels());
+ }
};
@@ -73,32 +88,32 @@ TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_simple_select_ne
{
init(false);
- index->add_document(1);
+ add_document(1);
expect_level_0(1, {});
- index->add_document(2);
+ add_document(2);
expect_level_0(1, {2});
expect_level_0(2, {1});
- index->add_document(3);
+ add_document(3);
expect_level_0(1, {2, 3});
expect_level_0(2, {1, 3});
expect_level_0(3, {1, 2});
- index->add_document(4);
+ add_document(4);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3});
expect_level_0(3, {1, 2, 4});
expect_level_0(4, {1, 3});
- index->add_document(5);
+ add_document(5);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3, 5});
expect_level_0(3, {1, 2, 4, 5});
expect_level_0(4, {1, 3});
expect_level_0(5, {2, 3});
- index->add_document(6);
+ add_document(6);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3, 5, 6});
expect_level_0(3, {1, 2, 4, 5});
@@ -106,7 +121,7 @@ TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_simple_select_ne
expect_level_0(5, {2, 3, 6});
expect_level_0(6, {2, 5});
- index->add_document(7);
+ add_document(7);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3, 5, 6, 7});
expect_level_0(3, {1, 2, 4, 5, 7});
@@ -116,60 +131,69 @@ TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_simple_select_ne
expect_level_0(7, {2, 3});
}
-TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_heuristic_select_neighbors)
+TEST_F(HnswIndexTest, 2d_vectors_inserted_in_hierarchic_graph_with_heuristic_select_neighbors)
{
init(true);
- index->add_document(1);
+ add_document(1);
+ expect_entry_point(1, 0);
expect_level_0(1, {});
- index->add_document(2);
+ add_document(2);
+ expect_entry_point(1, 0);
expect_level_0(1, {2});
expect_level_0(2, {1});
- index->add_document(3);
+ // Doc 3 is also added to level 1
+ add_document(3, 1);
+ expect_entry_point(3, 1);
expect_level_0(1, {2, 3});
expect_level_0(2, {1, 3});
- expect_level_0(3, {1, 2});
+ expect_levels(3, {{1, 2}, {}});
// Doc 4 is closest to 1 and they are linked.
// Doc 4 is NOT linked to 3 as the distance between 4 and 3 is greater than the distance between 3 and 1.
// Doc 3 is therefore reachable via 1. Same argument for why doc 4 is not linked to 2.
- index->add_document(4);
+ add_document(4);
+ expect_entry_point(3, 1);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3});
- expect_level_0(3, {1, 2});
+ expect_levels(3, {{1, 2}, {}});
expect_level_0(4, {1});
// Doc 5 is closest to 2 and they are linked.
// The other docs are reachable via 2, and no other links are created. Same argument as with doc 4 above.
- index->add_document(5);
+ add_document(5);
+ expect_entry_point(3, 1);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3, 5});
- expect_level_0(3, {1, 2});
+ expect_levels(3, {{1, 2}, {}});
expect_level_0(4, {1});
expect_level_0(5, {2});
// Doc 6 is closest to 5 and they are linked.
// Doc 6 is also linked to 2 as the distance between 6 and 2 is less than the distance between 2 and 5.
- index->add_document(6);
+ // Doc 6 is also added to level 1 and 2, and linked to doc 3 in level 1.
+ add_document(6, 2);
+ expect_entry_point(6, 2);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3, 5, 6});
- expect_level_0(3, {1, 2});
+ expect_levels(3, {{1, 2}, {6}});
expect_level_0(4, {1});
expect_level_0(5, {2, 6});
- expect_level_0(6, {2, 5});
+ expect_levels(6, {{2, 5}, {3}, {}});
// Doc 7 is closest to 3 and they are linked.
// Doc 7 is also linked to 6 as the distance between 7 and 6 is less than the distance between 6 and 3.
// Docs 1, 2, 4 are reachable via 3.
- index->add_document(7);
+ add_document(7);
+ expect_entry_point(6, 2);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3, 5, 6});
- expect_level_0(3, {1, 2, 7});
+ expect_levels(3, {{1, 2, 7}, {6}});
expect_level_0(4, {1});
expect_level_0(5, {2, 6});
- expect_level_0(6, {2, 5, 7});
+ expect_levels(6, {{2, 5, 7}, {3}, {}});
expect_level_0(7, {3, 6});
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 107a0ef459a..19cf88f719e 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -29,6 +29,25 @@ HnswIndex<FloatType>::calc_distance(const Vector& lhs, uint32_t rhs_docid) const
}
template <typename FloatType>
+HnswCandidate
+HnswIndex<FloatType>::find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level)
+{
+ HnswCandidate nearest = entry_point;
+ bool keep_searching = true;
+ while (keep_searching) {
+ keep_searching = false;
+ for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) {
+ double dist = calc_distance(input, neighbor_docid);
+ if (dist < nearest.distance) {
+ nearest = HnswCandidate(neighbor_docid, dist);
+ keep_searching = true;
+ }
+ }
+ }
+ return nearest;
+}
+
+template <typename FloatType>
void
HnswIndex<FloatType>::search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level)
{
@@ -82,20 +101,38 @@ HnswIndex<FloatType>::add_document(uint32_t docid)
_node_refs.ensure_size(docid + 1, EntryRef());
// A document cannot be added twice.
assert(!_node_refs[docid].valid());
- make_node_for_document(docid);
+ int level = make_node_for_document(docid);
if (_entry_docid == 0) {
_entry_docid = docid;
+ _entry_level = level;
return;
}
+
+ int search_level = _entry_level;
double entry_dist = calc_distance(input, _entry_docid);
+ HnswCandidate entry_point(_entry_docid, entry_dist);
+ while (search_level > level) {
+ entry_point = find_nearest_in_layer(input, entry_point, search_level);
+ --search_level;
+ }
+
FurthestPriQ best_neighbors;
- best_neighbors.emplace(_entry_docid, entry_dist);
- // TODO: Add support for multiple levels.
- // TODO: Rename to search_level?
- search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, 0);
- auto neighbors = select_neighbors(best_neighbors.peek(), _cfg.max_links_at_level_0());
- connect_new_node(docid, neighbors, 0);
- // TODO: Shrink neighbors if needed
+ best_neighbors.push(entry_point);
+ search_level = std::min(level, static_cast<int>(_entry_level));
+
+ // Insert the added document in each level it should exist in.
+ while (search_level >= 0) {
+ // TODO: Rename to search_level?
+ search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level);
+ auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level));
+ connect_new_node(docid, neighbors, search_level);
+ // TODO: Shrink neighbors if needed
+ --search_level;
+ }
+ if (static_cast<uint32_t>(level) > _entry_level) {
+ _entry_docid = docid;
+ _entry_level = level;
+ }
}
template <typename FloatType>
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index b6f063b1451..be6f507dbe1 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -33,6 +33,10 @@ private:
double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override;
double calc_distance(const Vector& lhs, uint32_t rhs_docid) const;
+ /**
+ * Performs a greedy search in the given layer to find the candidate that is nearest the input vector.
+ */
+ HnswCandidate find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level);
void search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level);
public:
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
index 4406428bc2a..518ccaa0ed8 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
@@ -32,16 +32,24 @@ HnswIndexBase::make_default_link_store_config()
small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true);
}
-void
+uint32_t
+HnswIndexBase::max_links_for_level(uint32_t level) const
+{
+ return (level == 0) ? _cfg.max_links_at_level_0() : _cfg.max_links_at_hierarchic_levels();
+}
+
+uint32_t
HnswIndexBase::make_node_for_document(uint32_t docid)
{
+ uint32_t max_level = _level_generator.max_level();
// TODO: Add capping on num_levels
- uint32_t num_levels = _level_generator.max_level() + 1;
+ uint32_t num_levels = max_level + 1;
// Note: The level array instance lives as long as the document is present in the index.
LevelArray levels(num_levels, EntryRef());
auto node_ref = _nodes.add(levels);
// TODO: Add memory barrier?
_node_refs[docid] = node_ref;
+ return max_level;
}
HnswIndexBase::LevelArrayRef
@@ -147,7 +155,8 @@ HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerato
_node_refs(),
_nodes(make_default_node_store_config()),
_links(make_default_link_store_config()),
- _entry_docid(0)
+ _entry_docid(0), // Note that docid 0 is reserved and never used
+ _entry_level(0)
{
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
index 2ea47c73529..4f5631426e0 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
@@ -84,11 +84,13 @@ protected:
NodeStore _nodes;
LinkStore _links;
uint32_t _entry_docid;
+ uint32_t _entry_level;
static search::datastore::ArrayStoreConfig make_default_node_store_config();
static search::datastore::ArrayStoreConfig make_default_link_store_config();
- void make_node_for_document(uint32_t docid);
+ uint32_t max_links_for_level(uint32_t level) const;
+ uint32_t make_node_for_document(uint32_t docid);
LevelArrayRef get_level_array(uint32_t docid) const;
LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const;
void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links);
@@ -114,6 +116,9 @@ public:
// TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists)
+ uint32_t get_entry_docid() const { return _entry_docid; }
+ uint32_t get_entry_level() const { return _entry_level; }
+
// Should only be used by unit tests.
HnswNode get_node(uint32_t docid) const;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h
index c7e719e2985..8a6187ffbcc 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_node.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_node.h
@@ -24,6 +24,7 @@ public:
HnswNode(const LevelArray& levels_in) : _levels(levels_in) {}
bool empty() const { return _levels.empty(); }
size_t size() const { return _levels.size(); }
+ const LevelArray& levels() const { return _levels; }
const LinkArray& level(size_t idx) const { return _levels[idx]; }
bool operator==(const HnswNode& rhs) {
return _levels == rhs._levels;