aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2020-02-05 09:39:57 +0000
committerGeir Storli <geirst@verizonmedia.com>2020-02-05 09:39:57 +0000
commit166b192424bc38e16243adb2f925c6b7ff6d1e64 (patch)
treeddd6dd2515b64cf4dbe5e4d675b68ade5648af0b /searchlib
parent396a4dbfca5a6b81bd4c3608a0372afb9c084354 (diff)
Implement a select neighbor function that uses a heuristic the accounts for distances between candidates.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp99
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp10
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h1
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp40
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h20
5 files changed, 154 insertions, 16 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
index ffa49096ad4..0fc5d7ca94f 100644
--- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
@@ -33,18 +33,28 @@ public:
}
};
+using FloatVectors = MyDocVectorAccess<float>;
+using FloatIndex = HnswIndex<float>;
+using FloatIndexUP = std::unique_ptr<FloatIndex>;
+
class HnswIndexTest : public ::testing::Test {
public:
- MyDocVectorAccess<float> vectors;
- HnswIndex<float> index;
+ FloatVectors vectors;
+ FloatIndexUP index;
HnswIndexTest()
: vectors(),
- index(vectors, HnswIndexBase::Config(2, 0, 4))
+ index()
{
+ vectors.set(1, {2, 2}).set(2, {3, 2}).set(3, {2, 3})
+ .set(4, {1, 2}).set(5, {8, 3}).set(6, {7, 2})
+ .set(7, {3, 5});
+ }
+ void init(bool heuristic_select_neighbors) {
+ index = std::make_unique<FloatIndex>(vectors, HnswIndexBase::Config(2, 0, 10, heuristic_select_neighbors));
}
void expect_level_0(uint32_t docid, const HnswNode::LinkArray& exp_links) {
- auto node = index.get_node(docid);
+ auto node = index->get_node(docid);
ASSERT_EQ(1, node.size());
EXPECT_EQ(exp_links, node.level(0));
}
@@ -53,41 +63,106 @@ public:
TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_simple_select_neighbors)
{
- vectors.set(1, {2, 2}).set(2, {3, 2}).set(3, {2, 3})
- .set(4, {1, 2}).set(5, {5, 3}).set(6, {6, 2});
+ init(false);
- index.add_document(1);
+ index->add_document(1);
expect_level_0(1, {});
- index.add_document(2);
+ index->add_document(2);
expect_level_0(1, {2});
expect_level_0(2, {1});
- index.add_document(3);
+ index->add_document(3);
expect_level_0(1, {2, 3});
expect_level_0(2, {1, 3});
expect_level_0(3, {1, 2});
- index.add_document(4);
+ index->add_document(4);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3});
expect_level_0(3, {1, 2, 4});
expect_level_0(4, {1, 3});
- index.add_document(5);
+ index->add_document(5);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3, 5});
expect_level_0(3, {1, 2, 4, 5});
expect_level_0(4, {1, 3});
expect_level_0(5, {2, 3});
- index.add_document(6);
+ index->add_document(6);
expect_level_0(1, {2, 3, 4});
expect_level_0(2, {1, 3, 5, 6});
expect_level_0(3, {1, 2, 4, 5});
expect_level_0(4, {1, 3});
expect_level_0(5, {2, 3, 6});
expect_level_0(6, {2, 5});
+
+ index->add_document(7);
+ expect_level_0(1, {2, 3, 4});
+ expect_level_0(2, {1, 3, 5, 6, 7});
+ expect_level_0(3, {1, 2, 4, 5, 7});
+ expect_level_0(4, {1, 3});
+ expect_level_0(5, {2, 3, 6});
+ expect_level_0(6, {2, 5});
+ expect_level_0(7, {2, 3});
+}
+
+TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_heuristic_select_neighbors)
+{
+ init(true);
+
+ index->add_document(1);
+ expect_level_0(1, {});
+
+ index->add_document(2);
+ expect_level_0(1, {2});
+ expect_level_0(2, {1});
+
+ index->add_document(3);
+ expect_level_0(1, {2, 3});
+ expect_level_0(2, {1, 3});
+ expect_level_0(3, {1, 2});
+
+ // Doc 4 is closest to 1 and they are linked.
+ // Doc 4 is NOT linked to 3 as the distance between 4 and 3 is greater than the distance between 3 and 1.
+ // Doc 3 is therefore reachable via 1. Same argument for why doc 4 is not linked to 2.
+ index->add_document(4);
+ expect_level_0(1, {2, 3, 4});
+ expect_level_0(2, {1, 3});
+ expect_level_0(3, {1, 2});
+ expect_level_0(4, {1});
+
+ // Doc 5 is closest to 2 and they are linked.
+ // The other docs are reachable via 2, and no other links are created. Same argument as with doc 4 above.
+ index->add_document(5);
+ expect_level_0(1, {2, 3, 4});
+ expect_level_0(2, {1, 3, 5});
+ expect_level_0(3, {1, 2});
+ expect_level_0(4, {1});
+ expect_level_0(5, {2});
+
+ // Doc 6 is closest to 5 and they are linked.
+ // Doc 6 is also linked to 2 as the distance between 6 and 2 is less than the distance between 2 and 5.
+ index->add_document(6);
+ expect_level_0(1, {2, 3, 4});
+ expect_level_0(2, {1, 3, 5, 6});
+ expect_level_0(3, {1, 2});
+ expect_level_0(4, {1});
+ expect_level_0(5, {2, 6});
+ expect_level_0(6, {2, 5});
+
+ // Doc 7 is closest to 3 and they are linked.
+ // Doc 7 is also linked to 6 as the distance between 7 and 6 is less than the distance between 6 and 3.
+ // Docs 1, 2, 4 are reachable via 3.
+ index->add_document(7);
+ expect_level_0(1, {2, 3, 4});
+ expect_level_0(2, {1, 3, 5, 6});
+ expect_level_0(3, {1, 2, 7});
+ expect_level_0(4, {1});
+ expect_level_0(5, {2, 6});
+ expect_level_0(6, {2, 5, 7});
+ expect_level_0(7, {3, 6});
}
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index bd46854cf17..fbbacf66870 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -6,6 +6,14 @@ namespace search::tensor {
template <typename FloatType>
double
+HnswIndex<FloatType>::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const
+{
+ auto lhs = get_vector(lhs_docid);
+ return calc_distance(lhs, rhs_docid);
+}
+
+template <typename FloatType>
+double
HnswIndex<FloatType>::calc_distance(const Vector& lhs, uint32_t rhs_docid) const
{
// TODO: Make it possible to specify the distance function from the outside and make it hardware optimized.
@@ -85,7 +93,7 @@ HnswIndex<FloatType>::add_document(uint32_t docid)
// TODO: Add support for multiple levels.
// TODO: Rename to search_level?
search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, 0);
- auto neighbors = select_neighbors_simple(best_neighbors.peek(), _cfg.max_links_at_level_0());
+ auto neighbors = select_neighbors(best_neighbors.peek(), _cfg.max_links_at_level_0());
connect_new_node(docid, neighbors, 0);
// TODO: Shrink neighbors if needed
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index b1f35af629c..e3c378ccd68 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -27,6 +27,7 @@ private:
return _vectors.get_vector(docid).template typify<FloatType>();
}
+ double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override;
double calc_distance(const Vector& lhs, uint32_t rhs_docid) const;
void search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level);
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
index a1132bea4e7..bd53013b94d 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
@@ -70,17 +70,45 @@ HnswIndexBase::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef
mutable_levels[level] = links_ref;
}
+bool
+HnswIndexBase::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const
+{
+ for (uint32_t result_docid : result) {
+ double dist = calc_distance(candidate.docid, result_docid);
+ if (dist < candidate.distance) {
+ return true;
+ }
+ }
+ return false;
+}
+
HnswIndexBase::LinkArray
HnswIndexBase::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const
{
+ HnswCandidateVector sorted(neighbors);
+ std::sort(sorted.begin(), sorted.end(), LesserDistance());
+ LinkArray result;
+ for (size_t i = 0, m = std::min(static_cast<size_t>(max_links), sorted.size()); i < m; ++i) {
+ result.push_back(sorted[i].docid);
+ }
+ return result;
+}
+
+HnswIndexBase::LinkArray
+HnswIndexBase::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const
+{
LinkArray result;
+ bool need_filtering = neighbors.size() > max_links;
NearestPriQ nearest;
for (const auto& entry : neighbors) {
nearest.push(entry);
}
while (!nearest.empty()) {
- HnswCandidate candidate = nearest.top();
+ auto candidate = nearest.top();
nearest.pop();
+ if (need_filtering && have_closer_distance(candidate, result)) {
+ continue;
+ }
result.push_back(candidate.docid);
if (result.size() == max_links) {
return result;
@@ -89,6 +117,16 @@ HnswIndexBase::select_neighbors_simple(const HnswCandidateVector& neighbors, uin
return result;
}
+HnswIndexBase::LinkArray
+HnswIndexBase::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const
+{
+ if (_cfg.heuristic_select_neighbors()) {
+ return select_neighbors_heuristic(neighbors, max_links);
+ } else {
+ return select_neighbors_simple(neighbors, max_links);
+ }
+}
+
void
HnswIndexBase::connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level)
{
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
index b40eef5016a..b1635b8eafb 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
@@ -33,18 +33,22 @@ public:
uint32_t _max_links_at_level_0;
uint32_t _max_links_at_hierarchic_levels;
uint32_t _neighbors_to_explore_at_construction;
+ bool _heuristic_select_neighbors;
public:
Config(uint32_t max_links_at_level_0_in,
uint32_t max_links_at_hierarchic_levels_in,
- uint32_t neighbors_to_explore_at_construction_in)
+ uint32_t neighbors_to_explore_at_construction_in,
+ bool heuristic_select_neighbors_in)
: _max_links_at_level_0(max_links_at_level_0_in),
_max_links_at_hierarchic_levels(max_links_at_hierarchic_levels_in),
- _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in)
+ _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in),
+ _heuristic_select_neighbors(heuristic_select_neighbors_in)
{}
uint32_t max_links_at_level_0() const { return _max_links_at_level_0; }
uint32_t max_links_at_hierarchic_levels() const { return _max_links_at_hierarchic_levels; }
uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; }
+ bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; }
};
protected:
@@ -86,7 +90,19 @@ protected:
LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const;
void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links);
+ virtual double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const = 0;
+
+ /**
+ * Returns true if the distance between the candidate and a node in the current result
+ * is less than the distance between the candidate and the node we want to add to the graph.
+ * In this case the candidate should be discarded as we already are connected to the space
+ * where the candidate is located.
+ * Used by select_neighbors_heuristic().
+ */
+ bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const;
+ LinkArray select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const;
LinkArray select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const;
+ LinkArray select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const;
void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level);
public: