diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-02-05 09:39:57 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2020-02-05 09:39:57 +0000 |
commit | 166b192424bc38e16243adb2f925c6b7ff6d1e64 (patch) | |
tree | ddd6dd2515b64cf4dbe5e4d675b68ade5648af0b /searchlib | |
parent | 396a4dbfca5a6b81bd4c3608a0372afb9c084354 (diff) |
Implement a select neighbor function that uses a heuristic the accounts for distances between candidates.
Diffstat (limited to 'searchlib')
5 files changed, 154 insertions, 16 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index ffa49096ad4..0fc5d7ca94f 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -33,18 +33,28 @@ public: } }; +using FloatVectors = MyDocVectorAccess<float>; +using FloatIndex = HnswIndex<float>; +using FloatIndexUP = std::unique_ptr<FloatIndex>; + class HnswIndexTest : public ::testing::Test { public: - MyDocVectorAccess<float> vectors; - HnswIndex<float> index; + FloatVectors vectors; + FloatIndexUP index; HnswIndexTest() : vectors(), - index(vectors, HnswIndexBase::Config(2, 0, 4)) + index() { + vectors.set(1, {2, 2}).set(2, {3, 2}).set(3, {2, 3}) + .set(4, {1, 2}).set(5, {8, 3}).set(6, {7, 2}) + .set(7, {3, 5}); + } + void init(bool heuristic_select_neighbors) { + index = std::make_unique<FloatIndex>(vectors, HnswIndexBase::Config(2, 0, 10, heuristic_select_neighbors)); } void expect_level_0(uint32_t docid, const HnswNode::LinkArray& exp_links) { - auto node = index.get_node(docid); + auto node = index->get_node(docid); ASSERT_EQ(1, node.size()); EXPECT_EQ(exp_links, node.level(0)); } @@ -53,41 +63,106 @@ public: TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_simple_select_neighbors) { - vectors.set(1, {2, 2}).set(2, {3, 2}).set(3, {2, 3}) - .set(4, {1, 2}).set(5, {5, 3}).set(6, {6, 2}); + init(false); - index.add_document(1); + index->add_document(1); expect_level_0(1, {}); - index.add_document(2); + index->add_document(2); expect_level_0(1, {2}); expect_level_0(2, {1}); - index.add_document(3); + index->add_document(3); expect_level_0(1, {2, 3}); expect_level_0(2, {1, 3}); expect_level_0(3, {1, 2}); - index.add_document(4); + index->add_document(4); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3}); expect_level_0(3, {1, 2, 4}); expect_level_0(4, {1, 3}); - index.add_document(5); + index->add_document(5); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3, 5}); expect_level_0(3, {1, 2, 4, 5}); expect_level_0(4, {1, 3}); expect_level_0(5, {2, 3}); - index.add_document(6); + index->add_document(6); expect_level_0(1, {2, 3, 4}); expect_level_0(2, {1, 3, 5, 6}); expect_level_0(3, {1, 2, 4, 5}); expect_level_0(4, {1, 3}); expect_level_0(5, {2, 3, 6}); expect_level_0(6, {2, 5}); + + index->add_document(7); + expect_level_0(1, {2, 3, 4}); + expect_level_0(2, {1, 3, 5, 6, 7}); + expect_level_0(3, {1, 2, 4, 5, 7}); + expect_level_0(4, {1, 3}); + expect_level_0(5, {2, 3, 6}); + expect_level_0(6, {2, 5}); + expect_level_0(7, {2, 3}); +} + +TEST_F(HnswIndexTest, 2d_vectors_inserted_in_level_0_graph_with_heuristic_select_neighbors) +{ + init(true); + + index->add_document(1); + expect_level_0(1, {}); + + index->add_document(2); + expect_level_0(1, {2}); + expect_level_0(2, {1}); + + index->add_document(3); + expect_level_0(1, {2, 3}); + expect_level_0(2, {1, 3}); + expect_level_0(3, {1, 2}); + + // Doc 4 is closest to 1 and they are linked. + // Doc 4 is NOT linked to 3 as the distance between 4 and 3 is greater than the distance between 3 and 1. + // Doc 3 is therefore reachable via 1. Same argument for why doc 4 is not linked to 2. + index->add_document(4); + expect_level_0(1, {2, 3, 4}); + expect_level_0(2, {1, 3}); + expect_level_0(3, {1, 2}); + expect_level_0(4, {1}); + + // Doc 5 is closest to 2 and they are linked. + // The other docs are reachable via 2, and no other links are created. Same argument as with doc 4 above. + index->add_document(5); + expect_level_0(1, {2, 3, 4}); + expect_level_0(2, {1, 3, 5}); + expect_level_0(3, {1, 2}); + expect_level_0(4, {1}); + expect_level_0(5, {2}); + + // Doc 6 is closest to 5 and they are linked. + // Doc 6 is also linked to 2 as the distance between 6 and 2 is less than the distance between 2 and 5. + index->add_document(6); + expect_level_0(1, {2, 3, 4}); + expect_level_0(2, {1, 3, 5, 6}); + expect_level_0(3, {1, 2}); + expect_level_0(4, {1}); + expect_level_0(5, {2, 6}); + expect_level_0(6, {2, 5}); + + // Doc 7 is closest to 3 and they are linked. + // Doc 7 is also linked to 6 as the distance between 7 and 6 is less than the distance between 6 and 3. + // Docs 1, 2, 4 are reachable via 3. + index->add_document(7); + expect_level_0(1, {2, 3, 4}); + expect_level_0(2, {1, 3, 5, 6}); + expect_level_0(3, {1, 2, 7}); + expect_level_0(4, {1}); + expect_level_0(5, {2, 6}); + expect_level_0(6, {2, 5, 7}); + expect_level_0(7, {3, 6}); } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index bd46854cf17..fbbacf66870 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -6,6 +6,14 @@ namespace search::tensor { template <typename FloatType> double +HnswIndex<FloatType>::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const +{ + auto lhs = get_vector(lhs_docid); + return calc_distance(lhs, rhs_docid); +} + +template <typename FloatType> +double HnswIndex<FloatType>::calc_distance(const Vector& lhs, uint32_t rhs_docid) const { // TODO: Make it possible to specify the distance function from the outside and make it hardware optimized. @@ -85,7 +93,7 @@ HnswIndex<FloatType>::add_document(uint32_t docid) // TODO: Add support for multiple levels. // TODO: Rename to search_level? search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, 0); - auto neighbors = select_neighbors_simple(best_neighbors.peek(), _cfg.max_links_at_level_0()); + auto neighbors = select_neighbors(best_neighbors.peek(), _cfg.max_links_at_level_0()); connect_new_node(docid, neighbors, 0); // TODO: Shrink neighbors if needed } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index b1f35af629c..e3c378ccd68 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -27,6 +27,7 @@ private: return _vectors.get_vector(docid).template typify<FloatType>(); } + double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override; double calc_distance(const Vector& lhs, uint32_t rhs_docid) const; void search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp index a1132bea4e7..bd53013b94d 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp @@ -70,17 +70,45 @@ HnswIndexBase::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef mutable_levels[level] = links_ref; } +bool +HnswIndexBase::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const +{ + for (uint32_t result_docid : result) { + double dist = calc_distance(candidate.docid, result_docid); + if (dist < candidate.distance) { + return true; + } + } + return false; +} + HnswIndexBase::LinkArray HnswIndexBase::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const { + HnswCandidateVector sorted(neighbors); + std::sort(sorted.begin(), sorted.end(), LesserDistance()); + LinkArray result; + for (size_t i = 0, m = std::min(static_cast<size_t>(max_links), sorted.size()); i < m; ++i) { + result.push_back(sorted[i].docid); + } + return result; +} + +HnswIndexBase::LinkArray +HnswIndexBase::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const +{ LinkArray result; + bool need_filtering = neighbors.size() > max_links; NearestPriQ nearest; for (const auto& entry : neighbors) { nearest.push(entry); } while (!nearest.empty()) { - HnswCandidate candidate = nearest.top(); + auto candidate = nearest.top(); nearest.pop(); + if (need_filtering && have_closer_distance(candidate, result)) { + continue; + } result.push_back(candidate.docid); if (result.size() == max_links) { return result; @@ -89,6 +117,16 @@ HnswIndexBase::select_neighbors_simple(const HnswCandidateVector& neighbors, uin return result; } +HnswIndexBase::LinkArray +HnswIndexBase::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const +{ + if (_cfg.heuristic_select_neighbors()) { + return select_neighbors_heuristic(neighbors, max_links); + } else { + return select_neighbors_simple(neighbors, max_links); + } +} + void HnswIndexBase::connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level) { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h index b40eef5016a..b1635b8eafb 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h @@ -33,18 +33,22 @@ public: uint32_t _max_links_at_level_0; uint32_t _max_links_at_hierarchic_levels; uint32_t _neighbors_to_explore_at_construction; + bool _heuristic_select_neighbors; public: Config(uint32_t max_links_at_level_0_in, uint32_t max_links_at_hierarchic_levels_in, - uint32_t neighbors_to_explore_at_construction_in) + uint32_t neighbors_to_explore_at_construction_in, + bool heuristic_select_neighbors_in) : _max_links_at_level_0(max_links_at_level_0_in), _max_links_at_hierarchic_levels(max_links_at_hierarchic_levels_in), - _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in) + _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in), + _heuristic_select_neighbors(heuristic_select_neighbors_in) {} uint32_t max_links_at_level_0() const { return _max_links_at_level_0; } uint32_t max_links_at_hierarchic_levels() const { return _max_links_at_hierarchic_levels; } uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; } + bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; } }; protected: @@ -86,7 +90,19 @@ protected: LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const; void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links); + virtual double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const = 0; + + /** + * Returns true if the distance between the candidate and a node in the current result + * is less than the distance between the candidate and the node we want to add to the graph. + * In this case the candidate should be discarded as we already are connected to the space + * where the candidate is located. + * Used by select_neighbors_heuristic(). + */ + bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const; + LinkArray select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const; LinkArray select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const; + LinkArray select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const; void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level); public: |