diff options
Diffstat (limited to 'searchlib')
14 files changed, 385 insertions, 47 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index ae6469ae07a..30c1bcd7fdd 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -220,6 +220,7 @@ vespa_define_module( src/tests/tensor/dense_tensor_store src/tests/tensor/direct_tensor_store src/tests/tensor/distance_functions + src/tests/tensor/hnsw_best_neighbors src/tests/tensor/hnsw_index src/tests/tensor/hnsw_nodeid_mapping src/tests/tensor/hnsw_saver diff --git a/searchlib/src/tests/tensor/hnsw_best_neighbors/CMakeLists.txt b/searchlib/src/tests/tensor/hnsw_best_neighbors/CMakeLists.txt new file mode 100644 index 00000000000..3cb6a286580 --- /dev/null +++ b/searchlib/src/tests/tensor/hnsw_best_neighbors/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_hnsw_best_neighbors_test_app TEST + SOURCES + hnsw_best_neighbors_test.cpp + DEPENDS + searchlib + GTest::GTest +) +vespa_add_test(NAME searchlib_hnsw_best_neighbors_test_app COMMAND searchlib_hnsw_best_neighbors_test_app) diff --git a/searchlib/src/tests/tensor/hnsw_best_neighbors/hnsw_best_neighbors_test.cpp b/searchlib/src/tests/tensor/hnsw_best_neighbors/hnsw_best_neighbors_test.cpp new file mode 100644 index 00000000000..c05b85d3e59 --- /dev/null +++ b/searchlib/src/tests/tensor/hnsw_best_neighbors/hnsw_best_neighbors_test.cpp @@ -0,0 +1,109 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/tensor/hnsw_multi_best_neighbors.h> +#include <vespa/searchlib/tensor/hnsw_single_best_neighbors.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <ostream> + +using vespalib::datastore::EntryRef; +using search::tensor::HnswMultiBestNeighbors; +using search::tensor::HnswSingleBestNeighbors; +using search::tensor::NearestNeighborIndex; + +using Neighbor = NearestNeighborIndex::Neighbor; + +namespace search::tensor +{ + +std::ostream& operator<<(std::ostream& os, const Neighbor& neighbor) { + os << "{ docid=" << neighbor.docid << ", distance=" << neighbor.distance << "}"; + return os; +} + +} + +struct DocidThenDistanceOrder +{ + bool operator()(const Neighbor& lhs, const Neighbor& rhs) const { + if (lhs.docid != rhs.docid) { + return lhs.docid < rhs.docid; + } + return lhs.distance < rhs.distance; + } +}; + +template <typename BestNeighbors> +class HnswBestNeighborsTest : public testing::Test { +protected: + BestNeighbors _neighbors; + HnswBestNeighborsTest() + : testing::Test(), + _neighbors() + { + populate(); + } + ~HnswBestNeighborsTest() override; + + void add(uint32_t nodeid, uint32_t docid, double distance) + { + _neighbors.emplace(nodeid, docid, EntryRef(), distance); + } + + size_t size() const { return _neighbors.size(); } + + void assert_neighbors(std::vector<NearestNeighborIndex::Neighbor> exp, uint32_t k, double distance_limit) { + auto neighbors_copy = _neighbors; + auto act = neighbors_copy.get_neighbors(k, distance_limit); + std::sort(act.begin(), act.end(), DocidThenDistanceOrder()); + EXPECT_EQ(exp, act); + } + + void populate() + { + add(3, 3, 7.0); + add(2, 2, 10.0); + add(1, 1, 1.0); + add(4, 2, 5.0); + } +}; + +template <typename BestNeighbors> +HnswBestNeighborsTest<BestNeighbors>::~HnswBestNeighborsTest() = default; + +using HnswBestNeighborsTypedTestTypes = testing::Types<HnswSingleBestNeighbors, HnswMultiBestNeighbors>; + +TYPED_TEST_SUITE(HnswBestNeighborsTest, HnswBestNeighborsTypedTestTypes); + +TYPED_TEST(HnswBestNeighborsTest, k_limit_is_enforced) +{ + this->assert_neighbors({}, 0, 40.0); + this->assert_neighbors({{1, 1.0}}, 1, 40.0); + this->assert_neighbors({{1, 1.0}, {2, 5.0}}, 2, 40.0); + this->assert_neighbors({{1, 1.0}, {2, 5.0}, {3, 7.0}}, 3, 40.0); +} + +TYPED_TEST(HnswBestNeighborsTest, distance_limit_is_enforced) +{ + this->assert_neighbors({}, 40, 0.5); + this->assert_neighbors({{1, 1.0}}, 40, 1.0); + this->assert_neighbors({{1, 1.0}, {2, 5.0}}, 40, 5.0); + this->assert_neighbors({{1, 1.0}, {2, 5.0}, {3, 7.0}}, 40, 7.0); +} + +using HnswSingleBestNeighborsTest = HnswBestNeighborsTest<HnswSingleBestNeighbors>; + +TEST_F(HnswSingleBestNeighborsTest, duplicate_docids_are_not_elimiated) +{ + EXPECT_EQ(4, size()); + assert_neighbors({{1, 1.0}, {2, 5.0}, {2, 10.0}, {3, 7.0}}, 40, 40.0); +} + +using HnswMultiBestNeighborsTest = HnswBestNeighborsTest<HnswMultiBestNeighbors>; + +TEST_F(HnswMultiBestNeighborsTest, duplicate_docids_are_eliminated) +{ + EXPECT_EQ(3, size()); + assert_neighbors({{1, 1.0}, {2, 5.0}, {3, 7.0}}, 40, 40.0); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index cd405f54b2e..a00a50f32c8 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -19,7 +19,9 @@ vespa_add_library(searchlib_tensor OBJECT hnsw_graph.cpp hnsw_index.cpp hnsw_index_saver.cpp + hnsw_multi_best_neighbors.cpp hnsw_nodeid_mapping.cpp + hnsw_single_best_neighbors.cpp imported_tensor_attribute_vector.cpp imported_tensor_attribute_vector_read_guard.cpp inner_product_distance.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h b/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h index 4706f9cf7e8..3fcb1f09eeb 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h @@ -65,6 +65,10 @@ struct HnswGraph { return node_refs.get_elem_ref(nodeid).ref().load_relaxed(); // Called from writer only } + const NodeType& acquire_node_refs_elem_ref(uint32_t nodeid) const { + return node_refs.acquire_elem_ref(nodeid); + } + NodeRef acquire_node_ref(uint32_t nodeid) const { return node_refs.acquire_elem_ref(nodeid).ref().load_acquire(); } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index b293b001bcf..d6ad501391a 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -88,7 +88,7 @@ HnswIndex<type>::max_links_for_level(uint32_t level) const template <HnswIndexType type> bool -HnswIndex<type>::have_closer_distance(HnswCandidate candidate, const HnswCandidateVector& result) const +HnswIndex<type>::have_closer_distance(HnswTraversalCandidate candidate, const HnswTraversalCandidateVector& result) const { for (const auto & neighbor : result) { double dist = calc_distance(candidate.nodeid, neighbor.nodeid); @@ -100,10 +100,11 @@ HnswIndex<type>::have_closer_distance(HnswCandidate candidate, const HnswCandida } template <HnswIndexType type> +template <typename HnswCandidateVectorT> typename HnswIndex<type>::SelectResult -HnswIndex<type>::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const +HnswIndex<type>::select_neighbors_simple(const HnswCandidateVectorT& neighbors, uint32_t max_links) const { - HnswCandidateVector sorted(neighbors); + HnswCandidateVectorT sorted(neighbors); std::sort(sorted.begin(), sorted.end(), LesserDistance()); SelectResult result; for (const auto & candidate : sorted) { @@ -117,8 +118,9 @@ HnswIndex<type>::select_neighbors_simple(const HnswCandidateVector& neighbors, u } template <HnswIndexType type> +template <typename HnswCandidateVectorT> typename HnswIndex<type>::SelectResult -HnswIndex<type>::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const +HnswIndex<type>::select_neighbors_heuristic(const HnswCandidateVectorT& neighbors, uint32_t max_links) const { SelectResult result; NearestPriQ nearest; @@ -145,8 +147,9 @@ HnswIndex<type>::select_neighbors_heuristic(const HnswCandidateVector& neighbors } template <HnswIndexType type> +template <typename HnswCandidateVectorT> typename HnswIndex<type>::SelectResult -HnswIndex<type>::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const +HnswIndex<type>::select_neighbors(const HnswCandidateVectorT& neighbors, uint32_t max_links) const { if (_cfg.heuristic_select_neighbors()) { return select_neighbors_heuristic(neighbors, max_links); @@ -162,7 +165,7 @@ HnswIndex<type>::shrink_if_needed(uint32_t nodeid, uint32_t level) auto old_links = _graph.get_link_array(nodeid, level); uint32_t max_links = max_links_for_level(level); if (old_links.size() > max_links) { - HnswCandidateVector neighbors; + HnswTraversalCandidateVector neighbors; neighbors.reserve(old_links.size()); for (uint32_t neighbor_nodeid : old_links) { double dist = calc_distance(nodeid, neighbor_nodeid); @@ -226,6 +229,14 @@ HnswIndex<type>::calc_distance(const TypedCells& lhs, uint32_t rhs_nodeid) const } template <HnswIndexType type> +double +HnswIndex<type>::calc_distance(const TypedCells& lhs, uint32_t rhs_docid, uint32_t rhs_subspace) const +{ + auto rhs = get_vector(rhs_docid, rhs_subspace); + return _distance_func->calc(lhs, rhs); +} + +template <HnswIndexType type> uint32_t HnswIndex<type>::estimate_visited_nodes(uint32_t level, uint32_t nodeid_limit, uint32_t neighbors_to_find, const GlobalFilter* filter) const { @@ -258,12 +269,15 @@ HnswIndex<type>::find_nearest_in_layer(const TypedCells& input, const HnswCandid while (keep_searching) { keep_searching = false; for (uint32_t neighbor_nodeid : _graph.get_link_array(nearest.node_ref, level)) { - auto neighbor_ref = _graph.acquire_node_ref(neighbor_nodeid); - double dist = calc_distance(input, neighbor_nodeid); + auto& neighbor_node = _graph.acquire_node_refs_elem_ref(neighbor_nodeid); + auto neighbor_ref = neighbor_node.ref().load_acquire(); + uint32_t neighbor_docid = acquire_docid(neighbor_node, neighbor_nodeid); + uint32_t neighbor_subspace = neighbor_node.acquire_subspace(); + double dist = calc_distance(input, neighbor_docid, neighbor_subspace); if (_graph.still_valid(neighbor_nodeid, neighbor_ref) && dist < nearest.distance) { - nearest = HnswCandidate(neighbor_nodeid, neighbor_ref, dist); + nearest = HnswCandidate(neighbor_nodeid, neighbor_docid, neighbor_ref, dist); keep_searching = true; } } @@ -272,10 +286,10 @@ HnswIndex<type>::find_nearest_in_layer(const TypedCells& input, const HnswCandid } template <HnswIndexType type> -template <class VisitedTracker> +template <class VisitedTracker, class BestNeighbors> void HnswIndex<type>::search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find, - FurthestPriQ& best_neighbors, uint32_t level, const GlobalFilter *filter, + BestNeighbors& best_neighbors, uint32_t level, const GlobalFilter *filter, uint32_t nodeid_limit, uint32_t estimated_visited_nodes) const { NearestPriQ candidates; @@ -287,7 +301,7 @@ HnswIndex<type>::search_layer_helper(const TypedCells& input, uint32_t neighbors candidates.push(entry); visited.mark(entry.nodeid); if (filter && !filter->check(entry.nodeid)) { - assert(best_neighbors.size() == 1); + assert(best_neighbors.peek().size() == 1); best_neighbors.pop(); } } @@ -303,18 +317,21 @@ HnswIndex<type>::search_layer_helper(const TypedCells& input, uint32_t neighbors if (neighbor_nodeid >= nodeid_limit) { continue; } - auto neighbor_ref = _graph.acquire_node_ref(neighbor_nodeid); + auto& neighbor_node = _graph.acquire_node_refs_elem_ref(neighbor_nodeid); + auto neighbor_ref = neighbor_node.ref().load_acquire(); if ((! neighbor_ref.valid()) || ! visited.try_mark(neighbor_nodeid)) { continue; } - double dist_to_input = calc_distance(input, neighbor_nodeid); + uint32_t neighbor_docid = acquire_docid(neighbor_node, neighbor_nodeid); + uint32_t neighbor_subspace = neighbor_node.acquire_subspace(); + double dist_to_input = calc_distance(input, neighbor_docid, neighbor_subspace); if (dist_to_input < limit_dist) { candidates.emplace(neighbor_nodeid, neighbor_ref, dist_to_input); if ((!filter) || filter->check(neighbor_nodeid)) { - best_neighbors.emplace(neighbor_nodeid, neighbor_ref, dist_to_input); - if (best_neighbors.size() > neighbors_to_find) { + best_neighbors.emplace(neighbor_nodeid, neighbor_docid, neighbor_ref, dist_to_input); + while (best_neighbors.size() > neighbors_to_find) { best_neighbors.pop(); limit_dist = best_neighbors.top().distance; } @@ -325,9 +342,10 @@ HnswIndex<type>::search_layer_helper(const TypedCells& input, uint32_t neighbors } template <HnswIndexType type> +template <class BestNeighbors> void HnswIndex<type>::search_layer(const TypedCells& input, uint32_t neighbors_to_find, - FurthestPriQ& best_neighbors, uint32_t level, const GlobalFilter *filter) const + BestNeighbors& best_neighbors, uint32_t level, const GlobalFilter *filter) const { uint32_t nodeid_limit = _graph.node_refs_size.load(std::memory_order_acquire); if (filter) { @@ -404,8 +422,9 @@ HnswIndex<type>::internal_prepare_add_node(typename HnswIndex::PreparedAddDoc& o } int search_level = entry.level; double entry_dist = calc_distance(input_vector, entry.nodeid); + uint32_t entry_docid = get_docid(entry.nodeid); // TODO: check if entry nodeid/node_ref is still valid here - HnswCandidate entry_point(entry.nodeid, entry.node_ref, entry_dist); + HnswCandidate entry_point(entry.nodeid, entry_docid, entry.node_ref, entry_dist); while (search_level > node_max_level) { entry_point = find_nearest_in_layer(input_vector, entry_point, search_level); --search_level; @@ -791,16 +810,8 @@ HnswIndex<type>::top_k_by_docid(uint32_t k, TypedCells vector, const GlobalFilter *filter, uint32_t explore_k, double distance_threshold) const { - std::vector<Neighbor> result; - FurthestPriQ candidates = top_k_candidates(vector, std::max(k, explore_k), filter); - while (candidates.size() > k) { - candidates.pop(); - } - result.reserve(candidates.size()); - for (const HnswCandidate & hit : candidates.peek()) { - if (hit.distance > distance_threshold) continue; - result.emplace_back(get_docid(hit.nodeid), hit.distance); - } + SearchBestNeighbors candidates = top_k_candidates(vector, std::max(k, explore_k), filter); + auto result = candidates.get_neighbors(k, distance_threshold); std::sort(result.begin(), result.end(), NeighborsByDocId()); return result; } @@ -823,10 +834,10 @@ HnswIndex<type>::find_top_k_with_filter(uint32_t k, TypedCells vector, } template <HnswIndexType type> -FurthestPriQ +typename HnswIndex<type>::SearchBestNeighbors HnswIndex<type>::top_k_candidates(const TypedCells &vector, uint32_t k, const GlobalFilter *filter) const { - FurthestPriQ best_neighbors; + SearchBestNeighbors best_neighbors; auto entry = _graph.get_entry_node(); if (entry.nodeid == 0) { // graph has no entry point @@ -834,8 +845,9 @@ HnswIndex<type>::top_k_candidates(const TypedCells &vector, uint32_t k, const Gl } int search_level = entry.level; double entry_dist = calc_distance(vector, entry.nodeid); + uint32_t entry_docid = get_docid(entry.nodeid); // TODO: check if entry docid/node_ref is still valid here - HnswCandidate entry_point(entry.nodeid, entry.node_ref, entry_dist); + HnswCandidate entry_point(entry.nodeid, entry_docid, entry.node_ref, entry_dist); while (search_level > 0) { entry_point = find_nearest_in_layer(vector, entry_point, search_level); --search_level; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index bf38dc01f37..32058e9ac44 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -7,7 +7,9 @@ #include "doc_vector_access.h" #include "hnsw_identity_mapping.h" #include "hnsw_index_utils.h" +#include "hnsw_multi_best_neighbors.h" #include "hnsw_nodeid_mapping.h" +#include "hnsw_single_best_neighbors.h" #include "hnsw_test_node.h" #include "nearest_neighbor_index.h" #include "random_level_generator.h" @@ -68,6 +70,7 @@ public: } static constexpr HnswIndexType index_type = type; + using SearchBestNeighbors = typename HnswIndexTraits<type>::SearchBestNeighbors; using IdMapping = typename HnswIndexTraits<type>::IdMapping; protected: using GraphType = HnswGraph<type>; @@ -83,6 +86,14 @@ protected: using TypedCells = vespalib::eval::TypedCells; + static uint32_t acquire_docid(const NodeType& node, uint32_t nodeid) { + if constexpr (NodeType::identity_mapping) { + return nodeid; + } else { + return node.acquire_docid(); + } + } + GraphType _graph; const DocVectorAccess& _vectors; DistanceFunction::UP _distance_func; @@ -105,15 +116,18 @@ protected: * where the candidate is located. * Used by select_neighbors_heuristic(). */ - bool have_closer_distance(HnswCandidate candidate, const HnswCandidateVector& curr_result) const; + bool have_closer_distance(HnswTraversalCandidate candidate, const HnswTraversalCandidateVector& curr_result) const; struct SelectResult { - HnswCandidateVector used; + HnswTraversalCandidateVector used; LinkArray unused; ~SelectResult() {} }; - SelectResult select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const; - SelectResult select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const; - SelectResult select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const; + template <typename HnswCandidateVectorT> + SelectResult select_neighbors_heuristic(const HnswCandidateVectorT& neighbors, uint32_t max_links) const; + template <typename HnswCandidateVectorT> + SelectResult select_neighbors_simple(const HnswCandidateVectorT& neighbors, uint32_t max_links) const; + template <typename HnswCandidateVectorT> + SelectResult select_neighbors(const HnswCandidateVectorT& neighbors, uint32_t max_links) const; void shrink_if_needed(uint32_t nodeid, uint32_t level); void connect_new_node(uint32_t nodeid, const LinkArrayRef &neighbors, uint32_t level); void mutual_reconnect(const LinkArrayRef &cluster, uint32_t level); @@ -129,24 +143,29 @@ protected: return _vectors.get_vector(docid, subspace); } } + inline TypedCells get_vector(uint32_t docid, uint32_t subspace) const { + return _vectors.get_vector(docid, subspace); + } inline VectorBundle get_vector_by_docid(uint32_t docid) const { return _vectors.get_vectors(docid); } double calc_distance(uint32_t lhs_nodeid, uint32_t rhs_nodeid) const; double calc_distance(const TypedCells& lhs, uint32_t rhs_nodeid) const; + double calc_distance(const TypedCells& lhs, uint32_t rhs_docid, uint32_t rhs_subspace) const; uint32_t estimate_visited_nodes(uint32_t level, uint32_t nodeid_limit, uint32_t neighbors_to_find, const GlobalFilter* filter) const; /** * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. */ HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) const; - template <class VisitedTracker> - void search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, + template <class VisitedTracker, class BestNeighbors> + void search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find, BestNeighbors& best_neighbors, uint32_t level, const GlobalFilter *filter, uint32_t nodeid_limit, uint32_t estimated_visited_nodes) const; - void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, + template <class BestNeighbors> + void search_layer(const TypedCells& input, uint32_t neighbors_to_find, BestNeighbors& best_neighbors, uint32_t level, const GlobalFilter *filter = nullptr) const; std::vector<Neighbor> top_k_by_docid(uint32_t k, TypedCells vector, const GlobalFilter *filter, uint32_t explore_k, @@ -227,7 +246,7 @@ public: double distance_threshold) const override; const DistanceFunction *distance_function() const override { return _distance_func.get(); } - FurthestPriQ top_k_candidates(const TypedCells &vector, uint32_t k, const GlobalFilter *filter) const; + SearchBestNeighbors top_k_candidates(const TypedCells &vector, uint32_t k, const GlobalFilter *filter) const; uint32_t get_entry_nodeid() const { return _graph.get_entry_node().nodeid; } int32_t get_entry_level() const { return _graph.get_entry_node().level; } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h index 397c7c98555..ec5672c4c9d 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h @@ -9,7 +9,9 @@ namespace search::tensor { class HnswSimpleNode; class HnswNode; class HnswIdentityMapping; +class HnswMultiBestNeighbors; class HnswNodeidMapping; +class HnswSingleBestNeighbors; /* * Class that selects what node type and id mapping to use based on @@ -30,6 +32,7 @@ class HnswIndexTraits<HnswIndexType::SINGLE> public: using NodeType = HnswSimpleNode; using IdMapping = HnswIdentityMapping; + using SearchBestNeighbors = HnswSingleBestNeighbors; }; /* @@ -44,6 +47,7 @@ class HnswIndexTraits<HnswIndexType::MULTI> public: using NodeType = HnswNode; using IdMapping = HnswNodeidMapping; + using SearchBestNeighbors = HnswMultiBestNeighbors; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h index 670772a5055..a88b805f198 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h @@ -10,36 +10,58 @@ namespace search::tensor { /** - * Represents a candidate node with its distance to another point in space. + * Represents a travesal candidate node with its distance to another + * point in space. */ -struct HnswCandidate { +struct HnswTraversalCandidate { uint32_t nodeid; vespalib::datastore::EntryRef node_ref; double distance; - HnswCandidate(uint32_t nodeid_in, double distance_in) noexcept + HnswTraversalCandidate(uint32_t nodeid_in, double distance_in) noexcept : nodeid(nodeid_in), node_ref(), distance(distance_in) {} - HnswCandidate(uint32_t nodeid_in, vespalib::datastore::EntryRef node_ref_in, double distance_in) noexcept + HnswTraversalCandidate(uint32_t nodeid_in, vespalib::datastore::EntryRef node_ref_in, double distance_in) noexcept : nodeid(nodeid_in), node_ref(node_ref_in), distance(distance_in) {} + HnswTraversalCandidate(uint32_t nodeid_in, uint32_t docid_in, vespalib::datastore::EntryRef node_ref_in, double distance_in) noexcept + : nodeid(nodeid_in), node_ref(node_ref_in), distance(distance_in) + { + (void) docid_in; + } +}; + +/** + * Represents a neighbor candidate node with its distance to another + * point in space. + */ +struct HnswCandidate : public HnswTraversalCandidate { + uint32_t docid; + + HnswCandidate(uint32_t nodeid_in, uint32_t docid_in, vespalib::datastore::EntryRef node_ref_in, double distance_in) noexcept + : HnswTraversalCandidate(nodeid_in, docid_in, node_ref_in, distance_in), + docid(docid_in) + { + } }; struct GreaterDistance { - bool operator() (const HnswCandidate& lhs, const HnswCandidate& rhs) const { + bool operator() (const HnswTraversalCandidate& lhs, const HnswTraversalCandidate& rhs) const { return (rhs.distance < lhs.distance); } }; struct LesserDistance { - bool operator() (const HnswCandidate& lhs, const HnswCandidate& rhs) const { + bool operator() (const HnswTraversalCandidate& lhs, const HnswTraversalCandidate& rhs) const { return (lhs.distance < rhs.distance); } }; +using HnswTraversalCandidateVector = std::vector<HnswTraversalCandidate>; + using HnswCandidateVector = std::vector<HnswCandidate>; /** * Priority queue that keeps the candidate node that is nearest a point in space on top. */ -using NearestPriQ = std::priority_queue<HnswCandidate, HnswCandidateVector, GreaterDistance>; +using NearestPriQ = std::priority_queue<HnswTraversalCandidate, HnswTraversalCandidateVector, GreaterDistance>; /** * Priority queue that keeps the candidate node that is furthest away a point in space on top. diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.cpp new file mode 100644 index 00000000000..86cfc743247 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.cpp @@ -0,0 +1,28 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "hnsw_multi_best_neighbors.h" + +namespace search::tensor { + +HnswMultiBestNeighbors::~HnswMultiBestNeighbors() = default; + +std::vector<NearestNeighborIndex::Neighbor> +HnswMultiBestNeighbors::get_neighbors(uint32_t k, double distance_threshold) +{ + while (_docids.size() > k) { + pop(); + } + std::vector<NearestNeighborIndex::Neighbor> result; + result.reserve(_docids.size()); + while (!_candidates.empty()) { + auto& hit = _candidates.top(); + uint32_t docid = hit.docid; + if (remove_docid(docid) && (!(hit.distance > distance_threshold))) { + result.emplace_back(docid, hit.distance); + } + _candidates.pop(); + } + return result; +} + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.h b/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.h new file mode 100644 index 00000000000..f3f5cf5690a --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.h @@ -0,0 +1,70 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "hnsw_index_utils.h" +#include "nearest_neighbor_index.h" +#include <vespa/vespalib/stllike/hash_map.h> +#include <cassert> + +namespace search::tensor { + +/* + * A priority queue of best neighbors for hnsw index. Used for search + * when hnsw index has multiple nodes per document. + */ +class HnswMultiBestNeighbors { + using EntryRef = vespalib::datastore::EntryRef; + FurthestPriQ _candidates; + vespalib::hash_map<uint32_t, uint32_t> _docids; + + void add_docid(uint32_t docid) + { + auto insres = _docids.insert(std::make_pair(docid, 1)); + if (!insres.second) { + ++insres.first->second; + } + } + + bool remove_docid(uint32_t docid) + { + auto itr = _docids.find(docid); + assert(itr != _docids.end()); + if (itr->second > 1) { + --itr->second; + return false; + } else { + _docids.erase(docid); + return true; + } + } +public: + HnswMultiBestNeighbors() + : _candidates(), + _docids() + { + } + ~HnswMultiBestNeighbors(); + + std::vector<NearestNeighborIndex::Neighbor> get_neighbors(uint32_t k, double distance_threshold); + + void push(const HnswCandidate& candidate) { + add_docid(candidate.docid); + _candidates.push(candidate); + } + void pop() { + assert(!_candidates.empty()); + uint32_t docid = _candidates.top().docid; + remove_docid(docid); + _candidates.pop(); + } + const HnswCandidateVector& peek() const { return _candidates.peek(); } + const HnswCandidate& top() const { return _candidates.top(); } + size_t size() const { return _docids.size(); } + void emplace(uint32_t nodeid, uint32_t docid, EntryRef ref, double distance) { + add_docid(docid); + _candidates.emplace(nodeid, docid, ref, distance); + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.cpp new file mode 100644 index 00000000000..1b2f62a144f --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.cpp @@ -0,0 +1,22 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "hnsw_single_best_neighbors.h" + +namespace search::tensor { + +std::vector<NearestNeighborIndex::Neighbor> +HnswSingleBestNeighbors::get_neighbors(uint32_t k, double distance_threshold) +{ + while (_candidates.size() > k) { + _candidates.pop(); + } + std::vector<NearestNeighborIndex::Neighbor> result; + result.reserve(_candidates.size()); + for (const HnswCandidate & hit : _candidates.peek()) { + if (hit.distance > distance_threshold) continue; + result.emplace_back(hit.docid, hit.distance); + } + return result; +} + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.h b/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.h new file mode 100644 index 00000000000..e7c0a7fded6 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.h @@ -0,0 +1,33 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "hnsw_index_utils.h" +#include "nearest_neighbor_index.h" + +namespace search::tensor { + +/* + * A priority queue of best neighbors for hnsw index. Used for search + * when hnsw index has a single node per document. + */ +class HnswSingleBestNeighbors { + using EntryRef = vespalib::datastore::EntryRef; + FurthestPriQ _candidates; +public: + HnswSingleBestNeighbors() + : _candidates() + { + } + ~HnswSingleBestNeighbors() = default; + + std::vector<NearestNeighborIndex::Neighbor> get_neighbors(uint32_t k, double distance_threshold); + void push(const HnswCandidate& candidate) { _candidates.push(candidate); } + void pop() { _candidates.pop(); } + const HnswCandidateVector& peek() const { return _candidates.peek(); } + const HnswCandidate& top() const { return _candidates.top(); } + size_t size() const { return _candidates.size(); } + void emplace(uint32_t nodeid, uint32_t docid, EntryRef ref, double distance) { _candidates.emplace(nodeid, docid, ref, distance); } +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h index de1ea26d7bf..3f6c9b82a65 100644 --- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h @@ -45,6 +45,9 @@ public: : docid(id), distance(dist) {} Neighbor() noexcept : docid(0), distance(0.0) {} + bool operator==(const Neighbor& rhs) const { + return docid == rhs.docid && distance == rhs.distance; + } }; virtual ~NearestNeighborIndex() = default; virtual void add_document(uint32_t docid) = 0; |