summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2022-11-24 10:26:10 +0100
committerTor Egge <Tor.Egge@online.no>2022-11-24 10:26:10 +0100
commite2c7b0f200a65dd9c74a9a70d6cc37f1bccb11fb (patch)
tree92b62c1903531d72a5c02ff6c872b75ffc91bf1f /searchlib
parentba11a8a87877dd3a97586255839b02782b70de87 (diff)
Avoid duplicate docid when searching in hnsw index with multiple nodes per
document.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/CMakeLists.txt1
-rw-r--r--searchlib/src/tests/tensor/hnsw_best_neighbors/CMakeLists.txt9
-rw-r--r--searchlib/src/tests/tensor/hnsw_best_neighbors/hnsw_best_neighbors_test.cpp109
-rw-r--r--searchlib/src/vespa/searchlib/tensor/CMakeLists.txt2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_graph.h4
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp74
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h37
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h4
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h36
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.cpp28
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.h70
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.cpp22
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.h33
-rw-r--r--searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h3
14 files changed, 385 insertions, 47 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index ae6469ae07a..30c1bcd7fdd 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -220,6 +220,7 @@ vespa_define_module(
src/tests/tensor/dense_tensor_store
src/tests/tensor/direct_tensor_store
src/tests/tensor/distance_functions
+ src/tests/tensor/hnsw_best_neighbors
src/tests/tensor/hnsw_index
src/tests/tensor/hnsw_nodeid_mapping
src/tests/tensor/hnsw_saver
diff --git a/searchlib/src/tests/tensor/hnsw_best_neighbors/CMakeLists.txt b/searchlib/src/tests/tensor/hnsw_best_neighbors/CMakeLists.txt
new file mode 100644
index 00000000000..3cb6a286580
--- /dev/null
+++ b/searchlib/src/tests/tensor/hnsw_best_neighbors/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(searchlib_hnsw_best_neighbors_test_app TEST
+ SOURCES
+ hnsw_best_neighbors_test.cpp
+ DEPENDS
+ searchlib
+ GTest::GTest
+)
+vespa_add_test(NAME searchlib_hnsw_best_neighbors_test_app COMMAND searchlib_hnsw_best_neighbors_test_app)
diff --git a/searchlib/src/tests/tensor/hnsw_best_neighbors/hnsw_best_neighbors_test.cpp b/searchlib/src/tests/tensor/hnsw_best_neighbors/hnsw_best_neighbors_test.cpp
new file mode 100644
index 00000000000..c05b85d3e59
--- /dev/null
+++ b/searchlib/src/tests/tensor/hnsw_best_neighbors/hnsw_best_neighbors_test.cpp
@@ -0,0 +1,109 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchlib/tensor/hnsw_multi_best_neighbors.h>
+#include <vespa/searchlib/tensor/hnsw_single_best_neighbors.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <ostream>
+
+using vespalib::datastore::EntryRef;
+using search::tensor::HnswMultiBestNeighbors;
+using search::tensor::HnswSingleBestNeighbors;
+using search::tensor::NearestNeighborIndex;
+
+using Neighbor = NearestNeighborIndex::Neighbor;
+
+namespace search::tensor
+{
+
+std::ostream& operator<<(std::ostream& os, const Neighbor& neighbor) {
+ os << "{ docid=" << neighbor.docid << ", distance=" << neighbor.distance << "}";
+ return os;
+}
+
+}
+
+struct DocidThenDistanceOrder
+{
+ bool operator()(const Neighbor& lhs, const Neighbor& rhs) const {
+ if (lhs.docid != rhs.docid) {
+ return lhs.docid < rhs.docid;
+ }
+ return lhs.distance < rhs.distance;
+ }
+};
+
+template <typename BestNeighbors>
+class HnswBestNeighborsTest : public testing::Test {
+protected:
+ BestNeighbors _neighbors;
+ HnswBestNeighborsTest()
+ : testing::Test(),
+ _neighbors()
+ {
+ populate();
+ }
+ ~HnswBestNeighborsTest() override;
+
+ void add(uint32_t nodeid, uint32_t docid, double distance)
+ {
+ _neighbors.emplace(nodeid, docid, EntryRef(), distance);
+ }
+
+ size_t size() const { return _neighbors.size(); }
+
+ void assert_neighbors(std::vector<NearestNeighborIndex::Neighbor> exp, uint32_t k, double distance_limit) {
+ auto neighbors_copy = _neighbors;
+ auto act = neighbors_copy.get_neighbors(k, distance_limit);
+ std::sort(act.begin(), act.end(), DocidThenDistanceOrder());
+ EXPECT_EQ(exp, act);
+ }
+
+ void populate()
+ {
+ add(3, 3, 7.0);
+ add(2, 2, 10.0);
+ add(1, 1, 1.0);
+ add(4, 2, 5.0);
+ }
+};
+
+template <typename BestNeighbors>
+HnswBestNeighborsTest<BestNeighbors>::~HnswBestNeighborsTest() = default;
+
+using HnswBestNeighborsTypedTestTypes = testing::Types<HnswSingleBestNeighbors, HnswMultiBestNeighbors>;
+
+TYPED_TEST_SUITE(HnswBestNeighborsTest, HnswBestNeighborsTypedTestTypes);
+
+TYPED_TEST(HnswBestNeighborsTest, k_limit_is_enforced)
+{
+ this->assert_neighbors({}, 0, 40.0);
+ this->assert_neighbors({{1, 1.0}}, 1, 40.0);
+ this->assert_neighbors({{1, 1.0}, {2, 5.0}}, 2, 40.0);
+ this->assert_neighbors({{1, 1.0}, {2, 5.0}, {3, 7.0}}, 3, 40.0);
+}
+
+TYPED_TEST(HnswBestNeighborsTest, distance_limit_is_enforced)
+{
+ this->assert_neighbors({}, 40, 0.5);
+ this->assert_neighbors({{1, 1.0}}, 40, 1.0);
+ this->assert_neighbors({{1, 1.0}, {2, 5.0}}, 40, 5.0);
+ this->assert_neighbors({{1, 1.0}, {2, 5.0}, {3, 7.0}}, 40, 7.0);
+}
+
+using HnswSingleBestNeighborsTest = HnswBestNeighborsTest<HnswSingleBestNeighbors>;
+
+TEST_F(HnswSingleBestNeighborsTest, duplicate_docids_are_not_elimiated)
+{
+ EXPECT_EQ(4, size());
+ assert_neighbors({{1, 1.0}, {2, 5.0}, {2, 10.0}, {3, 7.0}}, 40, 40.0);
+}
+
+using HnswMultiBestNeighborsTest = HnswBestNeighborsTest<HnswMultiBestNeighbors>;
+
+TEST_F(HnswMultiBestNeighborsTest, duplicate_docids_are_eliminated)
+{
+ EXPECT_EQ(3, size());
+ assert_neighbors({{1, 1.0}, {2, 5.0}, {3, 7.0}}, 40, 40.0);
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index cd405f54b2e..a00a50f32c8 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -19,7 +19,9 @@ vespa_add_library(searchlib_tensor OBJECT
hnsw_graph.cpp
hnsw_index.cpp
hnsw_index_saver.cpp
+ hnsw_multi_best_neighbors.cpp
hnsw_nodeid_mapping.cpp
+ hnsw_single_best_neighbors.cpp
imported_tensor_attribute_vector.cpp
imported_tensor_attribute_vector_read_guard.cpp
inner_product_distance.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h b/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h
index 4706f9cf7e8..3fcb1f09eeb 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_graph.h
@@ -65,6 +65,10 @@ struct HnswGraph {
return node_refs.get_elem_ref(nodeid).ref().load_relaxed(); // Called from writer only
}
+ const NodeType& acquire_node_refs_elem_ref(uint32_t nodeid) const {
+ return node_refs.acquire_elem_ref(nodeid);
+ }
+
NodeRef acquire_node_ref(uint32_t nodeid) const {
return node_refs.acquire_elem_ref(nodeid).ref().load_acquire();
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index b293b001bcf..d6ad501391a 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -88,7 +88,7 @@ HnswIndex<type>::max_links_for_level(uint32_t level) const
template <HnswIndexType type>
bool
-HnswIndex<type>::have_closer_distance(HnswCandidate candidate, const HnswCandidateVector& result) const
+HnswIndex<type>::have_closer_distance(HnswTraversalCandidate candidate, const HnswTraversalCandidateVector& result) const
{
for (const auto & neighbor : result) {
double dist = calc_distance(candidate.nodeid, neighbor.nodeid);
@@ -100,10 +100,11 @@ HnswIndex<type>::have_closer_distance(HnswCandidate candidate, const HnswCandida
}
template <HnswIndexType type>
+template <typename HnswCandidateVectorT>
typename HnswIndex<type>::SelectResult
-HnswIndex<type>::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const
+HnswIndex<type>::select_neighbors_simple(const HnswCandidateVectorT& neighbors, uint32_t max_links) const
{
- HnswCandidateVector sorted(neighbors);
+ HnswCandidateVectorT sorted(neighbors);
std::sort(sorted.begin(), sorted.end(), LesserDistance());
SelectResult result;
for (const auto & candidate : sorted) {
@@ -117,8 +118,9 @@ HnswIndex<type>::select_neighbors_simple(const HnswCandidateVector& neighbors, u
}
template <HnswIndexType type>
+template <typename HnswCandidateVectorT>
typename HnswIndex<type>::SelectResult
-HnswIndex<type>::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const
+HnswIndex<type>::select_neighbors_heuristic(const HnswCandidateVectorT& neighbors, uint32_t max_links) const
{
SelectResult result;
NearestPriQ nearest;
@@ -145,8 +147,9 @@ HnswIndex<type>::select_neighbors_heuristic(const HnswCandidateVector& neighbors
}
template <HnswIndexType type>
+template <typename HnswCandidateVectorT>
typename HnswIndex<type>::SelectResult
-HnswIndex<type>::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const
+HnswIndex<type>::select_neighbors(const HnswCandidateVectorT& neighbors, uint32_t max_links) const
{
if (_cfg.heuristic_select_neighbors()) {
return select_neighbors_heuristic(neighbors, max_links);
@@ -162,7 +165,7 @@ HnswIndex<type>::shrink_if_needed(uint32_t nodeid, uint32_t level)
auto old_links = _graph.get_link_array(nodeid, level);
uint32_t max_links = max_links_for_level(level);
if (old_links.size() > max_links) {
- HnswCandidateVector neighbors;
+ HnswTraversalCandidateVector neighbors;
neighbors.reserve(old_links.size());
for (uint32_t neighbor_nodeid : old_links) {
double dist = calc_distance(nodeid, neighbor_nodeid);
@@ -226,6 +229,14 @@ HnswIndex<type>::calc_distance(const TypedCells& lhs, uint32_t rhs_nodeid) const
}
template <HnswIndexType type>
+double
+HnswIndex<type>::calc_distance(const TypedCells& lhs, uint32_t rhs_docid, uint32_t rhs_subspace) const
+{
+ auto rhs = get_vector(rhs_docid, rhs_subspace);
+ return _distance_func->calc(lhs, rhs);
+}
+
+template <HnswIndexType type>
uint32_t
HnswIndex<type>::estimate_visited_nodes(uint32_t level, uint32_t nodeid_limit, uint32_t neighbors_to_find, const GlobalFilter* filter) const
{
@@ -258,12 +269,15 @@ HnswIndex<type>::find_nearest_in_layer(const TypedCells& input, const HnswCandid
while (keep_searching) {
keep_searching = false;
for (uint32_t neighbor_nodeid : _graph.get_link_array(nearest.node_ref, level)) {
- auto neighbor_ref = _graph.acquire_node_ref(neighbor_nodeid);
- double dist = calc_distance(input, neighbor_nodeid);
+ auto& neighbor_node = _graph.acquire_node_refs_elem_ref(neighbor_nodeid);
+ auto neighbor_ref = neighbor_node.ref().load_acquire();
+ uint32_t neighbor_docid = acquire_docid(neighbor_node, neighbor_nodeid);
+ uint32_t neighbor_subspace = neighbor_node.acquire_subspace();
+ double dist = calc_distance(input, neighbor_docid, neighbor_subspace);
if (_graph.still_valid(neighbor_nodeid, neighbor_ref)
&& dist < nearest.distance)
{
- nearest = HnswCandidate(neighbor_nodeid, neighbor_ref, dist);
+ nearest = HnswCandidate(neighbor_nodeid, neighbor_docid, neighbor_ref, dist);
keep_searching = true;
}
}
@@ -272,10 +286,10 @@ HnswIndex<type>::find_nearest_in_layer(const TypedCells& input, const HnswCandid
}
template <HnswIndexType type>
-template <class VisitedTracker>
+template <class VisitedTracker, class BestNeighbors>
void
HnswIndex<type>::search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find,
- FurthestPriQ& best_neighbors, uint32_t level, const GlobalFilter *filter,
+ BestNeighbors& best_neighbors, uint32_t level, const GlobalFilter *filter,
uint32_t nodeid_limit, uint32_t estimated_visited_nodes) const
{
NearestPriQ candidates;
@@ -287,7 +301,7 @@ HnswIndex<type>::search_layer_helper(const TypedCells& input, uint32_t neighbors
candidates.push(entry);
visited.mark(entry.nodeid);
if (filter && !filter->check(entry.nodeid)) {
- assert(best_neighbors.size() == 1);
+ assert(best_neighbors.peek().size() == 1);
best_neighbors.pop();
}
}
@@ -303,18 +317,21 @@ HnswIndex<type>::search_layer_helper(const TypedCells& input, uint32_t neighbors
if (neighbor_nodeid >= nodeid_limit) {
continue;
}
- auto neighbor_ref = _graph.acquire_node_ref(neighbor_nodeid);
+ auto& neighbor_node = _graph.acquire_node_refs_elem_ref(neighbor_nodeid);
+ auto neighbor_ref = neighbor_node.ref().load_acquire();
if ((! neighbor_ref.valid())
|| ! visited.try_mark(neighbor_nodeid))
{
continue;
}
- double dist_to_input = calc_distance(input, neighbor_nodeid);
+ uint32_t neighbor_docid = acquire_docid(neighbor_node, neighbor_nodeid);
+ uint32_t neighbor_subspace = neighbor_node.acquire_subspace();
+ double dist_to_input = calc_distance(input, neighbor_docid, neighbor_subspace);
if (dist_to_input < limit_dist) {
candidates.emplace(neighbor_nodeid, neighbor_ref, dist_to_input);
if ((!filter) || filter->check(neighbor_nodeid)) {
- best_neighbors.emplace(neighbor_nodeid, neighbor_ref, dist_to_input);
- if (best_neighbors.size() > neighbors_to_find) {
+ best_neighbors.emplace(neighbor_nodeid, neighbor_docid, neighbor_ref, dist_to_input);
+ while (best_neighbors.size() > neighbors_to_find) {
best_neighbors.pop();
limit_dist = best_neighbors.top().distance;
}
@@ -325,9 +342,10 @@ HnswIndex<type>::search_layer_helper(const TypedCells& input, uint32_t neighbors
}
template <HnswIndexType type>
+template <class BestNeighbors>
void
HnswIndex<type>::search_layer(const TypedCells& input, uint32_t neighbors_to_find,
- FurthestPriQ& best_neighbors, uint32_t level, const GlobalFilter *filter) const
+ BestNeighbors& best_neighbors, uint32_t level, const GlobalFilter *filter) const
{
uint32_t nodeid_limit = _graph.node_refs_size.load(std::memory_order_acquire);
if (filter) {
@@ -404,8 +422,9 @@ HnswIndex<type>::internal_prepare_add_node(typename HnswIndex::PreparedAddDoc& o
}
int search_level = entry.level;
double entry_dist = calc_distance(input_vector, entry.nodeid);
+ uint32_t entry_docid = get_docid(entry.nodeid);
// TODO: check if entry nodeid/node_ref is still valid here
- HnswCandidate entry_point(entry.nodeid, entry.node_ref, entry_dist);
+ HnswCandidate entry_point(entry.nodeid, entry_docid, entry.node_ref, entry_dist);
while (search_level > node_max_level) {
entry_point = find_nearest_in_layer(input_vector, entry_point, search_level);
--search_level;
@@ -791,16 +810,8 @@ HnswIndex<type>::top_k_by_docid(uint32_t k, TypedCells vector,
const GlobalFilter *filter, uint32_t explore_k,
double distance_threshold) const
{
- std::vector<Neighbor> result;
- FurthestPriQ candidates = top_k_candidates(vector, std::max(k, explore_k), filter);
- while (candidates.size() > k) {
- candidates.pop();
- }
- result.reserve(candidates.size());
- for (const HnswCandidate & hit : candidates.peek()) {
- if (hit.distance > distance_threshold) continue;
- result.emplace_back(get_docid(hit.nodeid), hit.distance);
- }
+ SearchBestNeighbors candidates = top_k_candidates(vector, std::max(k, explore_k), filter);
+ auto result = candidates.get_neighbors(k, distance_threshold);
std::sort(result.begin(), result.end(), NeighborsByDocId());
return result;
}
@@ -823,10 +834,10 @@ HnswIndex<type>::find_top_k_with_filter(uint32_t k, TypedCells vector,
}
template <HnswIndexType type>
-FurthestPriQ
+typename HnswIndex<type>::SearchBestNeighbors
HnswIndex<type>::top_k_candidates(const TypedCells &vector, uint32_t k, const GlobalFilter *filter) const
{
- FurthestPriQ best_neighbors;
+ SearchBestNeighbors best_neighbors;
auto entry = _graph.get_entry_node();
if (entry.nodeid == 0) {
// graph has no entry point
@@ -834,8 +845,9 @@ HnswIndex<type>::top_k_candidates(const TypedCells &vector, uint32_t k, const Gl
}
int search_level = entry.level;
double entry_dist = calc_distance(vector, entry.nodeid);
+ uint32_t entry_docid = get_docid(entry.nodeid);
// TODO: check if entry docid/node_ref is still valid here
- HnswCandidate entry_point(entry.nodeid, entry.node_ref, entry_dist);
+ HnswCandidate entry_point(entry.nodeid, entry_docid, entry.node_ref, entry_dist);
while (search_level > 0) {
entry_point = find_nearest_in_layer(vector, entry_point, search_level);
--search_level;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index bf38dc01f37..32058e9ac44 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -7,7 +7,9 @@
#include "doc_vector_access.h"
#include "hnsw_identity_mapping.h"
#include "hnsw_index_utils.h"
+#include "hnsw_multi_best_neighbors.h"
#include "hnsw_nodeid_mapping.h"
+#include "hnsw_single_best_neighbors.h"
#include "hnsw_test_node.h"
#include "nearest_neighbor_index.h"
#include "random_level_generator.h"
@@ -68,6 +70,7 @@ public:
}
static constexpr HnswIndexType index_type = type;
+ using SearchBestNeighbors = typename HnswIndexTraits<type>::SearchBestNeighbors;
using IdMapping = typename HnswIndexTraits<type>::IdMapping;
protected:
using GraphType = HnswGraph<type>;
@@ -83,6 +86,14 @@ protected:
using TypedCells = vespalib::eval::TypedCells;
+ static uint32_t acquire_docid(const NodeType& node, uint32_t nodeid) {
+ if constexpr (NodeType::identity_mapping) {
+ return nodeid;
+ } else {
+ return node.acquire_docid();
+ }
+ }
+
GraphType _graph;
const DocVectorAccess& _vectors;
DistanceFunction::UP _distance_func;
@@ -105,15 +116,18 @@ protected:
* where the candidate is located.
* Used by select_neighbors_heuristic().
*/
- bool have_closer_distance(HnswCandidate candidate, const HnswCandidateVector& curr_result) const;
+ bool have_closer_distance(HnswTraversalCandidate candidate, const HnswTraversalCandidateVector& curr_result) const;
struct SelectResult {
- HnswCandidateVector used;
+ HnswTraversalCandidateVector used;
LinkArray unused;
~SelectResult() {}
};
- SelectResult select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const;
- SelectResult select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const;
- SelectResult select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const;
+ template <typename HnswCandidateVectorT>
+ SelectResult select_neighbors_heuristic(const HnswCandidateVectorT& neighbors, uint32_t max_links) const;
+ template <typename HnswCandidateVectorT>
+ SelectResult select_neighbors_simple(const HnswCandidateVectorT& neighbors, uint32_t max_links) const;
+ template <typename HnswCandidateVectorT>
+ SelectResult select_neighbors(const HnswCandidateVectorT& neighbors, uint32_t max_links) const;
void shrink_if_needed(uint32_t nodeid, uint32_t level);
void connect_new_node(uint32_t nodeid, const LinkArrayRef &neighbors, uint32_t level);
void mutual_reconnect(const LinkArrayRef &cluster, uint32_t level);
@@ -129,24 +143,29 @@ protected:
return _vectors.get_vector(docid, subspace);
}
}
+ inline TypedCells get_vector(uint32_t docid, uint32_t subspace) const {
+ return _vectors.get_vector(docid, subspace);
+ }
inline VectorBundle get_vector_by_docid(uint32_t docid) const {
return _vectors.get_vectors(docid);
}
double calc_distance(uint32_t lhs_nodeid, uint32_t rhs_nodeid) const;
double calc_distance(const TypedCells& lhs, uint32_t rhs_nodeid) const;
+ double calc_distance(const TypedCells& lhs, uint32_t rhs_docid, uint32_t rhs_subspace) const;
uint32_t estimate_visited_nodes(uint32_t level, uint32_t nodeid_limit, uint32_t neighbors_to_find, const GlobalFilter* filter) const;
/**
* Performs a greedy search in the given layer to find the candidate that is nearest the input vector.
*/
HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) const;
- template <class VisitedTracker>
- void search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors,
+ template <class VisitedTracker, class BestNeighbors>
+ void search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find, BestNeighbors& best_neighbors,
uint32_t level, const GlobalFilter *filter,
uint32_t nodeid_limit,
uint32_t estimated_visited_nodes) const;
- void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors,
+ template <class BestNeighbors>
+ void search_layer(const TypedCells& input, uint32_t neighbors_to_find, BestNeighbors& best_neighbors,
uint32_t level, const GlobalFilter *filter = nullptr) const;
std::vector<Neighbor> top_k_by_docid(uint32_t k, TypedCells vector,
const GlobalFilter *filter, uint32_t explore_k,
@@ -227,7 +246,7 @@ public:
double distance_threshold) const override;
const DistanceFunction *distance_function() const override { return _distance_func.get(); }
- FurthestPriQ top_k_candidates(const TypedCells &vector, uint32_t k, const GlobalFilter *filter) const;
+ SearchBestNeighbors top_k_candidates(const TypedCells &vector, uint32_t k, const GlobalFilter *filter) const;
uint32_t get_entry_nodeid() const { return _graph.get_entry_node().nodeid; }
int32_t get_entry_level() const { return _graph.get_entry_node().level; }
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h
index 397c7c98555..ec5672c4c9d 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_traits.h
@@ -9,7 +9,9 @@ namespace search::tensor {
class HnswSimpleNode;
class HnswNode;
class HnswIdentityMapping;
+class HnswMultiBestNeighbors;
class HnswNodeidMapping;
+class HnswSingleBestNeighbors;
/*
* Class that selects what node type and id mapping to use based on
@@ -30,6 +32,7 @@ class HnswIndexTraits<HnswIndexType::SINGLE>
public:
using NodeType = HnswSimpleNode;
using IdMapping = HnswIdentityMapping;
+ using SearchBestNeighbors = HnswSingleBestNeighbors;
};
/*
@@ -44,6 +47,7 @@ class HnswIndexTraits<HnswIndexType::MULTI>
public:
using NodeType = HnswNode;
using IdMapping = HnswNodeidMapping;
+ using SearchBestNeighbors = HnswMultiBestNeighbors;
};
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h
index 670772a5055..a88b805f198 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_utils.h
@@ -10,36 +10,58 @@
namespace search::tensor {
/**
- * Represents a candidate node with its distance to another point in space.
+ * Represents a travesal candidate node with its distance to another
+ * point in space.
*/
-struct HnswCandidate {
+struct HnswTraversalCandidate {
uint32_t nodeid;
vespalib::datastore::EntryRef node_ref;
double distance;
- HnswCandidate(uint32_t nodeid_in, double distance_in) noexcept
+ HnswTraversalCandidate(uint32_t nodeid_in, double distance_in) noexcept
: nodeid(nodeid_in), node_ref(), distance(distance_in) {}
- HnswCandidate(uint32_t nodeid_in, vespalib::datastore::EntryRef node_ref_in, double distance_in) noexcept
+ HnswTraversalCandidate(uint32_t nodeid_in, vespalib::datastore::EntryRef node_ref_in, double distance_in) noexcept
: nodeid(nodeid_in), node_ref(node_ref_in), distance(distance_in) {}
+ HnswTraversalCandidate(uint32_t nodeid_in, uint32_t docid_in, vespalib::datastore::EntryRef node_ref_in, double distance_in) noexcept
+ : nodeid(nodeid_in), node_ref(node_ref_in), distance(distance_in)
+ {
+ (void) docid_in;
+ }
+};
+
+/**
+ * Represents a neighbor candidate node with its distance to another
+ * point in space.
+ */
+struct HnswCandidate : public HnswTraversalCandidate {
+ uint32_t docid;
+
+ HnswCandidate(uint32_t nodeid_in, uint32_t docid_in, vespalib::datastore::EntryRef node_ref_in, double distance_in) noexcept
+ : HnswTraversalCandidate(nodeid_in, docid_in, node_ref_in, distance_in),
+ docid(docid_in)
+ {
+ }
};
struct GreaterDistance {
- bool operator() (const HnswCandidate& lhs, const HnswCandidate& rhs) const {
+ bool operator() (const HnswTraversalCandidate& lhs, const HnswTraversalCandidate& rhs) const {
return (rhs.distance < lhs.distance);
}
};
struct LesserDistance {
- bool operator() (const HnswCandidate& lhs, const HnswCandidate& rhs) const {
+ bool operator() (const HnswTraversalCandidate& lhs, const HnswTraversalCandidate& rhs) const {
return (lhs.distance < rhs.distance);
}
};
+using HnswTraversalCandidateVector = std::vector<HnswTraversalCandidate>;
+
using HnswCandidateVector = std::vector<HnswCandidate>;
/**
* Priority queue that keeps the candidate node that is nearest a point in space on top.
*/
-using NearestPriQ = std::priority_queue<HnswCandidate, HnswCandidateVector, GreaterDistance>;
+using NearestPriQ = std::priority_queue<HnswTraversalCandidate, HnswTraversalCandidateVector, GreaterDistance>;
/**
* Priority queue that keeps the candidate node that is furthest away a point in space on top.
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.cpp
new file mode 100644
index 00000000000..86cfc743247
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.cpp
@@ -0,0 +1,28 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "hnsw_multi_best_neighbors.h"
+
+namespace search::tensor {
+
+HnswMultiBestNeighbors::~HnswMultiBestNeighbors() = default;
+
+std::vector<NearestNeighborIndex::Neighbor>
+HnswMultiBestNeighbors::get_neighbors(uint32_t k, double distance_threshold)
+{
+ while (_docids.size() > k) {
+ pop();
+ }
+ std::vector<NearestNeighborIndex::Neighbor> result;
+ result.reserve(_docids.size());
+ while (!_candidates.empty()) {
+ auto& hit = _candidates.top();
+ uint32_t docid = hit.docid;
+ if (remove_docid(docid) && (!(hit.distance > distance_threshold))) {
+ result.emplace_back(docid, hit.distance);
+ }
+ _candidates.pop();
+ }
+ return result;
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.h b/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.h
new file mode 100644
index 00000000000..f3f5cf5690a
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_multi_best_neighbors.h
@@ -0,0 +1,70 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "hnsw_index_utils.h"
+#include "nearest_neighbor_index.h"
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <cassert>
+
+namespace search::tensor {
+
+/*
+ * A priority queue of best neighbors for hnsw index. Used for search
+ * when hnsw index has multiple nodes per document.
+ */
+class HnswMultiBestNeighbors {
+ using EntryRef = vespalib::datastore::EntryRef;
+ FurthestPriQ _candidates;
+ vespalib::hash_map<uint32_t, uint32_t> _docids;
+
+ void add_docid(uint32_t docid)
+ {
+ auto insres = _docids.insert(std::make_pair(docid, 1));
+ if (!insres.second) {
+ ++insres.first->second;
+ }
+ }
+
+ bool remove_docid(uint32_t docid)
+ {
+ auto itr = _docids.find(docid);
+ assert(itr != _docids.end());
+ if (itr->second > 1) {
+ --itr->second;
+ return false;
+ } else {
+ _docids.erase(docid);
+ return true;
+ }
+ }
+public:
+ HnswMultiBestNeighbors()
+ : _candidates(),
+ _docids()
+ {
+ }
+ ~HnswMultiBestNeighbors();
+
+ std::vector<NearestNeighborIndex::Neighbor> get_neighbors(uint32_t k, double distance_threshold);
+
+ void push(const HnswCandidate& candidate) {
+ add_docid(candidate.docid);
+ _candidates.push(candidate);
+ }
+ void pop() {
+ assert(!_candidates.empty());
+ uint32_t docid = _candidates.top().docid;
+ remove_docid(docid);
+ _candidates.pop();
+ }
+ const HnswCandidateVector& peek() const { return _candidates.peek(); }
+ const HnswCandidate& top() const { return _candidates.top(); }
+ size_t size() const { return _docids.size(); }
+ void emplace(uint32_t nodeid, uint32_t docid, EntryRef ref, double distance) {
+ add_docid(docid);
+ _candidates.emplace(nodeid, docid, ref, distance);
+ }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.cpp
new file mode 100644
index 00000000000..1b2f62a144f
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.cpp
@@ -0,0 +1,22 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "hnsw_single_best_neighbors.h"
+
+namespace search::tensor {
+
+std::vector<NearestNeighborIndex::Neighbor>
+HnswSingleBestNeighbors::get_neighbors(uint32_t k, double distance_threshold)
+{
+ while (_candidates.size() > k) {
+ _candidates.pop();
+ }
+ std::vector<NearestNeighborIndex::Neighbor> result;
+ result.reserve(_candidates.size());
+ for (const HnswCandidate & hit : _candidates.peek()) {
+ if (hit.distance > distance_threshold) continue;
+ result.emplace_back(hit.docid, hit.distance);
+ }
+ return result;
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.h b/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.h
new file mode 100644
index 00000000000..e7c0a7fded6
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_single_best_neighbors.h
@@ -0,0 +1,33 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "hnsw_index_utils.h"
+#include "nearest_neighbor_index.h"
+
+namespace search::tensor {
+
+/*
+ * A priority queue of best neighbors for hnsw index. Used for search
+ * when hnsw index has a single node per document.
+ */
+class HnswSingleBestNeighbors {
+ using EntryRef = vespalib::datastore::EntryRef;
+ FurthestPriQ _candidates;
+public:
+ HnswSingleBestNeighbors()
+ : _candidates()
+ {
+ }
+ ~HnswSingleBestNeighbors() = default;
+
+ std::vector<NearestNeighborIndex::Neighbor> get_neighbors(uint32_t k, double distance_threshold);
+ void push(const HnswCandidate& candidate) { _candidates.push(candidate); }
+ void pop() { _candidates.pop(); }
+ const HnswCandidateVector& peek() const { return _candidates.peek(); }
+ const HnswCandidate& top() const { return _candidates.top(); }
+ size_t size() const { return _candidates.size(); }
+ void emplace(uint32_t nodeid, uint32_t docid, EntryRef ref, double distance) { _candidates.emplace(nodeid, docid, ref, distance); }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
index de1ea26d7bf..3f6c9b82a65 100644
--- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
@@ -45,6 +45,9 @@ public:
: docid(id), distance(dist)
{}
Neighbor() noexcept : docid(0), distance(0.0) {}
+ bool operator==(const Neighbor& rhs) const {
+ return docid == rhs.docid && distance == rhs.distance;
+ }
};
virtual ~NearestNeighborIndex() = default;
virtual void add_document(uint32_t docid) = 0;