summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2021-08-31 14:51:28 +0200
committerGitHub <noreply@github.com>2021-08-31 14:51:28 +0200
commitfc6e87c3d849b2222ab3b2817d050d01fe497e9d (patch)
treec560c8d3dfc2f6c3efdaa5c7732b52dc0f4285c0 /searchlib
parent0d6692e624dc031a5769ac6a832930d06dc68bcb (diff)
parent17d3124f86f298e7c91003d54c9c664fba29afab (diff)
Merge pull request #18904 from vespa-engine/toregge/add-alternate-visited-nodes-trackers-for-hnsw-index
Prepare for alternate visited nodes trackers for HNSW index.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/vespa/searchlib/tensor/CMakeLists.txt3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.cpp14
-rw-r--r--searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.h32
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.cpp14
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.h27
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp63
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h7
-rw-r--r--searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.cpp15
-rw-r--r--searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.h31
9 files changed, 197 insertions, 9 deletions
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index 46bafa189e1..e10db433a58 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -2,6 +2,7 @@
vespa_add_library(searchlib_tensor OBJECT
SOURCES
angular_distance.cpp
+ bitvector_visited_tracker.cpp
default_nearest_neighbor_index_factory.cpp
dense_tensor_attribute.cpp
dense_tensor_attribute_saver.cpp
@@ -13,6 +14,7 @@ vespa_add_library(searchlib_tensor OBJECT
euclidean_distance.cpp
geo_degrees_distance.cpp
hamming_distance.cpp
+ hash_set_visited_tracker.cpp
hnsw_graph.cpp
hnsw_index.cpp
hnsw_index_loader.cpp
@@ -29,5 +31,6 @@ vespa_add_library(searchlib_tensor OBJECT
tensor_attribute.cpp
tensor_deserialize.cpp
tensor_store.cpp
+ reusable_set_visited_tracker.cpp
DEPENDS
)
diff --git a/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.cpp b/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.cpp
new file mode 100644
index 00000000000..fe5ae0cd6d0
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.cpp
@@ -0,0 +1,14 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "bitvector_visited_tracker.h"
+
+namespace search::tensor {
+
+BitVectorVisitedTracker::BitVectorVisitedTracker(const HnswIndex&, uint32_t doc_id_limit, uint32_t)
+ : _visited(doc_id_limit)
+{
+}
+
+BitVectorVisitedTracker::~BitVectorVisitedTracker() = default;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.h b/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.h
new file mode 100644
index 00000000000..0a588744075
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.h
@@ -0,0 +1,32 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchlib/common/allocatedbitvector.h>
+
+namespace search::tensor {
+
+class HnswIndex;
+
+/*
+ * Tracker for visited nodes based on search::AllocatedBitVector. Best when
+ * many nodes are visited.
+ */
+class BitVectorVisitedTracker
+{
+ search::AllocatedBitVector _visited;
+public:
+ BitVectorVisitedTracker(const HnswIndex&, uint32_t doc_id_limit, uint32_t);
+ ~BitVectorVisitedTracker();
+ void mark(uint32_t doc_id) { _visited.setBit(doc_id); }
+ bool try_mark(uint32_t doc_id) {
+ if (_visited.testBit(doc_id)) {
+ return false;
+ } else {
+ _visited.setBit(doc_id);
+ return true;
+ }
+ }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.cpp b/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.cpp
new file mode 100644
index 00000000000..e74c6088a7c
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.cpp
@@ -0,0 +1,14 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "hash_set_visited_tracker.h"
+
+namespace search::tensor {
+
+HashSetVisitedTracker::HashSetVisitedTracker(const HnswIndex&, uint32_t, uint32_t estimated_visited_nodes)
+ : _visited(estimated_visited_nodes)
+{
+}
+
+HashSetVisitedTracker::~HashSetVisitedTracker() = default;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.h b/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.h
new file mode 100644
index 00000000000..369473b6dfa
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.h
@@ -0,0 +1,27 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/hash_set.h>
+
+namespace search::tensor {
+
+class HnswIndex;
+
+/*
+ * Tracker for visited nodes based on vespalib::hash_set<uint32_t>. Best when
+ * only a small portion of the nodes are visited.
+ */
+class HashSetVisitedTracker
+{
+ vespalib::hash_set<uint32_t> _visited;
+public:
+ HashSetVisitedTracker(const HnswIndex&, uint32_t, uint32_t estimated_visited_nodes);
+ ~HashSetVisitedTracker();
+ void mark(uint32_t doc_id) { _visited.insert(doc_id); }
+ bool try_mark(uint32_t doc_id) {
+ return _visited.insert(doc_id).second;
+ }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 51c12d8af6d..a6b0a16b524 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -5,6 +5,9 @@
#include "hnsw_index_loader.h"
#include "hnsw_index_saver.h"
#include "random_level_generator.h"
+#include "bitvector_visited_tracker.h"
+#include "hash_set_visited_tracker.h"
+#include "reusable_set_visited_tracker.h"
#include <vespa/searchcommon/common/compaction_strategy.h>
#include <vespa/searchlib/attribute/address_space_components.h>
#include <vespa/searchlib/attribute/address_space_usage.h>
@@ -20,6 +23,8 @@
LOG_SETUP(".searchlib.tensor.hnsw_index");
+#define USE_OLD_VISITED_TRACKER 1
+
namespace search::tensor {
using search::AddressSpaceComponents;
@@ -206,6 +211,29 @@ HnswIndex::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const
return _distance_func->calc(lhs, rhs);
}
+uint32_t
+HnswIndex::estimate_visited_nodes(uint32_t level, uint32_t doc_id_limit, uint32_t neighbors_to_find, const search::BitVector* filter) const
+{
+ uint32_t m_for_level = max_links_for_level(level);
+ uint64_t base_estimate = uint64_t(m_for_level) * neighbors_to_find + 100;
+ if (base_estimate >= doc_id_limit) {
+ return doc_id_limit;
+ }
+ if (!filter) {
+ return base_estimate;
+ }
+ uint32_t true_bits = filter->countTrueBits();
+ if (true_bits == 0) {
+ return doc_id_limit;
+ }
+ double scaler = double(filter->size()) / true_bits;
+ double scaled_estimate = scaler * base_estimate;
+ if (scaled_estimate >= doc_id_limit) {
+ return doc_id_limit;
+ }
+ return scaled_estimate;
+}
+
HnswCandidate
HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) const
{
@@ -227,16 +255,14 @@ HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& e
return nearest;
}
+template <class VisitedTracker>
void
-HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find,
- FurthestPriQ& best_neighbors, uint32_t level, const search::BitVector *filter) const
+HnswIndex::search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find,
+ FurthestPriQ& best_neighbors, uint32_t level, const search::BitVector *filter,
+ uint32_t doc_id_limit, uint32_t estimated_visited_nodes) const
{
NearestPriQ candidates;
- uint32_t doc_id_limit = _graph.node_refs_size.load(std::memory_order_acquire);
- if (filter) {
- doc_id_limit = std::min(filter->size(), doc_id_limit);
- }
- auto visited = _visited_set_pool.get(doc_id_limit);
+ VisitedTracker visited(*this, doc_id_limit, estimated_visited_nodes);
for (const auto &entry : best_neighbors.peek()) {
if (entry.docid >= doc_id_limit) {
continue;
@@ -262,11 +288,10 @@ HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find,
}
auto neighbor_ref = _graph.get_node_ref(neighbor_docid);
if ((! neighbor_ref.valid())
- || visited.is_marked(neighbor_docid))
+ || ! visited.try_mark(neighbor_docid))
{
continue;
}
- visited.mark(neighbor_docid);
double dist_to_input = calc_distance(input, neighbor_docid);
if (dist_to_input < limit_dist) {
candidates.emplace(neighbor_docid, neighbor_ref, dist_to_input);
@@ -282,6 +307,26 @@ HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find,
}
}
+void
+HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find,
+ FurthestPriQ& best_neighbors, uint32_t level, const search::BitVector *filter) const
+{
+ uint32_t doc_id_limit = _graph.node_refs_size.load(std::memory_order_acquire);
+ if (filter) {
+ doc_id_limit = std::min(filter->size(), doc_id_limit);
+ }
+ uint32_t estimated_visited_nodes = estimate_visited_nodes(level, doc_id_limit, neighbors_to_find, filter);
+#if ! USE_OLD_VISITED_TRACKER
+ if (estimated_visited_nodes >= doc_id_limit / 64) {
+ search_layer_helper<BitVectorVisitedTracker>(input, neighbors_to_find, best_neighbors, level, filter, doc_id_limit, estimated_visited_nodes);
+ } else {
+ search_layer_helper<HashSetVisitedTracker>(input, neighbors_to_find, best_neighbors, level, filter, doc_id_limit, estimated_visited_nodes);
+ }
+#else
+ search_layer_helper<ReusableSetVisitedTracker>(input, neighbors_to_find, best_neighbors, level, filter, doc_id_limit, estimated_visited_nodes);
+#endif
+}
+
HnswIndex::HnswIndex(const DocVectorAccess& vectors, DistanceFunction::UP distance_func,
RandomLevelGenerator::UP level_generator, const Config& cfg)
: _graph(),
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index eb874193ae9..4503459a88a 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -119,11 +119,17 @@ protected:
double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const;
double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const;
+ uint32_t estimate_visited_nodes(uint32_t level, uint32_t doc_id_limit, uint32_t neighbors_to_find, const search::BitVector* filter) const;
/**
* Performs a greedy search in the given layer to find the candidate that is nearest the input vector.
*/
HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) const;
+ template <class VisitedTracker>
+ void search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors,
+ uint32_t level, const search::BitVector *filter,
+ uint32_t doc_id_limit,
+ uint32_t estimated_visited_nodes) const;
void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors,
uint32_t level, const search::BitVector *filter = nullptr) const;
std::vector<Neighbor> top_k_by_docid(uint32_t k, TypedCells vector,
@@ -197,6 +203,7 @@ public:
bool check_link_symmetry() const;
std::pair<uint32_t, bool> count_reachable_nodes() const;
HnswGraph& get_graph() { return _graph; }
+ vespalib::ReusableSetPool& get_visited_set_pool() const noexcept { return _visited_set_pool; }
static vespalib::datastore::ArrayStoreConfig make_default_node_store_config();
static vespalib::datastore::ArrayStoreConfig make_default_link_store_config();
diff --git a/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.cpp b/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.cpp
new file mode 100644
index 00000000000..e64cb485aa5
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.cpp
@@ -0,0 +1,15 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "reusable_set_visited_tracker.h"
+#include "hnsw_index.h"
+
+namespace search::tensor {
+
+ReusableSetVisitedTracker::ReusableSetVisitedTracker(const HnswIndex& index, uint32_t doc_id_limit, uint32_t)
+ : _visited(index.get_visited_set_pool().get(doc_id_limit))
+{
+}
+
+ReusableSetVisitedTracker::~ReusableSetVisitedTracker() = default;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.h b/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.h
new file mode 100644
index 00000000000..819d5dd7a15
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.h
@@ -0,0 +1,31 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/util/reusable_set_handle.h>
+
+namespace search::tensor {
+
+class HnswIndex;
+
+/*
+ * Tracker for visited nodes based on vespalib::ReusableSet.
+ */
+class ReusableSetVisitedTracker
+{
+ vespalib::ReusableSetHandle _visited;
+public:
+ ReusableSetVisitedTracker(const HnswIndex& index, uint32_t doc_id_limit, uint32_t);
+ ~ReusableSetVisitedTracker();
+ void mark(uint32_t doc_id) { _visited.mark(doc_id); }
+ bool try_mark(uint32_t doc_id) {
+ if (_visited.is_marked(doc_id)) {
+ return false;
+ } else {
+ _visited.mark(doc_id);
+ return true;
+ }
+ }
+};
+
+}