diff options
author | Geir Storli <geirst@verizonmedia.com> | 2021-08-31 14:51:28 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-08-31 14:51:28 +0200 |
commit | fc6e87c3d849b2222ab3b2817d050d01fe497e9d (patch) | |
tree | c560c8d3dfc2f6c3efdaa5c7732b52dc0f4285c0 /searchlib | |
parent | 0d6692e624dc031a5769ac6a832930d06dc68bcb (diff) | |
parent | 17d3124f86f298e7c91003d54c9c664fba29afab (diff) |
Merge pull request #18904 from vespa-engine/toregge/add-alternate-visited-nodes-trackers-for-hnsw-index
Prepare for alternate visited nodes trackers for HNSW index.
Diffstat (limited to 'searchlib')
9 files changed, 197 insertions, 9 deletions
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 46bafa189e1..e10db433a58 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -2,6 +2,7 @@ vespa_add_library(searchlib_tensor OBJECT SOURCES angular_distance.cpp + bitvector_visited_tracker.cpp default_nearest_neighbor_index_factory.cpp dense_tensor_attribute.cpp dense_tensor_attribute_saver.cpp @@ -13,6 +14,7 @@ vespa_add_library(searchlib_tensor OBJECT euclidean_distance.cpp geo_degrees_distance.cpp hamming_distance.cpp + hash_set_visited_tracker.cpp hnsw_graph.cpp hnsw_index.cpp hnsw_index_loader.cpp @@ -29,5 +31,6 @@ vespa_add_library(searchlib_tensor OBJECT tensor_attribute.cpp tensor_deserialize.cpp tensor_store.cpp + reusable_set_visited_tracker.cpp DEPENDS ) diff --git a/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.cpp b/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.cpp new file mode 100644 index 00000000000..fe5ae0cd6d0 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.cpp @@ -0,0 +1,14 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "bitvector_visited_tracker.h" + +namespace search::tensor { + +BitVectorVisitedTracker::BitVectorVisitedTracker(const HnswIndex&, uint32_t doc_id_limit, uint32_t) + : _visited(doc_id_limit) +{ +} + +BitVectorVisitedTracker::~BitVectorVisitedTracker() = default; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.h b/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.h new file mode 100644 index 00000000000..0a588744075 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/bitvector_visited_tracker.h @@ -0,0 +1,32 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/common/allocatedbitvector.h> + +namespace search::tensor { + +class HnswIndex; + +/* + * Tracker for visited nodes based on search::AllocatedBitVector. Best when + * many nodes are visited. + */ +class BitVectorVisitedTracker +{ + search::AllocatedBitVector _visited; +public: + BitVectorVisitedTracker(const HnswIndex&, uint32_t doc_id_limit, uint32_t); + ~BitVectorVisitedTracker(); + void mark(uint32_t doc_id) { _visited.setBit(doc_id); } + bool try_mark(uint32_t doc_id) { + if (_visited.testBit(doc_id)) { + return false; + } else { + _visited.setBit(doc_id); + return true; + } + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.cpp b/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.cpp new file mode 100644 index 00000000000..e74c6088a7c --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.cpp @@ -0,0 +1,14 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "hash_set_visited_tracker.h" + +namespace search::tensor { + +HashSetVisitedTracker::HashSetVisitedTracker(const HnswIndex&, uint32_t, uint32_t estimated_visited_nodes) + : _visited(estimated_visited_nodes) +{ +} + +HashSetVisitedTracker::~HashSetVisitedTracker() = default; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.h b/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.h new file mode 100644 index 00000000000..369473b6dfa --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hash_set_visited_tracker.h @@ -0,0 +1,27 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/hash_set.h> + +namespace search::tensor { + +class HnswIndex; + +/* + * Tracker for visited nodes based on vespalib::hash_set<uint32_t>. Best when + * only a small portion of the nodes are visited. + */ +class HashSetVisitedTracker +{ + vespalib::hash_set<uint32_t> _visited; +public: + HashSetVisitedTracker(const HnswIndex&, uint32_t, uint32_t estimated_visited_nodes); + ~HashSetVisitedTracker(); + void mark(uint32_t doc_id) { _visited.insert(doc_id); } + bool try_mark(uint32_t doc_id) { + return _visited.insert(doc_id).second; + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 51c12d8af6d..a6b0a16b524 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -5,6 +5,9 @@ #include "hnsw_index_loader.h" #include "hnsw_index_saver.h" #include "random_level_generator.h" +#include "bitvector_visited_tracker.h" +#include "hash_set_visited_tracker.h" +#include "reusable_set_visited_tracker.h" #include <vespa/searchcommon/common/compaction_strategy.h> #include <vespa/searchlib/attribute/address_space_components.h> #include <vespa/searchlib/attribute/address_space_usage.h> @@ -20,6 +23,8 @@ LOG_SETUP(".searchlib.tensor.hnsw_index"); +#define USE_OLD_VISITED_TRACKER 1 + namespace search::tensor { using search::AddressSpaceComponents; @@ -206,6 +211,29 @@ HnswIndex::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const return _distance_func->calc(lhs, rhs); } +uint32_t +HnswIndex::estimate_visited_nodes(uint32_t level, uint32_t doc_id_limit, uint32_t neighbors_to_find, const search::BitVector* filter) const +{ + uint32_t m_for_level = max_links_for_level(level); + uint64_t base_estimate = uint64_t(m_for_level) * neighbors_to_find + 100; + if (base_estimate >= doc_id_limit) { + return doc_id_limit; + } + if (!filter) { + return base_estimate; + } + uint32_t true_bits = filter->countTrueBits(); + if (true_bits == 0) { + return doc_id_limit; + } + double scaler = double(filter->size()) / true_bits; + double scaled_estimate = scaler * base_estimate; + if (scaled_estimate >= doc_id_limit) { + return doc_id_limit; + } + return scaled_estimate; +} + HnswCandidate HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) const { @@ -227,16 +255,14 @@ HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& e return nearest; } +template <class VisitedTracker> void -HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, - FurthestPriQ& best_neighbors, uint32_t level, const search::BitVector *filter) const +HnswIndex::search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find, + FurthestPriQ& best_neighbors, uint32_t level, const search::BitVector *filter, + uint32_t doc_id_limit, uint32_t estimated_visited_nodes) const { NearestPriQ candidates; - uint32_t doc_id_limit = _graph.node_refs_size.load(std::memory_order_acquire); - if (filter) { - doc_id_limit = std::min(filter->size(), doc_id_limit); - } - auto visited = _visited_set_pool.get(doc_id_limit); + VisitedTracker visited(*this, doc_id_limit, estimated_visited_nodes); for (const auto &entry : best_neighbors.peek()) { if (entry.docid >= doc_id_limit) { continue; @@ -262,11 +288,10 @@ HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, } auto neighbor_ref = _graph.get_node_ref(neighbor_docid); if ((! neighbor_ref.valid()) - || visited.is_marked(neighbor_docid)) + || ! visited.try_mark(neighbor_docid)) { continue; } - visited.mark(neighbor_docid); double dist_to_input = calc_distance(input, neighbor_docid); if (dist_to_input < limit_dist) { candidates.emplace(neighbor_docid, neighbor_ref, dist_to_input); @@ -282,6 +307,26 @@ HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, } } +void +HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, + FurthestPriQ& best_neighbors, uint32_t level, const search::BitVector *filter) const +{ + uint32_t doc_id_limit = _graph.node_refs_size.load(std::memory_order_acquire); + if (filter) { + doc_id_limit = std::min(filter->size(), doc_id_limit); + } + uint32_t estimated_visited_nodes = estimate_visited_nodes(level, doc_id_limit, neighbors_to_find, filter); +#if ! USE_OLD_VISITED_TRACKER + if (estimated_visited_nodes >= doc_id_limit / 64) { + search_layer_helper<BitVectorVisitedTracker>(input, neighbors_to_find, best_neighbors, level, filter, doc_id_limit, estimated_visited_nodes); + } else { + search_layer_helper<HashSetVisitedTracker>(input, neighbors_to_find, best_neighbors, level, filter, doc_id_limit, estimated_visited_nodes); + } +#else + search_layer_helper<ReusableSetVisitedTracker>(input, neighbors_to_find, best_neighbors, level, filter, doc_id_limit, estimated_visited_nodes); +#endif +} + HnswIndex::HnswIndex(const DocVectorAccess& vectors, DistanceFunction::UP distance_func, RandomLevelGenerator::UP level_generator, const Config& cfg) : _graph(), diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index eb874193ae9..4503459a88a 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -119,11 +119,17 @@ protected: double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const; double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; + uint32_t estimate_visited_nodes(uint32_t level, uint32_t doc_id_limit, uint32_t neighbors_to_find, const search::BitVector* filter) const; /** * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. */ HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) const; + template <class VisitedTracker> + void search_layer_helper(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, + uint32_t level, const search::BitVector *filter, + uint32_t doc_id_limit, + uint32_t estimated_visited_nodes) const; void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level, const search::BitVector *filter = nullptr) const; std::vector<Neighbor> top_k_by_docid(uint32_t k, TypedCells vector, @@ -197,6 +203,7 @@ public: bool check_link_symmetry() const; std::pair<uint32_t, bool> count_reachable_nodes() const; HnswGraph& get_graph() { return _graph; } + vespalib::ReusableSetPool& get_visited_set_pool() const noexcept { return _visited_set_pool; } static vespalib::datastore::ArrayStoreConfig make_default_node_store_config(); static vespalib::datastore::ArrayStoreConfig make_default_link_store_config(); diff --git a/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.cpp b/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.cpp new file mode 100644 index 00000000000..e64cb485aa5 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.cpp @@ -0,0 +1,15 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "reusable_set_visited_tracker.h" +#include "hnsw_index.h" + +namespace search::tensor { + +ReusableSetVisitedTracker::ReusableSetVisitedTracker(const HnswIndex& index, uint32_t doc_id_limit, uint32_t) + : _visited(index.get_visited_set_pool().get(doc_id_limit)) +{ +} + +ReusableSetVisitedTracker::~ReusableSetVisitedTracker() = default; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.h b/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.h new file mode 100644 index 00000000000..819d5dd7a15 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/reusable_set_visited_tracker.h @@ -0,0 +1,31 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/util/reusable_set_handle.h> + +namespace search::tensor { + +class HnswIndex; + +/* + * Tracker for visited nodes based on vespalib::ReusableSet. + */ +class ReusableSetVisitedTracker +{ + vespalib::ReusableSetHandle _visited; +public: + ReusableSetVisitedTracker(const HnswIndex& index, uint32_t doc_id_limit, uint32_t); + ~ReusableSetVisitedTracker(); + void mark(uint32_t doc_id) { _visited.mark(doc_id); } + bool try_mark(uint32_t doc_id) { + if (_visited.is_marked(doc_id)) { + return false; + } else { + _visited.mark(doc_id); + return true; + } + } +}; + +} |