diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-02-11 14:05:17 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-11 14:05:17 +0100 |
commit | ebfcce139ce2d561bd820973f03be923440c91b4 (patch) | |
tree | fb6d48d3a4409998f9f890a6180a61a99d11f32f | |
parent | 13c62d475d45f91541da9c6e6d36dff1a46f00bf (diff) | |
parent | e3e77d310e0ca6363a6b4571ca55f85356db60af (diff) |
Merge pull request #12135 from vespa-engine/geirst/hnsw-simplifications-with-distance-function-interface
Hnsw simplifications with distance function interface
8 files changed, 368 insertions, 373 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 634569cbc8c..081361b2fdc 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -1,6 +1,7 @@ // Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include <vespa/eval/tensor/dense/typed_cells.h> +#include <vespa/searchlib/tensor/distance_functions.h> #include <vespa/searchlib/tensor/doc_vector_access.h> #include <vespa/searchlib/tensor/hnsw_index.h> #include <vespa/searchlib/tensor/random_level_generator.h> @@ -41,14 +42,15 @@ struct LevelGenerator : public RandomLevelGenerator { }; using FloatVectors = MyDocVectorAccess<float>; -using FloatIndex = HnswIndex<float>; -using FloatIndexUP = std::unique_ptr<FloatIndex>; +using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>; +using HnswIndexUP = std::unique_ptr<HnswIndex>; class HnswIndexTest : public ::testing::Test { public: FloatVectors vectors; + FloatSqEuclideanDistance distance_func; LevelGenerator level_generator; - FloatIndexUP index; + HnswIndexUP index; HnswIndexTest() : vectors(), @@ -60,8 +62,8 @@ public: .set(7, {3, 5}); } void init(bool heuristic_select_neighbors) { - index = std::make_unique<FloatIndex>(vectors, level_generator, - HnswIndexBase::Config(2, 1, 10, heuristic_select_neighbors)); + index = std::make_unique<HnswIndex>(vectors, distance_func, level_generator, + HnswIndex::Config(2, 1, 10, heuristic_select_neighbors)); } void add_document(uint32_t docid, uint32_t max_level = 0) { level_generator.level = max_level; diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 7c05304d451..9175168248c 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -6,7 +6,6 @@ vespa_add_library(searchlib_tensor OBJECT dense_tensor_store.cpp generic_tensor_attribute.cpp generic_tensor_store.cpp - hnsw_index_base.cpp hnsw_index.cpp imported_tensor_attribute_vector.cpp imported_tensor_attribute_vector_read_guard.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h new file mode 100644 index 00000000000..8dfb77ddccb --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h @@ -0,0 +1,21 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace vespalib::tensor { struct TypedCells; } + +namespace search::tensor { + +/** + * Interface used to calculate the distance between two n-dimensional vectors. + * + * The vectors must be of same size and same type (float or double). + * The actual implementation must know which type the vectors are. + */ +class DistanceFunction { +public: + virtual ~DistanceFunction() {} + virtual double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const = 0; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h new file mode 100644 index 00000000000..1e8727e92aa --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -0,0 +1,34 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" + +namespace search::tensor { + +// TODO: Make distance functions hardware optimized. + +/** + * Calculates the square of the standard Euclidean distance. + */ +template <typename FloatType> +class SquaredEuclideanDistance : public DistanceFunction { +public: + double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + double result = 0.0; + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + for (size_t i = 0; i < sz; ++i) { + double diff = lhs_vector[i] - rhs_vector[i]; + result += diff * diff; + } + return result; + } +}; + +template class SquaredEuclideanDistance<float>; +template class SquaredEuclideanDistance<double>; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 345f7f551a6..9600f2fd9d4 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -1,37 +1,180 @@ // Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "distance_function.h" #include "hnsw_index.h" +#include "random_level_generator.h" +#include <vespa/eval/tensor/dense/typed_cells.h> +#include <vespa/vespalib/datastore/array_store.hpp> #include <vespa/vespalib/util/rcuvector.hpp> namespace search::tensor { -template <typename FloatType> +namespace { + +// TODO: Move this to MemoryAllocator, with name PAGE_SIZE. +constexpr size_t small_page_size = 4 * 1024; +constexpr size_t min_num_arrays_for_new_buffer = 8 * 1024; +constexpr float alloc_grow_factor = 0.2; +// TODO: Adjust these numbers to what we accept as max in config. +constexpr size_t max_level_array_size = 16; +constexpr size_t max_link_array_size = 64; + +} + +search::datastore::ArrayStoreConfig +HnswIndex::make_default_node_store_config() +{ + return NodeStore::optimizedConfigForHugePage(max_level_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, + small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); +} + +search::datastore::ArrayStoreConfig +HnswIndex::make_default_link_store_config() +{ + return LinkStore::optimizedConfigForHugePage(max_link_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, + small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); +} + +uint32_t +HnswIndex::max_links_for_level(uint32_t level) const +{ + return (level == 0) ? _cfg.max_links_at_level_0() : _cfg.max_links_at_hierarchic_levels(); +} + +uint32_t +HnswIndex::make_node_for_document(uint32_t docid) +{ + uint32_t max_level = _level_generator.max_level(); + // TODO: Add capping on num_levels + uint32_t num_levels = max_level + 1; + // Note: The level array instance lives as long as the document is present in the index. + LevelArray levels(num_levels, AtomicEntryRef()); + auto node_ref = _nodes.add(levels); + _node_refs[docid].store_release(node_ref); + return max_level; +} + +HnswIndex::LevelArrayRef +HnswIndex::get_level_array(uint32_t docid) const +{ + auto node_ref = _node_refs[docid].load_acquire(); + return _nodes.get(node_ref); +} + +HnswIndex::LinkArrayRef +HnswIndex::get_link_array(uint32_t docid, uint32_t level) const +{ + auto levels = get_level_array(docid); + assert(level < levels.size()); + return _links.get(levels[level].load_acquire()); +} + +void +HnswIndex::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links) +{ + auto links_ref = _links.add(links); + auto node_ref = _node_refs[docid].load_acquire(); + auto levels = _nodes.get_writable(node_ref); + levels[level].store_release(links_ref); +} + +bool +HnswIndex::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const +{ + for (uint32_t result_docid : result) { + double dist = calc_distance(candidate.docid, result_docid); + if (dist < candidate.distance) { + return true; + } + } + return false; +} + +HnswIndex::LinkArray +HnswIndex::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const +{ + HnswCandidateVector sorted(neighbors); + std::sort(sorted.begin(), sorted.end(), LesserDistance()); + LinkArray result; + for (size_t i = 0, m = std::min(static_cast<size_t>(max_links), sorted.size()); i < m; ++i) { + result.push_back(sorted[i].docid); + } + return result; +} + +HnswIndex::LinkArray +HnswIndex::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const +{ + LinkArray result; + bool need_filtering = neighbors.size() > max_links; + NearestPriQ nearest; + for (const auto& entry : neighbors) { + nearest.push(entry); + } + while (!nearest.empty()) { + auto candidate = nearest.top(); + nearest.pop(); + if (need_filtering && have_closer_distance(candidate, result)) { + continue; + } + result.push_back(candidate.docid); + if (result.size() == max_links) { + return result; + } + } + return result; +} + +HnswIndex::LinkArray +HnswIndex::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const +{ + if (_cfg.heuristic_select_neighbors()) { + return select_neighbors_heuristic(neighbors, max_links); + } else { + return select_neighbors_simple(neighbors, max_links); + } +} + +void +HnswIndex::connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level) +{ + set_link_array(docid, level, neighbors); + for (uint32_t neighbor_docid : neighbors) { + auto old_links = get_link_array(neighbor_docid, level); + LinkArray new_links(old_links.begin(), old_links.end()); + new_links.push_back(docid); + set_link_array(neighbor_docid, level, new_links); + } +} + +void +HnswIndex::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level) +{ + LinkArray new_links; + auto old_links = get_link_array(remove_from, level); + for (uint32_t id : old_links) { + if (id != remove_id) new_links.push_back(id); + } + set_link_array(remove_from, level, new_links); +} + + double -HnswIndex<FloatType>::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const +HnswIndex::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const { auto lhs = get_vector(lhs_docid); return calc_distance(lhs, rhs_docid); } -template <typename FloatType> double -HnswIndex<FloatType>::calc_distance(const Vector& lhs, uint32_t rhs_docid) const +HnswIndex::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const { - // TODO: Make it possible to specify the distance function from the outside and make it hardware optimized. auto rhs = get_vector(rhs_docid); - double result = 0.0; - size_t sz = lhs.size(); - assert(sz == rhs.size()); - for (size_t i = 0; i < sz; ++i) { - double diff = lhs[i] - rhs[i]; - result += diff * diff; - } - return result; + return _distance_func.calc(lhs, rhs); } -template <typename FloatType> HnswCandidate -HnswIndex<FloatType>::find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level) +HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) { HnswCandidate nearest = entry_point; bool keep_searching = true; @@ -48,9 +191,8 @@ HnswIndex<FloatType>::find_nearest_in_layer(const Vector& input, const HnswCandi return nearest; } -template <typename FloatType> void -HnswIndex<FloatType>::search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) +HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) { NearestPriQ candidates; // TODO: Add proper handling of visited set. @@ -85,18 +227,24 @@ HnswIndex<FloatType>::search_layer(const Vector& input, uint32_t neighbors_to_fi } } -template <typename FloatType> -HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg) - : HnswIndexBase(vectors, level_generator, cfg) +HnswIndex::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg) + : _vectors(vectors), + _distance_func(distance_func), + _level_generator(level_generator), + _cfg(cfg), + _node_refs(), + _nodes(make_default_node_store_config()), + _links(make_default_link_store_config()), + _entry_docid(0), // Note that docid 0 is reserved and never used + _entry_level(-1) { } -template <typename FloatType> -HnswIndex<FloatType>::~HnswIndex() = default; +HnswIndex::~HnswIndex() = default; -template <typename FloatType> void -HnswIndex<FloatType>::add_document(uint32_t docid) +HnswIndex::add_document(uint32_t docid) { auto input = get_vector(docid); _node_refs.ensure_size(docid + 1, AtomicEntryRef()); @@ -136,9 +284,8 @@ HnswIndex<FloatType>::add_document(uint32_t docid) } } -template <typename FloatType> void -HnswIndex<FloatType>::remove_document(uint32_t docid) +HnswIndex::remove_document(uint32_t docid) { bool need_new_entrypoint = (docid == _entry_docid); LinkArray empty; @@ -163,5 +310,23 @@ HnswIndex<FloatType>::remove_document(uint32_t docid) _node_refs[docid].store_release(invalid); } +HnswNode +HnswIndex::get_node(uint32_t docid) const +{ + auto node_ref = _node_refs[docid].load_acquire(); + if (!node_ref.valid()) { + return HnswNode(); + } + auto levels = _nodes.get(node_ref); + HnswNode::LevelArray result; + for (const auto& links_ref : levels) { + auto links = _links.get(links_ref.load_acquire()); + HnswNode::LinkArray result_links(links.begin(), links.end()); + std::sort(result_links.begin(), result_links.end()); + result.push_back(result_links); + } + return HnswNode(result); +} + } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index be6f507dbe1..15ad4ebf07b 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -3,52 +3,148 @@ #pragma once #include "doc_vector_access.h" -#include "hnsw_index_base.h" +#include "hnsw_index_utils.h" +#include "hnsw_node.h" +#include "nearest_neighbor_index.h" +#include <vespa/eval/tensor/dense/typed_cells.h> #include <vespa/searchlib/common/bitvector.h> #include <vespa/vespalib/datastore/array_store.h> +#include <vespa/vespalib/datastore/atomic_entry_ref.h> #include <vespa/vespalib/datastore/entryref.h> #include <vespa/vespalib/util/rcuvector.h> namespace search::tensor { -class DocVectorAccess; +class DistanceFunction; class RandomLevelGenerator; /** - * Concrete implementation of a hierarchical navigable small world graph (HNSW) + * Implementation of a hierarchical navigable small world graph (HNSW) * that is used for approximate K-nearest neighbor search. * - * See HnswIndexBase for more details. + * The implementation supports 1 write thread and multiple search threads without the use of mutexes. + * This is achieved by using data stores that use generation tracking and associated memory management. * - * The FloatType template argument specifies the data type used in the vectors (4 byte float or 8 byte double). + * The implementation is mainly based on the algorithms described in + * "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs" (Yu. A. Malkov, D. A. Yashunin), + * but some adjustments are made to support proper removes. + * + * TODO: Add details on how to handle removes. */ -template <typename FloatType = float> -class HnswIndex : public HnswIndexBase { -private: - using Vector = vespalib::ConstArrayRef<FloatType>; +class HnswIndex : public NearestNeighborIndex { +public: + class Config { + private: + uint32_t _max_links_at_level_0; + uint32_t _max_links_at_hierarchic_levels; + uint32_t _neighbors_to_explore_at_construction; + bool _heuristic_select_neighbors; + + public: + Config(uint32_t max_links_at_level_0_in, + uint32_t max_links_at_hierarchic_levels_in, + uint32_t neighbors_to_explore_at_construction_in, + bool heuristic_select_neighbors_in) + : _max_links_at_level_0(max_links_at_level_0_in), + _max_links_at_hierarchic_levels(max_links_at_hierarchic_levels_in), + _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in), + _heuristic_select_neighbors(heuristic_select_neighbors_in) + {} + uint32_t max_links_at_level_0() const { return _max_links_at_level_0; } + uint32_t max_links_at_hierarchic_levels() const { return _max_links_at_hierarchic_levels; } + uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; } + bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; } + }; + +protected: + using AtomicEntryRef = search::datastore::AtomicEntryRef; + + // This uses 10 bits for buffer id -> 1024 buffers. + // As we have very short arrays we get less fragmentation with fewer and larger buffers. + using EntryRefType = search::datastore::EntryRefT<22>; + + // Provides mapping from document id -> node reference. + // The reference is used to lookup the node data in NodeStore. + using NodeRefVector = vespalib::RcuVector<AtomicEntryRef>; + + // This stores the level arrays for all nodes. + // Each node consists of an array of levels (from level 0 to n) where each entry is a reference to the link array at that level. + using NodeStore = search::datastore::ArrayStore<AtomicEntryRef, EntryRefType>; + using LevelArrayRef = NodeStore::ConstArrayRef; + using LevelArray = vespalib::Array<AtomicEntryRef>; + + // This stores all link arrays. + // A link array consists of the document ids of the nodes a particular node is linked to. + using LinkStore = search::datastore::ArrayStore<uint32_t, EntryRefType>; + using LinkArrayRef = LinkStore::ConstArrayRef; + using LinkArray = vespalib::Array<uint32_t>; + + using TypedCells = vespalib::tensor::TypedCells; + + const DocVectorAccess& _vectors; + const DistanceFunction& _distance_func; + RandomLevelGenerator& _level_generator; + Config _cfg; + NodeRefVector _node_refs; + NodeStore _nodes; + LinkStore _links; + uint32_t _entry_docid; + int _entry_level; + + static search::datastore::ArrayStoreConfig make_default_node_store_config(); + static search::datastore::ArrayStoreConfig make_default_link_store_config(); - inline Vector get_vector(uint32_t docid) const { - return _vectors.get_vector(docid).template typify<FloatType>(); + uint32_t max_links_for_level(uint32_t level) const; + uint32_t make_node_for_document(uint32_t docid); + LevelArrayRef get_level_array(uint32_t docid) const; + LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const; + void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links); + + /** + * Returns true if the distance between the candidate and a node in the current result + * is less than the distance between the candidate and the node we want to add to the graph. + * In this case the candidate should be discarded as we already are connected to the space + * where the candidate is located. + * Used by select_neighbors_heuristic(). + */ + bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const; + LinkArray select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const; + LinkArray select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const; + LinkArray select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const; + void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level); + void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); + + inline TypedCells get_vector(uint32_t docid) const { + return _vectors.get_vector(docid); } - double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override; - double calc_distance(const Vector& lhs, uint32_t rhs_docid) const; + double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const; + double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; + /** * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. */ - HnswCandidate find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level); - void search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); + HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); + void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); public: - HnswIndex(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg); + HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndex() override; void add_document(uint32_t docid) override; void remove_document(uint32_t docid) override; -}; -template class HnswIndex<float>; -template class HnswIndex<double>; + // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) + + uint32_t get_entry_docid() const { return _entry_docid; } + uint32_t get_entry_level() const { return _entry_level; } + + // Should only be used by unit tests. + HnswNode get_node(uint32_t docid) const; + + // TODO: Implement set_node() as well for use in unit tests. +}; } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp deleted file mode 100644 index a7b9c1dba79..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "hnsw_index_base.h" -#include "random_level_generator.h" -#include <vespa/vespalib/datastore/array_store.hpp> -#include <vespa/vespalib/util/rcuvector.hpp> - -namespace search::tensor { - -namespace { - -// TODO: Move this to MemoryAllocator, with name PAGE_SIZE. -constexpr size_t small_page_size = 4 * 1024; -constexpr size_t min_num_arrays_for_new_buffer = 8 * 1024; -constexpr float alloc_grow_factor = 0.2; -// TODO: Adjust these numbers to what we accept as max in config. -constexpr size_t max_level_array_size = 16; -constexpr size_t max_link_array_size = 64; - -} - -search::datastore::ArrayStoreConfig -HnswIndexBase::make_default_node_store_config() -{ - return NodeStore::optimizedConfigForHugePage(max_level_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, - small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); -} - -search::datastore::ArrayStoreConfig -HnswIndexBase::make_default_link_store_config() -{ - return LinkStore::optimizedConfigForHugePage(max_link_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, - small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); -} - -uint32_t -HnswIndexBase::max_links_for_level(uint32_t level) const -{ - return (level == 0) ? _cfg.max_links_at_level_0() : _cfg.max_links_at_hierarchic_levels(); -} - -uint32_t -HnswIndexBase::make_node_for_document(uint32_t docid) -{ - uint32_t max_level = _level_generator.max_level(); - // TODO: Add capping on num_levels - uint32_t num_levels = max_level + 1; - // Note: The level array instance lives as long as the document is present in the index. - LevelArray levels(num_levels, AtomicEntryRef()); - auto node_ref = _nodes.add(levels); - _node_refs[docid].store_release(node_ref); - return max_level; -} - -HnswIndexBase::LevelArrayRef -HnswIndexBase::get_level_array(uint32_t docid) const -{ - auto node_ref = _node_refs[docid].load_acquire(); - return _nodes.get(node_ref); -} - -HnswIndexBase::LinkArrayRef -HnswIndexBase::get_link_array(uint32_t docid, uint32_t level) const -{ - auto levels = get_level_array(docid); - assert(level < levels.size()); - return _links.get(levels[level].load_acquire()); -} - -void -HnswIndexBase::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links) -{ - auto links_ref = _links.add(links); - auto node_ref = _node_refs[docid].load_acquire(); - auto levels = _nodes.get_writable(node_ref); - levels[level].store_release(links_ref); -} - -bool -HnswIndexBase::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const -{ - for (uint32_t result_docid : result) { - double dist = calc_distance(candidate.docid, result_docid); - if (dist < candidate.distance) { - return true; - } - } - return false; -} - -HnswIndexBase::LinkArray -HnswIndexBase::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const -{ - HnswCandidateVector sorted(neighbors); - std::sort(sorted.begin(), sorted.end(), LesserDistance()); - LinkArray result; - for (size_t i = 0, m = std::min(static_cast<size_t>(max_links), sorted.size()); i < m; ++i) { - result.push_back(sorted[i].docid); - } - return result; -} - -HnswIndexBase::LinkArray -HnswIndexBase::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const -{ - LinkArray result; - bool need_filtering = neighbors.size() > max_links; - NearestPriQ nearest; - for (const auto& entry : neighbors) { - nearest.push(entry); - } - while (!nearest.empty()) { - auto candidate = nearest.top(); - nearest.pop(); - if (need_filtering && have_closer_distance(candidate, result)) { - continue; - } - result.push_back(candidate.docid); - if (result.size() == max_links) { - return result; - } - } - return result; -} - -HnswIndexBase::LinkArray -HnswIndexBase::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const -{ - if (_cfg.heuristic_select_neighbors()) { - return select_neighbors_heuristic(neighbors, max_links); - } else { - return select_neighbors_simple(neighbors, max_links); - } -} - -void -HnswIndexBase::connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level) -{ - set_link_array(docid, level, neighbors); - for (uint32_t neighbor_docid : neighbors) { - auto old_links = get_link_array(neighbor_docid, level); - LinkArray new_links(old_links.begin(), old_links.end()); - new_links.push_back(docid); - set_link_array(neighbor_docid, level, new_links); - } -} - -void -HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level) -{ - LinkArray new_links; - auto old_links = get_link_array(remove_from, level); - for (uint32_t id : old_links) { - if (id != remove_id) new_links.push_back(id); - } - set_link_array(remove_from, level, new_links); -} - -HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg) - : _vectors(vectors), - _level_generator(level_generator), - _cfg(cfg), - _node_refs(), - _nodes(make_default_node_store_config()), - _links(make_default_link_store_config()), - _entry_docid(0), // Note that docid 0 is reserved and never used - _entry_level(-1) -{ -} - -HnswIndexBase::~HnswIndexBase() = default; - -HnswNode -HnswIndexBase::get_node(uint32_t docid) const -{ - auto node_ref = _node_refs[docid].load_acquire(); - if (!node_ref.valid()) { - return HnswNode(); - } - auto levels = _nodes.get(node_ref); - HnswNode::LevelArray result; - for (const auto& links_ref : levels) { - auto links = _links.get(links_ref.load_acquire()); - HnswNode::LinkArray result_links(links.begin(), links.end()); - std::sort(result_links.begin(), result_links.end()); - result.push_back(result_links); - } - return HnswNode(result); -} - -} - diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h deleted file mode 100644 index 9987b61c157..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "hnsw_index_utils.h" -#include "hnsw_node.h" -#include "nearest_neighbor_index.h" -#include <vespa/searchlib/common/bitvector.h> -#include <vespa/vespalib/datastore/array_store.h> -#include <vespa/vespalib/datastore/atomic_entry_ref.h> -#include <vespa/vespalib/datastore/entryref.h> -#include <vespa/vespalib/util/rcuvector.h> - -namespace search::tensor { - -class DocVectorAccess; -class RandomLevelGenerator; - -/** - * Base class for an implementation of a hierarchical navigable small world graph (HNSW) - * that is used for approximate K-nearest neighbor search. - * - * The implementation supports 1 write thread and multiple search threads without the use of mutexes. - * This is achieved by using data stores that use generation tracking and associated memory management. - * - * The implementation is mainly based on the algorithms described in - * "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs" (Yu. A. Malkov, D. A. Yashunin), - * but some adjustments are made to support proper removes. - * - * TODO: Add details on how to handle removes. - */ -class HnswIndexBase : public NearestNeighborIndex { -public: - class Config { - private: - uint32_t _max_links_at_level_0; - uint32_t _max_links_at_hierarchic_levels; - uint32_t _neighbors_to_explore_at_construction; - bool _heuristic_select_neighbors; - - public: - Config(uint32_t max_links_at_level_0_in, - uint32_t max_links_at_hierarchic_levels_in, - uint32_t neighbors_to_explore_at_construction_in, - bool heuristic_select_neighbors_in) - : _max_links_at_level_0(max_links_at_level_0_in), - _max_links_at_hierarchic_levels(max_links_at_hierarchic_levels_in), - _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in), - _heuristic_select_neighbors(heuristic_select_neighbors_in) - {} - uint32_t max_links_at_level_0() const { return _max_links_at_level_0; } - uint32_t max_links_at_hierarchic_levels() const { return _max_links_at_hierarchic_levels; } - uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; } - bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; } - }; - -protected: - using AtomicEntryRef = search::datastore::AtomicEntryRef; - - // This uses 10 bits for buffer id -> 1024 buffers. - // As we have very short arrays we get less fragmentation with fewer and larger buffers. - using EntryRefType = search::datastore::EntryRefT<22>; - - // Provides mapping from document id -> node reference. - // The reference is used to lookup the node data in NodeStore. - using NodeRefVector = vespalib::RcuVector<AtomicEntryRef>; - - // This stores the level arrays for all nodes. - // Each node consists of an array of levels (from level 0 to n) where each entry is a reference to the link array at that level. - using NodeStore = search::datastore::ArrayStore<AtomicEntryRef, EntryRefType>; - using LevelArrayRef = NodeStore::ConstArrayRef; - using LevelArray = vespalib::Array<AtomicEntryRef>; - - // This stores all link arrays. - // A link array consists of the document ids of the nodes a particular node is linked to. - using LinkStore = search::datastore::ArrayStore<uint32_t, EntryRefType>; - using LinkArrayRef = LinkStore::ConstArrayRef; - using LinkArray = vespalib::Array<uint32_t>; - - const DocVectorAccess& _vectors; - RandomLevelGenerator& _level_generator; - Config _cfg; - NodeRefVector _node_refs; - NodeStore _nodes; - LinkStore _links; - uint32_t _entry_docid; - int _entry_level; - - static search::datastore::ArrayStoreConfig make_default_node_store_config(); - static search::datastore::ArrayStoreConfig make_default_link_store_config(); - - uint32_t max_links_for_level(uint32_t level) const; - uint32_t make_node_for_document(uint32_t docid); - LevelArrayRef get_level_array(uint32_t docid) const; - LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const; - void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links); - - virtual double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const = 0; - - /** - * Returns true if the distance between the candidate and a node in the current result - * is less than the distance between the candidate and the node we want to add to the graph. - * In this case the candidate should be discarded as we already are connected to the space - * where the candidate is located. - * Used by select_neighbors_heuristic(). - */ - bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const; - LinkArray select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const; - LinkArray select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const; - LinkArray select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const; - void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level); - void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); - -public: - HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg); - ~HnswIndexBase() override; - - // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) - - uint32_t get_entry_docid() const { return _entry_docid; } - uint32_t get_entry_level() const { return _entry_level; } - - // Should only be used by unit tests. - HnswNode get_node(uint32_t docid) const; - - // TODO: Implement set_node() as well for use in unit tests. -}; - -} - |