From 8164e4a5c4910a878e49663730de5c2b310d5940 Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Mon, 10 Feb 2020 11:59:23 +0000 Subject: Add interface used to calculate the distance between two n-dim vectors. --- .../tests/tensor/hnsw_index/hnsw_index_test.cpp | 5 +++- .../src/vespa/searchlib/tensor/distance_function.h | 21 +++++++++++++ .../vespa/searchlib/tensor/distance_functions.h | 34 ++++++++++++++++++++++ .../src/vespa/searchlib/tensor/hnsw_index.cpp | 22 +++++--------- searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 16 +++++----- .../src/vespa/searchlib/tensor/hnsw_index_base.cpp | 4 ++- .../src/vespa/searchlib/tensor/hnsw_index_base.h | 5 +++- 7 files changed, 83 insertions(+), 24 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/tensor/distance_function.h create mode 100644 searchlib/src/vespa/searchlib/tensor/distance_functions.h diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 634569cbc8c..0c5ddb93e92 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -1,6 +1,7 @@ // Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include +#include #include #include #include @@ -41,12 +42,14 @@ struct LevelGenerator : public RandomLevelGenerator { }; using FloatVectors = MyDocVectorAccess; +using FloatSqEuclideanDistance = SquaredEuclideanDistance; using FloatIndex = HnswIndex; using FloatIndexUP = std::unique_ptr; class HnswIndexTest : public ::testing::Test { public: FloatVectors vectors; + FloatSqEuclideanDistance distance_func; LevelGenerator level_generator; FloatIndexUP index; @@ -60,7 +63,7 @@ public: .set(7, {3, 5}); } void init(bool heuristic_select_neighbors) { - index = std::make_unique(vectors, level_generator, + index = std::make_unique(vectors, distance_func, level_generator, HnswIndexBase::Config(2, 1, 10, heuristic_select_neighbors)); } void add_document(uint32_t docid, uint32_t max_level = 0) { diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h new file mode 100644 index 00000000000..8dfb77ddccb --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h @@ -0,0 +1,21 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace vespalib::tensor { struct TypedCells; } + +namespace search::tensor { + +/** + * Interface used to calculate the distance between two n-dimensional vectors. + * + * The vectors must be of same size and same type (float or double). + * The actual implementation must know which type the vectors are. + */ +class DistanceFunction { +public: + virtual ~DistanceFunction() {} + virtual double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const = 0; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h new file mode 100644 index 00000000000..dabab55dc7a --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -0,0 +1,34 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" + +namespace search::tensor { + +// TODO: Make distance functions hardware optimized. + +/** + * Calculates the square of the standard Euclidean distance. + */ +template +class SquaredEuclideanDistance : public DistanceFunction { +public: + double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override { + auto lhs_vector = lhs.template typify(); + auto rhs_vector = rhs.template typify(); + double result = 0.0; + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + for (size_t i = 0; i < sz; ++i) { + double diff = lhs_vector[i] - rhs_vector[i]; + result += diff * diff; + } + return result; + } +}; + +template class SquaredEuclideanDistance; +template class SquaredEuclideanDistance; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 345f7f551a6..cb617246066 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -1,5 +1,6 @@ // Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "distance_function.h" #include "hnsw_index.h" #include @@ -15,23 +16,15 @@ HnswIndex::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) cons template double -HnswIndex::calc_distance(const Vector& lhs, uint32_t rhs_docid) const +HnswIndex::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const { - // TODO: Make it possible to specify the distance function from the outside and make it hardware optimized. auto rhs = get_vector(rhs_docid); - double result = 0.0; - size_t sz = lhs.size(); - assert(sz == rhs.size()); - for (size_t i = 0; i < sz; ++i) { - double diff = lhs[i] - rhs[i]; - result += diff * diff; - } - return result; + return _distance_func.calc(lhs, rhs); } template HnswCandidate -HnswIndex::find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level) +HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) { HnswCandidate nearest = entry_point; bool keep_searching = true; @@ -50,7 +43,7 @@ HnswIndex::find_nearest_in_layer(const Vector& input, const HnswCandi template void -HnswIndex::search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) +HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) { NearestPriQ candidates; // TODO: Add proper handling of visited set. @@ -86,8 +79,9 @@ HnswIndex::search_layer(const Vector& input, uint32_t neighbors_to_fi } template -HnswIndex::HnswIndex(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg) - : HnswIndexBase(vectors, level_generator, cfg) +HnswIndex::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg) + : HnswIndexBase(vectors, distance_func, level_generator, cfg) { } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index be6f507dbe1..a93062cf200 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -11,6 +11,7 @@ namespace search::tensor { +class DistanceFunction; class DocVectorAccess; class RandomLevelGenerator; @@ -25,22 +26,23 @@ class RandomLevelGenerator; template class HnswIndex : public HnswIndexBase { private: - using Vector = vespalib::ConstArrayRef; + using TypedCells = vespalib::tensor::TypedCells; - inline Vector get_vector(uint32_t docid) const { - return _vectors.get_vector(docid).template typify(); + inline TypedCells get_vector(uint32_t docid) const { + return _vectors.get_vector(docid); } double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override; - double calc_distance(const Vector& lhs, uint32_t rhs_docid) const; + double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; /** * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. */ - HnswCandidate find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level); - void search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); + HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); + void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); public: - HnswIndex(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg); + HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndex() override; void add_document(uint32_t docid) override; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp index a7b9c1dba79..e0ecdf87399 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp @@ -156,8 +156,10 @@ HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t set_link_array(remove_from, level, new_links); } -HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg) +HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg) : _vectors(vectors), + _distance_func(distance_func), _level_generator(level_generator), _cfg(cfg), _node_refs(), diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h index 9987b61c157..ab59ad02f14 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h @@ -13,6 +13,7 @@ namespace search::tensor { +class DistanceFunction; class DocVectorAccess; class RandomLevelGenerator; @@ -78,6 +79,7 @@ protected: using LinkArray = vespalib::Array; const DocVectorAccess& _vectors; + const DistanceFunction& _distance_func; RandomLevelGenerator& _level_generator; Config _cfg; NodeRefVector _node_refs; @@ -112,7 +114,8 @@ protected: void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); public: - HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg); + HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndexBase() override; // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) -- cgit v1.2.3 From 1debbb54c26ce0c146a1d0711beea55adef3173b Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Mon, 10 Feb 2020 12:19:40 +0000 Subject: Move code to the hnsw index base class as we no longer depend on the float type. --- .../src/vespa/searchlib/tensor/hnsw_index.cpp | 142 --------------------- searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 18 --- .../src/vespa/searchlib/tensor/hnsw_index_base.cpp | 138 ++++++++++++++++++++ .../src/vespa/searchlib/tensor/hnsw_index_base.h | 23 +++- 4 files changed, 158 insertions(+), 163 deletions(-) diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index cb617246066..9ad578ae983 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -2,82 +2,9 @@ #include "distance_function.h" #include "hnsw_index.h" -#include namespace search::tensor { -template -double -HnswIndex::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const -{ - auto lhs = get_vector(lhs_docid); - return calc_distance(lhs, rhs_docid); -} - -template -double -HnswIndex::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const -{ - auto rhs = get_vector(rhs_docid); - return _distance_func.calc(lhs, rhs); -} - -template -HnswCandidate -HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) -{ - HnswCandidate nearest = entry_point; - bool keep_searching = true; - while (keep_searching) { - keep_searching = false; - for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) { - double dist = calc_distance(input, neighbor_docid); - if (dist < nearest.distance) { - nearest = HnswCandidate(neighbor_docid, dist); - keep_searching = true; - } - } - } - return nearest; -} - -template -void -HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) -{ - NearestPriQ candidates; - // TODO: Add proper handling of visited set. - auto visited = BitVector::create(_node_refs.size()); - for (const auto &entry : best_neighbors.peek()) { - candidates.push(entry); - visited->setBit(entry.docid); - } - double limit_dist = std::numeric_limits::max(); - - while (!candidates.empty()) { - auto cand = candidates.top(); - if (cand.distance > limit_dist) { - break; - } - candidates.pop(); - for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) { - if (visited->testBit(neighbor_docid)) { - continue; - } - visited->setBit(neighbor_docid); - double dist_to_input = calc_distance(input, neighbor_docid); - if (dist_to_input < limit_dist) { - candidates.emplace(neighbor_docid, dist_to_input); - best_neighbors.emplace(neighbor_docid, dist_to_input); - if (best_neighbors.size() > neighbors_to_find) { - best_neighbors.pop(); - limit_dist = best_neighbors.top().distance; - } - } - } - } -} - template HnswIndex::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg) @@ -88,74 +15,5 @@ HnswIndex::HnswIndex(const DocVectorAccess& vectors, const DistanceFu template HnswIndex::~HnswIndex() = default; -template -void -HnswIndex::add_document(uint32_t docid) -{ - auto input = get_vector(docid); - _node_refs.ensure_size(docid + 1, AtomicEntryRef()); - // A document cannot be added twice. - assert(!_node_refs[docid].load_acquire().valid()); - int level = make_node_for_document(docid); - if (_entry_docid == 0) { - _entry_docid = docid; - _entry_level = level; - return; - } - - int search_level = _entry_level; - double entry_dist = calc_distance(input, _entry_docid); - HnswCandidate entry_point(_entry_docid, entry_dist); - while (search_level > level) { - entry_point = find_nearest_in_layer(input, entry_point, search_level); - --search_level; - } - - FurthestPriQ best_neighbors; - best_neighbors.push(entry_point); - search_level = std::min(level, _entry_level); - - // Insert the added document in each level it should exist in. - while (search_level >= 0) { - // TODO: Rename to search_level? - search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level); - auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level)); - connect_new_node(docid, neighbors, search_level); - // TODO: Shrink neighbors if needed - --search_level; - } - if (level > _entry_level) { - _entry_docid = docid; - _entry_level = level; - } -} - -template -void -HnswIndex::remove_document(uint32_t docid) -{ - bool need_new_entrypoint = (docid == _entry_docid); - LinkArray empty; - LevelArrayRef node_levels = get_level_array(docid); - for (int level = node_levels.size(); level-- > 0; ) { - LinkArrayRef my_links = get_link_array(docid, level); - for (uint32_t neighbor_id : my_links) { - if (need_new_entrypoint) { - _entry_docid = neighbor_id; - _entry_level = level; - need_new_entrypoint = false; - } - remove_link_to(neighbor_id, docid, level); - } - set_link_array(docid, level, empty); - } - if (need_new_entrypoint) { - _entry_docid = 0; - _entry_level = -1; - } - search::datastore::EntryRef invalid; - _node_refs[docid].store_release(invalid); -} - } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index a93062cf200..2b0761d17ea 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -25,28 +25,10 @@ class RandomLevelGenerator; */ template class HnswIndex : public HnswIndexBase { -private: - using TypedCells = vespalib::tensor::TypedCells; - - inline TypedCells get_vector(uint32_t docid) const { - return _vectors.get_vector(docid); - } - - double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override; - double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; - /** - * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. - */ - HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); - void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); - public: HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndex() override; - - void add_document(uint32_t docid) override; - void remove_document(uint32_t docid) override; }; template class HnswIndex; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp index e0ecdf87399..a58dba70746 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp @@ -1,7 +1,9 @@ // Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "distance_function.h" #include "hnsw_index_base.h" #include "random_level_generator.h" +#include #include #include @@ -156,6 +158,75 @@ HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t set_link_array(remove_from, level, new_links); } + +double +HnswIndexBase::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const +{ + auto lhs = get_vector(lhs_docid); + return calc_distance(lhs, rhs_docid); +} + +double +HnswIndexBase::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const +{ + auto rhs = get_vector(rhs_docid); + return _distance_func.calc(lhs, rhs); +} + +HnswCandidate +HnswIndexBase::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) +{ + HnswCandidate nearest = entry_point; + bool keep_searching = true; + while (keep_searching) { + keep_searching = false; + for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) { + double dist = calc_distance(input, neighbor_docid); + if (dist < nearest.distance) { + nearest = HnswCandidate(neighbor_docid, dist); + keep_searching = true; + } + } + } + return nearest; +} + +void +HnswIndexBase::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) +{ + NearestPriQ candidates; + // TODO: Add proper handling of visited set. + auto visited = BitVector::create(_node_refs.size()); + for (const auto &entry : best_neighbors.peek()) { + candidates.push(entry); + visited->setBit(entry.docid); + } + double limit_dist = std::numeric_limits::max(); + + while (!candidates.empty()) { + auto cand = candidates.top(); + if (cand.distance > limit_dist) { + break; + } + candidates.pop(); + for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) { + if (visited->testBit(neighbor_docid)) { + continue; + } + visited->setBit(neighbor_docid); + double dist_to_input = calc_distance(input, neighbor_docid); + if (dist_to_input < limit_dist) { + candidates.emplace(neighbor_docid, dist_to_input); + best_neighbors.emplace(neighbor_docid, dist_to_input); + if (best_neighbors.size() > neighbors_to_find) { + best_neighbors.pop(); + limit_dist = best_neighbors.top().distance; + } + } + } + } +} + HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg) : _vectors(vectors), @@ -172,6 +243,73 @@ HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunct HnswIndexBase::~HnswIndexBase() = default; +void +HnswIndexBase::add_document(uint32_t docid) +{ + auto input = get_vector(docid); + _node_refs.ensure_size(docid + 1, AtomicEntryRef()); + // A document cannot be added twice. + assert(!_node_refs[docid].load_acquire().valid()); + int level = make_node_for_document(docid); + if (_entry_docid == 0) { + _entry_docid = docid; + _entry_level = level; + return; + } + + int search_level = _entry_level; + double entry_dist = calc_distance(input, _entry_docid); + HnswCandidate entry_point(_entry_docid, entry_dist); + while (search_level > level) { + entry_point = find_nearest_in_layer(input, entry_point, search_level); + --search_level; + } + + FurthestPriQ best_neighbors; + best_neighbors.push(entry_point); + search_level = std::min(level, _entry_level); + + // Insert the added document in each level it should exist in. + while (search_level >= 0) { + // TODO: Rename to search_level? + search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level); + auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level)); + connect_new_node(docid, neighbors, search_level); + // TODO: Shrink neighbors if needed + --search_level; + } + if (level > _entry_level) { + _entry_docid = docid; + _entry_level = level; + } +} + +void +HnswIndexBase::remove_document(uint32_t docid) +{ + bool need_new_entrypoint = (docid == _entry_docid); + LinkArray empty; + LevelArrayRef node_levels = get_level_array(docid); + for (int level = node_levels.size(); level-- > 0; ) { + LinkArrayRef my_links = get_link_array(docid, level); + for (uint32_t neighbor_id : my_links) { + if (need_new_entrypoint) { + _entry_docid = neighbor_id; + _entry_level = level; + need_new_entrypoint = false; + } + remove_link_to(neighbor_id, docid, level); + } + set_link_array(docid, level, empty); + } + if (need_new_entrypoint) { + _entry_docid = 0; + _entry_level = -1; + } + search::datastore::EntryRef invalid; + _node_refs[docid].store_release(invalid); +} + HnswNode HnswIndexBase::get_node(uint32_t docid) const { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h index ab59ad02f14..992f3587651 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h @@ -2,9 +2,11 @@ #pragma once +#include "doc_vector_access.h" #include "hnsw_index_utils.h" #include "hnsw_node.h" #include "nearest_neighbor_index.h" +#include #include #include #include @@ -14,7 +16,6 @@ namespace search::tensor { class DistanceFunction; -class DocVectorAccess; class RandomLevelGenerator; /** @@ -78,6 +79,8 @@ protected: using LinkArrayRef = LinkStore::ConstArrayRef; using LinkArray = vespalib::Array; + using TypedCells = vespalib::tensor::TypedCells; + const DocVectorAccess& _vectors; const DistanceFunction& _distance_func; RandomLevelGenerator& _level_generator; @@ -97,8 +100,6 @@ protected: LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const; void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links); - virtual double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const = 0; - /** * Returns true if the distance between the candidate and a node in the current result * is less than the distance between the candidate and the node we want to add to the graph. @@ -113,11 +114,27 @@ protected: void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level); void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); + inline TypedCells get_vector(uint32_t docid) const { + return _vectors.get_vector(docid); + } + + double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const; + double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; + + /** + * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. + */ + HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); + void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); + public: HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndexBase() override; + void add_document(uint32_t docid) override; + void remove_document(uint32_t docid) override; + // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) uint32_t get_entry_docid() const { return _entry_docid; } -- cgit v1.2.3 From b9e867c4a20e9aa340dbdb110066083ebcc2a619 Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Mon, 10 Feb 2020 12:30:47 +0000 Subject: Rename HnswIndexBase -> HnswIndex and remove the templated class. --- .../tests/tensor/hnsw_index/hnsw_index_test.cpp | 9 +- .../src/vespa/searchlib/tensor/CMakeLists.txt | 1 - .../src/vespa/searchlib/tensor/hnsw_index.cpp | 325 +++++++++++++++++++- searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 132 +++++++- .../src/vespa/searchlib/tensor/hnsw_index_base.cpp | 332 --------------------- .../src/vespa/searchlib/tensor/hnsw_index_base.h | 150 ---------- 6 files changed, 445 insertions(+), 504 deletions(-) delete mode 100644 searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp delete mode 100644 searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 0c5ddb93e92..081361b2fdc 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -43,15 +43,14 @@ struct LevelGenerator : public RandomLevelGenerator { using FloatVectors = MyDocVectorAccess; using FloatSqEuclideanDistance = SquaredEuclideanDistance; -using FloatIndex = HnswIndex; -using FloatIndexUP = std::unique_ptr; +using HnswIndexUP = std::unique_ptr; class HnswIndexTest : public ::testing::Test { public: FloatVectors vectors; FloatSqEuclideanDistance distance_func; LevelGenerator level_generator; - FloatIndexUP index; + HnswIndexUP index; HnswIndexTest() : vectors(), @@ -63,8 +62,8 @@ public: .set(7, {3, 5}); } void init(bool heuristic_select_neighbors) { - index = std::make_unique(vectors, distance_func, level_generator, - HnswIndexBase::Config(2, 1, 10, heuristic_select_neighbors)); + index = std::make_unique(vectors, distance_func, level_generator, + HnswIndex::Config(2, 1, 10, heuristic_select_neighbors)); } void add_document(uint32_t docid, uint32_t max_level = 0) { level_generator.level = max_level; diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 7c05304d451..9175168248c 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -6,7 +6,6 @@ vespa_add_library(searchlib_tensor OBJECT dense_tensor_store.cpp generic_tensor_attribute.cpp generic_tensor_store.cpp - hnsw_index_base.cpp hnsw_index.cpp imported_tensor_attribute_vector.cpp imported_tensor_attribute_vector_read_guard.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 9ad578ae983..9600f2fd9d4 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -2,18 +2,331 @@ #include "distance_function.h" #include "hnsw_index.h" +#include "random_level_generator.h" +#include +#include +#include namespace search::tensor { -template -HnswIndex::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, - RandomLevelGenerator& level_generator, const Config& cfg) - : HnswIndexBase(vectors, distance_func, level_generator, cfg) +namespace { + +// TODO: Move this to MemoryAllocator, with name PAGE_SIZE. +constexpr size_t small_page_size = 4 * 1024; +constexpr size_t min_num_arrays_for_new_buffer = 8 * 1024; +constexpr float alloc_grow_factor = 0.2; +// TODO: Adjust these numbers to what we accept as max in config. +constexpr size_t max_level_array_size = 16; +constexpr size_t max_link_array_size = 64; + +} + +search::datastore::ArrayStoreConfig +HnswIndex::make_default_node_store_config() +{ + return NodeStore::optimizedConfigForHugePage(max_level_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, + small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); +} + +search::datastore::ArrayStoreConfig +HnswIndex::make_default_link_store_config() +{ + return LinkStore::optimizedConfigForHugePage(max_link_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, + small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); +} + +uint32_t +HnswIndex::max_links_for_level(uint32_t level) const { + return (level == 0) ? _cfg.max_links_at_level_0() : _cfg.max_links_at_hierarchic_levels(); } -template -HnswIndex::~HnswIndex() = default; +uint32_t +HnswIndex::make_node_for_document(uint32_t docid) +{ + uint32_t max_level = _level_generator.max_level(); + // TODO: Add capping on num_levels + uint32_t num_levels = max_level + 1; + // Note: The level array instance lives as long as the document is present in the index. + LevelArray levels(num_levels, AtomicEntryRef()); + auto node_ref = _nodes.add(levels); + _node_refs[docid].store_release(node_ref); + return max_level; +} + +HnswIndex::LevelArrayRef +HnswIndex::get_level_array(uint32_t docid) const +{ + auto node_ref = _node_refs[docid].load_acquire(); + return _nodes.get(node_ref); +} + +HnswIndex::LinkArrayRef +HnswIndex::get_link_array(uint32_t docid, uint32_t level) const +{ + auto levels = get_level_array(docid); + assert(level < levels.size()); + return _links.get(levels[level].load_acquire()); +} + +void +HnswIndex::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links) +{ + auto links_ref = _links.add(links); + auto node_ref = _node_refs[docid].load_acquire(); + auto levels = _nodes.get_writable(node_ref); + levels[level].store_release(links_ref); +} + +bool +HnswIndex::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const +{ + for (uint32_t result_docid : result) { + double dist = calc_distance(candidate.docid, result_docid); + if (dist < candidate.distance) { + return true; + } + } + return false; +} + +HnswIndex::LinkArray +HnswIndex::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const +{ + HnswCandidateVector sorted(neighbors); + std::sort(sorted.begin(), sorted.end(), LesserDistance()); + LinkArray result; + for (size_t i = 0, m = std::min(static_cast(max_links), sorted.size()); i < m; ++i) { + result.push_back(sorted[i].docid); + } + return result; +} + +HnswIndex::LinkArray +HnswIndex::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const +{ + LinkArray result; + bool need_filtering = neighbors.size() > max_links; + NearestPriQ nearest; + for (const auto& entry : neighbors) { + nearest.push(entry); + } + while (!nearest.empty()) { + auto candidate = nearest.top(); + nearest.pop(); + if (need_filtering && have_closer_distance(candidate, result)) { + continue; + } + result.push_back(candidate.docid); + if (result.size() == max_links) { + return result; + } + } + return result; +} + +HnswIndex::LinkArray +HnswIndex::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const +{ + if (_cfg.heuristic_select_neighbors()) { + return select_neighbors_heuristic(neighbors, max_links); + } else { + return select_neighbors_simple(neighbors, max_links); + } +} + +void +HnswIndex::connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level) +{ + set_link_array(docid, level, neighbors); + for (uint32_t neighbor_docid : neighbors) { + auto old_links = get_link_array(neighbor_docid, level); + LinkArray new_links(old_links.begin(), old_links.end()); + new_links.push_back(docid); + set_link_array(neighbor_docid, level, new_links); + } +} + +void +HnswIndex::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level) +{ + LinkArray new_links; + auto old_links = get_link_array(remove_from, level); + for (uint32_t id : old_links) { + if (id != remove_id) new_links.push_back(id); + } + set_link_array(remove_from, level, new_links); +} + + +double +HnswIndex::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const +{ + auto lhs = get_vector(lhs_docid); + return calc_distance(lhs, rhs_docid); +} + +double +HnswIndex::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const +{ + auto rhs = get_vector(rhs_docid); + return _distance_func.calc(lhs, rhs); +} + +HnswCandidate +HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) +{ + HnswCandidate nearest = entry_point; + bool keep_searching = true; + while (keep_searching) { + keep_searching = false; + for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) { + double dist = calc_distance(input, neighbor_docid); + if (dist < nearest.distance) { + nearest = HnswCandidate(neighbor_docid, dist); + keep_searching = true; + } + } + } + return nearest; +} + +void +HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) +{ + NearestPriQ candidates; + // TODO: Add proper handling of visited set. + auto visited = BitVector::create(_node_refs.size()); + for (const auto &entry : best_neighbors.peek()) { + candidates.push(entry); + visited->setBit(entry.docid); + } + double limit_dist = std::numeric_limits::max(); + + while (!candidates.empty()) { + auto cand = candidates.top(); + if (cand.distance > limit_dist) { + break; + } + candidates.pop(); + for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) { + if (visited->testBit(neighbor_docid)) { + continue; + } + visited->setBit(neighbor_docid); + double dist_to_input = calc_distance(input, neighbor_docid); + if (dist_to_input < limit_dist) { + candidates.emplace(neighbor_docid, dist_to_input); + best_neighbors.emplace(neighbor_docid, dist_to_input); + if (best_neighbors.size() > neighbors_to_find) { + best_neighbors.pop(); + limit_dist = best_neighbors.top().distance; + } + } + } + } +} + +HnswIndex::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg) + : _vectors(vectors), + _distance_func(distance_func), + _level_generator(level_generator), + _cfg(cfg), + _node_refs(), + _nodes(make_default_node_store_config()), + _links(make_default_link_store_config()), + _entry_docid(0), // Note that docid 0 is reserved and never used + _entry_level(-1) +{ +} + +HnswIndex::~HnswIndex() = default; + +void +HnswIndex::add_document(uint32_t docid) +{ + auto input = get_vector(docid); + _node_refs.ensure_size(docid + 1, AtomicEntryRef()); + // A document cannot be added twice. + assert(!_node_refs[docid].load_acquire().valid()); + int level = make_node_for_document(docid); + if (_entry_docid == 0) { + _entry_docid = docid; + _entry_level = level; + return; + } + + int search_level = _entry_level; + double entry_dist = calc_distance(input, _entry_docid); + HnswCandidate entry_point(_entry_docid, entry_dist); + while (search_level > level) { + entry_point = find_nearest_in_layer(input, entry_point, search_level); + --search_level; + } + + FurthestPriQ best_neighbors; + best_neighbors.push(entry_point); + search_level = std::min(level, _entry_level); + + // Insert the added document in each level it should exist in. + while (search_level >= 0) { + // TODO: Rename to search_level? + search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level); + auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level)); + connect_new_node(docid, neighbors, search_level); + // TODO: Shrink neighbors if needed + --search_level; + } + if (level > _entry_level) { + _entry_docid = docid; + _entry_level = level; + } +} + +void +HnswIndex::remove_document(uint32_t docid) +{ + bool need_new_entrypoint = (docid == _entry_docid); + LinkArray empty; + LevelArrayRef node_levels = get_level_array(docid); + for (int level = node_levels.size(); level-- > 0; ) { + LinkArrayRef my_links = get_link_array(docid, level); + for (uint32_t neighbor_id : my_links) { + if (need_new_entrypoint) { + _entry_docid = neighbor_id; + _entry_level = level; + need_new_entrypoint = false; + } + remove_link_to(neighbor_id, docid, level); + } + set_link_array(docid, level, empty); + } + if (need_new_entrypoint) { + _entry_docid = 0; + _entry_level = -1; + } + search::datastore::EntryRef invalid; + _node_refs[docid].store_release(invalid); +} + +HnswNode +HnswIndex::get_node(uint32_t docid) const +{ + auto node_ref = _node_refs[docid].load_acquire(); + if (!node_ref.valid()) { + return HnswNode(); + } + auto levels = _nodes.get(node_ref); + HnswNode::LevelArray result; + for (const auto& links_ref : levels) { + auto links = _links.get(links_ref.load_acquire()); + HnswNode::LinkArray result_links(links.begin(), links.end()); + std::sort(result_links.begin(), result_links.end()); + result.push_back(result_links); + } + return HnswNode(result); +} } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 2b0761d17ea..15ad4ebf07b 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -3,36 +3,148 @@ #pragma once #include "doc_vector_access.h" -#include "hnsw_index_base.h" +#include "hnsw_index_utils.h" +#include "hnsw_node.h" +#include "nearest_neighbor_index.h" +#include #include #include +#include #include #include namespace search::tensor { class DistanceFunction; -class DocVectorAccess; class RandomLevelGenerator; /** - * Concrete implementation of a hierarchical navigable small world graph (HNSW) + * Implementation of a hierarchical navigable small world graph (HNSW) * that is used for approximate K-nearest neighbor search. * - * See HnswIndexBase for more details. + * The implementation supports 1 write thread and multiple search threads without the use of mutexes. + * This is achieved by using data stores that use generation tracking and associated memory management. * - * The FloatType template argument specifies the data type used in the vectors (4 byte float or 8 byte double). + * The implementation is mainly based on the algorithms described in + * "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs" (Yu. A. Malkov, D. A. Yashunin), + * but some adjustments are made to support proper removes. + * + * TODO: Add details on how to handle removes. */ -template -class HnswIndex : public HnswIndexBase { +class HnswIndex : public NearestNeighborIndex { +public: + class Config { + private: + uint32_t _max_links_at_level_0; + uint32_t _max_links_at_hierarchic_levels; + uint32_t _neighbors_to_explore_at_construction; + bool _heuristic_select_neighbors; + + public: + Config(uint32_t max_links_at_level_0_in, + uint32_t max_links_at_hierarchic_levels_in, + uint32_t neighbors_to_explore_at_construction_in, + bool heuristic_select_neighbors_in) + : _max_links_at_level_0(max_links_at_level_0_in), + _max_links_at_hierarchic_levels(max_links_at_hierarchic_levels_in), + _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in), + _heuristic_select_neighbors(heuristic_select_neighbors_in) + {} + uint32_t max_links_at_level_0() const { return _max_links_at_level_0; } + uint32_t max_links_at_hierarchic_levels() const { return _max_links_at_hierarchic_levels; } + uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; } + bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; } + }; + +protected: + using AtomicEntryRef = search::datastore::AtomicEntryRef; + + // This uses 10 bits for buffer id -> 1024 buffers. + // As we have very short arrays we get less fragmentation with fewer and larger buffers. + using EntryRefType = search::datastore::EntryRefT<22>; + + // Provides mapping from document id -> node reference. + // The reference is used to lookup the node data in NodeStore. + using NodeRefVector = vespalib::RcuVector; + + // This stores the level arrays for all nodes. + // Each node consists of an array of levels (from level 0 to n) where each entry is a reference to the link array at that level. + using NodeStore = search::datastore::ArrayStore; + using LevelArrayRef = NodeStore::ConstArrayRef; + using LevelArray = vespalib::Array; + + // This stores all link arrays. + // A link array consists of the document ids of the nodes a particular node is linked to. + using LinkStore = search::datastore::ArrayStore; + using LinkArrayRef = LinkStore::ConstArrayRef; + using LinkArray = vespalib::Array; + + using TypedCells = vespalib::tensor::TypedCells; + + const DocVectorAccess& _vectors; + const DistanceFunction& _distance_func; + RandomLevelGenerator& _level_generator; + Config _cfg; + NodeRefVector _node_refs; + NodeStore _nodes; + LinkStore _links; + uint32_t _entry_docid; + int _entry_level; + + static search::datastore::ArrayStoreConfig make_default_node_store_config(); + static search::datastore::ArrayStoreConfig make_default_link_store_config(); + + uint32_t max_links_for_level(uint32_t level) const; + uint32_t make_node_for_document(uint32_t docid); + LevelArrayRef get_level_array(uint32_t docid) const; + LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const; + void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links); + + /** + * Returns true if the distance between the candidate and a node in the current result + * is less than the distance between the candidate and the node we want to add to the graph. + * In this case the candidate should be discarded as we already are connected to the space + * where the candidate is located. + * Used by select_neighbors_heuristic(). + */ + bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const; + LinkArray select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const; + LinkArray select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const; + LinkArray select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const; + void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level); + void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); + + inline TypedCells get_vector(uint32_t docid) const { + return _vectors.get_vector(docid); + } + + double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const; + double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; + + /** + * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. + */ + HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); + void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); + public: HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndex() override; -}; -template class HnswIndex; -template class HnswIndex; + void add_document(uint32_t docid) override; + void remove_document(uint32_t docid) override; + + // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) + + uint32_t get_entry_docid() const { return _entry_docid; } + uint32_t get_entry_level() const { return _entry_level; } + + // Should only be used by unit tests. + HnswNode get_node(uint32_t docid) const; + + // TODO: Implement set_node() as well for use in unit tests. +}; } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp deleted file mode 100644 index a58dba70746..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "distance_function.h" -#include "hnsw_index_base.h" -#include "random_level_generator.h" -#include -#include -#include - -namespace search::tensor { - -namespace { - -// TODO: Move this to MemoryAllocator, with name PAGE_SIZE. -constexpr size_t small_page_size = 4 * 1024; -constexpr size_t min_num_arrays_for_new_buffer = 8 * 1024; -constexpr float alloc_grow_factor = 0.2; -// TODO: Adjust these numbers to what we accept as max in config. -constexpr size_t max_level_array_size = 16; -constexpr size_t max_link_array_size = 64; - -} - -search::datastore::ArrayStoreConfig -HnswIndexBase::make_default_node_store_config() -{ - return NodeStore::optimizedConfigForHugePage(max_level_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, - small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); -} - -search::datastore::ArrayStoreConfig -HnswIndexBase::make_default_link_store_config() -{ - return LinkStore::optimizedConfigForHugePage(max_link_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE, - small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true); -} - -uint32_t -HnswIndexBase::max_links_for_level(uint32_t level) const -{ - return (level == 0) ? _cfg.max_links_at_level_0() : _cfg.max_links_at_hierarchic_levels(); -} - -uint32_t -HnswIndexBase::make_node_for_document(uint32_t docid) -{ - uint32_t max_level = _level_generator.max_level(); - // TODO: Add capping on num_levels - uint32_t num_levels = max_level + 1; - // Note: The level array instance lives as long as the document is present in the index. - LevelArray levels(num_levels, AtomicEntryRef()); - auto node_ref = _nodes.add(levels); - _node_refs[docid].store_release(node_ref); - return max_level; -} - -HnswIndexBase::LevelArrayRef -HnswIndexBase::get_level_array(uint32_t docid) const -{ - auto node_ref = _node_refs[docid].load_acquire(); - return _nodes.get(node_ref); -} - -HnswIndexBase::LinkArrayRef -HnswIndexBase::get_link_array(uint32_t docid, uint32_t level) const -{ - auto levels = get_level_array(docid); - assert(level < levels.size()); - return _links.get(levels[level].load_acquire()); -} - -void -HnswIndexBase::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links) -{ - auto links_ref = _links.add(links); - auto node_ref = _node_refs[docid].load_acquire(); - auto levels = _nodes.get_writable(node_ref); - levels[level].store_release(links_ref); -} - -bool -HnswIndexBase::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const -{ - for (uint32_t result_docid : result) { - double dist = calc_distance(candidate.docid, result_docid); - if (dist < candidate.distance) { - return true; - } - } - return false; -} - -HnswIndexBase::LinkArray -HnswIndexBase::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const -{ - HnswCandidateVector sorted(neighbors); - std::sort(sorted.begin(), sorted.end(), LesserDistance()); - LinkArray result; - for (size_t i = 0, m = std::min(static_cast(max_links), sorted.size()); i < m; ++i) { - result.push_back(sorted[i].docid); - } - return result; -} - -HnswIndexBase::LinkArray -HnswIndexBase::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const -{ - LinkArray result; - bool need_filtering = neighbors.size() > max_links; - NearestPriQ nearest; - for (const auto& entry : neighbors) { - nearest.push(entry); - } - while (!nearest.empty()) { - auto candidate = nearest.top(); - nearest.pop(); - if (need_filtering && have_closer_distance(candidate, result)) { - continue; - } - result.push_back(candidate.docid); - if (result.size() == max_links) { - return result; - } - } - return result; -} - -HnswIndexBase::LinkArray -HnswIndexBase::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const -{ - if (_cfg.heuristic_select_neighbors()) { - return select_neighbors_heuristic(neighbors, max_links); - } else { - return select_neighbors_simple(neighbors, max_links); - } -} - -void -HnswIndexBase::connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level) -{ - set_link_array(docid, level, neighbors); - for (uint32_t neighbor_docid : neighbors) { - auto old_links = get_link_array(neighbor_docid, level); - LinkArray new_links(old_links.begin(), old_links.end()); - new_links.push_back(docid); - set_link_array(neighbor_docid, level, new_links); - } -} - -void -HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level) -{ - LinkArray new_links; - auto old_links = get_link_array(remove_from, level); - for (uint32_t id : old_links) { - if (id != remove_id) new_links.push_back(id); - } - set_link_array(remove_from, level, new_links); -} - - -double -HnswIndexBase::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const -{ - auto lhs = get_vector(lhs_docid); - return calc_distance(lhs, rhs_docid); -} - -double -HnswIndexBase::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const -{ - auto rhs = get_vector(rhs_docid); - return _distance_func.calc(lhs, rhs); -} - -HnswCandidate -HnswIndexBase::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) -{ - HnswCandidate nearest = entry_point; - bool keep_searching = true; - while (keep_searching) { - keep_searching = false; - for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) { - double dist = calc_distance(input, neighbor_docid); - if (dist < nearest.distance) { - nearest = HnswCandidate(neighbor_docid, dist); - keep_searching = true; - } - } - } - return nearest; -} - -void -HnswIndexBase::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) -{ - NearestPriQ candidates; - // TODO: Add proper handling of visited set. - auto visited = BitVector::create(_node_refs.size()); - for (const auto &entry : best_neighbors.peek()) { - candidates.push(entry); - visited->setBit(entry.docid); - } - double limit_dist = std::numeric_limits::max(); - - while (!candidates.empty()) { - auto cand = candidates.top(); - if (cand.distance > limit_dist) { - break; - } - candidates.pop(); - for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) { - if (visited->testBit(neighbor_docid)) { - continue; - } - visited->setBit(neighbor_docid); - double dist_to_input = calc_distance(input, neighbor_docid); - if (dist_to_input < limit_dist) { - candidates.emplace(neighbor_docid, dist_to_input); - best_neighbors.emplace(neighbor_docid, dist_to_input); - if (best_neighbors.size() > neighbors_to_find) { - best_neighbors.pop(); - limit_dist = best_neighbors.top().distance; - } - } - } - } -} - -HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, - RandomLevelGenerator& level_generator, const Config& cfg) - : _vectors(vectors), - _distance_func(distance_func), - _level_generator(level_generator), - _cfg(cfg), - _node_refs(), - _nodes(make_default_node_store_config()), - _links(make_default_link_store_config()), - _entry_docid(0), // Note that docid 0 is reserved and never used - _entry_level(-1) -{ -} - -HnswIndexBase::~HnswIndexBase() = default; - -void -HnswIndexBase::add_document(uint32_t docid) -{ - auto input = get_vector(docid); - _node_refs.ensure_size(docid + 1, AtomicEntryRef()); - // A document cannot be added twice. - assert(!_node_refs[docid].load_acquire().valid()); - int level = make_node_for_document(docid); - if (_entry_docid == 0) { - _entry_docid = docid; - _entry_level = level; - return; - } - - int search_level = _entry_level; - double entry_dist = calc_distance(input, _entry_docid); - HnswCandidate entry_point(_entry_docid, entry_dist); - while (search_level > level) { - entry_point = find_nearest_in_layer(input, entry_point, search_level); - --search_level; - } - - FurthestPriQ best_neighbors; - best_neighbors.push(entry_point); - search_level = std::min(level, _entry_level); - - // Insert the added document in each level it should exist in. - while (search_level >= 0) { - // TODO: Rename to search_level? - search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level); - auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level)); - connect_new_node(docid, neighbors, search_level); - // TODO: Shrink neighbors if needed - --search_level; - } - if (level > _entry_level) { - _entry_docid = docid; - _entry_level = level; - } -} - -void -HnswIndexBase::remove_document(uint32_t docid) -{ - bool need_new_entrypoint = (docid == _entry_docid); - LinkArray empty; - LevelArrayRef node_levels = get_level_array(docid); - for (int level = node_levels.size(); level-- > 0; ) { - LinkArrayRef my_links = get_link_array(docid, level); - for (uint32_t neighbor_id : my_links) { - if (need_new_entrypoint) { - _entry_docid = neighbor_id; - _entry_level = level; - need_new_entrypoint = false; - } - remove_link_to(neighbor_id, docid, level); - } - set_link_array(docid, level, empty); - } - if (need_new_entrypoint) { - _entry_docid = 0; - _entry_level = -1; - } - search::datastore::EntryRef invalid; - _node_refs[docid].store_release(invalid); -} - -HnswNode -HnswIndexBase::get_node(uint32_t docid) const -{ - auto node_ref = _node_refs[docid].load_acquire(); - if (!node_ref.valid()) { - return HnswNode(); - } - auto levels = _nodes.get(node_ref); - HnswNode::LevelArray result; - for (const auto& links_ref : levels) { - auto links = _links.get(links_ref.load_acquire()); - HnswNode::LinkArray result_links(links.begin(), links.end()); - std::sort(result_links.begin(), result_links.end()); - result.push_back(result_links); - } - return HnswNode(result); -} - -} - diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h deleted file mode 100644 index 992f3587651..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "doc_vector_access.h" -#include "hnsw_index_utils.h" -#include "hnsw_node.h" -#include "nearest_neighbor_index.h" -#include -#include -#include -#include -#include -#include - -namespace search::tensor { - -class DistanceFunction; -class RandomLevelGenerator; - -/** - * Base class for an implementation of a hierarchical navigable small world graph (HNSW) - * that is used for approximate K-nearest neighbor search. - * - * The implementation supports 1 write thread and multiple search threads without the use of mutexes. - * This is achieved by using data stores that use generation tracking and associated memory management. - * - * The implementation is mainly based on the algorithms described in - * "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs" (Yu. A. Malkov, D. A. Yashunin), - * but some adjustments are made to support proper removes. - * - * TODO: Add details on how to handle removes. - */ -class HnswIndexBase : public NearestNeighborIndex { -public: - class Config { - private: - uint32_t _max_links_at_level_0; - uint32_t _max_links_at_hierarchic_levels; - uint32_t _neighbors_to_explore_at_construction; - bool _heuristic_select_neighbors; - - public: - Config(uint32_t max_links_at_level_0_in, - uint32_t max_links_at_hierarchic_levels_in, - uint32_t neighbors_to_explore_at_construction_in, - bool heuristic_select_neighbors_in) - : _max_links_at_level_0(max_links_at_level_0_in), - _max_links_at_hierarchic_levels(max_links_at_hierarchic_levels_in), - _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in), - _heuristic_select_neighbors(heuristic_select_neighbors_in) - {} - uint32_t max_links_at_level_0() const { return _max_links_at_level_0; } - uint32_t max_links_at_hierarchic_levels() const { return _max_links_at_hierarchic_levels; } - uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; } - bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; } - }; - -protected: - using AtomicEntryRef = search::datastore::AtomicEntryRef; - - // This uses 10 bits for buffer id -> 1024 buffers. - // As we have very short arrays we get less fragmentation with fewer and larger buffers. - using EntryRefType = search::datastore::EntryRefT<22>; - - // Provides mapping from document id -> node reference. - // The reference is used to lookup the node data in NodeStore. - using NodeRefVector = vespalib::RcuVector; - - // This stores the level arrays for all nodes. - // Each node consists of an array of levels (from level 0 to n) where each entry is a reference to the link array at that level. - using NodeStore = search::datastore::ArrayStore; - using LevelArrayRef = NodeStore::ConstArrayRef; - using LevelArray = vespalib::Array; - - // This stores all link arrays. - // A link array consists of the document ids of the nodes a particular node is linked to. - using LinkStore = search::datastore::ArrayStore; - using LinkArrayRef = LinkStore::ConstArrayRef; - using LinkArray = vespalib::Array; - - using TypedCells = vespalib::tensor::TypedCells; - - const DocVectorAccess& _vectors; - const DistanceFunction& _distance_func; - RandomLevelGenerator& _level_generator; - Config _cfg; - NodeRefVector _node_refs; - NodeStore _nodes; - LinkStore _links; - uint32_t _entry_docid; - int _entry_level; - - static search::datastore::ArrayStoreConfig make_default_node_store_config(); - static search::datastore::ArrayStoreConfig make_default_link_store_config(); - - uint32_t max_links_for_level(uint32_t level) const; - uint32_t make_node_for_document(uint32_t docid); - LevelArrayRef get_level_array(uint32_t docid) const; - LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const; - void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links); - - /** - * Returns true if the distance between the candidate and a node in the current result - * is less than the distance between the candidate and the node we want to add to the graph. - * In this case the candidate should be discarded as we already are connected to the space - * where the candidate is located. - * Used by select_neighbors_heuristic(). - */ - bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const; - LinkArray select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const; - LinkArray select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const; - LinkArray select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const; - void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level); - void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); - - inline TypedCells get_vector(uint32_t docid) const { - return _vectors.get_vector(docid); - } - - double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const; - double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; - - /** - * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. - */ - HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); - void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); - -public: - HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, - RandomLevelGenerator& level_generator, const Config& cfg); - ~HnswIndexBase() override; - - void add_document(uint32_t docid) override; - void remove_document(uint32_t docid) override; - - // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) - - uint32_t get_entry_docid() const { return _entry_docid; } - uint32_t get_entry_level() const { return _entry_level; } - - // Should only be used by unit tests. - HnswNode get_node(uint32_t docid) const; - - // TODO: Implement set_node() as well for use in unit tests. -}; - -} - -- cgit v1.2.3 From e3e77d310e0ca6363a6b4571ca55f85356db60af Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Tue, 11 Feb 2020 11:43:54 +0000 Subject: Remove unneeded "template" keyword. --- searchlib/src/vespa/searchlib/tensor/distance_functions.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index dabab55dc7a..1e8727e92aa 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -15,8 +15,8 @@ template class SquaredEuclideanDistance : public DistanceFunction { public: double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override { - auto lhs_vector = lhs.template typify(); - auto rhs_vector = rhs.template typify(); + auto lhs_vector = lhs.typify(); + auto rhs_vector = rhs.typify(); double result = 0.0; size_t sz = lhs_vector.size(); assert(sz == rhs_vector.size()); -- cgit v1.2.3