diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-02-10 12:19:40 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2020-02-10 12:19:40 +0000 |
commit | 1debbb54c26ce0c146a1d0711beea55adef3173b (patch) | |
tree | 34e4c34d713762bb715696136423932463230435 | |
parent | 8164e4a5c4910a878e49663730de5c2b310d5940 (diff) |
Move code to the hnsw index base class as we no longer depend on the float type.
-rw-r--r-- | searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp | 142 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 18 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp | 138 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h | 23 |
4 files changed, 158 insertions, 163 deletions
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index cb617246066..9ad578ae983 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -2,83 +2,10 @@ #include "distance_function.h" #include "hnsw_index.h" -#include <vespa/vespalib/util/rcuvector.hpp> namespace search::tensor { template <typename FloatType> -double -HnswIndex<FloatType>::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const -{ - auto lhs = get_vector(lhs_docid); - return calc_distance(lhs, rhs_docid); -} - -template <typename FloatType> -double -HnswIndex<FloatType>::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const -{ - auto rhs = get_vector(rhs_docid); - return _distance_func.calc(lhs, rhs); -} - -template <typename FloatType> -HnswCandidate -HnswIndex<FloatType>::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) -{ - HnswCandidate nearest = entry_point; - bool keep_searching = true; - while (keep_searching) { - keep_searching = false; - for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) { - double dist = calc_distance(input, neighbor_docid); - if (dist < nearest.distance) { - nearest = HnswCandidate(neighbor_docid, dist); - keep_searching = true; - } - } - } - return nearest; -} - -template <typename FloatType> -void -HnswIndex<FloatType>::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) -{ - NearestPriQ candidates; - // TODO: Add proper handling of visited set. - auto visited = BitVector::create(_node_refs.size()); - for (const auto &entry : best_neighbors.peek()) { - candidates.push(entry); - visited->setBit(entry.docid); - } - double limit_dist = std::numeric_limits<double>::max(); - - while (!candidates.empty()) { - auto cand = candidates.top(); - if (cand.distance > limit_dist) { - break; - } - candidates.pop(); - for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) { - if (visited->testBit(neighbor_docid)) { - continue; - } - visited->setBit(neighbor_docid); - double dist_to_input = calc_distance(input, neighbor_docid); - if (dist_to_input < limit_dist) { - candidates.emplace(neighbor_docid, dist_to_input); - best_neighbors.emplace(neighbor_docid, dist_to_input); - if (best_neighbors.size() > neighbors_to_find) { - best_neighbors.pop(); - limit_dist = best_neighbors.top().distance; - } - } - } - } -} - -template <typename FloatType> HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg) : HnswIndexBase(vectors, distance_func, level_generator, cfg) @@ -88,74 +15,5 @@ HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, const DistanceFu template <typename FloatType> HnswIndex<FloatType>::~HnswIndex() = default; -template <typename FloatType> -void -HnswIndex<FloatType>::add_document(uint32_t docid) -{ - auto input = get_vector(docid); - _node_refs.ensure_size(docid + 1, AtomicEntryRef()); - // A document cannot be added twice. - assert(!_node_refs[docid].load_acquire().valid()); - int level = make_node_for_document(docid); - if (_entry_docid == 0) { - _entry_docid = docid; - _entry_level = level; - return; - } - - int search_level = _entry_level; - double entry_dist = calc_distance(input, _entry_docid); - HnswCandidate entry_point(_entry_docid, entry_dist); - while (search_level > level) { - entry_point = find_nearest_in_layer(input, entry_point, search_level); - --search_level; - } - - FurthestPriQ best_neighbors; - best_neighbors.push(entry_point); - search_level = std::min(level, _entry_level); - - // Insert the added document in each level it should exist in. - while (search_level >= 0) { - // TODO: Rename to search_level? - search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level); - auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level)); - connect_new_node(docid, neighbors, search_level); - // TODO: Shrink neighbors if needed - --search_level; - } - if (level > _entry_level) { - _entry_docid = docid; - _entry_level = level; - } -} - -template <typename FloatType> -void -HnswIndex<FloatType>::remove_document(uint32_t docid) -{ - bool need_new_entrypoint = (docid == _entry_docid); - LinkArray empty; - LevelArrayRef node_levels = get_level_array(docid); - for (int level = node_levels.size(); level-- > 0; ) { - LinkArrayRef my_links = get_link_array(docid, level); - for (uint32_t neighbor_id : my_links) { - if (need_new_entrypoint) { - _entry_docid = neighbor_id; - _entry_level = level; - need_new_entrypoint = false; - } - remove_link_to(neighbor_id, docid, level); - } - set_link_array(docid, level, empty); - } - if (need_new_entrypoint) { - _entry_docid = 0; - _entry_level = -1; - } - search::datastore::EntryRef invalid; - _node_refs[docid].store_release(invalid); -} - } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index a93062cf200..2b0761d17ea 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -25,28 +25,10 @@ class RandomLevelGenerator; */ template <typename FloatType = float> class HnswIndex : public HnswIndexBase { -private: - using TypedCells = vespalib::tensor::TypedCells; - - inline TypedCells get_vector(uint32_t docid) const { - return _vectors.get_vector(docid); - } - - double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override; - double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; - /** - * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. - */ - HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); - void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); - public: HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndex() override; - - void add_document(uint32_t docid) override; - void remove_document(uint32_t docid) override; }; template class HnswIndex<float>; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp index e0ecdf87399..a58dba70746 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp @@ -1,7 +1,9 @@ // Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "distance_function.h" #include "hnsw_index_base.h" #include "random_level_generator.h" +#include <vespa/eval/tensor/dense/typed_cells.h> #include <vespa/vespalib/datastore/array_store.hpp> #include <vespa/vespalib/util/rcuvector.hpp> @@ -156,6 +158,75 @@ HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t set_link_array(remove_from, level, new_links); } + +double +HnswIndexBase::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const +{ + auto lhs = get_vector(lhs_docid); + return calc_distance(lhs, rhs_docid); +} + +double +HnswIndexBase::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const +{ + auto rhs = get_vector(rhs_docid); + return _distance_func.calc(lhs, rhs); +} + +HnswCandidate +HnswIndexBase::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) +{ + HnswCandidate nearest = entry_point; + bool keep_searching = true; + while (keep_searching) { + keep_searching = false; + for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) { + double dist = calc_distance(input, neighbor_docid); + if (dist < nearest.distance) { + nearest = HnswCandidate(neighbor_docid, dist); + keep_searching = true; + } + } + } + return nearest; +} + +void +HnswIndexBase::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) +{ + NearestPriQ candidates; + // TODO: Add proper handling of visited set. + auto visited = BitVector::create(_node_refs.size()); + for (const auto &entry : best_neighbors.peek()) { + candidates.push(entry); + visited->setBit(entry.docid); + } + double limit_dist = std::numeric_limits<double>::max(); + + while (!candidates.empty()) { + auto cand = candidates.top(); + if (cand.distance > limit_dist) { + break; + } + candidates.pop(); + for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) { + if (visited->testBit(neighbor_docid)) { + continue; + } + visited->setBit(neighbor_docid); + double dist_to_input = calc_distance(input, neighbor_docid); + if (dist_to_input < limit_dist) { + candidates.emplace(neighbor_docid, dist_to_input); + best_neighbors.emplace(neighbor_docid, dist_to_input); + if (best_neighbors.size() > neighbors_to_find) { + best_neighbors.pop(); + limit_dist = best_neighbors.top().distance; + } + } + } + } +} + HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg) : _vectors(vectors), @@ -172,6 +243,73 @@ HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunct HnswIndexBase::~HnswIndexBase() = default; +void +HnswIndexBase::add_document(uint32_t docid) +{ + auto input = get_vector(docid); + _node_refs.ensure_size(docid + 1, AtomicEntryRef()); + // A document cannot be added twice. + assert(!_node_refs[docid].load_acquire().valid()); + int level = make_node_for_document(docid); + if (_entry_docid == 0) { + _entry_docid = docid; + _entry_level = level; + return; + } + + int search_level = _entry_level; + double entry_dist = calc_distance(input, _entry_docid); + HnswCandidate entry_point(_entry_docid, entry_dist); + while (search_level > level) { + entry_point = find_nearest_in_layer(input, entry_point, search_level); + --search_level; + } + + FurthestPriQ best_neighbors; + best_neighbors.push(entry_point); + search_level = std::min(level, _entry_level); + + // Insert the added document in each level it should exist in. + while (search_level >= 0) { + // TODO: Rename to search_level? + search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level); + auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level)); + connect_new_node(docid, neighbors, search_level); + // TODO: Shrink neighbors if needed + --search_level; + } + if (level > _entry_level) { + _entry_docid = docid; + _entry_level = level; + } +} + +void +HnswIndexBase::remove_document(uint32_t docid) +{ + bool need_new_entrypoint = (docid == _entry_docid); + LinkArray empty; + LevelArrayRef node_levels = get_level_array(docid); + for (int level = node_levels.size(); level-- > 0; ) { + LinkArrayRef my_links = get_link_array(docid, level); + for (uint32_t neighbor_id : my_links) { + if (need_new_entrypoint) { + _entry_docid = neighbor_id; + _entry_level = level; + need_new_entrypoint = false; + } + remove_link_to(neighbor_id, docid, level); + } + set_link_array(docid, level, empty); + } + if (need_new_entrypoint) { + _entry_docid = 0; + _entry_level = -1; + } + search::datastore::EntryRef invalid; + _node_refs[docid].store_release(invalid); +} + HnswNode HnswIndexBase::get_node(uint32_t docid) const { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h index ab59ad02f14..992f3587651 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h @@ -2,9 +2,11 @@ #pragma once +#include "doc_vector_access.h" #include "hnsw_index_utils.h" #include "hnsw_node.h" #include "nearest_neighbor_index.h" +#include <vespa/eval/tensor/dense/typed_cells.h> #include <vespa/searchlib/common/bitvector.h> #include <vespa/vespalib/datastore/array_store.h> #include <vespa/vespalib/datastore/atomic_entry_ref.h> @@ -14,7 +16,6 @@ namespace search::tensor { class DistanceFunction; -class DocVectorAccess; class RandomLevelGenerator; /** @@ -78,6 +79,8 @@ protected: using LinkArrayRef = LinkStore::ConstArrayRef; using LinkArray = vespalib::Array<uint32_t>; + using TypedCells = vespalib::tensor::TypedCells; + const DocVectorAccess& _vectors; const DistanceFunction& _distance_func; RandomLevelGenerator& _level_generator; @@ -97,8 +100,6 @@ protected: LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const; void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links); - virtual double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const = 0; - /** * Returns true if the distance between the candidate and a node in the current result * is less than the distance between the candidate and the node we want to add to the graph. @@ -113,11 +114,27 @@ protected: void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level); void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); + inline TypedCells get_vector(uint32_t docid) const { + return _vectors.get_vector(docid); + } + + double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const; + double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; + + /** + * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. + */ + HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); + void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); + public: HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndexBase() override; + void add_document(uint32_t docid) override; + void remove_document(uint32_t docid) override; + // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) uint32_t get_entry_docid() const { return _entry_docid; } |