summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2020-02-10 12:30:47 +0000
committerGeir Storli <geirst@verizonmedia.com>2020-02-10 12:30:47 +0000
commitb9e867c4a20e9aa340dbdb110066083ebcc2a619 (patch)
tree10af1d56dbc588b8f078ddb44e13aae354f061e0
parent1debbb54c26ce0c146a1d0711beea55adef3173b (diff)
Rename HnswIndexBase -> HnswIndex and remove the templated class.
-rw-r--r--searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp9
-rw-r--r--searchlib/src/vespa/searchlib/tensor/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp325
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h132
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp332
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h150
6 files changed, 445 insertions, 504 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
index 0c5ddb93e92..081361b2fdc 100644
--- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
@@ -43,15 +43,14 @@ struct LevelGenerator : public RandomLevelGenerator {
using FloatVectors = MyDocVectorAccess<float>;
using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>;
-using FloatIndex = HnswIndex<float>;
-using FloatIndexUP = std::unique_ptr<FloatIndex>;
+using HnswIndexUP = std::unique_ptr<HnswIndex>;
class HnswIndexTest : public ::testing::Test {
public:
FloatVectors vectors;
FloatSqEuclideanDistance distance_func;
LevelGenerator level_generator;
- FloatIndexUP index;
+ HnswIndexUP index;
HnswIndexTest()
: vectors(),
@@ -63,8 +62,8 @@ public:
.set(7, {3, 5});
}
void init(bool heuristic_select_neighbors) {
- index = std::make_unique<FloatIndex>(vectors, distance_func, level_generator,
- HnswIndexBase::Config(2, 1, 10, heuristic_select_neighbors));
+ index = std::make_unique<HnswIndex>(vectors, distance_func, level_generator,
+ HnswIndex::Config(2, 1, 10, heuristic_select_neighbors));
}
void add_document(uint32_t docid, uint32_t max_level = 0) {
level_generator.level = max_level;
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index 7c05304d451..9175168248c 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -6,7 +6,6 @@ vespa_add_library(searchlib_tensor OBJECT
dense_tensor_store.cpp
generic_tensor_attribute.cpp
generic_tensor_store.cpp
- hnsw_index_base.cpp
hnsw_index.cpp
imported_tensor_attribute_vector.cpp
imported_tensor_attribute_vector_read_guard.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 9ad578ae983..9600f2fd9d4 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -2,18 +2,331 @@
#include "distance_function.h"
#include "hnsw_index.h"
+#include "random_level_generator.h"
+#include <vespa/eval/tensor/dense/typed_cells.h>
+#include <vespa/vespalib/datastore/array_store.hpp>
+#include <vespa/vespalib/util/rcuvector.hpp>
namespace search::tensor {
-template <typename FloatType>
-HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
- RandomLevelGenerator& level_generator, const Config& cfg)
- : HnswIndexBase(vectors, distance_func, level_generator, cfg)
+namespace {
+
+// TODO: Move this to MemoryAllocator, with name PAGE_SIZE.
+constexpr size_t small_page_size = 4 * 1024;
+constexpr size_t min_num_arrays_for_new_buffer = 8 * 1024;
+constexpr float alloc_grow_factor = 0.2;
+// TODO: Adjust these numbers to what we accept as max in config.
+constexpr size_t max_level_array_size = 16;
+constexpr size_t max_link_array_size = 64;
+
+}
+
+search::datastore::ArrayStoreConfig
+HnswIndex::make_default_node_store_config()
+{
+ return NodeStore::optimizedConfigForHugePage(max_level_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE,
+ small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true);
+}
+
+search::datastore::ArrayStoreConfig
+HnswIndex::make_default_link_store_config()
+{
+ return LinkStore::optimizedConfigForHugePage(max_link_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE,
+ small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true);
+}
+
+uint32_t
+HnswIndex::max_links_for_level(uint32_t level) const
{
+ return (level == 0) ? _cfg.max_links_at_level_0() : _cfg.max_links_at_hierarchic_levels();
}
-template <typename FloatType>
-HnswIndex<FloatType>::~HnswIndex() = default;
+uint32_t
+HnswIndex::make_node_for_document(uint32_t docid)
+{
+ uint32_t max_level = _level_generator.max_level();
+ // TODO: Add capping on num_levels
+ uint32_t num_levels = max_level + 1;
+ // Note: The level array instance lives as long as the document is present in the index.
+ LevelArray levels(num_levels, AtomicEntryRef());
+ auto node_ref = _nodes.add(levels);
+ _node_refs[docid].store_release(node_ref);
+ return max_level;
+}
+
+HnswIndex::LevelArrayRef
+HnswIndex::get_level_array(uint32_t docid) const
+{
+ auto node_ref = _node_refs[docid].load_acquire();
+ return _nodes.get(node_ref);
+}
+
+HnswIndex::LinkArrayRef
+HnswIndex::get_link_array(uint32_t docid, uint32_t level) const
+{
+ auto levels = get_level_array(docid);
+ assert(level < levels.size());
+ return _links.get(levels[level].load_acquire());
+}
+
+void
+HnswIndex::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links)
+{
+ auto links_ref = _links.add(links);
+ auto node_ref = _node_refs[docid].load_acquire();
+ auto levels = _nodes.get_writable(node_ref);
+ levels[level].store_release(links_ref);
+}
+
+bool
+HnswIndex::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const
+{
+ for (uint32_t result_docid : result) {
+ double dist = calc_distance(candidate.docid, result_docid);
+ if (dist < candidate.distance) {
+ return true;
+ }
+ }
+ return false;
+}
+
+HnswIndex::LinkArray
+HnswIndex::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const
+{
+ HnswCandidateVector sorted(neighbors);
+ std::sort(sorted.begin(), sorted.end(), LesserDistance());
+ LinkArray result;
+ for (size_t i = 0, m = std::min(static_cast<size_t>(max_links), sorted.size()); i < m; ++i) {
+ result.push_back(sorted[i].docid);
+ }
+ return result;
+}
+
+HnswIndex::LinkArray
+HnswIndex::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const
+{
+ LinkArray result;
+ bool need_filtering = neighbors.size() > max_links;
+ NearestPriQ nearest;
+ for (const auto& entry : neighbors) {
+ nearest.push(entry);
+ }
+ while (!nearest.empty()) {
+ auto candidate = nearest.top();
+ nearest.pop();
+ if (need_filtering && have_closer_distance(candidate, result)) {
+ continue;
+ }
+ result.push_back(candidate.docid);
+ if (result.size() == max_links) {
+ return result;
+ }
+ }
+ return result;
+}
+
+HnswIndex::LinkArray
+HnswIndex::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const
+{
+ if (_cfg.heuristic_select_neighbors()) {
+ return select_neighbors_heuristic(neighbors, max_links);
+ } else {
+ return select_neighbors_simple(neighbors, max_links);
+ }
+}
+
+void
+HnswIndex::connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level)
+{
+ set_link_array(docid, level, neighbors);
+ for (uint32_t neighbor_docid : neighbors) {
+ auto old_links = get_link_array(neighbor_docid, level);
+ LinkArray new_links(old_links.begin(), old_links.end());
+ new_links.push_back(docid);
+ set_link_array(neighbor_docid, level, new_links);
+ }
+}
+
+void
+HnswIndex::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level)
+{
+ LinkArray new_links;
+ auto old_links = get_link_array(remove_from, level);
+ for (uint32_t id : old_links) {
+ if (id != remove_id) new_links.push_back(id);
+ }
+ set_link_array(remove_from, level, new_links);
+}
+
+
+double
+HnswIndex::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const
+{
+ auto lhs = get_vector(lhs_docid);
+ return calc_distance(lhs, rhs_docid);
+}
+
+double
+HnswIndex::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const
+{
+ auto rhs = get_vector(rhs_docid);
+ return _distance_func.calc(lhs, rhs);
+}
+
+HnswCandidate
+HnswIndex::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level)
+{
+ HnswCandidate nearest = entry_point;
+ bool keep_searching = true;
+ while (keep_searching) {
+ keep_searching = false;
+ for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) {
+ double dist = calc_distance(input, neighbor_docid);
+ if (dist < nearest.distance) {
+ nearest = HnswCandidate(neighbor_docid, dist);
+ keep_searching = true;
+ }
+ }
+ }
+ return nearest;
+}
+
+void
+HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level)
+{
+ NearestPriQ candidates;
+ // TODO: Add proper handling of visited set.
+ auto visited = BitVector::create(_node_refs.size());
+ for (const auto &entry : best_neighbors.peek()) {
+ candidates.push(entry);
+ visited->setBit(entry.docid);
+ }
+ double limit_dist = std::numeric_limits<double>::max();
+
+ while (!candidates.empty()) {
+ auto cand = candidates.top();
+ if (cand.distance > limit_dist) {
+ break;
+ }
+ candidates.pop();
+ for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) {
+ if (visited->testBit(neighbor_docid)) {
+ continue;
+ }
+ visited->setBit(neighbor_docid);
+ double dist_to_input = calc_distance(input, neighbor_docid);
+ if (dist_to_input < limit_dist) {
+ candidates.emplace(neighbor_docid, dist_to_input);
+ best_neighbors.emplace(neighbor_docid, dist_to_input);
+ if (best_neighbors.size() > neighbors_to_find) {
+ best_neighbors.pop();
+ limit_dist = best_neighbors.top().distance;
+ }
+ }
+ }
+ }
+}
+
+HnswIndex::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
+ RandomLevelGenerator& level_generator, const Config& cfg)
+ : _vectors(vectors),
+ _distance_func(distance_func),
+ _level_generator(level_generator),
+ _cfg(cfg),
+ _node_refs(),
+ _nodes(make_default_node_store_config()),
+ _links(make_default_link_store_config()),
+ _entry_docid(0), // Note that docid 0 is reserved and never used
+ _entry_level(-1)
+{
+}
+
+HnswIndex::~HnswIndex() = default;
+
+void
+HnswIndex::add_document(uint32_t docid)
+{
+ auto input = get_vector(docid);
+ _node_refs.ensure_size(docid + 1, AtomicEntryRef());
+ // A document cannot be added twice.
+ assert(!_node_refs[docid].load_acquire().valid());
+ int level = make_node_for_document(docid);
+ if (_entry_docid == 0) {
+ _entry_docid = docid;
+ _entry_level = level;
+ return;
+ }
+
+ int search_level = _entry_level;
+ double entry_dist = calc_distance(input, _entry_docid);
+ HnswCandidate entry_point(_entry_docid, entry_dist);
+ while (search_level > level) {
+ entry_point = find_nearest_in_layer(input, entry_point, search_level);
+ --search_level;
+ }
+
+ FurthestPriQ best_neighbors;
+ best_neighbors.push(entry_point);
+ search_level = std::min(level, _entry_level);
+
+ // Insert the added document in each level it should exist in.
+ while (search_level >= 0) {
+ // TODO: Rename to search_level?
+ search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level);
+ auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level));
+ connect_new_node(docid, neighbors, search_level);
+ // TODO: Shrink neighbors if needed
+ --search_level;
+ }
+ if (level > _entry_level) {
+ _entry_docid = docid;
+ _entry_level = level;
+ }
+}
+
+void
+HnswIndex::remove_document(uint32_t docid)
+{
+ bool need_new_entrypoint = (docid == _entry_docid);
+ LinkArray empty;
+ LevelArrayRef node_levels = get_level_array(docid);
+ for (int level = node_levels.size(); level-- > 0; ) {
+ LinkArrayRef my_links = get_link_array(docid, level);
+ for (uint32_t neighbor_id : my_links) {
+ if (need_new_entrypoint) {
+ _entry_docid = neighbor_id;
+ _entry_level = level;
+ need_new_entrypoint = false;
+ }
+ remove_link_to(neighbor_id, docid, level);
+ }
+ set_link_array(docid, level, empty);
+ }
+ if (need_new_entrypoint) {
+ _entry_docid = 0;
+ _entry_level = -1;
+ }
+ search::datastore::EntryRef invalid;
+ _node_refs[docid].store_release(invalid);
+}
+
+HnswNode
+HnswIndex::get_node(uint32_t docid) const
+{
+ auto node_ref = _node_refs[docid].load_acquire();
+ if (!node_ref.valid()) {
+ return HnswNode();
+ }
+ auto levels = _nodes.get(node_ref);
+ HnswNode::LevelArray result;
+ for (const auto& links_ref : levels) {
+ auto links = _links.get(links_ref.load_acquire());
+ HnswNode::LinkArray result_links(links.begin(), links.end());
+ std::sort(result_links.begin(), result_links.end());
+ result.push_back(result_links);
+ }
+ return HnswNode(result);
+}
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index 2b0761d17ea..15ad4ebf07b 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -3,36 +3,148 @@
#pragma once
#include "doc_vector_access.h"
-#include "hnsw_index_base.h"
+#include "hnsw_index_utils.h"
+#include "hnsw_node.h"
+#include "nearest_neighbor_index.h"
+#include <vespa/eval/tensor/dense/typed_cells.h>
#include <vespa/searchlib/common/bitvector.h>
#include <vespa/vespalib/datastore/array_store.h>
+#include <vespa/vespalib/datastore/atomic_entry_ref.h>
#include <vespa/vespalib/datastore/entryref.h>
#include <vespa/vespalib/util/rcuvector.h>
namespace search::tensor {
class DistanceFunction;
-class DocVectorAccess;
class RandomLevelGenerator;
/**
- * Concrete implementation of a hierarchical navigable small world graph (HNSW)
+ * Implementation of a hierarchical navigable small world graph (HNSW)
* that is used for approximate K-nearest neighbor search.
*
- * See HnswIndexBase for more details.
+ * The implementation supports 1 write thread and multiple search threads without the use of mutexes.
+ * This is achieved by using data stores that use generation tracking and associated memory management.
*
- * The FloatType template argument specifies the data type used in the vectors (4 byte float or 8 byte double).
+ * The implementation is mainly based on the algorithms described in
+ * "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs" (Yu. A. Malkov, D. A. Yashunin),
+ * but some adjustments are made to support proper removes.
+ *
+ * TODO: Add details on how to handle removes.
*/
-template <typename FloatType = float>
-class HnswIndex : public HnswIndexBase {
+class HnswIndex : public NearestNeighborIndex {
+public:
+ class Config {
+ private:
+ uint32_t _max_links_at_level_0;
+ uint32_t _max_links_at_hierarchic_levels;
+ uint32_t _neighbors_to_explore_at_construction;
+ bool _heuristic_select_neighbors;
+
+ public:
+ Config(uint32_t max_links_at_level_0_in,
+ uint32_t max_links_at_hierarchic_levels_in,
+ uint32_t neighbors_to_explore_at_construction_in,
+ bool heuristic_select_neighbors_in)
+ : _max_links_at_level_0(max_links_at_level_0_in),
+ _max_links_at_hierarchic_levels(max_links_at_hierarchic_levels_in),
+ _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in),
+ _heuristic_select_neighbors(heuristic_select_neighbors_in)
+ {}
+ uint32_t max_links_at_level_0() const { return _max_links_at_level_0; }
+ uint32_t max_links_at_hierarchic_levels() const { return _max_links_at_hierarchic_levels; }
+ uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; }
+ bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; }
+ };
+
+protected:
+ using AtomicEntryRef = search::datastore::AtomicEntryRef;
+
+ // This uses 10 bits for buffer id -> 1024 buffers.
+ // As we have very short arrays we get less fragmentation with fewer and larger buffers.
+ using EntryRefType = search::datastore::EntryRefT<22>;
+
+ // Provides mapping from document id -> node reference.
+ // The reference is used to lookup the node data in NodeStore.
+ using NodeRefVector = vespalib::RcuVector<AtomicEntryRef>;
+
+ // This stores the level arrays for all nodes.
+ // Each node consists of an array of levels (from level 0 to n) where each entry is a reference to the link array at that level.
+ using NodeStore = search::datastore::ArrayStore<AtomicEntryRef, EntryRefType>;
+ using LevelArrayRef = NodeStore::ConstArrayRef;
+ using LevelArray = vespalib::Array<AtomicEntryRef>;
+
+ // This stores all link arrays.
+ // A link array consists of the document ids of the nodes a particular node is linked to.
+ using LinkStore = search::datastore::ArrayStore<uint32_t, EntryRefType>;
+ using LinkArrayRef = LinkStore::ConstArrayRef;
+ using LinkArray = vespalib::Array<uint32_t>;
+
+ using TypedCells = vespalib::tensor::TypedCells;
+
+ const DocVectorAccess& _vectors;
+ const DistanceFunction& _distance_func;
+ RandomLevelGenerator& _level_generator;
+ Config _cfg;
+ NodeRefVector _node_refs;
+ NodeStore _nodes;
+ LinkStore _links;
+ uint32_t _entry_docid;
+ int _entry_level;
+
+ static search::datastore::ArrayStoreConfig make_default_node_store_config();
+ static search::datastore::ArrayStoreConfig make_default_link_store_config();
+
+ uint32_t max_links_for_level(uint32_t level) const;
+ uint32_t make_node_for_document(uint32_t docid);
+ LevelArrayRef get_level_array(uint32_t docid) const;
+ LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const;
+ void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links);
+
+ /**
+ * Returns true if the distance between the candidate and a node in the current result
+ * is less than the distance between the candidate and the node we want to add to the graph.
+ * In this case the candidate should be discarded as we already are connected to the space
+ * where the candidate is located.
+ * Used by select_neighbors_heuristic().
+ */
+ bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const;
+ LinkArray select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const;
+ LinkArray select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const;
+ LinkArray select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const;
+ void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level);
+ void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level);
+
+ inline TypedCells get_vector(uint32_t docid) const {
+ return _vectors.get_vector(docid);
+ }
+
+ double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const;
+ double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const;
+
+ /**
+ * Performs a greedy search in the given layer to find the candidate that is nearest the input vector.
+ */
+ HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level);
+ void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level);
+
public:
HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
RandomLevelGenerator& level_generator, const Config& cfg);
~HnswIndex() override;
-};
-template class HnswIndex<float>;
-template class HnswIndex<double>;
+ void add_document(uint32_t docid) override;
+ void remove_document(uint32_t docid) override;
+
+ // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists)
+
+ uint32_t get_entry_docid() const { return _entry_docid; }
+ uint32_t get_entry_level() const { return _entry_level; }
+
+ // Should only be used by unit tests.
+ HnswNode get_node(uint32_t docid) const;
+
+ // TODO: Implement set_node() as well for use in unit tests.
+};
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
deleted file mode 100644
index a58dba70746..00000000000
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
+++ /dev/null
@@ -1,332 +0,0 @@
-// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "distance_function.h"
-#include "hnsw_index_base.h"
-#include "random_level_generator.h"
-#include <vespa/eval/tensor/dense/typed_cells.h>
-#include <vespa/vespalib/datastore/array_store.hpp>
-#include <vespa/vespalib/util/rcuvector.hpp>
-
-namespace search::tensor {
-
-namespace {
-
-// TODO: Move this to MemoryAllocator, with name PAGE_SIZE.
-constexpr size_t small_page_size = 4 * 1024;
-constexpr size_t min_num_arrays_for_new_buffer = 8 * 1024;
-constexpr float alloc_grow_factor = 0.2;
-// TODO: Adjust these numbers to what we accept as max in config.
-constexpr size_t max_level_array_size = 16;
-constexpr size_t max_link_array_size = 64;
-
-}
-
-search::datastore::ArrayStoreConfig
-HnswIndexBase::make_default_node_store_config()
-{
- return NodeStore::optimizedConfigForHugePage(max_level_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE,
- small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true);
-}
-
-search::datastore::ArrayStoreConfig
-HnswIndexBase::make_default_link_store_config()
-{
- return LinkStore::optimizedConfigForHugePage(max_link_array_size, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE,
- small_page_size, min_num_arrays_for_new_buffer, alloc_grow_factor).enable_free_lists(true);
-}
-
-uint32_t
-HnswIndexBase::max_links_for_level(uint32_t level) const
-{
- return (level == 0) ? _cfg.max_links_at_level_0() : _cfg.max_links_at_hierarchic_levels();
-}
-
-uint32_t
-HnswIndexBase::make_node_for_document(uint32_t docid)
-{
- uint32_t max_level = _level_generator.max_level();
- // TODO: Add capping on num_levels
- uint32_t num_levels = max_level + 1;
- // Note: The level array instance lives as long as the document is present in the index.
- LevelArray levels(num_levels, AtomicEntryRef());
- auto node_ref = _nodes.add(levels);
- _node_refs[docid].store_release(node_ref);
- return max_level;
-}
-
-HnswIndexBase::LevelArrayRef
-HnswIndexBase::get_level_array(uint32_t docid) const
-{
- auto node_ref = _node_refs[docid].load_acquire();
- return _nodes.get(node_ref);
-}
-
-HnswIndexBase::LinkArrayRef
-HnswIndexBase::get_link_array(uint32_t docid, uint32_t level) const
-{
- auto levels = get_level_array(docid);
- assert(level < levels.size());
- return _links.get(levels[level].load_acquire());
-}
-
-void
-HnswIndexBase::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links)
-{
- auto links_ref = _links.add(links);
- auto node_ref = _node_refs[docid].load_acquire();
- auto levels = _nodes.get_writable(node_ref);
- levels[level].store_release(links_ref);
-}
-
-bool
-HnswIndexBase::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const
-{
- for (uint32_t result_docid : result) {
- double dist = calc_distance(candidate.docid, result_docid);
- if (dist < candidate.distance) {
- return true;
- }
- }
- return false;
-}
-
-HnswIndexBase::LinkArray
-HnswIndexBase::select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const
-{
- HnswCandidateVector sorted(neighbors);
- std::sort(sorted.begin(), sorted.end(), LesserDistance());
- LinkArray result;
- for (size_t i = 0, m = std::min(static_cast<size_t>(max_links), sorted.size()); i < m; ++i) {
- result.push_back(sorted[i].docid);
- }
- return result;
-}
-
-HnswIndexBase::LinkArray
-HnswIndexBase::select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const
-{
- LinkArray result;
- bool need_filtering = neighbors.size() > max_links;
- NearestPriQ nearest;
- for (const auto& entry : neighbors) {
- nearest.push(entry);
- }
- while (!nearest.empty()) {
- auto candidate = nearest.top();
- nearest.pop();
- if (need_filtering && have_closer_distance(candidate, result)) {
- continue;
- }
- result.push_back(candidate.docid);
- if (result.size() == max_links) {
- return result;
- }
- }
- return result;
-}
-
-HnswIndexBase::LinkArray
-HnswIndexBase::select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const
-{
- if (_cfg.heuristic_select_neighbors()) {
- return select_neighbors_heuristic(neighbors, max_links);
- } else {
- return select_neighbors_simple(neighbors, max_links);
- }
-}
-
-void
-HnswIndexBase::connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level)
-{
- set_link_array(docid, level, neighbors);
- for (uint32_t neighbor_docid : neighbors) {
- auto old_links = get_link_array(neighbor_docid, level);
- LinkArray new_links(old_links.begin(), old_links.end());
- new_links.push_back(docid);
- set_link_array(neighbor_docid, level, new_links);
- }
-}
-
-void
-HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level)
-{
- LinkArray new_links;
- auto old_links = get_link_array(remove_from, level);
- for (uint32_t id : old_links) {
- if (id != remove_id) new_links.push_back(id);
- }
- set_link_array(remove_from, level, new_links);
-}
-
-
-double
-HnswIndexBase::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const
-{
- auto lhs = get_vector(lhs_docid);
- return calc_distance(lhs, rhs_docid);
-}
-
-double
-HnswIndexBase::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const
-{
- auto rhs = get_vector(rhs_docid);
- return _distance_func.calc(lhs, rhs);
-}
-
-HnswCandidate
-HnswIndexBase::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level)
-{
- HnswCandidate nearest = entry_point;
- bool keep_searching = true;
- while (keep_searching) {
- keep_searching = false;
- for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) {
- double dist = calc_distance(input, neighbor_docid);
- if (dist < nearest.distance) {
- nearest = HnswCandidate(neighbor_docid, dist);
- keep_searching = true;
- }
- }
- }
- return nearest;
-}
-
-void
-HnswIndexBase::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level)
-{
- NearestPriQ candidates;
- // TODO: Add proper handling of visited set.
- auto visited = BitVector::create(_node_refs.size());
- for (const auto &entry : best_neighbors.peek()) {
- candidates.push(entry);
- visited->setBit(entry.docid);
- }
- double limit_dist = std::numeric_limits<double>::max();
-
- while (!candidates.empty()) {
- auto cand = candidates.top();
- if (cand.distance > limit_dist) {
- break;
- }
- candidates.pop();
- for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) {
- if (visited->testBit(neighbor_docid)) {
- continue;
- }
- visited->setBit(neighbor_docid);
- double dist_to_input = calc_distance(input, neighbor_docid);
- if (dist_to_input < limit_dist) {
- candidates.emplace(neighbor_docid, dist_to_input);
- best_neighbors.emplace(neighbor_docid, dist_to_input);
- if (best_neighbors.size() > neighbors_to_find) {
- best_neighbors.pop();
- limit_dist = best_neighbors.top().distance;
- }
- }
- }
- }
-}
-
-HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
- RandomLevelGenerator& level_generator, const Config& cfg)
- : _vectors(vectors),
- _distance_func(distance_func),
- _level_generator(level_generator),
- _cfg(cfg),
- _node_refs(),
- _nodes(make_default_node_store_config()),
- _links(make_default_link_store_config()),
- _entry_docid(0), // Note that docid 0 is reserved and never used
- _entry_level(-1)
-{
-}
-
-HnswIndexBase::~HnswIndexBase() = default;
-
-void
-HnswIndexBase::add_document(uint32_t docid)
-{
- auto input = get_vector(docid);
- _node_refs.ensure_size(docid + 1, AtomicEntryRef());
- // A document cannot be added twice.
- assert(!_node_refs[docid].load_acquire().valid());
- int level = make_node_for_document(docid);
- if (_entry_docid == 0) {
- _entry_docid = docid;
- _entry_level = level;
- return;
- }
-
- int search_level = _entry_level;
- double entry_dist = calc_distance(input, _entry_docid);
- HnswCandidate entry_point(_entry_docid, entry_dist);
- while (search_level > level) {
- entry_point = find_nearest_in_layer(input, entry_point, search_level);
- --search_level;
- }
-
- FurthestPriQ best_neighbors;
- best_neighbors.push(entry_point);
- search_level = std::min(level, _entry_level);
-
- // Insert the added document in each level it should exist in.
- while (search_level >= 0) {
- // TODO: Rename to search_level?
- search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level);
- auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level));
- connect_new_node(docid, neighbors, search_level);
- // TODO: Shrink neighbors if needed
- --search_level;
- }
- if (level > _entry_level) {
- _entry_docid = docid;
- _entry_level = level;
- }
-}
-
-void
-HnswIndexBase::remove_document(uint32_t docid)
-{
- bool need_new_entrypoint = (docid == _entry_docid);
- LinkArray empty;
- LevelArrayRef node_levels = get_level_array(docid);
- for (int level = node_levels.size(); level-- > 0; ) {
- LinkArrayRef my_links = get_link_array(docid, level);
- for (uint32_t neighbor_id : my_links) {
- if (need_new_entrypoint) {
- _entry_docid = neighbor_id;
- _entry_level = level;
- need_new_entrypoint = false;
- }
- remove_link_to(neighbor_id, docid, level);
- }
- set_link_array(docid, level, empty);
- }
- if (need_new_entrypoint) {
- _entry_docid = 0;
- _entry_level = -1;
- }
- search::datastore::EntryRef invalid;
- _node_refs[docid].store_release(invalid);
-}
-
-HnswNode
-HnswIndexBase::get_node(uint32_t docid) const
-{
- auto node_ref = _node_refs[docid].load_acquire();
- if (!node_ref.valid()) {
- return HnswNode();
- }
- auto levels = _nodes.get(node_ref);
- HnswNode::LevelArray result;
- for (const auto& links_ref : levels) {
- auto links = _links.get(links_ref.load_acquire());
- HnswNode::LinkArray result_links(links.begin(), links.end());
- std::sort(result_links.begin(), result_links.end());
- result.push_back(result_links);
- }
- return HnswNode(result);
-}
-
-}
-
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
deleted file mode 100644
index 992f3587651..00000000000
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#pragma once
-
-#include "doc_vector_access.h"
-#include "hnsw_index_utils.h"
-#include "hnsw_node.h"
-#include "nearest_neighbor_index.h"
-#include <vespa/eval/tensor/dense/typed_cells.h>
-#include <vespa/searchlib/common/bitvector.h>
-#include <vespa/vespalib/datastore/array_store.h>
-#include <vespa/vespalib/datastore/atomic_entry_ref.h>
-#include <vespa/vespalib/datastore/entryref.h>
-#include <vespa/vespalib/util/rcuvector.h>
-
-namespace search::tensor {
-
-class DistanceFunction;
-class RandomLevelGenerator;
-
-/**
- * Base class for an implementation of a hierarchical navigable small world graph (HNSW)
- * that is used for approximate K-nearest neighbor search.
- *
- * The implementation supports 1 write thread and multiple search threads without the use of mutexes.
- * This is achieved by using data stores that use generation tracking and associated memory management.
- *
- * The implementation is mainly based on the algorithms described in
- * "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs" (Yu. A. Malkov, D. A. Yashunin),
- * but some adjustments are made to support proper removes.
- *
- * TODO: Add details on how to handle removes.
- */
-class HnswIndexBase : public NearestNeighborIndex {
-public:
- class Config {
- private:
- uint32_t _max_links_at_level_0;
- uint32_t _max_links_at_hierarchic_levels;
- uint32_t _neighbors_to_explore_at_construction;
- bool _heuristic_select_neighbors;
-
- public:
- Config(uint32_t max_links_at_level_0_in,
- uint32_t max_links_at_hierarchic_levels_in,
- uint32_t neighbors_to_explore_at_construction_in,
- bool heuristic_select_neighbors_in)
- : _max_links_at_level_0(max_links_at_level_0_in),
- _max_links_at_hierarchic_levels(max_links_at_hierarchic_levels_in),
- _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in),
- _heuristic_select_neighbors(heuristic_select_neighbors_in)
- {}
- uint32_t max_links_at_level_0() const { return _max_links_at_level_0; }
- uint32_t max_links_at_hierarchic_levels() const { return _max_links_at_hierarchic_levels; }
- uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; }
- bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; }
- };
-
-protected:
- using AtomicEntryRef = search::datastore::AtomicEntryRef;
-
- // This uses 10 bits for buffer id -> 1024 buffers.
- // As we have very short arrays we get less fragmentation with fewer and larger buffers.
- using EntryRefType = search::datastore::EntryRefT<22>;
-
- // Provides mapping from document id -> node reference.
- // The reference is used to lookup the node data in NodeStore.
- using NodeRefVector = vespalib::RcuVector<AtomicEntryRef>;
-
- // This stores the level arrays for all nodes.
- // Each node consists of an array of levels (from level 0 to n) where each entry is a reference to the link array at that level.
- using NodeStore = search::datastore::ArrayStore<AtomicEntryRef, EntryRefType>;
- using LevelArrayRef = NodeStore::ConstArrayRef;
- using LevelArray = vespalib::Array<AtomicEntryRef>;
-
- // This stores all link arrays.
- // A link array consists of the document ids of the nodes a particular node is linked to.
- using LinkStore = search::datastore::ArrayStore<uint32_t, EntryRefType>;
- using LinkArrayRef = LinkStore::ConstArrayRef;
- using LinkArray = vespalib::Array<uint32_t>;
-
- using TypedCells = vespalib::tensor::TypedCells;
-
- const DocVectorAccess& _vectors;
- const DistanceFunction& _distance_func;
- RandomLevelGenerator& _level_generator;
- Config _cfg;
- NodeRefVector _node_refs;
- NodeStore _nodes;
- LinkStore _links;
- uint32_t _entry_docid;
- int _entry_level;
-
- static search::datastore::ArrayStoreConfig make_default_node_store_config();
- static search::datastore::ArrayStoreConfig make_default_link_store_config();
-
- uint32_t max_links_for_level(uint32_t level) const;
- uint32_t make_node_for_document(uint32_t docid);
- LevelArrayRef get_level_array(uint32_t docid) const;
- LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const;
- void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links);
-
- /**
- * Returns true if the distance between the candidate and a node in the current result
- * is less than the distance between the candidate and the node we want to add to the graph.
- * In this case the candidate should be discarded as we already are connected to the space
- * where the candidate is located.
- * Used by select_neighbors_heuristic().
- */
- bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const;
- LinkArray select_neighbors_heuristic(const HnswCandidateVector& neighbors, uint32_t max_links) const;
- LinkArray select_neighbors_simple(const HnswCandidateVector& neighbors, uint32_t max_links) const;
- LinkArray select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const;
- void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level);
- void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level);
-
- inline TypedCells get_vector(uint32_t docid) const {
- return _vectors.get_vector(docid);
- }
-
- double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const;
- double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const;
-
- /**
- * Performs a greedy search in the given layer to find the candidate that is nearest the input vector.
- */
- HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level);
- void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level);
-
-public:
- HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
- RandomLevelGenerator& level_generator, const Config& cfg);
- ~HnswIndexBase() override;
-
- void add_document(uint32_t docid) override;
- void remove_document(uint32_t docid) override;
-
- // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists)
-
- uint32_t get_entry_docid() const { return _entry_docid; }
- uint32_t get_entry_level() const { return _entry_level; }
-
- // Should only be used by unit tests.
- HnswNode get_node(uint32_t docid) const;
-
- // TODO: Implement set_node() as well for use in unit tests.
-};
-
-}
-