summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGeir Storli <geirst@verizonmedia.com>2020-02-10 12:19:40 +0000
committerGeir Storli <geirst@verizonmedia.com>2020-02-10 12:19:40 +0000
commit1debbb54c26ce0c146a1d0711beea55adef3173b (patch)
tree34e4c34d713762bb715696136423932463230435
parent8164e4a5c4910a878e49663730de5c2b310d5940 (diff)
Move code to the hnsw index base class as we no longer depend on the float type.
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp142
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h18
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp138
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h23
4 files changed, 158 insertions, 163 deletions
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index cb617246066..9ad578ae983 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -2,83 +2,10 @@
#include "distance_function.h"
#include "hnsw_index.h"
-#include <vespa/vespalib/util/rcuvector.hpp>
namespace search::tensor {
template <typename FloatType>
-double
-HnswIndex<FloatType>::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const
-{
- auto lhs = get_vector(lhs_docid);
- return calc_distance(lhs, rhs_docid);
-}
-
-template <typename FloatType>
-double
-HnswIndex<FloatType>::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const
-{
- auto rhs = get_vector(rhs_docid);
- return _distance_func.calc(lhs, rhs);
-}
-
-template <typename FloatType>
-HnswCandidate
-HnswIndex<FloatType>::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level)
-{
- HnswCandidate nearest = entry_point;
- bool keep_searching = true;
- while (keep_searching) {
- keep_searching = false;
- for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) {
- double dist = calc_distance(input, neighbor_docid);
- if (dist < nearest.distance) {
- nearest = HnswCandidate(neighbor_docid, dist);
- keep_searching = true;
- }
- }
- }
- return nearest;
-}
-
-template <typename FloatType>
-void
-HnswIndex<FloatType>::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level)
-{
- NearestPriQ candidates;
- // TODO: Add proper handling of visited set.
- auto visited = BitVector::create(_node_refs.size());
- for (const auto &entry : best_neighbors.peek()) {
- candidates.push(entry);
- visited->setBit(entry.docid);
- }
- double limit_dist = std::numeric_limits<double>::max();
-
- while (!candidates.empty()) {
- auto cand = candidates.top();
- if (cand.distance > limit_dist) {
- break;
- }
- candidates.pop();
- for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) {
- if (visited->testBit(neighbor_docid)) {
- continue;
- }
- visited->setBit(neighbor_docid);
- double dist_to_input = calc_distance(input, neighbor_docid);
- if (dist_to_input < limit_dist) {
- candidates.emplace(neighbor_docid, dist_to_input);
- best_neighbors.emplace(neighbor_docid, dist_to_input);
- if (best_neighbors.size() > neighbors_to_find) {
- best_neighbors.pop();
- limit_dist = best_neighbors.top().distance;
- }
- }
- }
- }
-}
-
-template <typename FloatType>
HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
RandomLevelGenerator& level_generator, const Config& cfg)
: HnswIndexBase(vectors, distance_func, level_generator, cfg)
@@ -88,74 +15,5 @@ HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, const DistanceFu
template <typename FloatType>
HnswIndex<FloatType>::~HnswIndex() = default;
-template <typename FloatType>
-void
-HnswIndex<FloatType>::add_document(uint32_t docid)
-{
- auto input = get_vector(docid);
- _node_refs.ensure_size(docid + 1, AtomicEntryRef());
- // A document cannot be added twice.
- assert(!_node_refs[docid].load_acquire().valid());
- int level = make_node_for_document(docid);
- if (_entry_docid == 0) {
- _entry_docid = docid;
- _entry_level = level;
- return;
- }
-
- int search_level = _entry_level;
- double entry_dist = calc_distance(input, _entry_docid);
- HnswCandidate entry_point(_entry_docid, entry_dist);
- while (search_level > level) {
- entry_point = find_nearest_in_layer(input, entry_point, search_level);
- --search_level;
- }
-
- FurthestPriQ best_neighbors;
- best_neighbors.push(entry_point);
- search_level = std::min(level, _entry_level);
-
- // Insert the added document in each level it should exist in.
- while (search_level >= 0) {
- // TODO: Rename to search_level?
- search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level);
- auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level));
- connect_new_node(docid, neighbors, search_level);
- // TODO: Shrink neighbors if needed
- --search_level;
- }
- if (level > _entry_level) {
- _entry_docid = docid;
- _entry_level = level;
- }
-}
-
-template <typename FloatType>
-void
-HnswIndex<FloatType>::remove_document(uint32_t docid)
-{
- bool need_new_entrypoint = (docid == _entry_docid);
- LinkArray empty;
- LevelArrayRef node_levels = get_level_array(docid);
- for (int level = node_levels.size(); level-- > 0; ) {
- LinkArrayRef my_links = get_link_array(docid, level);
- for (uint32_t neighbor_id : my_links) {
- if (need_new_entrypoint) {
- _entry_docid = neighbor_id;
- _entry_level = level;
- need_new_entrypoint = false;
- }
- remove_link_to(neighbor_id, docid, level);
- }
- set_link_array(docid, level, empty);
- }
- if (need_new_entrypoint) {
- _entry_docid = 0;
- _entry_level = -1;
- }
- search::datastore::EntryRef invalid;
- _node_refs[docid].store_release(invalid);
-}
-
}
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index a93062cf200..2b0761d17ea 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -25,28 +25,10 @@ class RandomLevelGenerator;
*/
template <typename FloatType = float>
class HnswIndex : public HnswIndexBase {
-private:
- using TypedCells = vespalib::tensor::TypedCells;
-
- inline TypedCells get_vector(uint32_t docid) const {
- return _vectors.get_vector(docid);
- }
-
- double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override;
- double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const;
- /**
- * Performs a greedy search in the given layer to find the candidate that is nearest the input vector.
- */
- HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level);
- void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level);
-
public:
HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
RandomLevelGenerator& level_generator, const Config& cfg);
~HnswIndex() override;
-
- void add_document(uint32_t docid) override;
- void remove_document(uint32_t docid) override;
};
template class HnswIndex<float>;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
index e0ecdf87399..a58dba70746 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp
@@ -1,7 +1,9 @@
// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "distance_function.h"
#include "hnsw_index_base.h"
#include "random_level_generator.h"
+#include <vespa/eval/tensor/dense/typed_cells.h>
#include <vespa/vespalib/datastore/array_store.hpp>
#include <vespa/vespalib/util/rcuvector.hpp>
@@ -156,6 +158,75 @@ HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t
set_link_array(remove_from, level, new_links);
}
+
+double
+HnswIndexBase::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const
+{
+ auto lhs = get_vector(lhs_docid);
+ return calc_distance(lhs, rhs_docid);
+}
+
+double
+HnswIndexBase::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const
+{
+ auto rhs = get_vector(rhs_docid);
+ return _distance_func.calc(lhs, rhs);
+}
+
+HnswCandidate
+HnswIndexBase::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level)
+{
+ HnswCandidate nearest = entry_point;
+ bool keep_searching = true;
+ while (keep_searching) {
+ keep_searching = false;
+ for (uint32_t neighbor_docid : get_link_array(nearest.docid, level)) {
+ double dist = calc_distance(input, neighbor_docid);
+ if (dist < nearest.distance) {
+ nearest = HnswCandidate(neighbor_docid, dist);
+ keep_searching = true;
+ }
+ }
+ }
+ return nearest;
+}
+
+void
+HnswIndexBase::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level)
+{
+ NearestPriQ candidates;
+ // TODO: Add proper handling of visited set.
+ auto visited = BitVector::create(_node_refs.size());
+ for (const auto &entry : best_neighbors.peek()) {
+ candidates.push(entry);
+ visited->setBit(entry.docid);
+ }
+ double limit_dist = std::numeric_limits<double>::max();
+
+ while (!candidates.empty()) {
+ auto cand = candidates.top();
+ if (cand.distance > limit_dist) {
+ break;
+ }
+ candidates.pop();
+ for (uint32_t neighbor_docid : get_link_array(cand.docid, level)) {
+ if (visited->testBit(neighbor_docid)) {
+ continue;
+ }
+ visited->setBit(neighbor_docid);
+ double dist_to_input = calc_distance(input, neighbor_docid);
+ if (dist_to_input < limit_dist) {
+ candidates.emplace(neighbor_docid, dist_to_input);
+ best_neighbors.emplace(neighbor_docid, dist_to_input);
+ if (best_neighbors.size() > neighbors_to_find) {
+ best_neighbors.pop();
+ limit_dist = best_neighbors.top().distance;
+ }
+ }
+ }
+ }
+}
+
HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
RandomLevelGenerator& level_generator, const Config& cfg)
: _vectors(vectors),
@@ -172,6 +243,73 @@ HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunct
HnswIndexBase::~HnswIndexBase() = default;
+void
+HnswIndexBase::add_document(uint32_t docid)
+{
+ auto input = get_vector(docid);
+ _node_refs.ensure_size(docid + 1, AtomicEntryRef());
+ // A document cannot be added twice.
+ assert(!_node_refs[docid].load_acquire().valid());
+ int level = make_node_for_document(docid);
+ if (_entry_docid == 0) {
+ _entry_docid = docid;
+ _entry_level = level;
+ return;
+ }
+
+ int search_level = _entry_level;
+ double entry_dist = calc_distance(input, _entry_docid);
+ HnswCandidate entry_point(_entry_docid, entry_dist);
+ while (search_level > level) {
+ entry_point = find_nearest_in_layer(input, entry_point, search_level);
+ --search_level;
+ }
+
+ FurthestPriQ best_neighbors;
+ best_neighbors.push(entry_point);
+ search_level = std::min(level, _entry_level);
+
+ // Insert the added document in each level it should exist in.
+ while (search_level >= 0) {
+ // TODO: Rename to search_level?
+ search_layer(input, _cfg.neighbors_to_explore_at_construction(), best_neighbors, search_level);
+ auto neighbors = select_neighbors(best_neighbors.peek(), max_links_for_level(search_level));
+ connect_new_node(docid, neighbors, search_level);
+ // TODO: Shrink neighbors if needed
+ --search_level;
+ }
+ if (level > _entry_level) {
+ _entry_docid = docid;
+ _entry_level = level;
+ }
+}
+
+void
+HnswIndexBase::remove_document(uint32_t docid)
+{
+ bool need_new_entrypoint = (docid == _entry_docid);
+ LinkArray empty;
+ LevelArrayRef node_levels = get_level_array(docid);
+ for (int level = node_levels.size(); level-- > 0; ) {
+ LinkArrayRef my_links = get_link_array(docid, level);
+ for (uint32_t neighbor_id : my_links) {
+ if (need_new_entrypoint) {
+ _entry_docid = neighbor_id;
+ _entry_level = level;
+ need_new_entrypoint = false;
+ }
+ remove_link_to(neighbor_id, docid, level);
+ }
+ set_link_array(docid, level, empty);
+ }
+ if (need_new_entrypoint) {
+ _entry_docid = 0;
+ _entry_level = -1;
+ }
+ search::datastore::EntryRef invalid;
+ _node_refs[docid].store_release(invalid);
+}
+
HnswNode
HnswIndexBase::get_node(uint32_t docid) const
{
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
index ab59ad02f14..992f3587651 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h
@@ -2,9 +2,11 @@
#pragma once
+#include "doc_vector_access.h"
#include "hnsw_index_utils.h"
#include "hnsw_node.h"
#include "nearest_neighbor_index.h"
+#include <vespa/eval/tensor/dense/typed_cells.h>
#include <vespa/searchlib/common/bitvector.h>
#include <vespa/vespalib/datastore/array_store.h>
#include <vespa/vespalib/datastore/atomic_entry_ref.h>
@@ -14,7 +16,6 @@
namespace search::tensor {
class DistanceFunction;
-class DocVectorAccess;
class RandomLevelGenerator;
/**
@@ -78,6 +79,8 @@ protected:
using LinkArrayRef = LinkStore::ConstArrayRef;
using LinkArray = vespalib::Array<uint32_t>;
+ using TypedCells = vespalib::tensor::TypedCells;
+
const DocVectorAccess& _vectors;
const DistanceFunction& _distance_func;
RandomLevelGenerator& _level_generator;
@@ -97,8 +100,6 @@ protected:
LinkArrayRef get_link_array(uint32_t docid, uint32_t level) const;
void set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& links);
- virtual double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const = 0;
-
/**
* Returns true if the distance between the candidate and a node in the current result
* is less than the distance between the candidate and the node we want to add to the graph.
@@ -113,11 +114,27 @@ protected:
void connect_new_node(uint32_t docid, const LinkArray& neighbors, uint32_t level);
void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level);
+ inline TypedCells get_vector(uint32_t docid) const {
+ return _vectors.get_vector(docid);
+ }
+
+ double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const;
+ double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const;
+
+ /**
+ * Performs a greedy search in the given layer to find the candidate that is nearest the input vector.
+ */
+ HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level);
+ void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level);
+
public:
HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func,
RandomLevelGenerator& level_generator, const Config& cfg);
~HnswIndexBase() override;
+ void add_document(uint32_t docid) override;
+ void remove_document(uint32_t docid) override;
+
// TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists)
uint32_t get_entry_docid() const { return _entry_docid; }