summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorArne Juul <arnej@verizonmedia.com>2020-03-16 07:04:03 +0000
committerArne Juul <arnej@verizonmedia.com>2020-03-16 14:43:41 +0000
commitf1276559667e86787fab42a85617fc16a6a25593 (patch)
tree7316fac12ae2f9a0b09903b6527e79f203bf8ac1 /searchlib
parent94d58d03e282ef1d5afd8db8b0a451ad24806c51 (diff)
better remove strategy
* compute pairwise distance between neighbors of a removed point, and add links starting with closest pair.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp46
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h3
2 files changed, 47 insertions, 2 deletions
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 09608a9abbe..c6c2c2961ee 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -28,6 +28,18 @@ bool has_link_to(vespalib::ConstArrayRef<uint32_t> links, uint32_t id) {
return false;
}
+struct PairDist {
+ uint32_t id_first;
+ uint32_t id_second;
+ double distance;
+ PairDist(uint32_t i1, uint32_t i2, double d)
+ : id_first(i1), id_second(i2), distance(d)
+ {}
+};
+bool operator< (const PairDist &a, const PairDist &b) {
+ return (a.distance < b.distance);
+}
+
}
search::datastore::ArrayStoreConfig
@@ -99,7 +111,7 @@ HnswIndex::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& li
}
bool
-HnswIndex::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const
+HnswIndex::have_closer_distance(HnswCandidate candidate, const LinkArrayRef& result) const
{
for (uint32_t result_docid : result) {
double dist = calc_distance(candidate.docid, result_docid);
@@ -335,6 +347,37 @@ HnswIndex::add_document(uint32_t docid)
}
void
+HnswIndex::mutual_reconnect(const LinkArrayRef &cluster, uint32_t level)
+{
+ std::vector<PairDist> pairs;
+ for (uint32_t i = 0; i + 1 < cluster.size(); ++i) {
+ uint32_t n_id_1 = cluster[i];
+ LinkArrayRef n_list_1 = get_link_array(n_id_1, level);
+ for (uint32_t j = i + 1; j < cluster.size(); ++j) {
+ uint32_t n_id_2 = cluster[j];
+ if (has_link_to(n_list_1, n_id_2)) continue;
+ pairs.emplace_back(n_id_1, n_id_2, calc_distance(n_id_1, n_id_2));
+ }
+ }
+ std::sort(pairs.begin(), pairs.end());
+ for (const PairDist & pair : pairs) {
+ LinkArrayRef old_links_1 = get_link_array(pair.id_first, level);
+ if (old_links_1.size() >= _cfg.max_links_on_inserts()) continue;
+
+ LinkArrayRef old_links_2 = get_link_array(pair.id_second, level);
+ if (old_links_2.size() >= _cfg.max_links_on_inserts()) continue;
+
+ LinkArray new_links_1(old_links_1.begin(), old_links_1.end());
+ new_links_1.push_back(pair.id_second);
+ set_link_array(pair.id_first, level, new_links_1);
+
+ LinkArray new_links_2(old_links_2.begin(), old_links_2.end());
+ new_links_2.push_back(pair.id_first);
+ set_link_array(pair.id_second, level, new_links_2);
+ }
+}
+
+void
HnswIndex::remove_document(uint32_t docid)
{
bool need_new_entrypoint = (docid == _entry_docid);
@@ -350,6 +393,7 @@ HnswIndex::remove_document(uint32_t docid)
}
remove_link_to(neighbor_id, docid, level);
}
+ mutual_reconnect(my_links, level);
set_link_array(docid, level, empty);
}
if (need_new_entrypoint) {
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index 4066316a991..4bf3c917ebd 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -109,7 +109,7 @@ protected:
* where the candidate is located.
* Used by select_neighbors_heuristic().
*/
- bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const;
+ bool have_closer_distance(HnswCandidate candidate, const LinkArrayRef& curr_result) const;
struct SelectResult {
LinkArray used;
LinkArray unused;
@@ -119,6 +119,7 @@ protected:
SelectResult select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const;
void shrink_if_needed(uint32_t docid, uint32_t level);
void connect_new_node(uint32_t docid, const LinkArrayRef &neighbors, uint32_t level);
+ void mutual_reconnect(const LinkArrayRef &cluster, uint32_t level);
void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level);
inline TypedCells get_vector(uint32_t docid) const {