diff options
author | Arne Juul <arnej@verizonmedia.com> | 2020-03-16 07:04:03 +0000 |
---|---|---|
committer | Arne Juul <arnej@verizonmedia.com> | 2020-03-16 14:43:41 +0000 |
commit | f1276559667e86787fab42a85617fc16a6a25593 (patch) | |
tree | 7316fac12ae2f9a0b09903b6527e79f203bf8ac1 /searchlib | |
parent | 94d58d03e282ef1d5afd8db8b0a451ad24806c51 (diff) |
better remove strategy
* compute pairwise distance between neighbors of a removed
point, and add links starting with closest pair.
Diffstat (limited to 'searchlib')
-rw-r--r-- | searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp | 46 | ||||
-rw-r--r-- | searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 3 |
2 files changed, 47 insertions, 2 deletions
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 09608a9abbe..c6c2c2961ee 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -28,6 +28,18 @@ bool has_link_to(vespalib::ConstArrayRef<uint32_t> links, uint32_t id) { return false; } +struct PairDist { + uint32_t id_first; + uint32_t id_second; + double distance; + PairDist(uint32_t i1, uint32_t i2, double d) + : id_first(i1), id_second(i2), distance(d) + {} +}; +bool operator< (const PairDist &a, const PairDist &b) { + return (a.distance < b.distance); +} + } search::datastore::ArrayStoreConfig @@ -99,7 +111,7 @@ HnswIndex::set_link_array(uint32_t docid, uint32_t level, const LinkArrayRef& li } bool -HnswIndex::have_closer_distance(HnswCandidate candidate, const LinkArray& result) const +HnswIndex::have_closer_distance(HnswCandidate candidate, const LinkArrayRef& result) const { for (uint32_t result_docid : result) { double dist = calc_distance(candidate.docid, result_docid); @@ -335,6 +347,37 @@ HnswIndex::add_document(uint32_t docid) } void +HnswIndex::mutual_reconnect(const LinkArrayRef &cluster, uint32_t level) +{ + std::vector<PairDist> pairs; + for (uint32_t i = 0; i + 1 < cluster.size(); ++i) { + uint32_t n_id_1 = cluster[i]; + LinkArrayRef n_list_1 = get_link_array(n_id_1, level); + for (uint32_t j = i + 1; j < cluster.size(); ++j) { + uint32_t n_id_2 = cluster[j]; + if (has_link_to(n_list_1, n_id_2)) continue; + pairs.emplace_back(n_id_1, n_id_2, calc_distance(n_id_1, n_id_2)); + } + } + std::sort(pairs.begin(), pairs.end()); + for (const PairDist & pair : pairs) { + LinkArrayRef old_links_1 = get_link_array(pair.id_first, level); + if (old_links_1.size() >= _cfg.max_links_on_inserts()) continue; + + LinkArrayRef old_links_2 = get_link_array(pair.id_second, level); + if (old_links_2.size() >= _cfg.max_links_on_inserts()) continue; + + LinkArray new_links_1(old_links_1.begin(), old_links_1.end()); + new_links_1.push_back(pair.id_second); + set_link_array(pair.id_first, level, new_links_1); + + LinkArray new_links_2(old_links_2.begin(), old_links_2.end()); + new_links_2.push_back(pair.id_first); + set_link_array(pair.id_second, level, new_links_2); + } +} + +void HnswIndex::remove_document(uint32_t docid) { bool need_new_entrypoint = (docid == _entry_docid); @@ -350,6 +393,7 @@ HnswIndex::remove_document(uint32_t docid) } remove_link_to(neighbor_id, docid, level); } + mutual_reconnect(my_links, level); set_link_array(docid, level, empty); } if (need_new_entrypoint) { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 4066316a991..4bf3c917ebd 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -109,7 +109,7 @@ protected: * where the candidate is located. * Used by select_neighbors_heuristic(). */ - bool have_closer_distance(HnswCandidate candidate, const LinkArray& curr_result) const; + bool have_closer_distance(HnswCandidate candidate, const LinkArrayRef& curr_result) const; struct SelectResult { LinkArray used; LinkArray unused; @@ -119,6 +119,7 @@ protected: SelectResult select_neighbors(const HnswCandidateVector& neighbors, uint32_t max_links) const; void shrink_if_needed(uint32_t docid, uint32_t level); void connect_new_node(uint32_t docid, const LinkArrayRef &neighbors, uint32_t level); + void mutual_reconnect(const LinkArrayRef &cluster, uint32_t level); void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); inline TypedCells get_vector(uint32_t docid) const { |