From 0e955fe9420b1b89d6fccbb77ebf0a698d922e8c Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Mon, 29 Apr 2024 14:35:21 +0200 Subject: Check consistency after loading tensor attribute with index. Don't reconnect hnsw graph node without vector. --- .../tensorattribute/tensorattribute_test.cpp | 4 +++ .../tests/tensor/hnsw_index/hnsw_index_test.cpp | 30 +++++++++++++++++++++ .../src/vespa/searchlib/tensor/hnsw_index.cpp | 31 +++++++++++++++++++++- searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 3 +++ .../searchlib/tensor/nearest_neighbor_index.h | 3 +++ .../searchlib/tensor/tensor_attribute_loader.cpp | 14 ++++++++++ .../searchlib/tensor/tensor_attribute_loader.h | 1 + 7 files changed, 85 insertions(+), 1 deletion(-) diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index b1b2235165f..825c81de107 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -329,6 +329,10 @@ public: static search::tensor::DistanceFunctionFactory::UP my_dist_fun = search::tensor::make_distance_function_factory(search::attribute::DistanceMetric::Euclidean, vespalib::eval::CellType::DOUBLE); return *my_dist_fun; } + + uint32_t check_consistency(uint32_t) noexcept override { + return 0; + } }; class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory { diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index d50677314df..a1cf86c95cc 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -936,6 +936,36 @@ TYPED_TEST(HnswIndexTest, search_during_remove) this->expect_top_3_by_docid("{0, 0}", {0, 0}, {7}); } +TYPED_TEST(HnswIndexTest, inconsistent_index) +{ + this->init(false); + this->vectors.clear(); + this->vectors.set(1, {1, 3}).set(2, {7, 1}).set(3, {6, 5}).set(4, {8, 3}).set(5, {10, 3}); + this->add_document(1); + this->add_document(2); + this->add_document(3); + this->add_document(4); + this->add_document(5); + this->expect_entry_point(1, 0); + this->expect_level_0(1, {2, 3}); + this->expect_level_0(2, {1, 3, 4, 5}); + this->expect_level_0(3, {1, 2, 4}); + this->expect_level_0(4, {2, 3, 5}); + this->expect_level_0(5, {2, 4}); + EXPECT_EQ(0, this->index->check_consistency(6)); + // Remove vector for docid 5 but don't update index. + this->vectors.clear(5); + EXPECT_EQ(1, this->index->check_consistency(6)); + /* + * Removing document 2 causes mutual reconnect for nodes [1, 3, 4, 5] + * where nodes 1 and 5 are not previously connected. Distance from + * node 1 to node 5 cannot be calculated due to missing vector. + */ + this->remove_document(2); + // No reconnect for node without vector + this->expect_level_0(5, {4}); +} + using HnswMultiIndexTest = HnswIndexTest>; namespace { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 322965ca06a..8cfa10d918b 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -679,7 +679,10 @@ HnswIndex::mutual_reconnect(const LinkArrayRef &cluster, uint32_t level) for (uint32_t j = i + 1; j < cluster.size(); ++j) { uint32_t n_id_2 = cluster[j]; if ( ! has_link_to(n_list_1, n_id_2)) { - pairs.emplace_back(n_id_1, n_id_2, calc_distance(*df, n_id_2)); + auto n_cells_2 = get_vector(n_id_2); + if (!n_cells_2.non_existing_attribute_value()) { + pairs.emplace_back(n_id_1, n_id_2, df->calc(n_cells_2)); + } } } } @@ -1120,6 +1123,32 @@ HnswIndex::count_reachable_nodes() const return {found_cnt, true}; } +template +uint32_t +HnswIndex::get_subspaces(uint32_t docid) noexcept +{ + if (type == HnswIndexType::SINGLE) { + return (docid < _graph.nodes.size() && _graph.nodes[docid].levels_ref().load_relaxed().valid()) ? 1 : 0; + } else { + return _id_mapping.get_ids(docid).size(); + } +} + +template +uint32_t +HnswIndex::check_consistency(uint32_t docid_limit) noexcept +{ + uint32_t inconsistencies = 0; + for (uint32_t docid = 1; docid < docid_limit; ++docid) { + auto index_subspaces = get_subspaces(docid); + auto store_subspaces = get_vectors(docid).subspaces(); + if (index_subspaces != store_subspaces) { + ++inconsistencies; + } + } + return inconsistencies; +} + template class HnswIndex; template class HnswIndex; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 616140f426f..757f4d00666 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -193,6 +193,7 @@ protected: LinkArray filter_valid_nodeids(uint32_t level, const internal::PreparedAddNode::Links &neighbors, uint32_t self_nodeid); void internal_complete_add(uint32_t docid, internal::PreparedAddDoc &op); void internal_complete_add_node(uint32_t nodeid, uint32_t docid, uint32_t subspace, internal::PreparedAddNode &prepared_node); + uint32_t get_subspaces(uint32_t docid) noexcept; public: HnswIndex(const DocVectorAccess& vectors, DistanceFunctionFactory::UP distance_ff, RandomLevelGenerator::UP level_generator, const HnswIndexConfig& cfg); @@ -248,6 +249,8 @@ public: uint32_t get_active_nodes() const noexcept { return _graph.get_active_nodes(); } + uint32_t check_consistency(uint32_t docid_limit) noexcept override; + // Should only be used by unit tests. HnswTestNode get_node(uint32_t nodeid) const; void set_node(uint32_t nodeid, const HnswTestNode &node); diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h index 8462ff05eca..a4309c366b1 100644 --- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h @@ -114,6 +114,9 @@ public: double distance_threshold) const = 0; virtual DistanceFunctionFactory &distance_function_factory() const = 0; + + // Used when checking consistency during load + virtual uint32_t check_consistency(uint32_t docid_limit) noexcept = 0; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp index 28c4099c38b..223c9d7d1f2 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp @@ -322,6 +322,9 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor) if (!load_index()) { return false; } + if (dense_store == nullptr) { + check_consistency(docid_limit); + } } else { build_index(executor, docid_limit); } @@ -329,4 +332,15 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor) return true; } +void +TensorAttributeLoader::check_consistency(uint32_t docid_limit) +{ + auto before = vespalib::steady_clock::now(); + uint32_t inconsistencies = _index->check_consistency(docid_limit); + auto after = vespalib::steady_clock::now(); + double elapsed = vespalib::to_s(after - before); + LOG(info, "%u inconsistencies detected after loading index for attribute %s, (check used %6.3fs)", + inconsistencies, _attr.getName().c_str(), elapsed); +} + } diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h index 6bf68957adc..59baaf0b6dc 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h @@ -34,6 +34,7 @@ class TensorAttributeLoader { void load_tensor_store(search::attribute::BlobSequenceReader& reader, uint32_t docid_limit); void build_index(vespalib::Executor* executor, uint32_t docid_limit); bool load_index(); + void check_consistency(uint32_t docid_limit); public: TensorAttributeLoader(TensorAttribute& attr, GenerationHandler& generation_handler, RefVector& ref_vector, TensorStore& store, NearestNeighborIndex* index); -- cgit v1.2.3 From f17741c6567a4e7d2def99bbefe423574d5e2df2 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Mon, 29 Apr 2024 19:55:03 +0200 Subject: Add const specifiers for HnswIndex member functions get_subspaces() and check_consistency(). --- .../src/tests/attribute/tensorattribute/tensorattribute_test.cpp | 2 +- searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp | 8 ++++---- searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 7 +++++-- searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h | 7 +++++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index 825c81de107..089f5e2e239 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -330,7 +330,7 @@ public: return *my_dist_fun; } - uint32_t check_consistency(uint32_t) noexcept override { + uint32_t check_consistency(uint32_t) const noexcept override { return 0; } }; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 8cfa10d918b..b542c422f50 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -1125,10 +1125,10 @@ HnswIndex::count_reachable_nodes() const template uint32_t -HnswIndex::get_subspaces(uint32_t docid) noexcept +HnswIndex::get_subspaces(uint32_t docid) const noexcept { - if (type == HnswIndexType::SINGLE) { - return (docid < _graph.nodes.size() && _graph.nodes[docid].levels_ref().load_relaxed().valid()) ? 1 : 0; + if constexpr (type == HnswIndexType::SINGLE) { + return (docid < _graph.nodes.get_size() && _graph.nodes.get_elem_ref(docid).levels_ref().load_relaxed().valid()) ? 1 : 0; } else { return _id_mapping.get_ids(docid).size(); } @@ -1136,7 +1136,7 @@ HnswIndex::get_subspaces(uint32_t docid) noexcept template uint32_t -HnswIndex::check_consistency(uint32_t docid_limit) noexcept +HnswIndex::check_consistency(uint32_t docid_limit) const noexcept { uint32_t inconsistencies = 0; for (uint32_t docid = 1; docid < docid_limit; ++docid) { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 757f4d00666..4d4440c1bcb 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -193,7 +193,9 @@ protected: LinkArray filter_valid_nodeids(uint32_t level, const internal::PreparedAddNode::Links &neighbors, uint32_t self_nodeid); void internal_complete_add(uint32_t docid, internal::PreparedAddDoc &op); void internal_complete_add_node(uint32_t nodeid, uint32_t docid, uint32_t subspace, internal::PreparedAddNode &prepared_node); - uint32_t get_subspaces(uint32_t docid) noexcept; + + // Called from writer only. + uint32_t get_subspaces(uint32_t docid) const noexcept; public: HnswIndex(const DocVectorAccess& vectors, DistanceFunctionFactory::UP distance_ff, RandomLevelGenerator::UP level_generator, const HnswIndexConfig& cfg); @@ -249,7 +251,8 @@ public: uint32_t get_active_nodes() const noexcept { return _graph.get_active_nodes(); } - uint32_t check_consistency(uint32_t docid_limit) noexcept override; + // Called from writer only. + uint32_t check_consistency(uint32_t docid_limit) const noexcept override; // Should only be used by unit tests. HnswTestNode get_node(uint32_t nodeid) const; diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h index a4309c366b1..c2bbd17ce63 100644 --- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h @@ -115,8 +115,11 @@ public: virtual DistanceFunctionFactory &distance_function_factory() const = 0; - // Used when checking consistency during load - virtual uint32_t check_consistency(uint32_t docid_limit) noexcept = 0; + /* + * Used when checking consistency during load. + * Called from writer only. + */ + virtual uint32_t check_consistency(uint32_t docid_limit) const noexcept = 0; }; } -- cgit v1.2.3