diff options
Diffstat (limited to 'searchlib')
7 files changed, 91 insertions, 1 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index b1b2235165f..089f5e2e239 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -329,6 +329,10 @@ public: static search::tensor::DistanceFunctionFactory::UP my_dist_fun = search::tensor::make_distance_function_factory(search::attribute::DistanceMetric::Euclidean, vespalib::eval::CellType::DOUBLE); return *my_dist_fun; } + + uint32_t check_consistency(uint32_t) const noexcept override { + return 0; + } }; class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory { diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index d50677314df..a1cf86c95cc 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -936,6 +936,36 @@ TYPED_TEST(HnswIndexTest, search_during_remove) this->expect_top_3_by_docid("{0, 0}", {0, 0}, {7}); } +TYPED_TEST(HnswIndexTest, inconsistent_index) +{ + this->init(false); + this->vectors.clear(); + this->vectors.set(1, {1, 3}).set(2, {7, 1}).set(3, {6, 5}).set(4, {8, 3}).set(5, {10, 3}); + this->add_document(1); + this->add_document(2); + this->add_document(3); + this->add_document(4); + this->add_document(5); + this->expect_entry_point(1, 0); + this->expect_level_0(1, {2, 3}); + this->expect_level_0(2, {1, 3, 4, 5}); + this->expect_level_0(3, {1, 2, 4}); + this->expect_level_0(4, {2, 3, 5}); + this->expect_level_0(5, {2, 4}); + EXPECT_EQ(0, this->index->check_consistency(6)); + // Remove vector for docid 5 but don't update index. + this->vectors.clear(5); + EXPECT_EQ(1, this->index->check_consistency(6)); + /* + * Removing document 2 causes mutual reconnect for nodes [1, 3, 4, 5] + * where nodes 1 and 5 are not previously connected. Distance from + * node 1 to node 5 cannot be calculated due to missing vector. + */ + this->remove_document(2); + // No reconnect for node without vector + this->expect_level_0(5, {4}); +} + using HnswMultiIndexTest = HnswIndexTest<HnswIndex<HnswIndexType::MULTI>>; namespace { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 322965ca06a..b542c422f50 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -679,7 +679,10 @@ HnswIndex<type>::mutual_reconnect(const LinkArrayRef &cluster, uint32_t level) for (uint32_t j = i + 1; j < cluster.size(); ++j) { uint32_t n_id_2 = cluster[j]; if ( ! has_link_to(n_list_1, n_id_2)) { - pairs.emplace_back(n_id_1, n_id_2, calc_distance(*df, n_id_2)); + auto n_cells_2 = get_vector(n_id_2); + if (!n_cells_2.non_existing_attribute_value()) { + pairs.emplace_back(n_id_1, n_id_2, df->calc(n_cells_2)); + } } } } @@ -1120,6 +1123,32 @@ HnswIndex<type>::count_reachable_nodes() const return {found_cnt, true}; } +template <HnswIndexType type> +uint32_t +HnswIndex<type>::get_subspaces(uint32_t docid) const noexcept +{ + if constexpr (type == HnswIndexType::SINGLE) { + return (docid < _graph.nodes.get_size() && _graph.nodes.get_elem_ref(docid).levels_ref().load_relaxed().valid()) ? 1 : 0; + } else { + return _id_mapping.get_ids(docid).size(); + } +} + +template <HnswIndexType type> +uint32_t +HnswIndex<type>::check_consistency(uint32_t docid_limit) const noexcept +{ + uint32_t inconsistencies = 0; + for (uint32_t docid = 1; docid < docid_limit; ++docid) { + auto index_subspaces = get_subspaces(docid); + auto store_subspaces = get_vectors(docid).subspaces(); + if (index_subspaces != store_subspaces) { + ++inconsistencies; + } + } + return inconsistencies; +} + template class HnswIndex<HnswIndexType::SINGLE>; template class HnswIndex<HnswIndexType::MULTI>; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 616140f426f..4d4440c1bcb 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -193,6 +193,9 @@ protected: LinkArray filter_valid_nodeids(uint32_t level, const internal::PreparedAddNode::Links &neighbors, uint32_t self_nodeid); void internal_complete_add(uint32_t docid, internal::PreparedAddDoc &op); void internal_complete_add_node(uint32_t nodeid, uint32_t docid, uint32_t subspace, internal::PreparedAddNode &prepared_node); + + // Called from writer only. + uint32_t get_subspaces(uint32_t docid) const noexcept; public: HnswIndex(const DocVectorAccess& vectors, DistanceFunctionFactory::UP distance_ff, RandomLevelGenerator::UP level_generator, const HnswIndexConfig& cfg); @@ -248,6 +251,9 @@ public: uint32_t get_active_nodes() const noexcept { return _graph.get_active_nodes(); } + // Called from writer only. + uint32_t check_consistency(uint32_t docid_limit) const noexcept override; + // Should only be used by unit tests. HnswTestNode get_node(uint32_t nodeid) const; void set_node(uint32_t nodeid, const HnswTestNode &node); diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h index 8462ff05eca..c2bbd17ce63 100644 --- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h @@ -114,6 +114,12 @@ public: double distance_threshold) const = 0; virtual DistanceFunctionFactory &distance_function_factory() const = 0; + + /* + * Used when checking consistency during load. + * Called from writer only. + */ + virtual uint32_t check_consistency(uint32_t docid_limit) const noexcept = 0; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp index 28c4099c38b..223c9d7d1f2 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp @@ -322,6 +322,9 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor) if (!load_index()) { return false; } + if (dense_store == nullptr) { + check_consistency(docid_limit); + } } else { build_index(executor, docid_limit); } @@ -329,4 +332,15 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor) return true; } +void +TensorAttributeLoader::check_consistency(uint32_t docid_limit) +{ + auto before = vespalib::steady_clock::now(); + uint32_t inconsistencies = _index->check_consistency(docid_limit); + auto after = vespalib::steady_clock::now(); + double elapsed = vespalib::to_s(after - before); + LOG(info, "%u inconsistencies detected after loading index for attribute %s, (check used %6.3fs)", + inconsistencies, _attr.getName().c_str(), elapsed); +} + } diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h index 6bf68957adc..59baaf0b6dc 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h @@ -34,6 +34,7 @@ class TensorAttributeLoader { void load_tensor_store(search::attribute::BlobSequenceReader& reader, uint32_t docid_limit); void build_index(vespalib::Executor* executor, uint32_t docid_limit); bool load_index(); + void check_consistency(uint32_t docid_limit); public: TensorAttributeLoader(TensorAttribute& attr, GenerationHandler& generation_handler, RefVector& ref_vector, TensorStore& store, NearestNeighborIndex* index); |