summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2024-04-29 14:35:21 +0200
committerTor Egge <Tor.Egge@online.no>2024-04-29 14:35:21 +0200
commit0e955fe9420b1b89d6fccbb77ebf0a698d922e8c (patch)
treecbc321564ec560a93a80b5436a7ce4e5a30bb91e /searchlib
parent054818ce52e390e96e08074aa0cc6ef79bad8e52 (diff)
Check consistency after loading tensor attribute with index.
Don't reconnect hnsw graph node without vector.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp4
-rw-r--r--searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp30
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp31
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp14
-rw-r--r--searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h1
7 files changed, 85 insertions, 1 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index b1b2235165f..825c81de107 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -329,6 +329,10 @@ public:
static search::tensor::DistanceFunctionFactory::UP my_dist_fun = search::tensor::make_distance_function_factory(search::attribute::DistanceMetric::Euclidean, vespalib::eval::CellType::DOUBLE);
return *my_dist_fun;
}
+
+ uint32_t check_consistency(uint32_t) noexcept override {
+ return 0;
+ }
};
class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory {
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
index d50677314df..a1cf86c95cc 100644
--- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
@@ -936,6 +936,36 @@ TYPED_TEST(HnswIndexTest, search_during_remove)
this->expect_top_3_by_docid("{0, 0}", {0, 0}, {7});
}
+TYPED_TEST(HnswIndexTest, inconsistent_index)
+{
+ this->init(false);
+ this->vectors.clear();
+ this->vectors.set(1, {1, 3}).set(2, {7, 1}).set(3, {6, 5}).set(4, {8, 3}).set(5, {10, 3});
+ this->add_document(1);
+ this->add_document(2);
+ this->add_document(3);
+ this->add_document(4);
+ this->add_document(5);
+ this->expect_entry_point(1, 0);
+ this->expect_level_0(1, {2, 3});
+ this->expect_level_0(2, {1, 3, 4, 5});
+ this->expect_level_0(3, {1, 2, 4});
+ this->expect_level_0(4, {2, 3, 5});
+ this->expect_level_0(5, {2, 4});
+ EXPECT_EQ(0, this->index->check_consistency(6));
+ // Remove vector for docid 5 but don't update index.
+ this->vectors.clear(5);
+ EXPECT_EQ(1, this->index->check_consistency(6));
+ /*
+ * Removing document 2 causes mutual reconnect for nodes [1, 3, 4, 5]
+ * where nodes 1 and 5 are not previously connected. Distance from
+ * node 1 to node 5 cannot be calculated due to missing vector.
+ */
+ this->remove_document(2);
+ // No reconnect for node without vector
+ this->expect_level_0(5, {4});
+}
+
using HnswMultiIndexTest = HnswIndexTest<HnswIndex<HnswIndexType::MULTI>>;
namespace {
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 322965ca06a..8cfa10d918b 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -679,7 +679,10 @@ HnswIndex<type>::mutual_reconnect(const LinkArrayRef &cluster, uint32_t level)
for (uint32_t j = i + 1; j < cluster.size(); ++j) {
uint32_t n_id_2 = cluster[j];
if ( ! has_link_to(n_list_1, n_id_2)) {
- pairs.emplace_back(n_id_1, n_id_2, calc_distance(*df, n_id_2));
+ auto n_cells_2 = get_vector(n_id_2);
+ if (!n_cells_2.non_existing_attribute_value()) {
+ pairs.emplace_back(n_id_1, n_id_2, df->calc(n_cells_2));
+ }
}
}
}
@@ -1120,6 +1123,32 @@ HnswIndex<type>::count_reachable_nodes() const
return {found_cnt, true};
}
+template <HnswIndexType type>
+uint32_t
+HnswIndex<type>::get_subspaces(uint32_t docid) noexcept
+{
+ if (type == HnswIndexType::SINGLE) {
+ return (docid < _graph.nodes.size() && _graph.nodes[docid].levels_ref().load_relaxed().valid()) ? 1 : 0;
+ } else {
+ return _id_mapping.get_ids(docid).size();
+ }
+}
+
+template <HnswIndexType type>
+uint32_t
+HnswIndex<type>::check_consistency(uint32_t docid_limit) noexcept
+{
+ uint32_t inconsistencies = 0;
+ for (uint32_t docid = 1; docid < docid_limit; ++docid) {
+ auto index_subspaces = get_subspaces(docid);
+ auto store_subspaces = get_vectors(docid).subspaces();
+ if (index_subspaces != store_subspaces) {
+ ++inconsistencies;
+ }
+ }
+ return inconsistencies;
+}
+
template class HnswIndex<HnswIndexType::SINGLE>;
template class HnswIndex<HnswIndexType::MULTI>;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index 616140f426f..757f4d00666 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -193,6 +193,7 @@ protected:
LinkArray filter_valid_nodeids(uint32_t level, const internal::PreparedAddNode::Links &neighbors, uint32_t self_nodeid);
void internal_complete_add(uint32_t docid, internal::PreparedAddDoc &op);
void internal_complete_add_node(uint32_t nodeid, uint32_t docid, uint32_t subspace, internal::PreparedAddNode &prepared_node);
+ uint32_t get_subspaces(uint32_t docid) noexcept;
public:
HnswIndex(const DocVectorAccess& vectors, DistanceFunctionFactory::UP distance_ff,
RandomLevelGenerator::UP level_generator, const HnswIndexConfig& cfg);
@@ -248,6 +249,8 @@ public:
uint32_t get_active_nodes() const noexcept { return _graph.get_active_nodes(); }
+ uint32_t check_consistency(uint32_t docid_limit) noexcept override;
+
// Should only be used by unit tests.
HnswTestNode get_node(uint32_t nodeid) const;
void set_node(uint32_t nodeid, const HnswTestNode &node);
diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
index 8462ff05eca..a4309c366b1 100644
--- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h
@@ -114,6 +114,9 @@ public:
double distance_threshold) const = 0;
virtual DistanceFunctionFactory &distance_function_factory() const = 0;
+
+ // Used when checking consistency during load
+ virtual uint32_t check_consistency(uint32_t docid_limit) noexcept = 0;
};
}
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
index 28c4099c38b..223c9d7d1f2 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp
@@ -322,6 +322,9 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor)
if (!load_index()) {
return false;
}
+ if (dense_store == nullptr) {
+ check_consistency(docid_limit);
+ }
} else {
build_index(executor, docid_limit);
}
@@ -329,4 +332,15 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor)
return true;
}
+void
+TensorAttributeLoader::check_consistency(uint32_t docid_limit)
+{
+ auto before = vespalib::steady_clock::now();
+ uint32_t inconsistencies = _index->check_consistency(docid_limit);
+ auto after = vespalib::steady_clock::now();
+ double elapsed = vespalib::to_s(after - before);
+ LOG(info, "%u inconsistencies detected after loading index for attribute %s, (check used %6.3fs)",
+ inconsistencies, _attr.getName().c_str(), elapsed);
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
index 6bf68957adc..59baaf0b6dc 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h
@@ -34,6 +34,7 @@ class TensorAttributeLoader {
void load_tensor_store(search::attribute::BlobSequenceReader& reader, uint32_t docid_limit);
void build_index(vespalib::Executor* executor, uint32_t docid_limit);
bool load_index();
+ void check_consistency(uint32_t docid_limit);
public:
TensorAttributeLoader(TensorAttribute& attr, GenerationHandler& generation_handler, RefVector& ref_vector, TensorStore& store, NearestNeighborIndex* index);