From b81121843416500e0ef11191f4b6163f96791e79 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Wed, 24 Jun 2020 09:01:27 +0000 Subject: the very first documents added to HNSW index should use single-phase indexing --- searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp | 2 +- searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp | 2 +- .../tensor/default_nearest_neighbor_index_factory.cpp | 1 + searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp | 12 +++++++++++- searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 4 ++++ 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 7dc0efc106d..cd989c03b4e 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -82,7 +82,7 @@ public: level_generator = generator.get(); index = std::make_unique(vectors, std::make_unique(), std::move(generator), - HnswIndex::Config(5, 2, 10, heuristic_select_neighbors)); + HnswIndex::Config(5, 2, 10, 0, heuristic_select_neighbors)); } void add_document(uint32_t docid, uint32_t max_level = 0) { level_generator->level = max_level; diff --git a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp index 4dec9550f6f..1e10d94bb18 100644 --- a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp @@ -234,7 +234,7 @@ public: uint32_t m = 16; index = std::make_unique(vectors, std::make_unique(), std::make_unique(m), - HnswIndex::Config(2*m, m, 200, true)); + HnswIndex::Config(2*m, m, 200, 10, true)); } size_t get_rnd(size_t size) { return rng.nextUniform() * size; diff --git a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp index 067280e9a23..7d4c605be48 100644 --- a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp @@ -36,6 +36,7 @@ DefaultNearestNeighborIndexFactory::make(const DocVectorAccess& vectors, HnswIndex::Config cfg(m * 2, m, params.neighbors_to_explore_at_insert(), + 1000, true); return std::make_unique(vectors, make_distance_function(params.distance_metric(), cell_type), diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 36d970dfd01..0c450e18c4b 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -371,6 +371,12 @@ HnswIndex::prepare_add_document(uint32_t docid, TypedCells vector, vespalib::GenerationHandler::Guard read_guard) const { + uint32_t max_nodes = _graph.node_refs.size(); + if (max_nodes < _cfg.min_size_before_two_phase()) { + // the first documents added will do all work in write thread + // to ensure they are linked together: + return std::unique_ptr(); + } PreparedAddDoc op = internal_prepare_add(docid, vector); (void) read_guard; // must keep guard until this point return std::make_unique(std::move(op)); @@ -383,7 +389,11 @@ HnswIndex::complete_add_document(uint32_t docid, std::unique_ptr if (prepared && (prepared->docid == docid)) { internal_complete_add(docid, *prepared); } else { - LOG(warning, "complete_add_document called with invalid prepare_result"); + // we expect this for the first documents added, so no warning for them + if (_graph.node_refs.size() > 1.25 * _cfg.min_size_before_two_phase()) { + LOG(warning, "complete_add_document(%u) called with invalid prepare_result %s/%u", + docid, (prepared ? "valid ptr" : "nullptr"), (prepared ? prepared->docid : 0u)); + } // fallback to normal add add_document(docid); } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index ab3eced8fdc..c237e3b8fcf 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -39,21 +39,25 @@ public: uint32_t _max_links_at_level_0; uint32_t _max_links_on_inserts; uint32_t _neighbors_to_explore_at_construction; + uint32_t _min_size_before_two_phase; bool _heuristic_select_neighbors; public: Config(uint32_t max_links_at_level_0_in, uint32_t max_links_on_inserts_in, uint32_t neighbors_to_explore_at_construction_in, + uint32_t min_size_before_two_phase_in, bool heuristic_select_neighbors_in) : _max_links_at_level_0(max_links_at_level_0_in), _max_links_on_inserts(max_links_on_inserts_in), _neighbors_to_explore_at_construction(neighbors_to_explore_at_construction_in), + _min_size_before_two_phase(min_size_before_two_phase_in), _heuristic_select_neighbors(heuristic_select_neighbors_in) {} uint32_t max_links_at_level_0() const { return _max_links_at_level_0; } uint32_t max_links_on_inserts() const { return _max_links_on_inserts; } uint32_t neighbors_to_explore_at_construction() const { return _neighbors_to_explore_at_construction; } + uint32_t min_size_before_two_phase() const { return _min_size_before_two_phase; } bool heuristic_select_neighbors() const { return _heuristic_select_neighbors; } }; -- cgit v1.2.3 From 213c2ad09f9183975717166a4c0022cbbed63eaa Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Thu, 25 Jun 2020 06:32:06 +0000 Subject: default for min_size_before_two_phase is now 10000 --- .../vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp index 7d4c605be48..0bb6f339455 100644 --- a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp @@ -36,7 +36,7 @@ DefaultNearestNeighborIndexFactory::make(const DocVectorAccess& vectors, HnswIndex::Config cfg(m * 2, m, params.neighbors_to_explore_at_insert(), - 1000, + 10000, true); return std::make_unique(vectors, make_distance_function(params.distance_metric(), cell_type), -- cgit v1.2.3