diff options
author | Geir Storli <geirst@yahooinc.com> | 2022-06-22 16:27:01 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-06-22 16:27:01 +0200 |
commit | 8f1f9e0797ca76a3ee1e9a76978b1470cd5bb0a7 (patch) | |
tree | 02f06726bde5fa45506eb3445a892b3ede7ddb9f /searchlib | |
parent | 07c6eff98909eb10a3cad8c27acc46edec1198ed (diff) |
Revert "Revert "Don't make changes to the hnsw index when the inserted tensor is unchanged""
Diffstat (limited to 'searchlib')
8 files changed, 90 insertions, 7 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index bf0b74b0003..72b2f1e320a 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -157,6 +157,12 @@ public: void expect_empty_add() const { EXPECT_TRUE(_adds.empty()); } + void expect_empty_prepare_add() const { + EXPECT_TRUE(_prepare_adds.empty()); + } + void expect_empty_complete_add() const { + EXPECT_TRUE(_complete_adds.empty()); + } void expect_entry(uint32_t exp_docid, const DoubleVector& exp_vector, const EntryVector& entries) const { EXPECT_EQUAL(1u, entries.size()); EXPECT_EQUAL(exp_docid, entries.back().first); @@ -881,6 +887,30 @@ TEST_F("nearest neighbor index can be updated in two phases", DenseTensorAttribu } } +TEST_F("nearest neighbor index is NOT updated when tensor value is unchanged", DenseTensorAttributeMockIndex) +{ + auto& index = f.mock_index(); + { + auto vec_a = vec_2d(3, 5); + auto prepare_result = f.prepare_set_tensor(1, vec_a); + index.expect_prepare_add(1, {3, 5}); + f.complete_set_tensor(1, vec_a, std::move(prepare_result)); + f.assertGetTensor(vec_a, 1); + index.expect_complete_add(1, {3, 5}); + } + index.clear(); + { + // Replaces previous value with the same value + auto vec_b = vec_2d(3, 5); + auto prepare_result = f.prepare_set_tensor(1, vec_b); + EXPECT_TRUE(prepare_result.get() == nullptr); + index.expect_empty_prepare_add(); + f.complete_set_tensor(1, vec_b, std::move(prepare_result)); + f.assertGetTensor(vec_b, 1); + index.expect_empty_complete_add(); + } +} + TEST_F("clearDoc() updates nearest neighbor index", DenseTensorAttributeMockIndex) { auto& index = f.mock_index(); diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp index 2fdb73fcf96..c713b3ef335 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp @@ -8,9 +8,9 @@ #include "tensor_attribute.hpp" #include <vespa/eval/eval/value.h> #include <vespa/fastlib/io/bufferedfile.h> +#include <vespa/searchcommon/attribute/config.h> #include <vespa/searchlib/attribute/load_utils.h> #include <vespa/searchlib/attribute/readerbase.h> -#include <vespa/searchcommon/attribute/config.h> #include <vespa/vespalib/data/slime/inserter.h> #include <vespa/vespalib/util/cpu_usage.h> #include <vespa/vespalib/util/lambdatask.h> @@ -102,10 +102,16 @@ BlobSequenceReader::is_present() { } +bool +DenseTensorAttribute::tensor_is_unchanged(DocId docid, const vespalib::eval::Value& new_tensor) const +{ + auto old_tensor = extract_cells_ref(docid); + return _comp.equals(old_tensor, new_tensor.cells()); +} + void DenseTensorAttribute::internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor) { - checkTensorType(tensor); consider_remove_from_index(docid); EntryRef ref = _denseTensorStore.setTensor(tensor); setTensorRef(docid, ref); @@ -152,7 +158,8 @@ DenseTensorAttribute::DenseTensorAttribute(vespalib::stringref baseFileName, con const NearestNeighborIndexFactory& index_factory) : TensorAttribute(baseFileName, cfg, _denseTensorStore), _denseTensorStore(cfg.tensorType(), get_memory_allocator()), - _index() + _index(), + _comp(cfg.tensorType()) { if (cfg.hnsw_index_params().has_value()) { auto tensor_type = cfg.tensorType(); @@ -180,6 +187,7 @@ DenseTensorAttribute::clearDoc(DocId docId) void DenseTensorAttribute::setTensor(DocId docId, const vespalib::eval::Value &tensor) { + checkTensorType(tensor); internal_set_tensor(docId, tensor); if (_index) { _index->add_document(docId); @@ -189,16 +197,26 @@ DenseTensorAttribute::setTensor(DocId docId, const vespalib::eval::Value &tensor std::unique_ptr<PrepareResult> DenseTensorAttribute::prepare_set_tensor(DocId docid, const vespalib::eval::Value& tensor) const { + checkTensorType(tensor); if (_index) { + if (tensor_is_unchanged(docid, tensor)) { + // Don't make changes to the nearest neighbor index when the inserted tensor is unchanged. + // With this optimization we avoid doing unnecessary costly work, first removing the vector point, then inserting the same point. + return {}; + } return _index->prepare_add_document(docid, tensor.cells(), getGenerationHandler().takeGuard()); } - return std::unique_ptr<PrepareResult>(); + return {}; } void DenseTensorAttribute::complete_set_tensor(DocId docid, const vespalib::eval::Value& tensor, std::unique_ptr<PrepareResult> prepare_result) { + if (_index && !prepare_result) { + // The tensor is unchanged. + return; + } internal_set_tensor(docid, tensor); if (_index) { _index->complete_add_document(docid, std::move(prepare_result)); diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h index da7a88af1be..1138a4f4433 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h @@ -6,6 +6,7 @@ #include "dense_tensor_store.h" #include "doc_vector_access.h" #include "tensor_attribute.h" +#include "typed_cells_comparator.h" #include <memory> namespace search::tensor { @@ -20,7 +21,9 @@ class DenseTensorAttribute : public TensorAttribute, public DocVectorAccess { private: DenseTensorStore _denseTensorStore; std::unique_ptr<NearestNeighborIndex> _index; + TypedCellsComparator _comp; + bool tensor_is_unchanged(DocId docid, const vespalib::eval::Value& new_tensor) const; void internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor); void consider_remove_from_index(DocId docid); vespalib::MemoryUsage update_stat() override; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index e82f31df38e..2ee1b268449 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -436,7 +436,7 @@ HnswIndex::prepare_add_document(uint32_t docid, if (max_nodes < _cfg.min_size_before_two_phase()) { // the first documents added will do all work in write thread // to ensure they are linked together: - return std::unique_ptr<PrepareResult>(); + return std::make_unique<PreparedFirstAddDoc>(); } PreparedAddDoc op = internal_prepare_add(docid, vector, std::move(read_guard)); return std::make_unique<PreparedAddDoc>(std::move(op)); diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 72a10724ff1..3f5a9d514ed 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -152,6 +152,8 @@ protected: const BitVector *filter, uint32_t explore_k, double distance_threshold) const; + struct PreparedFirstAddDoc : public PrepareResult {}; + struct PreparedAddDoc : public PrepareResult { using ReadGuard = vespalib::GenerationHandler::Guard; uint32_t docid; diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp index add5184c4eb..78c58e86a3b 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp @@ -135,7 +135,7 @@ TensorAttribute::addDoc(DocId &docId) } void -TensorAttribute::checkTensorType(const vespalib::eval::Value &tensor) +TensorAttribute::checkTensorType(const vespalib::eval::Value &tensor) const { const ValueType &fieldTensorType = getConfig().tensorType(); const ValueType &tensorType = tensor.type(); diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h index ae6a4a302ea..c8aa42c6133 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h @@ -32,7 +32,7 @@ protected: template <typename RefType> void doCompactWorst(); - void checkTensorType(const vespalib::eval::Value &tensor); + void checkTensorType(const vespalib::eval::Value &tensor) const; void setTensorRef(DocId docId, EntryRef ref); virtual vespalib::MemoryUsage update_stat(); virtual vespalib::MemoryUsage memory_usage() const; diff --git a/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h b/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h new file mode 100644 index 00000000000..d1c890be961 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h @@ -0,0 +1,30 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/eval/eval/cell_type.h> +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/eval/eval/value_type.h> +#include <cstring> + +namespace search::tensor { + +/** + * Comparator used to compare two vespalib::eval::TypedCells instances. + * + * The caller must first validate that they are of the same vespalib::eval::ValueType. + */ +class TypedCellsComparator { +private: + size_t _mem_size; + +public: + TypedCellsComparator(const vespalib::eval::ValueType& type) + : _mem_size(vespalib::eval::CellTypeUtils::mem_size(type.cell_type(), type.dense_subspace_size())) + {} + bool equals(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const { + return std::memcmp(lhs.data, rhs.data, _mem_size) == 0; + } +}; + +} |