Revert "Revert "Don't make changes to the hnsw index when the inserted tensor is unchanged""

author: Geir Storli <geirst@yahooinc.com> 2022-06-22 16:27:01 +0200
committer: GitHub <noreply@github.com> 2022-06-22 16:27:01 +0200
commit: 8f1f9e0797ca76a3ee1e9a76978b1470cd5bb0a7 (patch)
tree: 02f06726bde5fa45506eb3445a892b3ede7ddb9f /searchlib
parent: 07c6eff98909eb10a3cad8c27acc46edec1198ed (diff)
8 files changed, 90 insertions, 7 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index bf0b74b0003..72b2f1e320a 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -157,6 +157,12 @@ public:
     void expect_empty_add() const {
         EXPECT_TRUE(_adds.empty());
     }
+    void expect_empty_prepare_add() const {
+        EXPECT_TRUE(_prepare_adds.empty());
+    }
+    void expect_empty_complete_add() const {
+        EXPECT_TRUE(_complete_adds.empty());
+    }
     void expect_entry(uint32_t exp_docid, const DoubleVector& exp_vector, const EntryVector& entries) const {
         EXPECT_EQUAL(1u, entries.size());
         EXPECT_EQUAL(exp_docid, entries.back().first);
@@ -881,6 +887,30 @@ TEST_F("nearest neighbor index can be updated in two phases", DenseTensorAttribu
     }
 }
 
+TEST_F("nearest neighbor index is NOT updated when tensor value is unchanged", DenseTensorAttributeMockIndex)
+{
+    auto& index = f.mock_index();
+    {
+        auto vec_a = vec_2d(3, 5);
+        auto prepare_result = f.prepare_set_tensor(1, vec_a);
+        index.expect_prepare_add(1, {3, 5});
+        f.complete_set_tensor(1, vec_a, std::move(prepare_result));
+        f.assertGetTensor(vec_a, 1);
+        index.expect_complete_add(1, {3, 5});
+    }
+    index.clear();
+    {
+        // Replaces previous value with the same value
+        auto vec_b = vec_2d(3, 5);
+        auto prepare_result = f.prepare_set_tensor(1, vec_b);
+        EXPECT_TRUE(prepare_result.get() == nullptr);
+        index.expect_empty_prepare_add();
+        f.complete_set_tensor(1, vec_b, std::move(prepare_result));
+        f.assertGetTensor(vec_b, 1);
+        index.expect_empty_complete_add();
+    }
+}
+
 TEST_F("clearDoc() updates nearest neighbor index", DenseTensorAttributeMockIndex)
 {
     auto& index = f.mock_index();
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
index 2fdb73fcf96..c713b3ef335 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
@@ -8,9 +8,9 @@
 #include "tensor_attribute.hpp"
 #include <vespa/eval/eval/value.h>
 #include <vespa/fastlib/io/bufferedfile.h>
+#include <vespa/searchcommon/attribute/config.h>
 #include <vespa/searchlib/attribute/load_utils.h>
 #include <vespa/searchlib/attribute/readerbase.h>
-#include <vespa/searchcommon/attribute/config.h>
 #include <vespa/vespalib/data/slime/inserter.h>
 #include <vespa/vespalib/util/cpu_usage.h>
 #include <vespa/vespalib/util/lambdatask.h>
@@ -102,10 +102,16 @@ BlobSequenceReader::is_present() {
 
 }
 
+bool
+DenseTensorAttribute::tensor_is_unchanged(DocId docid, const vespalib::eval::Value& new_tensor) const
+{
+    auto old_tensor = extract_cells_ref(docid);
+    return _comp.equals(old_tensor, new_tensor.cells());
+}
+
 void
 DenseTensorAttribute::internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor)
 {
-    checkTensorType(tensor);
     consider_remove_from_index(docid);
     EntryRef ref = _denseTensorStore.setTensor(tensor);
     setTensorRef(docid, ref);
@@ -152,7 +158,8 @@ DenseTensorAttribute::DenseTensorAttribute(vespalib::stringref baseFileName, con
                                            const NearestNeighborIndexFactory& index_factory)
     : TensorAttribute(baseFileName, cfg, _denseTensorStore),
       _denseTensorStore(cfg.tensorType(), get_memory_allocator()),
-      _index()
+      _index(),
+      _comp(cfg.tensorType())
 {
     if (cfg.hnsw_index_params().has_value()) {
         auto tensor_type = cfg.tensorType();
@@ -180,6 +187,7 @@ DenseTensorAttribute::clearDoc(DocId docId)
 void
 DenseTensorAttribute::setTensor(DocId docId, const vespalib::eval::Value &tensor)
 {
+    checkTensorType(tensor);
     internal_set_tensor(docId, tensor);
     if (_index) {
         _index->add_document(docId);
@@ -189,16 +197,26 @@ DenseTensorAttribute::setTensor(DocId docId, const vespalib::eval::Value &tensor
 std::unique_ptr<PrepareResult>
 DenseTensorAttribute::prepare_set_tensor(DocId docid, const vespalib::eval::Value& tensor) const
 {
+    checkTensorType(tensor);
     if (_index) {
+        if (tensor_is_unchanged(docid, tensor)) {
+            // Don't make changes to the nearest neighbor index when the inserted tensor is unchanged.
+            // With this optimization we avoid doing unnecessary costly work, first removing the vector point, then inserting the same point.
+            return {};
+        }
         return _index->prepare_add_document(docid, tensor.cells(), getGenerationHandler().takeGuard());
     }
-    return std::unique_ptr<PrepareResult>();
+    return {};
 }
 
 void
 DenseTensorAttribute::complete_set_tensor(DocId docid, const vespalib::eval::Value& tensor,
                                           std::unique_ptr<PrepareResult> prepare_result)
 {
+    if (_index && !prepare_result) {
+        // The tensor is unchanged.
+        return;
+    }
     internal_set_tensor(docid, tensor);
     if (_index) {
         _index->complete_add_document(docid, std::move(prepare_result));
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
index da7a88af1be..1138a4f4433 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
@@ -6,6 +6,7 @@
 #include "dense_tensor_store.h"
 #include "doc_vector_access.h"
 #include "tensor_attribute.h"
+#include "typed_cells_comparator.h"
 #include <memory>
 
 namespace search::tensor {
@@ -20,7 +21,9 @@ class DenseTensorAttribute : public TensorAttribute, public DocVectorAccess {
 private:
     DenseTensorStore _denseTensorStore;
     std::unique_ptr<NearestNeighborIndex> _index;
+    TypedCellsComparator _comp;
 
+    bool tensor_is_unchanged(DocId docid, const vespalib::eval::Value& new_tensor) const;
     void internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor);
     void consider_remove_from_index(DocId docid);
     vespalib::MemoryUsage update_stat() override;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index e82f31df38e..2ee1b268449 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -436,7 +436,7 @@ HnswIndex::prepare_add_document(uint32_t docid,
     if (max_nodes < _cfg.min_size_before_two_phase()) {
         // the first documents added will do all work in write thread
         // to ensure they are linked together:
-        return std::unique_ptr<PrepareResult>();
+        return std::make_unique<PreparedFirstAddDoc>();
     }
     PreparedAddDoc op = internal_prepare_add(docid, vector, std::move(read_guard));
     return std::make_unique<PreparedAddDoc>(std::move(op));
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index 72a10724ff1..3f5a9d514ed 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -152,6 +152,8 @@ protected:
                                          const BitVector *filter, uint32_t explore_k,
                                          double distance_threshold) const;
 
+    struct PreparedFirstAddDoc : public PrepareResult {};
+
     struct PreparedAddDoc : public PrepareResult {
         using ReadGuard = vespalib::GenerationHandler::Guard;
         uint32_t docid;
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
index add5184c4eb..78c58e86a3b 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
@@ -135,7 +135,7 @@ TensorAttribute::addDoc(DocId &docId)
 }
 
 void
-TensorAttribute::checkTensorType(const vespalib::eval::Value &tensor)
+TensorAttribute::checkTensorType(const vespalib::eval::Value &tensor) const
 {
     const ValueType &fieldTensorType = getConfig().tensorType();
     const ValueType &tensorType = tensor.type();
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
index ae6a4a302ea..c8aa42c6133 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
@@ -32,7 +32,7 @@ protected:
 
     template <typename RefType>
     void doCompactWorst();
-    void checkTensorType(const vespalib::eval::Value &tensor);
+    void checkTensorType(const vespalib::eval::Value &tensor) const;
     void setTensorRef(DocId docId, EntryRef ref);
     virtual vespalib::MemoryUsage update_stat();
     virtual vespalib::MemoryUsage memory_usage() const;
diff --git a/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h b/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h
new file mode 100644
index 00000000000..d1c890be961
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h
@@ -0,0 +1,30 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/cell_type.h>
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/eval/eval/value_type.h>
+#include <cstring>
+
+namespace search::tensor {
+
+/**
+ * Comparator used to compare two vespalib::eval::TypedCells instances.
+ *
+ * The caller must first validate that they are of the same vespalib::eval::ValueType.
+ */
+class TypedCellsComparator {
+private:
+    size_t _mem_size;
+
+public:
+    TypedCellsComparator(const vespalib::eval::ValueType& type)
+        : _mem_size(vespalib::eval::CellTypeUtils::mem_size(type.cell_type(), type.dense_subspace_size()))
+    {}
+    bool equals(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const {
+        return std::memcmp(lhs.data, rhs.data, _mem_size) == 0;
+    }
+};
+
+}
author	Geir Storli <geirst@yahooinc.com>	2022-06-22 16:27:01 +0200
committer	GitHub <noreply@github.com>	2022-06-22 16:27:01 +0200
commit	8f1f9e0797ca76a3ee1e9a76978b1470cd5bb0a7 (patch)
tree	02f06726bde5fa45506eb3445a892b3ede7ddb9f /searchlib
parent	07c6eff98909eb10a3cad8c27acc46edec1198ed (diff)