summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2022-06-13 10:07:56 +0200
committerGitHub <noreply@github.com>2022-06-13 10:07:56 +0200
commit01f0eabb66fffdb0ebe89133ae0808ae1bba2431 (patch)
tree60089bc8cac62afbfee0acb24a604cc2a2275c11 /searchlib
parentc43c9d5e9e9f5fe459cae5c255d04be1ad4eb8c1 (diff)
parent81d0a79d43138ab186fc4bcfe380aed9cc18402f (diff)
Merge pull request #23051 from vespa-engine/geirst/dont-update-hnsw-with-unchanged-tensors
Don't make changes to the hnsw index when the inserted tensor is unchanged
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp30
-rw-r--r--searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp26
-rw-r--r--searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/tensor_attribute.h2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h30
8 files changed, 90 insertions, 7 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index bf0b74b0003..72b2f1e320a 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -157,6 +157,12 @@ public:
void expect_empty_add() const {
EXPECT_TRUE(_adds.empty());
}
+ void expect_empty_prepare_add() const {
+ EXPECT_TRUE(_prepare_adds.empty());
+ }
+ void expect_empty_complete_add() const {
+ EXPECT_TRUE(_complete_adds.empty());
+ }
void expect_entry(uint32_t exp_docid, const DoubleVector& exp_vector, const EntryVector& entries) const {
EXPECT_EQUAL(1u, entries.size());
EXPECT_EQUAL(exp_docid, entries.back().first);
@@ -881,6 +887,30 @@ TEST_F("nearest neighbor index can be updated in two phases", DenseTensorAttribu
}
}
+TEST_F("nearest neighbor index is NOT updated when tensor value is unchanged", DenseTensorAttributeMockIndex)
+{
+ auto& index = f.mock_index();
+ {
+ auto vec_a = vec_2d(3, 5);
+ auto prepare_result = f.prepare_set_tensor(1, vec_a);
+ index.expect_prepare_add(1, {3, 5});
+ f.complete_set_tensor(1, vec_a, std::move(prepare_result));
+ f.assertGetTensor(vec_a, 1);
+ index.expect_complete_add(1, {3, 5});
+ }
+ index.clear();
+ {
+ // Replaces previous value with the same value
+ auto vec_b = vec_2d(3, 5);
+ auto prepare_result = f.prepare_set_tensor(1, vec_b);
+ EXPECT_TRUE(prepare_result.get() == nullptr);
+ index.expect_empty_prepare_add();
+ f.complete_set_tensor(1, vec_b, std::move(prepare_result));
+ f.assertGetTensor(vec_b, 1);
+ index.expect_empty_complete_add();
+ }
+}
+
TEST_F("clearDoc() updates nearest neighbor index", DenseTensorAttributeMockIndex)
{
auto& index = f.mock_index();
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
index 2fdb73fcf96..c713b3ef335 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
@@ -8,9 +8,9 @@
#include "tensor_attribute.hpp"
#include <vespa/eval/eval/value.h>
#include <vespa/fastlib/io/bufferedfile.h>
+#include <vespa/searchcommon/attribute/config.h>
#include <vespa/searchlib/attribute/load_utils.h>
#include <vespa/searchlib/attribute/readerbase.h>
-#include <vespa/searchcommon/attribute/config.h>
#include <vespa/vespalib/data/slime/inserter.h>
#include <vespa/vespalib/util/cpu_usage.h>
#include <vespa/vespalib/util/lambdatask.h>
@@ -102,10 +102,16 @@ BlobSequenceReader::is_present() {
}
+bool
+DenseTensorAttribute::tensor_is_unchanged(DocId docid, const vespalib::eval::Value& new_tensor) const
+{
+ auto old_tensor = extract_cells_ref(docid);
+ return _comp.equals(old_tensor, new_tensor.cells());
+}
+
void
DenseTensorAttribute::internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor)
{
- checkTensorType(tensor);
consider_remove_from_index(docid);
EntryRef ref = _denseTensorStore.setTensor(tensor);
setTensorRef(docid, ref);
@@ -152,7 +158,8 @@ DenseTensorAttribute::DenseTensorAttribute(vespalib::stringref baseFileName, con
const NearestNeighborIndexFactory& index_factory)
: TensorAttribute(baseFileName, cfg, _denseTensorStore),
_denseTensorStore(cfg.tensorType(), get_memory_allocator()),
- _index()
+ _index(),
+ _comp(cfg.tensorType())
{
if (cfg.hnsw_index_params().has_value()) {
auto tensor_type = cfg.tensorType();
@@ -180,6 +187,7 @@ DenseTensorAttribute::clearDoc(DocId docId)
void
DenseTensorAttribute::setTensor(DocId docId, const vespalib::eval::Value &tensor)
{
+ checkTensorType(tensor);
internal_set_tensor(docId, tensor);
if (_index) {
_index->add_document(docId);
@@ -189,16 +197,26 @@ DenseTensorAttribute::setTensor(DocId docId, const vespalib::eval::Value &tensor
std::unique_ptr<PrepareResult>
DenseTensorAttribute::prepare_set_tensor(DocId docid, const vespalib::eval::Value& tensor) const
{
+ checkTensorType(tensor);
if (_index) {
+ if (tensor_is_unchanged(docid, tensor)) {
+ // Don't make changes to the nearest neighbor index when the inserted tensor is unchanged.
+ // With this optimization we avoid doing unnecessary costly work, first removing the vector point, then inserting the same point.
+ return {};
+ }
return _index->prepare_add_document(docid, tensor.cells(), getGenerationHandler().takeGuard());
}
- return std::unique_ptr<PrepareResult>();
+ return {};
}
void
DenseTensorAttribute::complete_set_tensor(DocId docid, const vespalib::eval::Value& tensor,
std::unique_ptr<PrepareResult> prepare_result)
{
+ if (_index && !prepare_result) {
+ // The tensor is unchanged.
+ return;
+ }
internal_set_tensor(docid, tensor);
if (_index) {
_index->complete_add_document(docid, std::move(prepare_result));
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
index da7a88af1be..1138a4f4433 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
@@ -6,6 +6,7 @@
#include "dense_tensor_store.h"
#include "doc_vector_access.h"
#include "tensor_attribute.h"
+#include "typed_cells_comparator.h"
#include <memory>
namespace search::tensor {
@@ -20,7 +21,9 @@ class DenseTensorAttribute : public TensorAttribute, public DocVectorAccess {
private:
DenseTensorStore _denseTensorStore;
std::unique_ptr<NearestNeighborIndex> _index;
+ TypedCellsComparator _comp;
+ bool tensor_is_unchanged(DocId docid, const vespalib::eval::Value& new_tensor) const;
void internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor);
void consider_remove_from_index(DocId docid);
vespalib::MemoryUsage update_stat() override;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index e82f31df38e..2ee1b268449 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -436,7 +436,7 @@ HnswIndex::prepare_add_document(uint32_t docid,
if (max_nodes < _cfg.min_size_before_two_phase()) {
// the first documents added will do all work in write thread
// to ensure they are linked together:
- return std::unique_ptr<PrepareResult>();
+ return std::make_unique<PreparedFirstAddDoc>();
}
PreparedAddDoc op = internal_prepare_add(docid, vector, std::move(read_guard));
return std::make_unique<PreparedAddDoc>(std::move(op));
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index 72a10724ff1..3f5a9d514ed 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -152,6 +152,8 @@ protected:
const BitVector *filter, uint32_t explore_k,
double distance_threshold) const;
+ struct PreparedFirstAddDoc : public PrepareResult {};
+
struct PreparedAddDoc : public PrepareResult {
using ReadGuard = vespalib::GenerationHandler::Guard;
uint32_t docid;
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
index add5184c4eb..78c58e86a3b 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
@@ -135,7 +135,7 @@ TensorAttribute::addDoc(DocId &docId)
}
void
-TensorAttribute::checkTensorType(const vespalib::eval::Value &tensor)
+TensorAttribute::checkTensorType(const vespalib::eval::Value &tensor) const
{
const ValueType &fieldTensorType = getConfig().tensorType();
const ValueType &tensorType = tensor.type();
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
index ae6a4a302ea..c8aa42c6133 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
@@ -32,7 +32,7 @@ protected:
template <typename RefType>
void doCompactWorst();
- void checkTensorType(const vespalib::eval::Value &tensor);
+ void checkTensorType(const vespalib::eval::Value &tensor) const;
void setTensorRef(DocId docId, EntryRef ref);
virtual vespalib::MemoryUsage update_stat();
virtual vespalib::MemoryUsage memory_usage() const;
diff --git a/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h b/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h
new file mode 100644
index 00000000000..d1c890be961
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h
@@ -0,0 +1,30 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/cell_type.h>
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/eval/eval/value_type.h>
+#include <cstring>
+
+namespace search::tensor {
+
+/**
+ * Comparator used to compare two vespalib::eval::TypedCells instances.
+ *
+ * The caller must first validate that they are of the same vespalib::eval::ValueType.
+ */
+class TypedCellsComparator {
+private:
+ size_t _mem_size;
+
+public:
+ TypedCellsComparator(const vespalib::eval::ValueType& type)
+ : _mem_size(vespalib::eval::CellTypeUtils::mem_size(type.cell_type(), type.dense_subspace_size()))
+ {}
+ bool equals(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const {
+ return std::memcmp(lhs.data, rhs.data, _mem_size) == 0;
+ }
+};
+
+}