aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2022-06-10 13:39:38 +0000
committerGeir Storli <geirst@yahooinc.com>2022-06-10 13:42:51 +0000
commit81d0a79d43138ab186fc4bcfe380aed9cc18402f (patch)
treea7211a1682c6683b6a3c6157f75906c024e54cd3 /searchlib
parent7eb6bd3486b8e64861edc4915b929695c45c7cec (diff)
Don't make changes to the nearest neighbor index (hnsw) when the inserted tensor is unchanged.
With this optimization we avoid doing unnecessary costly work, first removing the vector point, then inserting the same point.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp30
-rw-r--r--searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp26
-rw-r--r--searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hnsw_index.h2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/tensor_attribute.h2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h30
8 files changed, 90 insertions, 7 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index ee1680bbd9a..5ddbb6d0a11 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -157,6 +157,12 @@ public:
void expect_empty_add() const {
EXPECT_TRUE(_adds.empty());
}
+ void expect_empty_prepare_add() const {
+ EXPECT_TRUE(_prepare_adds.empty());
+ }
+ void expect_empty_complete_add() const {
+ EXPECT_TRUE(_complete_adds.empty());
+ }
void expect_entry(uint32_t exp_docid, const DoubleVector& exp_vector, const EntryVector& entries) const {
EXPECT_EQUAL(1u, entries.size());
EXPECT_EQUAL(exp_docid, entries.back().first);
@@ -881,6 +887,30 @@ TEST_F("nearest neighbor index can be updated in two phases", DenseTensorAttribu
}
}
+TEST_F("nearest neighbor index is NOT updated when tensor value is unchanged", DenseTensorAttributeMockIndex)
+{
+ auto& index = f.mock_index();
+ {
+ auto vec_a = vec_2d(3, 5);
+ auto prepare_result = f.prepare_set_tensor(1, vec_a);
+ index.expect_prepare_add(1, {3, 5});
+ f.complete_set_tensor(1, vec_a, std::move(prepare_result));
+ f.assertGetTensor(vec_a, 1);
+ index.expect_complete_add(1, {3, 5});
+ }
+ index.clear();
+ {
+ // Replaces previous value with the same value
+ auto vec_b = vec_2d(3, 5);
+ auto prepare_result = f.prepare_set_tensor(1, vec_b);
+ EXPECT_TRUE(prepare_result.get() == nullptr);
+ index.expect_empty_prepare_add();
+ f.complete_set_tensor(1, vec_b, std::move(prepare_result));
+ f.assertGetTensor(vec_b, 1);
+ index.expect_empty_complete_add();
+ }
+}
+
TEST_F("clearDoc() updates nearest neighbor index", DenseTensorAttributeMockIndex)
{
auto& index = f.mock_index();
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
index 2fdb73fcf96..c713b3ef335 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
@@ -8,9 +8,9 @@
#include "tensor_attribute.hpp"
#include <vespa/eval/eval/value.h>
#include <vespa/fastlib/io/bufferedfile.h>
+#include <vespa/searchcommon/attribute/config.h>
#include <vespa/searchlib/attribute/load_utils.h>
#include <vespa/searchlib/attribute/readerbase.h>
-#include <vespa/searchcommon/attribute/config.h>
#include <vespa/vespalib/data/slime/inserter.h>
#include <vespa/vespalib/util/cpu_usage.h>
#include <vespa/vespalib/util/lambdatask.h>
@@ -102,10 +102,16 @@ BlobSequenceReader::is_present() {
}
+bool
+DenseTensorAttribute::tensor_is_unchanged(DocId docid, const vespalib::eval::Value& new_tensor) const
+{
+ auto old_tensor = extract_cells_ref(docid);
+ return _comp.equals(old_tensor, new_tensor.cells());
+}
+
void
DenseTensorAttribute::internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor)
{
- checkTensorType(tensor);
consider_remove_from_index(docid);
EntryRef ref = _denseTensorStore.setTensor(tensor);
setTensorRef(docid, ref);
@@ -152,7 +158,8 @@ DenseTensorAttribute::DenseTensorAttribute(vespalib::stringref baseFileName, con
const NearestNeighborIndexFactory& index_factory)
: TensorAttribute(baseFileName, cfg, _denseTensorStore),
_denseTensorStore(cfg.tensorType(), get_memory_allocator()),
- _index()
+ _index(),
+ _comp(cfg.tensorType())
{
if (cfg.hnsw_index_params().has_value()) {
auto tensor_type = cfg.tensorType();
@@ -180,6 +187,7 @@ DenseTensorAttribute::clearDoc(DocId docId)
void
DenseTensorAttribute::setTensor(DocId docId, const vespalib::eval::Value &tensor)
{
+ checkTensorType(tensor);
internal_set_tensor(docId, tensor);
if (_index) {
_index->add_document(docId);
@@ -189,16 +197,26 @@ DenseTensorAttribute::setTensor(DocId docId, const vespalib::eval::Value &tensor
std::unique_ptr<PrepareResult>
DenseTensorAttribute::prepare_set_tensor(DocId docid, const vespalib::eval::Value& tensor) const
{
+ checkTensorType(tensor);
if (_index) {
+ if (tensor_is_unchanged(docid, tensor)) {
+ // Don't make changes to the nearest neighbor index when the inserted tensor is unchanged.
+ // With this optimization we avoid doing unnecessary costly work, first removing the vector point, then inserting the same point.
+ return {};
+ }
return _index->prepare_add_document(docid, tensor.cells(), getGenerationHandler().takeGuard());
}
- return std::unique_ptr<PrepareResult>();
+ return {};
}
void
DenseTensorAttribute::complete_set_tensor(DocId docid, const vespalib::eval::Value& tensor,
std::unique_ptr<PrepareResult> prepare_result)
{
+ if (_index && !prepare_result) {
+ // The tensor is unchanged.
+ return;
+ }
internal_set_tensor(docid, tensor);
if (_index) {
_index->complete_add_document(docid, std::move(prepare_result));
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
index da7a88af1be..1138a4f4433 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h
@@ -6,6 +6,7 @@
#include "dense_tensor_store.h"
#include "doc_vector_access.h"
#include "tensor_attribute.h"
+#include "typed_cells_comparator.h"
#include <memory>
namespace search::tensor {
@@ -20,7 +21,9 @@ class DenseTensorAttribute : public TensorAttribute, public DocVectorAccess {
private:
DenseTensorStore _denseTensorStore;
std::unique_ptr<NearestNeighborIndex> _index;
+ TypedCellsComparator _comp;
+ bool tensor_is_unchanged(DocId docid, const vespalib::eval::Value& new_tensor) const;
void internal_set_tensor(DocId docid, const vespalib::eval::Value& tensor);
void consider_remove_from_index(DocId docid);
vespalib::MemoryUsage update_stat() override;
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
index 24bfacb8c5a..994c5aa142d 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp
@@ -436,7 +436,7 @@ HnswIndex::prepare_add_document(uint32_t docid,
if (max_nodes < _cfg.min_size_before_two_phase()) {
// the first documents added will do all work in write thread
// to ensure they are linked together:
- return std::unique_ptr<PrepareResult>();
+ return std::make_unique<PreparedFirstAddDoc>();
}
PreparedAddDoc op = internal_prepare_add(docid, vector, std::move(read_guard));
return std::make_unique<PreparedAddDoc>(std::move(op));
diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
index 72a10724ff1..3f5a9d514ed 100644
--- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h
@@ -152,6 +152,8 @@ protected:
const BitVector *filter, uint32_t explore_k,
double distance_threshold) const;
+ struct PreparedFirstAddDoc : public PrepareResult {};
+
struct PreparedAddDoc : public PrepareResult {
using ReadGuard = vespalib::GenerationHandler::Guard;
uint32_t docid;
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
index add5184c4eb..78c58e86a3b 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp
@@ -135,7 +135,7 @@ TensorAttribute::addDoc(DocId &docId)
}
void
-TensorAttribute::checkTensorType(const vespalib::eval::Value &tensor)
+TensorAttribute::checkTensorType(const vespalib::eval::Value &tensor) const
{
const ValueType &fieldTensorType = getConfig().tensorType();
const ValueType &tensorType = tensor.type();
diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
index ae6a4a302ea..c8aa42c6133 100644
--- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
+++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h
@@ -32,7 +32,7 @@ protected:
template <typename RefType>
void doCompactWorst();
- void checkTensorType(const vespalib::eval::Value &tensor);
+ void checkTensorType(const vespalib::eval::Value &tensor) const;
void setTensorRef(DocId docId, EntryRef ref);
virtual vespalib::MemoryUsage update_stat();
virtual vespalib::MemoryUsage memory_usage() const;
diff --git a/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h b/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h
new file mode 100644
index 00000000000..d1c890be961
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/typed_cells_comparator.h
@@ -0,0 +1,30 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/cell_type.h>
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/eval/eval/value_type.h>
+#include <cstring>
+
+namespace search::tensor {
+
+/**
+ * Comparator used to compare two vespalib::eval::TypedCells instances.
+ *
+ * The caller must first validate that they are of the same vespalib::eval::ValueType.
+ */
+class TypedCellsComparator {
+private:
+ size_t _mem_size;
+
+public:
+ TypedCellsComparator(const vespalib::eval::ValueType& type)
+ : _mem_size(vespalib::eval::CellTypeUtils::mem_size(type.cell_type(), type.dense_subspace_size()))
+ {}
+ bool equals(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const {
+ return std::memcmp(lhs.data, rhs.data, _mem_size) == 0;
+ }
+};
+
+}