diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-02-20 12:57:06 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2020-02-20 12:57:06 +0000 |
commit | fd74211c61afbdac5cec8b7fa9956f53d909b8c4 (patch) | |
tree | 29834fde87b80d3f5df8bb1ee76a8b9a98b2a17c /searchlib | |
parent | a04b22c8b07d9d03d374c1732126f4502a981319 (diff) |
Update nearest neighbor index when tensor values change.
Diffstat (limited to 'searchlib')
3 files changed, 187 insertions, 11 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index 69d9e3c8393..88118ca4204 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -6,12 +6,17 @@ #include <vespa/eval/tensor/tensor.h> #include <vespa/fastos/file.h> #include <vespa/searchlib/attribute/attributeguard.h> +#include <vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h> #include <vespa/searchlib/tensor/dense_tensor_attribute.h> +#include <vespa/searchlib/tensor/doc_vector_access.h> #include <vespa/searchlib/tensor/generic_tensor_attribute.h> #include <vespa/searchlib/tensor/hnsw_index.h> +#include <vespa/searchlib/tensor/nearest_neighbor_index.h> +#include <vespa/searchlib/tensor/nearest_neighbor_index_factory.h> #include <vespa/searchlib/tensor/tensor_attribute.h> #include <vespa/vespalib/data/fileheader.h> #include <vespa/vespalib/io/fileutil.h> +#include <vespa/vespalib/test/insertion_operators.h> #include <vespa/vespalib/testkit/test_kit.h> #include <vespa/log/log.h> @@ -21,9 +26,13 @@ using document::WrongTensorTypeException; using search::AttributeGuard; using search::AttributeVector; using search::attribute::HnswIndexParams; +using search::tensor::DefaultNearestNeighborIndexFactory; using search::tensor::DenseTensorAttribute; +using search::tensor::DocVectorAccess; using search::tensor::GenericTensorAttribute; using search::tensor::HnswIndex; +using search::tensor::NearestNeighborIndex; +using search::tensor::NearestNeighborIndexFactory; using search::tensor::TensorAttribute; using vespalib::eval::TensorSpec; using vespalib::eval::ValueType; @@ -31,6 +40,8 @@ using vespalib::tensor::DefaultTensorEngine; using vespalib::tensor::DenseTensor; using vespalib::tensor::Tensor; +using DoubleVector = std::vector<double>; + namespace vespalib::tensor { static bool operator==(const Tensor &lhs, const Tensor &rhs) @@ -42,6 +53,7 @@ static bool operator==(const Tensor &lhs, const Tensor &rhs) vespalib::string sparseSpec("tensor(x{},y{})"); vespalib::string denseSpec("tensor(x[2],y[3])"); +vespalib::string vec_2d_spec("tensor(x[2])"); Tensor::UP createTensor(const TensorSpec &spec) { auto value = DefaultTensorEngine::ref().from_spec(spec); @@ -54,6 +66,78 @@ Tensor::UP createTensor(const TensorSpec &spec) { return Tensor::UP(tensor); } +TensorSpec +vec_2d(double x0, double x1) +{ + return TensorSpec(vec_2d_spec).add({{"x", 0}}, x0).add({{"x", 1}}, x1); +} + +class MockNearestNeighborIndex : public NearestNeighborIndex { +private: + using Entry = std::pair<uint32_t, DoubleVector>; + using EntryVector = std::vector<Entry>; + + const DocVectorAccess& _vectors; + EntryVector _adds; + EntryVector _removes; + +public: + MockNearestNeighborIndex(const DocVectorAccess& vectors) + : _vectors(vectors), + _adds(), + _removes() + { + } + void clear() { + _adds.clear(); + _removes.clear(); + } + void expect_empty_add() const { + EXPECT_TRUE(_adds.empty()); + } + void expect_add(uint32_t exp_docid, const DoubleVector& exp_vector) const { + EXPECT_EQUAL(1u, _adds.size()); + EXPECT_EQUAL(exp_docid, _adds.back().first); + EXPECT_EQUAL(exp_vector, _adds.back().second); + } + void expect_adds(const EntryVector &exp_adds) const { + EXPECT_EQUAL(exp_adds, _adds); + } + void expect_empty_remove() const { + EXPECT_TRUE(_removes.empty()); + } + void expect_remove(uint32_t exp_docid, const DoubleVector& exp_vector) const { + EXPECT_EQUAL(1u, _removes.size()); + EXPECT_EQUAL(exp_docid, _removes.back().first); + EXPECT_EQUAL(exp_vector, _removes.back().second); + } + void add_document(uint32_t docid) override { + auto vector = _vectors.get_vector(docid).typify<double>(); + _adds.emplace_back(docid, DoubleVector(vector.begin(), vector.end())); + } + void remove_document(uint32_t docid) override { + auto vector = _vectors.get_vector(docid).typify<double>(); + _removes.emplace_back(docid, DoubleVector(vector.begin(), vector.end())); + } + std::vector<uint32_t> find_top_k(uint32_t k, vespalib::tensor::TypedCells vector, uint32_t explore_k) override { + (void) k; + (void) vector; + (void) explore_k; + return std::vector<uint32_t>(); + } +}; + +class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory { + + std::unique_ptr<NearestNeighborIndex> make(const DocVectorAccess& vectors, + ValueType::CellType cell_type, + const search::attribute::HnswIndexParams& params) const override { + (void) params; + assert(cell_type == ValueType::CellType::DOUBLE); + return std::make_unique<MockNearestNeighborIndex>(vectors); + } +}; + struct Fixture { using BasicType = search::attribute::BasicType; @@ -63,6 +147,7 @@ struct Fixture Config _cfg; vespalib::string _name; vespalib::string _typeSpec; + std::unique_ptr<NearestNeighborIndexFactory> _index_factory; std::shared_ptr<TensorAttribute> _tensorAttr; std::shared_ptr<AttributeVector> _attr; bool _denseTensors; @@ -70,10 +155,12 @@ struct Fixture Fixture(const vespalib::string &typeSpec, bool useDenseTensorAttribute = false, - bool enable_hnsw_index = false) + bool enable_hnsw_index = false, + bool use_mock_index = false) : _cfg(BasicType::TENSOR, CollectionType::SINGLE), _name("test"), _typeSpec(typeSpec), + _index_factory(std::make_unique<DefaultNearestNeighborIndexFactory>()), _tensorAttr(), _attr(), _denseTensors(false), @@ -85,16 +172,20 @@ struct Fixture } if (enable_hnsw_index) { _cfg.set_hnsw_index_params(HnswIndexParams(4, 20)); + if (use_mock_index) { + _index_factory = std::make_unique<MockNearestNeighborIndexFactory>(); + } } _tensorAttr = makeAttr(); _attr = _tensorAttr; _attr->addReservedDoc(); } + ~Fixture() {} std::shared_ptr<TensorAttribute> makeAttr() { if (_useDenseTensorAttribute) { assert(_denseTensors); - return std::make_shared<DenseTensorAttribute>(_name, _cfg); + return std::make_shared<DenseTensorAttribute>(_name, _cfg, *_index_factory); } else { return std::make_shared<GenericTensorAttribute>(_name, _cfg); } @@ -106,6 +197,13 @@ struct Fixture return *result; } + MockNearestNeighborIndex& mock_index() { + assert(as_dense_tensor().nearest_neighbor_index() != nullptr); + auto mock_index = dynamic_cast<const MockNearestNeighborIndex*>(as_dense_tensor().nearest_neighbor_index()); + assert(mock_index != nullptr); + return *const_cast<MockNearestNeighborIndex*>(mock_index); + } + void ensureSpace(uint32_t docId) { while (_attr->getNumDocs() <= docId) { uint32_t newDocId = 0u; @@ -120,6 +218,10 @@ struct Fixture _attr->commit(); } + void set_tensor(uint32_t docid, const TensorSpec &spec) { + setTensor(docid, *createTensor(spec)); + } + void setTensor(uint32_t docId, const Tensor &tensor) { ensureSpace(docId); _tensorAttr->setTensor(docId, tensor); @@ -370,14 +472,14 @@ TEST("Test dense tensors with dense tensor attribute") } TEST_F("Hnsw index is NOT instantiated in dense tensor attribute by default", - Fixture("tensor(x[2])", true, false)) + Fixture(vec_2d_spec, true, false)) { const auto& tensor = f.as_dense_tensor(); EXPECT_TRUE(tensor.nearest_neighbor_index() == nullptr); } TEST_F("Hnsw index is instantiated in dense tensor attribute when specified in config", - Fixture("tensor(x[2])", true, true)) + Fixture(vec_2d_spec, true, true)) { const auto& tensor = f.as_dense_tensor(); ASSERT_TRUE(tensor.nearest_neighbor_index() != nullptr); @@ -391,4 +493,51 @@ TEST_F("Hnsw index is instantiated in dense tensor attribute when specified in c EXPECT_TRUE(cfg.heuristic_select_neighbors()); } +class DenseTensorAttributeMockIndex : public Fixture { +public: + DenseTensorAttributeMockIndex() : Fixture(vec_2d_spec, true, true, true) {} +}; + +TEST_F("setTensor() updates nearest neighbor index", DenseTensorAttributeMockIndex) +{ + auto& index = f.mock_index(); + + f.set_tensor(1, vec_2d(3, 5)); + index.expect_add(1, {3, 5}); + index.expect_empty_remove(); + index.clear(); + + // Replaces previous value. + f.set_tensor(1, vec_2d(7, 9)); + index.expect_remove(1, {3, 5}); + index.expect_add(1, {7, 9}); +} + +TEST_F("clearDoc() updates nearest neighbor index", DenseTensorAttributeMockIndex) +{ + auto& index = f.mock_index(); + + // Nothing to clear. + f.clearTensor(1); + index.expect_empty_remove(); + index.expect_empty_add(); + + // Clears previous value. + f.set_tensor(1, vec_2d(3, 5)); + index.clear(); + f.clearTensor(1); + index.expect_remove(1, {3, 5}); + index.expect_empty_add(); +} + +TEST_F("onLoad() updates nearest neighbor index", DenseTensorAttributeMockIndex) +{ + f.set_tensor(1, vec_2d(3, 5)); + f.set_tensor(2, vec_2d(7, 9)); + f.save(); + f.load(); + auto& index = f.mock_index(); + index.expect_adds({{1, {3, 5}}, {2, {7, 9}}}); +} + TEST_MAIN() { TEST_RUN_ALL(); vespalib::unlink("test.dat"); } diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp index d4786cc2e1a..171340e07f1 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp @@ -56,6 +56,14 @@ TensorReader::is_present() { } +void +DenseTensorAttribute::consider_remove_from_index(DocId docid) +{ + if (_index && _refVector[docid].valid()) { + _index->remove_document(docid); + } +} + DenseTensorAttribute::DenseTensorAttribute(vespalib::stringref baseFileName, const Config& cfg, const NearestNeighborIndexFactory& index_factory) : TensorAttribute(baseFileName, cfg, _denseTensorStore), @@ -74,12 +82,23 @@ DenseTensorAttribute::~DenseTensorAttribute() _tensorStore.clearHoldLists(); } +uint32_t +DenseTensorAttribute::clearDoc(DocId docId) +{ + consider_remove_from_index(docId); + return TensorAttribute::clearDoc(docId); +} + void DenseTensorAttribute::setTensor(DocId docId, const Tensor &tensor) { checkTensorType(tensor); + consider_remove_from_index(docId); EntryRef ref = _denseTensorStore.setTensor(tensor); setTensorRef(docId, ref); + if (_index) { + _index->add_document(docId); + } } @@ -125,6 +144,11 @@ DenseTensorAttribute::onLoad() auto raw = _denseTensorStore.allocRawBuffer(); tensorReader.readTensor(raw.data, _denseTensorStore.getBufSize()); _refVector.push_back(raw.ref); + if (_index) { + // This ensures that get_vector() (via getTensor()) is able to find the newly added tensor. + setCommittedDocIdLimit(lid + 1); + _index->add_document(lid); + } } else { _refVector.push_back(EntryRef()); } diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h index 74c93aca212..f9a8a81b56b 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h @@ -23,18 +23,21 @@ private: DenseTensorStore _denseTensorStore; std::unique_ptr<NearestNeighborIndex> _index; + void consider_remove_from_index(DocId docid); + public: DenseTensorAttribute(vespalib::stringref baseFileName, const Config& cfg, const NearestNeighborIndexFactory& index_factory = DefaultNearestNeighborIndexFactory()); virtual ~DenseTensorAttribute(); // Implements TensorAttribute - virtual void setTensor(DocId docId, const Tensor &tensor) override; - virtual std::unique_ptr<Tensor> getTensor(DocId docId) const override; - virtual void getTensor(DocId docId, vespalib::tensor::MutableDenseTensorView &tensor) const override; - virtual bool onLoad() override; - virtual std::unique_ptr<AttributeSaver> onInitSave(vespalib::stringref fileName) override; - virtual void compactWorst() override; - virtual uint32_t getVersion() const override; + uint32_t clearDoc(DocId docId) override; + void setTensor(DocId docId, const Tensor &tensor) override; + std::unique_ptr<Tensor> getTensor(DocId docId) const override; + void getTensor(DocId docId, vespalib::tensor::MutableDenseTensorView &tensor) const override; + bool onLoad() override; + std::unique_ptr<AttributeSaver> onInitSave(vespalib::stringref fileName) override; + void compactWorst() override; + uint32_t getVersion() const override; // Implements DocVectorAccess vespalib::tensor::TypedCells get_vector(uint32_t docid) const override; |