diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-03-31 11:15:26 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2020-03-31 12:19:38 +0000 |
commit | 1fdb33e3b404ab21a11b5da337667797b795a77f (patch) | |
tree | 0842dd46f9bdb4cad7be44fc16b49dfd0d6adbe9 /searchlib | |
parent | 451173e78f50c4db14f0def7a12eb9881720b94a (diff) |
Implement saving and loading of nearest neighbor index.
Diffstat (limited to 'searchlib')
9 files changed, 168 insertions, 66 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt b/searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt index 3794fd88fc3..44ff45d02d3 100644 --- a/searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt +++ b/searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt @@ -5,5 +5,4 @@ vespa_add_executable(searchlib_tensorattribute_test_app TEST DEPENDS searchlib ) -vespa_add_test(NAME searchlib_tensorattribute_test_app COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/tensorattribute_test.sh - DEPENDS searchlib_tensorattribute_test_app) +vespa_add_test(NAME searchlib_tensorattribute_test_app COMMAND searchlib_tensorattribute_test_app) diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index 00450eab21a..12256423a8d 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -16,10 +16,13 @@ #include <vespa/searchlib/tensor/nearest_neighbor_index_factory.h> #include <vespa/searchlib/tensor/nearest_neighbor_index_saver.h> #include <vespa/searchlib/tensor/tensor_attribute.h> +#include <vespa/searchlib/test/directory_handler.h> +#include <vespa/searchlib/util/fileutil.h> #include <vespa/vespalib/data/fileheader.h> #include <vespa/vespalib/io/fileutil.h> #include <vespa/vespalib/test/insertion_operators.h> #include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/vespalib/util/bufferwriter.h> #include <vespa/log/log.h> LOG_SETUP("tensorattribute_test"); @@ -77,6 +80,18 @@ vec_2d(double x0, double x1) return TensorSpec(vec_2d_spec).add({{"x", 0}}, x0).add({{"x", 1}}, x1); } +class MockIndexSaver : public NearestNeighborIndexSaver { +private: + int _index_value; + +public: + MockIndexSaver(int index_value) : _index_value(index_value) {} + void save(search::BufferWriter& writer) const override { + writer.write(&_index_value, sizeof(int)); + writer.flush(); + } +}; + class MockNearestNeighborIndex : public NearestNeighborIndex { private: using Entry = std::pair<uint32_t, DoubleVector>; @@ -88,6 +103,7 @@ private: generation_t _transfer_gen; generation_t _trim_gen; mutable size_t _memory_usage_cnt; + int _index_value; public: MockNearestNeighborIndex(const DocVectorAccess& vectors) @@ -96,13 +112,20 @@ public: _removes(), _transfer_gen(std::numeric_limits<generation_t>::max()), _trim_gen(std::numeric_limits<generation_t>::max()), - _memory_usage_cnt(0) + _memory_usage_cnt(0), + _index_value(0) { } void clear() { _adds.clear(); _removes.clear(); } + int get_index_value() const { + return _index_value; + } + void save_index_with_value(int value) { + _index_value = value; + } void expect_empty_add() const { EXPECT_TRUE(_adds.empty()); } @@ -146,9 +169,16 @@ public: } void get_state(const vespalib::slime::Inserter&) const override {} std::unique_ptr<NearestNeighborIndexSaver> make_saver() const override { + if (_index_value != 0) { + return std::make_unique<MockIndexSaver>(_index_value); + } return std::unique_ptr<NearestNeighborIndexSaver>(); } - bool load(const search::fileutil::LoadedBuffer&) override { return false; } + bool load(const search::fileutil::LoadedBuffer& buf) override { + ASSERT_EQUAL(sizeof(int), buf.size()); + _index_value = (reinterpret_cast<const int*>(buf.buffer()))[0]; + return true; + } std::vector<Neighbor> find_top_k(uint32_t k, vespalib::tensor::TypedCells vector, uint32_t explore_k) const override { (void) k; (void) vector; @@ -172,12 +202,15 @@ class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory { } }; -struct Fixture -{ +const vespalib::string test_dir = "test_data/"; +const vespalib::string attr_name = test_dir + "my_attr"; + +struct Fixture { using BasicType = search::attribute::BasicType; using CollectionType = search::attribute::CollectionType; using Config = search::attribute::Config; + search::test::DirectoryHandler _dir_handler; Config _cfg; vespalib::string _name; vespalib::string _typeSpec; @@ -191,8 +224,9 @@ struct Fixture bool useDenseTensorAttribute = false, bool enable_hnsw_index = false, bool use_mock_index = false) - : _cfg(BasicType::TENSOR, CollectionType::SINGLE), - _name("test"), + : _dir_handler(test_dir), + _cfg(BasicType::TENSOR, CollectionType::SINGLE), + _name(attr_name), _typeSpec(typeSpec), _index_factory(std::make_unique<DefaultNearestNeighborIndexFactory>()), _tensorAttr(), @@ -328,7 +362,6 @@ struct Fixture void testEmptyTensor(); }; - void Fixture::testEmptyAttribute() { @@ -389,7 +422,6 @@ Fixture::testSaveLoad() TEST_DO(assertGetNoTensor(2)); } - void Fixture::testCompaction() { @@ -444,7 +476,8 @@ Fixture::testTensorTypeFileHeaderTag() vespalib::FileHeader header; FastOS_File file; - EXPECT_TRUE(file.OpenReadOnly("test.dat")); + vespalib::string file_name = attr_name + ".dat"; + EXPECT_TRUE(file.OpenReadOnly(file_name.c_str())); (void) header.readFile(file); file.Close(); EXPECT_TRUE(header.hasTag("tensortype")); @@ -456,7 +489,6 @@ Fixture::testTensorTypeFileHeaderTag() } } - void Fixture::testEmptyTensor() { @@ -471,7 +503,6 @@ Fixture::testEmptyTensor() } } - template <class MakeFixture> void testAll(MakeFixture &&f) { @@ -557,17 +588,6 @@ TEST_F("clearDoc() updates nearest neighbor index", DenseTensorAttributeMockInde index.expect_empty_add(); } -TEST_F("onLoad() updates nearest neighbor index", DenseTensorAttributeMockIndex) -{ - f.set_tensor(1, vec_2d(3, 5)); - f.set_tensor(2, vec_2d(7, 9)); - f.save(); - f.load(); - auto& index = f.mock_index(); - index.expect_adds({{1, {3, 5}}, {2, {7, 9}}}); -} - - TEST_F("commit() ensures transfer and trim hold lists on nearest neighbor index", DenseTensorAttributeMockIndex) { auto& index = f.mock_index(); @@ -604,4 +624,32 @@ TEST_F("Memory usage is extracted from index when updating stats on attribute", EXPECT_EQUAL(before + 1, after); } -TEST_MAIN() { TEST_RUN_ALL(); vespalib::unlink("test.dat"); } +TEST_F("Nearest neighbor index can be saved to disk and then loaded from file", DenseTensorAttributeMockIndex) +{ + f.set_tensor(1, vec_2d(3, 5)); + f.set_tensor(2, vec_2d(7, 9)); + f.mock_index().save_index_with_value(123); + f.save(); + EXPECT_TRUE(vespalib::fileExists(attr_name + ".nnidx")); + + f.load(); // index is loaded from saved file + auto& index = f.mock_index(); + EXPECT_EQUAL(123, index.get_index_value()); + index.expect_adds({}); +} + +TEST_F("onLoad() reconstructs nearest neighbor index if save file does not exists", DenseTensorAttributeMockIndex) +{ + f.set_tensor(1, vec_2d(3, 5)); + f.set_tensor(2, vec_2d(7, 9)); + f.save(); + EXPECT_FALSE(vespalib::fileExists(attr_name + ".nnidx")); + + f.load(); // index is reconstructed by adding all loaded tensors + auto& index = f.mock_index(); + EXPECT_EQUAL(0, index.get_index_value()); + index.expect_adds({{1, {3, 5}}, {2, {7, 9}}}); +} + +TEST_MAIN() { TEST_RUN_ALL(); } + diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.sh b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.sh deleted file mode 100755 index dd9399dea78..00000000000 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -set -e -$VALGRIND ./searchlib_tensorattribute_test_app -rm -rf *.dat diff --git a/searchlib/src/vespa/searchlib/attribute/load_utils.cpp b/searchlib/src/vespa/searchlib/attribute/load_utils.cpp index b379edc49db..701c8eaf702 100644 --- a/searchlib/src/vespa/searchlib/attribute/load_utils.cpp +++ b/searchlib/src/vespa/searchlib/attribute/load_utils.cpp @@ -7,6 +7,7 @@ #include "multivalue.h" #include <vespa/fastos/file.h> #include <vespa/searchlib/util/fileutil.h> +#include <vespa/vespalib/io/fileutil.h> #include <vespa/vespalib/util/array.hpp> using search::multivalue::Value; @@ -23,11 +24,7 @@ LoadUtils::openFile(const AttributeVector& attr, const vespalib::string& suffix) return FileUtil::openFile(attr.getBaseFileName() + "." + suffix); } -LoadedBufferUP -LoadUtils::loadFile(const AttributeVector& attr, const vespalib::string& suffix) -{ - return FileUtil::loadFile(attr.getBaseFileName() + "." + suffix); -} + FileInterfaceUP LoadUtils::openDAT(const AttributeVector& attr) @@ -47,6 +44,18 @@ LoadUtils::openWeight(const AttributeVector& attr) return openFile(attr, "weight"); } +bool +LoadUtils::file_exists(const AttributeVector& attr, const vespalib::string& suffix) +{ + return vespalib::fileExists(attr.getBaseFileName() + "." + suffix); +} + +LoadedBufferUP +LoadUtils::loadFile(const AttributeVector& attr, const vespalib::string& suffix) +{ + return FileUtil::loadFile(attr.getBaseFileName() + "." + suffix); +} + LoadedBufferUP LoadUtils::loadDAT(const AttributeVector& attr) { diff --git a/searchlib/src/vespa/searchlib/attribute/load_utils.h b/searchlib/src/vespa/searchlib/attribute/load_utils.h index 41c24e5a099..cd9d98084d5 100644 --- a/searchlib/src/vespa/searchlib/attribute/load_utils.h +++ b/searchlib/src/vespa/searchlib/attribute/load_utils.h @@ -18,13 +18,15 @@ public: private: static FileInterfaceUP openFile(const AttributeVector& attr, const vespalib::string& suffix); - static LoadedBufferUP loadFile(const AttributeVector& attr, const vespalib::string& suffix); public: static FileInterfaceUP openDAT(const AttributeVector& attr); static FileInterfaceUP openIDX(const AttributeVector& attr); static FileInterfaceUP openWeight(const AttributeVector& attr); + static bool file_exists(const AttributeVector& attr, const vespalib::string& suffix); + static LoadedBufferUP loadFile(const AttributeVector& attr, const vespalib::string& suffix); + static LoadedBufferUP loadDAT(const AttributeVector& attr); static LoadedBufferUP loadIDX(const AttributeVector& attr); static LoadedBufferUP loadWeight(const AttributeVector& attr); diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp index 627f7f0dfa9..68ce0c1bb00 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp @@ -3,16 +3,19 @@ #include "dense_tensor_attribute.h" #include "dense_tensor_attribute_saver.h" #include "nearest_neighbor_index.h" +#include "nearest_neighbor_index_saver.h" #include "tensor_attribute.hpp" #include <vespa/eval/tensor/dense/mutable_dense_tensor_view.h> #include <vespa/eval/tensor/tensor.h> #include <vespa/fastlib/io/bufferedfile.h> +#include <vespa/searchlib/attribute/load_utils.h> #include <vespa/searchlib/attribute/readerbase.h> #include <vespa/vespalib/data/slime/inserter.h> #include <vespa/log/log.h> LOG_SETUP(".searchlib.tensor.dense_tensor_attribute"); +using search::attribute::LoadUtils; using vespalib::eval::ValueType; using vespalib::slime::ObjectInserter; using vespalib::tensor::MutableDenseTensorView; @@ -148,6 +151,8 @@ DenseTensorAttribute::onLoad() if (!tensorReader.hasData()) { return false; } + bool has_index_file = LoadUtils::file_exists(*this, DenseTensorAttributeSaver::index_file_suffix()); + setCreateSerialNum(tensorReader.getCreateSerialNum()); assert(tensorReader.getVersion() == DENSE_TENSOR_ATTRIBUTE_VERSION); assert(getConfig().tensorType().to_spec() == @@ -160,7 +165,7 @@ DenseTensorAttribute::onLoad() auto raw = _denseTensorStore.allocRawBuffer(); tensorReader.readTensor(raw.data, _denseTensorStore.getBufSize()); _refVector.push_back(raw.ref); - if (_index) { + if (_index && !has_index_file) { // This ensures that get_vector() (via getTensor()) is able to find the newly added tensor. setCommittedDocIdLimit(lid + 1); _index->add_document(lid); @@ -171,6 +176,12 @@ DenseTensorAttribute::onLoad() } setNumDocs(numDocs); setCommittedDocIdLimit(numDocs); + if (_index && has_index_file) { + auto buffer = LoadUtils::loadFile(*this, DenseTensorAttributeSaver::index_file_suffix()); + if (!_index->load(*buffer)) { + return false; + } + } return true; } @@ -180,11 +191,13 @@ DenseTensorAttribute::onInitSave(vespalib::stringref fileName) { vespalib::GenerationHandler::Guard guard(getGenerationHandler(). takeGuard()); + auto index_saver = (_index ? _index->make_saver() : std::unique_ptr<NearestNeighborIndexSaver>()); return std::make_unique<DenseTensorAttributeSaver> (std::move(guard), this->createAttributeHeader(fileName), getRefCopy(), - _denseTensorStore); + _denseTensorStore, + std::move(index_saver)); } void diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp index d78adab81b5..fd8d6162f01 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp @@ -1,20 +1,19 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "dense_tensor_attribute_saver.h" -#include <vespa/vespalib/util/bufferwriter.h> #include "dense_tensor_store.h" +#include "nearest_neighbor_index_saver.h" +#include <vespa/vespalib/util/bufferwriter.h> #include <vespa/searchlib/attribute/iattributesavetarget.h> using vespalib::GenerationHandler; -namespace search { - -namespace tensor { +namespace search::tensor { namespace { -static const uint8_t tensorIsNotPresent = 0; -static const uint8_t tensorIsPresent = 1; +constexpr uint8_t tensorIsNotPresent = 0; +constexpr uint8_t tensorIsPresent = 1; } @@ -22,42 +21,60 @@ DenseTensorAttributeSaver:: DenseTensorAttributeSaver(GenerationHandler::Guard &&guard, const attribute::AttributeHeader &header, RefCopyVector &&refs, - const DenseTensorStore &tensorStore) + const DenseTensorStore &tensorStore, + IndexSaverUP index_saver) : AttributeSaver(std::move(guard), header), _refs(std::move(refs)), - _tensorStore(tensorStore) + _tensorStore(tensorStore), + _index_saver(std::move(index_saver)) { } +DenseTensorAttributeSaver::~DenseTensorAttributeSaver() = default; -DenseTensorAttributeSaver::~DenseTensorAttributeSaver() +vespalib::string +DenseTensorAttributeSaver::index_file_suffix() { + return "nnidx"; } - bool DenseTensorAttributeSaver::onSave(IAttributeSaveTarget &saveTarget) { - std::unique_ptr<BufferWriter> - datWriter(saveTarget.datWriter().allocBufferWriter()); + if (_index_saver) { + if (!saveTarget.setup_writer(index_file_suffix(), "Binary data file for nearest neighbor index")) { + return false; + } + } + + auto dat_writer = saveTarget.datWriter().allocBufferWriter(); + save_tensor_store(*dat_writer); + + if (_index_saver) { + auto index_writer = saveTarget.get_writer(index_file_suffix()).allocBufferWriter(); + // Note: Implementation of save() is responsible to call BufferWriter::flush(). + _index_saver->save(*index_writer); + } + return true; +} + +void +DenseTensorAttributeSaver::save_tensor_store(BufferWriter& writer) const +{ const uint32_t docIdLimit(_refs.size()); const uint32_t cellSize = _tensorStore.getCellSize(); for (uint32_t lid = 0; lid < docIdLimit; ++lid) { if (_refs[lid].valid()) { auto raw = _tensorStore.getRawBuffer(_refs[lid]); - datWriter->write(&tensorIsPresent, sizeof(tensorIsPresent)); + writer.write(&tensorIsPresent, sizeof(tensorIsPresent)); size_t numCells = _tensorStore.getNumCells(); size_t rawLen = numCells * cellSize; - datWriter->write(static_cast<const char *>(raw), rawLen); + writer.write(static_cast<const char *>(raw), rawLen); } else { - datWriter->write(&tensorIsNotPresent, sizeof(tensorIsNotPresent)); + writer.write(&tensorIsNotPresent, sizeof(tensorIsNotPresent)); } } - datWriter->flush(); - return true; + writer.flush(); } - -} // namespace search::tensor - -} // namespace search +} diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h index 1f6596e82f5..895e2951cea 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h @@ -5,28 +5,41 @@ #include "tensor_attribute.h" #include <vespa/searchlib/attribute/attributesaver.h> +namespace search { class BufferWriter; } + namespace search::tensor { class DenseTensorStore; +class NearestNeighborIndexSaver; -/* - * Class for saving a tensor attribute. +/** + * Class for saving a dense tensor attribute. + * Will also save the nearest neighbor index if existing. */ -class DenseTensorAttributeSaver : public AttributeSaver -{ +class DenseTensorAttributeSaver : public AttributeSaver { public: using RefCopyVector = TensorAttribute::RefCopyVector; private: + using GenerationHandler = vespalib::GenerationHandler; + using IndexSaverUP = std::unique_ptr<NearestNeighborIndexSaver>; + RefCopyVector _refs; const DenseTensorStore &_tensorStore; - using GenerationHandler = vespalib::GenerationHandler; + IndexSaverUP _index_saver; bool onSave(IAttributeSaveTarget &saveTarget) override; + void save_tensor_store(BufferWriter& writer) const; + public: - DenseTensorAttributeSaver(GenerationHandler::Guard &&guard, const attribute::AttributeHeader &header, - RefCopyVector &&refs, const DenseTensorStore &tensorStore); + DenseTensorAttributeSaver(GenerationHandler::Guard &&guard, + const attribute::AttributeHeader &header, + RefCopyVector &&refs, + const DenseTensorStore &tensorStore, + IndexSaverUP index_saver); ~DenseTensorAttributeSaver() override; + + static vespalib::string index_file_suffix(); }; } diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h index cee48d63359..99d8960ae10 100644 --- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h @@ -21,6 +21,12 @@ namespace search::tensor { class NearestNeighborIndexSaver { public: virtual ~NearestNeighborIndexSaver() {} + + /** + * Saves the index in binary form using the given writer. + * + * It is the responsibility of the implementer to call BufferWriter::flush() at the end. + */ virtual void save(BufferWriter& writer) const = 0; }; |