aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
author: Geir Storli <geirst@verizonmedia.com> 2020-03-31 11:15:26 +0000
committer: Geir Storli <geirst@verizonmedia.com> 2020-03-31 12:19:38 +0000
commit: 1fdb33e3b404ab21a11b5da337667797b795a77f (patch)
tree: 0842dd46f9bdb4cad7be44fc16b49dfd0d6adbe9 /searchlib
parent: 451173e78f50c4db14f0def7a12eb9881720b94a (diff)
Implement saving and loading of nearest neighbor index.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt 3
-rw-r--r-- searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp 94
-rwxr-xr-x searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.sh 5
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/load_utils.cpp 19
-rw-r--r-- searchlib/src/vespa/searchlib/attribute/load_utils.h 4
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp 17
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp 59
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h 27
-rw-r--r-- searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h 6
9 files changed, 168 insertions, 66 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt b/searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt
index 3794fd88fc3..44ff45d02d3 100644
--- a/searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt
+++ b/searchlib/src/tests/attribute/tensorattribute/CMakeLists.txt
@@ -5,5 +5,4 @@ vespa_add_executable(searchlib_tensorattribute_test_app TEST
DEPENDS
searchlib
)
-vespa_add_test(NAME searchlib_tensorattribute_test_app COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/tensorattribute_test.sh
- DEPENDS searchlib_tensorattribute_test_app)
+vespa_add_test(NAME searchlib_tensorattribute_test_app COMMAND searchlib_tensorattribute_test_app)
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index 00450eab21a..12256423a8d 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -16,10 +16,13 @@
#include <vespa/searchlib/tensor/nearest_neighbor_index_factory.h>
#include <vespa/searchlib/tensor/nearest_neighbor_index_saver.h>
#include <vespa/searchlib/tensor/tensor_attribute.h>
+#include <vespa/searchlib/test/directory_handler.h>
+#include <vespa/searchlib/util/fileutil.h>
#include <vespa/vespalib/data/fileheader.h>
#include <vespa/vespalib/io/fileutil.h>
#include <vespa/vespalib/test/insertion_operators.h>
#include <vespa/vespalib/testkit/test_kit.h>
+#include <vespa/vespalib/util/bufferwriter.h>
#include <vespa/log/log.h>
LOG_SETUP("tensorattribute_test");
@@ -77,6 +80,18 @@ vec_2d(double x0, double x1)
return TensorSpec(vec_2d_spec).add({{"x", 0}}, x0).add({{"x", 1}}, x1);
}
+/**
+ * Test double for NearestNeighborIndexSaver.
+ *
+ * Writes a single int to the given writer, so that a test can check
+ * that the same value comes back after a save / load round trip.
+ */
+class MockIndexSaver : public NearestNeighborIndexSaver {
+private:
+    int _index_value;
+
+public:
+    // explicit: an int must never convert silently into a saver.
+    explicit MockIndexSaver(int index_value) : _index_value(index_value) {}
+    void save(search::BufferWriter& writer) const override {
+        // Size is taken from the member itself so it always follows the member's type.
+        writer.write(&_index_value, sizeof(_index_value));
+        // The implementation of save() is responsible for flushing the writer.
+        writer.flush();
+    }
+};
+
class MockNearestNeighborIndex : public NearestNeighborIndex {
private:
using Entry = std::pair<uint32_t, DoubleVector>;
@@ -88,6 +103,7 @@ private:
generation_t _transfer_gen;
generation_t _trim_gen;
mutable size_t _memory_usage_cnt;
+ int _index_value;
public:
MockNearestNeighborIndex(const DocVectorAccess& vectors)
@@ -96,13 +112,20 @@ public:
_removes(),
_transfer_gen(std::numeric_limits<generation_t>::max()),
_trim_gen(std::numeric_limits<generation_t>::max()),
- _memory_usage_cnt(0)
+ _memory_usage_cnt(0),
+ _index_value(0)
{
}
void clear() {
_adds.clear();
_removes.clear();
}
+ // Returns the value held by this mock index: 0 after construction,
+ // otherwise whatever save_index_with_value() or load() last stored.
+ int get_index_value() const {
+ return _index_value;
+ }
+ // Stores the value to be written on save. make_saver() only returns a
+ // MockIndexSaver when this value is non-zero.
+ void save_index_with_value(int value) {
+ _index_value = value;
+ }
void expect_empty_add() const {
EXPECT_TRUE(_adds.empty());
}
@@ -146,9 +169,16 @@ public:
}
void get_state(const vespalib::slime::Inserter&) const override {}
std::unique_ptr<NearestNeighborIndexSaver> make_saver() const override {
+ if (_index_value != 0) {
+ return std::make_unique<MockIndexSaver>(_index_value);
+ }
return std::unique_ptr<NearestNeighborIndexSaver>();
}
- bool load(const search::fileutil::LoadedBuffer&) override { return false; }
+    // Restores the index value from a buffer that must hold exactly one int.
+    bool load(const search::fileutil::LoadedBuffer& buf) override {
+        ASSERT_EQUAL(sizeof(int), buf.size());
+        const int* saved_value = reinterpret_cast<const int*>(buf.buffer());
+        _index_value = *saved_value;
+        return true;
+    }
std::vector<Neighbor> find_top_k(uint32_t k, vespalib::tensor::TypedCells vector, uint32_t explore_k) const override {
(void) k;
(void) vector;
@@ -172,12 +202,15 @@ class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory {
}
};
-struct Fixture
-{
+const vespalib::string test_dir = "test_data/";
+const vespalib::string attr_name = test_dir + "my_attr";
+
+struct Fixture {
using BasicType = search::attribute::BasicType;
using CollectionType = search::attribute::CollectionType;
using Config = search::attribute::Config;
+ search::test::DirectoryHandler _dir_handler;
Config _cfg;
vespalib::string _name;
vespalib::string _typeSpec;
@@ -191,8 +224,9 @@ struct Fixture
bool useDenseTensorAttribute = false,
bool enable_hnsw_index = false,
bool use_mock_index = false)
- : _cfg(BasicType::TENSOR, CollectionType::SINGLE),
- _name("test"),
+ : _dir_handler(test_dir),
+ _cfg(BasicType::TENSOR, CollectionType::SINGLE),
+ _name(attr_name),
_typeSpec(typeSpec),
_index_factory(std::make_unique<DefaultNearestNeighborIndexFactory>()),
_tensorAttr(),
@@ -328,7 +362,6 @@ struct Fixture
void testEmptyTensor();
};
-
void
Fixture::testEmptyAttribute()
{
@@ -389,7 +422,6 @@ Fixture::testSaveLoad()
TEST_DO(assertGetNoTensor(2));
}
-
void
Fixture::testCompaction()
{
@@ -444,7 +476,8 @@ Fixture::testTensorTypeFileHeaderTag()
vespalib::FileHeader header;
FastOS_File file;
- EXPECT_TRUE(file.OpenReadOnly("test.dat"));
+ vespalib::string file_name = attr_name + ".dat";
+ EXPECT_TRUE(file.OpenReadOnly(file_name.c_str()));
(void) header.readFile(file);
file.Close();
EXPECT_TRUE(header.hasTag("tensortype"));
@@ -456,7 +489,6 @@ Fixture::testTensorTypeFileHeaderTag()
}
}
-
void
Fixture::testEmptyTensor()
{
@@ -471,7 +503,6 @@ Fixture::testEmptyTensor()
}
}
-
template <class MakeFixture>
void testAll(MakeFixture &&f)
{
@@ -557,17 +588,6 @@ TEST_F("clearDoc() updates nearest neighbor index", DenseTensorAttributeMockInde
index.expect_empty_add();
}
-TEST_F("onLoad() updates nearest neighbor index", DenseTensorAttributeMockIndex)
-{
- f.set_tensor(1, vec_2d(3, 5));
- f.set_tensor(2, vec_2d(7, 9));
- f.save();
- f.load();
- auto& index = f.mock_index();
- index.expect_adds({{1, {3, 5}}, {2, {7, 9}}});
-}
-
-
TEST_F("commit() ensures transfer and trim hold lists on nearest neighbor index", DenseTensorAttributeMockIndex)
{
auto& index = f.mock_index();
@@ -604,4 +624,32 @@ TEST_F("Memory usage is extracted from index when updating stats on attribute",
EXPECT_EQUAL(before + 1, after);
}
-TEST_MAIN() { TEST_RUN_ALL(); vespalib::unlink("test.dat"); }
+// When the index provides a saver, save() must produce an index file and
+// load() must restore the index from that file instead of re-adding tensors.
+TEST_F("Nearest neighbor index can be saved to disk and then loaded from file", DenseTensorAttributeMockIndex)
+{
+ f.set_tensor(1, vec_2d(3, 5));
+ f.set_tensor(2, vec_2d(7, 9));
+ // A non-zero value makes the mock index hand out a saver in make_saver().
+ f.mock_index().save_index_with_value(123);
+ f.save();
+ EXPECT_TRUE(vespalib::fileExists(attr_name + ".nnidx"));
+
+ f.load(); // index is loaded from saved file
+ auto& index = f.mock_index();
+ EXPECT_EQUAL(123, index.get_index_value());
+ // No adds are expected, since the index is not rebuilt when an index file exists.
+ index.expect_adds({});
+}
+
+// When the index provides no saver, save() must not produce an index file and
+// load() must rebuild the index by adding every loaded tensor.
+TEST_F("onLoad() reconstructs nearest neighbor index if save file does not exist", DenseTensorAttributeMockIndex)
+{
+    f.set_tensor(1, vec_2d(3, 5));
+    f.set_tensor(2, vec_2d(7, 9));
+    // The index value is left at 0, so the mock index returns no saver.
+    f.save();
+    EXPECT_FALSE(vespalib::fileExists(attr_name + ".nnidx"));
+
+    f.load(); // index is reconstructed by adding all loaded tensors
+    auto& index = f.mock_index();
+    EXPECT_EQUAL(0, index.get_index_value());
+    index.expect_adds({{1, {3, 5}}, {2, {7, 9}}});
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); }
+
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.sh b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.sh
deleted file mode 100755
index dd9399dea78..00000000000
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-set -e
-$VALGRIND ./searchlib_tensorattribute_test_app
-rm -rf *.dat
diff --git a/searchlib/src/vespa/searchlib/attribute/load_utils.cpp b/searchlib/src/vespa/searchlib/attribute/load_utils.cpp
index b379edc49db..701c8eaf702 100644
--- a/searchlib/src/vespa/searchlib/attribute/load_utils.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/load_utils.cpp
@@ -7,6 +7,7 @@
#include "multivalue.h"
#include <vespa/fastos/file.h>
#include <vespa/searchlib/util/fileutil.h>
+#include <vespa/vespalib/io/fileutil.h>
#include <vespa/vespalib/util/array.hpp>
using search::multivalue::Value;
@@ -23,11 +24,7 @@ LoadUtils::openFile(const AttributeVector& attr, const vespalib::string& suffix)
return FileUtil::openFile(attr.getBaseFileName() + "." + suffix);
}
-LoadedBufferUP
-LoadUtils::loadFile(const AttributeVector& attr, const vespalib::string& suffix)
-{
- return FileUtil::loadFile(attr.getBaseFileName() + "." + suffix);
-}
+
FileInterfaceUP
LoadUtils::openDAT(const AttributeVector& attr)
@@ -47,6 +44,18 @@ LoadUtils::openWeight(const AttributeVector& attr)
return openFile(attr, "weight");
}
+// Tells whether the file "<base file name>.<suffix>" of the given attribute exists.
+bool
+LoadUtils::file_exists(const AttributeVector& attr, const vespalib::string& suffix)
+{
+    const vespalib::string file_name = attr.getBaseFileName() + "." + suffix;
+    return vespalib::fileExists(file_name);
+}
+
+// Loads the file "<base file name>.<suffix>" of the given attribute into a buffer.
+LoadedBufferUP
+LoadUtils::loadFile(const AttributeVector& attr, const vespalib::string& suffix)
+{
+    const vespalib::string file_name = attr.getBaseFileName() + "." + suffix;
+    return FileUtil::loadFile(file_name);
+}
+
LoadedBufferUP
LoadUtils::loadDAT(const AttributeVector& attr)
{
diff --git a/searchlib/src/vespa/searchlib/attribute/load_utils.h b/searchlib/src/vespa/searchlib/attribute/load_utils.h
index 41c24e5a099..cd9d98084d5 100644
--- a/searchlib/src/vespa/searchlib/attribute/load_utils.h
+++ b/searchlib/src/vespa/searchlib/attribute/load_utils.h
@@ -18,13 +18,15 @@ public:
private:
static FileInterfaceUP openFile(const AttributeVector& attr, const vespalib::string& suffix);
- static LoadedBufferUP loadFile(const AttributeVector& attr, const vespalib::string& suffix);
public:
static FileInterfaceUP openDAT(const AttributeVector& attr);
static FileInterfaceUP openIDX(const AttributeVector& attr);
static FileInterfaceUP openWeight(const AttributeVector& attr);
+ static bool file_exists(const AttributeVector& attr, const vespalib::string& suffix);
+ static LoadedBufferUP loadFile(const AttributeVector& attr, const vespalib::string& suffix);
+
static LoadedBufferUP loadDAT(const AttributeVector& attr);
static LoadedBufferUP loadIDX(const AttributeVector& attr);
static LoadedBufferUP loadWeight(const AttributeVector& attr);
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
index 627f7f0dfa9..68ce0c1bb00 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp
@@ -3,16 +3,19 @@
#include "dense_tensor_attribute.h"
#include "dense_tensor_attribute_saver.h"
#include "nearest_neighbor_index.h"
+#include "nearest_neighbor_index_saver.h"
#include "tensor_attribute.hpp"
#include <vespa/eval/tensor/dense/mutable_dense_tensor_view.h>
#include <vespa/eval/tensor/tensor.h>
#include <vespa/fastlib/io/bufferedfile.h>
+#include <vespa/searchlib/attribute/load_utils.h>
#include <vespa/searchlib/attribute/readerbase.h>
#include <vespa/vespalib/data/slime/inserter.h>
#include <vespa/log/log.h>
LOG_SETUP(".searchlib.tensor.dense_tensor_attribute");
+using search::attribute::LoadUtils;
using vespalib::eval::ValueType;
using vespalib::slime::ObjectInserter;
using vespalib::tensor::MutableDenseTensorView;
@@ -148,6 +151,8 @@ DenseTensorAttribute::onLoad()
if (!tensorReader.hasData()) {
return false;
}
+ bool has_index_file = LoadUtils::file_exists(*this, DenseTensorAttributeSaver::index_file_suffix());
+
setCreateSerialNum(tensorReader.getCreateSerialNum());
assert(tensorReader.getVersion() == DENSE_TENSOR_ATTRIBUTE_VERSION);
assert(getConfig().tensorType().to_spec() ==
@@ -160,7 +165,7 @@ DenseTensorAttribute::onLoad()
auto raw = _denseTensorStore.allocRawBuffer();
tensorReader.readTensor(raw.data, _denseTensorStore.getBufSize());
_refVector.push_back(raw.ref);
- if (_index) {
+ if (_index && !has_index_file) {
// This ensures that get_vector() (via getTensor()) is able to find the newly added tensor.
setCommittedDocIdLimit(lid + 1);
_index->add_document(lid);
@@ -171,6 +176,12 @@ DenseTensorAttribute::onLoad()
}
setNumDocs(numDocs);
setCommittedDocIdLimit(numDocs);
+ if (_index && has_index_file) {
+ auto buffer = LoadUtils::loadFile(*this, DenseTensorAttributeSaver::index_file_suffix());
+ if (!_index->load(*buffer)) {
+ return false;
+ }
+ }
return true;
}
@@ -180,11 +191,13 @@ DenseTensorAttribute::onInitSave(vespalib::stringref fileName)
{
vespalib::GenerationHandler::Guard guard(getGenerationHandler().
takeGuard());
+ auto index_saver = (_index ? _index->make_saver() : std::unique_ptr<NearestNeighborIndexSaver>());
return std::make_unique<DenseTensorAttributeSaver>
(std::move(guard),
this->createAttributeHeader(fileName),
getRefCopy(),
- _denseTensorStore);
+ _denseTensorStore,
+ std::move(index_saver));
}
void
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp
index d78adab81b5..fd8d6162f01 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.cpp
@@ -1,20 +1,19 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "dense_tensor_attribute_saver.h"
-#include <vespa/vespalib/util/bufferwriter.h>
#include "dense_tensor_store.h"
+#include "nearest_neighbor_index_saver.h"
+#include <vespa/vespalib/util/bufferwriter.h>
#include <vespa/searchlib/attribute/iattributesavetarget.h>
using vespalib::GenerationHandler;
-namespace search {
-
-namespace tensor {
+namespace search::tensor {
namespace {
-static const uint8_t tensorIsNotPresent = 0;
-static const uint8_t tensorIsPresent = 1;
+constexpr uint8_t tensorIsNotPresent = 0;
+constexpr uint8_t tensorIsPresent = 1;
}
@@ -22,42 +21,60 @@ DenseTensorAttributeSaver::
DenseTensorAttributeSaver(GenerationHandler::Guard &&guard,
const attribute::AttributeHeader &header,
RefCopyVector &&refs,
- const DenseTensorStore &tensorStore)
+ const DenseTensorStore &tensorStore,
+ IndexSaverUP index_saver)
: AttributeSaver(std::move(guard), header),
_refs(std::move(refs)),
- _tensorStore(tensorStore)
+ _tensorStore(tensorStore),
+ _index_saver(std::move(index_saver))
{
}
+DenseTensorAttributeSaver::~DenseTensorAttributeSaver() = default;
-DenseTensorAttributeSaver::~DenseTensorAttributeSaver()
+// Returns the suffix ("nnidx") of the file that holds the saved nearest neighbor index.
+// Used both when setting up the writer on save and when locating the file on load.
+vespalib::string
+DenseTensorAttributeSaver::index_file_suffix()
{
+ return "nnidx";
}
-
+// Saves the tensor store via the dat writer and, when an index saver is present,
+// the nearest neighbor index via a separate writer for the index file.
+// Returns false if the writer for the index file cannot be set up.
bool
DenseTensorAttributeSaver::onSave(IAttributeSaveTarget &saveTarget)
{
- std::unique_ptr<BufferWriter>
- datWriter(saveTarget.datWriter().allocBufferWriter());
+ // Set up the index file writer first, so a failure is detected before any data is written.
+ if (_index_saver) {
+ if (!saveTarget.setup_writer(index_file_suffix(), "Binary data file for nearest neighbor index")) {
+ return false;
+ }
+ }
+
+ auto dat_writer = saveTarget.datWriter().allocBufferWriter();
+ save_tensor_store(*dat_writer);
+
+ if (_index_saver) {
+ auto index_writer = saveTarget.get_writer(index_file_suffix()).allocBufferWriter();
+ // Note: The implementation of save() is responsible for calling BufferWriter::flush().
+ _index_saver->save(*index_writer);
+ }
+ return true;
+}
+
+// Writes one record per lid in [0, _refs.size()) to the given writer:
+// a presence byte, followed by the raw cell data when the tensor is present.
+// The writer is flushed at the end.
+void
+DenseTensorAttributeSaver::save_tensor_store(BufferWriter& writer) const
+{
const uint32_t docIdLimit(_refs.size());
const uint32_t cellSize = _tensorStore.getCellSize();
for (uint32_t lid = 0; lid < docIdLimit; ++lid) {
if (_refs[lid].valid()) {
auto raw = _tensorStore.getRawBuffer(_refs[lid]);
- datWriter->write(&tensorIsPresent, sizeof(tensorIsPresent));
+ writer.write(&tensorIsPresent, sizeof(tensorIsPresent));
size_t numCells = _tensorStore.getNumCells();
size_t rawLen = numCells * cellSize;
- datWriter->write(static_cast<const char *>(raw), rawLen);
+ writer.write(static_cast<const char *>(raw), rawLen);
} else {
- datWriter->write(&tensorIsNotPresent, sizeof(tensorIsNotPresent));
+ writer.write(&tensorIsNotPresent, sizeof(tensorIsNotPresent));
}
}
- datWriter->flush();
- return true;
+ writer.flush();
}
-
-} // namespace search::tensor
-
-} // namespace search
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h
index 1f6596e82f5..895e2951cea 100644
--- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h
+++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute_saver.h
@@ -5,28 +5,41 @@
#include "tensor_attribute.h"
#include <vespa/searchlib/attribute/attributesaver.h>
+namespace search { class BufferWriter; }
+
namespace search::tensor {
class DenseTensorStore;
+class NearestNeighborIndexSaver;
-/*
- * Class for saving a tensor attribute.
+/**
+ * Class for saving a dense tensor attribute.
+ * Also saves the nearest neighbor index when one exists.
*/
-class DenseTensorAttributeSaver : public AttributeSaver
-{
+class DenseTensorAttributeSaver : public AttributeSaver {
public:
using RefCopyVector = TensorAttribute::RefCopyVector;
private:
+ using GenerationHandler = vespalib::GenerationHandler;
+ using IndexSaverUP = std::unique_ptr<NearestNeighborIndexSaver>;
+
RefCopyVector _refs;
const DenseTensorStore &_tensorStore;
- using GenerationHandler = vespalib::GenerationHandler;
+ IndexSaverUP _index_saver;
bool onSave(IAttributeSaveTarget &saveTarget) override;
+ void save_tensor_store(BufferWriter& writer) const;
+
public:
- DenseTensorAttributeSaver(GenerationHandler::Guard &&guard, const attribute::AttributeHeader &header,
- RefCopyVector &&refs, const DenseTensorStore &tensorStore);
+ DenseTensorAttributeSaver(GenerationHandler::Guard &&guard,
+ const attribute::AttributeHeader &header,
+ RefCopyVector &&refs,
+ const DenseTensorStore &tensorStore,
+ IndexSaverUP index_saver);
~DenseTensorAttributeSaver() override;
+
+ static vespalib::string index_file_suffix();
};
}
diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h
index cee48d63359..99d8960ae10 100644
--- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h
+++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_saver.h
@@ -21,6 +21,12 @@ namespace search::tensor {
class NearestNeighborIndexSaver {
public:
virtual ~NearestNeighborIndexSaver() {}
+
+ /**
+ * Saves the index in binary form using the given writer.
+ *
+ * It is the responsibility of the implementer to call BufferWriter::flush() at the end.
+ */
virtual void save(BufferWriter& writer) const = 0;
};