diff options
author | Tor Egge <Tor.Egge@online.no> | 2022-11-23 11:22:10 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2022-11-23 11:22:10 +0100 |
commit | ea7389254797d0b45940439ad3d7c7f3370b3af1 (patch) | |
tree | de4ac043a312a679a81ffa2d451bc1599c0678db /searchlib | |
parent | e03f1e82952dbdb801e737de41b285c0fa74c3f9 (diff) |
Set up HNSW index for mixed tensor types.
Diffstat (limited to 'searchlib')
13 files changed, 142 insertions, 43 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index 8a3be423457..6fa1bdcf072 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -19,6 +19,7 @@ #include <vespa/searchlib/util/fileutil.h> #include <vespa/searchcommon/attribute/config.h> #include <vespa/vespalib/data/fileheader.h> +#include <vespa/vespalib/stllike/asciistream.h> #include <vespa/vespalib/test/insertion_operators.h> #include <vespa/vespalib/testkit/test_kit.h> #include <vespa/vespalib/util/mmap_file_allocator_factory.h> @@ -72,6 +73,7 @@ using generation_t = vespalib::GenerationHandler::generation_t; vespalib::string sparseSpec("tensor(x{},y{})"); vespalib::string denseSpec("tensor(x[2],y[3])"); vespalib::string vec_2d_spec("tensor(x[2])"); +vespalib::string vec_mixed_2d_spec("tensor(a{},x[2])"); Value::UP createTensor(const TensorSpec &spec) { return SimpleValue::from_spec(spec); @@ -83,6 +85,31 @@ vec_2d(double x0, double x1) return TensorSpec(vec_2d_spec).add({{"x", 0}}, x0).add({{"x", 1}}, x1); } +TensorSpec +vec_mixed_2d(std::vector<std::vector<double>> val) +{ + TensorSpec spec(vec_mixed_2d_spec); + for (uint32_t a = 0; a < val.size(); ++a) { + vespalib::asciistream a_stream; + a_stream << a; + vespalib::string a_as_string = a_stream.str(); + for (uint32_t x = 0; x < val[a].size(); ++x) { + spec.add({{"a", a_as_string.c_str()},{"x", x}}, val[a][x]); + } + } + return spec; +} + +TensorSpec +typed_vec_2d(HnswIndexType type, double x0, double x1) +{ + if (type == HnswIndexType::SINGLE) { + return vec_2d(x0, x1); + } else { + return vec_mixed_2d({{x0, x1}}); + } +} + class MockIndexSaver : public NearestNeighborIndexSaver { private: int _index_value; @@ -274,7 +301,6 @@ public: return std::vector<Neighbor>(); } - const search::tensor::DistanceFunction *distance_function() const 
override { static search::tensor::SquaredEuclideanDistance my_dist_fun(vespalib::eval::CellType::DOUBLE); return &my_dist_fun; @@ -285,10 +311,12 @@ class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory { std::unique_ptr<NearestNeighborIndex> make(const DocVectorAccess& vectors, size_t vector_size, + bool multi_vector_index, CellType cell_type, const search::attribute::HnswIndexParams& params) const override { (void) vector_size; (void) params; + (void) multi_vector_index; assert(cell_type == CellType::DOUBLE); return std::make_unique<MockNearestNeighborIndex>(vectors); } @@ -322,6 +350,13 @@ struct FixtureTraits { return *this; } + FixtureTraits mixed_hnsw() && { + use_dense_tensor_attribute = false; + enable_hnsw_index = true; + use_mock_index = false; + return *this; + } + FixtureTraits mock_hnsw() && { use_dense_tensor_attribute = true; enable_hnsw_index = true; @@ -406,8 +441,8 @@ struct Fixture { template <typename IndexType> IndexType& get_nearest_neighbor_index() { - assert(as_dense_tensor().nearest_neighbor_index() != nullptr); - auto index = dynamic_cast<const IndexType*>(as_dense_tensor().nearest_neighbor_index()); + assert(_tensorAttr->nearest_neighbor_index() != nullptr); + auto index = dynamic_cast<const IndexType*>(_tensorAttr->nearest_neighbor_index()); assert(index != nullptr); return *const_cast<IndexType*>(index); } @@ -416,6 +451,11 @@ struct Fixture { return get_nearest_neighbor_index<HnswIndex<HnswIndexType::SINGLE>>(); } + template <HnswIndexType type> + HnswIndex<type>& hnsw_typed_index() { + return get_nearest_neighbor_index<HnswIndex<type>>(); + } + MockNearestNeighborIndex& mock_index() { return get_nearest_neighbor_index<MockNearestNeighborIndex>(); } @@ -836,15 +876,24 @@ TEST_F("Hnsw index is NOT instantiated in dense tensor attribute by default", EXPECT_TRUE(tensor.nearest_neighbor_index() == nullptr); } -class DenseTensorAttributeHnswIndex : public Fixture { + +template <HnswIndexType type> +class 
TensorAttributeHnswIndex : public Fixture +{ public: - DenseTensorAttributeHnswIndex() : Fixture(vec_2d_spec, FixtureTraits().hnsw()) {} + TensorAttributeHnswIndex(const vespalib::string &type_spec, FixtureTraits traits) + : Fixture(type_spec, traits) + { + } + void test_setup(); + void test_save_load(bool multi_node); }; -TEST_F("Hnsw index is instantiated in dense tensor attribute when specified in config", DenseTensorAttributeHnswIndex) +template <HnswIndexType type> +void +TensorAttributeHnswIndex<type>::test_setup() { - auto& index = f.hnsw_index(); - + auto& index = hnsw_typed_index<type>(); const auto& cfg = index.config(); EXPECT_EQUAL(8u, cfg.max_links_at_level_0()); EXPECT_EQUAL(4u, cfg.max_links_on_inserts()); @@ -853,32 +902,74 @@ TEST_F("Hnsw index is instantiated in dense tensor attribute when specified in c } void -expect_level_0(uint32_t exp_docid, const HnswTestNode& node) +expect_level_0(uint32_t exp_nodeid, const HnswTestNode& node) { ASSERT_GREATER_EQUAL(node.size(), 1u); ASSERT_EQUAL(1u, node.level(0).size()); - EXPECT_EQUAL(exp_docid, node.level(0)[0]); + EXPECT_EQUAL(exp_nodeid, node.level(0)[0]); } -TEST_F("Hnsw index is integrated in dense tensor attribute and can be saved and loaded", DenseTensorAttributeHnswIndex) +template <HnswIndexType type> +void +TensorAttributeHnswIndex<type>::test_save_load(bool multi_node) { // Set two points that will be linked together in level 0 of the hnsw graph. 
- f.set_tensor(1, vec_2d(3, 5)); - f.set_tensor(2, vec_2d(7, 9)); + if (multi_node) { + set_tensor(1, vec_mixed_2d({{3, 5}, {7, 9}})); + } else { + set_tensor(1, typed_vec_2d(type, 3, 5)); + set_tensor(2, typed_vec_2d(type, 7, 9)); + } - auto &index_a = f.hnsw_index(); + auto old_attr = _attr; + auto &index_a = hnsw_typed_index<type>(); expect_level_0(2, index_a.get_node(1)); expect_level_0(1, index_a.get_node(2)); - f.save(); + save(); EXPECT_TRUE(std::filesystem::exists(std::filesystem::path(attr_name + ".nnidx"))); - f.load(); - auto &index_b = f.hnsw_index(); + load(); + auto &index_b = hnsw_typed_index<type>(); EXPECT_NOT_EQUAL(&index_a, &index_b); expect_level_0(2, index_b.get_node(1)); expect_level_0(1, index_b.get_node(2)); } +class DenseTensorAttributeHnswIndex : public TensorAttributeHnswIndex<HnswIndexType::SINGLE> { +public: + DenseTensorAttributeHnswIndex() : TensorAttributeHnswIndex<HnswIndexType::SINGLE>(vec_2d_spec, FixtureTraits().hnsw()) {} +}; + +class MixedTensorAttributeHnswIndex : public TensorAttributeHnswIndex<HnswIndexType::MULTI> { +public: + MixedTensorAttributeHnswIndex() : TensorAttributeHnswIndex<HnswIndexType::MULTI>(vec_mixed_2d_spec, FixtureTraits().mixed_hnsw()) {} +}; + +TEST_F("Hnsw index is instantiated in dense tensor attribute when specified in config", DenseTensorAttributeHnswIndex) +{ + f.test_setup(); +} + +TEST_F("Hnsw index is integrated in dense tensor attribute and can be saved and loaded", DenseTensorAttributeHnswIndex) +{ + f.test_save_load(false); +} + +TEST_F("Hnsw index is instantiated in mixed tensor attribute when specified in config", MixedTensorAttributeHnswIndex) +{ + f.test_setup(); +} + +TEST_F("Hnsw index is integrated in mixed tensor attribute and can be saved and loaded", MixedTensorAttributeHnswIndex) +{ + f.test_save_load(false); +} + +TEST_F("Hnsw index is integrated in mixed tensor attribute and can be saved and loaded with multiple points per document", MixedTensorAttributeHnswIndex) +{ + 
f.test_save_load(true); +} + TEST_F("Populates address space usage", DenseTensorAttributeHnswIndex) { search::AddressSpaceUsage usage = f._attr->getAddressSpaceUsage(); diff --git a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp index 054331880f4..4f6f8ac5c87 100644 --- a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp @@ -28,6 +28,7 @@ make_random_level_generator(uint32_t m) std::unique_ptr<NearestNeighborIndex> DefaultNearestNeighborIndexFactory::make(const DocVectorAccess& vectors, size_t vector_size, + bool multi_vector_index, vespalib::eval::CellType cell_type, const search::attribute::HnswIndexParams& params) const { @@ -38,10 +39,17 @@ DefaultNearestNeighborIndexFactory::make(const DocVectorAccess& vectors, params.neighbors_to_explore_at_insert(), 10000, true); - return std::make_unique<HnswIndex<HnswIndexType::SINGLE>>(vectors, - make_distance_function(params.distance_metric(), cell_type), - make_random_level_generator(m), - cfg); + if (multi_vector_index) { + return std::make_unique<HnswIndex<HnswIndexType::MULTI>>(vectors, + make_distance_function(params.distance_metric(), cell_type), + make_random_level_generator(m), + cfg); + } else { + return std::make_unique<HnswIndex<HnswIndexType::SINGLE>>(vectors, + make_distance_function(params.distance_metric(), cell_type), + make_random_level_generator(m), + cfg); + } } } diff --git a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h index 7933a87aa53..f1af7a8ac99 100644 --- a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h +++ b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h @@ -13,6 +13,7 @@ class 
DefaultNearestNeighborIndexFactory : public NearestNeighborIndexFactory { public: std::unique_ptr<NearestNeighborIndex> make(const DocVectorAccess& vectors, size_t vector_size, + bool multi_vector_index, vespalib::eval::CellType cell_type, const search::attribute::HnswIndexParams& params) const override; }; diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp index bd7fe2d3276..ca09c0e58d9 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp @@ -15,16 +15,9 @@ namespace search::tensor { DenseTensorAttribute::DenseTensorAttribute(vespalib::stringref baseFileName, const Config& cfg, const NearestNeighborIndexFactory& index_factory) - : TensorAttribute(baseFileName, cfg, _denseTensorStore), + : TensorAttribute(baseFileName, cfg, _denseTensorStore, index_factory), _denseTensorStore(cfg.tensorType(), get_memory_allocator()) { - if (cfg.hnsw_index_params().has_value()) { - auto tensor_type = cfg.tensorType(); - assert(tensor_type.dimensions().size() == 1); - assert(tensor_type.is_dense()); - size_t vector_size = tensor_type.dimensions()[0].size; - _index = index_factory.make(*this, vector_size, tensor_type.cell_type(), cfg.hnsw_index_params().value()); - } } diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h index 45bd0d98274..89f0fd1bd06 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h @@ -9,8 +9,6 @@ namespace search::tensor { -class NearestNeighborIndex; - /** * Attribute vector class used to store dense tensors for all * documents in memory. 
diff --git a/searchlib/src/vespa/searchlib/tensor/direct_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/direct_tensor_attribute.cpp index c2f0ff36c3a..4770b5d41b7 100644 --- a/searchlib/src/vespa/searchlib/tensor/direct_tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/direct_tensor_attribute.cpp @@ -9,8 +9,8 @@ using vespalib::eval::FastValueBuilderFactory; namespace search::tensor { -DirectTensorAttribute::DirectTensorAttribute(stringref name, const Config &cfg) - : TensorAttribute(name, cfg, _direct_store), +DirectTensorAttribute::DirectTensorAttribute(stringref name, const Config &cfg, const NearestNeighborIndexFactory& index_factory) + : TensorAttribute(name, cfg, _direct_store, index_factory), _direct_store(cfg.tensorType()) { } diff --git a/searchlib/src/vespa/searchlib/tensor/direct_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/direct_tensor_attribute.h index ed48ea20e0f..d331cdca440 100644 --- a/searchlib/src/vespa/searchlib/tensor/direct_tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/direct_tensor_attribute.h @@ -3,6 +3,7 @@ #pragma once #include "tensor_attribute.h" +#include "default_nearest_neighbor_index_factory.h" #include "direct_tensor_store.h" namespace vespalib::eval { struct Value; } @@ -14,7 +15,7 @@ class DirectTensorAttribute final : public TensorAttribute DirectTensorStore _direct_store; public: - DirectTensorAttribute(vespalib::stringref baseFileName, const Config &cfg); + DirectTensorAttribute(vespalib::stringref baseFileName, const Config &cfg, const NearestNeighborIndexFactory& index_factory = DefaultNearestNeighborIndexFactory()); ~DirectTensorAttribute() override; void setTensor(DocId docId, const vespalib::eval::Value &tensor) override; void update_tensor(DocId docId, diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index e9e52301f8e..b293b001bcf 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp 
+++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -380,7 +380,6 @@ template <HnswIndexType type> typename HnswIndex<type>::PreparedAddDoc HnswIndex<type>::internal_prepare_add(uint32_t docid, VectorBundle input_vectors, vespalib::GenerationHandler::Guard read_guard) const { - assert(input_vectors.subspaces() == 1); PreparedAddDoc op(docid, std::move(read_guard)); auto entry = _graph.get_entry_node(); auto subspaces = input_vectors.subspaces(); @@ -459,7 +458,6 @@ template <HnswIndexType type> void HnswIndex<type>::internal_complete_add(uint32_t docid, PreparedAddDoc &op) { - assert(op.nodes.size() == 1); auto nodeids = _id_mapping.allocate_ids(docid, op.nodes.size()); assert(nodeids.size() == op.nodes.size()); uint32_t subspace = 0; @@ -575,7 +573,6 @@ void HnswIndex<type>::remove_document(uint32_t docid) { auto nodeids = _id_mapping.get_ids(docid); - assert(nodeids.size() == 1u); for (auto nodeid : nodeids) { remove_node(nodeid); } diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_factory.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_factory.h index 4b53fdca311..8083ebcf2b3 100644 --- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_factory.h +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_factory.h @@ -20,6 +20,7 @@ public: virtual ~NearestNeighborIndexFactory() {} virtual std::unique_ptr<NearestNeighborIndex> make(const DocVectorAccess& vectors, size_t vector_size, + bool multi_vector_index, vespalib::eval::CellType cell_type, const search::attribute::HnswIndexParams& params) const = 0; }; diff --git a/searchlib/src/vespa/searchlib/tensor/serialized_fast_value_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/serialized_fast_value_attribute.cpp index 4fd8da5ac9d..c4dd0ef30f0 100644 --- a/searchlib/src/vespa/searchlib/tensor/serialized_fast_value_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/serialized_fast_value_attribute.cpp @@ -13,9 +13,9 @@ using namespace 
vespalib::eval; namespace search::tensor { -SerializedFastValueAttribute::SerializedFastValueAttribute(stringref name, const Config &cfg) - : TensorAttribute(name, cfg, _tensorBufferStore), - _tensorBufferStore(cfg.tensorType(), get_memory_allocator(), 1000u) +SerializedFastValueAttribute::SerializedFastValueAttribute(stringref name, const Config &cfg, const NearestNeighborIndexFactory& index_factory) + : TensorAttribute(name, cfg, _tensorBufferStore, index_factory), + _tensorBufferStore(cfg.tensorType(), get_memory_allocator(), 1000u) { } diff --git a/searchlib/src/vespa/searchlib/tensor/serialized_fast_value_attribute.h b/searchlib/src/vespa/searchlib/tensor/serialized_fast_value_attribute.h index 31a7f136d23..4cfcc3d19a2 100644 --- a/searchlib/src/vespa/searchlib/tensor/serialized_fast_value_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/serialized_fast_value_attribute.h @@ -3,6 +3,7 @@ #pragma once #include "tensor_attribute.h" +#include "default_nearest_neighbor_index_factory.h" #include "tensor_buffer_store.h" namespace search::tensor { @@ -19,7 +20,7 @@ namespace search::tensor { class SerializedFastValueAttribute : public TensorAttribute { TensorBufferStore _tensorBufferStore; // data store for serialized tensors public: - SerializedFastValueAttribute(vespalib::stringref baseFileName, const Config &cfg); + SerializedFastValueAttribute(vespalib::stringref baseFileName, const Config &cfg, const NearestNeighborIndexFactory& index_factory = DefaultNearestNeighborIndexFactory()); ~SerializedFastValueAttribute() override; // Implements DocVectorAccess diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp index acbf1c0d6b2..5c50b2d83a2 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp @@ -2,6 +2,7 @@ #include "tensor_attribute.h" #include "nearest_neighbor_index.h" +#include 
"nearest_neighbor_index_factory.h" #include "nearest_neighbor_index_saver.h" #include "tensor_attribute_constants.h" #include "tensor_attribute_loader.h" @@ -51,7 +52,7 @@ vespalib::string makeWrongTensorTypeMsg(const ValueType &fieldTensorType, const } -TensorAttribute::TensorAttribute(vespalib::stringref name, const Config &cfg, TensorStore &tensorStore) +TensorAttribute::TensorAttribute(vespalib::stringref name, const Config &cfg, TensorStore &tensorStore, const NearestNeighborIndexFactory& index_factory) : NotImplementedAttribute(name, cfg), _refVector(cfg.getGrowStrategy(), getGenerationHolder()), _tensorStore(tensorStore), @@ -62,6 +63,11 @@ TensorAttribute::TensorAttribute(vespalib::stringref name, const Config &cfg, Te _subspace_type(cfg.tensorType()), _comp(cfg.tensorType()) { + if (cfg.hnsw_index_params().has_value()) { + auto tensor_type = cfg.tensorType(); + size_t vector_size = tensor_type.dense_subspace_size(); + _index = index_factory.make(*this, vector_size, !_is_dense, tensor_type.cell_type(), cfg.hnsw_index_params().value()); + } } TensorAttribute::~TensorAttribute() = default; diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h index f139a608706..15a2db2b861 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.h @@ -16,6 +16,8 @@ namespace vespalib::eval { struct Value; struct ValueBuilderFactory; } namespace search::tensor { +class NearestNeighborIndexFactory; + /** * Attribute vector class used to store tensors for all documents in memory. 
*/ @@ -48,7 +50,7 @@ protected: public: using RefCopyVector = vespalib::Array<EntryRef>; - TensorAttribute(vespalib::stringref name, const Config &cfg, TensorStore &tensorStore); + TensorAttribute(vespalib::stringref name, const Config &cfg, TensorStore &tensorStore, const NearestNeighborIndexFactory& index_factory); ~TensorAttribute() override; const ITensorAttribute *asTensorAttribute() const override; |