From a04b22c8b07d9d03d374c1732126f4502a981319 Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Wed, 19 Feb 2020 15:08:51 +0000 Subject: Instantiate nearest neighbor (hnsw) index in dense tensor attribute when specified in config. --- .../tensorattribute/tensorattribute_test.cpp | 72 ++++++++++++++++------ .../tests/tensor/hnsw_index/hnsw_index_test.cpp | 10 +-- .../src/vespa/searchlib/tensor/CMakeLists.txt | 1 + .../default_nearest_neighbor_index_factory.cpp | 51 +++++++++++++++ .../default_nearest_neighbor_index_factory.h | 19 ++++++ .../searchlib/tensor/dense_tensor_attribute.cpp | 19 +++++- .../searchlib/tensor/dense_tensor_attribute.h | 29 ++++++--- .../src/vespa/searchlib/tensor/distance_function.h | 3 + .../vespa/searchlib/tensor/distance_functions.h | 1 + .../src/vespa/searchlib/tensor/hnsw_index.cpp | 12 ++-- searchlib/src/vespa/searchlib/tensor/hnsw_index.h | 15 ++--- .../tensor/nearest_neighbor_index_factory.h | 26 ++++++++ .../searchlib/tensor/random_level_generator.h | 3 + 13 files changed, 212 insertions(+), 49 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp create mode 100644 searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h create mode 100644 searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_factory.h (limited to 'searchlib') diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index 7e0fcdc0ccc..69d9e3c8393 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -1,41 +1,43 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include #include -#include -#include -#include -#include -#include -#include #include -#include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include + #include LOG_SETUP("tensorattribute_test"); using document::WrongTensorTypeException; -using search::tensor::TensorAttribute; -using search::tensor::DenseTensorAttribute; -using search::tensor::GenericTensorAttribute; using search::AttributeGuard; using search::AttributeVector; -using vespalib::eval::ValueType; +using search::attribute::HnswIndexParams; +using search::tensor::DenseTensorAttribute; +using search::tensor::GenericTensorAttribute; +using search::tensor::HnswIndex; +using search::tensor::TensorAttribute; using vespalib::eval::TensorSpec; -using vespalib::tensor::Tensor; -using vespalib::tensor::DenseTensor; +using vespalib::eval::ValueType; using vespalib::tensor::DefaultTensorEngine; +using vespalib::tensor::DenseTensor; +using vespalib::tensor::Tensor; -namespace vespalib { -namespace tensor { +namespace vespalib::tensor { static bool operator==(const Tensor &lhs, const Tensor &rhs) { return lhs.equals(rhs); } -} } vespalib::string sparseSpec("tensor(x{},y{})"); @@ -67,7 +69,8 @@ struct Fixture bool _useDenseTensorAttribute; Fixture(const vespalib::string &typeSpec, - bool useDenseTensorAttribute = false) + bool useDenseTensorAttribute = false, + bool enable_hnsw_index = false) : _cfg(BasicType::TENSOR, CollectionType::SINGLE), _name("test"), _typeSpec(typeSpec), @@ -80,6 +83,9 @@ struct Fixture if (_cfg.tensorType().is_dense()) { _denseTensors = true; } + if (enable_hnsw_index) { + _cfg.set_hnsw_index_params(HnswIndexParams(4, 20)); + } _tensorAttr = makeAttr(); _attr = _tensorAttr; _attr->addReservedDoc(); @@ -94,6 +100,12 @@ struct Fixture } } + const DenseTensorAttribute& as_dense_tensor() const { + auto result = dynamic_cast(_tensorAttr.get()); + assert(result != nullptr); + return *result; + } + void ensureSpace(uint32_t docId) { while (_attr->getNumDocs() <= docId) { uint32_t newDocId = 0u; @@ -357,4 +369,26 @@ TEST("Test dense tensors with dense tensor attribute") testAll([]() { return std::make_shared(denseSpec, true); }); } +TEST_F("Hnsw index is NOT instantiated in dense tensor attribute by default", + Fixture("tensor(x[2])", true, false)) +{ + const auto& tensor = f.as_dense_tensor(); + EXPECT_TRUE(tensor.nearest_neighbor_index() == nullptr); +} + +TEST_F("Hnsw index is instantiated in dense tensor attribute when specified in config", + Fixture("tensor(x[2])", true, true)) +{ + const auto& tensor = f.as_dense_tensor(); + ASSERT_TRUE(tensor.nearest_neighbor_index() != nullptr); + auto hnsw_index = dynamic_cast(tensor.nearest_neighbor_index()); + ASSERT_TRUE(hnsw_index != nullptr); + + const auto& cfg = hnsw_index->config(); + EXPECT_EQUAL(8u, cfg.max_links_at_level_0()); + EXPECT_EQUAL(4u, cfg.max_links_at_hierarchic_levels()); + EXPECT_EQUAL(20u, cfg.neighbors_to_explore_at_construction()); + EXPECT_TRUE(cfg.heuristic_select_neighbors()); +} + TEST_MAIN() { TEST_RUN_ALL(); vespalib::unlink("test.dat"); } diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index c6246bb8434..cd0d4bcaad0 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -48,8 +48,7 @@ using HnswIndexUP = std::unique_ptr; class HnswIndexTest : public ::testing::Test { public: FloatVectors vectors; - FloatSqEuclideanDistance distance_func; - LevelGenerator level_generator; + LevelGenerator* level_generator; HnswIndexUP index; HnswIndexTest() @@ -62,11 +61,14 @@ public: .set(7, {3, 5}).set(8, {0, 3}).set(9, {4, 5}); } void init(bool heuristic_select_neighbors) { - index = std::make_unique(vectors, distance_func, level_generator, + auto generator = std::make_unique(); + level_generator = generator.get(); + index = std::make_unique(vectors, std::make_unique(), + std::move(generator), HnswIndex::Config(2, 1, 10, heuristic_select_neighbors)); } void add_document(uint32_t docid, uint32_t max_level = 0) { - level_generator.level = max_level; + level_generator->level = max_level; index->add_document(docid); } void remove_document(uint32_t docid) { diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 9175168248c..09069861ab4 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -1,6 +1,7 @@ # Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchlib_tensor OBJECT SOURCES + default_nearest_neighbor_index_factory.cpp dense_tensor_attribute.cpp dense_tensor_attribute_saver.cpp dense_tensor_store.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp new file mode 100644 index 00000000000..68efe6417c0 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.cpp @@ -0,0 +1,51 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "default_nearest_neighbor_index_factory.h" +#include "distance_functions.h" +#include "hnsw_index.h" +#include "random_level_generator.h" +#include + +namespace search::tensor { + +using vespalib::eval::ValueType; + +namespace { + +class LevelZeroGenerator : public RandomLevelGenerator { + uint32_t max_level() override { return 0; } +}; + +DistanceFunction::UP +make_distance_function(ValueType::CellType cell_type) +{ + if (cell_type == ValueType::CellType::FLOAT) { + return std::make_unique>(); + } else { + return std::make_unique>(); + } +} + +RandomLevelGenerator::UP +make_random_level_generator() +{ + // TODO: Make generator that results in hierarchical graph. + return std::make_unique(); +} + +} + +std::unique_ptr +DefaultNearestNeighborIndexFactory::make(const DocVectorAccess& vectors, + vespalib::eval::ValueType::CellType cell_type, + const search::attribute::HnswIndexParams& params) const +{ + HnswIndex::Config cfg(params.max_links_per_node() * 2, + params.max_links_per_node(), + params.neighbors_to_explore_at_insert(), + true); + return std::make_unique(vectors, make_distance_function(cell_type), make_random_level_generator(), cfg); +} + +} + diff --git a/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h new file mode 100644 index 00000000000..ea784efdb51 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/default_nearest_neighbor_index_factory.h @@ -0,0 +1,19 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "nearest_neighbor_index_factory.h" + +namespace search::tensor { + +/** + * Factory that instantiates the production hnsw index. + */ +class DefaultNearestNeighborIndexFactory : public NearestNeighborIndexFactory { +public: + std::unique_ptr make(const DocVectorAccess& vectors, + vespalib::eval::ValueType::CellType cell_type, + const search::attribute::HnswIndexParams& params) const override; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp index a2b9f136ed9..d4786cc2e1a 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.cpp @@ -2,6 +2,7 @@ #include "dense_tensor_attribute.h" #include "dense_tensor_attribute_saver.h" +#include "nearest_neighbor_index.h" #include "tensor_attribute.hpp" #include #include @@ -55,11 +56,15 @@ TensorReader::is_present() { } -DenseTensorAttribute::DenseTensorAttribute(vespalib::stringref baseFileName, - const Config &cfg) +DenseTensorAttribute::DenseTensorAttribute(vespalib::stringref baseFileName, const Config& cfg, + const NearestNeighborIndexFactory& index_factory) : TensorAttribute(baseFileName, cfg, _denseTensorStore), - _denseTensorStore(cfg.tensorType()) + _denseTensorStore(cfg.tensorType()), + _index() { + if (cfg.hnsw_index_params().has_value()) { + _index = index_factory.make(*this, cfg.tensorType().cell_type(), cfg.hnsw_index_params().value()); + } } @@ -154,4 +159,12 @@ DenseTensorAttribute::getVersion() const return DENSE_TENSOR_ATTRIBUTE_VERSION; } +vespalib::tensor::TypedCells +DenseTensorAttribute::get_vector(uint32_t docid) const +{ + MutableDenseTensorView tensor_view(_denseTensorStore.type()); + getTensor(docid, tensor_view); + return tensor_view.cellsRef(); +} + } diff --git a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h index 593741cef39..74c93aca212 100644 --- a/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h +++ b/searchlib/src/vespa/searchlib/tensor/dense_tensor_attribute.h @@ -2,25 +2,32 @@ #pragma once -#include "tensor_attribute.h" +#include "default_nearest_neighbor_index_factory.h" #include "dense_tensor_store.h" +#include "doc_vector_access.h" +#include "tensor_attribute.h" +#include -namespace vespalib { namespace tensor { class MutableDenseTensorView; }} +namespace vespalib::tensor { class MutableDenseTensorView; } -namespace search { +namespace search::tensor { -namespace tensor { +class NearestNeighborIndex; /** * Attribute vector class used to store dense tensors for all * documents in memory. */ -class DenseTensorAttribute : public TensorAttribute -{ +class DenseTensorAttribute : public TensorAttribute, public DocVectorAccess { +private: DenseTensorStore _denseTensorStore; + std::unique_ptr _index; + public: - DenseTensorAttribute(vespalib::stringref baseFileName, const Config &cfg); + DenseTensorAttribute(vespalib::stringref baseFileName, const Config& cfg, + const NearestNeighborIndexFactory& index_factory = DefaultNearestNeighborIndexFactory()); virtual ~DenseTensorAttribute(); + // Implements TensorAttribute virtual void setTensor(DocId docId, const Tensor &tensor) override; virtual std::unique_ptr getTensor(DocId docId) const override; virtual void getTensor(DocId docId, vespalib::tensor::MutableDenseTensorView &tensor) const override; @@ -28,9 +35,11 @@ public: virtual std::unique_ptr onInitSave(vespalib::stringref fileName) override; virtual void compactWorst() override; virtual uint32_t getVersion() const override; -}; + // Implements DocVectorAccess + vespalib::tensor::TypedCells get_vector(uint32_t docid) const override; -} // namespace search::tensor + const NearestNeighborIndex* nearest_neighbor_index() const { return _index.get(); } +}; -} // namespace search +} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h index 8dfb77ddccb..b682824c805 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h @@ -2,6 +2,8 @@ #pragma once +#include + namespace vespalib::tensor { struct TypedCells; } namespace search::tensor { @@ -14,6 +16,7 @@ namespace search::tensor { */ class DistanceFunction { public: + using UP = std::unique_ptr; virtual ~DistanceFunction() {} virtual double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const = 0; }; diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index 1e8727e92aa..494d1a859b6 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -3,6 +3,7 @@ #pragma once #include "distance_function.h" +#include namespace search::tensor { diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index be53b758841..860686f3c6a 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -44,7 +44,7 @@ HnswIndex::max_links_for_level(uint32_t level) const uint32_t HnswIndex::make_node_for_document(uint32_t docid) { - uint32_t max_level = _level_generator.max_level(); + uint32_t max_level = _level_generator->max_level(); // TODO: Add capping on num_levels uint32_t num_levels = max_level + 1; // Note: The level array instance lives as long as the document is present in the index. @@ -170,7 +170,7 @@ double HnswIndex::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const { auto rhs = get_vector(rhs_docid); - return _distance_func.calc(lhs, rhs); + return _distance_func->calc(lhs, rhs); } HnswCandidate @@ -227,11 +227,11 @@ HnswIndex::search_layer(const TypedCells& input, uint32_t neighbors_to_find, Fur } } -HnswIndex::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, - RandomLevelGenerator& level_generator, const Config& cfg) +HnswIndex::HnswIndex(const DocVectorAccess& vectors, DistanceFunction::UP distance_func, + RandomLevelGenerator::UP level_generator, const Config& cfg) : _vectors(vectors), - _distance_func(distance_func), - _level_generator(level_generator), + _distance_func(std::move(distance_func)), + _level_generator(std::move(level_generator)), _cfg(cfg), _node_refs(), _nodes(make_default_node_store_config()), diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 814148072ca..66d6a6d25c2 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -2,10 +2,12 @@ #pragma once +#include "distance_function.h" #include "doc_vector_access.h" #include "hnsw_index_utils.h" #include "hnsw_node.h" #include "nearest_neighbor_index.h" +#include "random_level_generator.h" #include #include #include @@ -15,9 +17,6 @@ namespace search::tensor { -class DistanceFunction; -class RandomLevelGenerator; - /** * Implementation of a hierarchical navigable small world graph (HNSW) * that is used for approximate K-nearest neighbor search. @@ -82,8 +81,8 @@ protected: using TypedCells = vespalib::tensor::TypedCells; const DocVectorAccess& _vectors; - const DistanceFunction& _distance_func; - RandomLevelGenerator& _level_generator; + DistanceFunction::UP _distance_func; + RandomLevelGenerator::UP _level_generator; Config _cfg; NodeRefVector _node_refs; NodeStore _nodes; @@ -128,10 +127,12 @@ protected: void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); public: - HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, - RandomLevelGenerator& level_generator, const Config& cfg); + HnswIndex(const DocVectorAccess& vectors, DistanceFunction::UP distance_func, + RandomLevelGenerator::UP level_generator, const Config& cfg); ~HnswIndex() override; + const Config& config() const { return _cfg; } + void add_document(uint32_t docid) override; void remove_document(uint32_t docid) override; std::vector find_top_k(uint32_t k, TypedCells vector, uint32_t explore_k) override; diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_factory.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_factory.h new file mode 100644 index 00000000000..c09403df5e0 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index_factory.h @@ -0,0 +1,26 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include +#include + +namespace search::attribute { class HnswIndexParams; } + +namespace search::tensor { + +class DocVectorAccess; +class NearestNeighborIndex; + +/** + * Factory interface used to instantiate an index used for (approximate) nearest neighbor search. + */ +class NearestNeighborIndexFactory { +public: + virtual ~NearestNeighborIndexFactory() {} + virtual std::unique_ptr make(const DocVectorAccess& vectors, + vespalib::eval::ValueType::CellType cell_type, + const search::attribute::HnswIndexParams& params) const = 0; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/random_level_generator.h b/searchlib/src/vespa/searchlib/tensor/random_level_generator.h index 0fcac977d9d..0f4c7c34445 100644 --- a/searchlib/src/vespa/searchlib/tensor/random_level_generator.h +++ b/searchlib/src/vespa/searchlib/tensor/random_level_generator.h @@ -2,6 +2,8 @@ #pragma once +#include + namespace search::tensor { /** @@ -9,6 +11,7 @@ namespace search::tensor { */ class RandomLevelGenerator { public: + using UP = std::unique_ptr; virtual ~RandomLevelGenerator() {} virtual uint32_t max_level() = 0; }; -- cgit v1.2.3