diff options
author | Geir Storli <geirst@verizonmedia.com> | 2020-02-10 11:59:23 +0000 |
---|---|---|
committer | Geir Storli <geirst@verizonmedia.com> | 2020-02-10 12:12:08 +0000 |
commit | 8164e4a5c4910a878e49663730de5c2b310d5940 (patch) | |
tree | 283898fd957bdac97afbd99cfb10bf021e70f798 | |
parent | 2fb3b11480e049cea184c96824f9ac6ab11e4c46 (diff) |
Add interface used to calculate the distance between two n-dim vectors.
7 files changed, 83 insertions, 24 deletions
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 634569cbc8c..0c5ddb93e92 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -1,6 +1,7 @@ // Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include <vespa/eval/tensor/dense/typed_cells.h> +#include <vespa/searchlib/tensor/distance_functions.h> #include <vespa/searchlib/tensor/doc_vector_access.h> #include <vespa/searchlib/tensor/hnsw_index.h> #include <vespa/searchlib/tensor/random_level_generator.h> @@ -41,12 +42,14 @@ struct LevelGenerator : public RandomLevelGenerator { }; using FloatVectors = MyDocVectorAccess<float>; +using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>; using FloatIndex = HnswIndex<float>; using FloatIndexUP = std::unique_ptr<FloatIndex>; class HnswIndexTest : public ::testing::Test { public: FloatVectors vectors; + FloatSqEuclideanDistance distance_func; LevelGenerator level_generator; FloatIndexUP index; @@ -60,7 +63,7 @@ public: .set(7, {3, 5}); } void init(bool heuristic_select_neighbors) { - index = std::make_unique<FloatIndex>(vectors, level_generator, + index = std::make_unique<FloatIndex>(vectors, distance_func, level_generator, HnswIndexBase::Config(2, 1, 10, heuristic_select_neighbors)); } void add_document(uint32_t docid, uint32_t max_level = 0) { diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h new file mode 100644 index 00000000000..8dfb77ddccb --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h @@ -0,0 +1,21 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace vespalib::tensor { struct TypedCells; } + +namespace search::tensor { + +/** + * Interface used to calculate the distance between two n-dimensional vectors. + * + * The vectors must be of same size and same type (float or double). + * The actual implementation must know which type the vectors are. + */ +class DistanceFunction { +public: + virtual ~DistanceFunction() {} + virtual double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const = 0; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h new file mode 100644 index 00000000000..dabab55dc7a --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -0,0 +1,34 @@ +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" + +namespace search::tensor { + +// TODO: Make distance functions hardware optimized. + +/** + * Calculates the square of the standard Euclidean distance. + */ +template <typename FloatType> +class SquaredEuclideanDistance : public DistanceFunction { +public: + double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override { + auto lhs_vector = lhs.template typify<FloatType>(); + auto rhs_vector = rhs.template typify<FloatType>(); + double result = 0.0; + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + for (size_t i = 0; i < sz; ++i) { + double diff = lhs_vector[i] - rhs_vector[i]; + result += diff * diff; + } + return result; + } +}; + +template class SquaredEuclideanDistance<float>; +template class SquaredEuclideanDistance<double>; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 345f7f551a6..cb617246066 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -1,5 +1,6 @@ // Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "distance_function.h" #include "hnsw_index.h" #include <vespa/vespalib/util/rcuvector.hpp> @@ -15,23 +16,15 @@ HnswIndex<FloatType>::calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) cons template <typename FloatType> double -HnswIndex<FloatType>::calc_distance(const Vector& lhs, uint32_t rhs_docid) const +HnswIndex<FloatType>::calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const { - // TODO: Make it possible to specify the distance function from the outside and make it hardware optimized. auto rhs = get_vector(rhs_docid); - double result = 0.0; - size_t sz = lhs.size(); - assert(sz == rhs.size()); - for (size_t i = 0; i < sz; ++i) { - double diff = lhs[i] - rhs[i]; - result += diff * diff; - } - return result; + return _distance_func.calc(lhs, rhs); } template <typename FloatType> HnswCandidate -HnswIndex<FloatType>::find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level) +HnswIndex<FloatType>::find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level) { HnswCandidate nearest = entry_point; bool keep_searching = true; @@ -50,7 +43,7 @@ HnswIndex<FloatType>::find_nearest_in_layer(const Vector& input, const HnswCandi template <typename FloatType> void -HnswIndex<FloatType>::search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) +HnswIndex<FloatType>::search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& best_neighbors, uint32_t level) { NearestPriQ candidates; // TODO: Add proper handling of visited set. @@ -86,8 +79,9 @@ HnswIndex<FloatType>::search_layer(const Vector& input, uint32_t neighbors_to_fi } template <typename FloatType> -HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg) - : HnswIndexBase(vectors, level_generator, cfg) +HnswIndex<FloatType>::HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg) + : HnswIndexBase(vectors, distance_func, level_generator, cfg) { } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index be6f507dbe1..a93062cf200 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -11,6 +11,7 @@ namespace search::tensor { +class DistanceFunction; class DocVectorAccess; class RandomLevelGenerator; @@ -25,22 +26,23 @@ class RandomLevelGenerator; template <typename FloatType = float> class HnswIndex : public HnswIndexBase { private: - using Vector = vespalib::ConstArrayRef<FloatType>; + using TypedCells = vespalib::tensor::TypedCells; - inline Vector get_vector(uint32_t docid) const { - return _vectors.get_vector(docid).template typify<FloatType>(); + inline TypedCells get_vector(uint32_t docid) const { + return _vectors.get_vector(docid); } double calc_distance(uint32_t lhs_docid, uint32_t rhs_docid) const override; - double calc_distance(const Vector& lhs, uint32_t rhs_docid) const; + double calc_distance(const TypedCells& lhs, uint32_t rhs_docid) const; /** * Performs a greedy search in the given layer to find the candidate that is nearest the input vector. */ - HnswCandidate find_nearest_in_layer(const Vector& input, const HnswCandidate& entry_point, uint32_t level); - void search_layer(const Vector& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); + HnswCandidate find_nearest_in_layer(const TypedCells& input, const HnswCandidate& entry_point, uint32_t level); + void search_layer(const TypedCells& input, uint32_t neighbors_to_find, FurthestPriQ& found_neighbors, uint32_t level); public: - HnswIndex(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg); + HnswIndex(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndex() override; void add_document(uint32_t docid) override; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp index a7b9c1dba79..e0ecdf87399 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.cpp @@ -156,8 +156,10 @@ HnswIndexBase::remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t set_link_array(remove_from, level, new_links); } -HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg) +HnswIndexBase::HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg) : _vectors(vectors), + _distance_func(distance_func), _level_generator(level_generator), _cfg(cfg), _node_refs(), diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h index 9987b61c157..ab59ad02f14 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index_base.h @@ -13,6 +13,7 @@ namespace search::tensor { +class DistanceFunction; class DocVectorAccess; class RandomLevelGenerator; @@ -78,6 +79,7 @@ protected: using LinkArray = vespalib::Array<uint32_t>; const DocVectorAccess& _vectors; + const DistanceFunction& _distance_func; RandomLevelGenerator& _level_generator; Config _cfg; NodeRefVector _node_refs; @@ -112,7 +114,8 @@ protected: void remove_link_to(uint32_t remove_from, uint32_t remove_id, uint32_t level); public: - HnswIndexBase(const DocVectorAccess& vectors, RandomLevelGenerator& level_generator, const Config& cfg); + HnswIndexBase(const DocVectorAccess& vectors, const DistanceFunction& distance_func, + RandomLevelGenerator& level_generator, const Config& cfg); ~HnswIndexBase() override; // TODO: Add support for generation handling and cleanup (transfer_hold_lists, trim_hold_lists) |