diff options
author | Arne Juul <arnej@yahooinc.com> | 2023-04-20 14:06:11 +0000 |
---|---|---|
committer | Arne Juul <arnej@yahooinc.com> | 2023-04-25 11:29:28 +0000 |
commit | e7091a54239d7d0f030ae161781a16aed4c86364 (patch) | |
tree | ceb17b43a998f0d3c5de46b1f6c9853f9fe05ea0 /searchlib | |
parent | d5f17d23f377776e85aa687be17b211b54423c59 (diff) |
add BoundPrenormalizedAngularDistance
Diffstat (limited to 'searchlib')
6 files changed, 186 insertions, 0 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp index ae283f3f2b2..29637bd7bf4 100644 --- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp +++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp @@ -257,6 +257,63 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score) EXPECT_DOUBLE_EQ(a66, computeAngularChecked(t(iv6), t(iv6))); } +double computePrenormalizedAngularChecked(TypedCells a, TypedCells b) { + static PrenormalizedAngularDistanceFunctionFactory<float> flt_dff; + static PrenormalizedAngularDistanceFunctionFactory<double> dbl_dff; + auto d_n = dbl_dff.for_query_vector(a); + auto d_f = flt_dff.for_query_vector(a); + auto d_i = dbl_dff.for_insertion_vector(a); + // normal: + double result = d_n->calc(b); + // insert is exactly same: + EXPECT_EQ(d_i->calc(b), result); + // float factory: + EXPECT_FLOAT_EQ(d_f->calc(b), result); + return result; +} + +TEST(DistanceFunctionsTest, prenormalized_angular_gives_expected_score) +{ + std::vector<double> p0{0.0, 0.0, 0.0}; + std::vector<double> p1{1.0, 0.0, 0.0}; + std::vector<double> p2{0.0, 1.0, 0.0}; + std::vector<double> p3{0.0, 0.0, 1.0}; + std::vector<double> p4{0.5, 0.5, 0.707107}; + std::vector<double> p5{0.0,-1.0, 0.0}; + std::vector<double> p6{1.0, 2.0, 2.0}; + + PrenormalizedAngularDistanceFunctionFactory<double> dff; + auto pnad = dff.for_query_vector(t(p0)); + + double i12 = computePrenormalizedAngularChecked(t(p1), t(p2)); + double i13 = computePrenormalizedAngularChecked(t(p1), t(p3)); + double i23 = computePrenormalizedAngularChecked(t(p2), t(p3)); + EXPECT_DOUBLE_EQ(i12, 1.0); + EXPECT_DOUBLE_EQ(i13, 1.0); + EXPECT_DOUBLE_EQ(i23, 1.0); + + double i14 = computePrenormalizedAngularChecked(t(p1), t(p4)); + double i24 = computePrenormalizedAngularChecked(t(p2), t(p4)); + EXPECT_DOUBLE_EQ(i14, 0.5); + EXPECT_DOUBLE_EQ(i24, 0.5); + double i34 = computePrenormalizedAngularChecked(t(p3), t(p4)); + EXPECT_FLOAT_EQ(i34, 1.0 - 0.707107); + + double i25 = computePrenormalizedAngularChecked(t(p2), t(p5)); + EXPECT_DOUBLE_EQ(i25, 2.0); + + double i44 = computePrenormalizedAngularChecked(t(p4), t(p4)); + EXPECT_GE(i44, 0.0); + EXPECT_LT(i44, 0.000001); + + double threshold = pnad->convert_threshold(0.25); + EXPECT_DOUBLE_EQ(threshold, 0.25); + threshold = pnad->convert_threshold(0.5); + EXPECT_DOUBLE_EQ(threshold, 0.5); + threshold = pnad->convert_threshold(1.0); + EXPECT_DOUBLE_EQ(threshold, 1.0); +} + TEST(DistanceFunctionsTest, innerproduct_gives_expected_score) { auto ct = vespalib::eval::CellType::DOUBLE; @@ -292,6 +349,10 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score) EXPECT_GE(i44, 0.0); EXPECT_LT(i44, 0.000001); + double i66 = innerproduct->calc(t(p6), t(p6)); + EXPECT_GE(i66, 0.0); + EXPECT_LT(i66, 0.000001); + double threshold = innerproduct->convert_threshold(0.25); EXPECT_DOUBLE_EQ(threshold, 0.25); threshold = innerproduct->convert_threshold(0.5); diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 1783e0da1dd..2e874ffa4ae 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -30,6 +30,7 @@ vespa_add_library(searchlib_tensor OBJECT large_subspaces_buffer_type.cpp nearest_neighbor_index.cpp nearest_neighbor_index_saver.cpp + prenormalized_angular_distance.cpp serialized_fast_value_attribute.cpp serialized_tensor_ref.cpp small_subspaces_buffer_type.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index cca492ef212..4553f39a525 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -107,6 +107,20 @@ make_distance_function_factory(search::attribute::DistanceMetric variant, default: return std::make_unique<EuclideanDistanceFunctionFactory<float>>(); } } + if (variant == DistanceMetric::PrenormalizedAngular) { + if (cell_type == CellType::DOUBLE) { + return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<double>>(); + } + return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<float>>(); + } + /* + if (variant == DistanceMetric::GeoDegrees) { + return std::make_unique<GeoDistanceFunctionFactory>(); + } + if (variant == DistanceMetric::Hamming) { + return std::make_unique<HammingDistanceFunctionFactory>(); + } + */ auto df = make_distance_function(variant, cell_type); return std::make_unique<SimpleDistanceFunctionFactory>(std::move(df)); } diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index b28cc2bda46..2300dba2db1 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -8,3 +8,4 @@ #include "geo_degrees_distance.h" #include "hamming_distance.h" #include "inner_product_distance.h" +#include "prenormalized_angular_distance.h" diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp new file mode 100644 index 00000000000..d2693f9f443 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp @@ -0,0 +1,82 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "prenormalized_angular_distance.h" +#include "temporary_vector_store.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +template<typename FloatType> +class BoundPrenormalizedAngularDistance : public BoundDistanceFunction { +private: + const vespalib::hwaccelrated::IAccelrated & _computer; + mutable TemporaryVectorStore<FloatType> _tmpSpace; + const vespalib::ConstArrayRef<FloatType> _lhs; + double _lhs_norm_sq; +public: + BoundPrenormalizedAngularDistance(const vespalib::eval::TypedCells& lhs) + : BoundDistanceFunction(vespalib::eval::get_cell_type<FloatType>()), + _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()), + _tmpSpace(lhs.size), + _lhs(_tmpSpace.storeLhs(lhs)) + { + auto a = _lhs.data(); + _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size); + if (_lhs_norm_sq <= 0.0) { + _lhs_norm_sq = 1.0; + } + } + double calc(const vespalib::eval::TypedCells& rhs) const override { + size_t sz = _lhs.size(); + vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs); + assert(sz == rhs_vector.size()); + auto a = _lhs.data(); + auto b = rhs_vector.data(); + double dot_product = _computer.dotProduct(a, b, sz); + double distance = _lhs_norm_sq - dot_product; + return distance; + } + double convert_threshold(double threshold) const override { + double cosine_similarity = 1.0 - threshold; + double dot_product = cosine_similarity * _lhs_norm_sq; + double distance = _lhs_norm_sq - dot_product; + return distance; + } + double to_rawscore(double distance) const override { + double dot_product = _lhs_norm_sq - distance; + double cosine_similarity = dot_product / _lhs_norm_sq; + // should be in in range [-1,1] but roundoff may cause problems: + cosine_similarity = std::min(1.0, cosine_similarity); + cosine_similarity = std::max(-1.0, cosine_similarity); + double cosine_distance = 1.0 - cosine_similarity; // in range [0,2] + double score = 1.0 / (1.0 + cosine_distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& rhs, double) const override { + return calc(rhs); + } +}; + +template class BoundPrenormalizedAngularDistance<float>; +template class BoundPrenormalizedAngularDistance<double>; + +template <typename FloatType> +BoundDistanceFunction::UP +PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) { + using DFT = BoundPrenormalizedAngularDistance<FloatType>; + return std::make_unique<DFT>(lhs); +} + +template <typename FloatType> +BoundDistanceFunction::UP +PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) { + using DFT = BoundPrenormalizedAngularDistance<FloatType>; + return std::make_unique<DFT>(lhs); +} + +template class PrenormalizedAngularDistanceFunctionFactory<float>; +template class PrenormalizedAngularDistanceFunctionFactory<double>; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h new file mode 100644 index 00000000000..88953a236e7 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h @@ -0,0 +1,27 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include "bound_distance_function.h" +#include "distance_function_factory.h" +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> + +namespace search::tensor { + +/** + * Calculates inner-product "distance" between vectors with assumed norm 1. + * Should give same ordering as Angular distance, but is less expensive. + */ +template <typename FloatType> +class PrenormalizedAngularDistanceFunctionFactory : public DistanceFunctionFactory { +public: + PrenormalizedAngularDistanceFunctionFactory() + : DistanceFunctionFactory(vespalib::eval::get_cell_type<FloatType>()) + {} + BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override; + BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override; +}; + +} |