diff options
author | Arne Juul <arnej@yahooinc.com> | 2023-04-20 09:27:12 +0000 |
---|---|---|
committer | Arne Juul <arnej@yahooinc.com> | 2023-04-20 09:30:24 +0000 |
commit | 808175e4afc92fe5e6a7eddeaf23110df7d46f8f (patch) | |
tree | bbda83e2d55cbbcf5f954f7057ac5ad2951ef0e1 /searchlib | |
parent | 4e3fd9eeebeb403d4ad23bf70470d895cbdfbd1c (diff) |
add BoundAngularDistance
Diffstat (limited to 'searchlib')
4 files changed, 144 insertions, 18 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp index f4faabde559..86b83b2c651 100644 --- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp +++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp @@ -91,12 +91,26 @@ TEST(DistanceFunctionsTest, euclidean_int8_smoketest) } +double computeAngularChecked(TypedCells a, TypedCells b) { + static AngularDistanceFunctionFactory<float> flt_dff; + static AngularDistanceFunctionFactory<double> dbl_dff; + auto d_n = dbl_dff.for_query_vector(a); + auto d_f = flt_dff.for_query_vector(a); + auto d_r = dbl_dff.for_query_vector(b); + auto d_i = dbl_dff.for_insertion_vector(a); + // normal: + double result = d_n->calc(b); + // insert is exactly same: + EXPECT_EQ(d_i->calc(b), result); + // reverse: + EXPECT_DOUBLE_EQ(d_r->calc(a), result); + // float factory: + EXPECT_FLOAT_EQ(d_f->calc(b), result); + return result; +} + TEST(DistanceFunctionsTest, angular_gives_expected_score) { - auto ct = vespalib::eval::CellType::DOUBLE; - - auto angular = make_distance_function(DistanceMetric::Angular, ct); - std::vector<double> p0{0.0, 0.0, 0.0}; std::vector<double> p1{1.0, 0.0, 0.0}; std::vector<double> p2{0.0, 1.0, 0.0}; @@ -105,10 +119,13 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score) std::vector<double> p5{0.0,-1.0, 0.0}; std::vector<double> p6{1.0, 2.0, 2.0}; + AngularDistanceFunctionFactory<double> dff; + auto angular = dff.for_query_vector(t(p0)); + constexpr double pi = 3.14159265358979323846; - double a12 = angular->calc(t(p1), t(p2)); - double a13 = angular->calc(t(p1), t(p3)); - double a23 = angular->calc(t(p2), t(p3)); + double a12 = computeAngularChecked(t(p1), t(p2)); + double a13 = computeAngularChecked(t(p1), t(p3)); + double a23 = computeAngularChecked(t(p2), t(p3)); EXPECT_DOUBLE_EQ(a12, 1.0); EXPECT_DOUBLE_EQ(a13, 1.0); EXPECT_DOUBLE_EQ(a23, 1.0); @@ 
-117,44 +134,63 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score) double threshold = angular->convert_threshold(pi/2); EXPECT_DOUBLE_EQ(threshold, 1.0); - double a14 = angular->calc(t(p1), t(p4)); - double a24 = angular->calc(t(p2), t(p4)); + double a14 = computeAngularChecked(t(p1), t(p4)); + double a24 = computeAngularChecked(t(p2), t(p4)); EXPECT_FLOAT_EQ(a14, 0.5); EXPECT_FLOAT_EQ(a24, 0.5); EXPECT_FLOAT_EQ(angular->to_rawscore(a14), 1.0/(1.0 + pi/3)); threshold = angular->convert_threshold(pi/3); EXPECT_DOUBLE_EQ(threshold, 0.5); - double a34 = angular->calc(t(p3), t(p4)); + double a34 = computeAngularChecked(t(p3), t(p4)); EXPECT_FLOAT_EQ(a34, (1.0 - 0.707107)); EXPECT_FLOAT_EQ(angular->to_rawscore(a34), 1.0/(1.0 + pi/4)); threshold = angular->convert_threshold(pi/4); EXPECT_FLOAT_EQ(threshold, a34); - double a25 = angular->calc(t(p2), t(p5)); + double a25 = computeAngularChecked(t(p2), t(p5)); EXPECT_DOUBLE_EQ(a25, 2.0); EXPECT_FLOAT_EQ(angular->to_rawscore(a25), 1.0/(1.0 + pi)); threshold = angular->convert_threshold(pi); EXPECT_FLOAT_EQ(threshold, 2.0); - double a44 = angular->calc(t(p4), t(p4)); + double a44 = computeAngularChecked(t(p4), t(p4)); EXPECT_GE(a44, 0.0); EXPECT_LT(a44, 0.000001); EXPECT_FLOAT_EQ(angular->to_rawscore(a44), 1.0); - double a66 = angular->calc(t(p6), t(p6)); + double a66 = computeAngularChecked(t(p6), t(p6)); EXPECT_GE(a66, 0.0); EXPECT_LT(a66, 0.000001); EXPECT_FLOAT_EQ(angular->to_rawscore(a66), 1.0); threshold = angular->convert_threshold(0.0); EXPECT_FLOAT_EQ(threshold, 0.0); - double a16 = angular->calc(t(p1), t(p6)); - double a26 = angular->calc(t(p2), t(p6)); - double a36 = angular->calc(t(p3), t(p6)); + double a16 = computeAngularChecked(t(p1), t(p6)); + double a26 = computeAngularChecked(t(p2), t(p6)); + double a36 = computeAngularChecked(t(p3), t(p6)); EXPECT_FLOAT_EQ(a16, 1.0 - (1.0/3.0)); EXPECT_FLOAT_EQ(a26, 1.0 - (2.0/3.0)); EXPECT_FLOAT_EQ(a36, 1.0 - (2.0/3.0)); + + // check also that cell type 
conversion works: + std::vector<Int8Float> iv0{0.0, 0.0, 0.0}; + std::vector<Int8Float> iv1{1.0, 0.0, 0.0}; + std::vector<Int8Float> iv2{0.0, 1.0, 0.0}; + std::vector<Int8Float> iv3{0.0, 0.0, 1.0}; + std::vector<Int8Float> iv5{0.0,-1.0, 0.0}; + std::vector<Int8Float> iv6{1.0, 2.0, 2.0}; + + EXPECT_DOUBLE_EQ(a12, computeAngularChecked(t(iv1), t(iv2))); + EXPECT_DOUBLE_EQ(a13, computeAngularChecked(t(iv1), t(iv3))); + EXPECT_DOUBLE_EQ(a14, computeAngularChecked(t(iv1), t(p4))); + EXPECT_DOUBLE_EQ(a24, computeAngularChecked(t(iv2), t(p4))); + EXPECT_DOUBLE_EQ(a34, computeAngularChecked(t(iv3), t(p4))); + EXPECT_DOUBLE_EQ(a25, computeAngularChecked(t(iv2), t(iv5))); + EXPECT_DOUBLE_EQ(a16, computeAngularChecked(t(iv1), t(iv6))); + EXPECT_DOUBLE_EQ(a26, computeAngularChecked(t(iv2), t(iv6))); + EXPECT_DOUBLE_EQ(a36, computeAngularChecked(t(iv3), t(iv6))); + EXPECT_DOUBLE_EQ(a66, computeAngularChecked(t(iv6), t(iv6))); } TEST(DistanceFunctionsTest, innerproduct_gives_expected_score) @@ -177,7 +213,7 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score) EXPECT_DOUBLE_EQ(i12, 1.0); EXPECT_DOUBLE_EQ(i13, 1.0); EXPECT_DOUBLE_EQ(i23, 1.0); - + double i14 = innerproduct->calc(t(p1), t(p4)); double i24 = innerproduct->calc(t(p2), t(p4)); EXPECT_DOUBLE_EQ(i14, 0.5); @@ -238,7 +274,7 @@ TEST(DistanceFunctionsTest, hamming_gives_expected_score) double d25 = hamming->calc(t(points[2]), t(points[5])); EXPECT_EQ(d25, 1.0); EXPECT_DOUBLE_EQ(hamming->to_rawscore(d25), 1.0/(1.0 + 1.0)); - + double threshold = hamming->convert_threshold(0.25); EXPECT_DOUBLE_EQ(threshold, 0.25); threshold = hamming->convert_threshold(0.5); diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp index 263bf91877a..6c6a1b34097 100644 --- a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp +++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp @@ -4,6 +4,7 @@ using vespalib::typify_invoke; using 
vespalib::eval::TypifyCellType; +using vespalib::eval::TypedCells; namespace search::tensor { @@ -49,4 +50,74 @@ AngularDistance::calc(const vespalib::eval::TypedCells& lhs, template class AngularDistanceHW<float>; template class AngularDistanceHW<double>; + +template<typename FloatType> +class BoundAngularDistance : public BoundDistanceFunction { +private: + const vespalib::hwaccelrated::IAccelrated & _computer; + mutable TemporaryVectorStore<FloatType> _tmpSpace; + const vespalib::ConstArrayRef<FloatType> _lhs; + double _lhs_norm_sq; +public: + BoundAngularDistance(const vespalib::eval::TypedCells& lhs) + : BoundDistanceFunction(vespalib::eval::get_cell_type<FloatType>()), + _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()), + _tmpSpace(lhs.size), + _lhs(_tmpSpace.storeLhs(lhs)) + { + auto a = &_lhs[0]; + _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size); + } + double calc(const vespalib::eval::TypedCells& rhs) const override { + size_t sz = _lhs.size(); + vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs); + assert(sz == rhs_vector.size()); + auto a = &_lhs[0]; + auto b = &rhs_vector[0]; + double b_norm_sq = _computer.dotProduct(b, b, sz); + double squared_norms = _lhs_norm_sq * b_norm_sq; + double dot_product = _computer.dotProduct(a, b, sz); + double div = (squared_norms > 0) ? 
sqrt(squared_norms) : 1.0; + double cosine_similarity = dot_product / div; + double distance = 1.0 - cosine_similarity; // in range [0,2] + return distance; + } + double convert_threshold(double threshold) const override { + double cosine_similarity = cos(threshold); + return 1.0 - cosine_similarity; + } + double to_rawscore(double distance) const override { + double cosine_similarity = 1.0 - distance; + // should be in range [-1,1] but roundoff may cause problems: + cosine_similarity = std::min(1.0, cosine_similarity); + cosine_similarity = std::max(-1.0, cosine_similarity); + double angle_distance = acos(cosine_similarity); // in range [0,pi] + double score = 1.0 / (1.0 + angle_distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& rhs, double) const override { + return calc(rhs); + } +}; + +template class BoundAngularDistance<float>; +template class BoundAngularDistance<double>; + +template <typename FloatType> +BoundDistanceFunction::UP +AngularDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) { + using DFT = BoundAngularDistance<FloatType>; + return std::make_unique<DFT>(lhs); +} + +template <typename FloatType> +BoundDistanceFunction::UP +AngularDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) { + using DFT = BoundAngularDistance<FloatType>; + return std::make_unique<DFT>(lhs); +} + +template class AngularDistanceFunctionFactory<float>; +template class AngularDistanceFunctionFactory<double>; + } diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h index e3b21f9546e..4f10c25bdde 100644 --- a/searchlib/src/vespa/searchlib/tensor/angular_distance.h +++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.h @@ -3,6 +3,8 @@ #pragma once #include "distance_function.h" +#include "bound_distance_function.h" +#include "distance_function_factory.h" #include 
<vespa/eval/eval/typed_cells.h> #include <vespa/vespalib/hwaccelrated/iaccelrated.h> #include <cmath> @@ -73,4 +75,15 @@ private: const vespalib::hwaccelrated::IAccelrated & _computer; }; +template <typename FloatType> +class AngularDistanceFunctionFactory : public DistanceFunctionFactory { +public: + AngularDistanceFunctionFactory() + : DistanceFunctionFactory(vespalib::eval::get_cell_type<FloatType>()) + {} + + BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override; + BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override; +}; + } diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index f96715bcf60..3d2beec2d19 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -93,6 +93,12 @@ std::unique_ptr<DistanceFunctionFactory> make_distance_function_factory(search::attribute::DistanceMetric variant, vespalib::eval::CellType cell_type) { + if (variant == DistanceMetric::Angular) { + if (cell_type == CellType::DOUBLE) { + return std::make_unique<AngularDistanceFunctionFactory<double>>(); + } + return std::make_unique<AngularDistanceFunctionFactory<float>>(); + } auto df = make_distance_function(variant, cell_type); return std::make_unique<SimpleDistanceFunctionFactory>(std::move(df)); } |