add BoundAngularDistance

author: Arne Juul <arnej@yahooinc.com> 2023-04-20 09:27:12 +0000
committer: Arne Juul <arnej@yahooinc.com> 2023-04-20 09:30:24 +0000
commit: 808175e4afc92fe5e6a7eddeaf23110df7d46f8f (patch)
tree: bbda83e2d55cbbcf5f954f7057ac5ad2951ef0e1 /searchlib
parent: 4e3fd9eeebeb403d4ad23bf70470d895cbdfbd1c (diff)
4 files changed, 144 insertions, 18 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index f4faabde559..86b83b2c651 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -91,12 +91,26 @@ TEST(DistanceFunctionsTest, euclidean_int8_smoketest)
 
 }
 
+double computeAngularChecked(TypedCells a, TypedCells b) { // angular distance a<->b, cross-checked across factory variants
+    static AngularDistanceFunctionFactory<float> flt_dff;
+    static AngularDistanceFunctionFactory<double> dbl_dff;
+    auto d_n = dbl_dff.for_query_vector(a);     // a bound as query vector, double factory
+    auto d_f = flt_dff.for_query_vector(a);     // a bound as query vector, float factory
+    auto d_r = dbl_dff.for_query_vector(b);     // b bound instead of a (reverse direction)
+    auto d_i = dbl_dff.for_insertion_vector(a); // a bound as insertion vector
+    // normal: distance from bound a to b; this is the value returned
+    double result = d_n->calc(b);
+    // insertion-vector binding must give exactly the same value:
+    EXPECT_EQ(d_i->calc(b), result);
+    // reverse: binding b and measuring against a must agree:
+    EXPECT_DOUBLE_EQ(d_r->calc(a), result);
+    // float factory must agree when compared with EXPECT_FLOAT_EQ:
+    EXPECT_FLOAT_EQ(d_f->calc(b), result);
+    return result;
+}
+
 TEST(DistanceFunctionsTest, angular_gives_expected_score)
 {
-    auto ct = vespalib::eval::CellType::DOUBLE;
-
-    auto angular = make_distance_function(DistanceMetric::Angular, ct);
-
     std::vector<double> p0{0.0, 0.0, 0.0};
     std::vector<double> p1{1.0, 0.0, 0.0};
     std::vector<double> p2{0.0, 1.0, 0.0};
@@ -105,10 +119,13 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
     std::vector<double> p5{0.0,-1.0, 0.0};
     std::vector<double> p6{1.0, 2.0, 2.0};
 
+    AngularDistanceFunctionFactory<double> dff;
+    auto angular = dff.for_query_vector(t(p0));
+
     constexpr double pi = 3.14159265358979323846;
-    double a12 = angular->calc(t(p1), t(p2));
-    double a13 = angular->calc(t(p1), t(p3));
-    double a23 = angular->calc(t(p2), t(p3));
+    double a12 = computeAngularChecked(t(p1), t(p2));
+    double a13 = computeAngularChecked(t(p1), t(p3));
+    double a23 = computeAngularChecked(t(p2), t(p3));
     EXPECT_DOUBLE_EQ(a12, 1.0);
     EXPECT_DOUBLE_EQ(a13, 1.0);
     EXPECT_DOUBLE_EQ(a23, 1.0);
@@ -117,44 +134,63 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
     double threshold = angular->convert_threshold(pi/2);
     EXPECT_DOUBLE_EQ(threshold, 1.0);
 
-    double a14 = angular->calc(t(p1), t(p4));
-    double a24 = angular->calc(t(p2), t(p4));
+    double a14 = computeAngularChecked(t(p1), t(p4));
+    double a24 = computeAngularChecked(t(p2), t(p4));
     EXPECT_FLOAT_EQ(a14, 0.5);
     EXPECT_FLOAT_EQ(a24, 0.5);
     EXPECT_FLOAT_EQ(angular->to_rawscore(a14), 1.0/(1.0 + pi/3));
     threshold = angular->convert_threshold(pi/3);
     EXPECT_DOUBLE_EQ(threshold, 0.5);
 
-    double a34 = angular->calc(t(p3), t(p4));
+    double a34 = computeAngularChecked(t(p3), t(p4));
     EXPECT_FLOAT_EQ(a34, (1.0 - 0.707107));
     EXPECT_FLOAT_EQ(angular->to_rawscore(a34), 1.0/(1.0 + pi/4));
     threshold = angular->convert_threshold(pi/4);
     EXPECT_FLOAT_EQ(threshold, a34);
 
-    double a25 = angular->calc(t(p2), t(p5));
+    double a25 = computeAngularChecked(t(p2), t(p5));
     EXPECT_DOUBLE_EQ(a25, 2.0);
     EXPECT_FLOAT_EQ(angular->to_rawscore(a25), 1.0/(1.0 + pi));
     threshold = angular->convert_threshold(pi);
     EXPECT_FLOAT_EQ(threshold, 2.0);
 
-    double a44 = angular->calc(t(p4), t(p4));
+    double a44 = computeAngularChecked(t(p4), t(p4));
     EXPECT_GE(a44, 0.0);
     EXPECT_LT(a44, 0.000001);
     EXPECT_FLOAT_EQ(angular->to_rawscore(a44), 1.0);
 
-    double a66 = angular->calc(t(p6), t(p6));
+    double a66 = computeAngularChecked(t(p6), t(p6));
     EXPECT_GE(a66, 0.0);
     EXPECT_LT(a66, 0.000001);
     EXPECT_FLOAT_EQ(angular->to_rawscore(a66), 1.0);
     threshold = angular->convert_threshold(0.0);
     EXPECT_FLOAT_EQ(threshold, 0.0);
 
-    double a16 = angular->calc(t(p1), t(p6));
-    double a26 = angular->calc(t(p2), t(p6));
-    double a36 = angular->calc(t(p3), t(p6));
+    double a16 = computeAngularChecked(t(p1), t(p6));
+    double a26 = computeAngularChecked(t(p2), t(p6));
+    double a36 = computeAngularChecked(t(p3), t(p6));
     EXPECT_FLOAT_EQ(a16, 1.0 - (1.0/3.0));
     EXPECT_FLOAT_EQ(a26, 1.0 - (2.0/3.0));
     EXPECT_FLOAT_EQ(a36, 1.0 - (2.0/3.0));
+
+    // check also that cell type conversion works:
+    std::vector<Int8Float> iv0{0.0, 0.0, 0.0};
+    std::vector<Int8Float> iv1{1.0, 0.0, 0.0};
+    std::vector<Int8Float> iv2{0.0, 1.0, 0.0};
+    std::vector<Int8Float> iv3{0.0, 0.0, 1.0};
+    std::vector<Int8Float> iv5{0.0,-1.0, 0.0};
+    std::vector<Int8Float> iv6{1.0, 2.0, 2.0};
+
+    EXPECT_DOUBLE_EQ(a12, computeAngularChecked(t(iv1), t(iv2)));
+    EXPECT_DOUBLE_EQ(a13, computeAngularChecked(t(iv1), t(iv3)));
+    EXPECT_DOUBLE_EQ(a14, computeAngularChecked(t(iv1), t(p4)));
+    EXPECT_DOUBLE_EQ(a24, computeAngularChecked(t(iv2), t(p4)));
+    EXPECT_DOUBLE_EQ(a34, computeAngularChecked(t(iv3), t(p4)));
+    EXPECT_DOUBLE_EQ(a25, computeAngularChecked(t(iv2), t(iv5)));
+    EXPECT_DOUBLE_EQ(a16, computeAngularChecked(t(iv1), t(iv6)));
+    EXPECT_DOUBLE_EQ(a26, computeAngularChecked(t(iv2), t(iv6)));
+    EXPECT_DOUBLE_EQ(a36, computeAngularChecked(t(iv3), t(iv6)));
+    EXPECT_DOUBLE_EQ(a66, computeAngularChecked(t(iv6), t(iv6)));
 }
 
 TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
@@ -177,7 +213,7 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
     EXPECT_DOUBLE_EQ(i12, 1.0);
     EXPECT_DOUBLE_EQ(i13, 1.0);
     EXPECT_DOUBLE_EQ(i23, 1.0);
-    
+
     double i14 = innerproduct->calc(t(p1), t(p4));
     double i24 = innerproduct->calc(t(p2), t(p4));
     EXPECT_DOUBLE_EQ(i14, 0.5);
@@ -238,7 +274,7 @@ TEST(DistanceFunctionsTest, hamming_gives_expected_score)
     double d25 = hamming->calc(t(points[2]), t(points[5]));
     EXPECT_EQ(d25, 1.0);
     EXPECT_DOUBLE_EQ(hamming->to_rawscore(d25), 1.0/(1.0 + 1.0));
- 
+
     double threshold = hamming->convert_threshold(0.25);
     EXPECT_DOUBLE_EQ(threshold, 0.25);
     threshold = hamming->convert_threshold(0.5);
diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
index 263bf91877a..6c6a1b34097 100644
--- a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
@@ -4,6 +4,7 @@
 
 using vespalib::typify_invoke;
 using vespalib::eval::TypifyCellType;
+using vespalib::eval::TypedCells;
 
 namespace search::tensor {
 
@@ -49,4 +50,74 @@ AngularDistance::calc(const vespalib::eval::TypedCells& lhs,
 template class AngularDistanceHW<float>;
 template class AngularDistanceHW<double>;
 
+
+template<typename FloatType>
+class BoundAngularDistance : public BoundDistanceFunction { // angular distance (1 - cosine similarity) with the left-hand vector bound at construction
+private:
+    const vespalib::hwaccelrated::IAccelrated & _computer;
+    mutable TemporaryVectorStore<FloatType> _tmpSpace; // mutable: the const calc() calls convertRhs() on it
+    const vespalib::ConstArrayRef<FloatType> _lhs; // result of _tmpSpace.storeLhs(lhs)
+    double _lhs_norm_sq; // dot product of _lhs with itself, computed once in the constructor
+public:
+    BoundAngularDistance(const vespalib::eval::TypedCells& lhs)
+        : BoundDistanceFunction(vespalib::eval::get_cell_type<FloatType>()),
+          _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()),
+          _tmpSpace(lhs.size),
+          _lhs(_tmpSpace.storeLhs(lhs))
+    {
+        auto a = &_lhs[0];
+        _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size);
+    }
+    double calc(const vespalib::eval::TypedCells& rhs) const override { // returns 1 - cosine similarity between _lhs and rhs
+        size_t sz = _lhs.size();
+        vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
+        assert(sz == rhs_vector.size());
+        auto a = &_lhs[0];
+        auto b = &rhs_vector[0];
+        double b_norm_sq = _computer.dotProduct(b, b, sz);
+        double squared_norms = _lhs_norm_sq * b_norm_sq;
+        double dot_product = _computer.dotProduct(a, b, sz);
+        double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; // divide by 1.0 instead of 0 when the norm product is not positive
+        double cosine_similarity = dot_product / div;
+        double distance = 1.0 - cosine_similarity; // in range [0,2]
+        return distance;
+    }
+    double convert_threshold(double threshold) const override { // threshold is treated as an angle in radians (passed to cos)
+        double cosine_similarity = cos(threshold);
+        return 1.0 - cosine_similarity;
+    }
+    double to_rawscore(double distance) const override { // maps a calc() distance to 1 / (1 + angle)
+        double cosine_similarity = 1.0 - distance;
+        // should be in range [-1,1], but roundoff may push it outside; clamp before acos:
+        cosine_similarity = std::min(1.0, cosine_similarity);
+        cosine_similarity = std::max(-1.0, cosine_similarity);
+        double angle_distance = acos(cosine_similarity); // in range [0,pi]
+        double score = 1.0 / (1.0 + angle_distance);
+        return score;
+    }
+    double calc_with_limit(const vespalib::eval::TypedCells& rhs, double) const override { // the limit argument is ignored
+        return calc(rhs);
+    }
+};
+
+template class BoundAngularDistance<float>;
+template class BoundAngularDistance<double>;
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+AngularDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) {
+    using DFT = BoundAngularDistance<FloatType>; // bound distance function for this factory's FloatType
+    return std::make_unique<DFT>(lhs); // lhs is passed to the BoundAngularDistance constructor
+}
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+AngularDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) {
+    using DFT = BoundAngularDistance<FloatType>; // same bound function type as for_query_vector
+    return std::make_unique<DFT>(lhs); // lhs is passed to the BoundAngularDistance constructor
+}
+
+template class AngularDistanceFunctionFactory<float>;
+template class AngularDistanceFunctionFactory<double>;
+
 }
diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h
index e3b21f9546e..4f10c25bdde 100644
--- a/searchlib/src/vespa/searchlib/tensor/angular_distance.h
+++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.h
@@ -3,6 +3,8 @@
 #pragma once
 
 #include "distance_function.h"
+#include "bound_distance_function.h"
+#include "distance_function_factory.h"
 #include <vespa/eval/eval/typed_cells.h>
 #include <vespa/vespalib/hwaccelrated/iaccelrated.h>
 #include <cmath>
@@ -73,4 +75,15 @@ private:
     const vespalib::hwaccelrated::IAccelrated & _computer;
 };
 
+template <typename FloatType>
+class AngularDistanceFunctionFactory : public DistanceFunctionFactory { // factory for angular distance functions bound to one vector
+public:
+    AngularDistanceFunctionFactory()
+        : DistanceFunctionFactory(vespalib::eval::get_cell_type<FloatType>()) // passes FloatType's cell type to the base constructor
+        {}
+
+    BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override; // defined in angular_distance.cpp
+    BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override; // defined in angular_distance.cpp
+};
+
 }
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index f96715bcf60..3d2beec2d19 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -93,6 +93,12 @@ std::unique_ptr<DistanceFunctionFactory>
 make_distance_function_factory(search::attribute::DistanceMetric variant,
                                vespalib::eval::CellType cell_type)
 {
+    if (variant == DistanceMetric::Angular) {
+        if (cell_type == CellType::DOUBLE) {
+            return std::make_unique<AngularDistanceFunctionFactory<double>>();
+        }
+        return std::make_unique<AngularDistanceFunctionFactory<float>>();
+    }
     auto df = make_distance_function(variant, cell_type);
     return std::make_unique<SimpleDistanceFunctionFactory>(std::move(df));
 }
author	Arne Juul <arnej@yahooinc.com>	2023-04-20 09:27:12 +0000
committer	Arne Juul <arnej@yahooinc.com>	2023-04-20 09:30:24 +0000
commit	808175e4afc92fe5e6a7eddeaf23110df7d46f8f (patch)
tree	bbda83e2d55cbbcf5f954f7057ac5ad2951ef0e1 /searchlib
parent	4e3fd9eeebeb403d4ad23bf70470d895cbdfbd1c (diff)