Merge pull request #26846 from vespa-engine/arnej/bound-prenormalized-3

Arnej/bound prenormalized 3
author: Arne H Juul <arnej27959@users.noreply.github.com> 2023-04-25 17:29:38 +0200
committer: GitHub <noreply@github.com> 2023-04-25 17:29:38 +0200
commit: 1380d8ab1a17aab66c4ea76bfc956a91fcb54cb5 (patch)
tree: c3b2b159e40da8325d9b49fa8809532d165140f9 /searchlib
parent: 21773bf29f0c2318a23661a106b5c29562468ffb (diff)
parent: d9a41679f37d93e3a98338fe01de0ba62d4acdbf (diff)
10 files changed, 227 insertions, 13 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index ae283f3f2b2..e5bed3ebae5 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -69,6 +69,8 @@ double computeEuclideanChecked(TypedCells a, TypedCells b) {
     return result;
 }
 
+namespace { constexpr double sq_root_half = std::sqrt(0.5); }
+
 TEST(DistanceFunctionsTest, euclidean_gives_expected_score)
 {
     auto ct = vespalib::eval::CellType::DOUBLE;
@@ -79,7 +81,7 @@ TEST(DistanceFunctionsTest, euclidean_gives_expected_score)
     std::vector<double> p1{1.0, 0.0, 0.0};
     std::vector<double> p2{0.0, 1.0, 0.0};
     std::vector<double> p3{0.0, 0.0, 1.0};
-    std::vector<double> p4{0.5, 0.5, 0.707107};
+    std::vector<double> p4{0.5, 0.5, sq_root_half};
     std::vector<double> p5{0.0,-1.0, 0.0};
     std::vector<double> p6{1.0, 2.0, 2.0};
 
@@ -179,7 +181,7 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
     std::vector<double> p1{1.0, 0.0, 0.0};
     std::vector<double> p2{0.0, 1.0, 0.0};
     std::vector<double> p3{0.0, 0.0, 1.0};
-    std::vector<double> p4{0.5, 0.5, 0.707107};
+    std::vector<double> p4{0.5, 0.5, sq_root_half};
     std::vector<double> p5{0.0,-1.0, 0.0};
     std::vector<double> p6{1.0, 2.0, 2.0};
 
@@ -207,7 +209,7 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
     EXPECT_DOUBLE_EQ(threshold, 0.5);
 
     double a34 = computeAngularChecked(t(p3), t(p4));
-    EXPECT_FLOAT_EQ(a34, (1.0 - 0.707107));
+    EXPECT_FLOAT_EQ(a34, (1.0 - sq_root_half));
     EXPECT_FLOAT_EQ(angular->to_rawscore(a34), 1.0/(1.0 + pi/4));
     threshold = angular->convert_threshold(pi/4);
     EXPECT_FLOAT_EQ(threshold, a34);
@@ -257,6 +259,89 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
     EXPECT_DOUBLE_EQ(a66, computeAngularChecked(t(iv6), t(iv6)));
 }
 
+double computePrenormalizedAngularChecked(TypedCells a, TypedCells b) {
+    static PrenormalizedAngularDistanceFunctionFactory<float> flt_dff;
+    static PrenormalizedAngularDistanceFunctionFactory<double> dbl_dff;
+    auto d_n = dbl_dff.for_query_vector(a);
+    auto d_f = flt_dff.for_query_vector(a);
+    auto d_r = dbl_dff.for_query_vector(b);
+    auto d_i = dbl_dff.for_insertion_vector(a);
+    // normal:
+    double result = d_n->calc(b);
+     // insert is exactly same:
+    EXPECT_EQ(d_i->calc(b), result);
+    // note: for this distance, reverse is not necessarily equal,
+    // since we normalize based on length of LHS only
+    EXPECT_FLOAT_EQ(d_r->calc(a), result);
+    // float factory:
+    EXPECT_FLOAT_EQ(d_f->calc(b), result);
+    double closeness_n = d_n->to_rawscore(result);
+    double closeness_f = d_f->to_rawscore(result);
+    double closeness_r = d_r->to_rawscore(result);
+    double closeness_i = d_i->to_rawscore(result);
+    EXPECT_DOUBLE_EQ(closeness_n, closeness_f);
+    EXPECT_DOUBLE_EQ(closeness_n, closeness_r);
+    EXPECT_DOUBLE_EQ(closeness_n, closeness_i);
+    EXPECT_GT(closeness_n, 0.0);
+    EXPECT_LE(closeness_n, 1.0);
+    return result;
+}
+
+TEST(DistanceFunctionsTest, prenormalized_angular_gives_expected_score)
+{
+    std::vector<double> p0{0.0, 0.0, 0.0};
+    std::vector<double> p1{1.0, 0.0, 0.0};
+    std::vector<double> p2{0.0, 1.0, 0.0};
+    std::vector<double> p3{0.0, 0.0, 1.0};
+    std::vector<double> p4{0.5, 0.5, sq_root_half};
+    std::vector<double> p5{0.0,-1.0, 0.0};
+    std::vector<double> p6{1.0, 2.0, 2.0};
+    std::vector<double> p7{2.0, -1.0, -2.0};
+    std::vector<double> p8{3.0, 0.0, 0.0};
+
+    PrenormalizedAngularDistanceFunctionFactory<double> dff;
+    auto pnad = dff.for_query_vector(t(p0));
+
+    double i12 = computePrenormalizedAngularChecked(t(p1), t(p2));
+    double i13 = computePrenormalizedAngularChecked(t(p1), t(p3));
+    double i23 = computePrenormalizedAngularChecked(t(p2), t(p3));
+    EXPECT_DOUBLE_EQ(i12, 1.0);
+    EXPECT_DOUBLE_EQ(i13, 1.0);
+    EXPECT_DOUBLE_EQ(i23, 1.0);
+
+    double i14 = computePrenormalizedAngularChecked(t(p1), t(p4));
+    double i24 = computePrenormalizedAngularChecked(t(p2), t(p4));
+    EXPECT_DOUBLE_EQ(i14, 0.5);
+    EXPECT_DOUBLE_EQ(i24, 0.5);
+    double i34 = computePrenormalizedAngularChecked(t(p3), t(p4));
+    EXPECT_FLOAT_EQ(i34, 1.0 - sq_root_half);
+
+    double i25 = computePrenormalizedAngularChecked(t(p2), t(p5));
+    EXPECT_DOUBLE_EQ(i25, 2.0);
+
+    double i44 = computePrenormalizedAngularChecked(t(p4), t(p4));
+    EXPECT_GE(i44, 0.0);
+    EXPECT_LT(i44, 0.000001);
+
+    double i66 = computePrenormalizedAngularChecked(t(p6), t(p6));
+    EXPECT_GE(i66, 0.0);
+    EXPECT_LT(i66, 0.000001);
+
+    double i67 = computePrenormalizedAngularChecked(t(p6), t(p7));
+    EXPECT_DOUBLE_EQ(i67, 13.0);
+    double i68 = computePrenormalizedAngularChecked(t(p6), t(p8));
+    EXPECT_DOUBLE_EQ(i68, 6.0);
+    double i78 = computePrenormalizedAngularChecked(t(p7), t(p8));
+    EXPECT_DOUBLE_EQ(i78, 3.0);
+
+    double threshold = pnad->convert_threshold(0.25);
+    EXPECT_DOUBLE_EQ(threshold, 0.25);
+    threshold = pnad->convert_threshold(0.5);
+    EXPECT_DOUBLE_EQ(threshold, 0.5);
+    threshold = pnad->convert_threshold(1.0);
+    EXPECT_DOUBLE_EQ(threshold, 1.0);
+}
+
 TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
 {
     auto ct = vespalib::eval::CellType::DOUBLE;
@@ -267,7 +352,7 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
     std::vector<double> p1{1.0, 0.0, 0.0};
     std::vector<double> p2{0.0, 1.0, 0.0};
     std::vector<double> p3{0.0, 0.0, 1.0};
-    std::vector<double> p4{0.5, 0.5, 0.707107};
+    std::vector<double> p4{0.5, 0.5, sq_root_half};
     std::vector<double> p5{0.0,-1.0, 0.0};
     std::vector<double> p6{1.0, 2.0, 2.0};
 
@@ -283,7 +368,7 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
     EXPECT_DOUBLE_EQ(i14, 0.5);
     EXPECT_DOUBLE_EQ(i24, 0.5);
     double i34 = innerproduct->calc(t(p3), t(p4));
-    EXPECT_FLOAT_EQ(i34, 1.0 - 0.707107);
+    EXPECT_FLOAT_EQ(i34, 1.0 - sq_root_half);
 
     double i25 = innerproduct->calc(t(p2), t(p5));
     EXPECT_DOUBLE_EQ(i25, 2.0);
@@ -292,6 +377,10 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
     EXPECT_GE(i44, 0.0);
     EXPECT_LT(i44, 0.000001);
 
+    double i66 = innerproduct->calc(t(p6), t(p6));
+    EXPECT_GE(i66, 0.0);
+    EXPECT_LT(i66, 0.000001);
+
     double threshold = innerproduct->convert_threshold(0.25);
     EXPECT_DOUBLE_EQ(threshold, 0.25);
     threshold = innerproduct->convert_threshold(0.5);
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index 1783e0da1dd..2e874ffa4ae 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -30,6 +30,7 @@ vespa_add_library(searchlib_tensor OBJECT
     large_subspaces_buffer_type.cpp
     nearest_neighbor_index.cpp
     nearest_neighbor_index_saver.cpp
+    prenormalized_angular_distance.cpp
     serialized_fast_value_attribute.cpp
     serialized_tensor_ref.cpp
     small_subspaces_buffer_type.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
index 85eac76728c..efc1170baf5 100644
--- a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
@@ -66,15 +66,15 @@ public:
           _tmpSpace(lhs.size),
           _lhs(_tmpSpace.storeLhs(lhs))
     {
-        auto a = &_lhs[0];
+        auto a = _lhs.data();
         _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size);
     }
     double calc(const vespalib::eval::TypedCells& rhs) const override {
         size_t sz = _lhs.size();
         vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
         assert(sz == rhs_vector.size());
-        auto a = &_lhs[0];
-        auto b = &rhs_vector[0];
+        auto a = _lhs.data();
+        auto b = rhs_vector.data();
         double b_norm_sq = _computer.dotProduct(b, b, sz);
         double squared_norms = _lhs_norm_sq * b_norm_sq;
         double dot_product = _computer.dotProduct(a, b, sz);
diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h
index 97f692d05a2..bba83576153 100644
--- a/searchlib/src/vespa/searchlib/tensor/angular_distance.h
+++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.h
@@ -60,8 +60,8 @@ public:
         auto rhs_vector = rhs.typify<FloatType>();
         size_t sz = lhs_vector.size();
         assert(sz == rhs_vector.size());
-        auto a = &lhs_vector[0];
-        auto b = &rhs_vector[0];
+        auto a = lhs_vector.data();
+        auto b = rhs_vector.data();
         double a_norm_sq = _computer.dotProduct(a, a, sz);
         double b_norm_sq = _computer.dotProduct(b, b, sz);
         double squared_norms = a_norm_sq * b_norm_sq;
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index cca492ef212..4553f39a525 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -107,6 +107,20 @@ make_distance_function_factory(search::attribute::DistanceMetric variant,
         default:               return std::make_unique<EuclideanDistanceFunctionFactory<float>>();
         }
     }
+    if (variant == DistanceMetric::PrenormalizedAngular) {
+        if (cell_type == CellType::DOUBLE) {
+            return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<double>>();
+        }
+        return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<float>>();
+    }
+    /*
+    if (variant == DistanceMetric::GeoDegrees) {
+        return std::make_unique<GeoDistanceFunctionFactory>();
+    }
+    if (variant == DistanceMetric::Hamming) {
+        return std::make_unique<HammingDistanceFunctionFactory>();
+    }
+    */
     auto df = make_distance_function(variant, cell_type);
     return std::make_unique<SimpleDistanceFunctionFactory>(std::move(df));
 }
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
index b28cc2bda46..2300dba2db1 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
@@ -8,3 +8,4 @@
 #include "geo_degrees_distance.h"
 #include "hamming_distance.h"
 #include "inner_product_distance.h"
+#include "prenormalized_angular_distance.h"
diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
index 92d4e7af406..9c37b191637 100644
--- a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
@@ -71,8 +71,8 @@ public:
         size_t sz = _lhs_vector.size();
         vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
         assert(sz == rhs_vector.size());
-        auto a = &_lhs_vector[0];
-        auto b = &rhs_vector[0];
+        auto a = _lhs_vector.data();
+        auto b = rhs_vector.data();
         return _computer.squaredEuclideanDistance(cast(a), cast(b), sz);
     }
     double convert_threshold(double threshold) const override {
diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h
index 8ba14580885..135bb186fd4 100644
--- a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h
+++ b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h
@@ -54,7 +54,7 @@ public:
         auto rhs_vector = rhs.typify<FloatType>();
         size_t sz = lhs_vector.size();
         assert(sz == rhs_vector.size());
-        double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz);
+        double score = 1.0 - _computer.dotProduct(lhs_vector.data(), rhs_vector.data(), sz);
         return std::max(0.0, score);
     }
 private:
diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp
new file mode 100644
index 00000000000..d2693f9f443
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp
@@ -0,0 +1,82 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "prenormalized_angular_distance.h"
+#include "temporary_vector_store.h"
+
+using vespalib::typify_invoke;
+using vespalib::eval::TypifyCellType;
+
+namespace search::tensor {
+
+template<typename FloatType>
+class BoundPrenormalizedAngularDistance : public BoundDistanceFunction {
+private:
+    const vespalib::hwaccelrated::IAccelrated & _computer;
+    mutable TemporaryVectorStore<FloatType> _tmpSpace;
+    const vespalib::ConstArrayRef<FloatType> _lhs;
+    double _lhs_norm_sq;
+public:
+    BoundPrenormalizedAngularDistance(const vespalib::eval::TypedCells& lhs)
+        : BoundDistanceFunction(vespalib::eval::get_cell_type<FloatType>()),
+          _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()),
+          _tmpSpace(lhs.size),
+          _lhs(_tmpSpace.storeLhs(lhs))
+    {
+        auto a = _lhs.data();
+        _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size);
+        if (_lhs_norm_sq <= 0.0) {
+            _lhs_norm_sq = 1.0;
+        }
+    }
+    double calc(const vespalib::eval::TypedCells& rhs) const override {
+        size_t sz = _lhs.size();
+        vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
+        assert(sz == rhs_vector.size());
+        auto a = _lhs.data();
+        auto b = rhs_vector.data();
+        double dot_product = _computer.dotProduct(a, b, sz);
+        double distance = _lhs_norm_sq - dot_product;
+        return distance;
+    }
+    double convert_threshold(double threshold) const override {
+        double cosine_similarity = 1.0 - threshold;
+        double dot_product = cosine_similarity * _lhs_norm_sq;
+        double distance = _lhs_norm_sq - dot_product;
+        return distance;
+    }
+    double to_rawscore(double distance) const override {
+        double dot_product = _lhs_norm_sq - distance;
+        double cosine_similarity = dot_product / _lhs_norm_sq;
+        // should be in in range [-1,1] but roundoff may cause problems:
+        cosine_similarity = std::min(1.0, cosine_similarity);
+        cosine_similarity = std::max(-1.0, cosine_similarity);
+        double cosine_distance = 1.0 - cosine_similarity; // in range [0,2]
+        double score = 1.0 / (1.0 + cosine_distance);
+        return score;
+    }
+    double calc_with_limit(const vespalib::eval::TypedCells& rhs, double) const override {
+        return calc(rhs);
+    }
+};
+
+template class BoundPrenormalizedAngularDistance<float>;
+template class BoundPrenormalizedAngularDistance<double>;
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) {
+    using DFT = BoundPrenormalizedAngularDistance<FloatType>;
+    return std::make_unique<DFT>(lhs);
+}
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) {
+    using DFT = BoundPrenormalizedAngularDistance<FloatType>;
+    return std::make_unique<DFT>(lhs);
+}
+
+template class PrenormalizedAngularDistanceFunctionFactory<float>;
+template class PrenormalizedAngularDistanceFunctionFactory<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h
new file mode 100644
index 00000000000..88953a236e7
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h
@@ -0,0 +1,27 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include "bound_distance_function.h"
+#include "distance_function_factory.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+
+namespace search::tensor {
+
+/**
+ * Calculates inner-product "distance" between vectors with assumed norm 1.
+ * Should give same ordering as Angular distance, but is less expensive.
+ */
+template <typename FloatType>
+class PrenormalizedAngularDistanceFunctionFactory : public DistanceFunctionFactory {
+public:
+    PrenormalizedAngularDistanceFunctionFactory()
+        : DistanceFunctionFactory(vespalib::eval::get_cell_type<FloatType>())
+        {}
+    BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override;
+    BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override;
+};
+
+}
author	Arne H Juul <arnej27959@users.noreply.github.com>	2023-04-25 17:29:38 +0200
committer	GitHub <noreply@github.com>	2023-04-25 17:29:38 +0200
commit	1380d8ab1a17aab66c4ea76bfc956a91fcb54cb5 (patch)
tree	c3b2b159e40da8325d9b49fa8809532d165140f9 /searchlib
parent	21773bf29f0c2318a23661a106b5c29562468ffb (diff)
parent	d9a41679f37d93e3a98338fe01de0ba62d4acdbf (diff)