Merge pull request #26906 from vespa-engine/arnej/add-max-inner-product-search-transform

Arnej/add max inner product search transform
author: Arne H Juul <arnej27959@users.noreply.github.com> 2023-04-29 09:38:36 +0200
committer: GitHub <noreply@github.com> 2023-04-29 09:38:36 +0200
commit: 592c38d1b1f85ebc2150354f23f3b64ca3be2159 (patch)
tree: 579e0afbf5444c05c4fb2e9fc0faae2ec3903fc2
parent: 6be89d2c64d5f4b3938c581c68fb577a6b052e73 (diff)
parent: 22e933ddb54a825c646296e0153df3cba73fcfe9 (diff)
8 files changed, 330 insertions, 2 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index a1b29c90986..9d0b7259912 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -4,6 +4,7 @@
 #include <vespa/searchlib/common/geo_gcd.h>
 #include <vespa/searchlib/tensor/distance_functions.h>
 #include <vespa/searchlib/tensor/distance_function_factory.h>
+#include <vespa/searchlib/tensor/mips_distance_transform.h>
 #include <vespa/vespalib/gtest/gtest.h>
 #include <vector>
 
@@ -508,5 +509,166 @@ TEST(GeoDegreesTest, gives_expected_score)
     verify_geo_miles(g9_jfk, g9_jfk, 0);
 }
 
+
+double computeTransformedMipsChecked(TypedCells a, TypedCells b, bool check_insert = true) {
+    MipsDistanceFunctionFactory<float> flt_dff;
+    MipsDistanceFunctionFactory<double> dbl_dff;
+
+    auto d_n = dbl_dff.for_query_vector(a);
+    auto d_f = flt_dff.for_query_vector(a);
+    auto d_r = dbl_dff.for_query_vector(b);
+    // normal:
+    double result = d_n->calc(b);
+    // reverse:
+    EXPECT_DOUBLE_EQ(d_r->calc(a), result);
+    // float factory:
+    EXPECT_FLOAT_EQ(d_f->calc(b), result);
+    double closeness_n = d_n->to_rawscore(result);
+    double closeness_f = d_f->to_rawscore(result);
+    double closeness_r = d_r->to_rawscore(result);
+    EXPECT_DOUBLE_EQ(closeness_n, closeness_f);
+    EXPECT_DOUBLE_EQ(closeness_n, closeness_r);
+    EXPECT_GT(closeness_n, 0.0);
+    EXPECT_LE(closeness_n, 1.0);
+    if (check_insert) {
+        auto d_i = dbl_dff.for_insertion_vector(a);
+        EXPECT_DOUBLE_EQ(d_i->calc(b), result);
+    }
+    return result;
+}
+
+TEST(DistanceFunctionsTest, transformed_mips_basic_scores)
+{
+    std::vector<double> p1{1.0, 0.0, 0.0};
+    std::vector<double> p2{0.0, 1.0, 0.0};
+    std::vector<double> p3{0.0, 0.0, 1.0};
+    std::vector<double> p4{0.5, 0.5, sq_root_half};
+    std::vector<double> p5{0.0,-1.0, 0.0};
+
+    double i12 = computeTransformedMipsChecked(t(p1), t(p2));
+    double i13 = computeTransformedMipsChecked(t(p1), t(p3));
+    double i23 = computeTransformedMipsChecked(t(p2), t(p3));
+    EXPECT_DOUBLE_EQ(i12, 0.0);
+    EXPECT_DOUBLE_EQ(i13, 0.0);
+    EXPECT_DOUBLE_EQ(i23, 0.0);
+
+    double i14 = computeTransformedMipsChecked(t(p1), t(p4));
+    double i24 = computeTransformedMipsChecked(t(p2), t(p4));
+    EXPECT_DOUBLE_EQ(i14, -0.5);
+    EXPECT_DOUBLE_EQ(i24, -0.5);
+
+    double i34 = computeTransformedMipsChecked(t(p3), t(p4));
+    EXPECT_FLOAT_EQ(i34, -sq_root_half);
+
+    double i25 = computeTransformedMipsChecked(t(p2), t(p5));
+    EXPECT_DOUBLE_EQ(i25, 1.0);
+
+    double i44 = computeTransformedMipsChecked(t(p4), t(p4));
+    EXPECT_DOUBLE_EQ(i44, -1.0);
+
+    std::vector<double> p6{ 0.0,  4.0, -4.0};
+    std::vector<double> p7{-4.0,  0.0,  4.0};
+    std::vector<double> p8{ 4.0, -4.0,  0.0};
+
+    double i66 = computeTransformedMipsChecked(t(p6), t(p6));
+    EXPECT_DOUBLE_EQ(i66, -32.0);
+
+    double i67 = computeTransformedMipsChecked(t(p6), t(p7));
+    EXPECT_DOUBLE_EQ(i67, 16.0);
+
+    double i68 = computeTransformedMipsChecked(t(p6), t(p8));
+    EXPECT_DOUBLE_EQ(i68, 16.0);
+
+    double i78 = computeTransformedMipsChecked(t(p7), t(p8));
+    EXPECT_DOUBLE_EQ(i78, 16.0);
+}
+
+TEST(DistanceFunctionsTest, transformed_mips_growing_norm)
+{
+    std::vector<double> p1{1.0, 0.0, 0.0};
+    std::vector<double> p2{0.0, 1.0, 0.0};
+    std::vector<double> p3{0.0, 0.0, 1.0};
+    std::vector<double> p6{ 0.0,  4.0, -4.0};
+    std::vector<double> p7{-4.0,  0.0,  4.0};
+    std::vector<double> p8{ 4.0, -4.0,  0.0};
+
+    MipsDistanceFunctionFactory<double> dff;
+    auto f = dff.for_insertion_vector(t(p1));
+    EXPECT_DOUBLE_EQ(-1.0, f->calc(t(p1)));
+    EXPECT_DOUBLE_EQ(0.0, f->calc(t(p2)));
+    EXPECT_DOUBLE_EQ(0.0, f->calc(t(p3)));
+    EXPECT_DOUBLE_EQ(0.0, f->calc(t(p6)));
+    EXPECT_DOUBLE_EQ(4.0, f->calc(t(p7)));
+    EXPECT_DOUBLE_EQ(-4.0, f->calc(t(p8)));
+
+    // closeness
+    EXPECT_DOUBLE_EQ(0.25, f->to_rawscore(1.0));
+    EXPECT_DOUBLE_EQ(0.50, f->to_rawscore(0.0));
+    EXPECT_DOUBLE_EQ(0.75, f->to_rawscore(-1.0));
+
+    // now "insert" a bigger vector
+    f = dff.for_insertion_vector(t(p6));
+    EXPECT_DOUBLE_EQ(0.0, f->calc(t(p1)));
+    EXPECT_DOUBLE_EQ(-4.0, f->calc(t(p2)));
+    EXPECT_DOUBLE_EQ(4.0, f->calc(t(p3)));
+    EXPECT_DOUBLE_EQ(-32.0, f->calc(t(p6)));
+    EXPECT_DOUBLE_EQ(16.0, f->calc(t(p7)));
+    EXPECT_DOUBLE_EQ(16.0, f->calc(t(p8)));
+
+    // now max squared norm is 32, so p1 is "closer" to itself
+    f = dff.for_insertion_vector(t(p1));
+    EXPECT_DOUBLE_EQ(-32.0, f->calc(t(p1)));
+    // closeness (rawscore) is also different:
+    EXPECT_DOUBLE_EQ(0.25, f->to_rawscore(32.0));
+    EXPECT_DOUBLE_EQ(1/3., f->to_rawscore(16.0));
+    EXPECT_DOUBLE_EQ(0.50, f->to_rawscore(0.0));
+    EXPECT_DOUBLE_EQ(2/3., f->to_rawscore(-16.0));
+    EXPECT_DOUBLE_EQ(0.75, f->to_rawscore(-32.0));
+
+    // also closer to other small vectors
+    EXPECT_DOUBLE_EQ(-31.0, f->calc(t(p2)));
+    EXPECT_DOUBLE_EQ(-31.0, f->calc(t(p3)));
+    std::vector<double> p9a{-5.0, 0.0, 0.0};
+    // 32 - (-5)^2 = 32 - 25 = 7
+    EXPECT_DOUBLE_EQ(5.0 - std::sqrt(31.0 * 7), f->calc(t(p9a)));
+    std::vector<double> p9b{-3.0, 4.0, 0.0};
+    std::vector<double> p9c{0.0, -3.0, 4.0};
+    std::vector<double> p9d{-4.0, 0.0, 3.0};
+    EXPECT_DOUBLE_EQ(3.0 - std::sqrt(31.0 * 7), f->calc(t(p9b)));
+    EXPECT_DOUBLE_EQ(0.0 - std::sqrt(31.0 * 7), f->calc(t(p9c)));
+    EXPECT_DOUBLE_EQ(4.0 - std::sqrt(31.0 * 7), f->calc(t(p9d)));
+
+    // but only for insert:
+    f = dff.for_query_vector(t(p1));
+    EXPECT_DOUBLE_EQ(-1.0, f->calc(t(p1)));
+
+    std::vector<double> big{-100, 100, -100};
+    f = dff.for_insertion_vector(t(big));
+    EXPECT_DOUBLE_EQ(100.0, f->calc(t(p1)));
+
+    // much bigger numbers expected:
+    f = dff.for_insertion_vector(t(p1));
+    EXPECT_DOUBLE_EQ(-30000.0, f->calc(t(p1)));
+    EXPECT_DOUBLE_EQ(-29999.0, f->calc(t(p2)));
+    EXPECT_DOUBLE_EQ(-29999.0, f->calc(t(p3)));
+    // all these have larger distance:
+    EXPECT_LT(-29999.0, f->calc(t(p6)));
+    EXPECT_LT(-29999.0, f->calc(t(p7)));
+    EXPECT_LT(-29999.0, f->calc(t(p8)));
+    EXPECT_LT(-29999.0, f->calc(t(p9a)));
+    EXPECT_LT(-29999.0, f->calc(t(p9b)));
+    EXPECT_LT(-29999.0, f->calc(t(p9c)));
+    EXPECT_LT(-29999.0, f->calc(t(p9d)));
+    // but not by much:
+    EXPECT_GT(-29900.0, f->calc(t(p6)));
+    EXPECT_GT(-29900.0, f->calc(t(p7)));
+    EXPECT_GT(-29900.0, f->calc(t(p8)));
+    EXPECT_GT(-29900.0, f->calc(t(p9a)));
+    EXPECT_GT(-29900.0, f->calc(t(p9b)));
+    EXPECT_GT(-29900.0, f->calc(t(p9c)));
+    EXPECT_GT(-29900.0, f->calc(t(p9d)));
+}
+
+
 GTEST_MAIN_RUN_ALL_TESTS()
 
diff --git a/searchlib/src/vespa/searchcommon/attribute/distance_metric.h b/searchlib/src/vespa/searchcommon/attribute/distance_metric.h
index 5b747d2016d..c157f6abb28 100644
--- a/searchlib/src/vespa/searchcommon/attribute/distance_metric.h
+++ b/searchlib/src/vespa/searchcommon/attribute/distance_metric.h
@@ -4,6 +4,6 @@
 
 namespace search::attribute {
 
-enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct, Hamming, PrenormalizedAngular };
+enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct, Hamming, PrenormalizedAngular, TransformedMips };
 
 }
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
index b9b3c63a0c3..0edab90f089 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
@@ -29,6 +29,7 @@ const vespalib::string angular = "angular";
 const vespalib::string geodegrees = "geodegrees";
 const vespalib::string innerproduct = "innerproduct";
 const vespalib::string prenormalized_angular = "prenormalized_angular";
+const vespalib::string transformed_mips = "transformed_mips";
 const vespalib::string hamming = "hamming";
 const vespalib::string doc_id_limit_tag = "docIdLimit";
 const vespalib::string enumerated_tag = "enumerated";
@@ -103,6 +104,7 @@ to_string(DistanceMetric metric)
         case DistanceMetric::InnerProduct: return innerproduct;
         case DistanceMetric::Hamming: return hamming;
         case DistanceMetric::PrenormalizedAngular: return prenormalized_angular;
+        case DistanceMetric::TransformedMips: return transformed_mips;
     }
     throw vespalib::IllegalArgumentException("Unknown distance metric " + std::to_string(static_cast<int>(metric)));
 }
@@ -119,7 +121,9 @@ to_distance_metric(const vespalib::string& metric)
     } else if (metric == innerproduct) {
         return DistanceMetric::InnerProduct;
     } else if (metric == prenormalized_angular) {
-      return DistanceMetric::PrenormalizedAngular;
+        return DistanceMetric::PrenormalizedAngular;
+    } else if (metric == transformed_mips) {
+        return DistanceMetric::TransformedMips;
     } else if (metric == hamming) {
         return DistanceMetric::Hamming;
     } else {
diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
index 31a0b2f2733..2119f441a14 100644
--- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
@@ -136,6 +136,10 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg)
             break;
         case CfgDm::PRENORMALIZED_ANGULAR:
             dm = DistanceMetric::PrenormalizedAngular;
+            /*
+        case CfgDm::TRANSFORMED_MIPS:
+            dm = DistanceMetric::TransformedMips;
+            */
             break;
     }
     retval.set_distance_metric(dm);
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index 362bceaf0da..24c8db4863e 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -2,6 +2,7 @@
 vespa_add_library(searchlib_tensor OBJECT
     SOURCES
     angular_distance.cpp
+    mips_distance_transform.cpp
     bitvector_visited_tracker.cpp
     bound_distance_function.cpp
     default_nearest_neighbor_index_factory.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index bec0f430a51..a338bf85e43 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -2,6 +2,7 @@
 
 #include "distance_function_factory.h"
 #include "distance_functions.h"
+#include "mips_distance_transform.h"
 #include <vespa/vespalib/util/typify.h>
 #include <vespa/vespalib/util/array.h>
 #include <vespa/vespalib/util/arrayref.h>
@@ -38,6 +39,12 @@ make_distance_function_factory(search::attribute::DistanceMetric variant,
                 case CellType::DOUBLE: return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<double>>();
                 default:               return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<float>>();
             }
+        case DistanceMetric::TransformedMips:
+            switch (cell_type) {
+                case CellType::DOUBLE: return std::make_unique<MipsDistanceFunctionFactory<double>>();
+                case CellType::INT8:   return std::make_unique<MipsDistanceFunctionFactory<vespalib::eval::Int8Float>>();
+                default:               return std::make_unique<MipsDistanceFunctionFactory<float>>();
+            }
         case DistanceMetric::GeoDegrees:
             return std::make_unique<GeoDistanceFunctionFactory>();
         case DistanceMetric::Hamming:
diff --git a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp
new file mode 100644
index 00000000000..1e238aaacc7
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp
@@ -0,0 +1,92 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "mips_distance_transform.h"
+#include "temporary_vector_store.h"
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+#include <cmath>
+#include <mutex>
+#include <variant>
+
+using vespalib::eval::Int8Float;
+
+namespace search::tensor {
+
+template<typename FloatType, bool extra_dim>
+class BoundMipsDistanceFunction : public BoundDistanceFunction {
+    mutable TemporaryVectorStore<FloatType> _tmpSpace;
+    const vespalib::ConstArrayRef<FloatType> _lhs_vector;
+    const vespalib::hwaccelrated::IAccelrated & _computer;
+    double _max_sq_norm;
+    using ExtraDimT = std::conditional<extra_dim,double,std::monostate>::type;
+    [[no_unique_address]] ExtraDimT _lhs_extra_dim;
+
+    static const double *cast(const double * p) { return p; }
+    static const float *cast(const float * p) { return p; }
+    static const int8_t *cast(const Int8Float * p) { return reinterpret_cast<const int8_t *>(p); }
+public:
+    BoundMipsDistanceFunction(const vespalib::eval::TypedCells& lhs, MaximumSquaredNormStore& sq_norm_store)
+        : BoundDistanceFunction(),
+          _tmpSpace(lhs.size),
+          _lhs_vector(_tmpSpace.storeLhs(lhs)),
+          _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator())
+    {
+        const FloatType * a = _lhs_vector.data();
+        if constexpr (extra_dim) {
+            double lhs_sq_norm = _computer.dotProduct(cast(a), cast(a), lhs.size);
+            _max_sq_norm = sq_norm_store.get_max(lhs_sq_norm);
+            _lhs_extra_dim = std::sqrt(_max_sq_norm - lhs_sq_norm);
+        } else {
+            _max_sq_norm = sq_norm_store.get_max();
+        }
+    }
+
+    double get_extra_dim_value() requires extra_dim {
+        return _lhs_extra_dim;
+    }
+
+    double calc(const vespalib::eval::TypedCells &rhs) const override {
+        vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
+        const FloatType * a = _lhs_vector.data();
+        const FloatType * b = rhs_vector.data();
+        double dp = _computer.dotProduct(cast(a), cast(b), rhs.size);
+        if constexpr (extra_dim) {
+            double rhs_sq_norm = _computer.dotProduct(cast(b), cast(b), rhs.size);
+	    // avoid sqrt(negative) for robustness:
+            double diff = std::max(0.0, _max_sq_norm - rhs_sq_norm);
+            double rhs_extra_dim = std::sqrt(diff);
+            dp += _lhs_extra_dim * rhs_extra_dim;
+        }
+        return -dp;
+    }
+    double convert_threshold(double threshold) const override {
+        return threshold;
+    }
+    double to_rawscore(double distance) const override {
+        double dp = -distance;
+        double t1 = dp / _max_sq_norm;
+        double t2 = t1 / (1.0 + std::fabs(t1));
+        double r = (t2 + 1.0) * 0.5;
+        return r;
+    }
+    double calc_with_limit(const vespalib::eval::TypedCells& rhs, double) const override {
+        return calc(rhs);
+    }
+};
+
+template<typename FloatType>
+BoundDistanceFunction::UP
+MipsDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) {
+    return std::make_unique<BoundMipsDistanceFunction<FloatType, false>>(lhs, *_sq_norm_store);
+}
+
+template<typename FloatType>
+BoundDistanceFunction::UP
+MipsDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) {
+    return std::make_unique<BoundMipsDistanceFunction<FloatType, true>>(lhs, *_sq_norm_store);
+};
+
+template class MipsDistanceFunctionFactory<Int8Float>;
+template class MipsDistanceFunctionFactory<float>;
+template class MipsDistanceFunctionFactory<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h
new file mode 100644
index 00000000000..fabd6bfcc57
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h
@@ -0,0 +1,58 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include "distance_function_factory.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <mutex>
+#include <memory>
+
+namespace search::tensor {
+
+/**
+ * Thread-safe storage of maximum value for squared vector norm.
+ * sq_norm = |x|^2 = sum(x[i]*x[i]) = dotproduct(x,x)
+ * Note that the initial value is 1.0; so even if all
+ * vectors seen have 0 or very small length, you will never
+ * get a value < 1.0.
+ */
+class MaximumSquaredNormStore {
+private:
+    std::mutex _lock;
+    double _max_sq_norm;
+public:
+    MaximumSquaredNormStore() noexcept : _lock(), _max_sq_norm(1.0) {}
+    /**
+     * Fetch the maximum value seen so far.
+     * Usually you will also supply a value computed for a newly seen
+     * vector, which may update the maximum value.
+     */
+    double get_max(double value = 0.0) {
+        std::lock_guard<std::mutex> guard(_lock);
+        if (value > _max_sq_norm) [[unlikely]] {
+            _max_sq_norm = value;
+        }
+        return _max_sq_norm;
+    }
+};
+
+/**
+ * Factory for distance functions which can apply a transformation
+ * mapping Maximum Inner Product Search to a nearest neighbor
+ * problem.  When inserting vectors, an extra dimension is
+ * added ensuring behavior "as if" all vectors had length equal
+ * to the longest vector inserted so far, or at least length 1.
+ */
+template<typename FloatType>
+class MipsDistanceFunctionFactory : public DistanceFunctionFactory {
+    std::shared_ptr<MaximumSquaredNormStore> _sq_norm_store;
+public:
+    MipsDistanceFunctionFactory() : _sq_norm_store(std::make_shared<MaximumSquaredNormStore>()) {}
+
+    BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override;
+
+    BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override;
+};
+
+}
author	Arne H Juul <arnej27959@users.noreply.github.com>	2023-04-29 09:38:36 +0200
committer	GitHub <noreply@github.com>	2023-04-29 09:38:36 +0200
commit	592c38d1b1f85ebc2150354f23f3b64ca3be2159 (patch)
tree	579e0afbf5444c05c4fb2e9fc0faae2ec3903fc2
parent	6be89d2c64d5f4b3938c581c68fb577a6b052e73 (diff)
parent	22e933ddb54a825c646296e0153df3cba73fcfe9 (diff)