diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2023-04-29 09:38:36 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-04-29 09:38:36 +0200 |
commit | 592c38d1b1f85ebc2150354f23f3b64ca3be2159 (patch) | |
tree | 579e0afbf5444c05c4fb2e9fc0faae2ec3903fc2 | |
parent | 6be89d2c64d5f4b3938c581c68fb577a6b052e73 (diff) | |
parent | 22e933ddb54a825c646296e0153df3cba73fcfe9 (diff) |
Merge pull request #26906 from vespa-engine/arnej/add-max-inner-product-search-transform
Arnej/add max inner product search transform
8 files changed, 330 insertions, 2 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp index a1b29c90986..9d0b7259912 100644 --- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp +++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp @@ -4,6 +4,7 @@ #include <vespa/searchlib/common/geo_gcd.h> #include <vespa/searchlib/tensor/distance_functions.h> #include <vespa/searchlib/tensor/distance_function_factory.h> +#include <vespa/searchlib/tensor/mips_distance_transform.h> #include <vespa/vespalib/gtest/gtest.h> #include <vector> @@ -508,5 +509,166 @@ TEST(GeoDegreesTest, gives_expected_score) verify_geo_miles(g9_jfk, g9_jfk, 0); } + +double computeTransformedMipsChecked(TypedCells a, TypedCells b, bool check_insert = true) { + MipsDistanceFunctionFactory<float> flt_dff; + MipsDistanceFunctionFactory<double> dbl_dff; + + auto d_n = dbl_dff.for_query_vector(a); + auto d_f = flt_dff.for_query_vector(a); + auto d_r = dbl_dff.for_query_vector(b); + // normal: + double result = d_n->calc(b); + // reverse: + EXPECT_DOUBLE_EQ(d_r->calc(a), result); + // float factory: + EXPECT_FLOAT_EQ(d_f->calc(b), result); + double closeness_n = d_n->to_rawscore(result); + double closeness_f = d_f->to_rawscore(result); + double closeness_r = d_r->to_rawscore(result); + EXPECT_DOUBLE_EQ(closeness_n, closeness_f); + EXPECT_DOUBLE_EQ(closeness_n, closeness_r); + EXPECT_GT(closeness_n, 0.0); + EXPECT_LE(closeness_n, 1.0); + if (check_insert) { + auto d_i = dbl_dff.for_insertion_vector(a); + EXPECT_DOUBLE_EQ(d_i->calc(b), result); + } + return result; +} + +TEST(DistanceFunctionsTest, transformed_mips_basic_scores) +{ + std::vector<double> p1{1.0, 0.0, 0.0}; + std::vector<double> p2{0.0, 1.0, 0.0}; + std::vector<double> p3{0.0, 0.0, 1.0}; + std::vector<double> p4{0.5, 0.5, sq_root_half}; + std::vector<double> p5{0.0,-1.0, 0.0}; + + double i12 = computeTransformedMipsChecked(t(p1), t(p2)); + double i13 = computeTransformedMipsChecked(t(p1), t(p3)); + double i23 = computeTransformedMipsChecked(t(p2), t(p3)); + EXPECT_DOUBLE_EQ(i12, 0.0); + EXPECT_DOUBLE_EQ(i13, 0.0); + EXPECT_DOUBLE_EQ(i23, 0.0); + + double i14 = computeTransformedMipsChecked(t(p1), t(p4)); + double i24 = computeTransformedMipsChecked(t(p2), t(p4)); + EXPECT_DOUBLE_EQ(i14, -0.5); + EXPECT_DOUBLE_EQ(i24, -0.5); + + double i34 = computeTransformedMipsChecked(t(p3), t(p4)); + EXPECT_FLOAT_EQ(i34, -sq_root_half); + + double i25 = computeTransformedMipsChecked(t(p2), t(p5)); + EXPECT_DOUBLE_EQ(i25, 1.0); + + double i44 = computeTransformedMipsChecked(t(p4), t(p4)); + EXPECT_DOUBLE_EQ(i44, -1.0); + + std::vector<double> p6{ 0.0, 4.0, -4.0}; + std::vector<double> p7{-4.0, 0.0, 4.0}; + std::vector<double> p8{ 4.0, -4.0, 0.0}; + + double i66 = computeTransformedMipsChecked(t(p6), t(p6)); + EXPECT_DOUBLE_EQ(i66, -32.0); + + double i67 = computeTransformedMipsChecked(t(p6), t(p7)); + EXPECT_DOUBLE_EQ(i67, 16.0); + + double i68 = computeTransformedMipsChecked(t(p6), t(p8)); + EXPECT_DOUBLE_EQ(i68, 16.0); + + double i78 = computeTransformedMipsChecked(t(p7), t(p8)); + EXPECT_DOUBLE_EQ(i78, 16.0); +} + +TEST(DistanceFunctionsTest, transformed_mips_growing_norm) +{ + std::vector<double> p1{1.0, 0.0, 0.0}; + std::vector<double> p2{0.0, 1.0, 0.0}; + std::vector<double> p3{0.0, 0.0, 1.0}; + std::vector<double> p6{ 0.0, 4.0, -4.0}; + std::vector<double> p7{-4.0, 0.0, 4.0}; + std::vector<double> p8{ 4.0, -4.0, 0.0}; + + MipsDistanceFunctionFactory<double> dff; + auto f = dff.for_insertion_vector(t(p1)); + EXPECT_DOUBLE_EQ(-1.0, f->calc(t(p1))); + EXPECT_DOUBLE_EQ(0.0, f->calc(t(p2))); + EXPECT_DOUBLE_EQ(0.0, f->calc(t(p3))); + EXPECT_DOUBLE_EQ(0.0, f->calc(t(p6))); + EXPECT_DOUBLE_EQ(4.0, f->calc(t(p7))); + EXPECT_DOUBLE_EQ(-4.0, f->calc(t(p8))); + + // closeness + EXPECT_DOUBLE_EQ(0.25, f->to_rawscore(1.0)); + EXPECT_DOUBLE_EQ(0.50, f->to_rawscore(0.0)); + EXPECT_DOUBLE_EQ(0.75, f->to_rawscore(-1.0)); + + // now "insert" a bigger vector + f = dff.for_insertion_vector(t(p6)); + EXPECT_DOUBLE_EQ(0.0, f->calc(t(p1))); + EXPECT_DOUBLE_EQ(-4.0, f->calc(t(p2))); + EXPECT_DOUBLE_EQ(4.0, f->calc(t(p3))); + EXPECT_DOUBLE_EQ(-32.0, f->calc(t(p6))); + EXPECT_DOUBLE_EQ(16.0, f->calc(t(p7))); + EXPECT_DOUBLE_EQ(16.0, f->calc(t(p8))); + + // now max squared norm is 32, so p1 is "closer" to itself + f = dff.for_insertion_vector(t(p1)); + EXPECT_DOUBLE_EQ(-32.0, f->calc(t(p1))); + // closeness (rawscore) is also different: + EXPECT_DOUBLE_EQ(0.25, f->to_rawscore(32.0)); + EXPECT_DOUBLE_EQ(1/3., f->to_rawscore(16.0)); + EXPECT_DOUBLE_EQ(0.50, f->to_rawscore(0.0)); + EXPECT_DOUBLE_EQ(2/3., f->to_rawscore(-16.0)); + EXPECT_DOUBLE_EQ(0.75, f->to_rawscore(-32.0)); + + // also closer to other small vectors + EXPECT_DOUBLE_EQ(-31.0, f->calc(t(p2))); + EXPECT_DOUBLE_EQ(-31.0, f->calc(t(p3))); + std::vector<double> p9a{-5.0, 0.0, 0.0}; + // 32 - (-5)^2 = 32 - 25 = 7 + EXPECT_DOUBLE_EQ(5.0 - std::sqrt(31.0 * 7), f->calc(t(p9a))); + std::vector<double> p9b{-3.0, 4.0, 0.0}; + std::vector<double> p9c{0.0, -3.0, 4.0}; + std::vector<double> p9d{-4.0, 0.0, 3.0}; + EXPECT_DOUBLE_EQ(3.0 - std::sqrt(31.0 * 7), f->calc(t(p9b))); + EXPECT_DOUBLE_EQ(0.0 - std::sqrt(31.0 * 7), f->calc(t(p9c))); + EXPECT_DOUBLE_EQ(4.0 - std::sqrt(31.0 * 7), f->calc(t(p9d))); + + // but only for insert: + f = dff.for_query_vector(t(p1)); + EXPECT_DOUBLE_EQ(-1.0, f->calc(t(p1))); + + std::vector<double> big{-100, 100, -100}; + f = dff.for_insertion_vector(t(big)); + EXPECT_DOUBLE_EQ(100.0, f->calc(t(p1))); + + // much bigger numbers expected: + f = dff.for_insertion_vector(t(p1)); + EXPECT_DOUBLE_EQ(-30000.0, f->calc(t(p1))); + EXPECT_DOUBLE_EQ(-29999.0, f->calc(t(p2))); + EXPECT_DOUBLE_EQ(-29999.0, f->calc(t(p3))); + // all these have larger distance: + EXPECT_LT(-29999.0, f->calc(t(p6))); + EXPECT_LT(-29999.0, f->calc(t(p7))); + EXPECT_LT(-29999.0, f->calc(t(p8))); + EXPECT_LT(-29999.0, f->calc(t(p9a))); + EXPECT_LT(-29999.0, f->calc(t(p9b))); + EXPECT_LT(-29999.0, f->calc(t(p9c))); + EXPECT_LT(-29999.0, f->calc(t(p9d))); + // but not by much: + EXPECT_GT(-29900.0, f->calc(t(p6))); + EXPECT_GT(-29900.0, f->calc(t(p7))); + EXPECT_GT(-29900.0, f->calc(t(p8))); + EXPECT_GT(-29900.0, f->calc(t(p9a))); + EXPECT_GT(-29900.0, f->calc(t(p9b))); + EXPECT_GT(-29900.0, f->calc(t(p9c))); + EXPECT_GT(-29900.0, f->calc(t(p9d))); +} + + GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/vespa/searchcommon/attribute/distance_metric.h b/searchlib/src/vespa/searchcommon/attribute/distance_metric.h index 5b747d2016d..c157f6abb28 100644 --- a/searchlib/src/vespa/searchcommon/attribute/distance_metric.h +++ b/searchlib/src/vespa/searchcommon/attribute/distance_metric.h @@ -4,6 +4,6 @@ namespace search::attribute { -enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct, Hamming, PrenormalizedAngular }; +enum class DistanceMetric { Euclidean, Angular, GeoDegrees, InnerProduct, Hamming, PrenormalizedAngular, TransformedMips }; } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp index b9b3c63a0c3..0edab90f089 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp @@ -29,6 +29,7 @@ const vespalib::string angular = "angular"; const vespalib::string geodegrees = "geodegrees"; const vespalib::string innerproduct = "innerproduct"; const vespalib::string prenormalized_angular = "prenormalized_angular"; +const vespalib::string transformed_mips = "transformed_mips"; const vespalib::string hamming = "hamming"; const vespalib::string doc_id_limit_tag = "docIdLimit"; const vespalib::string enumerated_tag = "enumerated"; @@ -103,6 +104,7 @@ to_string(DistanceMetric metric) case DistanceMetric::InnerProduct: return innerproduct; case DistanceMetric::Hamming: return hamming; case DistanceMetric::PrenormalizedAngular: return prenormalized_angular; + case DistanceMetric::TransformedMips: return transformed_mips; } throw vespalib::IllegalArgumentException("Unknown distance metric " + std::to_string(static_cast<int>(metric))); } @@ -119,7 +121,9 @@ to_distance_metric(const vespalib::string& metric) } else if (metric == innerproduct) { return DistanceMetric::InnerProduct; } else if (metric == prenormalized_angular) { - return DistanceMetric::PrenormalizedAngular; + return DistanceMetric::PrenormalizedAngular; + } else if (metric == transformed_mips) { + return DistanceMetric::TransformedMips; } else if (metric == hamming) { return DistanceMetric::Hamming; } else { diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp index 31a0b2f2733..2119f441a14 100644 --- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp +++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp @@ -136,6 +136,10 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg) break; case CfgDm::PRENORMALIZED_ANGULAR: dm = DistanceMetric::PrenormalizedAngular; + /* + case CfgDm::TRANSFORMED_MIPS: + dm = DistanceMetric::TransformedMips; + */ break; } retval.set_distance_metric(dm); diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 362bceaf0da..24c8db4863e 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -2,6 +2,7 @@ vespa_add_library(searchlib_tensor OBJECT SOURCES angular_distance.cpp + mips_distance_transform.cpp bitvector_visited_tracker.cpp bound_distance_function.cpp default_nearest_neighbor_index_factory.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index bec0f430a51..a338bf85e43 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -2,6 +2,7 @@ #include "distance_function_factory.h" #include "distance_functions.h" +#include "mips_distance_transform.h" #include <vespa/vespalib/util/typify.h> #include <vespa/vespalib/util/array.h> #include <vespa/vespalib/util/arrayref.h> @@ -38,6 +39,12 @@ make_distance_function_factory(search::attribute::DistanceMetric variant, case CellType::DOUBLE: return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<double>>(); default: return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<float>>(); } + case DistanceMetric::TransformedMips: + switch (cell_type) { + case CellType::DOUBLE: return std::make_unique<MipsDistanceFunctionFactory<double>>(); + case CellType::INT8: return std::make_unique<MipsDistanceFunctionFactory<vespalib::eval::Int8Float>>(); + default: return std::make_unique<MipsDistanceFunctionFactory<float>>(); + } case DistanceMetric::GeoDegrees: return std::make_unique<GeoDistanceFunctionFactory>(); case DistanceMetric::Hamming: diff --git a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp new file mode 100644 index 00000000000..1e238aaacc7 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp @@ -0,0 +1,92 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "mips_distance_transform.h" +#include "temporary_vector_store.h" +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> +#include <cmath> +#include <mutex> +#include <variant> + +using vespalib::eval::Int8Float; + +namespace search::tensor { + +template<typename FloatType, bool extra_dim> +class BoundMipsDistanceFunction : public BoundDistanceFunction { + mutable TemporaryVectorStore<FloatType> _tmpSpace; + const vespalib::ConstArrayRef<FloatType> _lhs_vector; + const vespalib::hwaccelrated::IAccelrated & _computer; + double _max_sq_norm; + using ExtraDimT = std::conditional<extra_dim,double,std::monostate>::type; + [[no_unique_address]] ExtraDimT _lhs_extra_dim; + + static const double *cast(const double * p) { return p; } + static const float *cast(const float * p) { return p; } + static const int8_t *cast(const Int8Float * p) { return reinterpret_cast<const int8_t *>(p); } +public: + BoundMipsDistanceFunction(const vespalib::eval::TypedCells& lhs, MaximumSquaredNormStore& sq_norm_store) + : BoundDistanceFunction(), + _tmpSpace(lhs.size), + _lhs_vector(_tmpSpace.storeLhs(lhs)), + _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + { + const FloatType * a = _lhs_vector.data(); + if constexpr (extra_dim) { + double lhs_sq_norm = _computer.dotProduct(cast(a), cast(a), lhs.size); + _max_sq_norm = sq_norm_store.get_max(lhs_sq_norm); + _lhs_extra_dim = std::sqrt(_max_sq_norm - lhs_sq_norm); + } else { + _max_sq_norm = sq_norm_store.get_max(); + } + } + + double get_extra_dim_value() requires extra_dim { + return _lhs_extra_dim; + } + + double calc(const vespalib::eval::TypedCells &rhs) const override { + vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs); + const FloatType * a = _lhs_vector.data(); + const FloatType * b = rhs_vector.data(); + double dp = _computer.dotProduct(cast(a), cast(b), rhs.size); + if constexpr (extra_dim) { + double rhs_sq_norm = _computer.dotProduct(cast(b), cast(b), rhs.size); + // avoid sqrt(negative) for robustness: + double diff = std::max(0.0, _max_sq_norm - rhs_sq_norm); + double rhs_extra_dim = std::sqrt(diff); + dp += _lhs_extra_dim * rhs_extra_dim; + } + return -dp; + } + double convert_threshold(double threshold) const override { + return threshold; + } + double to_rawscore(double distance) const override { + double dp = -distance; + double t1 = dp / _max_sq_norm; + double t2 = t1 / (1.0 + std::fabs(t1)); + double r = (t2 + 1.0) * 0.5; + return r; + } + double calc_with_limit(const vespalib::eval::TypedCells& rhs, double) const override { + return calc(rhs); + } +}; + +template<typename FloatType> +BoundDistanceFunction::UP +MipsDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) { + return std::make_unique<BoundMipsDistanceFunction<FloatType, false>>(lhs, *_sq_norm_store); +} + +template<typename FloatType> +BoundDistanceFunction::UP +MipsDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) { + return std::make_unique<BoundMipsDistanceFunction<FloatType, true>>(lhs, *_sq_norm_store); +}; + +template class MipsDistanceFunctionFactory<Int8Float>; +template class MipsDistanceFunctionFactory<float>; +template class MipsDistanceFunctionFactory<double>; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h new file mode 100644 index 00000000000..fabd6bfcc57 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h @@ -0,0 +1,58 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include "distance_function_factory.h" +#include <vespa/eval/eval/typed_cells.h> +#include <mutex> +#include <memory> + +namespace search::tensor { + +/** + * Thread-safe storage of maximum value for squared vector norm. + * sq_norm = |x|^2 = sum(x[i]*x[i]) = dotproduct(x,x) + * Note that the initial value is 1.0; so even if all + * vectors seen have 0 or very small length, you will never + * get a value < 1.0. + */ +class MaximumSquaredNormStore { +private: + std::mutex _lock; + double _max_sq_norm; +public: + MaximumSquaredNormStore() noexcept : _lock(), _max_sq_norm(1.0) {} + /** + * Fetch the maximum value seen so far. + * Usually you will also supply a value computed for a newly seen + * vector, which may update the maximum value. + */ + double get_max(double value = 0.0) { + std::lock_guard<std::mutex> guard(_lock); + if (value > _max_sq_norm) [[unlikely]] { + _max_sq_norm = value; + } + return _max_sq_norm; + } +}; + +/** + * Factory for distance functions which can apply a transformation + * mapping Maximum Inner Product Search to a nearest neighbor + * problem. When inserting vectors, an extra dimension is + * added ensuring behavior "as if" all vectors had length equal + * to the longest vector inserted so far, or at least length 1. + */ +template<typename FloatType> +class MipsDistanceFunctionFactory : public DistanceFunctionFactory { + std::shared_ptr<MaximumSquaredNormStore> _sq_norm_store; +public: + MipsDistanceFunctionFactory() : _sq_norm_store(std::make_shared<MaximumSquaredNormStore>()) {} + + BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override; + + BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override; +}; + +} |