also add BoundEuclideanDistance

author: Arne Juul <arnej@yahooinc.com> 2023-04-20 13:26:02 +0000
committer: Arne Juul <arnej@yahooinc.com> 2023-04-24 07:58:38 +0000
commit: 06ae8f17e040fe27e19dd7bdf4857ce0c4ccaba1 (patch)
tree: 95635dda48f7f188f87e4b62ee2852a7cfb652c3
parent: bc7e1b8c1a96a49838e55a7b97c4de565abc9ad3 (diff)
6 files changed, 163 insertions, 11 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index 86b83b2c651..ae283f3f2b2 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -44,6 +44,30 @@ void verify_geo_miles(const DistanceFunction *dist_fun,
     }
 }
 
+double computeEuclideanChecked(TypedCells a, TypedCells b) {
+    static EuclideanDistanceFunctionFactory<Int8Float> i8f_dff;
+    static EuclideanDistanceFunctionFactory<float> flt_dff;
+    static EuclideanDistanceFunctionFactory<double> dbl_dff;
+    auto d_n = dbl_dff.for_query_vector(a);
+    auto d_f = flt_dff.for_query_vector(a);
+    auto d_r = dbl_dff.for_query_vector(b);
+    auto d_i = dbl_dff.for_insertion_vector(a);
+    // normal:
+    double result = d_n->calc(b);
+     // insert is exactly same:
+    EXPECT_EQ(d_i->calc(b), result);
+    // reverse:
+    EXPECT_DOUBLE_EQ(d_r->calc(a), result);
+    // float factory:
+    EXPECT_FLOAT_EQ(d_f->calc(b), result);
+    if (a.type == vespalib::eval::CellType::INT8 ||
+        b.type == vespalib::eval::CellType::INT8)
+    {
+        auto d_8 = i8f_dff.for_query_vector(a);
+        EXPECT_DOUBLE_EQ(d_8->calc(b), result);
+    }
+    return result;
+}
 
 TEST(DistanceFunctionsTest, euclidean_gives_expected_score)
 {
@@ -59,15 +83,56 @@ TEST(DistanceFunctionsTest, euclidean_gives_expected_score)
     std::vector<double> p5{0.0,-1.0, 0.0};
     std::vector<double> p6{1.0, 2.0, 2.0};
 
-    double n4 = euclid->calc(t(p0), t(p4));
+    double n4 = computeEuclideanChecked(t(p0), t(p4));
     EXPECT_FLOAT_EQ(n4, 1.0);
-    double d12 = euclid->calc(t(p1), t(p2));
+    double d12 = computeEuclideanChecked(t(p1), t(p2));
     EXPECT_EQ(d12, 2.0);
     EXPECT_DOUBLE_EQ(euclid->to_rawscore(d12), 1.0/(1.0 + sqrt(2.0)));
     double threshold = euclid->convert_threshold(8.0);
     EXPECT_EQ(threshold, 64.0);
     threshold = euclid->convert_threshold(0.5);
     EXPECT_EQ(threshold, 0.25);
+
+    // simple hand-checked distances:
+    EXPECT_EQ(computeEuclideanChecked(t(p0), t(p0)), 0.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p0), t(p1)), 1.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p0), t(p2)), 1.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p0), t(p3)), 1.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p0), t(p5)), 1.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p0), t(p6)), 9.0);
+
+    EXPECT_EQ(computeEuclideanChecked(t(p1), t(p1)), 0.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p1), t(p2)), 2.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p1), t(p3)), 2.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p1), t(p5)), 2.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p1), t(p6)), 8.0);
+
+    EXPECT_EQ(computeEuclideanChecked(t(p2), t(p2)), 0.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p2), t(p3)), 2.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p2), t(p5)), 4.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p2), t(p6)), 6.0);
+
+    EXPECT_EQ(computeEuclideanChecked(t(p3), t(p3)), 0.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p3), t(p5)), 2.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p3), t(p6)), 6.0);
+
+    EXPECT_EQ(computeEuclideanChecked(t(p5), t(p5)), 0.0);
+    EXPECT_EQ(computeEuclideanChecked(t(p5), t(p6)), 14.0);
+
+    EXPECT_EQ(computeEuclideanChecked(t(p6), t(p6)), 0.0);
+
+    // smoke test for bfloat16:
+    std::vector<vespalib::BFloat16> bf16v;
+    bf16v.emplace_back(1.0);
+    bf16v.emplace_back(1.0);
+    bf16v.emplace_back(1.0);
+    EXPECT_EQ(computeEuclideanChecked(t(bf16v), t(p0)), 3.0);
+    EXPECT_EQ(computeEuclideanChecked(t(bf16v), t(p1)), 2.0);
+    EXPECT_EQ(computeEuclideanChecked(t(bf16v), t(p2)), 2.0);
+    EXPECT_EQ(computeEuclideanChecked(t(bf16v), t(p3)), 2.0);
+    EXPECT_FLOAT_EQ(computeEuclideanChecked(t(bf16v), t(p4)), 0.5857863);
+    EXPECT_EQ(computeEuclideanChecked(t(bf16v), t(p5)), 6.0);
+    EXPECT_EQ(computeEuclideanChecked(t(bf16v), t(p6)), 2.0);
 }
 
 TEST(DistanceFunctionsTest, euclidean_int8_smoketest)
@@ -81,14 +146,13 @@ TEST(DistanceFunctionsTest, euclidean_int8_smoketest)
     std::vector<Int8Float> p5{0.0,-1.0, 0.0};
     std::vector<Int8Float> p7{-1.0, 2.0, -2.0};
 
-    EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p0), t(p1)));
-    EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p0), t(p5)));
-    EXPECT_DOUBLE_EQ(9.0, euclid->calc(t(p0), t(p7)));
-
-    EXPECT_DOUBLE_EQ(2.0, euclid->calc(t(p1), t(p5)));
-    EXPECT_DOUBLE_EQ(12.0, euclid->calc(t(p1), t(p7)));
-    EXPECT_DOUBLE_EQ(14.0, euclid->calc(t(p5), t(p7)));
+    EXPECT_DOUBLE_EQ(1.0, computeEuclideanChecked(t(p0), t(p1)));
+    EXPECT_DOUBLE_EQ(1.0, computeEuclideanChecked(t(p0), t(p5)));
+    EXPECT_DOUBLE_EQ(9.0, computeEuclideanChecked(t(p0), t(p7)));
 
+    EXPECT_DOUBLE_EQ(2.0, computeEuclideanChecked(t(p1), t(p5)));
+    EXPECT_DOUBLE_EQ(12.0, computeEuclideanChecked(t(p1), t(p7)));
+    EXPECT_DOUBLE_EQ(14.0, computeEuclideanChecked(t(p5), t(p7)));
 }
 
 double computeAngularChecked(TypedCells a, TypedCells b) {
diff --git a/searchlib/src/vespa/searchlib/tensor/bound_distance_function.cpp b/searchlib/src/vespa/searchlib/tensor/bound_distance_function.cpp
index 56edbf9fede..19c2e744954 100644
--- a/searchlib/src/vespa/searchlib/tensor/bound_distance_function.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/bound_distance_function.cpp
@@ -49,6 +49,7 @@ TemporaryVectorStore<FloatType>::internal_convert(TypedCells cells, size_t offse
     return result;
 }
 
+template class TemporaryVectorStore<vespalib::eval::Int8Float>;
 template class TemporaryVectorStore<float>;
 template class TemporaryVectorStore<double>;
 
diff --git a/searchlib/src/vespa/searchlib/tensor/bound_distance_function.h b/searchlib/src/vespa/searchlib/tensor/bound_distance_function.h
index 0f51e8a33ef..8949aea8796 100644
--- a/searchlib/src/vespa/searchlib/tensor/bound_distance_function.h
+++ b/searchlib/src/vespa/searchlib/tensor/bound_distance_function.h
@@ -5,7 +5,6 @@
 #include <memory>
 #include <vespa/eval/eval/cell_type.h>
 #include <vespa/eval/eval/typed_cells.h>
-#include <vespa/vespalib/util/array.h>
 #include <vespa/vespalib/util/arrayref.h>
 #include "distance_function.h"
 
@@ -48,7 +47,7 @@ public:
 template <typename FloatType>
 class TemporaryVectorStore {
 private:
-    vespalib::Array<FloatType> _tmpSpace;
+    std::vector<FloatType> _tmpSpace;
     vespalib::ConstArrayRef<FloatType> internal_convert(vespalib::eval::TypedCells cells, size_t offset);
 public:
     TemporaryVectorStore(size_t vectorSize) : _tmpSpace(vectorSize * 2) {}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index 7ccca655943..cca492ef212 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -100,6 +100,13 @@ make_distance_function_factory(search::attribute::DistanceMetric variant,
         }
         return std::make_unique<AngularDistanceFunctionFactory<float>>();
     }
+    if (variant == DistanceMetric::Euclidean) {
+        switch (cell_type) {
+        case CellType::DOUBLE: return std::make_unique<EuclideanDistanceFunctionFactory<double>>();
+        case CellType::INT8:   return std::make_unique<EuclideanDistanceFunctionFactory<vespalib::eval::Int8Float>>();
+        default:               return std::make_unique<EuclideanDistanceFunctionFactory<float>>();
+        }
+    }
     auto df = make_distance_function(variant, cell_type);
     return std::make_unique<SimpleDistanceFunctionFactory>(std::move(df));
 }
diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
index c83f1821321..6a54798883f 100644
--- a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
@@ -48,4 +48,73 @@ SquaredEuclideanDistance::calc_with_limit(const vespalib::eval::TypedCells& lhs,
 template class SquaredEuclideanDistanceHW<float>;
 template class SquaredEuclideanDistanceHW<double>;
 
+using vespalib::eval::Int8Float;
+
+template<typename FloatType>
+class BoundEuclideanDistance : public BoundDistanceFunction {
+private:
+    const vespalib::hwaccelrated::IAccelrated & _computer;
+    mutable TemporaryVectorStore<FloatType> _tmpSpace;
+    const vespalib::ConstArrayRef<FloatType> _lhs_vector;
+    static const double *cast(const double * p) { return p; }
+    static const float *cast(const float * p) { return p; }
+    static const int8_t *cast(const Int8Float * p) { return reinterpret_cast<const int8_t *>(p); }
+public:
+    BoundEuclideanDistance(const vespalib::eval::TypedCells& lhs)
+        : BoundDistanceFunction(vespalib::eval::get_cell_type<FloatType>()),
+          _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()),
+          _tmpSpace(lhs.size),
+          _lhs_vector(_tmpSpace.storeLhs(lhs))
+    {}
+    double calc(const vespalib::eval::TypedCells& rhs) const override {
+        size_t sz = _lhs_vector.size();
+        vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
+        assert(sz == rhs_vector.size());
+        auto a = &_lhs_vector[0];
+        auto b = &rhs_vector[0];
+        return _computer.squaredEuclideanDistance(cast(a), cast(b), sz);
+    }
+    double convert_threshold(double threshold) const override {
+        return threshold*threshold;
+    }
+    double to_rawscore(double distance) const override {
+        double d = sqrt(distance);
+        double score = 1.0 / (1.0 + d);
+        return score;
+    }
+    double calc_with_limit(const vespalib::eval::TypedCells& rhs, double limit) const override {
+        vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
+        double sum = 0.0;
+        size_t sz = _lhs_vector.size();
+        assert(sz == rhs_vector.size());
+        for (size_t i = 0; i < sz && sum <= limit; ++i) {
+            double diff = _lhs_vector[i] - rhs_vector[i];
+            sum += diff*diff;
+        }
+        return sum;
+    }
+};
+
+template class BoundEuclideanDistance<Int8Float>;
+template class BoundEuclideanDistance<float>;
+template class BoundEuclideanDistance<double>;
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+EuclideanDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) {
+    using DFT = BoundEuclideanDistance<FloatType>;
+    return std::make_unique<DFT>(lhs);
+}
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+EuclideanDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) {
+    using DFT = BoundEuclideanDistance<FloatType>;
+    return std::make_unique<DFT>(lhs);
+}
+
+template class EuclideanDistanceFunctionFactory<Int8Float>;
+template class EuclideanDistanceFunctionFactory<float>;
+template class EuclideanDistanceFunctionFactory<double>;
+
 }
diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h
index 6505ea119ea..b406f0d3d1a 100644
--- a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h
+++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "distance_function.h"
+#include "distance_function_factory.h"
 #include <vespa/eval/eval/typed_cells.h>
 #include <vespa/vespalib/hwaccelrated/iaccelrated.h>
 #include <cmath>
@@ -78,4 +79,15 @@ private:
     const vespalib::hwaccelrated::IAccelrated & _computer;
 };
 
+
+template <typename FloatType>
+class EuclideanDistanceFunctionFactory : public DistanceFunctionFactory {
+public:
+    EuclideanDistanceFunctionFactory()
+        : DistanceFunctionFactory(vespalib::eval::get_cell_type<FloatType>())
+        {}
+    BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override;
+    BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override;
+};
+
 }
author	Arne Juul <arnej@yahooinc.com>	2023-04-20 13:26:02 +0000
committer	Arne Juul <arnej@yahooinc.com>	2023-04-24 07:58:38 +0000
commit	06ae8f17e040fe27e19dd7bdf4857ce0c4ccaba1 (patch)
tree	95635dda48f7f188f87e4b62ee2852a7cfb652c3
parent	bc7e1b8c1a96a49838e55a7b97c4de565abc9ad3 (diff)