aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorArne H Juul <arnej27959@users.noreply.github.com>2023-04-25 17:29:38 +0200
committerGitHub <noreply@github.com>2023-04-25 17:29:38 +0200
commit1380d8ab1a17aab66c4ea76bfc956a91fcb54cb5 (patch)
treec3b2b159e40da8325d9b49fa8809532d165140f9 /searchlib
parent21773bf29f0c2318a23661a106b5c29562468ffb (diff)
parentd9a41679f37d93e3a98338fe01de0ba62d4acdbf (diff)
Merge pull request #26846 from vespa-engine/arnej/bound-prenormalized-3
Arnej/bound prenormalized 3
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp99
-rw-r--r--searchlib/src/vespa/searchlib/tensor/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/tensor/angular_distance.cpp6
-rw-r--r--searchlib/src/vespa/searchlib/tensor/angular_distance.h4
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp14
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.h1
-rw-r--r--searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/tensor/inner_product_distance.h2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp82
-rw-r--r--searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h27
10 files changed, 227 insertions, 13 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index ae283f3f2b2..e5bed3ebae5 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -69,6 +69,8 @@ double computeEuclideanChecked(TypedCells a, TypedCells b) {
return result;
}
+namespace { constexpr double sq_root_half = std::sqrt(0.5); }
+
TEST(DistanceFunctionsTest, euclidean_gives_expected_score)
{
auto ct = vespalib::eval::CellType::DOUBLE;
@@ -79,7 +81,7 @@ TEST(DistanceFunctionsTest, euclidean_gives_expected_score)
std::vector<double> p1{1.0, 0.0, 0.0};
std::vector<double> p2{0.0, 1.0, 0.0};
std::vector<double> p3{0.0, 0.0, 1.0};
- std::vector<double> p4{0.5, 0.5, 0.707107};
+ std::vector<double> p4{0.5, 0.5, sq_root_half};
std::vector<double> p5{0.0,-1.0, 0.0};
std::vector<double> p6{1.0, 2.0, 2.0};
@@ -179,7 +181,7 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
std::vector<double> p1{1.0, 0.0, 0.0};
std::vector<double> p2{0.0, 1.0, 0.0};
std::vector<double> p3{0.0, 0.0, 1.0};
- std::vector<double> p4{0.5, 0.5, 0.707107};
+ std::vector<double> p4{0.5, 0.5, sq_root_half};
std::vector<double> p5{0.0,-1.0, 0.0};
std::vector<double> p6{1.0, 2.0, 2.0};
@@ -207,7 +209,7 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
EXPECT_DOUBLE_EQ(threshold, 0.5);
double a34 = computeAngularChecked(t(p3), t(p4));
- EXPECT_FLOAT_EQ(a34, (1.0 - 0.707107));
+ EXPECT_FLOAT_EQ(a34, (1.0 - sq_root_half));
EXPECT_FLOAT_EQ(angular->to_rawscore(a34), 1.0/(1.0 + pi/4));
threshold = angular->convert_threshold(pi/4);
EXPECT_FLOAT_EQ(threshold, a34);
@@ -257,6 +259,89 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
EXPECT_DOUBLE_EQ(a66, computeAngularChecked(t(iv6), t(iv6)));
}
+double computePrenormalizedAngularChecked(TypedCells a, TypedCells b) {
+ static PrenormalizedAngularDistanceFunctionFactory<float> flt_dff;
+ static PrenormalizedAngularDistanceFunctionFactory<double> dbl_dff;
+ auto d_n = dbl_dff.for_query_vector(a);
+ auto d_f = flt_dff.for_query_vector(a);
+ auto d_r = dbl_dff.for_query_vector(b);
+ auto d_i = dbl_dff.for_insertion_vector(a);
+ // normal:
+ double result = d_n->calc(b);
+ // insert is exactly same:
+ EXPECT_EQ(d_i->calc(b), result);
+ // note: for this distance, reverse is not necessarily equal,
+ // since we normalize based on length of LHS only
+ EXPECT_FLOAT_EQ(d_r->calc(a), result);
+ // float factory:
+ EXPECT_FLOAT_EQ(d_f->calc(b), result);
+ double closeness_n = d_n->to_rawscore(result);
+ double closeness_f = d_f->to_rawscore(result);
+ double closeness_r = d_r->to_rawscore(result);
+ double closeness_i = d_i->to_rawscore(result);
+ EXPECT_DOUBLE_EQ(closeness_n, closeness_f);
+ EXPECT_DOUBLE_EQ(closeness_n, closeness_r);
+ EXPECT_DOUBLE_EQ(closeness_n, closeness_i);
+ EXPECT_GT(closeness_n, 0.0);
+ EXPECT_LE(closeness_n, 1.0);
+ return result;
+}
+
+TEST(DistanceFunctionsTest, prenormalized_angular_gives_expected_score)
+{
+ std::vector<double> p0{0.0, 0.0, 0.0};
+ std::vector<double> p1{1.0, 0.0, 0.0};
+ std::vector<double> p2{0.0, 1.0, 0.0};
+ std::vector<double> p3{0.0, 0.0, 1.0};
+ std::vector<double> p4{0.5, 0.5, sq_root_half};
+ std::vector<double> p5{0.0,-1.0, 0.0};
+ std::vector<double> p6{1.0, 2.0, 2.0};
+ std::vector<double> p7{2.0, -1.0, -2.0};
+ std::vector<double> p8{3.0, 0.0, 0.0};
+
+ PrenormalizedAngularDistanceFunctionFactory<double> dff;
+ auto pnad = dff.for_query_vector(t(p0));
+
+ double i12 = computePrenormalizedAngularChecked(t(p1), t(p2));
+ double i13 = computePrenormalizedAngularChecked(t(p1), t(p3));
+ double i23 = computePrenormalizedAngularChecked(t(p2), t(p3));
+ EXPECT_DOUBLE_EQ(i12, 1.0);
+ EXPECT_DOUBLE_EQ(i13, 1.0);
+ EXPECT_DOUBLE_EQ(i23, 1.0);
+
+ double i14 = computePrenormalizedAngularChecked(t(p1), t(p4));
+ double i24 = computePrenormalizedAngularChecked(t(p2), t(p4));
+ EXPECT_DOUBLE_EQ(i14, 0.5);
+ EXPECT_DOUBLE_EQ(i24, 0.5);
+ double i34 = computePrenormalizedAngularChecked(t(p3), t(p4));
+ EXPECT_FLOAT_EQ(i34, 1.0 - sq_root_half);
+
+ double i25 = computePrenormalizedAngularChecked(t(p2), t(p5));
+ EXPECT_DOUBLE_EQ(i25, 2.0);
+
+ double i44 = computePrenormalizedAngularChecked(t(p4), t(p4));
+ EXPECT_GE(i44, 0.0);
+ EXPECT_LT(i44, 0.000001);
+
+ double i66 = computePrenormalizedAngularChecked(t(p6), t(p6));
+ EXPECT_GE(i66, 0.0);
+ EXPECT_LT(i66, 0.000001);
+
+ double i67 = computePrenormalizedAngularChecked(t(p6), t(p7));
+ EXPECT_DOUBLE_EQ(i67, 13.0);
+ double i68 = computePrenormalizedAngularChecked(t(p6), t(p8));
+ EXPECT_DOUBLE_EQ(i68, 6.0);
+ double i78 = computePrenormalizedAngularChecked(t(p7), t(p8));
+ EXPECT_DOUBLE_EQ(i78, 3.0);
+
+ double threshold = pnad->convert_threshold(0.25);
+ EXPECT_DOUBLE_EQ(threshold, 0.25);
+ threshold = pnad->convert_threshold(0.5);
+ EXPECT_DOUBLE_EQ(threshold, 0.5);
+ threshold = pnad->convert_threshold(1.0);
+ EXPECT_DOUBLE_EQ(threshold, 1.0);
+}
+
TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
{
auto ct = vespalib::eval::CellType::DOUBLE;
@@ -267,7 +352,7 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
std::vector<double> p1{1.0, 0.0, 0.0};
std::vector<double> p2{0.0, 1.0, 0.0};
std::vector<double> p3{0.0, 0.0, 1.0};
- std::vector<double> p4{0.5, 0.5, 0.707107};
+ std::vector<double> p4{0.5, 0.5, sq_root_half};
std::vector<double> p5{0.0,-1.0, 0.0};
std::vector<double> p6{1.0, 2.0, 2.0};
@@ -283,7 +368,7 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
EXPECT_DOUBLE_EQ(i14, 0.5);
EXPECT_DOUBLE_EQ(i24, 0.5);
double i34 = innerproduct->calc(t(p3), t(p4));
- EXPECT_FLOAT_EQ(i34, 1.0 - 0.707107);
+ EXPECT_FLOAT_EQ(i34, 1.0 - sq_root_half);
double i25 = innerproduct->calc(t(p2), t(p5));
EXPECT_DOUBLE_EQ(i25, 2.0);
@@ -292,6 +377,10 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
EXPECT_GE(i44, 0.0);
EXPECT_LT(i44, 0.000001);
+ double i66 = innerproduct->calc(t(p6), t(p6));
+ EXPECT_GE(i66, 0.0);
+ EXPECT_LT(i66, 0.000001);
+
double threshold = innerproduct->convert_threshold(0.25);
EXPECT_DOUBLE_EQ(threshold, 0.25);
threshold = innerproduct->convert_threshold(0.5);
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index 1783e0da1dd..2e874ffa4ae 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -30,6 +30,7 @@ vespa_add_library(searchlib_tensor OBJECT
large_subspaces_buffer_type.cpp
nearest_neighbor_index.cpp
nearest_neighbor_index_saver.cpp
+ prenormalized_angular_distance.cpp
serialized_fast_value_attribute.cpp
serialized_tensor_ref.cpp
small_subspaces_buffer_type.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
index 85eac76728c..efc1170baf5 100644
--- a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
@@ -66,15 +66,15 @@ public:
_tmpSpace(lhs.size),
_lhs(_tmpSpace.storeLhs(lhs))
{
- auto a = &_lhs[0];
+ auto a = _lhs.data();
_lhs_norm_sq = _computer.dotProduct(a, a, lhs.size);
}
double calc(const vespalib::eval::TypedCells& rhs) const override {
size_t sz = _lhs.size();
vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
assert(sz == rhs_vector.size());
- auto a = &_lhs[0];
- auto b = &rhs_vector[0];
+ auto a = _lhs.data();
+ auto b = rhs_vector.data();
double b_norm_sq = _computer.dotProduct(b, b, sz);
double squared_norms = _lhs_norm_sq * b_norm_sq;
double dot_product = _computer.dotProduct(a, b, sz);
diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h
index 97f692d05a2..bba83576153 100644
--- a/searchlib/src/vespa/searchlib/tensor/angular_distance.h
+++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.h
@@ -60,8 +60,8 @@ public:
auto rhs_vector = rhs.typify<FloatType>();
size_t sz = lhs_vector.size();
assert(sz == rhs_vector.size());
- auto a = &lhs_vector[0];
- auto b = &rhs_vector[0];
+ auto a = lhs_vector.data();
+ auto b = rhs_vector.data();
double a_norm_sq = _computer.dotProduct(a, a, sz);
double b_norm_sq = _computer.dotProduct(b, b, sz);
double squared_norms = a_norm_sq * b_norm_sq;
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index cca492ef212..4553f39a525 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -107,6 +107,20 @@ make_distance_function_factory(search::attribute::DistanceMetric variant,
default: return std::make_unique<EuclideanDistanceFunctionFactory<float>>();
}
}
+ if (variant == DistanceMetric::PrenormalizedAngular) {
+ if (cell_type == CellType::DOUBLE) {
+ return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<double>>();
+ }
+ return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<float>>();
+ }
+ /*
+ if (variant == DistanceMetric::GeoDegrees) {
+ return std::make_unique<GeoDistanceFunctionFactory>();
+ }
+ if (variant == DistanceMetric::Hamming) {
+ return std::make_unique<HammingDistanceFunctionFactory>();
+ }
+ */
auto df = make_distance_function(variant, cell_type);
return std::make_unique<SimpleDistanceFunctionFactory>(std::move(df));
}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
index b28cc2bda46..2300dba2db1 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
@@ -8,3 +8,4 @@
#include "geo_degrees_distance.h"
#include "hamming_distance.h"
#include "inner_product_distance.h"
+#include "prenormalized_angular_distance.h"
diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
index 92d4e7af406..9c37b191637 100644
--- a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
@@ -71,8 +71,8 @@ public:
size_t sz = _lhs_vector.size();
vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
assert(sz == rhs_vector.size());
- auto a = &_lhs_vector[0];
- auto b = &rhs_vector[0];
+ auto a = _lhs_vector.data();
+ auto b = rhs_vector.data();
return _computer.squaredEuclideanDistance(cast(a), cast(b), sz);
}
double convert_threshold(double threshold) const override {
diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h
index 8ba14580885..135bb186fd4 100644
--- a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h
+++ b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h
@@ -54,7 +54,7 @@ public:
auto rhs_vector = rhs.typify<FloatType>();
size_t sz = lhs_vector.size();
assert(sz == rhs_vector.size());
- double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz);
+ double score = 1.0 - _computer.dotProduct(lhs_vector.data(), rhs_vector.data(), sz);
return std::max(0.0, score);
}
private:
diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp
new file mode 100644
index 00000000000..d2693f9f443
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp
@@ -0,0 +1,82 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "prenormalized_angular_distance.h"
+#include "temporary_vector_store.h"
+
+using vespalib::typify_invoke;
+using vespalib::eval::TypifyCellType;
+
+namespace search::tensor {
+
+template<typename FloatType>
+class BoundPrenormalizedAngularDistance : public BoundDistanceFunction {
+private:
+ const vespalib::hwaccelrated::IAccelrated & _computer;
+ mutable TemporaryVectorStore<FloatType> _tmpSpace;
+ const vespalib::ConstArrayRef<FloatType> _lhs;
+ double _lhs_norm_sq;
+public:
+ BoundPrenormalizedAngularDistance(const vespalib::eval::TypedCells& lhs)
+ : BoundDistanceFunction(vespalib::eval::get_cell_type<FloatType>()),
+ _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()),
+ _tmpSpace(lhs.size),
+ _lhs(_tmpSpace.storeLhs(lhs))
+ {
+ auto a = _lhs.data();
+ _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size);
+ if (_lhs_norm_sq <= 0.0) {
+ _lhs_norm_sq = 1.0;
+ }
+ }
+ double calc(const vespalib::eval::TypedCells& rhs) const override {
+ size_t sz = _lhs.size();
+ vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
+ assert(sz == rhs_vector.size());
+ auto a = _lhs.data();
+ auto b = rhs_vector.data();
+ double dot_product = _computer.dotProduct(a, b, sz);
+ double distance = _lhs_norm_sq - dot_product;
+ return distance;
+ }
+ double convert_threshold(double threshold) const override {
+ double cosine_similarity = 1.0 - threshold;
+ double dot_product = cosine_similarity * _lhs_norm_sq;
+ double distance = _lhs_norm_sq - dot_product;
+ return distance;
+ }
+ double to_rawscore(double distance) const override {
+ double dot_product = _lhs_norm_sq - distance;
+ double cosine_similarity = dot_product / _lhs_norm_sq;
+ // should be in in range [-1,1] but roundoff may cause problems:
+ cosine_similarity = std::min(1.0, cosine_similarity);
+ cosine_similarity = std::max(-1.0, cosine_similarity);
+ double cosine_distance = 1.0 - cosine_similarity; // in range [0,2]
+ double score = 1.0 / (1.0 + cosine_distance);
+ return score;
+ }
+ double calc_with_limit(const vespalib::eval::TypedCells& rhs, double) const override {
+ return calc(rhs);
+ }
+};
+
+template class BoundPrenormalizedAngularDistance<float>;
+template class BoundPrenormalizedAngularDistance<double>;
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) {
+ using DFT = BoundPrenormalizedAngularDistance<FloatType>;
+ return std::make_unique<DFT>(lhs);
+}
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) {
+ using DFT = BoundPrenormalizedAngularDistance<FloatType>;
+ return std::make_unique<DFT>(lhs);
+}
+
+template class PrenormalizedAngularDistanceFunctionFactory<float>;
+template class PrenormalizedAngularDistanceFunctionFactory<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h
new file mode 100644
index 00000000000..88953a236e7
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h
@@ -0,0 +1,27 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include "bound_distance_function.h"
+#include "distance_function_factory.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+
+namespace search::tensor {
+
+/**
+ * Calculates inner-product "distance" between vectors with assumed norm 1.
+ * Should give same ordering as Angular distance, but is less expensive.
+ */
+template <typename FloatType>
+class PrenormalizedAngularDistanceFunctionFactory : public DistanceFunctionFactory {
+public:
+ PrenormalizedAngularDistanceFunctionFactory()
+ : DistanceFunctionFactory(vespalib::eval::get_cell_type<FloatType>())
+ {}
+ BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override;
+ BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override;
+};
+
+}