summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorArne Juul <arnej@yahooinc.com>2023-04-20 14:06:11 +0000
committerArne Juul <arnej@yahooinc.com>2023-04-25 11:29:28 +0000
commite7091a54239d7d0f030ae161781a16aed4c86364 (patch)
treeceb17b43a998f0d3c5de46b1f6c9853f9fe05ea0 /searchlib
parentd5f17d23f377776e85aa687be17b211b54423c59 (diff)
add BoundPrenormalizedAngularDistance
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp61
-rw-r--r--searchlib/src/vespa/searchlib/tensor/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp14
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.h1
-rw-r--r--searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp82
-rw-r--r--searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h27
6 files changed, 186 insertions, 0 deletions
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index ae283f3f2b2..29637bd7bf4 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -257,6 +257,63 @@ TEST(DistanceFunctionsTest, angular_gives_expected_score)
EXPECT_DOUBLE_EQ(a66, computeAngularChecked(t(iv6), t(iv6)));
}
+double computePrenormalizedAngularChecked(TypedCells a, TypedCells b) {
+ static PrenormalizedAngularDistanceFunctionFactory<float> flt_dff;
+ static PrenormalizedAngularDistanceFunctionFactory<double> dbl_dff;
+ auto d_n = dbl_dff.for_query_vector(a);
+ auto d_f = flt_dff.for_query_vector(a);
+ auto d_i = dbl_dff.for_insertion_vector(a);
+ // normal:
+ double result = d_n->calc(b);
+ // insert is exactly same:
+ EXPECT_EQ(d_i->calc(b), result);
+ // float factory:
+ EXPECT_FLOAT_EQ(d_f->calc(b), result);
+ return result;
+}
+
+TEST(DistanceFunctionsTest, prenormalized_angular_gives_expected_score)
+{
+ std::vector<double> p0{0.0, 0.0, 0.0};
+ std::vector<double> p1{1.0, 0.0, 0.0};
+ std::vector<double> p2{0.0, 1.0, 0.0};
+ std::vector<double> p3{0.0, 0.0, 1.0};
+ std::vector<double> p4{0.5, 0.5, 0.707107};
+ std::vector<double> p5{0.0,-1.0, 0.0};
+ std::vector<double> p6{1.0, 2.0, 2.0};
+
+ PrenormalizedAngularDistanceFunctionFactory<double> dff;
+ auto pnad = dff.for_query_vector(t(p0));
+
+ double i12 = computePrenormalizedAngularChecked(t(p1), t(p2));
+ double i13 = computePrenormalizedAngularChecked(t(p1), t(p3));
+ double i23 = computePrenormalizedAngularChecked(t(p2), t(p3));
+ EXPECT_DOUBLE_EQ(i12, 1.0);
+ EXPECT_DOUBLE_EQ(i13, 1.0);
+ EXPECT_DOUBLE_EQ(i23, 1.0);
+
+ double i14 = computePrenormalizedAngularChecked(t(p1), t(p4));
+ double i24 = computePrenormalizedAngularChecked(t(p2), t(p4));
+ EXPECT_DOUBLE_EQ(i14, 0.5);
+ EXPECT_DOUBLE_EQ(i24, 0.5);
+ double i34 = computePrenormalizedAngularChecked(t(p3), t(p4));
+ EXPECT_FLOAT_EQ(i34, 1.0 - 0.707107);
+
+ double i25 = computePrenormalizedAngularChecked(t(p2), t(p5));
+ EXPECT_DOUBLE_EQ(i25, 2.0);
+
+ double i44 = computePrenormalizedAngularChecked(t(p4), t(p4));
+ EXPECT_GE(i44, 0.0);
+ EXPECT_LT(i44, 0.000001);
+
+ double threshold = pnad->convert_threshold(0.25);
+ EXPECT_DOUBLE_EQ(threshold, 0.25);
+ threshold = pnad->convert_threshold(0.5);
+ EXPECT_DOUBLE_EQ(threshold, 0.5);
+ threshold = pnad->convert_threshold(1.0);
+ EXPECT_DOUBLE_EQ(threshold, 1.0);
+}
+
TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
{
auto ct = vespalib::eval::CellType::DOUBLE;
@@ -292,6 +349,10 @@ TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
EXPECT_GE(i44, 0.0);
EXPECT_LT(i44, 0.000001);
+ double i66 = innerproduct->calc(t(p6), t(p6));
+ EXPECT_GE(i66, 0.0);
+ EXPECT_LT(i66, 0.000001);
+
double threshold = innerproduct->convert_threshold(0.25);
EXPECT_DOUBLE_EQ(threshold, 0.25);
threshold = innerproduct->convert_threshold(0.5);
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index 1783e0da1dd..2e874ffa4ae 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -30,6 +30,7 @@ vespa_add_library(searchlib_tensor OBJECT
large_subspaces_buffer_type.cpp
nearest_neighbor_index.cpp
nearest_neighbor_index_saver.cpp
+ prenormalized_angular_distance.cpp
serialized_fast_value_attribute.cpp
serialized_tensor_ref.cpp
small_subspaces_buffer_type.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index cca492ef212..4553f39a525 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -107,6 +107,20 @@ make_distance_function_factory(search::attribute::DistanceMetric variant,
default: return std::make_unique<EuclideanDistanceFunctionFactory<float>>();
}
}
+ if (variant == DistanceMetric::PrenormalizedAngular) {
+ if (cell_type == CellType::DOUBLE) {
+ return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<double>>();
+ }
+ return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<float>>();
+ }
+ /*
+ if (variant == DistanceMetric::GeoDegrees) {
+ return std::make_unique<GeoDistanceFunctionFactory>();
+ }
+ if (variant == DistanceMetric::Hamming) {
+ return std::make_unique<HammingDistanceFunctionFactory>();
+ }
+ */
auto df = make_distance_function(variant, cell_type);
return std::make_unique<SimpleDistanceFunctionFactory>(std::move(df));
}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
index b28cc2bda46..2300dba2db1 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
@@ -8,3 +8,4 @@
#include "geo_degrees_distance.h"
#include "hamming_distance.h"
#include "inner_product_distance.h"
+#include "prenormalized_angular_distance.h"
diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp
new file mode 100644
index 00000000000..d2693f9f443
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp
@@ -0,0 +1,82 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "prenormalized_angular_distance.h"
+#include "temporary_vector_store.h"
+
+using vespalib::typify_invoke;
+using vespalib::eval::TypifyCellType;
+
+namespace search::tensor {
+
+template<typename FloatType>
+class BoundPrenormalizedAngularDistance : public BoundDistanceFunction {
+private:
+ const vespalib::hwaccelrated::IAccelrated & _computer;
+ mutable TemporaryVectorStore<FloatType> _tmpSpace;
+ const vespalib::ConstArrayRef<FloatType> _lhs;
+ double _lhs_norm_sq;
+public:
+ BoundPrenormalizedAngularDistance(const vespalib::eval::TypedCells& lhs)
+ : BoundDistanceFunction(vespalib::eval::get_cell_type<FloatType>()),
+ _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()),
+ _tmpSpace(lhs.size),
+ _lhs(_tmpSpace.storeLhs(lhs))
+ {
+ auto a = _lhs.data();
+ _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size);
+ if (_lhs_norm_sq <= 0.0) {
+ _lhs_norm_sq = 1.0;
+ }
+ }
+ double calc(const vespalib::eval::TypedCells& rhs) const override {
+ size_t sz = _lhs.size();
+ vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs);
+ assert(sz == rhs_vector.size());
+ auto a = _lhs.data();
+ auto b = rhs_vector.data();
+ double dot_product = _computer.dotProduct(a, b, sz);
+ double distance = _lhs_norm_sq - dot_product;
+ return distance;
+ }
+ double convert_threshold(double threshold) const override {
+ double cosine_similarity = 1.0 - threshold;
+ double dot_product = cosine_similarity * _lhs_norm_sq;
+ double distance = _lhs_norm_sq - dot_product;
+ return distance;
+ }
+ double to_rawscore(double distance) const override {
+ double dot_product = _lhs_norm_sq - distance;
+ double cosine_similarity = dot_product / _lhs_norm_sq;
+ // should be in in range [-1,1] but roundoff may cause problems:
+ cosine_similarity = std::min(1.0, cosine_similarity);
+ cosine_similarity = std::max(-1.0, cosine_similarity);
+ double cosine_distance = 1.0 - cosine_similarity; // in range [0,2]
+ double score = 1.0 / (1.0 + cosine_distance);
+ return score;
+ }
+ double calc_with_limit(const vespalib::eval::TypedCells& rhs, double) const override {
+ return calc(rhs);
+ }
+};
+
+template class BoundPrenormalizedAngularDistance<float>;
+template class BoundPrenormalizedAngularDistance<double>;
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_query_vector(const vespalib::eval::TypedCells& lhs) {
+ using DFT = BoundPrenormalizedAngularDistance<FloatType>;
+ return std::make_unique<DFT>(lhs);
+}
+
+template <typename FloatType>
+BoundDistanceFunction::UP
+PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_insertion_vector(const vespalib::eval::TypedCells& lhs) {
+ using DFT = BoundPrenormalizedAngularDistance<FloatType>;
+ return std::make_unique<DFT>(lhs);
+}
+
+template class PrenormalizedAngularDistanceFunctionFactory<float>;
+template class PrenormalizedAngularDistanceFunctionFactory<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h
new file mode 100644
index 00000000000..88953a236e7
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h
@@ -0,0 +1,27 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include "bound_distance_function.h"
+#include "distance_function_factory.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+
+namespace search::tensor {
+
+/**
+ * Calculates inner-product "distance" between vectors with assumed norm 1.
+ * Should give same ordering as Angular distance, but is less expensive.
+ */
+template <typename FloatType>
+class PrenormalizedAngularDistanceFunctionFactory : public DistanceFunctionFactory {
+public:
+ PrenormalizedAngularDistanceFunctionFactory()
+ : DistanceFunctionFactory(vespalib::eval::get_cell_type<FloatType>())
+ {}
+ BoundDistanceFunction::UP for_query_vector(const vespalib::eval::TypedCells& lhs) override;
+ BoundDistanceFunction::UP for_insertion_vector(const vespalib::eval::TypedCells& lhs) override;
+};
+
+}