aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src
diff options
context:
space:
mode:
authorArne H Juul <arnej27959@users.noreply.github.com>2020-06-29 13:16:20 +0200
committerGitHub <noreply@github.com>2020-06-29 13:16:20 +0200
commita58193603b25fbf75fac825aa44e789cb711e70c (patch)
tree732b1c664078f977457e8e90588bba0411013b15 /searchlib/src
parentb6a411e72d1ecff196be130f91d90e93529f5c48 (diff)
parent28a0996ab97193ebe5588fa25c38873f32ba37f4 (diff)
Merge pull request #13721 from vespa-engine/arnej/innerproduct-distance-metric
Arnej/innerproduct distance metric
Diffstat (limited to 'searchlib/src')
-rw-r--r--searchlib/src/tests/attribute/attributemanager/attributemanager_test.cpp7
-rw-r--r--searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp93
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_header.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/configconverter.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/tensor/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp7
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.cpp19
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.h56
8 files changed, 166 insertions, 22 deletions
diff --git a/searchlib/src/tests/attribute/attributemanager/attributemanager_test.cpp b/searchlib/src/tests/attribute/attributemanager/attributemanager_test.cpp
index b94186626c2..1191a7aa2e2 100644
--- a/searchlib/src/tests/attribute/attributemanager/attributemanager_test.cpp
+++ b/searchlib/src/tests/attribute/attributemanager/attributemanager_test.cpp
@@ -289,6 +289,12 @@ AttributeManagerTest::testConfigConvert()
auto out = ConfigConverter::convert(a);
EXPECT_TRUE(out.distance_metric() == DistanceMetric::GeoDegrees);
}
+ { // distance metric (explicit)
+ CACA a;
+ a.distancemetric = AttributesConfig::Attribute::Distancemetric::INNERPRODUCT;
+ auto out = ConfigConverter::convert(a);
+ EXPECT_TRUE(out.distance_metric() == DistanceMetric::InnerProduct);
+ }
{ // hnsw index params (enabled)
auto dm_in = AttributesConfig::Attribute::Distancemetric::ANGULAR;
auto dm_out = DistanceMetric::Angular;
@@ -306,6 +312,7 @@ AttributeManagerTest::testConfigConvert()
EXPECT_TRUE(params.distance_metric() == dm_out);
EXPECT_TRUE(params.multi_threaded_indexing());
}
+
{ // hnsw index params (disabled)
CACA a;
a.index.hnsw.enabled = false;
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index 59532919347..082d04f104b 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -31,12 +31,11 @@ void verify_geo_miles(const DistanceFunction *dist_fun,
}
-TEST(DistanceFunctionsTest, gives_expected_score)
+TEST(DistanceFunctionsTest, euclidean_gives_expected_score)
{
auto ct = vespalib::eval::ValueType::CellType::DOUBLE;
auto euclid = make_distance_function(DistanceMetric::Euclidean, ct);
- auto angular = make_distance_function(DistanceMetric::Angular, ct);
std::vector<double> p0{0.0, 0.0, 0.0};
std::vector<double> p1{1.0, 0.0, 0.0};
@@ -44,33 +43,103 @@ TEST(DistanceFunctionsTest, gives_expected_score)
std::vector<double> p3{0.0, 0.0, 1.0};
std::vector<double> p4{0.5, 0.5, 0.707107};
std::vector<double> p5{0.0,-1.0, 0.0};
+ std::vector<double> p6{1.0, 2.0, 2.0};
double n4 = euclid->calc(t(p0), t(p4));
- EXPECT_GT(n4, 0.99999);
- EXPECT_LT(n4, 1.00001);
+ EXPECT_FLOAT_EQ(n4, 1.0);
double d12 = euclid->calc(t(p1), t(p2));
EXPECT_EQ(d12, 2.0);
+ EXPECT_DOUBLE_EQ(euclid->to_rawscore(d12), 1.0/(1.0 + sqrt(2.0)));
+}
+
+TEST(DistanceFunctionsTest, angular_gives_expected_score)
+{
+ auto ct = vespalib::eval::ValueType::CellType::DOUBLE;
+ auto angular = make_distance_function(DistanceMetric::Angular, ct);
+
+ std::vector<double> p0{0.0, 0.0, 0.0};
+ std::vector<double> p1{1.0, 0.0, 0.0};
+ std::vector<double> p2{0.0, 1.0, 0.0};
+ std::vector<double> p3{0.0, 0.0, 1.0};
+ std::vector<double> p4{0.5, 0.5, 0.707107};
+ std::vector<double> p5{0.0,-1.0, 0.0};
+ std::vector<double> p6{1.0, 2.0, 2.0};
+
+ constexpr double pi = 3.14159265358979323846;
double a12 = angular->calc(t(p1), t(p2));
double a13 = angular->calc(t(p1), t(p3));
double a23 = angular->calc(t(p2), t(p3));
- EXPECT_EQ(a12, 1.0);
- EXPECT_EQ(a13, 1.0);
- EXPECT_EQ(a23, 1.0);
+ EXPECT_DOUBLE_EQ(a12, 1.0);
+ EXPECT_DOUBLE_EQ(a13, 1.0);
+ EXPECT_DOUBLE_EQ(a23, 1.0);
+ EXPECT_FLOAT_EQ(angular->to_rawscore(a12), 1.0/(1.0 + pi/2));
+
double a14 = angular->calc(t(p1), t(p4));
double a24 = angular->calc(t(p2), t(p4));
- EXPECT_EQ(a14, 0.5);
- EXPECT_EQ(a24, 0.5);
+ EXPECT_FLOAT_EQ(a14, 0.5);
+ EXPECT_FLOAT_EQ(a24, 0.5);
+ EXPECT_FLOAT_EQ(angular->to_rawscore(a14), 1.0/(1.0 + pi/3));
+
double a34 = angular->calc(t(p3), t(p4));
- EXPECT_GT(a34, 0.999999 - 0.707107);
- EXPECT_LT(a34, 1.000001 - 0.707107);
+ EXPECT_FLOAT_EQ(a34, (1.0 - 0.707107));
+ EXPECT_FLOAT_EQ(angular->to_rawscore(a34), 1.0/(1.0 + pi/4));
double a25 = angular->calc(t(p2), t(p5));
- EXPECT_EQ(a25, 2.0);
+ EXPECT_DOUBLE_EQ(a25, 2.0);
+ EXPECT_FLOAT_EQ(angular->to_rawscore(a25), 1.0/(1.0 + pi));
double a44 = angular->calc(t(p4), t(p4));
EXPECT_GE(a44, 0.0);
EXPECT_LT(a44, 0.000001);
+ EXPECT_FLOAT_EQ(angular->to_rawscore(a44), 1.0);
+
+ double a66 = angular->calc(t(p6), t(p6));
+ EXPECT_GE(a66, 0.0);
+ EXPECT_LT(a66, 0.000001);
+ EXPECT_FLOAT_EQ(angular->to_rawscore(a66), 1.0);
+
+ double a16 = angular->calc(t(p1), t(p6));
+ double a26 = angular->calc(t(p2), t(p6));
+ double a36 = angular->calc(t(p3), t(p6));
+ EXPECT_FLOAT_EQ(a16, 1.0 - (1.0/3.0));
+ EXPECT_FLOAT_EQ(a26, 1.0 - (2.0/3.0));
+ EXPECT_FLOAT_EQ(a36, 1.0 - (2.0/3.0));
+}
+
+TEST(DistanceFunctionsTest, innerproduct_gives_expected_score)
+{
+ auto ct = vespalib::eval::ValueType::CellType::DOUBLE;
+
+ auto innerproduct = make_distance_function(DistanceMetric::InnerProduct, ct);
+
+ std::vector<double> p0{0.0, 0.0, 0.0};
+ std::vector<double> p1{1.0, 0.0, 0.0};
+ std::vector<double> p2{0.0, 1.0, 0.0};
+ std::vector<double> p3{0.0, 0.0, 1.0};
+ std::vector<double> p4{0.5, 0.5, 0.707107};
+ std::vector<double> p5{0.0,-1.0, 0.0};
+ std::vector<double> p6{1.0, 2.0, 2.0};
+
+ double i12 = innerproduct->calc(t(p1), t(p2));
+ double i13 = innerproduct->calc(t(p1), t(p3));
+ double i23 = innerproduct->calc(t(p2), t(p3));
+ EXPECT_DOUBLE_EQ(i12, 1.0);
+ EXPECT_DOUBLE_EQ(i13, 1.0);
+ EXPECT_DOUBLE_EQ(i23, 1.0);
+ double i14 = innerproduct->calc(t(p1), t(p4));
+ double i24 = innerproduct->calc(t(p2), t(p4));
+ EXPECT_DOUBLE_EQ(i14, 0.5);
+ EXPECT_DOUBLE_EQ(i24, 0.5);
+ double i34 = innerproduct->calc(t(p3), t(p4));
+ EXPECT_FLOAT_EQ(i34, 1.0 - 0.707107);
+
+ double i25 = innerproduct->calc(t(p2), t(p5));
+ EXPECT_DOUBLE_EQ(i25, 2.0);
+
+ double i44 = innerproduct->calc(t(p4), t(p4));
+ EXPECT_GE(i44, 0.0);
+ EXPECT_LT(i44, 0.000001);
}
TEST(GeoDegreesTest, gives_expected_score)
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
index e97b0364af8..acf0d3d2fd6 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_header.cpp
@@ -27,6 +27,7 @@ const vespalib::string hnsw_distance_metric = "hnsw.distance_metric";
const vespalib::string euclidean = "euclidean";
const vespalib::string angular = "angular";
const vespalib::string geodegrees = "geodegrees";
+const vespalib::string innerproduct = "innerproduct";
const vespalib::string doc_id_limit_tag = "docIdLimit";
const vespalib::string enumerated_tag = "enumerated";
const vespalib::string unique_value_count_tag = "uniqueValueCount";
@@ -97,6 +98,7 @@ to_string(DistanceMetric metric)
case DistanceMetric::Euclidean: return euclidean;
case DistanceMetric::Angular: return angular;
case DistanceMetric::GeoDegrees: return geodegrees;
+ case DistanceMetric::InnerProduct: return innerproduct;
}
throw vespalib::IllegalArgumentException("Unknown distance metric " + std::to_string(static_cast<int>(metric)));
}
diff --git a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
index f435f79bf65..5a8b32ec01b 100644
--- a/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/configconverter.cpp
@@ -85,6 +85,9 @@ ConfigConverter::convert(const AttributesConfig::Attribute & cfg)
case CfgDm::GEODEGREES:
dm = DistanceMetric::GeoDegrees;
break;
+ case CfgDm::INNERPRODUCT:
+ dm = DistanceMetric::InnerProduct;
+ break;
}
retval.set_distance_metric(dm);
if (cfg.index.hnsw.enabled) {
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index 0f106f693f8..35615b255c0 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -6,6 +6,7 @@ vespa_add_library(searchlib_tensor OBJECT
dense_tensor_attribute_saver.cpp
dense_tensor_store.cpp
distance_function_factory.cpp
+ distance_functions.cpp
generic_tensor_attribute.cpp
generic_tensor_attribute_saver.cpp
generic_tensor_store.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index 6b24a062727..b76994d6092 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -33,6 +33,13 @@ make_distance_function(DistanceMetric variant, ValueType::CellType cell_type)
return std::make_unique<GeoDegreesDistance<double>>();
}
break;
+ case DistanceMetric::InnerProduct:
+ if (cell_type == ValueType::CellType::FLOAT) {
+ return std::make_unique<InnerProductDistance<float>>();
+ } else {
+ return std::make_unique<InnerProductDistance<double>>();
+ }
+ break;
}
// not reached:
return DistanceFunction::UP();
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
new file mode 100644
index 00000000000..9017628d42c
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
@@ -0,0 +1,19 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "distance_functions.h"
+
+namespace search::tensor {
+
+template class SquaredEuclideanDistance<float>;
+template class SquaredEuclideanDistance<double>;
+
+template class AngularDistance<float>;
+template class AngularDistance<double>;
+
+template class InnerProductDistance<float>;
+template class InnerProductDistance<double>;
+
+template class GeoDegreesDistance<float>;
+template class GeoDegreesDistance<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
index d37495e85da..7e75920619f 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
@@ -50,11 +50,8 @@ public:
const vespalib::hwaccelrated::IAccelrated & _computer;
};
-template class SquaredEuclideanDistance<float>;
-template class SquaredEuclideanDistance<double>;
-
/**
- * Calculates angular distance between vectors with assumed norm 1.
+ * Calculates angular distance between vectors
*/
template <typename FloatType>
class AngularDistance : public DistanceFunction {
@@ -67,6 +64,51 @@ public:
auto rhs_vector = rhs.typify<FloatType>();
size_t sz = lhs_vector.size();
assert(sz == rhs_vector.size());
+ auto a = &lhs_vector[0];
+ auto b = &rhs_vector[0];
+ double a_norm_sq = _computer.dotProduct(a, a, sz);
+ double b_norm_sq = _computer.dotProduct(b, b, sz);
+ double squared_norms = a_norm_sq * b_norm_sq;
+ double dot_product = _computer.dotProduct(a, b, sz);
+ double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0;
+ double cosine_similarity = dot_product / div;
+ double distance = 1.0 - cosine_similarity; // in range [0,2]
+ return distance;
+ }
+ double to_rawscore(double distance) const override {
+ double cosine_similarity = 1.0 - distance;
+ // should be in in range [-1,1] but roundoff may cause problems:
+ cosine_similarity = std::min(1.0, cosine_similarity);
+ cosine_similarity = std::max(-1.0, cosine_similarity);
+ double angle_distance = acos(cosine_similarity); // in range [0,pi]
+ double score = 1.0 / (1.0 + angle_distance);
+ return score;
+ }
+ double calc_with_limit(const vespalib::tensor::TypedCells& lhs,
+ const vespalib::tensor::TypedCells& rhs,
+ double /*limit*/) const override
+ {
+ return calc(lhs, rhs);
+ }
+
+ const vespalib::hwaccelrated::IAccelrated & _computer;
+};
+
+/**
+ * Calculates inner-product "distance" between vectors with assumed norm 1.
+ * Should give same ordering as Angular distance, but is less expensive.
+ */
+template <typename FloatType>
+class InnerProductDistance : public DistanceFunction {
+public:
+ InnerProductDistance()
+ : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator())
+ {}
+ double calc(const vespalib::tensor::TypedCells& lhs, const vespalib::tensor::TypedCells& rhs) const override {
+ auto lhs_vector = lhs.typify<FloatType>();
+ auto rhs_vector = rhs.typify<FloatType>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz);
return std::max(0.0, score);
}
@@ -84,9 +126,6 @@ public:
const vespalib::hwaccelrated::IAccelrated & _computer;
};
-template class AngularDistance<float>;
-template class AngularDistance<double>;
-
/**
* Calculates great-circle distance between Latitude/Longitude pairs,
* measured in degrees. Output distance is measured in meters.
@@ -139,7 +178,4 @@ public:
};
-template class GeoDegreesDistance<float>;
-template class GeoDegreesDistance<double>;
-
}