summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorArne H Juul <arnej27959@users.noreply.github.com>2021-04-14 11:07:37 +0200
committerGitHub <noreply@github.com>2021-04-14 11:07:37 +0200
commit53f821ab8ce9940dca4639738d0ca997016e47ed (patch)
tree6447e12a919fa69f84eccc33ff2a4ea710229d0b /searchlib
parent9f8a6c642871300d7c937b610bd32dfceb6b29ea (diff)
parent9dd87566bafd50e50036a68095cc24df04a20ee0 (diff)
Merge pull request #17378 from vespa-engine/arnej/redo-nn-distance-for-new-cell-types-2
Arnej/redo nn distance for new cell types 2
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp2
-rw-r--r--searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp34
-rw-r--r--searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp3
-rw-r--r--searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp73
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp2
-rw-r--r--searchlib/src/vespa/searchlib/tensor/CMakeLists.txt6
-rw-r--r--searchlib/src/vespa/searchlib/tensor/angular_distance.cpp52
-rw-r--r--searchlib/src/vespa/searchlib/tensor/angular_distance.h76
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_function.h13
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp61
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.cpp22
-rw-r--r--searchlib/src/vespa/searchlib/tensor/distance_functions.h247
-rw-r--r--searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp51
-rw-r--r--searchlib/src/vespa/searchlib/tensor/euclidean_distance.h59
-rw-r--r--searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp50
-rw-r--r--searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h53
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp61
-rw-r--r--searchlib/src/vespa/searchlib/tensor/hamming_distance.h38
-rw-r--r--searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp44
-rw-r--r--searchlib/src/vespa/searchlib/tensor/inner_product_distance.h64
21 files changed, 673 insertions, 340 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
index f1d910f2635..9621b93fd37 100644
--- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
+++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp
@@ -230,7 +230,7 @@ public:
const search::tensor::DistanceFunction *distance_function() const override {
- static search::tensor::SquaredEuclideanDistance<double> my_dist_fun;
+ static search::tensor::SquaredEuclideanDistance my_dist_fun(vespalib::eval::CellType::DOUBLE);
return &my_dist_fun;
}
};
diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
index ee0a2aff80e..701f4c91ff2 100644
--- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
+++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp
@@ -10,10 +10,12 @@
LOG_SETUP("distance_function_test");
using namespace search::tensor;
+using vespalib::eval::Int8Float;
using vespalib::eval::TypedCells;
using search::attribute::DistanceMetric;
-TypedCells t(const std::vector<double> &v) { return TypedCells(v); }
+template <typename T>
+TypedCells t(const std::vector<T> &v) { return TypedCells(v); }
void verify_geo_miles(const DistanceFunction *dist_fun,
const std::vector<double> &p1,
@@ -58,6 +60,31 @@ TEST(DistanceFunctionsTest, euclidean_gives_expected_score)
EXPECT_EQ(threshold, 0.25);
}
+TEST(DistanceFunctionsTest, euclidean_int8_smoketest)
+{
+ auto ct = vespalib::eval::CellType::INT8;
+
+ auto euclid = make_distance_function(DistanceMetric::Euclidean, ct);
+
+ std::vector<double> p00{0.0, 0.0, 0.0};
+ std::vector<Int8Float> p0{0.0, 0.0, 0.0};
+ std::vector<Int8Float> p1{1.0, 0.0, 0.0};
+ std::vector<Int8Float> p5{0.0,-1.0, 0.0};
+ std::vector<Int8Float> p7{-1.0, 2.0, -2.0};
+
+ EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p0), t(p1)));
+ EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p0), t(p5)));
+ EXPECT_DOUBLE_EQ(9.0, euclid->calc(t(p0), t(p7)));
+
+ EXPECT_DOUBLE_EQ(2.0, euclid->calc(t(p1), t(p5)));
+ EXPECT_DOUBLE_EQ(12.0, euclid->calc(t(p1), t(p7)));
+ EXPECT_DOUBLE_EQ(14.0, euclid->calc(t(p5), t(p7)));
+
+ EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p00), t(p1)));
+ EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p00), t(p5)));
+ EXPECT_DOUBLE_EQ(9.0, euclid->calc(t(p00), t(p7)));
+}
+
TEST(DistanceFunctionsTest, angular_gives_expected_score)
{
auto ct = vespalib::eval::CellType::DOUBLE;
@@ -212,6 +239,11 @@ TEST(DistanceFunctionsTest, hamming_gives_expected_score)
EXPECT_DOUBLE_EQ(threshold, 0.5);
threshold = hamming->convert_threshold(1.0);
EXPECT_DOUBLE_EQ(threshold, 1.0);
+
+ std::vector<Int8Float> bytes_a = { 0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 1, 2 };
+ std::vector<Int8Float> bytes_b = { 1, 2, 2, 4, 8, 16, 32, 65, -128, 0, 1, 0, 4, 8, 16, 32, 64, -128, 0, 1, -1 };
+ // expect diff: 1 2 1 1 7
+ EXPECT_EQ(hamming->calc(TypedCells(bytes_a), TypedCells(bytes_b)), 12.0);
}
TEST(GeoDegreesTest, gives_expected_score)
diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
index 20dc55df329..3f7ec140781 100644
--- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp
@@ -51,7 +51,6 @@ struct LevelGenerator : public RandomLevelGenerator {
};
using FloatVectors = MyDocVectorAccess<float>;
-using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>;
using HnswIndexUP = std::unique_ptr<HnswIndex>;
class HnswIndexTest : public ::testing::Test {
@@ -79,7 +78,7 @@ public:
void init(bool heuristic_select_neighbors) {
auto generator = std::make_unique<LevelGenerator>();
level_generator = generator.get();
- index = std::make_unique<HnswIndex>(vectors, std::make_unique<FloatSqEuclideanDistance>(),
+ index = std::make_unique<HnswIndex>(vectors, std::make_unique<SquaredEuclideanDistance>(vespalib::eval::CellType::FLOAT),
std::move(generator),
HnswIndex::Config(5, 2, 10, 0, heuristic_select_neighbors));
}
diff --git a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp
index 8e3bb95a776..154950822ee 100644
--- a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp
+++ b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp
@@ -117,7 +117,7 @@ public:
}
};
-using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>;
+using FloatSqEuclideanDistance = SquaredEuclideanDistanceHW<float>;
using HnswIndexUP = std::unique_ptr<HnswIndex>;
class Stressor : public ::testing::Test {
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
index 4ec23b993b6..68af090afb6 100644
--- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp
@@ -4,7 +4,7 @@
#include "nearest_neighbor_blueprint.h"
#include "nearest_neighbor_iterator.h"
#include "nns_index_iterator.h"
-#include <vespa/eval/eval/dense_cells_value.h>
+#include <vespa/eval/eval/fast_value.h>
#include <vespa/searchlib/fef/termfieldmatchdataarray.h>
#include <vespa/searchlib/tensor/dense_tensor_attribute.h>
#include <vespa/searchlib/tensor/distance_function_factory.h>
@@ -12,35 +12,43 @@
LOG_SETUP(".searchlib.queryeval.nearest_neighbor_blueprint");
-using vespalib::eval::DenseCellsValue;
+using vespalib::eval::CellType;
+using vespalib::eval::FastValueBuilderFactory;
+using vespalib::eval::TypedCells;
using vespalib::eval::Value;
+using vespalib::eval::ValueType;
namespace search::queryeval {
namespace {
template<typename LCT, typename RCT>
-void
-convert_cells(std::unique_ptr<Value> &original, const vespalib::eval::ValueType &want_type)
+std::unique_ptr<Value>
+convert_cells(const ValueType &new_type, std::unique_ptr<Value> old_value)
{
- if constexpr (std::is_same<LCT,RCT>::value) {
- return;
- } else {
- auto old_cells = original->cells().typify<LCT>();
- std::vector<RCT> new_cells;
- new_cells.reserve(old_cells.size());
- for (LCT value : old_cells) {
- RCT conv(value);
- new_cells.push_back(conv);
- }
- original = std::make_unique<DenseCellsValue<RCT>>(want_type, std::move(new_cells));
+ auto old_cells = old_value->cells().typify<LCT>();
+ auto builder = FastValueBuilderFactory::get().create_value_builder<RCT>(new_type);
+ auto new_cells = builder->add_subspace();
+ assert(old_cells.size() == new_cells.size());
+ auto p = new_cells.begin();
+ for (LCT value : old_cells) {
+ RCT conv(value);
+ *p++ = conv;
}
+ return builder->build(std::move(builder));
}
struct ConvertCellsSelector
{
template <typename LCT, typename RCT>
- static auto invoke() { return convert_cells<LCT, RCT>; }
+ static auto invoke(const ValueType &new_type, std::unique_ptr<Value> old_value) {
+ return convert_cells<LCT, RCT>(new_type, std::move(old_value));
+ }
+ auto operator() (CellType from, CellType to, std::unique_ptr<Value> old_value) const {
+ using MyTypify = vespalib::eval::TypifyCellType;
+ ValueType new_type = old_value->type().cell_cast(to);
+ return vespalib::typify_invoke<2,MyTypify,ConvertCellsSelector>(from, to, new_type, std::move(old_value));
+ }
};
} // namespace <unnamed>
@@ -63,12 +71,8 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f
_found_hits(),
_global_filter(GlobalFilter::create())
{
- auto lct = _query_tensor->cells().type;
- auto rct = _attr_tensor.getTensorType().cell_type();
- using MyTypify = vespalib::eval::TypifyCellType;
- auto fixup_fun = vespalib::typify_invoke<2,MyTypify,ConvertCellsSelector>(lct, rct);
- fixup_fun(_query_tensor, _attr_tensor.getTensorType());
- _fallback_dist_fun = search::tensor::make_distance_function(_attr_tensor.distance_metric(), rct);
+ CellType attr_ct = _attr_tensor.getTensorType().cell_type();
+ _fallback_dist_fun = search::tensor::make_distance_function(_attr_tensor.distance_metric(), attr_ct);
_dist_fun = _fallback_dist_fun.get();
assert(_dist_fun);
auto nns_index = _attr_tensor.nearest_neighbor_index();
@@ -76,6 +80,12 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f
_dist_fun = nns_index->distance_function();
assert(_dist_fun);
}
+ auto query_ct = _query_tensor->cells().type;
+ CellType required_ct = _dist_fun->expected_cell_type();
+ if (query_ct != required_ct) {
+ ConvertCellsSelector converter;
+ _query_tensor = converter(query_ct, required_ct, std::move(_query_tensor));
+ }
if (distance_threshold < std::numeric_limits<double>::max()) {
_distance_threshold = _dist_fun->convert_threshold(distance_threshold);
_distance_heap.set_distance_threshold(_distance_threshold);
@@ -123,18 +133,13 @@ NearestNeighborBlueprint::perform_top_k()
{
auto nns_index = _attr_tensor.nearest_neighbor_index();
if (_approximate && nns_index) {
- auto lhs_type = _query_tensor->type();
- auto rhs_type = _attr_tensor.getTensorType();
- // different cell types should be converted already
- if (lhs_type == rhs_type) {
- auto lhs = _query_tensor->cells();
- uint32_t k = _target_num_hits;
- if (_global_filter->has_filter()) {
- auto filter = _global_filter->filter();
- _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold);
- } else {
- _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold);
- }
+ auto lhs = _query_tensor->cells();
+ uint32_t k = _target_num_hits;
+ if (_global_filter->has_filter()) {
+ auto filter = _global_filter->filter();
+ _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold);
+ } else {
+ _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold);
}
}
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp
index d85da49c2f7..6379a06e6d6 100644
--- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp
@@ -16,7 +16,7 @@ bool
is_compatible(const vespalib::eval::ValueType& lhs,
const vespalib::eval::ValueType& rhs)
{
- return (lhs == rhs);
+ return (lhs.dimensions() == rhs.dimensions());
}
}
diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
index 8af77a57a44..46bafa189e1 100644
--- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt
@@ -1,6 +1,7 @@
# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_library(searchlib_tensor OBJECT
SOURCES
+ angular_distance.cpp
default_nearest_neighbor_index_factory.cpp
dense_tensor_attribute.cpp
dense_tensor_attribute_saver.cpp
@@ -9,13 +10,16 @@ vespa_add_library(searchlib_tensor OBJECT
direct_tensor_saver.cpp
direct_tensor_store.cpp
distance_function_factory.cpp
- distance_functions.cpp
+ euclidean_distance.cpp
+ geo_degrees_distance.cpp
+ hamming_distance.cpp
hnsw_graph.cpp
hnsw_index.cpp
hnsw_index_loader.cpp
hnsw_index_saver.cpp
imported_tensor_attribute_vector.cpp
imported_tensor_attribute_vector_read_guard.cpp
+ inner_product_distance.cpp
inv_log_level_generator.cpp
nearest_neighbor_index.cpp
nearest_neighbor_index_saver.cpp
diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
new file mode 100644
index 00000000000..21b2622283c
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp
@@ -0,0 +1,52 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "angular_distance.h"
+
+using vespalib::typify_invoke;
+using vespalib::eval::TypifyCellType;
+
+namespace search::tensor {
+
+namespace {
+
+struct CalcAngular {
+ template <typename LCT, typename RCT>
+ static double invoke(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs)
+ {
+ auto lhs_vector = lhs.unsafe_typify<LCT>();
+ auto rhs_vector = rhs.unsafe_typify<RCT>();
+
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ double a_norm_sq = 0.0;
+ double b_norm_sq = 0.0;
+ double dot_product = 0.0;
+ for (size_t i = 0; i < sz; ++i) {
+ double a = lhs_vector[i];
+ double b = rhs_vector[i];
+ a_norm_sq += a*a;
+ b_norm_sq += b*b;
+ dot_product += a*b;
+ }
+ double squared_norms = a_norm_sq * b_norm_sq;
+ double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0;
+ double cosine_similarity = dot_product / div;
+ double distance = 1.0 - cosine_similarity; // in range [0,2]
+ return std::max(0.0, distance);
+ }
+};
+
+}
+
+double
+AngularDistance::calc(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs) const
+{
+ return typify_invoke<2,TypifyCellType,CalcAngular>(lhs.type, rhs.type, lhs, rhs);
+}
+
+template class AngularDistanceHW<float>;
+template class AngularDistanceHW<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h
new file mode 100644
index 00000000000..ce1fa40b2a4
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.h
@@ -0,0 +1,76 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+#include <cmath>
+
+namespace search::tensor {
+
+/**
+ * Calculates angular distance between vectors
+ */
+class AngularDistance : public DistanceFunction {
+public:
+ AngularDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {}
+ double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override;
+ double convert_threshold(double threshold) const override {
+ double cosine_similarity = cos(threshold);
+ return 1.0 - cosine_similarity;
+ }
+ double to_rawscore(double distance) const override {
+ double cosine_similarity = 1.0 - distance;
+ // should be in in range [-1,1] but roundoff may cause problems:
+ cosine_similarity = std::min(1.0, cosine_similarity);
+ cosine_similarity = std::max(-1.0, cosine_similarity);
+ double angle_distance = acos(cosine_similarity); // in range [0,pi]
+ double score = 1.0 / (1.0 + angle_distance);
+ return score;
+ }
+ double calc_with_limit(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs,
+ double /*limit*/) const override
+ {
+ return calc(lhs, rhs);
+ }
+};
+
+/**
+ * Calculates angular distance between vectors
+ * Will use instruction optimal for the cpu it is running on
+ * when both vectors have the expected cell type.
+ */
+template <typename FloatType>
+class AngularDistanceHW : public AngularDistance {
+public:
+ AngularDistanceHW()
+ : AngularDistance(vespalib::eval::get_cell_type<FloatType>()),
+ _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator())
+ {
+ assert(expected_cell_type() == vespalib::eval::get_cell_type<FloatType>());
+ }
+ double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override {
+ constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>();
+ assert(lhs.type == expected && rhs.type == expected);
+ auto lhs_vector = lhs.typify<FloatType>();
+ auto rhs_vector = rhs.typify<FloatType>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ auto a = &lhs_vector[0];
+ auto b = &rhs_vector[0];
+ double a_norm_sq = _computer.dotProduct(a, a, sz);
+ double b_norm_sq = _computer.dotProduct(b, b, sz);
+ double squared_norms = a_norm_sq * b_norm_sq;
+ double dot_product = _computer.dotProduct(a, b, sz);
+ double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0;
+ double cosine_similarity = dot_product / div;
+ double distance = 1.0 - cosine_similarity; // in range [0,2]
+ return distance;
+ }
+private:
+ const vespalib::hwaccelrated::IAccelrated & _computer;
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h
index 724f83b6129..4c1cd2c5608 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function.h
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h
@@ -1,8 +1,9 @@
-// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include <memory>
+#include <vespa/eval/eval/cell_type.h>
namespace vespalib::eval { struct TypedCells; }
@@ -15,10 +16,20 @@ namespace search::tensor {
* The actual implementation must know which type the vectors are.
*/
class DistanceFunction {
+private:
+ vespalib::eval::CellType _expect_cell_type;
public:
using UP = std::unique_ptr<DistanceFunction>;
+
+ DistanceFunction(vespalib::eval::CellType expected) : _expect_cell_type(expected) {}
+
virtual ~DistanceFunction() {}
+ // input (query) vectors must be converted to this cell type:
+ vespalib::eval::CellType expected_cell_type() const {
+ return _expect_cell_type;
+ }
+
// calculate internal distance (comparable)
virtual double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const = 0;
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
index 81b27b56258..8ae9441ff11 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
+++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp
@@ -2,6 +2,10 @@
#include "distance_function_factory.h"
#include "distance_functions.h"
+#include <vespa/vespalib/util/typify.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".searchlib.tensor.distance_function_factory");
using search::attribute::DistanceMetric;
using vespalib::eval::CellType;
@@ -13,41 +17,28 @@ DistanceFunction::UP
make_distance_function(DistanceMetric variant, CellType cell_type)
{
switch (variant) {
- case DistanceMetric::Euclidean:
- if (cell_type == CellType::FLOAT) {
- return std::make_unique<SquaredEuclideanDistance<float>>();
- } else {
- return std::make_unique<SquaredEuclideanDistance<double>>();
- }
- break;
- case DistanceMetric::Angular:
- if (cell_type == CellType::FLOAT) {
- return std::make_unique<AngularDistance<float>>();
- } else {
- return std::make_unique<AngularDistance<double>>();
- }
- break;
- case DistanceMetric::GeoDegrees:
- if (cell_type == CellType::FLOAT) {
- return std::make_unique<GeoDegreesDistance<float>>();
- } else {
- return std::make_unique<GeoDegreesDistance<double>>();
- }
- break;
- case DistanceMetric::InnerProduct:
- if (cell_type == CellType::FLOAT) {
- return std::make_unique<InnerProductDistance<float>>();
- } else {
- return std::make_unique<InnerProductDistance<double>>();
- }
- break;
- case DistanceMetric::Hamming:
- if (cell_type == CellType::FLOAT) {
- return std::make_unique<HammingDistance<float>>();
- } else {
- return std::make_unique<HammingDistance<double>>();
- }
- break;
+ case DistanceMetric::Euclidean:
+ switch (cell_type) {
+ case CellType::FLOAT: return std::make_unique<SquaredEuclideanDistanceHW<float>>();
+ case CellType::DOUBLE: return std::make_unique<SquaredEuclideanDistanceHW<double>>();
+ default: return std::make_unique<SquaredEuclideanDistance>(CellType::FLOAT);
+ }
+ case DistanceMetric::Angular:
+ switch (cell_type) {
+ case CellType::FLOAT: return std::make_unique<AngularDistanceHW<float>>();
+ case CellType::DOUBLE: return std::make_unique<AngularDistanceHW<double>>();
+ default: return std::make_unique<AngularDistance>(CellType::FLOAT);
+ }
+ case DistanceMetric::GeoDegrees:
+ return std::make_unique<GeoDegreesDistance>(CellType::DOUBLE);
+ case DistanceMetric::InnerProduct:
+ switch (cell_type) {
+ case CellType::FLOAT: return std::make_unique<InnerProductDistanceHW<float>>();
+ case CellType::DOUBLE: return std::make_unique<InnerProductDistanceHW<double>>();
+ default: return std::make_unique<InnerProductDistance>(CellType::FLOAT);
+ }
+ case DistanceMetric::Hamming:
+ return std::make_unique<HammingDistance>(cell_type);
}
// not reached:
return DistanceFunction::UP();
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
deleted file mode 100644
index 8cf3a95db19..00000000000
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-#include "distance_functions.h"
-
-namespace search::tensor {
-
-template class SquaredEuclideanDistance<float>;
-template class SquaredEuclideanDistance<double>;
-
-template class AngularDistance<float>;
-template class AngularDistance<double>;
-
-template class InnerProductDistance<float>;
-template class InnerProductDistance<double>;
-
-template class GeoDegreesDistance<float>;
-template class GeoDegreesDistance<double>;
-
-template class HammingDistance<float>;
-template class HammingDistance<double>;
-
-}
diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
index 2557a51e0d7..8ad8c07bd3c 100644
--- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h
+++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h
@@ -1,245 +1,10 @@
-// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
#include "distance_function.h"
-#include <vespa/eval/eval/typed_cells.h>
-#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
-#include <cmath>
-
-namespace search::tensor {
-
-/**
- * Calculates the square of the standard Euclidean distance.
- * Will use instruction optimal for the cpu it is running on.
- */
-template <typename FloatType>
-class SquaredEuclideanDistance : public DistanceFunction {
-public:
- SquaredEuclideanDistance()
- : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator())
- {}
- double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override {
- auto lhs_vector = lhs.typify<FloatType>();
- auto rhs_vector = rhs.typify<FloatType>();
- size_t sz = lhs_vector.size();
- assert(sz == rhs_vector.size());
- return _computer.squaredEuclideanDistance(&lhs_vector[0], &rhs_vector[0], sz);
- }
- double convert_threshold(double threshold) const override {
- return threshold*threshold;
- }
- double to_rawscore(double distance) const override {
- double d = sqrt(distance);
- double score = 1.0 / (1.0 + d);
- return score;
- }
- double calc_with_limit(const vespalib::eval::TypedCells& lhs,
- const vespalib::eval::TypedCells& rhs,
- double limit) const override
- {
- auto lhs_vector = lhs.typify<FloatType>();
- auto rhs_vector = rhs.typify<FloatType>();
- double sum = 0.0;
- size_t sz = lhs_vector.size();
- assert(sz == rhs_vector.size());
- for (size_t i = 0; i < sz && sum <= limit; ++i) {
- double diff = lhs_vector[i] - rhs_vector[i];
- sum += diff*diff;
- }
- return sum;
- }
-
- const vespalib::hwaccelrated::IAccelrated & _computer;
-};
-
-/**
- * Calculates angular distance between vectors
- */
-template <typename FloatType>
-class AngularDistance : public DistanceFunction {
-public:
- AngularDistance()
- : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator())
- {}
- double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override {
- auto lhs_vector = lhs.typify<FloatType>();
- auto rhs_vector = rhs.typify<FloatType>();
- size_t sz = lhs_vector.size();
- assert(sz == rhs_vector.size());
- auto a = &lhs_vector[0];
- auto b = &rhs_vector[0];
- double a_norm_sq = _computer.dotProduct(a, a, sz);
- double b_norm_sq = _computer.dotProduct(b, b, sz);
- double squared_norms = a_norm_sq * b_norm_sq;
- double dot_product = _computer.dotProduct(a, b, sz);
- double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0;
- double cosine_similarity = dot_product / div;
- double distance = 1.0 - cosine_similarity; // in range [0,2]
- return distance;
- }
- double convert_threshold(double threshold) const override {
- double cosine_similarity = cos(threshold);
- return 1.0 - cosine_similarity;
- }
- double to_rawscore(double distance) const override {
- double cosine_similarity = 1.0 - distance;
- // should be in in range [-1,1] but roundoff may cause problems:
- cosine_similarity = std::min(1.0, cosine_similarity);
- cosine_similarity = std::max(-1.0, cosine_similarity);
- double angle_distance = acos(cosine_similarity); // in range [0,pi]
- double score = 1.0 / (1.0 + angle_distance);
- return score;
- }
- double calc_with_limit(const vespalib::eval::TypedCells& lhs,
- const vespalib::eval::TypedCells& rhs,
- double /*limit*/) const override
- {
- return calc(lhs, rhs);
- }
-
- const vespalib::hwaccelrated::IAccelrated & _computer;
-};
-
-/**
- * Calculates inner-product "distance" between vectors with assumed norm 1.
- * Should give same ordering as Angular distance, but is less expensive.
- */
-template <typename FloatType>
-class InnerProductDistance : public DistanceFunction {
-public:
- InnerProductDistance()
- : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator())
- {}
- double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override {
- auto lhs_vector = lhs.typify<FloatType>();
- auto rhs_vector = rhs.typify<FloatType>();
- size_t sz = lhs_vector.size();
- assert(sz == rhs_vector.size());
- double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz);
- return std::max(0.0, score);
- }
- double convert_threshold(double threshold) const override {
- return threshold;
- }
- double to_rawscore(double distance) const override {
- double score = 1.0 / (1.0 + distance);
- return score;
- }
- double calc_with_limit(const vespalib::eval::TypedCells& lhs,
- const vespalib::eval::TypedCells& rhs,
- double /*limit*/) const override
- {
- return calc(lhs, rhs);
- }
-
- const vespalib::hwaccelrated::IAccelrated & _computer;
-};
-
-/**
- * Calculates great-circle distance between Latitude/Longitude pairs,
- * measured in degrees. Output distance is measured in meters.
- * Uses the haversine formula directly from:
- * https://en.wikipedia.org/wiki/Haversine_formula
- **/
-template <typename FloatType>
-class GeoDegreesDistance : public DistanceFunction {
-public:
- // in km, as defined by IUGG, see:
- // https://en.wikipedia.org/wiki/Earth_radius#Mean_radius
- static constexpr double earth_mean_radius = 6371.0088;
- static constexpr double degrees_to_radians = M_PI / 180.0;
-
- GeoDegreesDistance() {}
- // haversine function:
- static double hav(double angle) {
- double s = sin(0.5*angle);
- return s*s;
- }
- double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override {
- auto lhs_vector = lhs.typify<FloatType>();
- auto rhs_vector = rhs.typify<FloatType>();
- assert(2 == lhs_vector.size());
- assert(2 == rhs_vector.size());
- // convert to radians:
- double lat_A = lhs_vector[0] * degrees_to_radians;
- double lat_B = rhs_vector[0] * degrees_to_radians;
- double lon_A = lhs_vector[1] * degrees_to_radians;
- double lon_B = rhs_vector[1] * degrees_to_radians;
-
- double lat_diff = lat_A - lat_B;
- double lon_diff = lon_A - lon_B;
-
- // haversines of differences:
- double hav_lat = hav(lat_diff);
- double hav_lon = hav(lon_diff);
-
- // haversine of central angle between the two points:
- double hav_central_angle = hav_lat + cos(lat_A)*cos(lat_B)*hav_lon;
- return hav_central_angle;
- }
- double convert_threshold(double threshold) const override {
- double half_angle = threshold / (2 * earth_mean_radius);
- double rt_hav = sin(half_angle);
- return rt_hav * rt_hav;
- }
- double to_rawscore(double distance) const override {
- double hav_diff = sqrt(distance);
- // distance in kilometers:
- double d = 2 * asin(hav_diff) * earth_mean_radius;
- // km to rawscore:
- return 1.0 / (1.0 + d);
- }
- double calc_with_limit(const vespalib::eval::TypedCells& lhs,
- const vespalib::eval::TypedCells& rhs,
- double /*limit*/) const override
- {
- return calc(lhs, rhs);
- }
-
-};
-
-/**
- * Calculates the Hamming distance defined as
- * "number of cells where the values are different"
- */
-template <typename FloatType>
-class HammingDistance : public DistanceFunction {
-public:
- HammingDistance() {}
- double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override {
- auto lhs_vector = lhs.typify<FloatType>();
- auto rhs_vector = rhs.typify<FloatType>();
- size_t sz = lhs_vector.size();
- assert(sz == rhs_vector.size());
- size_t sum = 0;
- for (size_t i = 0; i < sz; ++i) {
- sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1;
- }
- return (double)sum;
- }
- double convert_threshold(double threshold) const override {
- return threshold;
- }
- double to_rawscore(double distance) const override {
- double score = 1.0 / (1.0 + distance);
- return score;
- }
- double calc_with_limit(const vespalib::eval::TypedCells& lhs,
- const vespalib::eval::TypedCells& rhs,
- double limit) const override
- {
- auto lhs_vector = lhs.typify<FloatType>();
- auto rhs_vector = rhs.typify<FloatType>();
- size_t sz = lhs_vector.size();
- assert(sz == rhs_vector.size());
- size_t sum = 0;
- for (size_t i = 0; i < sz && sum <= limit; ++i) {
- sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1;
- }
- return (double)sum;
- }
-};
-
-
-}
+#include "angular_distance.h"
+#include "euclidean_distance.h"
+#include "geo_degrees_distance.h"
+#include "hamming_distance.h"
+#include "inner_product_distance.h"
diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
new file mode 100644
index 00000000000..223ceeed940
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp
@@ -0,0 +1,51 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "euclidean_distance.h"
+
+using vespalib::typify_invoke;
+using vespalib::eval::TypifyCellType;
+
+namespace search::tensor {
+
+namespace {
+
+struct CalcEuclidean {
+ template <typename LCT, typename RCT>
+ static double invoke(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs)
+ {
+ auto lhs_vector = lhs.unsafe_typify<LCT>();
+ auto rhs_vector = rhs.unsafe_typify<RCT>();
+ double sum = 0.0;
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ for (size_t i = 0; i < sz; ++i) {
+ double diff = lhs_vector[i] - rhs_vector[i];
+ sum += diff*diff;
+ }
+ return sum;
+ }
+};
+
+}
+
+double
+SquaredEuclideanDistance::calc(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs) const
+{
+ return typify_invoke<2,TypifyCellType,CalcEuclidean>(lhs.type, rhs.type, lhs, rhs);
+}
+
+double
+SquaredEuclideanDistance::calc_with_limit(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs,
+ double) const
+{
+ // maybe optimize this:
+ return typify_invoke<2,TypifyCellType,CalcEuclidean>(lhs.type, rhs.type, lhs, rhs);
+}
+
+template class SquaredEuclideanDistanceHW<float>;
+template class SquaredEuclideanDistanceHW<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h
new file mode 100644
index 00000000000..0669476c7aa
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h
@@ -0,0 +1,59 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+#include <cmath>
+
+namespace search::tensor {
+
+/**
+ * Calculates the square of the standard Euclidean distance.
+ */
+class SquaredEuclideanDistance : public DistanceFunction {
+public:
+ SquaredEuclideanDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {}
+ double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override;
+ double calc_with_limit(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs,
+ double limit) const override;
+ double convert_threshold(double threshold) const override {
+ return threshold*threshold;
+ }
+ double to_rawscore(double distance) const override {
+ double d = sqrt(distance);
+ double score = 1.0 / (1.0 + d);
+ return score;
+ }
+};
+
+/**
+ * Calculates the square of the standard Euclidean distance.
+ * Will use instruction optimal for the cpu it is running on
+ * when both vectors have the expected cell type.
+ */
+template <typename FloatType>
+class SquaredEuclideanDistanceHW : public SquaredEuclideanDistance {
+public:
+ SquaredEuclideanDistanceHW()
+ : SquaredEuclideanDistance(vespalib::eval::get_cell_type<FloatType>()),
+ _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator())
+ {
+ assert(expected_cell_type() == vespalib::eval::get_cell_type<FloatType>());
+ }
+ double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override {
+ constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>();
+ assert(lhs.type == expected && rhs.type == expected);
+ auto lhs_vector = lhs.typify<FloatType>();
+ auto rhs_vector = rhs.typify<FloatType>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ return _computer.squaredEuclideanDistance(&lhs_vector[0], &rhs_vector[0], sz);
+ }
+private:
+ const vespalib::hwaccelrated::IAccelrated & _computer;
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp
new file mode 100644
index 00000000000..946caa0d2cb
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp
@@ -0,0 +1,50 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "geo_degrees_distance.h"
+
+using vespalib::typify_invoke;
+using vespalib::eval::TypifyCellType;
+
+namespace search::tensor {
+
+namespace {
+
+struct CalcGeoDegrees {
+ template <typename LCT, typename RCT>
+ static double invoke(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs)
+ {
+ auto lhs_vector = lhs.unsafe_typify<LCT>();
+ auto rhs_vector = rhs.unsafe_typify<RCT>();
+
+ assert(2 == lhs_vector.size());
+ assert(2 == rhs_vector.size());
+ // convert to radians:
+ double lat_A = lhs_vector[0] * GeoDegreesDistance::degrees_to_radians;
+ double lat_B = rhs_vector[0] * GeoDegreesDistance::degrees_to_radians;
+ double lon_A = lhs_vector[1] * GeoDegreesDistance::degrees_to_radians;
+ double lon_B = rhs_vector[1] * GeoDegreesDistance::degrees_to_radians;
+
+ double lat_diff = lat_A - lat_B;
+ double lon_diff = lon_A - lon_B;
+
+ // haversines of differences:
+ double hav_lat = GeoDegreesDistance::hav(lat_diff);
+ double hav_lon = GeoDegreesDistance::hav(lon_diff);
+
+ // haversine of central angle between the two points:
+ double hav_central_angle = hav_lat + cos(lat_A)*cos(lat_B)*hav_lon;
+ return hav_central_angle;
+ }
+};
+
+}
+
+double
+GeoDegreesDistance::calc(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs) const
+{
+ return typify_invoke<2,TypifyCellType,CalcGeoDegrees>(lhs.type, rhs.type, lhs, rhs);
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h
new file mode 100644
index 00000000000..7ce69ef8aae
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h
@@ -0,0 +1,53 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+#include <vespa/vespalib/util/typify.h>
+#include <cmath>
+
+namespace search::tensor {
+
+/**
+ * Calculates great-circle distance between Latitude/Longitude pairs,
+ * measured in degrees. Output distance is measured in meters.
+ * Uses the haversine formula directly from:
+ * https://en.wikipedia.org/wiki/Haversine_formula
+ **/
+class GeoDegreesDistance : public DistanceFunction {
+public:
+ // in km, as defined by IUGG, see:
+ // https://en.wikipedia.org/wiki/Earth_radius#Mean_radius
+ static constexpr double earth_mean_radius = 6371.0088;
+ static constexpr double degrees_to_radians = M_PI / 180.0;
+
+ GeoDegreesDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {}
+ // haversine function:
+ static double hav(double angle) {
+ double s = sin(0.5*angle);
+ return s*s;
+ }
+ double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override;
+ double convert_threshold(double threshold) const override {
+ double half_angle = threshold / (2 * earth_mean_radius);
+ double rt_hav = sin(half_angle);
+ return rt_hav * rt_hav;
+ }
+ double to_rawscore(double distance) const override {
+ double hav_diff = sqrt(distance);
+ // distance in kilometers:
+ double d = 2 * asin(hav_diff) * earth_mean_radius;
+ // km to rawscore:
+ return 1.0 / (1.0 + d);
+ }
+ double calc_with_limit(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs,
+ double /*limit*/) const override
+ {
+ return calc(lhs, rhs);
+ }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp
new file mode 100644
index 00000000000..ef00321a145
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp
@@ -0,0 +1,61 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "hamming_distance.h"
+
+using vespalib::typify_invoke;
+using vespalib::eval::TypifyCellType;
+
+namespace search::tensor {
+
+namespace {
+
+struct CalcHamming {
+ template <typename LCT, typename RCT>
+ static double invoke(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs)
+ {
+ auto lhs_vector = lhs.unsafe_typify<LCT>();
+ auto rhs_vector = rhs.unsafe_typify<RCT>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ size_t sum = 0;
+ for (size_t i = 0; i < sz; ++i) {
+ sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1;
+ }
+ return (double)sum;
+ }
+};
+
+}
+
+double
+HammingDistance::calc(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs) const
+{
+ constexpr auto expected = vespalib::eval::CellType::INT8;
+ if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) {
+ const uint64_t *words_a = static_cast<const uint64_t *>(lhs.data);
+ const uint64_t *words_b = static_cast<const uint64_t *>(rhs.data);
+ size_t sum = 0;
+ size_t sz = lhs.size;
+ assert(sz == rhs.size);
+ size_t i = 0;
+ for (; i * 8 + 7 < sz; ++i) {
+ uint64_t xor_bits = words_a[i] ^ words_b[i];
+ sum += __builtin_popcountl(xor_bits);
+ }
+ if (__builtin_expect((i * 8 < sz), false)) {
+ const uint8_t *bytes_a = static_cast<const uint8_t *>(lhs.data);
+ const uint8_t *bytes_b = static_cast<const uint8_t *>(rhs.data);
+ for (i *= 8; i < sz; ++i) {
+ uint64_t xor_bits = bytes_a[i] ^ bytes_b[i];
+ sum += __builtin_popcountl(xor_bits);
+ }
+ }
+ return (double)sum;
+ } else {
+ return typify_invoke<2,TypifyCellType,CalcHamming>(lhs.type, rhs.type, lhs, rhs);
+ }
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.h b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h
new file mode 100644
index 00000000000..d92671e4922
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h
@@ -0,0 +1,38 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/vespalib/util/typify.h>
+#include <cmath>
+
+namespace search::tensor {
+
+/**
+ * Calculates the Hamming distance defined as
+ * "number of cells where the values are different"
+ * or (for int8 cells, aka binary data only)
+ * "number of bits that are different"
+ */
+class HammingDistance : public DistanceFunction {
+public:
+ HammingDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {}
+ double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override;
+ double convert_threshold(double threshold) const override {
+ return threshold;
+ }
+ double to_rawscore(double distance) const override {
+ double score = 1.0 / (1.0 + distance);
+ return score;
+ }
+ double calc_with_limit(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs,
+ double) const override
+ {
+ // consider optimizing:
+ return calc(lhs, rhs);
+ }
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp
new file mode 100644
index 00000000000..8a48c6c936a
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp
@@ -0,0 +1,44 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "inner_product_distance.h"
+
+using vespalib::typify_invoke;
+using vespalib::eval::TypifyCellType;
+
+namespace search::tensor {
+
+namespace {
+
+struct CalcInnerProduct {
+ template <typename LCT, typename RCT>
+ static double invoke(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs)
+ {
+ auto lhs_vector = lhs.unsafe_typify<LCT>();
+ auto rhs_vector = rhs.unsafe_typify<RCT>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ double dot_product = 0.0;
+ for (size_t i = 0; i < sz; ++i) {
+ double a = lhs_vector[i];
+ double b = rhs_vector[i];
+ dot_product += a*b;
+ }
+ double score = 1.0 - dot_product; // in range [0,2]
+ return std::max(0.0, score);
+ }
+};
+
+}
+
+double
+InnerProductDistance::calc(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs) const
+{
+ return typify_invoke<2,TypifyCellType,CalcInnerProduct>(lhs.type, rhs.type, lhs, rhs);
+}
+
+template class InnerProductDistanceHW<float>;
+template class InnerProductDistanceHW<double>;
+
+}
diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h
new file mode 100644
index 00000000000..981dedea141
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h
@@ -0,0 +1,64 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "distance_function.h"
+#include <vespa/eval/eval/typed_cells.h>
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+#include <cmath>
+
+namespace search::tensor {
+
+/**
+ * Calculates inner-product "distance" between vectors with assumed norm 1.
+ * Should give same ordering as Angular distance, but is less expensive.
+ */
+class InnerProductDistance : public DistanceFunction {
+public:
+ InnerProductDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {}
+ double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override;
+ double convert_threshold(double threshold) const override {
+ return threshold;
+ }
+ double to_rawscore(double distance) const override {
+ double score = 1.0 / (1.0 + distance);
+ return score;
+ }
+ double calc_with_limit(const vespalib::eval::TypedCells& lhs,
+ const vespalib::eval::TypedCells& rhs,
+ double /*limit*/) const override
+ {
+ return calc(lhs, rhs);
+ }
+};
+
+/**
+ * Calculates inner-product "distance" between vectors with assumed norm 1.
+ * Should give same ordering as Angular distance, but is less expensive.
+ * Will use instruction optimal for the cpu it is running on
+ * when both vectors have the expected cell type.
+ */
+template <typename FloatType>
+class InnerProductDistanceHW : public InnerProductDistance {
+public:
+ InnerProductDistanceHW()
+ : InnerProductDistance(vespalib::eval::get_cell_type<FloatType>()),
+ _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator())
+ {
+ assert(expected_cell_type() == vespalib::eval::get_cell_type<FloatType>());
+ }
+ double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override {
+ constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>();
+ assert(lhs.type == expected && rhs.type == expected);
+ auto lhs_vector = lhs.typify<FloatType>();
+ auto rhs_vector = rhs.typify<FloatType>();
+ size_t sz = lhs_vector.size();
+ assert(sz == rhs_vector.size());
+ double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz);
+ return std::max(0.0, score);
+ }
+private:
+ const vespalib::hwaccelrated::IAccelrated & _computer;
+};
+
+}