diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2021-04-14 11:07:37 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-14 11:07:37 +0200 |
commit | 53f821ab8ce9940dca4639738d0ca997016e47ed (patch) | |
tree | 6447e12a919fa69f84eccc33ff2a4ea710229d0b /searchlib | |
parent | 9f8a6c642871300d7c937b610bd32dfceb6b29ea (diff) | |
parent | 9dd87566bafd50e50036a68095cc24df04a20ee0 (diff) |
Merge pull request #17378 from vespa-engine/arnej/redo-nn-distance-for-new-cell-types-2
Arnej/redo nn distance for new cell types 2
Diffstat (limited to 'searchlib')
21 files changed, 673 insertions, 340 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index f1d910f2635..9621b93fd37 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -230,7 +230,7 @@ public: const search::tensor::DistanceFunction *distance_function() const override { - static search::tensor::SquaredEuclideanDistance<double> my_dist_fun; + static search::tensor::SquaredEuclideanDistance my_dist_fun(vespalib::eval::CellType::DOUBLE); return &my_dist_fun; } }; diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp index ee0a2aff80e..701f4c91ff2 100644 --- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp +++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp @@ -10,10 +10,12 @@ LOG_SETUP("distance_function_test"); using namespace search::tensor; +using vespalib::eval::Int8Float; using vespalib::eval::TypedCells; using search::attribute::DistanceMetric; -TypedCells t(const std::vector<double> &v) { return TypedCells(v); } +template <typename T> +TypedCells t(const std::vector<T> &v) { return TypedCells(v); } void verify_geo_miles(const DistanceFunction *dist_fun, const std::vector<double> &p1, @@ -58,6 +60,31 @@ TEST(DistanceFunctionsTest, euclidean_gives_expected_score) EXPECT_EQ(threshold, 0.25); } +TEST(DistanceFunctionsTest, euclidean_int8_smoketest) +{ + auto ct = vespalib::eval::CellType::INT8; + + auto euclid = make_distance_function(DistanceMetric::Euclidean, ct); + + std::vector<double> p00{0.0, 0.0, 0.0}; + std::vector<Int8Float> p0{0.0, 0.0, 0.0}; + std::vector<Int8Float> p1{1.0, 0.0, 0.0}; + std::vector<Int8Float> p5{0.0,-1.0, 0.0}; + std::vector<Int8Float> p7{-1.0, 2.0, -2.0}; + + EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p0), t(p1))); + EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p0), t(p5))); + EXPECT_DOUBLE_EQ(9.0, euclid->calc(t(p0), t(p7))); + + EXPECT_DOUBLE_EQ(2.0, euclid->calc(t(p1), t(p5))); + EXPECT_DOUBLE_EQ(12.0, euclid->calc(t(p1), t(p7))); + EXPECT_DOUBLE_EQ(14.0, euclid->calc(t(p5), t(p7))); + + EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p00), t(p1))); + EXPECT_DOUBLE_EQ(1.0, euclid->calc(t(p00), t(p5))); + EXPECT_DOUBLE_EQ(9.0, euclid->calc(t(p00), t(p7))); +} + TEST(DistanceFunctionsTest, angular_gives_expected_score) { auto ct = vespalib::eval::CellType::DOUBLE; @@ -212,6 +239,11 @@ TEST(DistanceFunctionsTest, hamming_gives_expected_score) EXPECT_DOUBLE_EQ(threshold, 0.5); threshold = hamming->convert_threshold(1.0); EXPECT_DOUBLE_EQ(threshold, 1.0); + + std::vector<Int8Float> bytes_a = { 0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 1, 2 }; + std::vector<Int8Float> bytes_b = { 1, 2, 2, 4, 8, 16, 32, 65, -128, 0, 1, 0, 4, 8, 16, 32, 64, -128, 0, 1, -1 }; + // expect diff: 1 2 1 1 7 + EXPECT_EQ(hamming->calc(TypedCells(bytes_a), TypedCells(bytes_b)), 12.0); } TEST(GeoDegreesTest, gives_expected_score) diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 20dc55df329..3f7ec140781 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -51,7 +51,6 @@ struct LevelGenerator : public RandomLevelGenerator { }; using FloatVectors = MyDocVectorAccess<float>; -using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>; using HnswIndexUP = std::unique_ptr<HnswIndex>; class HnswIndexTest : public ::testing::Test { @@ -79,7 +78,7 @@ public: void init(bool heuristic_select_neighbors) { auto generator = std::make_unique<LevelGenerator>(); level_generator = generator.get(); - index = std::make_unique<HnswIndex>(vectors, std::make_unique<FloatSqEuclideanDistance>(), + index = std::make_unique<HnswIndex>(vectors, std::make_unique<SquaredEuclideanDistance>(vespalib::eval::CellType::FLOAT), std::move(generator), HnswIndex::Config(5, 2, 10, 0, heuristic_select_neighbors)); } diff --git a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp index 8e3bb95a776..154950822ee 100644 --- a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp @@ -117,7 +117,7 @@ public: } }; -using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>; +using FloatSqEuclideanDistance = SquaredEuclideanDistanceHW<float>; using HnswIndexUP = std::unique_ptr<HnswIndex>; class Stressor : public ::testing::Test { diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp index 4ec23b993b6..68af090afb6 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp @@ -4,7 +4,7 @@ #include "nearest_neighbor_blueprint.h" #include "nearest_neighbor_iterator.h" #include "nns_index_iterator.h" -#include <vespa/eval/eval/dense_cells_value.h> +#include <vespa/eval/eval/fast_value.h> #include <vespa/searchlib/fef/termfieldmatchdataarray.h> #include <vespa/searchlib/tensor/dense_tensor_attribute.h> #include <vespa/searchlib/tensor/distance_function_factory.h> @@ -12,35 +12,43 @@ LOG_SETUP(".searchlib.queryeval.nearest_neighbor_blueprint"); -using vespalib::eval::DenseCellsValue; +using vespalib::eval::CellType; +using vespalib::eval::FastValueBuilderFactory; +using vespalib::eval::TypedCells; using vespalib::eval::Value; +using vespalib::eval::ValueType; namespace search::queryeval { namespace { template<typename LCT, typename RCT> -void -convert_cells(std::unique_ptr<Value> &original, const vespalib::eval::ValueType &want_type) +std::unique_ptr<Value> +convert_cells(const ValueType &new_type, std::unique_ptr<Value> old_value) { - if constexpr (std::is_same<LCT,RCT>::value) { - return; - } else { - auto old_cells = original->cells().typify<LCT>(); - std::vector<RCT> new_cells; - new_cells.reserve(old_cells.size()); - for (LCT value : old_cells) { - RCT conv(value); - new_cells.push_back(conv); - } - original = std::make_unique<DenseCellsValue<RCT>>(want_type, std::move(new_cells)); + auto old_cells = old_value->cells().typify<LCT>(); + auto builder = FastValueBuilderFactory::get().create_value_builder<RCT>(new_type); + auto new_cells = builder->add_subspace(); + assert(old_cells.size() == new_cells.size()); + auto p = new_cells.begin(); + for (LCT value : old_cells) { + RCT conv(value); + *p++ = conv; } + return builder->build(std::move(builder)); } struct ConvertCellsSelector { template <typename LCT, typename RCT> - static auto invoke() { return convert_cells<LCT, RCT>; } + static auto invoke(const ValueType &new_type, std::unique_ptr<Value> old_value) { + return convert_cells<LCT, RCT>(new_type, std::move(old_value)); + } + auto operator() (CellType from, CellType to, std::unique_ptr<Value> old_value) const { + using MyTypify = vespalib::eval::TypifyCellType; + ValueType new_type = old_value->type().cell_cast(to); + return vespalib::typify_invoke<2,MyTypify,ConvertCellsSelector>(from, to, new_type, std::move(old_value)); + } }; } // namespace <unnamed> @@ -63,12 +71,8 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f _found_hits(), _global_filter(GlobalFilter::create()) { - auto lct = _query_tensor->cells().type; - auto rct = _attr_tensor.getTensorType().cell_type(); - using MyTypify = vespalib::eval::TypifyCellType; - auto fixup_fun = vespalib::typify_invoke<2,MyTypify,ConvertCellsSelector>(lct, rct); - fixup_fun(_query_tensor, _attr_tensor.getTensorType()); - _fallback_dist_fun = search::tensor::make_distance_function(_attr_tensor.distance_metric(), rct); + CellType attr_ct = _attr_tensor.getTensorType().cell_type(); + _fallback_dist_fun = search::tensor::make_distance_function(_attr_tensor.distance_metric(), attr_ct); _dist_fun = _fallback_dist_fun.get(); assert(_dist_fun); auto nns_index = _attr_tensor.nearest_neighbor_index(); @@ -76,6 +80,12 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f _dist_fun = nns_index->distance_function(); assert(_dist_fun); } + auto query_ct = _query_tensor->cells().type; + CellType required_ct = _dist_fun->expected_cell_type(); + if (query_ct != required_ct) { + ConvertCellsSelector converter; + _query_tensor = converter(query_ct, required_ct, std::move(_query_tensor)); + } if (distance_threshold < std::numeric_limits<double>::max()) { _distance_threshold = _dist_fun->convert_threshold(distance_threshold); _distance_heap.set_distance_threshold(_distance_threshold); @@ -123,18 +133,13 @@ NearestNeighborBlueprint::perform_top_k() { auto nns_index = _attr_tensor.nearest_neighbor_index(); if (_approximate && nns_index) { - auto lhs_type = _query_tensor->type(); - auto rhs_type = _attr_tensor.getTensorType(); - // different cell types should be converted already - if (lhs_type == rhs_type) { - auto lhs = _query_tensor->cells(); - uint32_t k = _target_num_hits; - if (_global_filter->has_filter()) { - auto filter = _global_filter->filter(); - _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold); - } else { - _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold); - } + auto lhs = _query_tensor->cells(); + uint32_t k = _target_num_hits; + if (_global_filter->has_filter()) { + auto filter = _global_filter->filter(); + _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold); + } else { + _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold); } } } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp index d85da49c2f7..6379a06e6d6 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp @@ -16,7 +16,7 @@ bool is_compatible(const vespalib::eval::ValueType& lhs, const vespalib::eval::ValueType& rhs) { - return (lhs == rhs); + return (lhs.dimensions() == rhs.dimensions()); } } diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 8af77a57a44..46bafa189e1 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -1,6 +1,7 @@ # Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchlib_tensor OBJECT SOURCES + angular_distance.cpp default_nearest_neighbor_index_factory.cpp dense_tensor_attribute.cpp dense_tensor_attribute_saver.cpp @@ -9,13 +10,16 @@ vespa_add_library(searchlib_tensor OBJECT direct_tensor_saver.cpp direct_tensor_store.cpp distance_function_factory.cpp - distance_functions.cpp + euclidean_distance.cpp + geo_degrees_distance.cpp + hamming_distance.cpp hnsw_graph.cpp hnsw_index.cpp hnsw_index_loader.cpp hnsw_index_saver.cpp imported_tensor_attribute_vector.cpp imported_tensor_attribute_vector_read_guard.cpp + inner_product_distance.cpp inv_log_level_generator.cpp nearest_neighbor_index.cpp nearest_neighbor_index_saver.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp new file mode 100644 index 00000000000..21b2622283c --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp @@ -0,0 +1,52 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "angular_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcAngular { + template <typename LCT, typename RCT> + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify<LCT>(); + auto rhs_vector = rhs.unsafe_typify<RCT>(); + + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + double a_norm_sq = 0.0; + double b_norm_sq = 0.0; + double dot_product = 0.0; + for (size_t i = 0; i < sz; ++i) { + double a = lhs_vector[i]; + double b = rhs_vector[i]; + a_norm_sq += a*a; + b_norm_sq += b*b; + dot_product += a*b; + } + double squared_norms = a_norm_sq * b_norm_sq; + double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; + double cosine_similarity = dot_product / div; + double distance = 1.0 - cosine_similarity; // in range [0,2] + return std::max(0.0, distance); + } +}; + +} + +double +AngularDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + return typify_invoke<2,TypifyCellType,CalcAngular>(lhs.type, rhs.type, lhs, rhs); +} + +template class AngularDistanceHW<float>; +template class AngularDistanceHW<double>; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h new file mode 100644 index 00000000000..ce1fa40b2a4 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.h @@ -0,0 +1,76 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> +#include <cmath> + +namespace search::tensor { + +/** + * Calculates angular distance between vectors + */ +class AngularDistance : public DistanceFunction { +public: + AngularDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double convert_threshold(double threshold) const override { + double cosine_similarity = cos(threshold); + return 1.0 - cosine_similarity; + } + double to_rawscore(double distance) const override { + double cosine_similarity = 1.0 - distance; + // should be in in range [-1,1] but roundoff may cause problems: + cosine_similarity = std::min(1.0, cosine_similarity); + cosine_similarity = std::max(-1.0, cosine_similarity); + double angle_distance = acos(cosine_similarity); // in range [0,pi] + double score = 1.0 / (1.0 + angle_distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } +}; + +/** + * Calculates angular distance between vectors + * Will use instruction optimal for the cpu it is running on + * when both vectors have the expected cell type. + */ +template <typename FloatType> +class AngularDistanceHW : public AngularDistance { +public: + AngularDistanceHW() + : AngularDistance(vespalib::eval::get_cell_type<FloatType>()), + _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + { + assert(expected_cell_type() == vespalib::eval::get_cell_type<FloatType>()); + } + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>(); + assert(lhs.type == expected && rhs.type == expected); + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + auto a = &lhs_vector[0]; + auto b = &rhs_vector[0]; + double a_norm_sq = _computer.dotProduct(a, a, sz); + double b_norm_sq = _computer.dotProduct(b, b, sz); + double squared_norms = a_norm_sq * b_norm_sq; + double dot_product = _computer.dotProduct(a, b, sz); + double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; + double cosine_similarity = dot_product / div; + double distance = 1.0 - cosine_similarity; // in range [0,2] + return distance; + } +private: + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h index 724f83b6129..4c1cd2c5608 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h @@ -1,8 +1,9 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include <memory> +#include <vespa/eval/eval/cell_type.h> namespace vespalib::eval { struct TypedCells; } @@ -15,10 +16,20 @@ namespace search::tensor { * The actual implementation must know which type the vectors are. */ class DistanceFunction { +private: + vespalib::eval::CellType _expect_cell_type; public: using UP = std::unique_ptr<DistanceFunction>; + + DistanceFunction(vespalib::eval::CellType expected) : _expect_cell_type(expected) {} + virtual ~DistanceFunction() {} + // input (query) vectors must be converted to this cell type: + vespalib::eval::CellType expected_cell_type() const { + return _expect_cell_type; + } + // calculate internal distance (comparable) virtual double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const = 0; diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index 81b27b56258..8ae9441ff11 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -2,6 +2,10 @@ #include "distance_function_factory.h" #include "distance_functions.h" +#include <vespa/vespalib/util/typify.h> +#include <vespa/log/log.h> + +LOG_SETUP(".searchlib.tensor.distance_function_factory"); using search::attribute::DistanceMetric; using vespalib::eval::CellType; @@ -13,41 +17,28 @@ DistanceFunction::UP make_distance_function(DistanceMetric variant, CellType cell_type) { switch (variant) { - case DistanceMetric::Euclidean: - if (cell_type == CellType::FLOAT) { - return std::make_unique<SquaredEuclideanDistance<float>>(); - } else { - return std::make_unique<SquaredEuclideanDistance<double>>(); - } - break; - case DistanceMetric::Angular: - if (cell_type == CellType::FLOAT) { - return std::make_unique<AngularDistance<float>>(); - } else { - return std::make_unique<AngularDistance<double>>(); - } - break; - case DistanceMetric::GeoDegrees: - if (cell_type == CellType::FLOAT) { - return std::make_unique<GeoDegreesDistance<float>>(); - } else { - return std::make_unique<GeoDegreesDistance<double>>(); - } - break; - case DistanceMetric::InnerProduct: - if (cell_type == CellType::FLOAT) { - return std::make_unique<InnerProductDistance<float>>(); - } else { - return std::make_unique<InnerProductDistance<double>>(); - } - break; - case DistanceMetric::Hamming: - if (cell_type == CellType::FLOAT) { - return std::make_unique<HammingDistance<float>>(); - } else { - return std::make_unique<HammingDistance<double>>(); - } - break; + case DistanceMetric::Euclidean: + switch (cell_type) { + case CellType::FLOAT: return std::make_unique<SquaredEuclideanDistanceHW<float>>(); + case CellType::DOUBLE: return std::make_unique<SquaredEuclideanDistanceHW<double>>(); + default: return std::make_unique<SquaredEuclideanDistance>(CellType::FLOAT); + } + case DistanceMetric::Angular: + switch (cell_type) { + case CellType::FLOAT: return std::make_unique<AngularDistanceHW<float>>(); + case CellType::DOUBLE: return std::make_unique<AngularDistanceHW<double>>(); + default: return std::make_unique<AngularDistance>(CellType::FLOAT); + } + case DistanceMetric::GeoDegrees: + return std::make_unique<GeoDegreesDistance>(CellType::DOUBLE); + case DistanceMetric::InnerProduct: + switch (cell_type) { + case CellType::FLOAT: return std::make_unique<InnerProductDistanceHW<float>>(); + case CellType::DOUBLE: return std::make_unique<InnerProductDistanceHW<double>>(); + default: return std::make_unique<InnerProductDistance>(CellType::FLOAT); + } + case DistanceMetric::Hamming: + return std::make_unique<HammingDistance>(cell_type); } // not reached: return DistanceFunction::UP(); diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp deleted file mode 100644 index 8cf3a95db19..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "distance_functions.h" - -namespace search::tensor { - -template class SquaredEuclideanDistance<float>; -template class SquaredEuclideanDistance<double>; - -template class AngularDistance<float>; -template class AngularDistance<double>; - -template class InnerProductDistance<float>; -template class InnerProductDistance<double>; - -template class GeoDegreesDistance<float>; -template class GeoDegreesDistance<double>; - -template class HammingDistance<float>; -template class HammingDistance<double>; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index 2557a51e0d7..8ad8c07bd3c 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -1,245 +1,10 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include "distance_function.h" -#include <vespa/eval/eval/typed_cells.h> -#include <vespa/vespalib/hwaccelrated/iaccelrated.h> -#include <cmath> - -namespace search::tensor { - -/** - * Calculates the square of the standard Euclidean distance. - * Will use instruction optimal for the cpu it is running on. - */ -template <typename FloatType> -class SquaredEuclideanDistance : public DistanceFunction { -public: - SquaredEuclideanDistance() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify<FloatType>(); - auto rhs_vector = rhs.typify<FloatType>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - return _computer.squaredEuclideanDistance(&lhs_vector[0], &rhs_vector[0], sz); - } - double convert_threshold(double threshold) const override { - return threshold*threshold; - } - double to_rawscore(double distance) const override { - double d = sqrt(distance); - double score = 1.0 / (1.0 + d); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double limit) const override - { - auto lhs_vector = lhs.typify<FloatType>(); - auto rhs_vector = rhs.typify<FloatType>(); - double sum = 0.0; - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - for (size_t i = 0; i < sz && sum <= limit; ++i) { - double diff = lhs_vector[i] - rhs_vector[i]; - sum += diff*diff; - } - return sum; - } - - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -/** - * Calculates angular distance between vectors - */ -template <typename FloatType> -class AngularDistance : public DistanceFunction { -public: - AngularDistance() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify<FloatType>(); - auto rhs_vector = rhs.typify<FloatType>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - auto a = &lhs_vector[0]; - auto b = &rhs_vector[0]; - double a_norm_sq = _computer.dotProduct(a, a, sz); - double b_norm_sq = _computer.dotProduct(b, b, sz); - double squared_norms = a_norm_sq * b_norm_sq; - double dot_product = _computer.dotProduct(a, b, sz); - double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; - double cosine_similarity = dot_product / div; - double distance = 1.0 - cosine_similarity; // in range [0,2] - return distance; - } - double convert_threshold(double threshold) const override { - double cosine_similarity = cos(threshold); - return 1.0 - cosine_similarity; - } - double to_rawscore(double distance) const override { - double cosine_similarity = 1.0 - distance; - // should be in in range [-1,1] but roundoff may cause problems: - cosine_similarity = std::min(1.0, cosine_similarity); - cosine_similarity = std::max(-1.0, cosine_similarity); - double angle_distance = acos(cosine_similarity); // in range [0,pi] - double score = 1.0 / (1.0 + angle_distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } - - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -/** - * Calculates inner-product "distance" between vectors with assumed norm 1. - * Should give same ordering as Angular distance, but is less expensive. - */ -template <typename FloatType> -class InnerProductDistance : public DistanceFunction { -public: - InnerProductDistance() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify<FloatType>(); - auto rhs_vector = rhs.typify<FloatType>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz); - return std::max(0.0, score); - } - double convert_threshold(double threshold) const override { - return threshold; - } - double to_rawscore(double distance) const override { - double score = 1.0 / (1.0 + distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } - - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -/** - * Calculates great-circle distance between Latitude/Longitude pairs, - * measured in degrees. Output distance is measured in meters. - * Uses the haversine formula directly from: - * https://en.wikipedia.org/wiki/Haversine_formula - **/ -template <typename FloatType> -class GeoDegreesDistance : public DistanceFunction { -public: - // in km, as defined by IUGG, see: - // https://en.wikipedia.org/wiki/Earth_radius#Mean_radius - static constexpr double earth_mean_radius = 6371.0088; - static constexpr double degrees_to_radians = M_PI / 180.0; - - GeoDegreesDistance() {} - // haversine function: - static double hav(double angle) { - double s = sin(0.5*angle); - return s*s; - } - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify<FloatType>(); - auto rhs_vector = rhs.typify<FloatType>(); - assert(2 == lhs_vector.size()); - assert(2 == rhs_vector.size()); - // convert to radians: - double lat_A = lhs_vector[0] * degrees_to_radians; - double lat_B = rhs_vector[0] * degrees_to_radians; - double lon_A = lhs_vector[1] * degrees_to_radians; - double lon_B = rhs_vector[1] * degrees_to_radians; - - double lat_diff = lat_A - lat_B; - double lon_diff = lon_A - lon_B; - - // haversines of differences: - double hav_lat = hav(lat_diff); - double hav_lon = hav(lon_diff); - - // haversine of central angle between the two points: - double hav_central_angle = hav_lat + cos(lat_A)*cos(lat_B)*hav_lon; - return hav_central_angle; - } - double convert_threshold(double threshold) const override { - double half_angle = threshold / (2 * earth_mean_radius); - double rt_hav = sin(half_angle); - return rt_hav * rt_hav; - } - double to_rawscore(double distance) const override { - double hav_diff = sqrt(distance); - // distance in kilometers: - double d = 2 * asin(hav_diff) * earth_mean_radius; - // km to rawscore: - return 1.0 / (1.0 + d); - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } - -}; - -/** - * Calculates the Hamming distance defined as - * "number of cells where the values are different" - */ -template <typename FloatType> -class HammingDistance : public DistanceFunction { -public: - HammingDistance() {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify<FloatType>(); - auto rhs_vector = rhs.typify<FloatType>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - size_t sum = 0; - for (size_t i = 0; i < sz; ++i) { - sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; - } - return (double)sum; - } - double convert_threshold(double threshold) const override { - return threshold; - } - double to_rawscore(double distance) const override { - double score = 1.0 / (1.0 + distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double limit) const override - { - auto lhs_vector = lhs.typify<FloatType>(); - auto rhs_vector = rhs.typify<FloatType>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - size_t sum = 0; - for (size_t i = 0; i < sz && sum <= limit; ++i) { - sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; - } - return (double)sum; - } -}; - - -} +#include "angular_distance.h" +#include "euclidean_distance.h" +#include "geo_degrees_distance.h" +#include "hamming_distance.h" +#include "inner_product_distance.h" diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp new file mode 100644 index 00000000000..223ceeed940 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp @@ -0,0 +1,51 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "euclidean_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcEuclidean { + template <typename LCT, typename RCT> + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify<LCT>(); + auto rhs_vector = rhs.unsafe_typify<RCT>(); + double sum = 0.0; + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + for (size_t i = 0; i < sz; ++i) { + double diff = lhs_vector[i] - rhs_vector[i]; + sum += diff*diff; + } + return sum; + } +}; + +} + +double +SquaredEuclideanDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + return typify_invoke<2,TypifyCellType,CalcEuclidean>(lhs.type, rhs.type, lhs, rhs); +} + +double +SquaredEuclideanDistance::calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double) const +{ + // maybe optimize this: + return typify_invoke<2,TypifyCellType,CalcEuclidean>(lhs.type, rhs.type, lhs, rhs); +} + +template class SquaredEuclideanDistanceHW<float>; +template class SquaredEuclideanDistanceHW<double>; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h new file mode 100644 index 00000000000..0669476c7aa --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h @@ -0,0 +1,59 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> +#include <cmath> + +namespace search::tensor { + +/** + * Calculates the square of the standard Euclidean distance. + */ +class SquaredEuclideanDistance : public DistanceFunction { +public: + SquaredEuclideanDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double limit) const override; + double convert_threshold(double threshold) const override { + return threshold*threshold; + } + double to_rawscore(double distance) const override { + double d = sqrt(distance); + double score = 1.0 / (1.0 + d); + return score; + } +}; + +/** + * Calculates the square of the standard Euclidean distance. + * Will use instruction optimal for the cpu it is running on + * when both vectors have the expected cell type. + */ +template <typename FloatType> +class SquaredEuclideanDistanceHW : public SquaredEuclideanDistance { +public: + SquaredEuclideanDistanceHW() + : SquaredEuclideanDistance(vespalib::eval::get_cell_type<FloatType>()), + _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + { + assert(expected_cell_type() == vespalib::eval::get_cell_type<FloatType>()); + } + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>(); + assert(lhs.type == expected && rhs.type == expected); + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + return _computer.squaredEuclideanDistance(&lhs_vector[0], &rhs_vector[0], sz); + } +private: + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp new file mode 100644 index 00000000000..946caa0d2cb --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp @@ -0,0 +1,50 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "geo_degrees_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcGeoDegrees { + template <typename LCT, typename RCT> + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify<LCT>(); + auto rhs_vector = rhs.unsafe_typify<RCT>(); + + assert(2 == lhs_vector.size()); + assert(2 == rhs_vector.size()); + // convert to radians: + double lat_A = lhs_vector[0] * GeoDegreesDistance::degrees_to_radians; + double lat_B = rhs_vector[0] * GeoDegreesDistance::degrees_to_radians; + double lon_A = lhs_vector[1] * GeoDegreesDistance::degrees_to_radians; + double lon_B = rhs_vector[1] * GeoDegreesDistance::degrees_to_radians; + + double lat_diff = lat_A - lat_B; + double lon_diff = lon_A - lon_B; + + // haversines of differences: + double hav_lat = GeoDegreesDistance::hav(lat_diff); + double hav_lon = GeoDegreesDistance::hav(lon_diff); + + // haversine of central angle between the two points: + double hav_central_angle = hav_lat + cos(lat_A)*cos(lat_B)*hav_lon; + return hav_central_angle; + } +}; + +} + +double +GeoDegreesDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + return typify_invoke<2,TypifyCellType,CalcGeoDegrees>(lhs.type, rhs.type, lhs, rhs); +} + +} diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h new file mode 100644 index 00000000000..7ce69ef8aae --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h @@ -0,0 +1,53 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> +#include <vespa/vespalib/util/typify.h> +#include <cmath> + +namespace search::tensor { + +/** + * Calculates great-circle distance between Latitude/Longitude pairs, + * measured in degrees. Output distance is measured in meters. + * Uses the haversine formula directly from: + * https://en.wikipedia.org/wiki/Haversine_formula + **/ +class GeoDegreesDistance : public DistanceFunction { +public: + // in km, as defined by IUGG, see: + // https://en.wikipedia.org/wiki/Earth_radius#Mean_radius + static constexpr double earth_mean_radius = 6371.0088; + static constexpr double degrees_to_radians = M_PI / 180.0; + + GeoDegreesDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {} + // haversine function: + static double hav(double angle) { + double s = sin(0.5*angle); + return s*s; + } + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double convert_threshold(double threshold) const override { + double half_angle = threshold / (2 * earth_mean_radius); + double rt_hav = sin(half_angle); + return rt_hav * rt_hav; + } + double to_rawscore(double distance) const override { + double hav_diff = sqrt(distance); + // distance in kilometers: + double d = 2 * asin(hav_diff) * earth_mean_radius; + // km to rawscore: + return 1.0 / (1.0 + d); + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp new file mode 100644 index 00000000000..ef00321a145 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp @@ -0,0 +1,61 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "hamming_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcHamming { + template <typename LCT, typename RCT> + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify<LCT>(); + auto rhs_vector = rhs.unsafe_typify<RCT>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + size_t sum = 0; + for (size_t i = 0; i < sz; ++i) { + sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; + } + return (double)sum; + } +}; + +} + +double +HammingDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + constexpr auto expected = vespalib::eval::CellType::INT8; + if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { + const uint64_t *words_a = static_cast<const uint64_t *>(lhs.data); + const uint64_t *words_b = static_cast<const uint64_t *>(rhs.data); + size_t sum = 0; + size_t sz = lhs.size; + assert(sz == rhs.size); + size_t i = 0; + for (; i * 8 + 7 < sz; ++i) { + uint64_t xor_bits = words_a[i] ^ words_b[i]; + sum += __builtin_popcountl(xor_bits); + } + if (__builtin_expect((i * 8 < sz), false)) { + const uint8_t *bytes_a = static_cast<const uint8_t *>(lhs.data); + const uint8_t *bytes_b = static_cast<const uint8_t *>(rhs.data); + for (i *= 8; i < sz; ++i) { + uint64_t xor_bits = bytes_a[i] ^ bytes_b[i]; + sum += __builtin_popcountl(xor_bits); + } + } + return (double)sum; + } else { + return typify_invoke<2,TypifyCellType,CalcHamming>(lhs.type, rhs.type, lhs, rhs); + } +} + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.h b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h new file mode 100644 index 00000000000..d92671e4922 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h @@ -0,0 +1,38 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/vespalib/util/typify.h> +#include <cmath> + +namespace search::tensor { + +/** + * Calculates the Hamming distance defined as + * "number of cells where the values are different" + * or (for int8 cells, aka binary data only) + * "number of bits that are different" + */ +class HammingDistance : public DistanceFunction { +public: + HammingDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double convert_threshold(double threshold) const override { + return threshold; + } + double to_rawscore(double distance) const override { + double score = 1.0 / (1.0 + distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double) const override + { + // consider optimizing: + return calc(lhs, rhs); + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp new file mode 100644 index 00000000000..8a48c6c936a --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp @@ -0,0 +1,44 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "inner_product_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcInnerProduct { + template <typename LCT, typename RCT> + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify<LCT>(); + auto rhs_vector = rhs.unsafe_typify<RCT>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + double dot_product = 0.0; + for (size_t i = 0; i < sz; ++i) { + double a = lhs_vector[i]; + double b = rhs_vector[i]; + dot_product += a*b; + } + double score = 1.0 - dot_product; // in range [0,2] + return std::max(0.0, score); + } +}; + +} + +double +InnerProductDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + return typify_invoke<2,TypifyCellType,CalcInnerProduct>(lhs.type, rhs.type, lhs, rhs); +} + +template class InnerProductDistanceHW<float>; +template class InnerProductDistanceHW<double>; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h new file mode 100644 index 00000000000..981dedea141 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h @@ -0,0 +1,64 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> +#include <cmath> + +namespace search::tensor { + +/** + * Calculates inner-product "distance" between vectors with assumed norm 1. + * Should give same ordering as Angular distance, but is less expensive. + */ +class InnerProductDistance : public DistanceFunction { +public: + InnerProductDistance(vespalib::eval::CellType expected) : DistanceFunction(expected) {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double convert_threshold(double threshold) const override { + return threshold; + } + double to_rawscore(double distance) const override { + double score = 1.0 / (1.0 + distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } +}; + +/** + * Calculates inner-product "distance" between vectors with assumed norm 1. + * Should give same ordering as Angular distance, but is less expensive. + * Will use instruction optimal for the cpu it is running on + * when both vectors have the expected cell type. + */ +template <typename FloatType> +class InnerProductDistanceHW : public InnerProductDistance { +public: + InnerProductDistanceHW() + : InnerProductDistance(vespalib::eval::get_cell_type<FloatType>()), + _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + { + assert(expected_cell_type() == vespalib::eval::get_cell_type<FloatType>()); + } + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>(); + assert(lhs.type == expected && rhs.type == expected); + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz); + return std::max(0.0, score); + } +private: + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +} |