From 05391413632841596bd0cd8e40389a185461a0af Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Mon, 12 Apr 2021 11:22:25 +0000 Subject: fix NNS distance for new cell types This reverts commit f167fe4362c5e4e20a7605b99205cfbee77c569a. --- .../tensorattribute/tensorattribute_test.cpp | 2 +- .../distance_functions/distance_functions_test.cpp | 6 + .../tests/tensor/hnsw_index/hnsw_index_test.cpp | 3 +- .../src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp | 3 +- .../queryeval/nearest_neighbor_blueprint.cpp | 28 ++- .../queryeval/nearest_neighbor_iterator.cpp | 2 +- .../src/vespa/searchlib/tensor/CMakeLists.txt | 6 +- .../vespa/searchlib/tensor/angular_distance.cpp | 52 +++++ .../src/vespa/searchlib/tensor/angular_distance.h | 76 +++++++ .../src/vespa/searchlib/tensor/distance_function.h | 2 +- .../searchlib/tensor/distance_function_factory.cpp | 61 +++-- .../vespa/searchlib/tensor/distance_functions.cpp | 22 -- .../vespa/searchlib/tensor/distance_functions.h | 247 +-------------------- .../vespa/searchlib/tensor/euclidean_distance.cpp | 51 +++++ .../vespa/searchlib/tensor/euclidean_distance.h | 59 +++++ .../searchlib/tensor/geo_degrees_distance.cpp | 50 +++++ .../vespa/searchlib/tensor/geo_degrees_distance.h | 53 +++++ .../vespa/searchlib/tensor/hamming_distance.cpp | 61 +++++ .../src/vespa/searchlib/tensor/hamming_distance.h | 38 ++++ .../searchlib/tensor/inner_product_distance.cpp | 44 ++++ .../searchlib/tensor/inner_product_distance.h | 64 ++++++ 21 files changed, 609 insertions(+), 321 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/tensor/angular_distance.cpp create mode 100644 searchlib/src/vespa/searchlib/tensor/angular_distance.h delete mode 100644 searchlib/src/vespa/searchlib/tensor/distance_functions.cpp create mode 100644 searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp create mode 100644 searchlib/src/vespa/searchlib/tensor/euclidean_distance.h create mode 100644 searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp create mode 100644 searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h create mode 100644 searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp create mode 100644 searchlib/src/vespa/searchlib/tensor/hamming_distance.h create mode 100644 searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp create mode 100644 searchlib/src/vespa/searchlib/tensor/inner_product_distance.h diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index f1d910f2635..2a509031e24 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -230,7 +230,7 @@ public: const search::tensor::DistanceFunction *distance_function() const override { - static search::tensor::SquaredEuclideanDistance my_dist_fun; + static search::tensor::SquaredEuclideanDistance my_dist_fun; return &my_dist_fun; } }; diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp index ee0a2aff80e..6f8c9b4c885 100644 --- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp +++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp @@ -10,6 +10,7 @@ LOG_SETUP("distance_function_test"); using namespace search::tensor; +using vespalib::eval::Int8Float; using vespalib::eval::TypedCells; using search::attribute::DistanceMetric; @@ -212,6 +213,11 @@ TEST(DistanceFunctionsTest, hamming_gives_expected_score) EXPECT_DOUBLE_EQ(threshold, 0.5); threshold = hamming->convert_threshold(1.0); EXPECT_DOUBLE_EQ(threshold, 1.0); + + std::vector bytes_a = { 0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 1, 2 }; + std::vector bytes_b = { 1, 2, 2, 4, 8, 16, 32, 65, -128, 0, 1, 0, 4, 8, 16, 32, 64, -128, 0, 1, -1 }; + // expect diff: 1 2 1 1 7 + EXPECT_EQ(hamming->calc(TypedCells(bytes_a), TypedCells(bytes_b)), 12.0); } TEST(GeoDegreesTest, gives_expected_score) diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 20dc55df329..6ffe118aa65 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -51,7 +51,6 @@ struct LevelGenerator : public RandomLevelGenerator { }; using FloatVectors = MyDocVectorAccess; -using FloatSqEuclideanDistance = SquaredEuclideanDistance; using HnswIndexUP = std::unique_ptr; class HnswIndexTest : public ::testing::Test { @@ -79,7 +78,7 @@ public: void init(bool heuristic_select_neighbors) { auto generator = std::make_unique(); level_generator = generator.get(); - index = std::make_unique(vectors, std::make_unique(), + index = std::make_unique(vectors, std::make_unique(), std::move(generator), HnswIndex::Config(5, 2, 10, 0, heuristic_select_neighbors)); } diff --git a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp index 8e3bb95a776..7acdb4df983 100644 --- a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp @@ -117,7 +117,6 @@ public: } }; -using FloatSqEuclideanDistance = SquaredEuclideanDistance; using HnswIndexUP = std::unique_ptr; class Stressor : public ::testing::Test { @@ -232,7 +231,7 @@ public: void init() { uint32_t m = 16; - index = std::make_unique(vectors, std::make_unique(), + index = std::make_unique(vectors, std::make_unique(), std::make_unique(m), HnswIndex::Config(2*m, m, 200, 10, true)); } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp index 4ec23b993b6..8012c48a04a 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp @@ -65,9 +65,12 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f { auto lct = _query_tensor->cells().type; auto rct = _attr_tensor.getTensorType().cell_type(); - using MyTypify = vespalib::eval::TypifyCellType; - auto fixup_fun = vespalib::typify_invoke<2,MyTypify,ConvertCellsSelector>(lct, rct); - fixup_fun(_query_tensor, _attr_tensor.getTensorType()); + if (rct == vespalib::eval::CellType::FLOAT || rct == vespalib::eval::CellType::DOUBLE) { + // avoid downcasting to bfloat16 etc, that is just extra work + using MyTypify = vespalib::eval::TypifyCellType; + auto fixup_fun = vespalib::typify_invoke<2,MyTypify,ConvertCellsSelector>(lct, rct); + fixup_fun(_query_tensor, _attr_tensor.getTensorType()); + } _fallback_dist_fun = search::tensor::make_distance_function(_attr_tensor.distance_metric(), rct); _dist_fun = _fallback_dist_fun.get(); assert(_dist_fun); @@ -123,18 +126,13 @@ NearestNeighborBlueprint::perform_top_k() { auto nns_index = _attr_tensor.nearest_neighbor_index(); if (_approximate && nns_index) { - auto lhs_type = _query_tensor->type(); - auto rhs_type = _attr_tensor.getTensorType(); - // different cell types should be converted already - if (lhs_type == rhs_type) { - auto lhs = _query_tensor->cells(); - uint32_t k = _target_num_hits; - if (_global_filter->has_filter()) { - auto filter = _global_filter->filter(); - _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold); - } else { - _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold); - } + auto lhs = _query_tensor->cells(); + uint32_t k = _target_num_hits; + if (_global_filter->has_filter()) { + auto filter = _global_filter->filter(); + _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold); + } else { + _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold); } } } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp index d85da49c2f7..6379a06e6d6 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp @@ -16,7 +16,7 @@ bool is_compatible(const vespalib::eval::ValueType& lhs, const vespalib::eval::ValueType& rhs) { - return (lhs == rhs); + return (lhs.dimensions() == rhs.dimensions()); } } diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 8af77a57a44..46bafa189e1 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -1,6 +1,7 @@ # Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchlib_tensor OBJECT SOURCES + angular_distance.cpp default_nearest_neighbor_index_factory.cpp dense_tensor_attribute.cpp dense_tensor_attribute_saver.cpp @@ -9,13 +10,16 @@ vespa_add_library(searchlib_tensor OBJECT direct_tensor_saver.cpp direct_tensor_store.cpp distance_function_factory.cpp - distance_functions.cpp + euclidean_distance.cpp + geo_degrees_distance.cpp + hamming_distance.cpp hnsw_graph.cpp hnsw_index.cpp hnsw_index_loader.cpp hnsw_index_saver.cpp imported_tensor_attribute_vector.cpp imported_tensor_attribute_vector_read_guard.cpp + inner_product_distance.cpp inv_log_level_generator.cpp nearest_neighbor_index.cpp nearest_neighbor_index_saver.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp new file mode 100644 index 00000000000..21b2622283c --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp @@ -0,0 +1,52 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "angular_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcAngular { + template + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify(); + auto rhs_vector = rhs.unsafe_typify(); + + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + double a_norm_sq = 0.0; + double b_norm_sq = 0.0; + double dot_product = 0.0; + for (size_t i = 0; i < sz; ++i) { + double a = lhs_vector[i]; + double b = rhs_vector[i]; + a_norm_sq += a*a; + b_norm_sq += b*b; + dot_product += a*b; + } + double squared_norms = a_norm_sq * b_norm_sq; + double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; + double cosine_similarity = dot_product / div; + double distance = 1.0 - cosine_similarity; // in range [0,2] + return std::max(0.0, distance); + } +}; + +} + +double +AngularDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + return typify_invoke<2,TypifyCellType,CalcAngular>(lhs.type, rhs.type, lhs, rhs); +} + +template class AngularDistanceHW; +template class AngularDistanceHW; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h new file mode 100644 index 00000000000..c480ba2879e --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.h @@ -0,0 +1,76 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include +#include +#include + +namespace search::tensor { + +/** + * Calculates angular distance between vectors + */ +class AngularDistance : public DistanceFunction { +public: + AngularDistance() {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double convert_threshold(double threshold) const override { + double cosine_similarity = cos(threshold); + return 1.0 - cosine_similarity; + } + double to_rawscore(double distance) const override { + double cosine_similarity = 1.0 - distance; + // should be in in range [-1,1] but roundoff may cause problems: + cosine_similarity = std::min(1.0, cosine_similarity); + cosine_similarity = std::max(-1.0, cosine_similarity); + double angle_distance = acos(cosine_similarity); // in range [0,pi] + double score = 1.0 / (1.0 + angle_distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } +}; + +/** + * Calculates angular distance between vectors + * Will use instruction optimal for the cpu it is running on + * when both vectors have the expected cell type. + */ +template +class AngularDistanceHW : public AngularDistance { +public: + AngularDistanceHW() + : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type(); + if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { + auto lhs_vector = lhs.unsafe_typify(); + auto rhs_vector = rhs.unsafe_typify(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + auto a = &lhs_vector[0]; + auto b = &rhs_vector[0]; + double a_norm_sq = _computer.dotProduct(a, a, sz); + double b_norm_sq = _computer.dotProduct(b, b, sz); + double squared_norms = a_norm_sq * b_norm_sq; + double dot_product = _computer.dotProduct(a, b, sz); + double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; + double cosine_similarity = dot_product / div; + double distance = 1.0 - cosine_similarity; // in range [0,2] + return distance; + } else { + return AngularDistance::calc(lhs, rhs); + } + } +private: + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h index 724f83b6129..08f90fec041 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h @@ -1,4 +1,4 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index 81b27b56258..1d58c01fd99 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -2,6 +2,10 @@ #include "distance_function_factory.h" #include "distance_functions.h" +#include +#include + +LOG_SETUP(".searchlib.tensor.distance_function_factory"); using search::attribute::DistanceMetric; using vespalib::eval::CellType; @@ -13,41 +17,28 @@ DistanceFunction::UP make_distance_function(DistanceMetric variant, CellType cell_type) { switch (variant) { - case DistanceMetric::Euclidean: - if (cell_type == CellType::FLOAT) { - return std::make_unique>(); - } else { - return std::make_unique>(); - } - break; - case DistanceMetric::Angular: - if (cell_type == CellType::FLOAT) { - return std::make_unique>(); - } else { - return std::make_unique>(); - } - break; - case DistanceMetric::GeoDegrees: - if (cell_type == CellType::FLOAT) { - return std::make_unique>(); - } else { - return std::make_unique>(); - } - break; - case DistanceMetric::InnerProduct: - if (cell_type == CellType::FLOAT) { - return std::make_unique>(); - } else { - return std::make_unique>(); - } - break; - case DistanceMetric::Hamming: - if (cell_type == CellType::FLOAT) { - return std::make_unique>(); - } else { - return std::make_unique>(); - } - break; + case DistanceMetric::Euclidean: + switch (cell_type) { + case CellType::FLOAT: return std::make_unique>(); + case CellType::DOUBLE: return std::make_unique>(); + default: return std::make_unique(); + } + case DistanceMetric::Angular: + switch (cell_type) { + case CellType::FLOAT: return std::make_unique>(); + case CellType::DOUBLE: return std::make_unique>(); + default: return std::make_unique(); + } + case DistanceMetric::GeoDegrees: + return std::make_unique(); + case DistanceMetric::InnerProduct: + switch (cell_type) { + case CellType::FLOAT: return std::make_unique>(); + case CellType::DOUBLE: return std::make_unique>(); + default: return std::make_unique(); + } + case DistanceMetric::Hamming: + return std::make_unique(); } // not reached: return DistanceFunction::UP(); diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp deleted file mode 100644 index 8cf3a95db19..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "distance_functions.h" - -namespace search::tensor { - -template class SquaredEuclideanDistance; -template class SquaredEuclideanDistance; - -template class AngularDistance; -template class AngularDistance; - -template class InnerProductDistance; -template class InnerProductDistance; - -template class GeoDegreesDistance; -template class GeoDegreesDistance; - -template class HammingDistance; -template class HammingDistance; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index 2557a51e0d7..8ad8c07bd3c 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -1,245 +1,10 @@ -// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include "distance_function.h" -#include -#include -#include - -namespace search::tensor { - -/** - * Calculates the square of the standard Euclidean distance. - * Will use instruction optimal for the cpu it is running on. - */ -template -class SquaredEuclideanDistance : public DistanceFunction { -public: - SquaredEuclideanDistance() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify(); - auto rhs_vector = rhs.typify(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - return _computer.squaredEuclideanDistance(&lhs_vector[0], &rhs_vector[0], sz); - } - double convert_threshold(double threshold) const override { - return threshold*threshold; - } - double to_rawscore(double distance) const override { - double d = sqrt(distance); - double score = 1.0 / (1.0 + d); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double limit) const override - { - auto lhs_vector = lhs.typify(); - auto rhs_vector = rhs.typify(); - double sum = 0.0; - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - for (size_t i = 0; i < sz && sum <= limit; ++i) { - double diff = lhs_vector[i] - rhs_vector[i]; - sum += diff*diff; - } - return sum; - } - - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -/** - * Calculates angular distance between vectors - */ -template -class AngularDistance : public DistanceFunction { -public: - AngularDistance() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify(); - auto rhs_vector = rhs.typify(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - auto a = &lhs_vector[0]; - auto b = &rhs_vector[0]; - double a_norm_sq = _computer.dotProduct(a, a, sz); - double b_norm_sq = _computer.dotProduct(b, b, sz); - double squared_norms = a_norm_sq * b_norm_sq; - double dot_product = _computer.dotProduct(a, b, sz); - double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; - double cosine_similarity = dot_product / div; - double distance = 1.0 - cosine_similarity; // in range [0,2] - return distance; - } - double convert_threshold(double threshold) const override { - double cosine_similarity = cos(threshold); - return 1.0 - cosine_similarity; - } - double to_rawscore(double distance) const override { - double cosine_similarity = 1.0 - distance; - // should be in in range [-1,1] but roundoff may cause problems: - cosine_similarity = std::min(1.0, cosine_similarity); - cosine_similarity = std::max(-1.0, cosine_similarity); - double angle_distance = acos(cosine_similarity); // in range [0,pi] - double score = 1.0 / (1.0 + angle_distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } - - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -/** - * Calculates inner-product "distance" between vectors with assumed norm 1. - * Should give same ordering as Angular distance, but is less expensive. - */ -template -class InnerProductDistance : public DistanceFunction { -public: - InnerProductDistance() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify(); - auto rhs_vector = rhs.typify(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz); - return std::max(0.0, score); - } - double convert_threshold(double threshold) const override { - return threshold; - } - double to_rawscore(double distance) const override { - double score = 1.0 / (1.0 + distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } - - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -/** - * Calculates great-circle distance between Latitude/Longitude pairs, - * measured in degrees. Output distance is measured in meters. - * Uses the haversine formula directly from: - * https://en.wikipedia.org/wiki/Haversine_formula - **/ -template -class GeoDegreesDistance : public DistanceFunction { -public: - // in km, as defined by IUGG, see: - // https://en.wikipedia.org/wiki/Earth_radius#Mean_radius - static constexpr double earth_mean_radius = 6371.0088; - static constexpr double degrees_to_radians = M_PI / 180.0; - - GeoDegreesDistance() {} - // haversine function: - static double hav(double angle) { - double s = sin(0.5*angle); - return s*s; - } - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify(); - auto rhs_vector = rhs.typify(); - assert(2 == lhs_vector.size()); - assert(2 == rhs_vector.size()); - // convert to radians: - double lat_A = lhs_vector[0] * degrees_to_radians; - double lat_B = rhs_vector[0] * degrees_to_radians; - double lon_A = lhs_vector[1] * degrees_to_radians; - double lon_B = rhs_vector[1] * degrees_to_radians; - - double lat_diff = lat_A - lat_B; - double lon_diff = lon_A - lon_B; - - // haversines of differences: - double hav_lat = hav(lat_diff); - double hav_lon = hav(lon_diff); - - // haversine of central angle between the two points: - double hav_central_angle = hav_lat + cos(lat_A)*cos(lat_B)*hav_lon; - return hav_central_angle; - } - double convert_threshold(double threshold) const override { - double half_angle = threshold / (2 * earth_mean_radius); - double rt_hav = sin(half_angle); - return rt_hav * rt_hav; - } - double to_rawscore(double distance) const override { - double hav_diff = sqrt(distance); - // distance in kilometers: - double d = 2 * asin(hav_diff) * earth_mean_radius; - // km to rawscore: - return 1.0 / (1.0 + d); - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } - -}; - -/** - * Calculates the Hamming distance defined as - * "number of cells where the values are different" - */ -template -class HammingDistance : public DistanceFunction { -public: - HammingDistance() {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - auto lhs_vector = lhs.typify(); - auto rhs_vector = rhs.typify(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - size_t sum = 0; - for (size_t i = 0; i < sz; ++i) { - sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; - } - return (double)sum; - } - double convert_threshold(double threshold) const override { - return threshold; - } - double to_rawscore(double distance) const override { - double score = 1.0 / (1.0 + distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double limit) const override - { - auto lhs_vector = lhs.typify(); - auto rhs_vector = rhs.typify(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - size_t sum = 0; - for (size_t i = 0; i < sz && sum <= limit; ++i) { - sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; - } - return (double)sum; - } -}; - - -} +#include "angular_distance.h" +#include "euclidean_distance.h" +#include "geo_degrees_distance.h" +#include "hamming_distance.h" +#include "inner_product_distance.h" diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp new file mode 100644 index 00000000000..223ceeed940 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp @@ -0,0 +1,51 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "euclidean_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcEuclidean { + template + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify(); + auto rhs_vector = rhs.unsafe_typify(); + double sum = 0.0; + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + for (size_t i = 0; i < sz; ++i) { + double diff = lhs_vector[i] - rhs_vector[i]; + sum += diff*diff; + } + return sum; + } +}; + +} + +double +SquaredEuclideanDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + return typify_invoke<2,TypifyCellType,CalcEuclidean>(lhs.type, rhs.type, lhs, rhs); +} + +double +SquaredEuclideanDistance::calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double) const +{ + // maybe optimize this: + return typify_invoke<2,TypifyCellType,CalcEuclidean>(lhs.type, rhs.type, lhs, rhs); +} + +template class SquaredEuclideanDistanceHW; +template class SquaredEuclideanDistanceHW; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h new file mode 100644 index 00000000000..6d4d982834f --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h @@ -0,0 +1,59 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include +#include +#include + +namespace search::tensor { + +/** + * Calculates the square of the standard Euclidean distance. + */ +class SquaredEuclideanDistance : public DistanceFunction { +public: + SquaredEuclideanDistance() {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double limit) const override; + double convert_threshold(double threshold) const override { + return threshold*threshold; + } + double to_rawscore(double distance) const override { + double d = sqrt(distance); + double score = 1.0 / (1.0 + d); + return score; + } +}; + +/** + * Calculates the square of the standard Euclidean distance. + * Will use instruction optimal for the cpu it is running on + * when both vectors have the expected cell type. + */ +template +class SquaredEuclideanDistanceHW : public SquaredEuclideanDistance { +public: + SquaredEuclideanDistanceHW() + : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type(); + if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { + auto lhs_vector = lhs.unsafe_typify(); + auto rhs_vector = rhs.unsafe_typify(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + return _computer.squaredEuclideanDistance(&lhs_vector[0], &rhs_vector[0], sz); + } else { + return SquaredEuclideanDistance::calc(lhs, rhs); + } + } +private: + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp new file mode 100644 index 00000000000..946caa0d2cb --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp @@ -0,0 +1,50 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "geo_degrees_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcGeoDegrees { + template + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify(); + auto rhs_vector = rhs.unsafe_typify(); + + assert(2 == lhs_vector.size()); + assert(2 == rhs_vector.size()); + // convert to radians: + double lat_A = lhs_vector[0] * GeoDegreesDistance::degrees_to_radians; + double lat_B = rhs_vector[0] * GeoDegreesDistance::degrees_to_radians; + double lon_A = lhs_vector[1] * GeoDegreesDistance::degrees_to_radians; + double lon_B = rhs_vector[1] * GeoDegreesDistance::degrees_to_radians; + + double lat_diff = lat_A - lat_B; + double lon_diff = lon_A - lon_B; + + // haversines of differences: + double hav_lat = GeoDegreesDistance::hav(lat_diff); + double hav_lon = GeoDegreesDistance::hav(lon_diff); + + // haversine of central angle between the two points: + double hav_central_angle = hav_lat + cos(lat_A)*cos(lat_B)*hav_lon; + return hav_central_angle; + } +}; + +} + +double +GeoDegreesDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + return typify_invoke<2,TypifyCellType,CalcGeoDegrees>(lhs.type, rhs.type, lhs, rhs); +} + +} diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h new file mode 100644 index 00000000000..b8b9bec50d5 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h @@ -0,0 +1,53 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include +#include +#include +#include + +namespace search::tensor { + +/** + * Calculates great-circle distance between Latitude/Longitude pairs, + * measured in degrees. Output distance is measured in meters. + * Uses the haversine formula directly from: + * https://en.wikipedia.org/wiki/Haversine_formula + **/ +class GeoDegreesDistance : public DistanceFunction { +public: + // in km, as defined by IUGG, see: + // https://en.wikipedia.org/wiki/Earth_radius#Mean_radius + static constexpr double earth_mean_radius = 6371.0088; + static constexpr double degrees_to_radians = M_PI / 180.0; + + GeoDegreesDistance() {} + // haversine function: + static double hav(double angle) { + double s = sin(0.5*angle); + return s*s; + } + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double convert_threshold(double threshold) const override { + double half_angle = threshold / (2 * earth_mean_radius); + double rt_hav = sin(half_angle); + return rt_hav * rt_hav; + } + double to_rawscore(double distance) const override { + double hav_diff = sqrt(distance); + // distance in kilometers: + double d = 2 * asin(hav_diff) * earth_mean_radius; + // km to rawscore: + return 1.0 / (1.0 + d); + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp new file mode 100644 index 00000000000..30ba979a6d8 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp @@ -0,0 +1,61 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "hamming_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcHamming { + template + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify(); + auto rhs_vector = rhs.unsafe_typify(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + size_t sum = 0; + for (size_t i = 0; i < sz; ++i) { + sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; + } + return (double)sum; + } +}; + +} + +double +HammingDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + constexpr auto expected = vespalib::eval::CellType::INT8; + if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { + const uint64_t *words_a = static_cast(lhs.data); + const uint64_t *words_b = static_cast(rhs.data); + size_t sum = 0; + size_t sz = lhs.size; + assert(sz == rhs.size); + size_t i = 0; + for (; i * 8 < sz; ++i) { + uint64_t xor_bits = words_a[i] ^ words_b[i]; + sum += __builtin_popcountl(xor_bits); + } + if (__builtin_expect((i * 8 < sz), false)) { + const uint8_t *bytes_a = static_cast(lhs.data); + const uint8_t *bytes_b = static_cast(rhs.data); + for (i *= 8; i < sz; ++i) { + uint64_t xor_bits = bytes_a[i] ^ bytes_b[i]; + sum += __builtin_popcountl(xor_bits); + } + } + return (double)sum; + } else { + return typify_invoke<2,TypifyCellType,CalcHamming>(lhs.type, rhs.type, lhs, rhs); + } +} + +} diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.h b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h new file mode 100644 index 00000000000..c2cd7af3863 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h @@ -0,0 +1,38 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include +#include +#include + +namespace search::tensor { + +/** + * Calculates the Hamming distance defined as + * "number of cells where the values are different" + * or (for int8 cells, aka binary data only) + * "number of bits that are different" + */ +class HammingDistance : public DistanceFunction { +public: + HammingDistance() {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double convert_threshold(double threshold) const override { + return threshold; + } + double to_rawscore(double distance) const override { + double score = 1.0 / (1.0 + distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double) const override + { + // consider optimizing: + return calc(lhs, rhs); + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp new file mode 100644 index 00000000000..8a48c6c936a --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp @@ -0,0 +1,44 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "inner_product_distance.h" + +using vespalib::typify_invoke; +using vespalib::eval::TypifyCellType; + +namespace search::tensor { + +namespace { + +struct CalcInnerProduct { + template + static double invoke(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) + { + auto lhs_vector = lhs.unsafe_typify(); + auto rhs_vector = rhs.unsafe_typify(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + double dot_product = 0.0; + for (size_t i = 0; i < sz; ++i) { + double a = lhs_vector[i]; + double b = rhs_vector[i]; + dot_product += a*b; + } + double score = 1.0 - dot_product; // in range [0,2] + return std::max(0.0, score); + } +}; + +} + +double +InnerProductDistance::calc(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs) const +{ + return typify_invoke<2,TypifyCellType,CalcInnerProduct>(lhs.type, rhs.type, lhs, rhs); +} + +template class InnerProductDistanceHW; +template class InnerProductDistanceHW; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h new file mode 100644 index 00000000000..cb60d18c0f5 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h @@ -0,0 +1,64 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "distance_function.h" +#include +#include +#include + +namespace search::tensor { + +/** + * Calculates inner-product "distance" between vectors with assumed norm 1. + * Should give same ordering as Angular distance, but is less expensive. + */ +class InnerProductDistance : public DistanceFunction { +public: + InnerProductDistance() {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; + double convert_threshold(double threshold) const override { + return threshold; + } + double to_rawscore(double distance) const override { + double score = 1.0 / (1.0 + distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } +}; + +/** + * Calculates inner-product "distance" between vectors with assumed norm 1. + * Should give same ordering as Angular distance, but is less expensive. + * Will use instruction optimal for the cpu it is running on + * when both vectors have the expected cell type. + */ +template +class InnerProductDistanceHW : public InnerProductDistance { +public: + InnerProductDistanceHW() + : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type(); + if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { + auto lhs_vector = lhs.unsafe_typify(); + auto rhs_vector = rhs.unsafe_typify(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz); + return std::max(0.0, score); + } else { + return InnerProductDistance::calc(lhs, rhs); + } + } +private: + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +} -- cgit v1.2.3