diff options
author | Arnstein Ressem <aressem@gmail.com> | 2021-04-12 12:15:22 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-04-12 12:15:22 +0200 |
commit | f167fe4362c5e4e20a7605b99205cfbee77c569a (patch) | |
tree | dc422d4954307e03b1fdce1afbc14e8d7f416b1d | |
parent | 8dc926818cdddde34fb287b215203dde02216f8d (diff) |
Revert "fix NNS distance for new cell types"
21 files changed, 321 insertions, 609 deletions
diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index 2a509031e24..f1d910f2635 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -230,7 +230,7 @@ public: const search::tensor::DistanceFunction *distance_function() const override { - static search::tensor::SquaredEuclideanDistance my_dist_fun; + static search::tensor::SquaredEuclideanDistance<double> my_dist_fun; return &my_dist_fun; } }; diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp index 6f8c9b4c885..ee0a2aff80e 100644 --- a/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp +++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_test.cpp @@ -10,7 +10,6 @@ LOG_SETUP("distance_function_test"); using namespace search::tensor; -using vespalib::eval::Int8Float; using vespalib::eval::TypedCells; using search::attribute::DistanceMetric; @@ -213,11 +212,6 @@ TEST(DistanceFunctionsTest, hamming_gives_expected_score) EXPECT_DOUBLE_EQ(threshold, 0.5); threshold = hamming->convert_threshold(1.0); EXPECT_DOUBLE_EQ(threshold, 1.0); - - std::vector<Int8Float> bytes_a = { 0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 1, 2, 4, 8, 16, 32, 64, -128, 0, 1, 2 }; - std::vector<Int8Float> bytes_b = { 1, 2, 2, 4, 8, 16, 32, 65, -128, 0, 1, 0, 4, 8, 16, 32, 64, -128, 0, 1, -1 }; - // expect diff: 1 2 1 1 7 - EXPECT_EQ(hamming->calc(TypedCells(bytes_a), TypedCells(bytes_b)), 12.0); } TEST(GeoDegreesTest, gives_expected_score) diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index 6ffe118aa65..20dc55df329 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -51,6 +51,7 @@ struct LevelGenerator : public RandomLevelGenerator { }; using FloatVectors = MyDocVectorAccess<float>; +using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>; using HnswIndexUP = std::unique_ptr<HnswIndex>; class HnswIndexTest : public ::testing::Test { @@ -78,7 +79,7 @@ public: void init(bool heuristic_select_neighbors) { auto generator = std::make_unique<LevelGenerator>(); level_generator = generator.get(); - index = std::make_unique<HnswIndex>(vectors, std::make_unique<SquaredEuclideanDistance>(), + index = std::make_unique<HnswIndex>(vectors, std::make_unique<FloatSqEuclideanDistance>(), std::move(generator), HnswIndex::Config(5, 2, 10, 0, heuristic_select_neighbors)); } diff --git a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp index 7acdb4df983..8e3bb95a776 100644 --- a/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/stress_hnsw_mt.cpp @@ -117,6 +117,7 @@ public: } }; +using FloatSqEuclideanDistance = SquaredEuclideanDistance<float>; using HnswIndexUP = std::unique_ptr<HnswIndex>; class Stressor : public ::testing::Test { @@ -231,7 +232,7 @@ public: void init() { uint32_t m = 16; - index = std::make_unique<HnswIndex>(vectors, std::make_unique<SquaredEuclideanDistance>(), + index = std::make_unique<HnswIndex>(vectors, std::make_unique<FloatSqEuclideanDistance>(), std::make_unique<InvLogLevelGenerator>(m), HnswIndex::Config(2*m, m, 200, 10, true)); } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp index 8012c48a04a..4ec23b993b6 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_blueprint.cpp @@ -65,12 +65,9 @@ NearestNeighborBlueprint::NearestNeighborBlueprint(const queryeval::FieldSpec& f { auto lct = _query_tensor->cells().type; auto rct = _attr_tensor.getTensorType().cell_type(); - if (rct == vespalib::eval::CellType::FLOAT || rct == vespalib::eval::CellType::DOUBLE) { - // avoid downcasting to bfloat16 etc, that is just extra work - using MyTypify = vespalib::eval::TypifyCellType; - auto fixup_fun = vespalib::typify_invoke<2,MyTypify,ConvertCellsSelector>(lct, rct); - fixup_fun(_query_tensor, _attr_tensor.getTensorType()); - } + using MyTypify = vespalib::eval::TypifyCellType; + auto fixup_fun = vespalib::typify_invoke<2,MyTypify,ConvertCellsSelector>(lct, rct); + fixup_fun(_query_tensor, _attr_tensor.getTensorType()); _fallback_dist_fun = search::tensor::make_distance_function(_attr_tensor.distance_metric(), rct); _dist_fun = _fallback_dist_fun.get(); assert(_dist_fun); @@ -126,13 +123,18 @@ NearestNeighborBlueprint::perform_top_k() { auto nns_index = _attr_tensor.nearest_neighbor_index(); if (_approximate && nns_index) { - auto lhs = _query_tensor->cells(); - uint32_t k = _target_num_hits; - if (_global_filter->has_filter()) { - auto filter = _global_filter->filter(); - _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold); - } else { - _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold); + auto lhs_type = _query_tensor->type(); + auto rhs_type = _attr_tensor.getTensorType(); + // different cell types should be converted already + if (lhs_type == rhs_type) { + auto lhs = _query_tensor->cells(); + uint32_t k = _target_num_hits; + if (_global_filter->has_filter()) { + auto filter = _global_filter->filter(); + _found_hits = nns_index->find_top_k_with_filter(k, lhs, *filter, k + _explore_additional_hits, _distance_threshold); + } else { + _found_hits = nns_index->find_top_k(k, lhs, k + _explore_additional_hits, _distance_threshold); + } } } } diff --git a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp index 6379a06e6d6..d85da49c2f7 100644 --- a/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/nearest_neighbor_iterator.cpp @@ -16,7 +16,7 @@ bool is_compatible(const vespalib::eval::ValueType& lhs, const vespalib::eval::ValueType& rhs) { - return (lhs.dimensions() == rhs.dimensions()); + return (lhs == rhs); } } diff --git a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt index 46bafa189e1..8af77a57a44 100644 --- a/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/tensor/CMakeLists.txt @@ -1,7 +1,6 @@ # Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchlib_tensor OBJECT SOURCES - angular_distance.cpp default_nearest_neighbor_index_factory.cpp dense_tensor_attribute.cpp dense_tensor_attribute_saver.cpp @@ -10,16 +9,13 @@ vespa_add_library(searchlib_tensor OBJECT direct_tensor_saver.cpp direct_tensor_store.cpp distance_function_factory.cpp - euclidean_distance.cpp - geo_degrees_distance.cpp - hamming_distance.cpp + distance_functions.cpp hnsw_graph.cpp hnsw_index.cpp hnsw_index_loader.cpp hnsw_index_saver.cpp imported_tensor_attribute_vector.cpp imported_tensor_attribute_vector_read_guard.cpp - inner_product_distance.cpp inv_log_level_generator.cpp nearest_neighbor_index.cpp nearest_neighbor_index_saver.cpp diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp deleted file mode 100644 index 21b2622283c..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "angular_distance.h" - -using vespalib::typify_invoke; -using vespalib::eval::TypifyCellType; - -namespace search::tensor { - -namespace { - -struct CalcAngular { - template <typename LCT, typename RCT> - static double invoke(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) - { - auto lhs_vector = lhs.unsafe_typify<LCT>(); - auto rhs_vector = rhs.unsafe_typify<RCT>(); - - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - double a_norm_sq = 0.0; - double b_norm_sq = 0.0; - double dot_product = 0.0; - for (size_t i = 0; i < sz; ++i) { - double a = lhs_vector[i]; - double b = rhs_vector[i]; - a_norm_sq += a*a; - b_norm_sq += b*b; - dot_product += a*b; - } - double squared_norms = a_norm_sq * b_norm_sq; - double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; - double cosine_similarity = dot_product / div; - double distance = 1.0 - cosine_similarity; // in range [0,2] - return std::max(0.0, distance); - } -}; - -} - -double -AngularDistance::calc(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) const -{ - return typify_invoke<2,TypifyCellType,CalcAngular>(lhs.type, rhs.type, lhs, rhs); -} - -template class AngularDistanceHW<float>; -template class AngularDistanceHW<double>; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h deleted file mode 100644 index c480ba2879e..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/angular_distance.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "distance_function.h" -#include <vespa/eval/eval/typed_cells.h> -#include <vespa/vespalib/hwaccelrated/iaccelrated.h> -#include <cmath> - -namespace search::tensor { - -/** - * Calculates angular distance between vectors - */ -class AngularDistance : public DistanceFunction { -public: - AngularDistance() {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; - double convert_threshold(double threshold) const override { - double cosine_similarity = cos(threshold); - return 1.0 - cosine_similarity; - } - double to_rawscore(double distance) const override { - double cosine_similarity = 1.0 - distance; - // should be in in range [-1,1] but roundoff may cause problems: - cosine_similarity = std::min(1.0, cosine_similarity); - cosine_similarity = std::max(-1.0, cosine_similarity); - double angle_distance = acos(cosine_similarity); // in range [0,pi] - double score = 1.0 / (1.0 + angle_distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } -}; - -/** - * Calculates angular distance between vectors - * Will use instruction optimal for the cpu it is running on - * when both vectors have the expected cell type. - */ -template <typename FloatType> -class AngularDistanceHW : public AngularDistance { -public: - AngularDistanceHW() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>(); - if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { - auto lhs_vector = lhs.unsafe_typify<FloatType>(); - auto rhs_vector = rhs.unsafe_typify<FloatType>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - auto a = &lhs_vector[0]; - auto b = &rhs_vector[0]; - double a_norm_sq = _computer.dotProduct(a, a, sz); - double b_norm_sq = _computer.dotProduct(b, b, sz); - double squared_norms = a_norm_sq * b_norm_sq; - double dot_product = _computer.dotProduct(a, b, sz); - double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; - double cosine_similarity = dot_product / div; - double distance = 1.0 - cosine_similarity; // in range [0,2] - return distance; - } else { - return AngularDistance::calc(lhs, rhs); - } - } -private: - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function.h b/searchlib/src/vespa/searchlib/tensor/distance_function.h index 08f90fec041..724f83b6129 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_function.h @@ -1,4 +1,4 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index 1d58c01fd99..81b27b56258 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -2,10 +2,6 @@ #include "distance_function_factory.h" #include "distance_functions.h" -#include <vespa/vespalib/util/typify.h> -#include <vespa/log/log.h> - -LOG_SETUP(".searchlib.tensor.distance_function_factory"); using search::attribute::DistanceMetric; using vespalib::eval::CellType; @@ -17,28 +13,41 @@ DistanceFunction::UP make_distance_function(DistanceMetric variant, CellType cell_type) { switch (variant) { - case DistanceMetric::Euclidean: - switch (cell_type) { - case CellType::FLOAT: return std::make_unique<SquaredEuclideanDistanceHW<float>>(); - case CellType::DOUBLE: return std::make_unique<SquaredEuclideanDistanceHW<double>>(); - default: return std::make_unique<SquaredEuclideanDistance>(); - } - case DistanceMetric::Angular: - switch (cell_type) { - case CellType::FLOAT: return std::make_unique<AngularDistanceHW<float>>(); - case CellType::DOUBLE: return std::make_unique<AngularDistanceHW<double>>(); - default: return std::make_unique<AngularDistance>(); - } - case DistanceMetric::GeoDegrees: - return std::make_unique<GeoDegreesDistance>(); - case DistanceMetric::InnerProduct: - switch (cell_type) { - case CellType::FLOAT: return std::make_unique<InnerProductDistanceHW<float>>(); - case CellType::DOUBLE: return std::make_unique<InnerProductDistanceHW<double>>(); - default: return std::make_unique<InnerProductDistance>(); - } - case DistanceMetric::Hamming: - return std::make_unique<HammingDistance>(); + case DistanceMetric::Euclidean: + if (cell_type == CellType::FLOAT) { + return std::make_unique<SquaredEuclideanDistance<float>>(); + } else { + return std::make_unique<SquaredEuclideanDistance<double>>(); + } + break; + case DistanceMetric::Angular: + if (cell_type == CellType::FLOAT) { + return std::make_unique<AngularDistance<float>>(); + } else { + return std::make_unique<AngularDistance<double>>(); + } + break; + case DistanceMetric::GeoDegrees: + if (cell_type == CellType::FLOAT) { + return std::make_unique<GeoDegreesDistance<float>>(); + } else { + return std::make_unique<GeoDegreesDistance<double>>(); + } + break; + case DistanceMetric::InnerProduct: + if (cell_type == CellType::FLOAT) { + return std::make_unique<InnerProductDistance<float>>(); + } else { + return std::make_unique<InnerProductDistance<double>>(); + } + break; + case DistanceMetric::Hamming: + if (cell_type == CellType::FLOAT) { + return std::make_unique<HammingDistance<float>>(); + } else { + return std::make_unique<HammingDistance<double>>(); + } + break; } // not reached: return DistanceFunction::UP(); diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp new file mode 100644 index 00000000000..8cf3a95db19 --- /dev/null +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.cpp @@ -0,0 +1,22 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "distance_functions.h" + +namespace search::tensor { + +template class SquaredEuclideanDistance<float>; +template class SquaredEuclideanDistance<double>; + +template class AngularDistance<float>; +template class AngularDistance<double>; + +template class InnerProductDistance<float>; +template class InnerProductDistance<double>; + +template class GeoDegreesDistance<float>; +template class GeoDegreesDistance<double>; + +template class HammingDistance<float>; +template class HammingDistance<double>; + +} diff --git a/searchlib/src/vespa/searchlib/tensor/distance_functions.h b/searchlib/src/vespa/searchlib/tensor/distance_functions.h index 8ad8c07bd3c..2557a51e0d7 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_functions.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_functions.h @@ -1,10 +1,245 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright 2020 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once #include "distance_function.h" -#include "angular_distance.h" -#include "euclidean_distance.h" -#include "geo_degrees_distance.h" -#include "hamming_distance.h" -#include "inner_product_distance.h" +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> +#include <cmath> + +namespace search::tensor { + +/** + * Calculates the square of the standard Euclidean distance. + * Will use instruction optimal for the cpu it is running on. + */ +template <typename FloatType> +class SquaredEuclideanDistance : public DistanceFunction { +public: + SquaredEuclideanDistance() + : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + return _computer.squaredEuclideanDistance(&lhs_vector[0], &rhs_vector[0], sz); + } + double convert_threshold(double threshold) const override { + return threshold*threshold; + } + double to_rawscore(double distance) const override { + double d = sqrt(distance); + double score = 1.0 / (1.0 + d); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double limit) const override + { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + double sum = 0.0; + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + for (size_t i = 0; i < sz && sum <= limit; ++i) { + double diff = lhs_vector[i] - rhs_vector[i]; + sum += diff*diff; + } + return sum; + } + + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +/** + * Calculates angular distance between vectors + */ +template <typename FloatType> +class AngularDistance : public DistanceFunction { +public: + AngularDistance() + : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + auto a = &lhs_vector[0]; + auto b = &rhs_vector[0]; + double a_norm_sq = _computer.dotProduct(a, a, sz); + double b_norm_sq = _computer.dotProduct(b, b, sz); + double squared_norms = a_norm_sq * b_norm_sq; + double dot_product = _computer.dotProduct(a, b, sz); + double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; + double cosine_similarity = dot_product / div; + double distance = 1.0 - cosine_similarity; // in range [0,2] + return distance; + } + double convert_threshold(double threshold) const override { + double cosine_similarity = cos(threshold); + return 1.0 - cosine_similarity; + } + double to_rawscore(double distance) const override { + double cosine_similarity = 1.0 - distance; + // should be in in range [-1,1] but roundoff may cause problems: + cosine_similarity = std::min(1.0, cosine_similarity); + cosine_similarity = std::max(-1.0, cosine_similarity); + double angle_distance = acos(cosine_similarity); // in range [0,pi] + double score = 1.0 / (1.0 + angle_distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } + + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +/** + * Calculates inner-product "distance" between vectors with assumed norm 1. + * Should give same ordering as Angular distance, but is less expensive. + */ +template <typename FloatType> +class InnerProductDistance : public DistanceFunction { +public: + InnerProductDistance() + : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) + {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz); + return std::max(0.0, score); + } + double convert_threshold(double threshold) const override { + return threshold; + } + double to_rawscore(double distance) const override { + double score = 1.0 / (1.0 + distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } + + const vespalib::hwaccelrated::IAccelrated & _computer; +}; + +/** + * Calculates great-circle distance between Latitude/Longitude pairs, + * measured in degrees. Output distance is measured in meters. + * Uses the haversine formula directly from: + * https://en.wikipedia.org/wiki/Haversine_formula + **/ +template <typename FloatType> +class GeoDegreesDistance : public DistanceFunction { +public: + // in km, as defined by IUGG, see: + // https://en.wikipedia.org/wiki/Earth_radius#Mean_radius + static constexpr double earth_mean_radius = 6371.0088; + static constexpr double degrees_to_radians = M_PI / 180.0; + + GeoDegreesDistance() {} + // haversine function: + static double hav(double angle) { + double s = sin(0.5*angle); + return s*s; + } + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + assert(2 == lhs_vector.size()); + assert(2 == rhs_vector.size()); + // convert to radians: + double lat_A = lhs_vector[0] * degrees_to_radians; + double lat_B = rhs_vector[0] * degrees_to_radians; + double lon_A = lhs_vector[1] * degrees_to_radians; + double lon_B = rhs_vector[1] * degrees_to_radians; + + double lat_diff = lat_A - lat_B; + double lon_diff = lon_A - lon_B; + + // haversines of differences: + double hav_lat = hav(lat_diff); + double hav_lon = hav(lon_diff); + + // haversine of central angle between the two points: + double hav_central_angle = hav_lat + cos(lat_A)*cos(lat_B)*hav_lon; + return hav_central_angle; + } + double convert_threshold(double threshold) const override { + double half_angle = threshold / (2 * earth_mean_radius); + double rt_hav = sin(half_angle); + return rt_hav * rt_hav; + } + double to_rawscore(double distance) const override { + double hav_diff = sqrt(distance); + // distance in kilometers: + double d = 2 * asin(hav_diff) * earth_mean_radius; + // km to rawscore: + return 1.0 / (1.0 + d); + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double /*limit*/) const override + { + return calc(lhs, rhs); + } + +}; + +/** + * Calculates the Hamming distance defined as + * "number of cells where the values are different" + */ +template <typename FloatType> +class HammingDistance : public DistanceFunction { +public: + HammingDistance() {} + double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + size_t sum = 0; + for (size_t i = 0; i < sz; ++i) { + sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; + } + return (double)sum; + } + double convert_threshold(double threshold) const override { + return threshold; + } + double to_rawscore(double distance) const override { + double score = 1.0 / (1.0 + distance); + return score; + } + double calc_with_limit(const vespalib::eval::TypedCells& lhs, + const vespalib::eval::TypedCells& rhs, + double limit) const override + { + auto lhs_vector = lhs.typify<FloatType>(); + auto rhs_vector = rhs.typify<FloatType>(); + size_t sz = lhs_vector.size(); + assert(sz == rhs_vector.size()); + size_t sum = 0; + for (size_t i = 0; i < sz && sum <= limit; ++i) { + sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; + } + return (double)sum; + } +}; + + +} diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp deleted file mode 100644 index 223ceeed940..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "euclidean_distance.h" - -using vespalib::typify_invoke; -using vespalib::eval::TypifyCellType; - -namespace search::tensor { - -namespace { - -struct CalcEuclidean { - template <typename LCT, typename RCT> - static double invoke(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) - { - auto lhs_vector = lhs.unsafe_typify<LCT>(); - auto rhs_vector = rhs.unsafe_typify<RCT>(); - double sum = 0.0; - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - for (size_t i = 0; i < sz; ++i) { - double diff = lhs_vector[i] - rhs_vector[i]; - sum += diff*diff; - } - return sum; - } -}; - -} - -double -SquaredEuclideanDistance::calc(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) const -{ - return typify_invoke<2,TypifyCellType,CalcEuclidean>(lhs.type, rhs.type, lhs, rhs); -} - -double -SquaredEuclideanDistance::calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double) const -{ - // maybe optimize this: - return typify_invoke<2,TypifyCellType,CalcEuclidean>(lhs.type, rhs.type, lhs, rhs); -} - -template class SquaredEuclideanDistanceHW<float>; -template class SquaredEuclideanDistanceHW<double>; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h deleted file mode 100644 index 6d4d982834f..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "distance_function.h" -#include <vespa/eval/eval/typed_cells.h> -#include <vespa/vespalib/hwaccelrated/iaccelrated.h> -#include <cmath> - -namespace search::tensor { - -/** - * Calculates the square of the standard Euclidean distance. - */ -class SquaredEuclideanDistance : public DistanceFunction { -public: - SquaredEuclideanDistance() {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double limit) const override; - double convert_threshold(double threshold) const override { - return threshold*threshold; - } - double to_rawscore(double distance) const override { - double d = sqrt(distance); - double score = 1.0 / (1.0 + d); - return score; - } -}; - -/** - * Calculates the square of the standard Euclidean distance. - * Will use instruction optimal for the cpu it is running on - * when both vectors have the expected cell type. - */ -template <typename FloatType> -class SquaredEuclideanDistanceHW : public SquaredEuclideanDistance { -public: - SquaredEuclideanDistanceHW() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>(); - if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { - auto lhs_vector = lhs.unsafe_typify<FloatType>(); - auto rhs_vector = rhs.unsafe_typify<FloatType>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - return _computer.squaredEuclideanDistance(&lhs_vector[0], &rhs_vector[0], sz); - } else { - return SquaredEuclideanDistance::calc(lhs, rhs); - } - } -private: - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp deleted file mode 100644 index 946caa0d2cb..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "geo_degrees_distance.h" - -using vespalib::typify_invoke; -using vespalib::eval::TypifyCellType; - -namespace search::tensor { - -namespace { - -struct CalcGeoDegrees { - template <typename LCT, typename RCT> - static double invoke(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) - { - auto lhs_vector = lhs.unsafe_typify<LCT>(); - auto rhs_vector = rhs.unsafe_typify<RCT>(); - - assert(2 == lhs_vector.size()); - assert(2 == rhs_vector.size()); - // convert to radians: - double lat_A = lhs_vector[0] * GeoDegreesDistance::degrees_to_radians; - double lat_B = rhs_vector[0] * GeoDegreesDistance::degrees_to_radians; - double lon_A = lhs_vector[1] * GeoDegreesDistance::degrees_to_radians; - double lon_B = rhs_vector[1] * GeoDegreesDistance::degrees_to_radians; - - double lat_diff = lat_A - lat_B; - double lon_diff = lon_A - lon_B; - - // haversines of differences: - double hav_lat = GeoDegreesDistance::hav(lat_diff); - double hav_lon = GeoDegreesDistance::hav(lon_diff); - - // haversine of central angle between the two points: - double hav_central_angle = hav_lat + cos(lat_A)*cos(lat_B)*hav_lon; - return hav_central_angle; - } -}; - -} - -double -GeoDegreesDistance::calc(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) const -{ - return typify_invoke<2,TypifyCellType,CalcGeoDegrees>(lhs.type, rhs.type, lhs, rhs); -} - -} diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h deleted file mode 100644 index b8b9bec50d5..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "distance_function.h" -#include <vespa/eval/eval/typed_cells.h> -#include <vespa/vespalib/hwaccelrated/iaccelrated.h> -#include <vespa/vespalib/util/typify.h> -#include <cmath> - -namespace search::tensor { - -/** - * Calculates great-circle distance between Latitude/Longitude pairs, - * measured in degrees. Output distance is measured in meters. - * Uses the haversine formula directly from: - * https://en.wikipedia.org/wiki/Haversine_formula - **/ -class GeoDegreesDistance : public DistanceFunction { -public: - // in km, as defined by IUGG, see: - // https://en.wikipedia.org/wiki/Earth_radius#Mean_radius - static constexpr double earth_mean_radius = 6371.0088; - static constexpr double degrees_to_radians = M_PI / 180.0; - - GeoDegreesDistance() {} - // haversine function: - static double hav(double angle) { - double s = sin(0.5*angle); - return s*s; - } - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; - double convert_threshold(double threshold) const override { - double half_angle = threshold / (2 * earth_mean_radius); - double rt_hav = sin(half_angle); - return rt_hav * rt_hav; - } - double to_rawscore(double distance) const override { - double hav_diff = sqrt(distance); - // distance in kilometers: - double d = 2 * asin(hav_diff) * earth_mean_radius; - // km to rawscore: - return 1.0 / (1.0 + d); - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } -}; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp deleted file mode 100644 index 30ba979a6d8..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "hamming_distance.h" - -using vespalib::typify_invoke; -using vespalib::eval::TypifyCellType; - -namespace search::tensor { - -namespace { - -struct CalcHamming { - template <typename LCT, typename RCT> - static double invoke(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) - { - auto lhs_vector = lhs.unsafe_typify<LCT>(); - auto rhs_vector = rhs.unsafe_typify<RCT>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - size_t sum = 0; - for (size_t i = 0; i < sz; ++i) { - sum += (lhs_vector[i] == rhs_vector[i]) ? 0 : 1; - } - return (double)sum; - } -}; - -} - -double -HammingDistance::calc(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) const -{ - constexpr auto expected = vespalib::eval::CellType::INT8; - if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { - const uint64_t *words_a = static_cast<const uint64_t *>(lhs.data); - const uint64_t *words_b = static_cast<const uint64_t *>(rhs.data); - size_t sum = 0; - size_t sz = lhs.size; - assert(sz == rhs.size); - size_t i = 0; - for (; i * 8 < sz; ++i) { - uint64_t xor_bits = words_a[i] ^ words_b[i]; - sum += __builtin_popcountl(xor_bits); - } - if (__builtin_expect((i * 8 < sz), false)) { - const uint8_t *bytes_a = static_cast<const uint8_t *>(lhs.data); - const uint8_t *bytes_b = static_cast<const uint8_t *>(rhs.data); - for (i *= 8; i < sz; ++i) { - uint64_t xor_bits = bytes_a[i] ^ bytes_b[i]; - sum += __builtin_popcountl(xor_bits); - } - } - return (double)sum; - } else { - return typify_invoke<2,TypifyCellType,CalcHamming>(lhs.type, rhs.type, lhs, rhs); - } -} - -} diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.h b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h deleted file mode 100644 index c2cd7af3863..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/hamming_distance.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "distance_function.h" -#include <vespa/eval/eval/typed_cells.h> -#include <vespa/vespalib/util/typify.h> -#include <cmath> - -namespace search::tensor { - -/** - * Calculates the Hamming distance defined as - * "number of cells where the values are different" - * or (for int8 cells, aka binary data only) - * "number of bits that are different" - */ -class HammingDistance : public DistanceFunction { -public: - HammingDistance() {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; - double convert_threshold(double threshold) const override { - return threshold; - } - double to_rawscore(double distance) const override { - double score = 1.0 / (1.0 + distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double) const override - { - // consider optimizing: - return calc(lhs, rhs); - } -}; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp deleted file mode 100644 index 8a48c6c936a..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "inner_product_distance.h" - -using vespalib::typify_invoke; -using vespalib::eval::TypifyCellType; - -namespace search::tensor { - -namespace { - -struct CalcInnerProduct { - template <typename LCT, typename RCT> - static double invoke(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) - { - auto lhs_vector = lhs.unsafe_typify<LCT>(); - auto rhs_vector = rhs.unsafe_typify<RCT>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - double dot_product = 0.0; - for (size_t i = 0; i < sz; ++i) { - double a = lhs_vector[i]; - double b = rhs_vector[i]; - dot_product += a*b; - } - double score = 1.0 - dot_product; // in range [0,2] - return std::max(0.0, score); - } -}; - -} - -double -InnerProductDistance::calc(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs) const -{ - return typify_invoke<2,TypifyCellType,CalcInnerProduct>(lhs.type, rhs.type, lhs, rhs); -} - -template class InnerProductDistanceHW<float>; -template class InnerProductDistanceHW<double>; - -} diff --git a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h b/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h deleted file mode 100644 index cb60d18c0f5..00000000000 --- a/searchlib/src/vespa/searchlib/tensor/inner_product_distance.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "distance_function.h" -#include <vespa/eval/eval/typed_cells.h> -#include <vespa/vespalib/hwaccelrated/iaccelrated.h> -#include <cmath> - -namespace search::tensor { - -/** - * Calculates inner-product "distance" between vectors with assumed norm 1. - * Should give same ordering as Angular distance, but is less expensive. - */ -class InnerProductDistance : public DistanceFunction { -public: - InnerProductDistance() {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override; - double convert_threshold(double threshold) const override { - return threshold; - } - double to_rawscore(double distance) const override { - double score = 1.0 / (1.0 + distance); - return score; - } - double calc_with_limit(const vespalib::eval::TypedCells& lhs, - const vespalib::eval::TypedCells& rhs, - double /*limit*/) const override - { - return calc(lhs, rhs); - } -}; - -/** - * Calculates inner-product "distance" between vectors with assumed norm 1. - * Should give same ordering as Angular distance, but is less expensive. - * Will use instruction optimal for the cpu it is running on - * when both vectors have the expected cell type. - */ -template <typename FloatType> -class InnerProductDistanceHW : public InnerProductDistance { -public: - InnerProductDistanceHW() - : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()) - {} - double calc(const vespalib::eval::TypedCells& lhs, const vespalib::eval::TypedCells& rhs) const override { - constexpr vespalib::eval::CellType expected = vespalib::eval::get_cell_type<FloatType>(); - if (__builtin_expect((lhs.type == expected && rhs.type == expected), true)) { - auto lhs_vector = lhs.unsafe_typify<FloatType>(); - auto rhs_vector = rhs.unsafe_typify<FloatType>(); - size_t sz = lhs_vector.size(); - assert(sz == rhs_vector.size()); - double score = 1.0 - _computer.dotProduct(&lhs_vector[0], &rhs_vector[0], sz); - return std::max(0.0, score); - } else { - return InnerProductDistance::calc(lhs, rhs); - } - } -private: - const vespalib::hwaccelrated::IAccelrated & _computer; -}; - -} |