From 73b16bba68712e47d4f820a67e2c0e7d74d9eb1b Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 10 Mar 2020 10:30:47 +0000 Subject: Add a euclidian distance that is optimal for avx, avx2 and avx512. --- vespalib/CMakeLists.txt | 1 + vespalib/src/tests/hwaccelrated/CMakeLists.txt | 8 ++ .../src/tests/hwaccelrated/hwaccelrated_test.cpp | 39 +++++++++ .../src/vespa/vespalib/hwaccelrated/CMakeLists.txt | 2 - vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp | 13 --- vespalib/src/vespa/vespalib/hwaccelrated/avx.h | 18 ----- vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp | 10 +++ vespalib/src/vespa/vespalib/hwaccelrated/avx2.h | 6 +- .../src/vespa/vespalib/hwaccelrated/avx512.cpp | 10 +++ vespalib/src/vespa/vespalib/hwaccelrated/avx512.h | 2 + .../src/vespa/vespalib/hwaccelrated/generic.cpp | 10 +++ vespalib/src/vespa/vespalib/hwaccelrated/generic.h | 2 + .../vespa/vespalib/hwaccelrated/iaccelrated.cpp | 93 ++++++++++++---------- .../src/vespa/vespalib/hwaccelrated/iaccelrated.h | 2 + .../vespalib/hwaccelrated/private_helpers.hpp | 26 +++++- vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp | 13 --- vespalib/src/vespa/vespalib/hwaccelrated/sse2.h | 18 ----- 17 files changed, 166 insertions(+), 107 deletions(-) create mode 100644 vespalib/src/tests/hwaccelrated/CMakeLists.txt create mode 100644 vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp delete mode 100644 vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp delete mode 100644 vespalib/src/vespa/vespalib/hwaccelrated/avx.h delete mode 100644 vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp delete mode 100644 vespalib/src/vespa/vespalib/hwaccelrated/sse2.h (limited to 'vespalib') diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index 3530fb816df..2129649f81c 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -58,6 +58,7 @@ vespa_define_module( src/tests/gencnt src/tests/guard src/tests/host_name + src/tests/hwaccelrated src/tests/io/fileutil src/tests/io/mapped_file_input src/tests/latch diff --git a/vespalib/src/tests/hwaccelrated/CMakeLists.txt b/vespalib/src/tests/hwaccelrated/CMakeLists.txt new file mode 100644 index 00000000000..9dda949d088 --- /dev/null +++ b/vespalib/src/tests/hwaccelrated/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_hwaccelrated_test_app TEST + SOURCES + hwaccelrated_test.cpp + DEPENDS + vespalib +) +vespa_add_test(NAME vespalib_hwaccelrated_test_app COMMAND vespalib_hwaccelrated_test_app) diff --git a/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp b/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp new file mode 100644 index 00000000000..757626ead94 --- /dev/null +++ b/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp @@ -0,0 +1,39 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include +#include +#include + +using namespace vespalib; + +template +std::vector createAndFill(size_t sz) { + std::vector v(sz); + for (size_t i(0); i < sz; i++) { + v[i] = i; + } + return v; +} + +template +void verifyEuclidianDistance(const hwaccelrated::IAccelrated & accel) { + const size_t testLength(255); + std::vector a = createAndFill(testLength); + std::vector b = createAndFill(testLength); + for (size_t j(0); j < 0x20; j++) { + T sum(0); + for (size_t i(j); i < testLength; i++) { + sum += (a[i] - b[i]) * (a[i] - b[i]); + } + T hwComputedSum(accel.squaredEuclidianDistance(&a[j], &b[j], testLength - j)); + EXPECT_EQUAL (sum, hwComputedSum); + } +} + +TEST("test euclidian distance") { + hwaccelrated::GenericAccelrator genericAccelrator; + verifyEuclidianDistance(genericAccelrator); + verifyEuclidianDistance(genericAccelrator); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt b/vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt index f6b38028471..ac9d8d76074 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt @@ -3,8 +3,6 @@ vespa_add_library(vespalib_vespalib_hwaccelrated OBJECT SOURCES iaccelrated.cpp generic.cpp - sse2.cpp - avx.cpp avx2.cpp avx512.cpp DEPENDS diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp deleted file mode 100644 index ec6dc164323..00000000000 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "avx.h" -#include "avxprivate.hpp" - -namespace vespalib::hwaccelrated { - -size_t -AvxAccelrator::populationCount(const uint64_t *a, size_t sz) const { - return helper::populationCount(a, sz); -} - -} diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx.h deleted file mode 100644 index e7f090b4695..00000000000 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "sse2.h" - -namespace vespalib::hwaccelrated { - -/** - * Avx-256 implementation. - */ -class AvxAccelrator : public Sse2Accelrator -{ -public: - size_t populationCount(const uint64_t *a, size_t sz) const override; -}; - -} diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index f0d03a995e4..cd80261fc76 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -10,4 +10,14 @@ Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); } +double +Avx2Accelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const { + return helper::euclidianDistanceT(a, b, sz); +} + +double +Avx2Accelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const { + return helper::euclidianDistanceT(a, b, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index 7e1784698f1..68c0bb5695c 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -2,17 +2,19 @@ #pragma once -#include "avx.h" +#include "generic.h" namespace vespalib::hwaccelrated { /** * Avx-512 implementation. */ -class Avx2Accelrator : public AvxAccelrator +class Avx2Accelrator : public GenericAccelrator { public: size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclidianDistance(const float * a, const float * b, size_t sz) const override; + double squaredEuclidianDistance(const double * a, const double * b, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp index 1abf6b270cf..747119d7a8e 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -22,4 +22,14 @@ Avx512Accelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); } +double +Avx512Accelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const { + return helper::euclidianDistanceT(a, b, sz); +} + +double +Avx512Accelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const { + return helper::euclidianDistanceT(a, b, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h index eac8c96832b..78712811b8b 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -15,6 +15,8 @@ public: float dotProduct(const float * a, const float * b, size_t sz) const override; double dotProduct(const double * a, const double * b, size_t sz) const override; size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclidianDistance(const float * a, const float * b, size_t sz) const override; + double squaredEuclidianDistance(const double * a, const double * b, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index 4e02ca371f0..7018f37d49a 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -131,4 +131,14 @@ GenericAccelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); } +double +GenericAccelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const { + return helper::euclidianDistanceT(a, b, sz); +} + +double +GenericAccelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const { + return helper::euclidianDistanceT(a, b, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h index d76d0728bdd..224394a9595 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h @@ -23,6 +23,8 @@ public: void andNotBit(void * a, const void * b, size_t bytes) const override; void notBit(void * a, size_t bytes) const override; size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclidianDistance(const float * a, const float * b, size_t sz) const override; + double squaredEuclidianDistance(const double * a, const double * b, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index 061963f95a4..b298ffdc4af 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -2,12 +2,11 @@ #include "iaccelrated.h" #include "generic.h" -#include "sse2.h" -#include "avx.h" #include "avx2.h" #include "avx512.h" #include #include +#include #include LOG_SETUP(".vespalib.hwaccelrated"); @@ -18,7 +17,7 @@ namespace { class Factory { public: - virtual ~Factory() { } + virtual ~Factory() = default; virtual IAccelrated::UP create() const = 0; }; @@ -27,16 +26,6 @@ public: IAccelrated::UP create() const override { return std::make_unique(); } }; -class Sse2Factory :public Factory{ -public: - IAccelrated::UP create() const override { return std::make_unique(); } -}; - -class AvxFactory :public Factory{ -public: - IAccelrated::UP create() const override { return std::make_unique(); } -}; - class Avx2Factory :public Factory{ public: IAccelrated::UP create() const override { return std::make_unique(); } @@ -48,15 +37,23 @@ public: }; template -void verifyAccelrator(const IAccelrated & accel) +std::vector createAndFill(size_t sz) { + std::vector v(sz); + for (size_t i(0); i < sz; i++) { + v[i] = i; + } + return v; +} + +template +void verifyDotproduct(const IAccelrated & accel) { - const size_t testLength(127); - T * a = new T[testLength]; - T * b = new T[testLength]; + const size_t testLength(255); + std::vector a = createAndFill(testLength); + std::vector b = createAndFill(testLength); for (size_t j(0); j < 0x20; j++) { T sum(0); for (size_t i(j); i < testLength; i++) { - a[i] = b[i] = i; sum += i*i; } T hwComputedSum(accel.dotProduct(&a[j], &b[j], testLength - j)); @@ -65,8 +62,24 @@ void verifyAccelrator(const IAccelrated & accel) LOG_ABORT("should not be reached"); } } - delete [] a; - delete [] b; +} + +template +void verifyEuclidianDistance(const IAccelrated & accel) { + const size_t testLength(255); + std::vector a = createAndFill(testLength); + std::vector b = createAndFill(testLength); + for (size_t j(0); j < 0x20; j++) { + T sum(0); + for (size_t i(j); i < testLength; i++) { + sum += (a[i] - b[i]) * (a[i] - b[i]); + } + T hwComputedSum(accel.squaredEuclidianDistance(&a[j], &b[j], testLength - j)); + if (sum != hwComputedSum) { + fprintf(stderr, "Accelrator is not computing euclidian distance correctly.\n"); + LOG_ABORT("should not be reached"); + } + } } void verifyPopulationCount(const IAccelrated & accel) @@ -90,23 +103,25 @@ class RuntimeVerificator { public: RuntimeVerificator(); +private: + void verify(IAccelrated & accelrated) { + verifyDotproduct(accelrated); + verifyDotproduct(accelrated); + verifyDotproduct(accelrated); + verifyDotproduct(accelrated); + verifyEuclidianDistance(accelrated); + verifyEuclidianDistance(accelrated); + verifyPopulationCount(accelrated); + } }; RuntimeVerificator::RuntimeVerificator() { - GenericAccelrator generic; - verifyAccelrator(generic); - verifyAccelrator(generic); - verifyAccelrator(generic); - verifyAccelrator(generic); - verifyPopulationCount(generic); - - IAccelrated::UP thisCpu(IAccelrated::getAccelrator()); - verifyAccelrator(*thisCpu); - verifyAccelrator(*thisCpu); - verifyAccelrator(*thisCpu); - verifyAccelrator(*thisCpu); - + GenericAccelrator generic; + verify(generic); + + IAccelrated::UP thisCpu(IAccelrated::getAccelrator()); + verify(*thisCpu); } class Selector @@ -119,17 +134,15 @@ private: }; Selector::Selector() : - _factory(new GenericFactory()) + _factory() { __builtin_cpu_init (); if (__builtin_cpu_supports("avx512f")) { - _factory.reset(new Avx512Factory()); + _factory = std::make_unique(); } else if (__builtin_cpu_supports("avx2")) { - _factory.reset(new Avx2Factory()); - } else if (__builtin_cpu_supports("avx")) { - _factory.reset(new AvxFactory()); - } else if (__builtin_cpu_supports("sse2")) { - _factory.reset(new Sse2Factory()); + _factory = std::make_unique(); + } else { + _factory = std::make_unique(); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h index 4031169c44d..61739e1837a 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h @@ -27,6 +27,8 @@ public: virtual void andNotBit(void * a, const void * b, size_t bytes) const = 0; virtual void notBit(void * a, size_t bytes) const = 0; virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0; + virtual double squaredEuclidianDistance(const float * a, const float * b, size_t sz) const = 0; + virtual double squaredEuclidianDistance(const double * a, const double * b, size_t sz) const = 0; static IAccelrated::UP getAccelrator() __attribute__((noinline)); }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp index 8eba313d5f1..6a70e14c290 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp @@ -23,5 +23,29 @@ populationCount(const uint64_t *a, size_t sz) { return count; } +template +double +euclidianDistanceT(const T * a, const T * b, size_t sz) +{ + T partial[UNROLL]; + for (size_t i(0); i < UNROLL; i++) { + partial[i] = 0; + } + size_t i(0); + for (; i + UNROLL <= sz; i += UNROLL) { + for (size_t j(0); j < UNROLL; j++) { + partial[j] += (a[i+j] - b[i+j]) * (a[i+j] - b[i+j]); + } + } + for (;i < sz; i++) { + partial[i%UNROLL] += (a[i] - b[i]) * (a[i] - b[i]); + } + double sum(0); + for (size_t j(0); j < UNROLL; j++) { + sum += partial[j]; + } + return sum; +} + +} } -} \ No newline at end of file diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp deleted file mode 100644 index 64a26d49f2b..00000000000 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "sse2.h" -#include "private_helpers.hpp" - -namespace vespalib::hwaccelrated { - -size_t -Sse2Accelrator::populationCount(const uint64_t *a, size_t sz) const { - return helper::populationCount(a, sz); -} - -} diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h deleted file mode 100644 index 0b2462423b1..00000000000 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "generic.h" - -namespace vespalib::hwaccelrated { - -/** - * Generic cpu agnostic implementation. - */ -class Sse2Accelrator : public GenericAccelrator -{ -public: - size_t populationCount(const uint64_t *a, size_t sz) const override; -}; - -} -- cgit v1.2.3