diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2020-03-10 21:45:14 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-03-10 21:45:14 +0100 |
commit | 811cb02ae056b5b61520fa578ec5a90bcb6a4c4f (patch) | |
tree | 0f21ed18c624c66cfcf6b8ad57b7821eba8baaed /vespalib | |
parent | 2b6323bdd0f2dbf825d65d6527bbcbcb92d526ae (diff) | |
parent | 8d52222315d29507930750941e268491ade98934 (diff) |
Merge pull request #12527 from vespa-engine/balder/compile-euclidian-distance-for-avx2-and-avx512
Balder/compile euclidian distance for avx2 and avx512
Diffstat (limited to 'vespalib')
18 files changed, 219 insertions, 109 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index 98d50bcefba..2675bc16bf2 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -58,6 +58,7 @@ vespa_define_module( src/tests/gencnt src/tests/guard src/tests/host_name + src/tests/hwaccelrated src/tests/io/fileutil src/tests/io/mapped_file_input src/tests/latch diff --git a/vespalib/src/tests/hwaccelrated/CMakeLists.txt b/vespalib/src/tests/hwaccelrated/CMakeLists.txt new file mode 100644 index 00000000000..49501341098 --- /dev/null +++ b/vespalib/src/tests/hwaccelrated/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_hwaccelrated_test_app TEST + SOURCES + hwaccelrated_test.cpp + DEPENDS + vespalib +) +vespa_add_test(NAME vespalib_hwaccelrated_test_app COMMAND vespalib_hwaccelrated_test_app) diff --git a/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp b/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp new file mode 100644 index 00000000000..421f3f6f520 --- /dev/null +++ b/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp @@ -0,0 +1,40 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> +#include <vespa/vespalib/hwaccelrated/generic.h> + +using namespace vespalib; + +template<typename T> +std::vector<T> createAndFill(size_t sz) { + std::vector<T> v(sz); + for (size_t i(0); i < sz; i++) { + v[i] = rand()%500; + } + return v; +} + +template<typename T> +void verifyEuclideanDistance(const hwaccelrated::IAccelrated & accel) { + const size_t testLength(255); + srand(1); + std::vector<T> a = createAndFill<T>(testLength); + std::vector<T> b = createAndFill<T>(testLength); + for (size_t j(0); j < 0x20; j++) { + T sum(0); + for (size_t i(j); i < testLength; i++) { + sum += (a[i] - b[i]) * (a[i] - b[i]); + } + T hwComputedSum(accel.squaredEuclideanDistance(&a[j], &b[j], testLength - j)); + EXPECT_EQUAL(sum, hwComputedSum); + } +} + +TEST("test euclidean distance") { + hwaccelrated::GenericAccelrator genericAccelrator; + verifyEuclideanDistance<float>(genericAccelrator); + verifyEuclideanDistance<double >(genericAccelrator); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt b/vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt index f6b38028471..ac9d8d76074 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt @@ -3,8 +3,6 @@ vespa_add_library(vespalib_vespalib_hwaccelrated OBJECT SOURCES iaccelrated.cpp generic.cpp - sse2.cpp - avx.cpp avx2.cpp avx512.cpp DEPENDS diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp deleted file mode 100644 index ec6dc164323..00000000000 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "avx.h" -#include "avxprivate.hpp" - -namespace vespalib::hwaccelrated { - -size_t -AvxAccelrator::populationCount(const uint64_t *a, size_t sz) const { - return helper::populationCount(a, sz); -} - -} diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx.h deleted file mode 100644 index e7f090b4695..00000000000 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "sse2.h" - -namespace vespalib::hwaccelrated { - -/** - * Avx-256 implementation. - */ -class AvxAccelrator : public Sse2Accelrator -{ -public: - size_t populationCount(const uint64_t *a, size_t sz) const override; -}; - -} diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index f0d03a995e4..7ff393c87f8 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -10,4 +10,14 @@ Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); } +double +Avx2Accelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const { + return avx::euclideanDistanceSelectAlignment<float, 32>(a, b, sz); +} + +double +Avx2Accelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const { + return avx::euclideanDistanceSelectAlignment<double, 32>(a, b, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index 7e1784698f1..3e0dbb28110 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -2,17 +2,19 @@ #pragma once -#include "avx.h" +#include "generic.h" namespace vespalib::hwaccelrated { /** * Avx-512 implementation. */ -class Avx2Accelrator : public AvxAccelrator +class Avx2Accelrator : public GenericAccelrator { public: size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; + double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp index 1abf6b270cf..0941e6d6ad8 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -22,4 +22,14 @@ Avx512Accelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); } +double +Avx512Accelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const { + return avx::euclideanDistanceSelectAlignment<float, 64>(a, b, sz); +} + +double +Avx512Accelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const { + return avx::euclideanDistanceSelectAlignment<double, 64>(a, b, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h index eac8c96832b..209ec06c857 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -15,6 +15,8 @@ public: float dotProduct(const float * a, const float * b, size_t sz) const override; double dotProduct(const double * a, const double * b, size_t sz) const override; size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; + double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp index 9e6a6d8817f..087729a23b2 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp @@ -4,7 +4,6 @@ #include "private_helpers.hpp" #include <vespa/fastos/dynamiclibrary.h> -#include <cstring> namespace vespalib::hwaccelrated::avx { @@ -87,4 +86,52 @@ T dotProductSelectAlignment(const T * af, const T * bf, size_t sz) } } +template <typename T, unsigned VLEN, unsigned AlignA, unsigned AlignB> +double +euclideanDistanceT(const T * af, const T * bf, size_t sz) +{ + constexpr unsigned VectorsPerChunk = 4; + constexpr unsigned ChunkSize = VLEN*VectorsPerChunk/sizeof(T); + typedef T V __attribute__ ((vector_size (VLEN))); + typedef T A __attribute__ ((vector_size (VLEN), aligned(AlignA))); + typedef T B __attribute__ ((vector_size (VLEN), aligned(AlignB))); + V partial[VectorsPerChunk]; + memset(partial, 0, sizeof(partial)); + const A * a = reinterpret_cast<const A *>(af); + const B * b = reinterpret_cast<const B *>(bf); + + const size_t numChunks(sz/ChunkSize); + for (size_t i(0); i < numChunks; i++) { + for (size_t j(0); j < VectorsPerChunk; j++) { + partial[j] += (a[VectorsPerChunk*i+j] - b[VectorsPerChunk*i+j]) * (a[VectorsPerChunk*i+j] - b[VectorsPerChunk*i+j]); + } + } + double sum(0); + for (size_t i(numChunks*ChunkSize); i < sz; i++) { + sum += (af[i] - bf[i]) * (af[i] - bf[i]); + } + partial[0] = sumR<V, VectorsPerChunk>(partial); + + return sum + sumT<T, V>(partial[0]); +} + +template <typename T, unsigned VLEN> +double euclideanDistanceSelectAlignment(const T * af, const T * bf, size_t sz) +{ + constexpr unsigned ALIGN = 32; + if (validAlignment(af, ALIGN)) { + if (validAlignment(bf, ALIGN)) { + return euclideanDistanceT<T, VLEN, ALIGN, ALIGN>(af, bf, sz); + } else { + return euclideanDistanceT<T, ALIGN, ALIGN, 1>(af, bf, sz); + } + } else { + if (validAlignment(bf, ALIGN)) { + return euclideanDistanceT<T, VLEN, 1, ALIGN>(af, bf, sz); + } else { + return euclideanDistanceT<T, VLEN, 1, 1>(af, bf, sz); + } + } +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index 4e02ca371f0..f9684e88c63 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -32,6 +32,30 @@ multiplyAdd(const T * a, const T * b, size_t sz) return sum; } +template <typename T, size_t UNROLL> +double +euclideanDistanceT(const T * a, const T * b, size_t sz) +{ + T partial[UNROLL]; + for (size_t i(0); i < UNROLL; i++) { + partial[i] = 0; + } + size_t i(0); + for (; i + UNROLL <= sz; i += UNROLL) { + for (size_t j(0); j < UNROLL; j++) { + partial[j] += (a[i+j] - b[i+j]) * (a[i+j] - b[i+j]); + } + } + for (;i < sz; i++) { + partial[i%UNROLL] += (a[i] - b[i]) * (a[i] - b[i]); + } + double sum(0); + for (size_t j(0); j < UNROLL; j++) { + sum += partial[j]; + } + return sum; +} + template<size_t UNROLL, typename Operation> void bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) { @@ -131,4 +155,14 @@ GenericAccelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); } +double +GenericAccelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const { + return euclideanDistanceT<float, 8>(a, b, sz); +} + +double +GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const { + return euclideanDistanceT<double, 4>(a, b, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h index d76d0728bdd..50a3d59d49d 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h @@ -23,6 +23,8 @@ public: void andNotBit(void * a, const void * b, size_t bytes) const override; void notBit(void * a, size_t bytes) const override; size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; + double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index 061963f95a4..86bd4d2c7b2 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -2,12 +2,11 @@ #include "iaccelrated.h" #include "generic.h" -#include "sse2.h" -#include "avx.h" #include "avx2.h" #include "avx512.h" #include <vespa/vespalib/util/memory.h> #include <cstdio> +#include <vector> #include <vespa/log/log.h> LOG_SETUP(".vespalib.hwaccelrated"); @@ -18,7 +17,7 @@ namespace { class Factory { public: - virtual ~Factory() { } + virtual ~Factory() = default; virtual IAccelrated::UP create() const = 0; }; @@ -27,16 +26,6 @@ public: IAccelrated::UP create() const override { return std::make_unique<GenericAccelrator>(); } }; -class Sse2Factory :public Factory{ -public: - IAccelrated::UP create() const override { return std::make_unique<Sse2Accelrator>(); } -}; - -class AvxFactory :public Factory{ -public: - IAccelrated::UP create() const override { return std::make_unique<AvxAccelrator>(); } -}; - class Avx2Factory :public Factory{ public: IAccelrated::UP create() const override { return std::make_unique<Avx2Accelrator>(); } @@ -48,16 +37,25 @@ public: }; template<typename T> -void verifyAccelrator(const IAccelrated & accel) +std::vector<T> createAndFill(size_t sz) { + std::vector<T> v(sz); + for (size_t i(0); i < sz; i++) { + v[i] = rand()%100; + } + return v; +} + +template<typename T> +void verifyDotproduct(const IAccelrated & accel) { - const size_t testLength(127); - T * a = new T[testLength]; - T * b = new T[testLength]; + const size_t testLength(255); + srand(1); + std::vector<T> a = createAndFill<T>(testLength); + std::vector<T> b = createAndFill<T>(testLength); for (size_t j(0); j < 0x20; j++) { T sum(0); for (size_t i(j); i < testLength; i++) { - a[i] = b[i] = i; - sum += i*i; + sum += a[i]*b[i]; } T hwComputedSum(accel.dotProduct(&a[j], &b[j], testLength - j)); if (sum != hwComputedSum) { @@ -65,8 +63,25 @@ void verifyAccelrator(const IAccelrated & accel) LOG_ABORT("should not be reached"); } } - delete [] a; - delete [] b; +} + +template<typename T> +void verifyEuclidianDistance(const IAccelrated & accel) { + const size_t testLength(255); + srand(1); + std::vector<T> a = createAndFill<T>(testLength); + std::vector<T> b = createAndFill<T>(testLength); + for (size_t j(0); j < 0x20; j++) { + T sum(0); + for (size_t i(j); i < testLength; i++) { + sum += (a[i] - b[i]) * (a[i] - b[i]); + } + T hwComputedSum(accel.squaredEuclideanDistance(&a[j], &b[j], testLength - j)); + if (sum != hwComputedSum) { + fprintf(stderr, "Accelrator is not computing euclidian distance correctly.\n"); + LOG_ABORT("should not be reached"); + } + } } void verifyPopulationCount(const IAccelrated & accel) @@ -90,23 +105,25 @@ class RuntimeVerificator { public: RuntimeVerificator(); +private: + void verify(IAccelrated & accelrated) { + verifyDotproduct<float>(accelrated); + verifyDotproduct<double>(accelrated); + verifyDotproduct<int32_t>(accelrated); + verifyDotproduct<int64_t>(accelrated); + verifyEuclidianDistance<float>(accelrated); + verifyEuclidianDistance<double>(accelrated); + verifyPopulationCount(accelrated); + } }; RuntimeVerificator::RuntimeVerificator() { - GenericAccelrator generic; - verifyAccelrator<float>(generic); - verifyAccelrator<double>(generic); - verifyAccelrator<int32_t>(generic); - verifyAccelrator<int64_t>(generic); - verifyPopulationCount(generic); - - IAccelrated::UP thisCpu(IAccelrated::getAccelrator()); - verifyAccelrator<float>(*thisCpu); - verifyAccelrator<double>(*thisCpu); - verifyAccelrator<int32_t>(*thisCpu); - verifyAccelrator<int64_t>(*thisCpu); - + GenericAccelrator generic; + verify(generic); + + IAccelrated::UP thisCpu(IAccelrated::getAccelrator()); + verify(*thisCpu); } class Selector @@ -119,17 +136,15 @@ private: }; Selector::Selector() : - _factory(new GenericFactory()) + _factory() { __builtin_cpu_init (); if (__builtin_cpu_supports("avx512f")) { - _factory.reset(new Avx512Factory()); + _factory = std::make_unique<Avx512Factory>(); } else if (__builtin_cpu_supports("avx2")) { - _factory.reset(new Avx2Factory()); - } else if (__builtin_cpu_supports("avx")) { - _factory.reset(new AvxFactory()); - } else if (__builtin_cpu_supports("sse2")) { - _factory.reset(new Sse2Factory()); + _factory = std::make_unique<Avx2Factory>(); + } else { + _factory = std::make_unique<GenericFactory>(); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h index 4031169c44d..ea7e268bc6f 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h @@ -27,6 +27,8 @@ public: virtual void andNotBit(void * a, const void * b, size_t bytes) const = 0; virtual void notBit(void * a, size_t bytes) const = 0; virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0; + virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const = 0; + virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const = 0; static IAccelrated::UP getAccelrator() __attribute__((noinline)); }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp index 8eba313d5f1..f5daf2b9081 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp @@ -3,6 +3,7 @@ #pragma once #include <vespa/vespalib/util/optimized.h> +#include <cstring> namespace vespalib::hwaccelrated::helper { namespace { @@ -24,4 +25,4 @@ populationCount(const uint64_t *a, size_t sz) { } } -}
\ No newline at end of file +} diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp deleted file mode 100644 index 64a26d49f2b..00000000000 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "sse2.h" -#include "private_helpers.hpp" - -namespace vespalib::hwaccelrated { - -size_t -Sse2Accelrator::populationCount(const uint64_t *a, size_t sz) const { - return helper::populationCount(a, sz); -} - -} diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h deleted file mode 100644 index 0b2462423b1..00000000000 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "generic.h" - -namespace vespalib::hwaccelrated { - -/** - * Generic cpu agnostic implementation. - */ -class Sse2Accelrator : public GenericAccelrator -{ -public: - size_t populationCount(const uint64_t *a, size_t sz) const override; -}; - -} |