From 4ff929491b2966e91cefdb473470a628488205c8 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 11 Oct 2016 21:19:38 +0000 Subject: Differentiate between avx, avx2 and avx512f --- .../src/vespa/vespalib/hwaccelrated/CMakeLists.txt | 4 +- .../src/vespa/vespalib/hwaccelrated/avx.cpp | 88 +-------------- .../src/vespa/vespalib/hwaccelrated/avx.h | 14 +-- .../src/vespa/vespalib/hwaccelrated/avx2.cpp | 86 +------------- .../src/vespa/vespalib/hwaccelrated/avx2.h | 13 +-- .../src/vespa/vespalib/hwaccelrated/avx512.cpp | 23 ++++ .../src/vespa/vespalib/hwaccelrated/avx512.h | 22 ++++ .../src/vespa/vespalib/hwaccelrated/avxprivate.hpp | 123 +++++++++++++++++++++ .../vespa/vespalib/hwaccelrated/iaccelrated.cpp | 10 +- .../src/vespa/vespalib/hwaccelrated/sse2.h | 4 +- 10 files changed, 194 insertions(+), 193 deletions(-) create mode 100644 staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp create mode 100644 staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.h create mode 100644 staging_vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp (limited to 'staging_vespalib') diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt index 2495c106277..1c80add3d8e 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt @@ -6,7 +6,9 @@ vespa_add_library(staging_vespalib_vespalib_hwaccelrated OBJECT sse2.cpp avx.cpp avx2.cpp + avx512.cpp DEPENDS ) set_source_files_properties(avx.cpp PROPERTIES COMPILE_FLAGS -march=sandybridge) -set_source_files_properties(avx2.cpp PROPERTIES COMPILE_FLAGS -march=broadwell) +set_source_files_properties(avx2.cpp PROPERTIES COMPILE_FLAGS -march=haswell) +set_source_files_properties(avx512.cpp PROPERTIES COMPILE_FLAGS -march=skylake) # should be skylake-avx512 when assembler supports it. 
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp index d9e0cc41882..ec5064bf647 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp @@ -1,104 +1,22 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -// Copyright (C) 2003 Fast Search & Transfer ASA -// Copyright (C) 2003 Overture Services Norway AS #include +#include namespace vespalib { namespace hwaccelrated { -namespace { - -bool validAlignment32(const void * p) { - return (reinterpret_cast(p) & 0x1ful) == 0; -} - -template -class TypeSpecifics { }; - -template <> -struct TypeSpecifics { - static constexpr const size_t V_SZ = 32; - typedef float V __attribute__ ((vector_size (V_SZ))); - static constexpr const size_t VectorsPerChunk = 4; - static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - static float sum(V v) { - return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7]; - } -}; - -template <> -struct TypeSpecifics { - static constexpr const size_t V_SZ = 32; - typedef double V __attribute__ ((vector_size (V_SZ))); - static constexpr const size_t VectorsPerChunk = 4; - static constexpr const V zero = {0.0, 0.0, 0.0, 0.0}; - static float sum(V v) { - return v[0] + v[1] + v[2] + v[3]; - } -}; - -} - -template -T -AvxAccelrator::computeDotProduct(const T * af, const T * bf, size_t sz) -{ - using TT = TypeSpecifics; - constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T); - constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk; - typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero}; - typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA))); - typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB))); - const A * a = reinterpret_cast(af); - const B * b = reinterpret_cast(bf); - - const size_t 
numChunks(sz/ChunkSize); - for (size_t i(0); i < numChunks; i++) { - for (size_t j(0); j < VectorsPerChunk; j++) { - partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; - } - } - T sum(0); - for (size_t i(numChunks*ChunkSize); i < sz; i++) { - sum += af[i] * bf[i]; - } - for (size_t i(1); i < VectorsPerChunk; i++) { - partial[0] += partial[i]; - } - return sum + TT::sum(partial[0]); -} - -template -T -AvxAccelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz) -{ - if (validAlignment32(af)) { - if (validAlignment32(bf)) { - return computeDotProduct(af, bf, sz); - } else { - return computeDotProduct(af, bf, sz); - } - } else { - if (validAlignment32(bf)) { - return computeDotProduct(af, bf, sz); - } else { - return computeDotProduct(af, bf, sz); - } - } -} - float AvxAccelrator::dotProduct(const float * af, const float * bf, size_t sz) const { - return dotProductSelectAlignment(af, bf, sz); + return avx::dotProductSelectAlignment(af, bf, sz); } double AvxAccelrator::dotProduct(const double * af, const double * bf, size_t sz) const { - return dotProductSelectAlignment(af, bf, sz); + return avx::dotProductSelectAlignment(af, bf, sz); } } diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h index ed833713c2d..4b391c163ac 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h @@ -1,29 +1,21 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -// Copyright (C) 2003 Fast Search & Transfer ASA -// Copyright (C) 2003 Overture Services Norway AS #pragma once #include -#include namespace vespalib { namespace hwaccelrated { /** - * Generic cpu agnostic implementation. + * Avx-256 implementation. 
*/ class AvxAccelrator : public Sse2Accelrator { public: - virtual float dotProduct(const float * a, const float * b, size_t sz) const; - virtual double dotProduct(const double * a, const double * b, size_t sz) const; -private: - template - VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz); - template - VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline)); + float dotProduct(const float * a, const float * b, size_t sz) const override; + double dotProduct(const double * a, const double * b, size_t sz) const override; }; } diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index 867b39ec326..f87738e3a6c 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -1,102 +1,22 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
#include +#include namespace vespalib { namespace hwaccelrated { -namespace { - -bool validAlignment32(const void * p) { - return (reinterpret_cast(p) & 0x1ful) == 0; -} - -template -class TypeSpecifics { }; - -template <> -struct TypeSpecifics { - static constexpr const size_t V_SZ = 32; - typedef float V __attribute__ ((vector_size (V_SZ))); - static constexpr const size_t VectorsPerChunk = 4; - static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - static float sum(V v) { - return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7]; - } -}; - -template <> -struct TypeSpecifics { - static constexpr const size_t V_SZ = 32; - typedef double V __attribute__ ((vector_size (V_SZ))); - static constexpr const size_t VectorsPerChunk = 4; - static constexpr const V zero = {0.0, 0.0, 0.0, 0.0}; - static float sum(V v) { - return v[0] + v[1] + v[2] + v[3]; - } -}; - -} - -template -T -Avx2Accelrator::computeDotProduct(const T * af, const T * bf, size_t sz) -{ - using TT = TypeSpecifics; - constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T); - constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk; - typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero}; - typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA))); - typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB))); - const A * a = reinterpret_cast(af); - const B * b = reinterpret_cast(bf); - - const size_t numChunks(sz/ChunkSize); - for (size_t i(0); i < numChunks; i++) { - for (size_t j(0); j < VectorsPerChunk; j++) { - partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; - } - } - T sum(0); - for (size_t i(numChunks*ChunkSize); i < sz; i++) { - sum += af[i] * bf[i]; - } - for (size_t i(1); i < VectorsPerChunk; i++) { - partial[0] += partial[i]; - } - return sum + TT::sum(partial[0]); -} - -template -T -Avx2Accelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz) -{ - if (validAlignment32(af)) { - if 
(validAlignment32(bf)) { - return computeDotProduct(af, bf, sz); - } else { - return computeDotProduct(af, bf, sz); - } - } else { - if (validAlignment32(bf)) { - return computeDotProduct(af, bf, sz); - } else { - return computeDotProduct(af, bf, sz); - } - } -} - float Avx2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const { - return dotProductSelectAlignment(af, bf, sz); + return avx::dotProductSelectAlignment(af, bf, sz); } double Avx2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const { - return dotProductSelectAlignment(af, bf, sz); + return avx::dotProductSelectAlignment(af, bf, sz); } } diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index e434bfc84e6..56d3a8ac65e 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -1,6 +1,4 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -// Copyright (C) 2003 Fast Search & Transfer ASA -// Copyright (C) 2003 Overture Services Norway AS #pragma once @@ -11,18 +9,13 @@ namespace vespalib { namespace hwaccelrated { /** - * Generic cpu agnostic implementation. + * Avx2 implementation. 
*/ class Avx2Accelrator : public AvxAccelrator { public: - virtual float dotProduct(const float * a, const float * b, size_t sz) const; - virtual double dotProduct(const double * a, const double * b, size_t sz) const; -private: - template - VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz); - template - VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline)); + float dotProduct(const float * a, const float * b, size_t sz) const override; + double dotProduct(const double * a, const double * b, size_t sz) const override; }; } diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp new file mode 100644 index 00000000000..9f7a6dcda3e --- /dev/null +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -0,0 +1,23 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include +#include + +namespace vespalib { + +namespace hwaccelrated { + +float +Avx512Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const +{ + return avx::dotProductSelectAlignment(af, bf, sz); +} + +double +Avx512Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const +{ + return avx::dotProductSelectAlignment(af, bf, sz); +} + +} +} diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.h new file mode 100644 index 00000000000..5d7028c30ba --- /dev/null +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -0,0 +1,22 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include + +namespace vespalib { + +namespace hwaccelrated { + +/** + * Avx-512 implementation. 
+ */ +class Avx512Accelrator : public Avx2Accelrator +{ +public: + float dotProduct(const float * a, const float * b, size_t sz) const override; + double dotProduct(const double * a, const double * b, size_t sz) const override; +}; + +} +} diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp new file mode 100644 index 00000000000..5491fe1eef7 --- /dev/null +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp @@ -0,0 +1,123 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include + +namespace vespalib { + +namespace hwaccelrated { + +namespace avx { + +namespace { + +inline bool validAlignment(const void * p, const size_t align) { + return (reinterpret_cast(p) & (align-1)) == 0; +} + +template +T sumT(const V & v) { + T sum(0); + for (size_t i(0); i < (sizeof(V)/sizeof(T)); i++) { + sum += v[i]; + } + return sum; +} + +template +class TypeSpecifics { }; + +template <> +struct TypeSpecifics { + static constexpr const size_t V_SZ = 32u; + typedef float V __attribute__ ((vector_size (V_SZ))); + static constexpr const size_t VectorsPerChunk = 4; + static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + static float sum(const V & v) { return sumT(v); } +}; + +template <> +struct TypeSpecifics { + static constexpr const size_t V_SZ = 32u; + typedef double V __attribute__ ((vector_size (V_SZ))); + static constexpr const size_t VectorsPerChunk = 4; + static constexpr const V zero = {0.0, 0.0, 0.0, 0.0}; + static double sum(const V & v) { return sumT(v); } +}; + +template <> +struct TypeSpecifics { + static constexpr const size_t V_SZ = 64u; + typedef float V __attribute__ ((vector_size (V_SZ))); + static constexpr const size_t VectorsPerChunk = 4; + static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; 
+ static float sum(const V & v) { return sumT(v); } +}; + +template <> +struct TypeSpecifics { + static constexpr const size_t V_SZ = 64u; + typedef double V __attribute__ ((vector_size (V_SZ))); + static constexpr const size_t VectorsPerChunk = 4; + static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + static double sum(const V & v) { return sumT(v); } +}; + +template +static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline)); + +template +T computeDotProduct(const T * af, const T * bf, size_t sz) +{ + using TT = TypeSpecifics; + constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T); + constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk; + typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero}; + typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA))); + typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB))); + const A * a = reinterpret_cast(af); + const B * b = reinterpret_cast(bf); + + const size_t numChunks(sz/ChunkSize); + for (size_t i(0); i < numChunks; i++) { + for (size_t j(0); j < VectorsPerChunk; j++) { + partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; + } + } + T sum(0); + for (size_t i(numChunks*ChunkSize); i < sz; i++) { + sum += af[i] * bf[i]; + } + for (size_t i(1); i < VectorsPerChunk; i++) { + partial[0] += partial[i]; + } + return sum + TT::sum(partial[0]); +} + +} + +template +VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz); + +template +T dotProductSelectAlignment(const T * af, const T * bf, size_t sz) +{ + if (validAlignment(af, VLEN)) { + if (validAlignment(bf, VLEN)) { + return computeDotProduct(af, bf, sz); + } else { + return computeDotProduct(af, bf, sz); + } + } else { + if (validAlignment(bf, VLEN)) { + return computeDotProduct(af, bf, sz); + } else { + return computeDotProduct(af, bf, sz); + } + } +} + +} +} +} diff --git 
a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index 769d40fd0ec..aede024f5af 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace vespalib { @@ -41,6 +42,11 @@ public: virtual IAccelrated::UP create() const { return IAccelrated::UP(new Avx2Accelrator()); } }; +class Avx512Factory :public Factory{ +public: + virtual IAccelrated::UP create() const { return IAccelrated::UP(new Avx512Accelrator()); } +}; + template void verifyAccelrator(const IAccelrated & accel) { @@ -95,7 +101,9 @@ Selector::Selector() : _factory(new GenericFactory()) { __builtin_cpu_init (); - if (__builtin_cpu_supports("avx2")) { + if (__builtin_cpu_supports("avx512f")) { + _factory.reset(new Avx512Factory()); + } else if (__builtin_cpu_supports("avx2")) { _factory.reset(new Avx2Factory()); } else if (__builtin_cpu_supports("avx")) { _factory.reset(new AvxFactory()); diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/sse2.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/sse2.h index 86fbc41a486..a7c39581997 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/sse2.h +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/sse2.h @@ -16,8 +16,8 @@ namespace hwaccelrated { class Sse2Accelrator : public GenericAccelrator { public: - virtual float dotProduct(const float * a, const float * b, size_t sz) const; - virtual double dotProduct(const double * a, const double * b, size_t sz) const; + float dotProduct(const float * a, const float * b, size_t sz) const override; + double dotProduct(const double * a, const double * b, size_t sz) const override; }; } -- cgit v1.2.3