diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2016-10-11 15:41:58 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2016-10-11 15:41:58 +0000 |
commit | b12b624cde3cc4c8137ce6fce8599d3243c72e7d (patch) | |
tree | 2f2197fa0f0a1d663d37f0357403d94293357992 /staging_vespalib | |
parent | 1bdee48d21521dc7915d39981483564bea686a9c (diff) |
templatize the dotproduct for avx and add skeleton for avx2
Diffstat (limited to 'staging_vespalib')
6 files changed, 182 insertions, 61 deletions
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt index eb59210d413..2495c106277 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt @@ -5,6 +5,8 @@ vespa_add_library(staging_vespalib_vespalib_hwaccelrated OBJECT generic.cpp sse2.cpp avx.cpp + avx2.cpp DEPENDS ) set_source_files_properties(avx.cpp PROPERTIES COMPILE_FLAGS -march=sandybridge) +set_source_files_properties(avx2.cpp PROPERTIES COMPILE_FLAGS -march=broadwell) diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp index 00708edf5b7..d9e0cc41882 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp @@ -14,50 +14,43 @@ bool validAlignment32(const void * p) { return (reinterpret_cast<uint64_t>(p) & 0x1ful) == 0; } -} - -template <unsigned AlignA, unsigned AlignB> -float -AvxAccelrator::computeDotProduct(const float * af, const float * bf, size_t sz) -{ - typedef float v8saf __attribute__ ((vector_size (32))); - const size_t ChunkSize(32); - const size_t VectorsPerChunk(ChunkSize/8); - v8saf partial[VectorsPerChunk] = { {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, - {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0} }; - typedef float A __attribute__ ((vector_size (32), aligned(AlignA))); - typedef float B __attribute__ ((vector_size (32), aligned(AlignB))); - const A * a = reinterpret_cast<const A *>(af); - const B * b = reinterpret_cast<const B *>(bf); +template <typename T> +class TypeSpecifics { }; - const size_t numChunks(sz/ChunkSize); - for (size_t i(0); i < numChunks; i++) { - for (size_t j(0); j < VectorsPerChunk; j++) { - partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; - } +template <> +struct TypeSpecifics<float> { + static constexpr const size_t V_SZ = 32; + typedef float V __attribute__ ((vector_size (V_SZ))); + static constexpr const size_t VectorsPerChunk = 4; + static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + static float sum(V v) { + return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7]; } - float sum(0); - for (size_t i(numChunks*ChunkSize); i < sz; i++) { - sum += af[i] * bf[i]; - } - for (size_t i(1); i < VectorsPerChunk; i++) { - partial[0] += partial[i]; +}; + +template <> +struct TypeSpecifics<double> { + static constexpr const size_t V_SZ = 32; + typedef double V __attribute__ ((vector_size (V_SZ))); + static constexpr const size_t VectorsPerChunk = 4; + static constexpr const V zero = {0.0, 0.0, 0.0, 0.0}; + static float sum(V v) { + return v[0] + v[1] + v[2] + v[3]; } - sum += partial[0][0] + partial[0][1] + partial[0][2] + partial[0][3] + - partial[0][4] + partial[0][5] + partial[0][6] + partial[0][7]; - return sum; +}; + } -template <unsigned AlignA, unsigned AlignB> -double -AvxAccelrator::computeDotProduct(const double * af, const double * bf, size_t sz) +template <typename T, unsigned AlignA, unsigned AlignB> +T +AvxAccelrator::computeDotProduct(const T * af, const T * bf, size_t sz) { - typedef double v4sd __attribute__ ((vector_size (32))); - const size_t ChunkSize(16); - const size_t VectorsPerChunk(ChunkSize/4); - v4sd partial[VectorsPerChunk] = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} }; - typedef double A __attribute__ ((vector_size (32), aligned(AlignA))); - typedef double B __attribute__ ((vector_size (32), aligned(AlignB))); + using TT = TypeSpecifics<T>; + constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T); + constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk; + typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero}; + typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA))); + typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB))); const A * a = reinterpret_cast<const A *>(af); const B * b = reinterpret_cast<const B *>(bf); @@ -67,15 +60,14 @@ AvxAccelrator::computeDotProduct(const double * af, const double * bf, size_t sz partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; } } - double sum(0); + T sum(0); for (size_t i(numChunks*ChunkSize); i < sz; i++) { sum += af[i] * bf[i]; } for (size_t i(1); i < VectorsPerChunk; i++) { partial[0] += partial[i]; } - sum += partial[0][0] + partial[0][1] + partial[0][2] + partial[0][3]; - return sum; + return sum + TT::sum(partial[0]); } template <typename T> @@ -84,15 +76,15 @@ AvxAccelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz) { if (validAlignment32(af)) { if (validAlignment32(bf)) { - return computeDotProduct<32, 32>(af, bf, sz); + return computeDotProduct<T, 32, 32>(af, bf, sz); } else { - return computeDotProduct<32, 1>(af, bf, sz); + return computeDotProduct<T, 32, 1>(af, bf, sz); } } else { if (validAlignment32(bf)) { - return computeDotProduct<1, 32>(af, bf, sz); + return computeDotProduct<T, 1, 32>(af, bf, sz); } else { - return computeDotProduct<1, 1>(af, bf, sz); + return computeDotProduct<T, 1, 1>(af, bf, sz); } } } diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h index bd4816eb3a1..ed833713c2d 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h @@ -22,10 +22,8 @@ public: private: template <typename T> VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz); - template <unsigned AlignA, unsigned AlignB> - VESPA_DLL_LOCAL static double computeDotProduct(const double * af, const double * bf, size_t sz) __attribute__((noinline)); - template <unsigned AlignA, unsigned AlignB> - VESPA_DLL_LOCAL static float computeDotProduct(const float * af, const float * bf, size_t sz) __attribute__((noinline)); + template <typename T, unsigned AlignA, unsigned AlignB> + VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline)); }; } diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp new file mode 100644 index 00000000000..867b39ec326 --- /dev/null +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -0,0 +1,103 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/hwaccelrated/avx2.h> + +namespace vespalib { + +namespace hwaccelrated { + +namespace { + +bool validAlignment32(const void * p) { + return (reinterpret_cast<uint64_t>(p) & 0x1ful) == 0; +} + +template <typename T> +class TypeSpecifics { }; + +template <> +struct TypeSpecifics<float> { + static constexpr const size_t V_SZ = 32; + typedef float V __attribute__ ((vector_size (V_SZ))); + static constexpr const size_t VectorsPerChunk = 4; + static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + static float sum(V v) { + return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7]; + } +}; + +template <> +struct TypeSpecifics<double> { + static constexpr const size_t V_SZ = 32; + typedef double V __attribute__ ((vector_size (V_SZ))); + static constexpr const size_t VectorsPerChunk = 4; + static constexpr const V zero = {0.0, 0.0, 0.0, 0.0}; + static float sum(V v) { + return v[0] + v[1] + v[2] + v[3]; + } +}; + +} + +template <typename T, unsigned AlignA, unsigned AlignB> +T +Avx2Accelrator::computeDotProduct(const T * af, const T * bf, size_t sz) +{ + using TT = TypeSpecifics<T>; + constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T); + constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk; + typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero}; + typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA))); + typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB))); + const A * a = reinterpret_cast<const A *>(af); + const B * b = reinterpret_cast<const B *>(bf); + + const size_t numChunks(sz/ChunkSize); + for (size_t i(0); i < numChunks; i++) { + for (size_t j(0); j < VectorsPerChunk; j++) { + partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; + } + } + T sum(0); + for (size_t i(numChunks*ChunkSize); i < sz; i++) { + sum += af[i] * bf[i]; + } + for (size_t i(1); i < VectorsPerChunk; i++) { + partial[0] += partial[i]; + } + return sum + TT::sum(partial[0]); +} + +template <typename T> +T +Avx2Accelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz) +{ + if (validAlignment32(af)) { + if (validAlignment32(bf)) { + return computeDotProduct<T, 32, 32>(af, bf, sz); + } else { + return computeDotProduct<T, 32, 1>(af, bf, sz); + } + } else { + if (validAlignment32(bf)) { + return computeDotProduct<T, 1, 32>(af, bf, sz); + } else { + return computeDotProduct<T, 1, 1>(af, bf, sz); + } + } +} + +float +Avx2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const +{ + return dotProductSelectAlignment(af, bf, sz); +} + +double +Avx2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const +{ + return dotProductSelectAlignment(af, bf, sz); +} + +} +} diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h new file mode 100644 index 00000000000..e434bfc84e6 --- /dev/null +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -0,0 +1,29 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright (C) 2003 Fast Search & Transfer ASA +// Copyright (C) 2003 Overture Services Norway AS + +#pragma once + +#include <vespa/vespalib/hwaccelrated/avx.h> + +namespace vespalib { + +namespace hwaccelrated { + +/** + * Generic cpu agnostic implementation. + */ +class Avx2Accelrator : public AvxAccelrator +{ +public: + virtual float dotProduct(const float * a, const float * b, size_t sz) const; + virtual double dotProduct(const double * a, const double * b, size_t sz) const; +private: + template <typename T> + VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz); + template <typename T, unsigned AlignA, unsigned AlignB> + VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline)); +}; + +} +} diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index fd4fc706b89..769d40fd0ec 100644 --- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -6,23 +6,13 @@ #include <vespa/vespalib/hwaccelrated/generic.h> #include <vespa/vespalib/hwaccelrated/sse2.h> #include <vespa/vespalib/hwaccelrated/avx.h> +#include <vespa/vespalib/hwaccelrated/avx2.h> #include <assert.h> namespace vespalib { namespace hwaccelrated { -#if 0 -__attribute__ ((target ("default"), noinline)) -vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new vespalib::hwaccelrated::GenericAccelrator()); } - -__attribute__ ((target ("sse2"), noinline)) -vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new vespalib::hwaccelrated::Sse2Accelrator()); } - -__attribute__ ((target ("avx"), noinline)) -vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new AvxAccelrator()); } -#endif - namespace { class Factory { @@ -46,6 +36,11 @@ public: virtual IAccelrated::UP create() const { return IAccelrated::UP(new AvxAccelrator()); } }; +class Avx2Factory :public Factory{ +public: + virtual IAccelrated::UP create() const { return IAccelrated::UP(new Avx2Accelrator()); } +}; + template<typename T> void verifyAccelrator(const IAccelrated & accel) { @@ -100,7 +95,9 @@ Selector::Selector() : _factory(new GenericFactory()) { __builtin_cpu_init (); - if (__builtin_cpu_supports("avx")) { + if (__builtin_cpu_supports("avx2")) { + _factory.reset(new Avx2Factory()); + } else if (__builtin_cpu_supports("avx")) { _factory.reset(new AvxFactory()); } else if (__builtin_cpu_supports("sse2")) { _factory.reset(new Sse2Factory()); |