diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2017-10-23 23:27:26 +0200 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2017-10-24 13:04:50 +0200 |
commit | 7e879e9f6494fd9c1d26c737542e153b1fd2eced (patch) | |
tree | 1d9aa7a6819de1026168cb421e9afeb05203ddae /vespalib | |
parent | 09465310d539137ce7fc7682bfdbbcdcdf3c5b0e (diff) |
Use a recursive sum template to enable the compiler to generate better code due to fewer dependencies.
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp | 16 |
1 files changed, 13 insertions, 3 deletions
```diff
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
index ee25652a3a8..2db7ebfd8fd 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
@@ -22,6 +22,17 @@ T sumT(const V & v) {
     return sum;
 }
 
+template <typename T, size_t C>
+T sumR(const T * v) {
+    if (C == 1) {
+        return v[0];
+    } else if (C == 2) {
+        return v[0] + v[1];
+    } else {
+        return sumR<T, C/2>(v) + sumR<T, C/2>(v+C/2);
+    }
+}
+
 template <typename T, size_t VLEN, unsigned AlignA, unsigned AlignB, size_t VectorsPerChunk>
 static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
 
@@ -47,9 +58,8 @@ T computeDotProduct(const T * af, const T * bf, size_t sz)
     for (size_t i(numChunks*ChunkSize); i < sz; i++) {
         sum += af[i] * bf[i];
     }
-    for (size_t i(1); i < VectorsPerChunk; i++) {
-        partial[0] += partial[i];
-    }
+    partial[0] = sumR<V, VectorsPerChunk>(partial);
+
     return sum + sumT<T, V>(partial[0]);
 }
 
```