summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2017-10-23 23:27:26 +0200
committerHenning Baldersheim <balder@yahoo-inc.com>2017-10-24 13:04:50 +0200
commit7e879e9f6494fd9c1d26c737542e153b1fd2eced (patch)
tree1d9aa7a6819de1026168cb421e9afeb05203ddae /vespalib
parent09465310d539137ce7fc7682bfdbbcdcdf3c5b0e (diff)
Use a recursive sum template so that the compiler can generate more optimal code, thanks to fewer dependencies.
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp16
1 files changed, 13 insertions, 3 deletions
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
index ee25652a3a8..2db7ebfd8fd 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
@@ -22,6 +22,17 @@ T sumT(const V & v) {
return sum;
}
+// Sums v[0..C) by recursively splitting the range in two, so both halves can be evaluated independently.
+// T must support operator+; v must point at C (C >= 1) readable elements. Returns the sum of those C elements.
+template <typename T, size_t C>
+T sumR(const T * v) {
+    if (C == 1) {
+        return v[0];
+    }
+    // The upper half takes C - C/2 elements so that an odd C does not silently drop the last element.
+    return sumR<T, C/2>(v) + sumR<T, C - C/2>(v + C/2);
+}
+
template <typename T, size_t VLEN, unsigned AlignA, unsigned AlignB, size_t VectorsPerChunk>
static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
@@ -47,9 +58,8 @@ T computeDotProduct(const T * af, const T * bf, size_t sz)
for (size_t i(numChunks*ChunkSize); i < sz; i++) {
sum += af[i] * bf[i];
}
- for (size_t i(1); i < VectorsPerChunk; i++) {
- partial[0] += partial[i];
- }
+ partial[0] = sumR<V, VectorsPerChunk>(partial);
+
return sum + sumT<T, V>(partial[0]);
}