templatize the dotproduct for avx and add skeleton for avx2

author: Henning Baldersheim <balder@yahoo-inc.com> 2016-10-11 15:41:58 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2016-10-11 15:41:58 +0000
commit: b12b624cde3cc4c8137ce6fce8599d3243c72e7d (patch)
tree: 2f2197fa0f0a1d663d37f0357403d94293357992 /staging_vespalib
parent: 1bdee48d21521dc7915d39981483564bea686a9c (diff)
6 files changed, 182 insertions, 61 deletions
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
index eb59210d413..2495c106277 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
@@ -5,6 +5,8 @@ vespa_add_library(staging_vespalib_vespalib_hwaccelrated OBJECT
     generic.cpp
     sse2.cpp
     avx.cpp
+    avx2.cpp
     DEPENDS
 )
 set_source_files_properties(avx.cpp PROPERTIES COMPILE_FLAGS -march=sandybridge)
+set_source_files_properties(avx2.cpp PROPERTIES COMPILE_FLAGS -march=broadwell)
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
index 00708edf5b7..d9e0cc41882 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
@@ -14,50 +14,43 @@ bool validAlignment32(const void * p) {
     return (reinterpret_cast<uint64_t>(p) & 0x1ful) == 0;
 }
 
-}
-
-template <unsigned AlignA, unsigned AlignB>
-float
-AvxAccelrator::computeDotProduct(const float * af, const float * bf, size_t sz)
-{
-    typedef float v8saf __attribute__ ((vector_size (32)));
-    const size_t ChunkSize(32);
-    const size_t VectorsPerChunk(ChunkSize/8);
-    v8saf partial[VectorsPerChunk] = { {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
-                                      {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0} };
-    typedef float A __attribute__ ((vector_size (32), aligned(AlignA)));
-    typedef float B __attribute__ ((vector_size (32), aligned(AlignB)));
-    const A * a = reinterpret_cast<const A *>(af);
-    const B * b = reinterpret_cast<const B *>(bf);
+template <typename T>
+class TypeSpecifics { };
 
-    const size_t numChunks(sz/ChunkSize);
-    for (size_t i(0); i < numChunks; i++) {
-        for (size_t j(0); j < VectorsPerChunk; j++) {
-            partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
-        }
+template <>
+struct TypeSpecifics<float> {
+    static constexpr const size_t V_SZ = 32;
+    typedef float V __attribute__ ((vector_size (V_SZ)));
+    static constexpr const size_t VectorsPerChunk = 4;
+    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    static float sum(V v) {
+        return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7];
     }
-    float sum(0);
-    for (size_t i(numChunks*ChunkSize); i < sz; i++) {
-        sum += af[i] * bf[i];
-    }
-    for (size_t i(1); i < VectorsPerChunk; i++) {
-        partial[0] += partial[i];
+};
+
+template <>
+struct TypeSpecifics<double> {
+    static constexpr const size_t V_SZ = 32;                 // vector width in bytes
+    typedef double V __attribute__ ((vector_size (V_SZ)));   // 4 doubles per vector
+    static constexpr const size_t VectorsPerChunk = 4;
+    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0};
+    static double sum(V v) { // returns double: a float return would truncate the accumulated dot product
+        return v[0] + v[1] + v[2] + v[3];
     }
-    sum += partial[0][0] + partial[0][1] + partial[0][2] + partial[0][3] +
-           partial[0][4] + partial[0][5] + partial[0][6] + partial[0][7];
-    return sum; 
+};
+
 }
 
-template <unsigned AlignA, unsigned AlignB>
-double
-AvxAccelrator::computeDotProduct(const double * af, const double * bf, size_t sz)
+template <typename T, unsigned AlignA, unsigned AlignB>
+T
+AvxAccelrator::computeDotProduct(const T * af, const T * bf, size_t sz)
 {
-    typedef double v4sd __attribute__ ((vector_size (32)));
-    const size_t ChunkSize(16);
-    const size_t VectorsPerChunk(ChunkSize/4);
-    v4sd partial[VectorsPerChunk] = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
-    typedef double A __attribute__ ((vector_size (32), aligned(AlignA)));
-    typedef double B __attribute__ ((vector_size (32), aligned(AlignB)));
+    using TT = TypeSpecifics<T>;
+    constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T);
+    constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk;
+    typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero};
+    typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA)));
+    typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB)));
     const A * a = reinterpret_cast<const A *>(af);
     const B * b = reinterpret_cast<const B *>(bf);
 
@@ -67,15 +60,14 @@ AvxAccelrator::computeDotProduct(const double * af, const double * bf, size_t sz
             partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
         }
     }
-    double sum(0);
+    T sum(0);
     for (size_t i(numChunks*ChunkSize); i < sz; i++) {
         sum += af[i] * bf[i];
     }
     for (size_t i(1); i < VectorsPerChunk; i++) {
         partial[0] += partial[i];
     }
-    sum += partial[0][0] + partial[0][1] + partial[0][2] + partial[0][3];
-    return sum; 
+    return sum + TT::sum(partial[0]);
 }
 
 template <typename T>
@@ -84,15 +76,15 @@ AvxAccelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
 {
     if (validAlignment32(af)) {
         if (validAlignment32(bf)) {
-            return computeDotProduct<32, 32>(af, bf, sz);
+            return computeDotProduct<T, 32, 32>(af, bf, sz);
         } else {
-            return computeDotProduct<32, 1>(af, bf, sz);
+            return computeDotProduct<T, 32, 1>(af, bf, sz);
         }
     } else {
         if (validAlignment32(bf)) {
-            return computeDotProduct<1, 32>(af, bf, sz);
+            return computeDotProduct<T, 1, 32>(af, bf, sz);
         } else {
-            return computeDotProduct<1, 1>(af, bf, sz);
+            return computeDotProduct<T, 1, 1>(af, bf, sz);
         }
     }
 }
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
index bd4816eb3a1..ed833713c2d 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
@@ -22,10 +22,8 @@ public:
 private:
     template <typename T>
     VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz);
-    template <unsigned AlignA, unsigned AlignB>
-    VESPA_DLL_LOCAL static double computeDotProduct(const double * af, const double * bf, size_t sz) __attribute__((noinline));
-    template <unsigned AlignA, unsigned AlignB>
-    VESPA_DLL_LOCAL static float computeDotProduct(const float * af, const float * bf, size_t sz) __attribute__((noinline));
+    template <typename T, unsigned AlignA, unsigned AlignB>
+    VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
 };
 
 }
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
new file mode 100644
index 00000000000..867b39ec326
--- /dev/null
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -0,0 +1,103 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/hwaccelrated/avx2.h>
+
+namespace vespalib {
+
+namespace hwaccelrated {
+
+namespace {
+
+bool validAlignment32(const void * p) {
+    return (reinterpret_cast<uint64_t>(p) & 0x1ful) == 0;
+}
+
+template <typename T>
+class TypeSpecifics { };
+
+template <>
+struct TypeSpecifics<float> {
+    static constexpr const size_t V_SZ = 32;
+    typedef float V __attribute__ ((vector_size (V_SZ)));
+    static constexpr const size_t VectorsPerChunk = 4;
+    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    static float sum(V v) {
+        return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7];
+    }
+};
+
+template <>
+struct TypeSpecifics<double> {
+    static constexpr const size_t V_SZ = 32;                 // vector width in bytes
+    typedef double V __attribute__ ((vector_size (V_SZ)));   // 4 doubles per vector
+    static constexpr const size_t VectorsPerChunk = 4;
+    static constexpr const V zero = {0.0, 0.0, 0.0, 0.0};
+    static double sum(V v) { // returns double: a float return would truncate the accumulated dot product
+        return v[0] + v[1] + v[2] + v[3];
+    }
+};
+
+}
+
+template <typename T, unsigned AlignA, unsigned AlignB>
+T
+Avx2Accelrator::computeDotProduct(const T * af, const T * bf, size_t sz)
+{
+    using TT = TypeSpecifics<T>;
+    constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk;
+    constexpr const size_t ChunkSize = TT::V_SZ*VectorsPerChunk/sizeof(T); // elements consumed per outer iteration
+    typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero};
+    typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA)));
+    typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB)));
+    const A * a = reinterpret_cast<const A *>(af);
+    const B * b = reinterpret_cast<const B *>(bf);
+    // Whole chunks are accumulated into VectorsPerChunk independent partial vector sums.
+    const size_t numChunks(sz/ChunkSize);
+    for (size_t i(0); i < numChunks; i++) {
+        for (size_t j(0); j < VectorsPerChunk; j++) {
+            partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
+        }
+    }
+    T sum(0); // scalar tail: the elements left over after the last whole chunk
+    for (size_t i(numChunks*ChunkSize); i < sz; i++) {
+        sum += af[i] * bf[i];
+    }
+    for (size_t i(1); i < VectorsPerChunk; i++) { // fold all partial vectors into partial[0]
+        partial[0] += partial[i];
+    }
+    return sum + TT::sum(partial[0]);
+}
+
+template <typename T>
+T
+Avx2Accelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
+{
+    if (validAlignment32(af)) {
+        if (validAlignment32(bf)) {
+            return computeDotProduct<T, 32, 32>(af, bf, sz);
+        } else {
+            return computeDotProduct<T, 32, 1>(af, bf, sz);
+        }
+    } else {
+        if (validAlignment32(bf)) {
+            return computeDotProduct<T, 1, 32>(af, bf, sz);
+        } else {
+            return computeDotProduct<T, 1, 1>(af, bf, sz);
+        }
+    }
+}
+
+float
+Avx2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const
+{
+    return dotProductSelectAlignment(af, bf, sz);
+}
+
+double
+Avx2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const
+{
+    return dotProductSelectAlignment(af, bf, sz);
+}
+
+}
+}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
new file mode 100644
index 00000000000..e434bfc84e6
--- /dev/null
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright (C) 2003 Fast Search & Transfer ASA
+// Copyright (C) 2003 Overture Services Norway AS
+
+#pragma once
+
+#include <vespa/vespalib/hwaccelrated/avx.h>
+
+namespace vespalib {
+
+namespace hwaccelrated {
+
+/**
+ * Avx-2 implementation.
+ */
+class Avx2Accelrator : public AvxAccelrator
+{
+public:
+    virtual float dotProduct(const float * a, const float * b, size_t sz) const;
+    virtual double dotProduct(const double * a, const double * b, size_t sz) const;
+private:
+    template <typename T>
+    VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz);
+    template <typename T, unsigned AlignA, unsigned AlignB>
+    VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
+};
+
+}
+}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
index fd4fc706b89..769d40fd0ec 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
@@ -6,23 +6,13 @@
 #include <vespa/vespalib/hwaccelrated/generic.h>
 #include <vespa/vespalib/hwaccelrated/sse2.h>
 #include <vespa/vespalib/hwaccelrated/avx.h>
+#include <vespa/vespalib/hwaccelrated/avx2.h>
 #include <assert.h>
 
 namespace vespalib {
 
 namespace hwaccelrated {
 
-#if 0
-__attribute__ ((target ("default"), noinline)) 
-vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new vespalib::hwaccelrated::GenericAccelrator()); }
-
-__attribute__ ((target ("sse2"), noinline))
-vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new vespalib::hwaccelrated::Sse2Accelrator()); }
-
-__attribute__ ((target ("avx"), noinline))
-vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new AvxAccelrator()); }
-#endif
-
 namespace {
 
 class Factory {
@@ -46,6 +36,11 @@ public:
     virtual IAccelrated::UP create() const { return IAccelrated::UP(new AvxAccelrator()); }
 };
 
+class Avx2Factory :public Factory{
+public:
+    virtual IAccelrated::UP create() const { return IAccelrated::UP(new Avx2Accelrator()); }
+};
+
 template<typename T>
 void verifyAccelrator(const IAccelrated & accel)
 {
@@ -100,7 +95,9 @@ Selector::Selector() :
     _factory(new GenericFactory())
 {
     __builtin_cpu_init ();
-    if (__builtin_cpu_supports("avx")) {
+    if (__builtin_cpu_supports("avx2")) {
+        _factory.reset(new Avx2Factory());
+    } else if (__builtin_cpu_supports("avx")) {
         _factory.reset(new AvxFactory());
     } else if (__builtin_cpu_supports("sse2")) {
         _factory.reset(new Sse2Factory());
author	Henning Baldersheim <balder@yahoo-inc.com>	2016-10-11 15:41:58 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2016-10-11 15:41:58 +0000
commit	b12b624cde3cc4c8137ce6fce8599d3243c72e7d (patch)
tree	2f2197fa0f0a1d663d37f0357403d94293357992 /staging_vespalib
parent	1bdee48d21521dc7915d39981483564bea686a9c (diff)