summaryrefslogtreecommitdiffstats
path: root/staging_vespalib
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2016-10-11 15:41:58 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2016-10-11 15:41:58 +0000
commitb12b624cde3cc4c8137ce6fce8599d3243c72e7d (patch)
tree2f2197fa0f0a1d663d37f0357403d94293357992 /staging_vespalib
parent1bdee48d21521dc7915d39981483564bea686a9c (diff)
templatize the dotproduct for avx and add skeleton for avx2
Diffstat (limited to 'staging_vespalib')
-rw-r--r--staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt2
-rw-r--r--staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp82
-rw-r--r--staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h6
-rw-r--r--staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp103
-rw-r--r--staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h29
-rw-r--r--staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp21
6 files changed, 182 insertions, 61 deletions
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
index eb59210d413..2495c106277 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/CMakeLists.txt
@@ -5,6 +5,8 @@ vespa_add_library(staging_vespalib_vespalib_hwaccelrated OBJECT
generic.cpp
sse2.cpp
avx.cpp
+ avx2.cpp
DEPENDS
)
set_source_files_properties(avx.cpp PROPERTIES COMPILE_FLAGS -march=sandybridge)
+set_source_files_properties(avx2.cpp PROPERTIES COMPILE_FLAGS -march=broadwell)
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
index 00708edf5b7..d9e0cc41882 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
@@ -14,50 +14,43 @@ bool validAlignment32(const void * p) {
return (reinterpret_cast<uint64_t>(p) & 0x1ful) == 0;
}
-}
-
-template <unsigned AlignA, unsigned AlignB>
-float
-AvxAccelrator::computeDotProduct(const float * af, const float * bf, size_t sz)
-{
- typedef float v8saf __attribute__ ((vector_size (32)));
- const size_t ChunkSize(32);
- const size_t VectorsPerChunk(ChunkSize/8);
- v8saf partial[VectorsPerChunk] = { {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
- {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0} };
- typedef float A __attribute__ ((vector_size (32), aligned(AlignA)));
- typedef float B __attribute__ ((vector_size (32), aligned(AlignB)));
- const A * a = reinterpret_cast<const A *>(af);
- const B * b = reinterpret_cast<const B *>(bf);
+template <typename T>
+class TypeSpecifics { };
- const size_t numChunks(sz/ChunkSize);
- for (size_t i(0); i < numChunks; i++) {
- for (size_t j(0); j < VectorsPerChunk; j++) {
- partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
- }
+template <>
+struct TypeSpecifics<float> {
+ static constexpr const size_t V_SZ = 32;
+ typedef float V __attribute__ ((vector_size (V_SZ)));
+ static constexpr const size_t VectorsPerChunk = 4;
+ static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+ static float sum(V v) {
+ return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7];
}
- float sum(0);
- for (size_t i(numChunks*ChunkSize); i < sz; i++) {
- sum += af[i] * bf[i];
- }
- for (size_t i(1); i < VectorsPerChunk; i++) {
- partial[0] += partial[i];
+};
+
+template <>
+struct TypeSpecifics<double> {
+ static constexpr const size_t V_SZ = 32;
+ typedef double V __attribute__ ((vector_size (V_SZ)));
+ static constexpr const size_t VectorsPerChunk = 4;
+ static constexpr const V zero = {0.0, 0.0, 0.0, 0.0};
+    static double sum(V v) {
+ return v[0] + v[1] + v[2] + v[3];
}
- sum += partial[0][0] + partial[0][1] + partial[0][2] + partial[0][3] +
- partial[0][4] + partial[0][5] + partial[0][6] + partial[0][7];
- return sum;
+};
+
}
-template <unsigned AlignA, unsigned AlignB>
-double
-AvxAccelrator::computeDotProduct(const double * af, const double * bf, size_t sz)
+template <typename T, unsigned AlignA, unsigned AlignB>
+T
+AvxAccelrator::computeDotProduct(const T * af, const T * bf, size_t sz)
{
- typedef double v4sd __attribute__ ((vector_size (32)));
- const size_t ChunkSize(16);
- const size_t VectorsPerChunk(ChunkSize/4);
- v4sd partial[VectorsPerChunk] = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
- typedef double A __attribute__ ((vector_size (32), aligned(AlignA)));
- typedef double B __attribute__ ((vector_size (32), aligned(AlignB)));
+ using TT = TypeSpecifics<T>;
+ constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T);
+ constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk;
+ typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero};
+ typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA)));
+ typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB)));
const A * a = reinterpret_cast<const A *>(af);
const B * b = reinterpret_cast<const B *>(bf);
@@ -67,15 +60,14 @@ AvxAccelrator::computeDotProduct(const double * af, const double * bf, size_t sz
partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
}
}
- double sum(0);
+ T sum(0);
for (size_t i(numChunks*ChunkSize); i < sz; i++) {
sum += af[i] * bf[i];
}
for (size_t i(1); i < VectorsPerChunk; i++) {
partial[0] += partial[i];
}
- sum += partial[0][0] + partial[0][1] + partial[0][2] + partial[0][3];
- return sum;
+ return sum + TT::sum(partial[0]);
}
template <typename T>
@@ -84,15 +76,15 @@ AvxAccelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
{
if (validAlignment32(af)) {
if (validAlignment32(bf)) {
- return computeDotProduct<32, 32>(af, bf, sz);
+ return computeDotProduct<T, 32, 32>(af, bf, sz);
} else {
- return computeDotProduct<32, 1>(af, bf, sz);
+ return computeDotProduct<T, 32, 1>(af, bf, sz);
}
} else {
if (validAlignment32(bf)) {
- return computeDotProduct<1, 32>(af, bf, sz);
+ return computeDotProduct<T, 1, 32>(af, bf, sz);
} else {
- return computeDotProduct<1, 1>(af, bf, sz);
+ return computeDotProduct<T, 1, 1>(af, bf, sz);
}
}
}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
index bd4816eb3a1..ed833713c2d 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx.h
@@ -22,10 +22,8 @@ public:
private:
template <typename T>
VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz);
- template <unsigned AlignA, unsigned AlignB>
- VESPA_DLL_LOCAL static double computeDotProduct(const double * af, const double * bf, size_t sz) __attribute__((noinline));
- template <unsigned AlignA, unsigned AlignB>
- VESPA_DLL_LOCAL static float computeDotProduct(const float * af, const float * bf, size_t sz) __attribute__((noinline));
+ template <typename T, unsigned AlignA, unsigned AlignB>
+ VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
};
}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
new file mode 100644
index 00000000000..867b39ec326
--- /dev/null
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -0,0 +1,103 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/hwaccelrated/avx2.h>
+
+namespace vespalib {
+
+namespace hwaccelrated {
+
+namespace {
+
+bool validAlignment32(const void * p) {
+ return (reinterpret_cast<uint64_t>(p) & 0x1ful) == 0;
+}
+
+template <typename T>
+class TypeSpecifics { };
+
+template <>
+struct TypeSpecifics<float> {
+ static constexpr const size_t V_SZ = 32;
+ typedef float V __attribute__ ((vector_size (V_SZ)));
+ static constexpr const size_t VectorsPerChunk = 4;
+ static constexpr const V zero = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+ static float sum(V v) {
+ return v[0] + v[1] + v[2] + v[3] + v[4] + v[5] + v[6] + v[7];
+ }
+};
+
+template <>
+struct TypeSpecifics<double> {
+ static constexpr const size_t V_SZ = 32;
+ typedef double V __attribute__ ((vector_size (V_SZ)));
+ static constexpr const size_t VectorsPerChunk = 4;
+ static constexpr const V zero = {0.0, 0.0, 0.0, 0.0};
+    static double sum(V v) {
+ return v[0] + v[1] + v[2] + v[3];
+ }
+};
+
+}
+
+template <typename T, unsigned AlignA, unsigned AlignB>
+T
+Avx2Accelrator::computeDotProduct(const T * af, const T * bf, size_t sz)
+{
+ using TT = TypeSpecifics<T>;
+ constexpr const size_t ChunkSize = TT::V_SZ*4/sizeof(T);
+ constexpr const size_t VectorsPerChunk = TT::VectorsPerChunk;
+ typename TT::V partial[VectorsPerChunk] = { TT::zero, TT::zero, TT::zero, TT::zero};
+ typedef T A __attribute__ ((vector_size (TT::V_SZ), aligned(AlignA)));
+ typedef T B __attribute__ ((vector_size (TT::V_SZ), aligned(AlignB)));
+ const A * a = reinterpret_cast<const A *>(af);
+ const B * b = reinterpret_cast<const B *>(bf);
+
+ const size_t numChunks(sz/ChunkSize);
+ for (size_t i(0); i < numChunks; i++) {
+ for (size_t j(0); j < VectorsPerChunk; j++) {
+ partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
+ }
+ }
+ T sum(0);
+ for (size_t i(numChunks*ChunkSize); i < sz; i++) {
+ sum += af[i] * bf[i];
+ }
+ for (size_t i(1); i < VectorsPerChunk; i++) {
+ partial[0] += partial[i];
+ }
+ return sum + TT::sum(partial[0]);
+}
+
+template <typename T>
+T
+Avx2Accelrator::dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
+{
+ if (validAlignment32(af)) {
+ if (validAlignment32(bf)) {
+ return computeDotProduct<T, 32, 32>(af, bf, sz);
+ } else {
+ return computeDotProduct<T, 32, 1>(af, bf, sz);
+ }
+ } else {
+ if (validAlignment32(bf)) {
+ return computeDotProduct<T, 1, 32>(af, bf, sz);
+ } else {
+ return computeDotProduct<T, 1, 1>(af, bf, sz);
+ }
+ }
+}
+
+float
+Avx2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const
+{
+ return dotProductSelectAlignment(af, bf, sz);
+}
+
+double
+Avx2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const
+{
+ return dotProductSelectAlignment(af, bf, sz);
+}
+
+}
+}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
new file mode 100644
index 00000000000..e434bfc84e6
--- /dev/null
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// Copyright (C) 2003 Fast Search & Transfer ASA
+// Copyright (C) 2003 Overture Services Norway AS
+
+#pragma once
+
+#include <vespa/vespalib/hwaccelrated/avx.h>
+
+namespace vespalib {
+
+namespace hwaccelrated {
+
+/**
+ * Avx2 implementation of the dot-product acceleration (skeleton; currently reuses the AVX strategy, built with -march=broadwell).
+ */
+class Avx2Accelrator : public AvxAccelrator
+{
+public:
+ virtual float dotProduct(const float * a, const float * b, size_t sz) const;
+ virtual double dotProduct(const double * a, const double * b, size_t sz) const;
+private:
+ template <typename T>
+ VESPA_DLL_LOCAL static T dotProductSelectAlignment(const T * af, const T * bf, size_t sz);
+ template <typename T, unsigned AlignA, unsigned AlignB>
+ VESPA_DLL_LOCAL static T computeDotProduct(const T * af, const T * bf, size_t sz) __attribute__((noinline));
+};
+
+}
+}
diff --git a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
index fd4fc706b89..769d40fd0ec 100644
--- a/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
+++ b/staging_vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
@@ -6,23 +6,13 @@
#include <vespa/vespalib/hwaccelrated/generic.h>
#include <vespa/vespalib/hwaccelrated/sse2.h>
#include <vespa/vespalib/hwaccelrated/avx.h>
+#include <vespa/vespalib/hwaccelrated/avx2.h>
#include <assert.h>
namespace vespalib {
namespace hwaccelrated {
-#if 0
-__attribute__ ((target ("default"), noinline))
-vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new vespalib::hwaccelrated::GenericAccelrator()); }
-
-__attribute__ ((target ("sse2"), noinline))
-vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new vespalib::hwaccelrated::Sse2Accelrator()); }
-
-__attribute__ ((target ("avx"), noinline))
-vespalib::hwaccelrated::IAccelrated::UP selectAccelrator() { return vespalib::hwaccelrated::IAccelrated::UP(new AvxAccelrator()); }
-#endif
-
namespace {
class Factory {
@@ -46,6 +36,11 @@ public:
virtual IAccelrated::UP create() const { return IAccelrated::UP(new AvxAccelrator()); }
};
+class Avx2Factory :public Factory{
+public:
+ virtual IAccelrated::UP create() const { return IAccelrated::UP(new Avx2Accelrator()); }
+};
+
template<typename T>
void verifyAccelrator(const IAccelrated & accel)
{
@@ -100,7 +95,9 @@ Selector::Selector() :
_factory(new GenericFactory())
{
__builtin_cpu_init ();
- if (__builtin_cpu_supports("avx")) {
+ if (__builtin_cpu_supports("avx2")) {
+ _factory.reset(new Avx2Factory());
+ } else if (__builtin_cpu_supports("avx")) {
_factory.reset(new AvxFactory());
} else if (__builtin_cpu_supports("sse2")) {
_factory.reset(new Sse2Factory());