diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2020-03-05 10:30:32 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2020-03-05 10:53:10 +0000 |
commit | e18f87ea69a2e7f60ca60cf7b8516125d1807363 (patch) | |
tree | e5c724edd07c9a6c77b0bbb9d0bf8c9588d6a129 /vespalib | |
parent | 916e3950a7ec137bf213134a6a4ca1c365110dde (diff) |
Use OpenBLAS for dot product on everything up to and including AVX2.
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/src/tests/exception_classes/mmap.cpp | 2 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/CMakeLists.txt | 2 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp | 12 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/avx.h | 2 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp | 12 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/avx2.h | 2 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp | 25 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp | 72 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/sse2.h | 3 |
9 files changed, 16 insertions, 116 deletions
diff --git a/vespalib/src/tests/exception_classes/mmap.cpp b/vespalib/src/tests/exception_classes/mmap.cpp index 81b5e0de30e..2a6896bb0e0 100644 --- a/vespalib/src/tests/exception_classes/mmap.cpp +++ b/vespalib/src/tests/exception_classes/mmap.cpp @@ -2,7 +2,7 @@ #include <vespa/vespalib/util/alloc.h> #include <vector> #include <cassert> -#include <string.h> +#include <cstring> #include <cstdlib> #include <sys/resource.h> diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt index 4a753a66394..92149d3f0ea 100644 --- a/vespalib/src/vespa/vespalib/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/CMakeLists.txt @@ -30,6 +30,8 @@ vespa_add_library(vespalib ${VESPA_GCC_LIB} ) +set(BLA_VENDOR OpenBLAS) +vespa_add_target_package_dependency(vespalib BLAS) vespa_add_target_package_dependency(vespalib OpenSSL) vespa_add_target_package_dependency(vespalib RE2) diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp index 39ea0d2d73b..ec6dc164323 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp @@ -5,18 +5,6 @@ namespace vespalib::hwaccelrated { -float -AvxAccelrator::dotProduct(const float * af, const float * bf, size_t sz) const -{ - return avx::dotProductSelectAlignment<float, 32>(af, bf, sz); -} - -double -AvxAccelrator::dotProduct(const double * af, const double * bf, size_t sz) const -{ - return avx::dotProductSelectAlignment<double, 32>(af, bf, sz); -} - size_t AvxAccelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx.h index 624531a9ca5..e7f090b4695 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx.h @@ -12,8 +12,6 @@ namespace vespalib::hwaccelrated { class AvxAccelrator : public 
Sse2Accelrator { public: - float dotProduct(const float * a, const float * b, size_t sz) const override; - double dotProduct(const double * a, const double * b, size_t sz) const override; size_t populationCount(const uint64_t *a, size_t sz) const override; }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index ea8a3ead538..f0d03a995e4 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -5,18 +5,6 @@ namespace vespalib::hwaccelrated { -float -Avx2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const -{ - return avx::dotProductSelectAlignment<float, 32>(af, bf, sz); -} - -double -Avx2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const -{ - return avx::dotProductSelectAlignment<double, 32>(af, bf, sz); -} - size_t Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index cf91bc81cfd..7e1784698f1 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -12,8 +12,6 @@ namespace vespalib::hwaccelrated { class Avx2Accelrator : public AvxAccelrator { public: - float dotProduct(const float * a, const float * b, size_t sz) const override; - double dotProduct(const double * a, const double * b, size_t sz) const override; size_t populationCount(const uint64_t *a, size_t sz) const override; }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index b70ebb4051a..c0a9258cd74 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -2,6 +2,7 @@ #include "generic.h" #include "private_helpers.hpp" +#include <cblas.h> namespace 
vespalib::hwaccelrated { @@ -37,8 +38,8 @@ bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) const size_t sz(bytes/sizeof(uint64_t)); { - uint64_t *a(static_cast<uint64_t *>(aOrg)); - const uint64_t *b(static_cast<const uint64_t *>(bOrg)); + auto a(static_cast<uint64_t *>(aOrg)); + auto b(static_cast<const uint64_t *>(bOrg)); size_t i(0); for (; i + UNROLL <= sz; i += UNROLL) { for (size_t j(0); j < UNROLL; j++) { @@ -50,8 +51,8 @@ bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) } } - uint8_t *a(static_cast<uint8_t *>(aOrg)); - const uint8_t *b(static_cast<const uint8_t *>(bOrg)); + auto a(static_cast<uint8_t *>(aOrg)); + auto *b(static_cast<const uint8_t *>(bOrg)); for (size_t i(sz*sizeof(uint64_t)); i < bytes; i++) { a[i] = operation(a[i], b[i]); } @@ -62,36 +63,36 @@ bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) float GenericAccelrator::dotProduct(const float * a, const float * b, size_t sz) const { - return multiplyAdd<float, float, 4>(a, b, sz); + return cblas_sdot(sz, a, 1, b, 1); } double GenericAccelrator::dotProduct(const double * a, const double * b, size_t sz) const { - return multiplyAdd<double, double, 4>(a, b, sz); + return cblas_ddot(sz, a, 1, b, 1); } int64_t GenericAccelrator::dotProduct(const int8_t * a, const int8_t * b, size_t sz) const { - return multiplyAdd<int64_t, int8_t, 4>(a, b, sz); + return multiplyAdd<int64_t, int8_t, 8>(a, b, sz); } int64_t GenericAccelrator::dotProduct(const int16_t * a, const int16_t * b, size_t sz) const { - return multiplyAdd<int64_t, int16_t, 4>(a, b, sz); + return multiplyAdd<int64_t, int16_t, 8>(a, b, sz); } int64_t GenericAccelrator::dotProduct(const int32_t * a, const int32_t * b, size_t sz) const { - return multiplyAdd<int64_t, int32_t, 4>(a, b, sz); + return multiplyAdd<int64_t, int32_t, 8>(a, b, sz); } long long GenericAccelrator::dotProduct(const int64_t * a, const int64_t * b, size_t sz) const { - return 
multiplyAdd<long long, int64_t, 4>(a, b, sz); + return multiplyAdd<long long, int64_t, 8>(a, b, sz); } void @@ -114,12 +115,12 @@ GenericAccelrator::andNotBit(void * aOrg, const void * bOrg, size_t bytes) const void GenericAccelrator::notBit(void * aOrg, size_t bytes) const { - uint64_t *a(static_cast<uint64_t *>(aOrg)); + auto a(static_cast<uint64_t *>(aOrg)); const size_t sz(bytes/sizeof(uint64_t)); for (size_t i(0); i < sz; i++) { a[i] = ~a[i]; } - uint8_t *ac(static_cast<uint8_t *>(aOrg)); + auto ac(static_cast<uint8_t *>(aOrg)); for (size_t i(sz*sizeof(uint64_t)); i < bytes; i++) { ac[i] = ~ac[i]; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp index a0f584f8a9f..64a26d49f2b 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp @@ -5,78 +5,6 @@ namespace vespalib::hwaccelrated { -namespace { - -bool validAlignment16(const void * p) { - return (reinterpret_cast<uint64_t>(p) & 0xful) == 0; -} - -bool validAlignment16(const void * a, const void * b) { - return validAlignment16(a) && validAlignment16(b); -} - -} - -float -Sse2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const -{ - if ( ! 
validAlignment16(af, bf)) { - return GenericAccelrator::dotProduct(af, bf, sz); - } - typedef float v4sf __attribute__ ((vector_size (16))); - const size_t ChunkSize(16); - const size_t VectorsPerChunk(ChunkSize/4); - v4sf partial[VectorsPerChunk] = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} }; - const v4sf * a = reinterpret_cast<const v4sf *>(af); - const v4sf * b = reinterpret_cast<const v4sf *>(bf); - - const size_t numChunks(sz/ChunkSize); - for (size_t i(0); i < numChunks; i++) { - for (size_t j(0); j < VectorsPerChunk; j++) { - partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; - } - } - float sum(0); - for (size_t i(numChunks*ChunkSize); i < sz; i++) { - sum += af[i] * bf[i]; - } - for (size_t i(1); i < VectorsPerChunk; i++) { - partial[0] += partial[i]; - } - sum += partial[0][0] + partial[0][1] + partial[0][2] + partial[0][3]; - return sum; -} - -double -Sse2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const -{ - if ( ! 
validAlignment16(af, bf)) { - return GenericAccelrator::dotProduct(af, bf, sz); - } - typedef double v2sd __attribute__ ((vector_size (16))); - const size_t ChunkSize(8); - const size_t VectorsPerChunk(ChunkSize/2); - v2sd partial[VectorsPerChunk] = { {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0} }; - const v2sd * a = reinterpret_cast<const v2sd *>(af); - const v2sd * b = reinterpret_cast<const v2sd *>(bf); - - const size_t numChunks(sz/ChunkSize); - for (size_t i(0); i < numChunks; i++) { - for (size_t j(0); j < VectorsPerChunk; j++) { - partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j]; - } - } - double sum(0); - for (size_t i(numChunks*ChunkSize); i < sz; i++) { - sum += af[i] * bf[i]; - } - for (size_t i(1); i < VectorsPerChunk; i++) { - partial[0] += partial[i]; - } - sum += partial[0][0] + partial[0][1]; - return sum; -} - size_t Sse2Accelrator::populationCount(const uint64_t *a, size_t sz) const { return helper::populationCount(a, sz); diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h index d0fbefe5f03..0b2462423b1 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h @@ -12,9 +12,6 @@ namespace vespalib::hwaccelrated { class Sse2Accelrator : public GenericAccelrator { public: - float dotProduct(const float * a, const float * b, size_t sz) const override; - double dotProduct(const double * a, const double * b, size_t sz) const override; - size_t populationCount(const uint64_t *a, size_t sz) const override; }; |