about | summary | refs | log | tree | commit | diff | stats
path: root/vespalib
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2020-03-05 10:30:32 +0000
committer Henning Baldersheim <balder@yahoo-inc.com> 2020-03-05 10:53:10 +0000
commit e18f87ea69a2e7f60ca60cf7b8516125d1807363 (patch)
tree e5c724edd07c9a6c77b0bbb9d0bf8c9588d6a129 /vespalib
parent 916e3950a7ec137bf213134a6a4ca1c365110dde (diff)
Use openblas for dotproduct on everything up to and including avx2.
Diffstat (limited to 'vespalib')
-rw-r--r-- vespalib/src/tests/exception_classes/mmap.cpp 2
-rw-r--r-- vespalib/src/vespa/vespalib/CMakeLists.txt 2
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp 12
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx.h 2
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp 12
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx2.h 2
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp 25
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp 72
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/sse2.h 3
9 files changed, 16 insertions, 116 deletions
diff --git a/vespalib/src/tests/exception_classes/mmap.cpp b/vespalib/src/tests/exception_classes/mmap.cpp
index 81b5e0de30e..2a6896bb0e0 100644
--- a/vespalib/src/tests/exception_classes/mmap.cpp
+++ b/vespalib/src/tests/exception_classes/mmap.cpp
@@ -2,7 +2,7 @@
#include <vespa/vespalib/util/alloc.h>
#include <vector>
#include <cassert>
-#include <string.h>
+#include <cstring>
#include <cstdlib>
#include <sys/resource.h>
diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt
index 4a753a66394..92149d3f0ea 100644
--- a/vespalib/src/vespa/vespalib/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/CMakeLists.txt
@@ -30,6 +30,8 @@ vespa_add_library(vespalib
${VESPA_GCC_LIB}
)
+set(BLA_VENDOR OpenBLAS)
+vespa_add_target_package_dependency(vespalib BLAS)
vespa_add_target_package_dependency(vespalib OpenSSL)
vespa_add_target_package_dependency(vespalib RE2)
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
index 39ea0d2d73b..ec6dc164323 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp
@@ -5,18 +5,6 @@
namespace vespalib::hwaccelrated {
-float
-AvxAccelrator::dotProduct(const float * af, const float * bf, size_t sz) const
-{
- return avx::dotProductSelectAlignment<float, 32>(af, bf, sz);
-}
-
-double
-AvxAccelrator::dotProduct(const double * af, const double * bf, size_t sz) const
-{
- return avx::dotProductSelectAlignment<double, 32>(af, bf, sz);
-}
-
size_t
AvxAccelrator::populationCount(const uint64_t *a, size_t sz) const {
return helper::populationCount(a, sz);
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx.h
index 624531a9ca5..e7f090b4695 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx.h
@@ -12,8 +12,6 @@ namespace vespalib::hwaccelrated {
class AvxAccelrator : public Sse2Accelrator
{
public:
- float dotProduct(const float * a, const float * b, size_t sz) const override;
- double dotProduct(const double * a, const double * b, size_t sz) const override;
size_t populationCount(const uint64_t *a, size_t sz) const override;
};
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
index ea8a3ead538..f0d03a995e4 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -5,18 +5,6 @@
namespace vespalib::hwaccelrated {
-float
-Avx2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const
-{
- return avx::dotProductSelectAlignment<float, 32>(af, bf, sz);
-}
-
-double
-Avx2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const
-{
- return avx::dotProductSelectAlignment<double, 32>(af, bf, sz);
-}
-
size_t
Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const {
return helper::populationCount(a, sz);
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
index cf91bc81cfd..7e1784698f1 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
@@ -12,8 +12,6 @@ namespace vespalib::hwaccelrated {
class Avx2Accelrator : public AvxAccelrator
{
public:
- float dotProduct(const float * a, const float * b, size_t sz) const override;
- double dotProduct(const double * a, const double * b, size_t sz) const override;
size_t populationCount(const uint64_t *a, size_t sz) const override;
};
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index b70ebb4051a..c0a9258cd74 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -2,6 +2,7 @@
#include "generic.h"
#include "private_helpers.hpp"
+#include <cblas.h>
namespace vespalib::hwaccelrated {
@@ -37,8 +38,8 @@ bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes)
const size_t sz(bytes/sizeof(uint64_t));
{
- uint64_t *a(static_cast<uint64_t *>(aOrg));
- const uint64_t *b(static_cast<const uint64_t *>(bOrg));
+ auto a(static_cast<uint64_t *>(aOrg));
+ auto b(static_cast<const uint64_t *>(bOrg));
size_t i(0);
for (; i + UNROLL <= sz; i += UNROLL) {
for (size_t j(0); j < UNROLL; j++) {
@@ -50,8 +51,8 @@ bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes)
}
}
- uint8_t *a(static_cast<uint8_t *>(aOrg));
- const uint8_t *b(static_cast<const uint8_t *>(bOrg));
+ auto a(static_cast<uint8_t *>(aOrg));
+ auto *b(static_cast<const uint8_t *>(bOrg));
for (size_t i(sz*sizeof(uint64_t)); i < bytes; i++) {
a[i] = operation(a[i], b[i]);
}
@@ -62,36 +63,36 @@ bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes)
float
GenericAccelrator::dotProduct(const float * a, const float * b, size_t sz) const
{
- return multiplyAdd<float, float, 4>(a, b, sz);
+ return cblas_sdot(sz, a, 1, b, 1);
}
double
GenericAccelrator::dotProduct(const double * a, const double * b, size_t sz) const
{
- return multiplyAdd<double, double, 4>(a, b, sz);
+ return cblas_ddot(sz, a, 1, b, 1);
}
int64_t
GenericAccelrator::dotProduct(const int8_t * a, const int8_t * b, size_t sz) const
{
- return multiplyAdd<int64_t, int8_t, 4>(a, b, sz);
+ return multiplyAdd<int64_t, int8_t, 8>(a, b, sz);
}
int64_t
GenericAccelrator::dotProduct(const int16_t * a, const int16_t * b, size_t sz) const
{
- return multiplyAdd<int64_t, int16_t, 4>(a, b, sz);
+ return multiplyAdd<int64_t, int16_t, 8>(a, b, sz);
}
int64_t
GenericAccelrator::dotProduct(const int32_t * a, const int32_t * b, size_t sz) const
{
- return multiplyAdd<int64_t, int32_t, 4>(a, b, sz);
+ return multiplyAdd<int64_t, int32_t, 8>(a, b, sz);
}
long long
GenericAccelrator::dotProduct(const int64_t * a, const int64_t * b, size_t sz) const
{
- return multiplyAdd<long long, int64_t, 4>(a, b, sz);
+ return multiplyAdd<long long, int64_t, 8>(a, b, sz);
}
void
@@ -114,12 +115,12 @@ GenericAccelrator::andNotBit(void * aOrg, const void * bOrg, size_t bytes) const
void
GenericAccelrator::notBit(void * aOrg, size_t bytes) const
{
- uint64_t *a(static_cast<uint64_t *>(aOrg));
+ auto a(static_cast<uint64_t *>(aOrg));
const size_t sz(bytes/sizeof(uint64_t));
for (size_t i(0); i < sz; i++) {
a[i] = ~a[i];
}
- uint8_t *ac(static_cast<uint8_t *>(aOrg));
+ auto ac(static_cast<uint8_t *>(aOrg));
for (size_t i(sz*sizeof(uint64_t)); i < bytes; i++) {
ac[i] = ~ac[i];
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp
index a0f584f8a9f..64a26d49f2b 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp
@@ -5,78 +5,6 @@
namespace vespalib::hwaccelrated {
-namespace {
-
-bool validAlignment16(const void * p) {
- return (reinterpret_cast<uint64_t>(p) & 0xful) == 0;
-}
-
-bool validAlignment16(const void * a, const void * b) {
- return validAlignment16(a) && validAlignment16(b);
-}
-
-}
-
-float
-Sse2Accelrator::dotProduct(const float * af, const float * bf, size_t sz) const
-{
- if ( ! validAlignment16(af, bf)) {
- return GenericAccelrator::dotProduct(af, bf, sz);
- }
- typedef float v4sf __attribute__ ((vector_size (16)));
- const size_t ChunkSize(16);
- const size_t VectorsPerChunk(ChunkSize/4);
- v4sf partial[VectorsPerChunk] = { {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0} };
- const v4sf * a = reinterpret_cast<const v4sf *>(af);
- const v4sf * b = reinterpret_cast<const v4sf *>(bf);
-
- const size_t numChunks(sz/ChunkSize);
- for (size_t i(0); i < numChunks; i++) {
- for (size_t j(0); j < VectorsPerChunk; j++) {
- partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
- }
- }
- float sum(0);
- for (size_t i(numChunks*ChunkSize); i < sz; i++) {
- sum += af[i] * bf[i];
- }
- for (size_t i(1); i < VectorsPerChunk; i++) {
- partial[0] += partial[i];
- }
- sum += partial[0][0] + partial[0][1] + partial[0][2] + partial[0][3];
- return sum;
-}
-
-double
-Sse2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) const
-{
- if ( ! validAlignment16(af, bf)) {
- return GenericAccelrator::dotProduct(af, bf, sz);
- }
- typedef double v2sd __attribute__ ((vector_size (16)));
- const size_t ChunkSize(8);
- const size_t VectorsPerChunk(ChunkSize/2);
- v2sd partial[VectorsPerChunk] = { {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0} };
- const v2sd * a = reinterpret_cast<const v2sd *>(af);
- const v2sd * b = reinterpret_cast<const v2sd *>(bf);
-
- const size_t numChunks(sz/ChunkSize);
- for (size_t i(0); i < numChunks; i++) {
- for (size_t j(0); j < VectorsPerChunk; j++) {
- partial[j] += a[VectorsPerChunk*i+j] * b[VectorsPerChunk*i+j];
- }
- }
- double sum(0);
- for (size_t i(numChunks*ChunkSize); i < sz; i++) {
- sum += af[i] * bf[i];
- }
- for (size_t i(1); i < VectorsPerChunk; i++) {
- partial[0] += partial[i];
- }
- sum += partial[0][0] + partial[0][1];
- return sum;
-}
-
size_t
Sse2Accelrator::populationCount(const uint64_t *a, size_t sz) const {
return helper::populationCount(a, sz);
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h
index d0fbefe5f03..0b2462423b1 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h
@@ -12,9 +12,6 @@ namespace vespalib::hwaccelrated {
class Sse2Accelrator : public GenericAccelrator
{
public:
- float dotProduct(const float * a, const float * b, size_t sz) const override;
- double dotProduct(const double * a, const double * b, size_t sz) const override;
-
size_t populationCount(const uint64_t *a, size_t sz) const override;
};