summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2020-03-10 13:54:03 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2020-03-10 13:54:03 +0000
commit10856a1ba094837bf3a098f377a0d77d9e753ce8 (patch)
tree2b3ea28aaecf4ad7fb59b108a5d576b1366c8fd2 /vespalib
parente0bd73b016439f087edb7ce165524fed3c9ab1ee (diff)
Simply follow pattern from dotproduct.
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp4
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp4
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp49
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp28
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp25
5 files changed, 79 insertions, 31 deletions
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
index cd80261fc76..2c784f5dda2 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -12,12 +12,12 @@ Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const {
double
Avx2Accelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const {
- return helper::euclidianDistanceT<float, 32>(a, b, sz);
+ return avx::euclidianDistanceSelectAlignment<float, 32>(a, b, sz);
}
double
Avx2Accelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const {
- return helper::euclidianDistanceT<double, 8>(a, b, sz);
+ return avx::euclidianDistanceSelectAlignment<double, 32>(a, b, sz);
}
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
index 747119d7a8e..45126c366ad 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
@@ -24,12 +24,12 @@ Avx512Accelrator::populationCount(const uint64_t *a, size_t sz) const {
double
Avx512Accelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const {
- return helper::euclidianDistanceT<float, 64>(a, b, sz);
+ return avx::euclidianDistanceSelectAlignment<float, 64>(a, b, sz);
}
double
Avx512Accelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const {
- return helper::euclidianDistanceT<double, 32>(a, b, sz);
+ return avx::euclidianDistanceSelectAlignment<double, 64>(a, b, sz);
}
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
index 9e6a6d8817f..406757d239c 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
@@ -4,7 +4,6 @@
#include "private_helpers.hpp"
#include <vespa/fastos/dynamiclibrary.h>
-#include <cstring>
namespace vespalib::hwaccelrated::avx {
@@ -87,4 +86,52 @@ T dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
}
}
+template <typename T, unsigned VLEN, unsigned AlignA, unsigned AlignB> // Sum of (af[i]-bf[i])^2 over sz elements, computed with VLEN-byte vectors.
+double
+euclidianDistanceT(const T * af, const T * bf, size_t sz)
+{
+ constexpr unsigned VectorsPerChunk = 4; // number of independent vector accumulators
+ constexpr unsigned ChunkSize = VLEN*VectorsPerChunk/sizeof(T); // elements of T consumed per outer-loop iteration
+ typedef T V __attribute__ ((vector_size (VLEN)));
+ typedef T A __attribute__ ((vector_size (VLEN), aligned(AlignA))); // vector view of af with declared alignment AlignA
+ typedef T B __attribute__ ((vector_size (VLEN), aligned(AlignB))); // vector view of bf with declared alignment AlignB
+ V partial[VectorsPerChunk];
+ memset(partial, 0, sizeof(partial)); // zero all accumulators
+ const A * a = reinterpret_cast<const A *>(af);
+ const B * b = reinterpret_cast<const B *>(bf);
+
+ const size_t numChunks(sz/ChunkSize); // only full chunks are handled by the vector loop
+ for (size_t i(0); i < numChunks; i++) {
+ for (size_t j(0); j < VectorsPerChunk; j++) {
+ partial[j] += (a[VectorsPerChunk*i+j] - b[VectorsPerChunk*i+j]) * (a[VectorsPerChunk*i+j] - b[VectorsPerChunk*i+j]);
+ }
+ }
+ double sum(0);
+ for (size_t i(numChunks*ChunkSize); i < sz; i++) { // scalar pass over the elements past the last full chunk
+ sum += (af[i] - bf[i]) * (af[i] - bf[i]);
+ }
+ partial[0] = sumR<V, VectorsPerChunk>(partial); // sumR is defined elsewhere; presumably folds the accumulators into one vector
+
+ return sum + sumT<T, V>(partial[0]); // sumT is defined elsewhere; presumably adds up the lanes of the vector
+}
+
+template <typename T, unsigned VLEN> // Dispatches to the euclidianDistanceT instantiation matching the alignment of af and bf.
+double euclidianDistanceSelectAlignment(const T * af, const T * bf, size_t sz)
+{
+ constexpr unsigned ALIGN = 32; // alignment (in bytes) tested for; unaligned inputs are declared with alignment 1
+ if (validAlignment(af, ALIGN)) { // validAlignment is defined elsewhere in this header
+ if (validAlignment(bf, ALIGN)) {
+ return euclidianDistanceT<T, VLEN, ALIGN, ALIGN>(af, bf, sz);
+ } else {
+ return euclidianDistanceT<T, VLEN, ALIGN, 1>(af, bf, sz); // second argument is the vector length: VLEN, as in every other branch
+ }
+ } else {
+ if (validAlignment(bf, ALIGN)) {
+ return euclidianDistanceT<T, VLEN, 1, ALIGN>(af, bf, sz);
+ } else {
+ return euclidianDistanceT<T, VLEN, 1, 1>(af, bf, sz);
+ }
+ }
+}
+
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index 7018f37d49a..13929a288f9 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -32,6 +32,30 @@ multiplyAdd(const T * a, const T * b, size_t sz)
return sum;
}
+template <typename T, size_t UNROLL> // Sum of (a[i]-b[i])^2 over sz elements, accumulated in UNROLL separate partial sums.
+double
+euclidianDistanceT(const T * a, const T * b, size_t sz)
+{
+ T partial[UNROLL]; // partial sums are kept in T; only the final reduction is done in double
+ for (size_t i(0); i < UNROLL; i++) {
+ partial[i] = 0;
+ }
+ size_t i(0);
+ for (; i + UNROLL <= sz; i += UNROLL) { // full groups of UNROLL elements
+ for (size_t j(0); j < UNROLL; j++) {
+ partial[j] += (a[i+j] - b[i+j]) * (a[i+j] - b[i+j]);
+ }
+ }
+ for (;i < sz; i++) { // leftover elements (fewer than UNROLL) are spread over the partial sums
+ partial[i%UNROLL] += (a[i] - b[i]) * (a[i] - b[i]);
+ }
+ double sum(0);
+ for (size_t j(0); j < UNROLL; j++) {
+ sum += partial[j];
+ }
+ return sum;
+}
+
template<size_t UNROLL, typename Operation>
void
bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) {
@@ -133,12 +157,12 @@ GenericAccelrator::populationCount(const uint64_t *a, size_t sz) const {
double
GenericAccelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const {
- return helper::euclidianDistanceT<float, 8>(a, b, sz);
+ return euclidianDistanceT<float, 8>(a, b, sz);
}
double
GenericAccelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const {
- return helper::euclidianDistanceT<double, 4>(a, b, sz);
+ return euclidianDistanceT<double, 4>(a, b, sz);
}
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index 6a70e14c290..f5daf2b9081 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -3,6 +3,7 @@
#pragma once
#include <vespa/vespalib/util/optimized.h>
+#include <cstring>
namespace vespalib::hwaccelrated::helper {
namespace {
@@ -23,29 +24,5 @@ populationCount(const uint64_t *a, size_t sz) {
return count;
}
-template <typename T, size_t UNROLL>
-double
-euclidianDistanceT(const T * a, const T * b, size_t sz)
-{
- T partial[UNROLL];
- for (size_t i(0); i < UNROLL; i++) {
- partial[i] = 0;
- }
- size_t i(0);
- for (; i + UNROLL <= sz; i += UNROLL) {
- for (size_t j(0); j < UNROLL; j++) {
- partial[j] += (a[i+j] - b[i+j]) * (a[i+j] - b[i+j]);
- }
- }
- for (;i < sz; i++) {
- partial[i%UNROLL] += (a[i] - b[i]) * (a[i] - b[i]);
- }
- double sum(0);
- for (size_t j(0); j < UNROLL; j++) {
- sum += partial[j];
- }
- return sum;
-}
-
}
}