diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2020-03-10 13:54:03 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2020-03-10 13:54:03 +0000 |
commit | 10856a1ba094837bf3a098f377a0d77d9e753ce8 (patch) | |
tree | 2b3ea28aaecf4ad7fb59b108a5d576b1366c8fd2 /vespalib | |
parent | e0bd73b016439f087edb7ce165524fed3c9ab1ee (diff) |
Simply follow pattern from dotproduct.
Diffstat (limited to 'vespalib')
5 files changed, 79 insertions, 31 deletions
```diff
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
index cd80261fc76..2c784f5dda2 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -12,12 +12,12 @@ Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const {
 
 double
 Avx2Accelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const {
-    return helper::euclidianDistanceT<float, 32>(a, b, sz);
+    return avx::euclidianDistanceSelectAlignment<float, 32>(a, b, sz);
 }
 
 double
 Avx2Accelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const {
-    return helper::euclidianDistanceT<double, 8>(a, b, sz);
+    return avx::euclidianDistanceSelectAlignment<double, 32>(a, b, sz);
 }
 
 }
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
index 747119d7a8e..45126c366ad 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
@@ -24,12 +24,12 @@ Avx512Accelrator::populationCount(const uint64_t *a, size_t sz) const {
 
 double
 Avx512Accelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const {
-    return helper::euclidianDistanceT<float, 64>(a, b, sz);
+    return avx::euclidianDistanceSelectAlignment<float, 64>(a, b, sz);
 }
 
 double
 Avx512Accelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const {
-    return helper::euclidianDistanceT<double, 32>(a, b, sz);
+    return avx::euclidianDistanceSelectAlignment<double, 64>(a, b, sz);
 }
 
 }
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
index 9e6a6d8817f..406757d239c 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp
@@ -4,7 +4,6 @@
 
 #include "private_helpers.hpp"
 #include <vespa/fastos/dynamiclibrary.h>
-#include <cstring>
 
 namespace vespalib::hwaccelrated::avx {
 
@@ -87,4 +86,52 @@ T dotProductSelectAlignment(const T * af, const T * bf, size_t sz)
     }
 }
 
+template <typename T, unsigned VLEN, unsigned AlignA, unsigned AlignB>
+double
+euclidianDistanceT(const T * af, const T * bf, size_t sz)
+{
+    constexpr unsigned VectorsPerChunk = 4;
+    constexpr unsigned ChunkSize = VLEN*VectorsPerChunk/sizeof(T);
+    typedef T V __attribute__ ((vector_size (VLEN)));
+    typedef T A __attribute__ ((vector_size (VLEN), aligned(AlignA)));
+    typedef T B __attribute__ ((vector_size (VLEN), aligned(AlignB)));
+    V partial[VectorsPerChunk];
+    memset(partial, 0, sizeof(partial));
+    const A * a = reinterpret_cast<const A *>(af);
+    const B * b = reinterpret_cast<const B *>(bf);
+
+    const size_t numChunks(sz/ChunkSize);
+    for (size_t i(0); i < numChunks; i++) {
+        for (size_t j(0); j < VectorsPerChunk; j++) {
+            partial[j] += (a[VectorsPerChunk*i+j] - b[VectorsPerChunk*i+j]) * (a[VectorsPerChunk*i+j] - b[VectorsPerChunk*i+j]);
+        }
+    }
+    double sum(0);
+    for (size_t i(numChunks*ChunkSize); i < sz; i++) {
+        sum += (af[i] - bf[i]) * (af[i] - bf[i]);
+    }
+    partial[0] = sumR<V, VectorsPerChunk>(partial);
+
+    return sum + sumT<T, V>(partial[0]);
+}
+
+template <typename T, unsigned VLEN>
+double euclidianDistanceSelectAlignment(const T * af, const T * bf, size_t sz)
+{
+    constexpr unsigned ALIGN = 32;
+    if (validAlignment(af, ALIGN)) {
+        if (validAlignment(bf, ALIGN)) {
+            return euclidianDistanceT<T, VLEN, ALIGN, ALIGN>(af, bf, sz);
+        } else {
+            return euclidianDistanceT<T, ALIGN, ALIGN, 1>(af, bf, sz);
+        }
+    } else {
+        if (validAlignment(bf, ALIGN)) {
+            return euclidianDistanceT<T, VLEN, 1, ALIGN>(af, bf, sz);
+        } else {
+            return euclidianDistanceT<T, VLEN, 1, 1>(af, bf, sz);
+        }
+    }
+}
+
 }
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index 7018f37d49a..13929a288f9 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -32,6 +32,30 @@ multiplyAdd(const T * a, const T * b, size_t sz)
     return sum;
 }
 
+template <typename T, size_t UNROLL>
+double
+euclidianDistanceT(const T * a, const T * b, size_t sz)
+{
+    T partial[UNROLL];
+    for (size_t i(0); i < UNROLL; i++) {
+        partial[i] = 0;
+    }
+    size_t i(0);
+    for (; i + UNROLL <= sz; i += UNROLL) {
+        for (size_t j(0); j < UNROLL; j++) {
+            partial[j] += (a[i+j] - b[i+j]) * (a[i+j] - b[i+j]);
+        }
+    }
+    for (;i < sz; i++) {
+        partial[i%UNROLL] += (a[i] - b[i]) * (a[i] - b[i]);
+    }
+    double sum(0);
+    for (size_t j(0); j < UNROLL; j++) {
+        sum += partial[j];
+    }
+    return sum;
+}
+
 template<size_t UNROLL, typename Operation>
 void
 bitOperation(Operation operation, void * aOrg, const void * bOrg, size_t bytes) {
@@ -133,12 +157,12 @@ GenericAccelrator::populationCount(const uint64_t *a, size_t sz) const {
 
 double
 GenericAccelrator::squaredEuclidianDistance(const float * a, const float * b, size_t sz) const {
-    return helper::euclidianDistanceT<float, 8>(a, b, sz);
+    return euclidianDistanceT<float, 8>(a, b, sz);
 }
 
 double
 GenericAccelrator::squaredEuclidianDistance(const double * a, const double * b, size_t sz) const {
-    return helper::euclidianDistanceT<double, 4>(a, b, sz);
+    return euclidianDistanceT<double, 4>(a, b, sz);
 }
 
 }
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index 6a70e14c290..f5daf2b9081 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -3,6 +3,7 @@
 #pragma once
 
 #include <vespa/vespalib/util/optimized.h>
+#include <cstring>
 
 namespace vespalib::hwaccelrated::helper {
 namespace {
@@ -23,29 +24,5 @@ populationCount(const uint64_t *a, size_t sz) {
     return count;
 }
 
-template <typename T, size_t UNROLL>
-double
-euclidianDistanceT(const T * a, const T * b, size_t sz)
-{
-    T partial[UNROLL];
-    for (size_t i(0); i < UNROLL; i++) {
-        partial[i] = 0;
-    }
-    size_t i(0);
-    for (; i + UNROLL <= sz; i += UNROLL) {
-        for (size_t j(0); j < UNROLL; j++) {
-            partial[j] += (a[i+j] - b[i+j]) * (a[i+j] - b[i+j]);
-        }
-    }
-    for (;i < sz; i++) {
-        partial[i%UNROLL] += (a[i] - b[i]) * (a[i] - b[i]);
-    }
-    double sum(0);
-    for (size_t j(0); j < UNROLL; j++) {
-        sum += partial[j];
-    }
-    return sum;
-}
-
 }
 }
```