From 8ef08f0c80a409ecfebd179d2a71382ee9b9d814 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Thu, 4 Jun 2020 22:34:44 +0000 Subject: When we pull in a cacheline, we should use it too. There is possibly wasting 7/8 of it and very likely suffer a cache miss. --- vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp | 10 +++ vespalib/src/vespa/vespalib/hwaccelrated/avx2.h | 2 + .../src/vespa/vespalib/hwaccelrated/avx512.cpp | 10 +++ vespalib/src/vespa/vespalib/hwaccelrated/avx512.h | 2 + .../src/vespa/vespalib/hwaccelrated/generic.cpp | 10 +++ vespalib/src/vespa/vespalib/hwaccelrated/generic.h | 2 + .../vespa/vespalib/hwaccelrated/iaccelrated.cpp | 87 +++++++++++++++++++++- .../src/vespa/vespalib/hwaccelrated/iaccelrated.h | 4 + .../vespalib/hwaccelrated/private_helpers.hpp | 41 ++++++++++ 9 files changed, 165 insertions(+), 3 deletions(-) (limited to 'vespalib') diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index 7ff393c87f8..233609d505b 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -20,4 +20,14 @@ Avx2Accelrator::squaredEuclideanDistance(const double * a, const double * b, siz return avx::euclideanDistanceSelectAlignment(a, b, sz); } +void +Avx2Accelrator::and64(size_t offset, const std::vector> &src, uint64_t *dest) const { + helper::andChunks<32u, 2u>(offset, src, dest); +} + +void +Avx2Accelrator::or64(size_t offset, const std::vector> &src, uint64_t *dest) const { + helper::orChunks<32u, 2u>(offset, src, dest); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index 3e0dbb28110..292961a6f4d 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -15,6 +15,8 @@ public: size_t populationCount(const uint64_t *a, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; + void and64(size_t offset, const std::vector> &src, uint64_t *dest) const override; + void or64(size_t offset, const std::vector> &src, uint64_t *dest) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp index 0941e6d6ad8..9cfae4757b9 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -32,4 +32,14 @@ Avx512Accelrator::squaredEuclideanDistance(const double * a, const double * b, s return avx::euclideanDistanceSelectAlignment(a, b, sz); } +void +Avx512Accelrator::and64(size_t offset, const std::vector> &src, uint64_t *dest) const { + helper::andChunks<64, 1>(offset, src, dest); +} + +void +Avx512Accelrator::or64(size_t offset, const std::vector> &src, uint64_t *dest) const { + helper::orChunks<64, 1>(offset, src, dest); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h index 209ec06c857..ee422b57171 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -17,6 +17,8 @@ public: size_t populationCount(const uint64_t *a, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; + void and64(size_t offset, const std::vector> &src, uint64_t *dest) const override; + void or64(size_t offset, const std::vector> &src, uint64_t *dest) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index f9684e88c63..460ae7e7388 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -165,4 +165,14 @@ GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, return euclideanDistanceT(a, b, sz); } +void +GenericAccelrator::and64(size_t offset, const std::vector> &src, uint64_t *dest) const { + helper::andChunks<16, 4>(offset, src, dest); +} + +void +GenericAccelrator::or64(size_t offset, const std::vector> &src, uint64_t *dest) const { + helper::orChunks<16,4>(offset, src, dest); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h index 50a3d59d49d..8ce320cd4c4 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h @@ -25,6 +25,8 @@ public: size_t populationCount(const uint64_t *a, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; + void and64(size_t offset, const std::vector> &src, uint64_t *dest) const override; + void or64(size_t offset, const std::vector> &src, uint64_t *dest) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index bb132165e53..b0215307f4c 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -46,7 +46,8 @@ std::vector createAndFill(size_t sz) { } template -void verifyDotproduct(const IAccelrated & accel) +void +verifyDotproduct(const IAccelrated & accel) { const size_t testLength(255); srand(1); @@ -66,7 +67,8 @@ void verifyDotproduct(const IAccelrated & accel) } template -void verifyEuclideanDistance(const IAccelrated & accel) { +void +verifyEuclideanDistance(const IAccelrated & accel) { const size_t testLength(255); srand(1); std::vector a = createAndFill(testLength); @@ -84,7 +86,8 @@ void verifyEuclideanDistance(const IAccelrated & accel) { } } -void verifyPopulationCount(const IAccelrated & accel) +void +verifyPopulationCount(const IAccelrated & accel) { const uint64_t words[7] = {0x123456789abcdef0L, // 32 0x0000000000000000L, // 0 @@ -101,6 +104,82 @@ void verifyPopulationCount(const IAccelrated & accel) } } +void +fill(std::vector & v, size_t n) { + v.reserve(n); + for (size_t i(0); i < n; i++) { + v.emplace_back(random()); + } +} + +void +simpleAndWith(std::vector & dest, const std::vector & src) { + for (size_t i(0); i < dest.size(); i++) { + dest[i] &= src[i]; + } +} + +void +simpleOrWith(std::vector & dest, const std::vector & src) { + for (size_t i(0); i < dest.size(); i++) { + dest[i] |= src[i]; + } +} + +void +verifyOr64(const IAccelrated & accel) { + std::vector vectors[3] ; + for (auto & v : vectors) { + fill(v, 16); + } + for (size_t offset = 0; offset < 8; offset++) { + for (size_t i = 1; i < VESPA_NELEMS(vectors); i++) { + std::vector expected = vectors[0]; + for (size_t j = 1; j < i; j++) { + simpleOrWith(expected, vectors[j]); + } + std::vector> vRefs; + for (size_t j(0); j < i; j++) { + vRefs.emplace_back(&vectors[j][0], false); + } + uint64_t dest[8] __attribute((aligned(64))); + accel.or64(offset, vRefs, dest); + int diff = memcmp(&expected[offset], dest, sizeof(dest)); + if (diff != 0) { + fprintf(stderr, "Accelrator is not failing and64\n"); + LOG_ABORT("should not be reached"); + } + } + } +} + +void +verifyAnd64(const IAccelrated & accel) { + std::vector vectors[3] ; + for (auto & v : vectors) { + fill(v, 16); + } + for (size_t offset = 0; offset < 8; offset++) { + for (size_t i = 1; i < VESPA_NELEMS(vectors); i++) { + std::vector expected = vectors[0]; + for (size_t j = 1; j < i; j++) { + simpleAndWith(expected, vectors[j]); + } + std::vector> vRefs; + for (size_t j(0); j < i; j++) { + vRefs.emplace_back(&vectors[j][0], false); + } + uint64_t dest[8] __attribute((aligned(64))); + accel.and64(offset, vRefs, dest); + int diff = memcmp(&expected[offset], dest, sizeof(dest)); + if (diff != 0) { + fprintf(stderr, "Accelrator is not failing and64\n"); + LOG_ABORT("should not be reached"); + } + } + } +} + class RuntimeVerificator { public: @@ -114,6 +193,8 @@ private: verifyEuclideanDistance(accelrated); verifyEuclideanDistance(accelrated); verifyPopulationCount(accelrated); + verifyAnd64(accelrated); + verifyOr64(accelrated); } }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h index 0292ad14643..f352fb292ce 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h @@ -4,6 +4,7 @@ #include #include +#include namespace vespalib::hwaccelrated { @@ -29,6 +30,9 @@ public: virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0; virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const = 0; virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const = 0; + // And 64 bytes from multiple sources + virtual void and64(size_t offset, const std::vector> &src, uint64_t *dest) const = 0; + virtual void or64(size_t offset, const std::vector> &src, uint64_t *dest) const = 0; static const IAccelrated & getAccelrator() __attribute__((noinline)); }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp index f5daf2b9081..2759cc35ba9 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp @@ -24,5 +24,46 @@ populationCount(const uint64_t *a, size_t sz) { return count; } +template +T get(const void * base, bool invert) { + T v; + memcpy(&v, base, sizeof(T)); + return __builtin_expect(invert, false) ? ~v : v; +} + +template +void +andChunks(size_t offset, const std::vector> & src, uint64_t * dest) { + typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize))); + Chunk * chunk = reinterpret_cast(dest); + const Chunk * tmp = reinterpret_cast(src[0].first+offset); + for (size_t n=0; n < Chunks; n++) { + chunk[n] = get(tmp+n, src[0].second); + } + for (size_t i(1); i < src.size(); i++) { + tmp = reinterpret_cast(src[i].first+offset); + for (size_t n=0; n < Chunks; n++) { + chunk[n] &= get(tmp+n, src[i].second); + } + } +} + +template +void +orChunks(size_t offset, const std::vector> & src, uint64_t * dest) { + typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize))); + Chunk * chunk = reinterpret_cast(dest); + const Chunk * tmp = reinterpret_cast(src[0].first+offset); + for (size_t n=0; n < Chunks; n++) { + chunk[n] = get(tmp+n, src[0].second); + } + for (size_t i(1); i < src.size(); i++) { + tmp = reinterpret_cast(src[i].first+offset); + for (size_t n=0; n < Chunks; n++) { + chunk[n] |= get(tmp+n, src[i].second); + } + } +} + } } -- cgit v1.2.3