diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-05-15 13:34:21 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-05-15 13:34:21 +0000 |
commit | ef6e5edc57653df6ac228de7704dbe4a98b32607 (patch) | |
tree | 73d5eee2fbe024d5e0ab82756c6842409bb60dcf /vespalib | |
parent | 61c22f760d11f3b87a175149acedc24550f1f9fd (diff) |
Speed up dotproduct for int8.
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp | 2 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp | 58 |
2 files changed, 44 insertions, 16 deletions
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index 4307b38d18b..4ac99d90a7e 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -101,7 +101,7 @@ GenericAccelrator::dotProduct(const double * a, const double * b, size_t sz) con
 int64_t
 GenericAccelrator::dotProduct(const int8_t * a, const int8_t * b, size_t sz) const noexcept
 {
-    return multiplyAdd<int64_t, int8_t, 8>(a, b, sz);
+    return helper::multiplyAdd(a, b, sz);
 }
 
 int64_t
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index 173fe151831..b2bc087b7e9 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -52,7 +52,7 @@ void
 andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
     typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
     static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize");
-    static_assert(ChunkSize*Chunks == 128, "ChunkSize*Chunks == 128");
+    static_assert(ChunkSize * Chunks == 128, "ChunkSize*Chunks == 128");
     Chunk * chunk = static_cast<Chunk *>(dest);
     const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset);
     for (size_t n=0; n < Chunks; n++) {
@@ -68,28 +68,28 @@ andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src,
 
 template<unsigned ChunkSize, unsigned Chunks>
 void
-orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
+orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) {
     typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
     static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize");
-    static_assert(ChunkSize*Chunks == 128, "ChunkSize*Chunks == 128");
+    static_assert(ChunkSize * Chunks == 128, "ChunkSize*Chunks == 128");
     Chunk * chunk = static_cast<Chunk *>(dest);
     const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset);
-    for (size_t n=0; n < Chunks; n++) {
-        chunk[n] = get<Chunk, ChunkSize>(tmp+n, src[0].second);
+    for (size_t n = 0; n < Chunks; n++) {
+        chunk[n] = get<Chunk, ChunkSize>(tmp + n, src[0].second);
     }
     for (size_t i(1); i < src.size(); i++) {
         tmp = cast<Chunk, ChunkSize>(src[i].first, offset);
-        for (size_t n=0; n < Chunks; n++) {
-            chunk[n] |= get<Chunk, ChunkSize>(tmp+n, src[i].second);
+        for (size_t n = 0; n < Chunks; n++) {
+            chunk[n] |= get<Chunk, ChunkSize>(tmp + n, src[i].second);
         }
     }
 }
 
 template<typename TemporaryT=int32_t>
-double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz) __attribute__((noinline));
+double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) __attribute__((noinline));
+
 template<typename TemporaryT>
-double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz)
-{
+double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) {
     //Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float
     TemporaryT sum = 0;
     for (size_t i(0); i < sz; i++) {
@@ -100,10 +100,10 @@ double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz)
 }
 
 inline double
-squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) {
-    constexpr size_t LOOP_COUNT = 0x200;
+squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) {
+    constexpr size_t LOOP_COUNT = 0x100;
     double sum(0);
-    size_t i=0;
+    size_t i = 0;
     for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) {
         sum += squaredEuclideanDistanceT<int32_t>(a + i, b + i, LOOP_COUNT);
     }
@@ -114,12 +114,40 @@ squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) {
 }
 
 inline void
-convert_bfloat16_to_float(const uint16_t * src, float * dest, size_t sz) noexcept {
-    uint32_t * asu32 = reinterpret_cast<uint32_t *>(dest);
+convert_bfloat16_to_float(const uint16_t *src, float *dest, size_t sz) noexcept {
+    uint32_t *asu32 = reinterpret_cast<uint32_t *>(dest);
     for (size_t i(0); i < sz; i++) {
         asu32[i] = src[i] << 16;
     }
 }
 
+template<typename ACCUM = uint32_t>
+ACCUM
+multiplyAddT(const int8_t *a, const int8_t *b, size_t sz) noexcept __attribute_noinline__;
+
+template<typename ACCUM>
+ACCUM
+multiplyAddT(const int8_t *a, const int8_t *b, size_t sz) noexcept {
+    ACCUM sum = 0;
+    for (size_t i(0); i < sz; i++) {
+        sum += uint16_t(a[i]) * uint16_t(b[i]);
+    }
+    return sum;
+}
+
+inline int64_t
+multiplyAdd(const int8_t *a, const int8_t *b, size_t sz) noexcept {
+    constexpr size_t LOOP_COUNT = 0x100;
+    int64_t sum(0);
+    size_t i = 0;
+    for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) {
+        sum += multiplyAddT<int32_t>(a + i, b + i, LOOP_COUNT);
+    }
+    if (sz > i) [[unlikely]] {
+        sum += multiplyAddT<int32_t>(a + i, b + i, sz - i);
+    }
+    return sum;
+}
+
 }
 }