Speed up dotproduct for int8.

author: Henning Baldersheim <balder@yahoo-inc.com> 2024-05-15 13:34:21 +0000
committer: Henning Baldersheim <balder@yahoo-inc.com> 2024-05-15 13:34:21 +0000
commit: ef6e5edc57653df6ac228de7704dbe4a98b32607 (patch)
tree: 73d5eee2fbe024d5e0ab82756c6842409bb60dcf /vespalib
parent: 61c22f760d11f3b87a175149acedc24550f1f9fd (diff)
2 files changed, 44 insertions, 16 deletions
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index 4307b38d18b..4ac99d90a7e 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -101,7 +101,7 @@ GenericAccelrator::dotProduct(const double * a, const double * b, size_t sz) con
 int64_t
 GenericAccelrator::dotProduct(const int8_t * a, const int8_t * b, size_t sz) const noexcept
 {
-    return multiplyAdd<int64_t, int8_t, 8>(a, b, sz);
+    return helper::multiplyAdd(a, b, sz); // int8 dot product is delegated to the int8_t-specific helper
 }
 
 int64_t
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index 173fe151831..b2bc087b7e9 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -52,7 +52,7 @@ void
 andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
     typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
     static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize");
-    static_assert(ChunkSize*Chunks == 128, "ChunkSize*Chunks == 128");
+    static_assert(ChunkSize * Chunks == 128, "ChunkSize*Chunks == 128");
     Chunk * chunk = static_cast<Chunk *>(dest);
     const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset);
     for (size_t n=0; n < Chunks; n++) {
@@ -68,28 +68,28 @@ andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src,
 
 template<unsigned ChunkSize, unsigned Chunks>
 void
-orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
+orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) { // bitwise-ORs Chunks chunks from every src entry into dest; src[0] is read unconditionally, so src must be non-empty
     typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
     static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize");
-    static_assert(ChunkSize*Chunks == 128, "ChunkSize*Chunks == 128");
+    static_assert(ChunkSize * Chunks == 128, "ChunkSize*Chunks == 128"); // each call writes exactly 128 bytes of dest
     Chunk * chunk = static_cast<Chunk *>(dest);
     const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset);
-    for (size_t n=0; n < Chunks; n++) {
-        chunk[n] = get<Chunk, ChunkSize>(tmp+n, src[0].second);
+    for (size_t n = 0; n < Chunks; n++) { // seed dest from the first source
+        chunk[n] = get<Chunk, ChunkSize>(tmp + n, src[0].second); // NOTE(review): the bool is passed straight to get(); presumably an invert flag - confirm
     }
     for (size_t i(1); i < src.size(); i++) {
         tmp = cast<Chunk, ChunkSize>(src[i].first, offset);
-        for (size_t n=0; n < Chunks; n++) {
-            chunk[n] |= get<Chunk, ChunkSize>(tmp+n, src[i].second);
+        for (size_t n = 0; n < Chunks; n++) { // fold each remaining source into dest
+            chunk[n] |= get<Chunk, ChunkSize>(tmp + n, src[i].second); // OR-accumulate chunk by chunk
         }
     }
 }
 
 template<typename TemporaryT=int32_t>
-double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz) __attribute__((noinline));
+double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) __attribute__((noinline));
+
 template<typename TemporaryT>
-double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz)
-{
+double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) {
     //Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float
     TemporaryT sum = 0;
     for (size_t i(0); i < sz; i++) {
@@ -100,10 +100,10 @@ double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz)
 }
 
 inline double
-squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) {
-    constexpr size_t LOOP_COUNT = 0x200;
+squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) {
+    constexpr size_t LOOP_COUNT = 0x100;
     double sum(0);
-    size_t i=0;
+    size_t i = 0;
     for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) {
         sum += squaredEuclideanDistanceT<int32_t>(a + i, b + i, LOOP_COUNT);
     }
@@ -114,12 +114,40 @@ squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) {
 }
 
 inline void
-convert_bfloat16_to_float(const uint16_t * src, float * dest, size_t sz) noexcept {
-    uint32_t * asu32 = reinterpret_cast<uint32_t *>(dest);
+convert_bfloat16_to_float(const uint16_t *src, float *dest, size_t sz) noexcept { // expands sz 16-bit values from src into sz 32-bit slots of dest
+    uint32_t *asu32 = reinterpret_cast<uint32_t *>(dest); // dest is filled as raw 32-bit words: each src value lands in the upper 16 bits, the lower 16 bits are zero
     for (size_t i(0); i < sz; i++) {
         asu32[i] = src[i] << 16;
     }
 }
 
+// Returns the sum of a[i] * b[i] for i in [0, sz), accumulated in ACCUM; declared noinline.
+// ACCUM must be signed and able to hold sz products, each of magnitude at most 128 * 128 = 16384.
+template<typename ACCUM = int32_t>
+ACCUM multiplyAddT(const int8_t *a, const int8_t *b, size_t sz) noexcept __attribute_noinline__;
+
+template<typename ACCUM>
+ACCUM multiplyAddT(const int8_t *a, const int8_t *b, size_t sz) noexcept {
+    ACCUM sum = 0;
+    for (size_t i(0); i < sz; i++) {
+        sum += int16_t(a[i]) * int16_t(b[i]); // widen as signed: uint16_t would turn -1 into 65535 and corrupt the product
+    }
+    return sum;
+}
+
+inline int64_t
+multiplyAdd(const int8_t *a, const int8_t *b, size_t sz) noexcept {
+    // Sum full blocks with an int32_t accumulator, widening to int64_t between blocks; then the tail.
+    constexpr size_t BLOCK = 0x100;
+    const size_t full = sz - (sz % BLOCK);
+    int64_t total = 0;
+    size_t pos = 0;
+    for (; pos < full; pos += BLOCK) {
+        total += multiplyAddT<int32_t>(a + pos, b + pos, BLOCK);
+    }
+    if (pos < sz) [[unlikely]] { total += multiplyAddT<int32_t>(a + pos, b + pos, sz - pos); }
+    return total;
+}
+
 }
 }
author	Henning Baldersheim <balder@yahoo-inc.com>	2024-05-15 13:34:21 +0000
committer	Henning Baldersheim <balder@yahoo-inc.com>	2024-05-15 13:34:21 +0000
commit	ef6e5edc57653df6ac228de7704dbe4a98b32607 (patch)
tree	73d5eee2fbe024d5e0ab82756c6842409bb60dcf /vespalib
parent	61c22f760d11f3b87a175149acedc24550f1f9fd (diff)