aboutsummaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2024-05-15 13:34:21 +0000
committer Henning Baldersheim <balder@yahoo-inc.com> 2024-05-15 13:34:21 +0000
commit ef6e5edc57653df6ac228de7704dbe4a98b32607 (patch)
tree 73d5eee2fbe024d5e0ab82756c6842409bb60dcf /vespalib
parent 61c22f760d11f3b87a175149acedc24550f1f9fd (diff)
Speed up dotproduct for int8.
Diffstat (limited to 'vespalib')
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp 2
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp 58
2 files changed, 44 insertions, 16 deletions
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index 4307b38d18b..4ac99d90a7e 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -101,7 +101,7 @@ GenericAccelrator::dotProduct(const double * a, const double * b, size_t sz) con
// GenericAccelrator's dot product for int8 vectors of sz elements; the result is int64_t.
// The patch replaces the call to the generic multiplyAdd<int64_t, int8_t, 8> template
// with helper::multiplyAdd(a, b, sz) (presumably the int8 multiplyAdd that the same
// patch adds to private_helpers.hpp - confirm the namespace).
int64_t
GenericAccelrator::dotProduct(const int8_t * a, const int8_t * b, size_t sz) const noexcept
{
- return multiplyAdd<int64_t, int8_t, 8>(a, b, sz);
+ return helper::multiplyAdd(a, b, sz);
}
int64_t
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index 173fe151831..b2bc087b7e9 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -52,7 +52,7 @@ void
andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize");
- static_assert(ChunkSize*Chunks == 128, "ChunkSize*Chunks == 128");
+ static_assert(ChunkSize * Chunks == 128, "ChunkSize*Chunks == 128");
Chunk * chunk = static_cast<Chunk *>(dest);
const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset);
for (size_t n=0; n < Chunks; n++) {
@@ -68,28 +68,28 @@ andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src,
// Bitwise-ORs Chunks vector-typed chunks from every entry of src into dest.
// - offset and src[i].first are handed to cast<Chunk, ChunkSize>() to locate that entry's chunks.
// - src[i].second is forwarded unchanged to get<Chunk, ChunkSize>() for every chunk read; its meaning
//   is defined by that helper (presumably an "invert this source" flag - confirm against get()).
// - dest is written as Chunks consecutive Chunk values; the static_asserts pin sizeof(Chunk) to
//   ChunkSize and ChunkSize * Chunks to 128.
// src must be non-empty: src[0] is read unconditionally to seed dest before the remaining
// entries are OR-ed in.
template<unsigned ChunkSize, unsigned Chunks>
void
-orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
+orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) {
typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize");
- static_assert(ChunkSize*Chunks == 128, "ChunkSize*Chunks == 128");
+ static_assert(ChunkSize * Chunks == 128, "ChunkSize*Chunks == 128");
Chunk * chunk = static_cast<Chunk *>(dest);
const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset);
- for (size_t n=0; n < Chunks; n++) {
- chunk[n] = get<Chunk, ChunkSize>(tmp+n, src[0].second);
+ for (size_t n = 0; n < Chunks; n++) {
+ chunk[n] = get<Chunk, ChunkSize>(tmp + n, src[0].second);
}
for (size_t i(1); i < src.size(); i++) {
tmp = cast<Chunk, ChunkSize>(src[i].first, offset);
- for (size_t n=0; n < Chunks; n++) {
- chunk[n] |= get<Chunk, ChunkSize>(tmp+n, src[i].second);
+ for (size_t n = 0; n < Chunks; n++) {
+ chunk[n] |= get<Chunk, ChunkSize>(tmp + n, src[i].second);
}
}
}
template<typename TemporaryT=int32_t>
-double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz) __attribute__((noinline));
+double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) __attribute__((noinline));
+
template<typename TemporaryT>
-double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz)
-{
+double squaredEuclideanDistanceT(const int8_t *a, const int8_t *b, size_t sz) {
//Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float
TemporaryT sum = 0;
for (size_t i(0); i < sz; i++) {
@@ -100,10 +100,10 @@ double squaredEuclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz)
}
inline double
-squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) {
- constexpr size_t LOOP_COUNT = 0x200;
+squaredEuclideanDistance(const int8_t *a, const int8_t *b, size_t sz) {
+ constexpr size_t LOOP_COUNT = 0x100;
double sum(0);
- size_t i=0;
+ size_t i = 0;
for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) {
sum += squaredEuclideanDistanceT<int32_t>(a + i, b + i, LOOP_COUNT);
}
@@ -114,12 +114,40 @@ squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) {
}
// Expands sz 16-bit source words (bfloat16 values, per the function name) into the
// 32-bit slots of dest. Each word is shifted left by 16, so it becomes the upper half
// of the 32-bit pattern and the lower 16 bits are zero. dest is written through a
// uint32_t alias of the float buffer. The patch itself only changes pointer spacing.
inline void
-convert_bfloat16_to_float(const uint16_t * src, float * dest, size_t sz) noexcept {
- uint32_t * asu32 = reinterpret_cast<uint32_t *>(dest);
+convert_bfloat16_to_float(const uint16_t *src, float *dest, size_t sz) noexcept {
+ uint32_t *asu32 = reinterpret_cast<uint32_t *>(dest);
for (size_t i(0); i < sz; i++) {
asu32[i] = src[i] << 16;
}
}
// Dot product of two int8 vectors, accumulated in ACCUM.
//
// a, b : input vectors, each with at least sz readable elements.
// sz   : number of element pairs to multiply and add; 0 yields 0.
// Returns the sum of a[i] * b[i] for i in [0, sz), held in ACCUM.
//
// The caller must keep sz small enough that the running sum fits in ACCUM:
// one product has magnitude at most 128 * 128 = 16384.
// NOTE(review): the default ACCUM is unsigned, so a negative total wraps
// modulo 2^32; pass a signed accumulator (e.g. int32_t) for signed results.
//
// Declared noinline with the same attribute spelling as squaredEuclideanDistanceT.
template<typename ACCUM = uint32_t>
ACCUM
multiplyAddT(const int8_t *a, const int8_t *b, size_t sz) noexcept __attribute__((noinline));

template<typename ACCUM>
ACCUM
multiplyAddT(const int8_t *a, const int8_t *b, size_t sz) noexcept {
    ACCUM sum = 0;
    for (size_t i(0); i < sz; i++) {
        // Widen as *signed* 16-bit. Going through uint16_t would turn a negative
        // int8 into 65280..65535, giving wrong products and overflowing int when
        // both operands are negative.
        sum += int16_t(a[i]) * int16_t(b[i]);
    }
    return sum;
}
+
+inline int64_t
+multiplyAdd(const int8_t *a, const int8_t *b, size_t sz) noexcept {
+ constexpr size_t LOOP_COUNT = 0x100;
+ int64_t sum(0);
+ size_t i = 0;
+ for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) {
+ sum += multiplyAddT<int32_t>(a + i, b + i, LOOP_COUNT);
+ }
+ if (sz > i) [[unlikely]] {
+ sum += multiplyAddT<int32_t>(a + i, b + i, sz - i);
+ }
+ return sum;
+}
+
}
}