diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2020-01-24 15:10:15 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2020-01-24 15:10:15 +0000 |
commit | 5cb24ee230a6d5f7eb1155b2746c6a3f11d28b16 (patch) | |
tree | d2d9628909144c2a1affb432392d34fc39658d55 /vespalib | |
parent | 5ff453a5a69bbae2f05ba67240f08774be025e79 (diff) |
Count bits faster when hardware supports it.
Diffstat (limited to 'vespalib')
14 files changed, 87 insertions, 6 deletions
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp index 14abb93d8d0..39ea0d2d73b 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx.cpp @@ -17,4 +17,9 @@ AvxAccelrator::dotProduct(const double * af, const double * bf, size_t sz) const return avx::dotProductSelectAlignment<double, 32>(af, bf, sz); } +size_t +AvxAccelrator::populationCount(const uint64_t *a, size_t sz) const { + return helper::populationCount(a, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx.h index ffbe0b8d27f..624531a9ca5 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx.h @@ -14,6 +14,7 @@ class AvxAccelrator : public Sse2Accelrator public: float dotProduct(const float * a, const float * b, size_t sz) const override; double dotProduct(const double * a, const double * b, size_t sz) const override; + size_t populationCount(const uint64_t *a, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index 4c4e53e88db..ea8a3ead538 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -17,4 +17,9 @@ Avx2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) cons return avx::dotProductSelectAlignment<double, 32>(af, bf, sz); } +size_t +Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const { + return helper::populationCount(a, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index f20068c6478..cf91bc81cfd 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -14,6 +14,7 @@ class Avx2Accelrator : public AvxAccelrator public: float dotProduct(const float * a, const float * b, size_t sz) const override; double dotProduct(const double * a, const double * b, size_t sz) const override; + size_t populationCount(const uint64_t *a, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp index 4d21c9358ec..1abf6b270cf 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -17,4 +17,9 @@ Avx512Accelrator::dotProduct(const double * af, const double * bf, size_t sz) co return avx::dotProductSelectAlignment<double, 64>(af, bf, sz); } +size_t +Avx512Accelrator::populationCount(const uint64_t *a, size_t sz) const { + return helper::populationCount(a, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h index 5807aeeee57..eac8c96832b 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -14,6 +14,7 @@ class Avx512Accelrator : public Avx2Accelrator public: float dotProduct(const float * a, const float * b, size_t sz) const override; double dotProduct(const double * a, const double * b, size_t sz) const override; + size_t populationCount(const uint64_t *a, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp index 2db7ebfd8fd..9e6a6d8817f 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avxprivate.hpp @@ -2,6 +2,7 @@ #pragma once +#include "private_helpers.hpp" #include <vespa/fastos/dynamiclibrary.h> #include <cstring> diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index d70071525c6..b70ebb4051a 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "generic.h" +#include "private_helpers.hpp" namespace vespalib::hwaccelrated { @@ -124,4 +125,9 @@ GenericAccelrator::notBit(void * aOrg, size_t bytes) const } } +size_t +GenericAccelrator::populationCount(const uint64_t *a, size_t sz) const { + return helper::populationCount(a, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h index f9aab3ae845..d76d0728bdd 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h @@ -22,6 +22,7 @@ public: void andBit(void * a, const void * b, size_t bytes) const override; void andNotBit(void * a, const void * b, size_t bytes) const override; void notBit(void * a, size_t bytes) const override; + size_t populationCount(const uint64_t *a, size_t sz) const override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index aae277b48d8..4006897dce5 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -6,6 +6,7 @@ #include "avx.h" #include "avx2.h" #include "avx512.h" +#include <vespa/vespalib/util/memory.h> #include <vespa/log/log.h> LOG_SETUP(".vespalib.hwaccelrated"); @@ -22,27 +23,27 @@ public: class GenericFactory :public Factory{ public: - IAccelrated::UP create() const override { return IAccelrated::UP(new GenericAccelrator()); } + IAccelrated::UP create() const override { return std::make_unique<GenericAccelrator>(); } }; class Sse2Factory :public Factory{ public: - IAccelrated::UP create() const override { return IAccelrated::UP(new Sse2Accelrator()); } + IAccelrated::UP create() const override { return std::make_unique<Sse2Accelrator>(); } }; class AvxFactory :public Factory{ public: - IAccelrated::UP create() const override { return IAccelrated::UP(new AvxAccelrator()); } + IAccelrated::UP create() const override { return std::make_unique<AvxAccelrator>(); } }; class Avx2Factory :public Factory{ public: - IAccelrated::UP create() const override { return IAccelrated::UP(new Avx2Accelrator()); } + IAccelrated::UP create() const override { return std::make_unique<Avx2Accelrator>(); } }; class Avx512Factory :public Factory{ public: - IAccelrated::UP create() const override { return IAccelrated::UP(new Avx512Accelrator()); } + IAccelrated::UP create() const override { return std::make_unique<Avx512Accelrator>(); } }; template<typename T> @@ -67,6 +68,23 @@ void verifyAccelrator(const IAccelrated & accel) delete [] b; } +void verifyPopulationCount(const IAccelrated & accel) +{ + const uint64_t words[7] = {0x123456789abcdef0L, // 32 + 0x0000000000000000L, // 0 + 0x8000000000000000L, // 1 + 0xdeadbeefbeefdeadUL, // 48 + 0x5555555555555555L, // 32 + 0x00000000000000001, // 1 + 0xffffffffffffffff}; // 64 + constexpr size_t expected = 32 + 0 + 1 + 48 + 32 + 1 + 64; + size_t hwComputedPopulationCount = accel.populationCount(words, VESPA_NELEMS(words)); + if (hwComputedPopulationCount != expected) { + fprintf(stderr, "Accelrator is not computing populationCount correctly.Expected %zu, computed %zu\n", expected, hwComputedPopulationCount); + LOG_ABORT("should not be reached"); + } +} + class RuntimeVerificator { public: @@ -79,7 +97,8 @@ RuntimeVerificator::RuntimeVerificator() verifyAccelrator<float>(generic); verifyAccelrator<double>(generic); verifyAccelrator<int32_t>(generic); - verifyAccelrator<int64_t>(generic); + verifyAccelrator<int64_t>(generic); + verifyPopulationCount(generic); IAccelrated::UP thisCpu(IAccelrated::getAccelrator()); verifyAccelrator<float>(*thisCpu); diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h index aae60279d06..4031169c44d 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h @@ -26,6 +26,7 @@ public: virtual void andBit(void * a, const void * b, size_t bytes) const = 0; virtual void andNotBit(void * a, const void * b, size_t bytes) const = 0; virtual void notBit(void * a, size_t bytes) const = 0; + virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0; static IAccelrated::UP getAccelrator() __attribute__((noinline)); }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp new file mode 100644 index 00000000000..8eba313d5f1 --- /dev/null +++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp @@ -0,0 +1,27 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/util/optimized.h> + +namespace vespalib::hwaccelrated::helper { +namespace { + +inline size_t +populationCount(const uint64_t *a, size_t sz) { + size_t count(0); + size_t i(0); + for (; (i + 3) < sz; i += 4) { + count += Optimized::popCount(a[i + 0]) + + Optimized::popCount(a[i + 1]) + + Optimized::popCount(a[i + 2]) + + Optimized::popCount(a[i + 3]); + } + for (; i < sz; i++) { + count += Optimized::popCount(a[i]); + } + return count; +} + +} +}
\ No newline at end of file diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp index f135de52e5a..a0f584f8a9f 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.cpp @@ -1,6 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "sse2.h" +#include "private_helpers.hpp" namespace vespalib::hwaccelrated { @@ -76,4 +77,9 @@ Sse2Accelrator::dotProduct(const double * af, const double * bf, size_t sz) cons return sum; } +size_t +Sse2Accelrator::populationCount(const uint64_t *a, size_t sz) const { + return helper::populationCount(a, sz); +} + } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h index a539aa44b03..d0fbefe5f03 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/sse2.h @@ -14,6 +14,8 @@ class Sse2Accelrator : public GenericAccelrator public: float dotProduct(const float * a, const float * b, size_t sz) const override; double dotProduct(const double * a, const double * b, size_t sz) const override; + + size_t populationCount(const uint64_t *a, size_t sz) const override; }; } |