diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2021-11-25 09:49:13 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2021-11-25 17:07:25 +0000 |
commit | 727e8c771e96d2d596b48b558baa0bf8fa5b4ab2 (patch) | |
tree | 9532e906685cd1e8d93ae7a01289ee56bc5191cd | |
parent | 71d5af664706f0120f4bddd596d27935e45eb863 (diff) |
- Add optimisation of int8_t squared euclidian distance.
- Also add a benchmark.
12 files changed, 131 insertions, 10 deletions
diff --git a/vespalib/src/tests/hwaccelrated/.gitignore b/vespalib/src/tests/hwaccelrated/.gitignore new file mode 100644 index 00000000000..42f73a39d78 --- /dev/null +++ b/vespalib/src/tests/hwaccelrated/.gitignore @@ -0,0 +1 @@ +vespalib_hwaccelrated_bench_app diff --git a/vespalib/src/tests/hwaccelrated/CMakeLists.txt b/vespalib/src/tests/hwaccelrated/CMakeLists.txt index 960ae840995..9edea9c4472 100644 --- a/vespalib/src/tests/hwaccelrated/CMakeLists.txt +++ b/vespalib/src/tests/hwaccelrated/CMakeLists.txt @@ -6,3 +6,10 @@ vespa_add_executable(vespalib_hwaccelrated_test_app TEST vespalib ) vespa_add_test(NAME vespalib_hwaccelrated_test_app COMMAND vespalib_hwaccelrated_test_app) + +vespa_add_executable(vespalib_hwaccelrated_bench_app + SOURCES + hwaccelrated_bench.cpp + DEPENDS + vespalib +) diff --git a/vespalib/src/tests/hwaccelrated/hwaccelrated_bench.cpp b/vespalib/src/tests/hwaccelrated/hwaccelrated_bench.cpp new file mode 100644 index 00000000000..9984cfca440 --- /dev/null +++ b/vespalib/src/tests/hwaccelrated/hwaccelrated_bench.cpp @@ -0,0 +1,59 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> +#include <vespa/vespalib/hwaccelrated/generic.h> +#include <vespa/vespalib/util/time.h> +# +using namespace vespalib; + +template<typename T> +std::vector<T> createAndFill(size_t sz) { + std::vector<T> v(sz); + for (size_t i(0); i < sz; i++) { + v[i] = rand()%128; + } + return v; +} + +template<typename T> +void +benchmarkEuclideanDistance(const hwaccelrated::IAccelrated & accel, size_t sz, size_t count) { + srand(1); + std::vector<T> a = createAndFill<T>(sz); + std::vector<T> b = createAndFill<T>(sz); + steady_time start = steady_clock::now(); + double sumOfSums(0); + for (size_t j(0); j < count; j++) { + double sum = accel.squaredEuclideanDistance(&a[0], &b[0], sz); + sumOfSums += sum; + } + duration elapsed = steady_clock::now() - start; + printf("sum=%f of N=%zu and vector length=%zu took %ld\n", sumOfSums, count, sz, count_ms(elapsed)); +} + +void +benchMarkEuclidianDistance(const hwaccelrated::IAccelrated & accelrator, size_t sz, size_t count) { + printf("double : "); + benchmarkEuclideanDistance<double>(accelrator, sz, count); + printf("float : "); + benchmarkEuclideanDistance<float>(accelrator, sz, count); + printf("int8_t : "); + benchmarkEuclideanDistance<int8_t>(accelrator, sz, count); +} + +int main(int argc, char *argv[]) { + int length = 1000; + int count = 1000000; + if (argc > 1) { + length = atol(argv[1]); + } + if (argc > 2) { + count = atol(argv[2]); + } + printf("%s %d %d\n", argv[0], length, count); + printf("Squared Euclidian Distance - Generic\n"); + benchMarkEuclidianDistance(hwaccelrated::GenericAccelrator(), length, count); + printf("Squared Euclidian Distance - Optimized for this cpu\n"); + benchMarkEuclidianDistance(hwaccelrated::IAccelrated::getAccelerator(), length, count); + return 0; +} diff --git a/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp b/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp index 3d66769c15a..45d4f09e720 100644 --- a/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp +++ b/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp @@ -3,6 +3,8 @@ #include <vespa/vespalib/testkit/test_kit.h> #include <vespa/vespalib/hwaccelrated/iaccelrated.h> #include <vespa/vespalib/hwaccelrated/generic.h> +#include <vespa/log/log.h> +LOG_SETUP("hwaccelrated_test"); using namespace vespalib; @@ -15,26 +17,33 @@ std::vector<T> createAndFill(size_t sz) { return v; } -template<typename T> -void verifyEuclideanDistance(const hwaccelrated::IAccelrated & accel) { - const size_t testLength(255); +template<typename T, typename P> +void verifyEuclideanDistance(const hwaccelrated::IAccelrated & accel, size_t testLength) { srand(1); std::vector<T> a = createAndFill<T>(testLength); std::vector<T> b = createAndFill<T>(testLength); for (size_t j(0); j < 0x20; j++) { - T sum(0); + P sum(0); for (size_t i(j); i < testLength; i++) { - sum += (a[i] - b[i]) * (a[i] - b[i]); + P d = P(a[i]) - P(b[i]); + sum += d * d; } - T hwComputedSum(accel.squaredEuclideanDistance(&a[j], &b[j], testLength - j)); + P hwComputedSum(accel.squaredEuclideanDistance(&a[j], &b[j], testLength - j)); EXPECT_EQUAL(sum, hwComputedSum); } } +void +verifyEuclideanDistance(const hwaccelrated::IAccelrated & accelrator, size_t testLength) { + verifyEuclideanDistance<int8_t, float>(accelrator, testLength); + verifyEuclideanDistance<float, float>(accelrator, testLength); + verifyEuclideanDistance<double, float>(accelrator, testLength); +} + TEST("test euclidean distance") { hwaccelrated::GenericAccelrator genericAccelrator; - verifyEuclideanDistance<float>(genericAccelrator); - verifyEuclideanDistance<double >(genericAccelrator); + verifyEuclideanDistance(hwaccelrated::GenericAccelrator(),255); + verifyEuclideanDistance(hwaccelrated::IAccelrated::getAccelerator(),255); } TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index 6a6421ad016..07ef897ca11 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -11,6 +11,11 @@ Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const { } double +Avx2Accelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const { + return helper::euclideanDistance(a, b, sz); +} + +double Avx2Accelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const { return avx::euclideanDistanceSelectAlignment<float, 32>(a, b, sz); } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index 44752dd9270..2949e81fd36 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -13,6 +13,7 @@ class Avx2Accelrator : public GenericAccelrator { public: size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp index 94a6637a072..491fe04752d 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -23,6 +23,11 @@ Avx512Accelrator::populationCount(const uint64_t *a, size_t sz) const { } double +Avx512Accelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const { + return helper::euclideanDistance(a, b, sz); +} + +double Avx512Accelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const { return avx::euclideanDistanceSelectAlignment<float, 64>(a, b, sz); } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h index 826cf63be70..4989f72e698 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -15,6 +15,7 @@ public: float dotProduct(const float * a, const float * b, size_t sz) const override; double dotProduct(const double * a, const double * b, size_t sz) const override; size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index fb6ec167cf4..a715d1b5758 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -156,13 +156,18 @@ GenericAccelrator::populationCount(const uint64_t *a, size_t sz) const { } double +GenericAccelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const { + return helper::euclideanDistance(a, b, sz); +} + +double GenericAccelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const { - return euclideanDistanceT<float, 8>(a, b, sz); + return euclideanDistanceT<float, 2>(a, b, sz); } double GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const { - return euclideanDistanceT<double, 4>(a, b, sz); + return euclideanDistanceT<double, 2>(a, b, sz); } void diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h index c6b75bbcaf0..315e807da07 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h @@ -23,6 +23,7 @@ public: void andNotBit(void * a, const void * b, size_t bytes) const override; void notBit(void * a, size_t bytes) const override; size_t populationCount(const uint64_t *a, size_t sz) const override; + double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override; void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h index afb2024b322..6eae41ead4b 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h @@ -28,6 +28,7 @@ public: virtual void andNotBit(void * a, const void * b, size_t bytes) const = 0; virtual void notBit(void * a, size_t bytes) const = 0; virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0; + virtual double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const = 0; virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const = 0; virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const = 0; // AND 64 bytes from multiple, optionally inverted sources diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp index 824e0e1ebd9..ffa1c2e5c49 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp @@ -74,5 +74,31 @@ orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, } } +template<typename TemporaryT=int32_t> +double euclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz) __attribute__((noinline)); +template<typename TemporaryT> +double euclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz) +{ + //Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float + TemporaryT sum = 0; + for (size_t i(0); i < sz; i++) { + int16_t d = int16_t(a[i]) - int16_t(b[i]); + sum += d * d; + } + return sum; +} + +inline double +euclideanDistance(const int8_t * a, const int8_t * b, size_t sz) { + constexpr size_t LOOP_COUNT = 0x10000; + double sum(0); + size_t i=0; + for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) { + sum += euclideanDistanceT<int32_t>(a+i, b+i, LOOP_COUNT); + } + sum += euclideanDistanceT<int32_t>(a+i, b+i, sz-i); + return sum; +} + } } |