aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2021-11-25 09:49:13 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2021-11-25 17:07:25 +0000
commit727e8c771e96d2d596b48b558baa0bf8fa5b4ab2 (patch)
tree9532e906685cd1e8d93ae7a01289ee56bc5191cd
parent71d5af664706f0120f4bddd596d27935e45eb863 (diff)
- Add optimisation of int8_t squared euclidian distance.
- Also add a benchmark.
-rw-r--r--vespalib/src/tests/hwaccelrated/.gitignore1
-rw-r--r--vespalib/src/tests/hwaccelrated/CMakeLists.txt7
-rw-r--r--vespalib/src/tests/hwaccelrated/hwaccelrated_bench.cpp59
-rw-r--r--vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp25
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp5
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/avx2.h1
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp5
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/avx512.h1
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp9
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/generic.h1
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h1
-rw-r--r--vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp26
12 files changed, 131 insertions, 10 deletions
diff --git a/vespalib/src/tests/hwaccelrated/.gitignore b/vespalib/src/tests/hwaccelrated/.gitignore
new file mode 100644
index 00000000000..42f73a39d78
--- /dev/null
+++ b/vespalib/src/tests/hwaccelrated/.gitignore
@@ -0,0 +1 @@
+vespalib_hwaccelrated_bench_app
diff --git a/vespalib/src/tests/hwaccelrated/CMakeLists.txt b/vespalib/src/tests/hwaccelrated/CMakeLists.txt
index 960ae840995..9edea9c4472 100644
--- a/vespalib/src/tests/hwaccelrated/CMakeLists.txt
+++ b/vespalib/src/tests/hwaccelrated/CMakeLists.txt
@@ -6,3 +6,10 @@ vespa_add_executable(vespalib_hwaccelrated_test_app TEST
vespalib
)
vespa_add_test(NAME vespalib_hwaccelrated_test_app COMMAND vespalib_hwaccelrated_test_app)
+
+vespa_add_executable(vespalib_hwaccelrated_bench_app
+ SOURCES
+ hwaccelrated_bench.cpp
+ DEPENDS
+ vespalib
+)
diff --git a/vespalib/src/tests/hwaccelrated/hwaccelrated_bench.cpp b/vespalib/src/tests/hwaccelrated/hwaccelrated_bench.cpp
new file mode 100644
index 00000000000..9984cfca440
--- /dev/null
+++ b/vespalib/src/tests/hwaccelrated/hwaccelrated_bench.cpp
@@ -0,0 +1,59 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
+#include <vespa/vespalib/hwaccelrated/generic.h>
+#include <vespa/vespalib/util/time.h>
+#
+using namespace vespalib;
+
+template<typename T>
+std::vector<T> createAndFill(size_t sz) {
+ std::vector<T> v(sz);
+ for (size_t i(0); i < sz; i++) {
+ v[i] = rand()%128;
+ }
+ return v;
+}
+
+template<typename T>
+void
+benchmarkEuclideanDistance(const hwaccelrated::IAccelrated & accel, size_t sz, size_t count) {
+ srand(1);
+ std::vector<T> a = createAndFill<T>(sz);
+ std::vector<T> b = createAndFill<T>(sz);
+ steady_time start = steady_clock::now();
+ double sumOfSums(0);
+ for (size_t j(0); j < count; j++) {
+ double sum = accel.squaredEuclideanDistance(&a[0], &b[0], sz);
+ sumOfSums += sum;
+ }
+ duration elapsed = steady_clock::now() - start;
+ printf("sum=%f of N=%zu and vector length=%zu took %ld\n", sumOfSums, count, sz, count_ms(elapsed));
+}
+
+void
+benchMarkEuclidianDistance(const hwaccelrated::IAccelrated & accelrator, size_t sz, size_t count) {
+ printf("double : ");
+ benchmarkEuclideanDistance<double>(accelrator, sz, count);
+ printf("float : ");
+ benchmarkEuclideanDistance<float>(accelrator, sz, count);
+ printf("int8_t : ");
+ benchmarkEuclideanDistance<int8_t>(accelrator, sz, count);
+}
+
+int main(int argc, char *argv[]) {
+ int length = 1000;
+ int count = 1000000;
+ if (argc > 1) {
+ length = atol(argv[1]);
+ }
+ if (argc > 2) {
+ count = atol(argv[2]);
+ }
+ printf("%s %d %d\n", argv[0], length, count);
+ printf("Squared Euclidian Distance - Generic\n");
+ benchMarkEuclidianDistance(hwaccelrated::GenericAccelrator(), length, count);
+ printf("Squared Euclidian Distance - Optimized for this cpu\n");
+ benchMarkEuclidianDistance(hwaccelrated::IAccelrated::getAccelerator(), length, count);
+ return 0;
+}
diff --git a/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp b/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp
index 3d66769c15a..45d4f09e720 100644
--- a/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp
+++ b/vespalib/src/tests/hwaccelrated/hwaccelrated_test.cpp
@@ -3,6 +3,8 @@
#include <vespa/vespalib/testkit/test_kit.h>
#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
#include <vespa/vespalib/hwaccelrated/generic.h>
+#include <vespa/log/log.h>
+LOG_SETUP("hwaccelrated_test");
using namespace vespalib;
@@ -15,26 +17,33 @@ std::vector<T> createAndFill(size_t sz) {
return v;
}
-template<typename T>
-void verifyEuclideanDistance(const hwaccelrated::IAccelrated & accel) {
- const size_t testLength(255);
+template<typename T, typename P>
+void verifyEuclideanDistance(const hwaccelrated::IAccelrated & accel, size_t testLength) {
srand(1);
std::vector<T> a = createAndFill<T>(testLength);
std::vector<T> b = createAndFill<T>(testLength);
for (size_t j(0); j < 0x20; j++) {
- T sum(0);
+ P sum(0);
for (size_t i(j); i < testLength; i++) {
- sum += (a[i] - b[i]) * (a[i] - b[i]);
+ P d = P(a[i]) - P(b[i]);
+ sum += d * d;
}
- T hwComputedSum(accel.squaredEuclideanDistance(&a[j], &b[j], testLength - j));
+ P hwComputedSum(accel.squaredEuclideanDistance(&a[j], &b[j], testLength - j));
EXPECT_EQUAL(sum, hwComputedSum);
}
}
+void
+verifyEuclideanDistance(const hwaccelrated::IAccelrated & accelrator, size_t testLength) {
+ verifyEuclideanDistance<int8_t, float>(accelrator, testLength);
+ verifyEuclideanDistance<float, float>(accelrator, testLength);
+ verifyEuclideanDistance<double, float>(accelrator, testLength);
+}
+
TEST("test euclidean distance") {
hwaccelrated::GenericAccelrator genericAccelrator;
- verifyEuclideanDistance<float>(genericAccelrator);
- verifyEuclideanDistance<double >(genericAccelrator);
+ verifyEuclideanDistance(hwaccelrated::GenericAccelrator(),255);
+ verifyEuclideanDistance(hwaccelrated::IAccelrated::getAccelerator(),255);
}
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
index 6a6421ad016..07ef897ca11 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -11,6 +11,11 @@ Avx2Accelrator::populationCount(const uint64_t *a, size_t sz) const {
}
double
+Avx2Accelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const {
+ return helper::euclideanDistance(a, b, sz);
+}
+
+double
Avx2Accelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const {
return avx::euclideanDistanceSelectAlignment<float, 32>(a, b, sz);
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
index 44752dd9270..2949e81fd36 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
@@ -13,6 +13,7 @@ class Avx2Accelrator : public GenericAccelrator
{
public:
size_t populationCount(const uint64_t *a, size_t sz) const override;
+ double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
index 94a6637a072..491fe04752d 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
@@ -23,6 +23,11 @@ Avx512Accelrator::populationCount(const uint64_t *a, size_t sz) const {
}
double
+Avx512Accelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const {
+ return helper::euclideanDistance(a, b, sz);
+}
+
+double
Avx512Accelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const {
return avx::euclideanDistanceSelectAlignment<float, 64>(a, b, sz);
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
index 826cf63be70..4989f72e698 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
@@ -15,6 +15,7 @@ public:
float dotProduct(const float * a, const float * b, size_t sz) const override;
double dotProduct(const double * a, const double * b, size_t sz) const override;
size_t populationCount(const uint64_t *a, size_t sz) const override;
+ double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index fb6ec167cf4..a715d1b5758 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -156,13 +156,18 @@ GenericAccelrator::populationCount(const uint64_t *a, size_t sz) const {
}
double
+GenericAccelrator::squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const {
+ return helper::euclideanDistance(a, b, sz);
+}
+
+double
GenericAccelrator::squaredEuclideanDistance(const float * a, const float * b, size_t sz) const {
- return euclideanDistanceT<float, 8>(a, b, sz);
+ return euclideanDistanceT<float, 2>(a, b, sz);
}
double
GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, size_t sz) const {
- return euclideanDistanceT<double, 4>(a, b, sz);
+ return euclideanDistanceT<double, 2>(a, b, sz);
}
void
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
index c6b75bbcaf0..315e807da07 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
@@ -23,6 +23,7 @@ public:
void andNotBit(void * a, const void * b, size_t bytes) const override;
void notBit(void * a, size_t bytes) const override;
size_t populationCount(const uint64_t *a, size_t sz) const override;
+ double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
index afb2024b322..6eae41ead4b 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
@@ -28,6 +28,7 @@ public:
virtual void andNotBit(void * a, const void * b, size_t bytes) const = 0;
virtual void notBit(void * a, size_t bytes) const = 0;
virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0;
+ virtual double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const = 0;
virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const = 0;
virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const = 0;
// AND 64 bytes from multiple, optionally inverted sources
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index 824e0e1ebd9..ffa1c2e5c49 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -74,5 +74,31 @@ orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src,
}
}
+template<typename TemporaryT=int32_t>
+double euclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz) __attribute__((noinline));
+template<typename TemporaryT>
+double euclideanDistanceT(const int8_t * a, const int8_t * b, size_t sz)
+{
+ //Note that this is 3 times faster with int32_t than with int64_t and 16x faster than float
+ TemporaryT sum = 0;
+ for (size_t i(0); i < sz; i++) {
+ int16_t d = int16_t(a[i]) - int16_t(b[i]);
+ sum += d * d;
+ }
+ return sum;
+}
+
+inline double
+euclideanDistance(const int8_t * a, const int8_t * b, size_t sz) {
+ constexpr size_t LOOP_COUNT = 0x10000;
+ double sum(0);
+ size_t i=0;
+ for (; i + LOOP_COUNT <= sz; i += LOOP_COUNT) {
+ sum += euclideanDistanceT<int32_t>(a+i, b+i, LOOP_COUNT);
+ }
+ sum += euclideanDistanceT<int32_t>(a+i, b+i, sz-i);
+ return sum;
+}
+
}
}