author     Henning Baldersheim <balder@yahoo-inc.com>   2020-06-08 08:51:50 +0200
committer  GitHub <noreply@github.com>                  2020-06-08 08:51:50 +0200
commit     bebe03c7b970744d09eb26f7383da4f8c1244a36 (patch)
tree       0456ec2cec34c515d681934e803d9ab075348ebc /vespalib
parent     aa622a409b6ba44a1b3c9438240b30316fcec149 (diff)
Revert "Revert "When we pull in a cacheline, we should use it too.""
Diffstat (limited to 'vespalib')
-rw-r--r--  vespalib/src/tests/dotproduct/dotproductbenchmark.cpp          2
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp              10
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/avx2.h                2
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp            10
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/avx512.h              2
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp           10
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/generic.h             2
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp       127
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h         7
-rw-r--r--  vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp   46
10 files changed, 211 insertions, 7 deletions
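
For orientation before the per-file diffs: the reinstated and64/or64 methods consume one full 64-byte cacheline (eight uint64_t words) per call, combining that line from several sources, each of which may be bit-inverted first. Below is a minimal scalar sketch of that contract, assuming the (pointer, invert-flag) pair layout used throughout the patch; the name refAnd64 and the fixed word count of 8 are illustrative only, not part of the commit.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <utility>
    #include <vector>

    // Hypothetical scalar reference for the and64 contract introduced below:
    // AND the 64 bytes starting 'offset' bytes into each source, inverting a
    // source first when its bool flag is set, and write the result to dest.
    void refAnd64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) {
        uint64_t acc[8];
        for (size_t i = 0; i < src.size(); i++) {
            uint64_t words[8];
            std::memcpy(words, static_cast<const char *>(src[i].first) + offset, sizeof(words));
            for (size_t n = 0; n < 8; n++) {
                uint64_t w = src[i].second ? ~words[n] : words[n];
                acc[n] = (i == 0) ? w : (acc[n] & w);
            }
        }
        std::memcpy(dest, acc, sizeof(acc));
    }
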
diff --git a/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp b/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp
index d6e1aef9394..e95e8a5c58b 100644
--- a/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp
+++ b/vespalib/src/tests/dotproduct/dotproductbenchmark.cpp
@@ -60,7 +60,7 @@ template <typename T>
FullBenchmark<T>::FullBenchmark(size_t numDocs, size_t numValues)
: _values(numDocs*numValues),
_query(numValues),
- _dp(IAccelrated::getAccelrator())
+ _dp(IAccelrated::getAccelerator())
{
for (size_t i(0); i < numDocs; i++) {
for (size_t j(0); j < numValues; j++) {
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
index 7ff393c87f8..8588a5510f7 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -20,4 +20,14 @@ Avx2Accelrator::squaredEuclideanDistance(const double * a, const double * b, siz
return avx::euclideanDistanceSelectAlignment<double, 32>(a, b, sz);
}
+void
+Avx2Accelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const {
+ helper::andChunks<32u, 2u>(offset, src, dest);
+}
+
+void
+Avx2Accelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const {
+ helper::orChunks<32u, 2u>(offset, src, dest);
+}
+
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
index 3e0dbb28110..b6f3d299748 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
@@ -15,6 +15,8 @@ public:
size_t populationCount(const uint64_t *a, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
+ void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
+ void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
index 0941e6d6ad8..4dade08e77a 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
@@ -32,4 +32,14 @@ Avx512Accelrator::squaredEuclideanDistance(const double * a, const double * b, s
return avx::euclideanDistanceSelectAlignment<double, 64>(a, b, sz);
}
+void
+Avx512Accelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const {
+ helper::andChunks<64, 1>(offset, src, dest);
+}
+
+void
+Avx512Accelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const {
+ helper::orChunks<64, 1>(offset, src, dest);
+}
+
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
index 209ec06c857..a54d57407b2 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
@@ -17,6 +17,8 @@ public:
size_t populationCount(const uint64_t *a, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
+ void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
+ void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index f9684e88c63..f9dfaacf626 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -165,4 +165,14 @@ GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b,
return euclideanDistanceT<double, 4>(a, b, sz);
}
+void
+GenericAccelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const {
+ helper::andChunks<16, 4>(offset, src, dest);
+}
+
+void
+GenericAccelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const {
+ helper::orChunks<16,4>(offset, src, dest);
+}
+
}
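
Side note on the template arguments used in the three implementations above (the interpretation, ChunkSize in bytes times Chunks per call, follows from the helpers in private_helpers.hpp further down): each back end covers exactly one 64-byte cacheline per call, which is the point of the commit. A quick arithmetic check, written as assertions for illustration only:

    static_assert(16u * 4u == 64u, "generic: four 16-byte chunks per call");
    static_assert(32u * 2u == 64u, "AVX2:    two 32-byte chunks per call");
    static_assert(64u * 1u == 64u, "AVX-512: one 64-byte chunk per call");
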
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
index 50a3d59d49d..2335b40fe85 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
@@ -25,6 +25,8 @@ public:
size_t populationCount(const uint64_t *a, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
+ void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
+ void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
index bb132165e53..de917c5f065 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
@@ -46,7 +46,8 @@ std::vector<T> createAndFill(size_t sz) {
}
template<typename T>
-void verifyDotproduct(const IAccelrated & accel)
+void
+verifyDotproduct(const IAccelrated & accel)
{
const size_t testLength(255);
srand(1);
@@ -66,7 +67,8 @@ void verifyDotproduct(const IAccelrated & accel)
}
template<typename T>
-void verifyEuclideanDistance(const IAccelrated & accel) {
+void
+verifyEuclideanDistance(const IAccelrated & accel) {
const size_t testLength(255);
srand(1);
std::vector<T> a = createAndFill<T>(testLength);
@@ -84,7 +86,8 @@ void verifyEuclideanDistance(const IAccelrated & accel) {
}
}
-void verifyPopulationCount(const IAccelrated & accel)
+void
+verifyPopulationCount(const IAccelrated & accel)
{
const uint64_t words[7] = {0x123456789abcdef0L, // 32
0x0000000000000000L, // 0
@@ -101,6 +104,118 @@ void verifyPopulationCount(const IAccelrated & accel)
}
}
+void
+fill(std::vector<uint64_t> & v, size_t n) {
+ v.reserve(n);
+ for (size_t i(0); i < n; i++) {
+ v.emplace_back(random());
+ }
+}
+
+void
+simpleAndWith(std::vector<uint64_t> & dest, const std::vector<uint64_t> & src) {
+ for (size_t i(0); i < dest.size(); i++) {
+ dest[i] &= src[i];
+ }
+}
+
+void
+simpleOrWith(std::vector<uint64_t> & dest, const std::vector<uint64_t> & src) {
+ for (size_t i(0); i < dest.size(); i++) {
+ dest[i] |= src[i];
+ }
+}
+
+std::vector<uint64_t>
+simpleInvert(const std::vector<uint64_t> & src) {
+ std::vector<uint64_t> inverted;
+ inverted.reserve(src.size());
+ for (size_t i(0); i < src.size(); i++) {
+ inverted.push_back(~src[i]);
+ }
+ return inverted;
+}
+
+std::vector<uint64_t>
+optionallyInvert(bool invert, std::vector<uint64_t> v) {
+ return invert ? simpleInvert(std::move(v)) : std::move(v);
+}
+
+bool shouldInvert(bool invertSome) {
+ return invertSome ? (random() & 1) : false;
+}
+
+void
+verifyOr64(const IAccelrated & accel, const std::vector<std::vector<uint64_t>> & vectors,
+ size_t offset, size_t num_vectors, bool invertSome)
+{
+ std::vector<std::pair<const void *, bool>> vRefs;
+ for (size_t j(0); j < num_vectors; j++) {
+ vRefs.emplace_back(&vectors[j][0], shouldInvert(invertSome));
+ }
+
+ std::vector<uint64_t> expected = optionallyInvert(vRefs[0].second, vectors[0]);
+ for (size_t j = 1; j < num_vectors; j++) {
+ simpleOrWith(expected, optionallyInvert(vRefs[j].second, vectors[j]));
+ }
+
+ uint64_t dest[8] __attribute((aligned(64)));
+ accel.or64(offset*sizeof(uint64_t), vRefs, dest);
+ int diff = memcmp(&expected[offset], dest, sizeof(dest));
+ if (diff != 0) {
+ LOG_ABORT("Accelerator fails to compute correct 64 bytes OR");
+ }
+}
+
+void
+verifyAnd64(const IAccelrated & accel, const std::vector<std::vector<uint64_t>> & vectors,
+ size_t offset, size_t num_vectors, bool invertSome)
+{
+ std::vector<std::pair<const void *, bool>> vRefs;
+ for (size_t j(0); j < num_vectors; j++) {
+ vRefs.emplace_back(&vectors[j][0], shouldInvert(invertSome));
+ }
+ std::vector<uint64_t> expected = optionallyInvert(vRefs[0].second, vectors[0]);
+ for (size_t j = 1; j < num_vectors; j++) {
+ simpleAndWith(expected, optionallyInvert(vRefs[j].second, vectors[j]));
+ }
+
+ uint64_t dest[8] __attribute((aligned(64)));
+ accel.and64(offset*sizeof(uint64_t), vRefs, dest);
+ int diff = memcmp(&expected[offset], dest, sizeof(dest));
+ if (diff != 0) {
+ LOG_ABORT("Accelerator fails to compute correct 64 bytes AND");
+ }
+}
+
+void
+verifyOr64(const IAccelrated & accel) {
+ std::vector<std::vector<uint64_t>> vectors(3) ;
+ for (auto & v : vectors) {
+ fill(v, 16);
+ }
+ for (size_t offset = 0; offset < 8; offset++) {
+ for (size_t i = 1; i < vectors.size(); i++) {
+ verifyOr64(accel, vectors, offset, i, false);
+ verifyOr64(accel, vectors, offset, i, true);
+ }
+ }
+}
+
+void
+verifyAnd64(const IAccelrated & accel) {
+ std::vector<std::vector<uint64_t>> vectors(3);
+ for (auto & v : vectors) {
+ fill(v, 16);
+ }
+ for (size_t offset = 0; offset < 8; offset++) {
+ for (size_t i = 1; i < vectors.size(); i++) {
+ verifyAnd64(accel, vectors, offset, i, false);
+ verifyAnd64(accel, vectors, offset, i, true);
+ }
+ }
+}
+
class RuntimeVerificator
{
public:
@@ -114,6 +229,8 @@ private:
verifyEuclideanDistance<float>(accelrated);
verifyEuclideanDistance<double>(accelrated);
verifyPopulationCount(accelrated);
+ verifyAnd64(accelrated);
+ verifyOr64(accelrated);
}
};
@@ -122,7 +239,7 @@ RuntimeVerificator::RuntimeVerificator()
GenericAccelrator generic;
verify(generic);
- const IAccelrated & thisCpu(IAccelrated::getAccelrator());
+ const IAccelrated & thisCpu(IAccelrated::getAccelerator());
verify(thisCpu);
}
@@ -155,7 +272,7 @@ static Selector _G_selector;
RuntimeVerificator _G_verifyAccelrator;
const IAccelrated &
-IAccelrated::getAccelrator()
+IAccelrated::getAccelerator()
{
static IAccelrated::UP accelrator = _G_selector.create();
return *accelrator;
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
index 0292ad14643..2594a48dd33 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
@@ -4,6 +4,7 @@
#include <memory>
#include <cstdint>
+#include <vector>
namespace vespalib::hwaccelrated {
@@ -29,8 +30,12 @@ public:
virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0;
virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const = 0;
virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const = 0;
+ // AND 64 bytes from multiple, optionally inverted sources
+ virtual void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const = 0;
+ // OR 64 bytes from multiple, optionally inverted sources
+ virtual void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const = 0;
- static const IAccelrated & getAccelrator() __attribute__((noinline));
+ static const IAccelrated & getAccelerator() __attribute__((noinline));
};
}
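
A hedged usage sketch of the interface above, assuming the namespace and header path shown in this diff; the function orFirstCacheline and its inputs are hypothetical. The destination buffer is kept 64-byte aligned, as in the verification code earlier in this commit, since the SIMD back ends store whole vector chunks into it.

    #include <vespa/vespalib/hwaccelrated/iaccelrated.h>
    #include <cstdint>
    #include <cstring>
    #include <utility>
    #include <vector>

    using vespalib::hwaccelrated::IAccelrated;

    // OR the first cacheline (byte offset 0) of two bitvectors, with the second
    // source bit-inverted before the OR, and copy the 64-byte result to out8.
    void orFirstCacheline(const std::vector<uint64_t> &a, const std::vector<uint64_t> &b, uint64_t *out8) {
        const IAccelrated &accel = IAccelrated::getAccelerator();
        std::vector<std::pair<const void *, bool>> src = {
            { a.data(), false },
            { b.data(), true }    // inverted source
        };
        alignas(64) uint64_t dest[8];
        accel.or64(0, src, dest);
        std::memcpy(out8, dest, sizeof(dest));
    }
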
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index f5daf2b9081..6fc49f969f2 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -24,5 +24,51 @@ populationCount(const uint64_t *a, size_t sz) {
return count;
}
+template<typename T>
+T get(const void * base, bool invert) {
+ T v;
+ memcpy(&v, base, sizeof(T));
+ return __builtin_expect(invert, false) ? ~v : v;
+}
+
+template <typename T>
+const T * cast(const void * ptr, size_t offsetBytes) {
+ return static_cast<const T *>(static_cast<const void *>(static_cast<const char *>(ptr) + offsetBytes));
+}
+
+template<unsigned ChunkSize, unsigned Chunks>
+void
+andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
+ typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
+ Chunk * chunk = static_cast<Chunk *>(dest);
+ const Chunk * tmp = cast<Chunk>(src[0].first, offset);
+ for (size_t n=0; n < Chunks; n++) {
+ chunk[n] = get<Chunk>(tmp+n, src[0].second);
+ }
+ for (size_t i(1); i < src.size(); i++) {
+ tmp = cast<Chunk>(src[i].first, offset);
+ for (size_t n=0; n < Chunks; n++) {
+ chunk[n] &= get<Chunk>(tmp+n, src[i].second);
+ }
+ }
+}
+
+template<unsigned ChunkSize, unsigned Chunks>
+void
+orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
+ typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
+ Chunk * chunk = static_cast<Chunk *>(dest);
+ const Chunk * tmp = cast<Chunk>(src[0].first, offset);
+ for (size_t n=0; n < Chunks; n++) {
+ chunk[n] = get<Chunk>(tmp+n, src[0].second);
+ }
+ for (size_t i(1); i < src.size(); i++) {
+ tmp = cast<Chunk>(src[i].first, offset);
+ for (size_t n=0; n < Chunks; n++) {
+ chunk[n] |= get<Chunk>(tmp+n, src[i].second);
+ }
+ }
+}
+
}
}
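
Finally, a standalone sketch of the GCC/Clang vector extension that andChunks/orChunks rely on, assuming one of those compilers: a vector_size(32) typedef gives four uint64_t lanes that are combined lane-wise in a single expression, which the compiler lowers to SIMD where available. The values are arbitrary illustration data.

    #include <cstdint>
    #include <cstdio>

    typedef uint64_t Chunk __attribute__ ((vector_size (32)));   // 4 x uint64_t lanes

    int main() {
        Chunk a = { ~0ULL, 0x0f0f0f0f0f0f0f0fULL, 0ULL, 1ULL };
        Chunk b = { 0xffULL, ~0ULL, ~0ULL, 1ULL };
        Chunk c = a & b;                          // lane-wise AND, no explicit loop
        for (int i = 0; i < 4; i++) {
            std::printf("lane %d: %016llx\n", i, (unsigned long long) c[i]);
        }
        return 0;
    }
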