summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2020-06-04 22:34:44 +0000
committer Henning Baldersheim <balder@yahoo-inc.com> 2020-06-04 22:34:44 +0000
commit 8ef08f0c80a409ecfebd179d2a71382ee9b9d814 (patch)
tree 0092cbb6976838bb5bfbf9d8c4ffa3954105aec8 /vespalib
parent 7cbc790d3d4aefe89e43cf5a39ec2916d8432eb1 (diff)
When we pull in a cache line, we should use all of it.
Otherwise we possibly waste 7/8 of it and very likely suffer a cache miss.
Diffstat (limited to 'vespalib')
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp 10
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx2.h 2
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp 10
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx512.h 2
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp 10
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/generic.h 2
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp 87
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h 4
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp 41
9 files changed, 165 insertions, 3 deletions
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
index 7ff393c87f8..233609d505b 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -20,4 +20,14 @@ Avx2Accelrator::squaredEuclideanDistance(const double * a, const double * b, siz
return avx::euclideanDistanceSelectAlignment<double, 32>(a, b, sz);
}
+// ANDs the sources together by delegating to helper::andChunks, using
+// 2 chunks of 32 bytes each (64 bytes in total per call).
+void
+Avx2Accelrator::and64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const
+{
+    constexpr unsigned chunkBytes = 32u;
+    constexpr unsigned chunkCount = 2u;
+    helper::andChunks<chunkBytes, chunkCount>(offset, src, dest);
+}
+
+// ORs the sources together by delegating to helper::orChunks, using
+// 2 chunks of 32 bytes each (64 bytes in total per call).
+void
+Avx2Accelrator::or64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const
+{
+    constexpr unsigned chunkBytes = 32u;
+    constexpr unsigned chunkCount = 2u;
+    helper::orChunks<chunkBytes, chunkCount>(offset, src, dest);
+}
+
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
index 3e0dbb28110..292961a6f4d 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
@@ -15,6 +15,8 @@ public:
size_t populationCount(const uint64_t *a, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
+ void and64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const override;
+ void or64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
index 0941e6d6ad8..9cfae4757b9 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
@@ -32,4 +32,14 @@ Avx512Accelrator::squaredEuclideanDistance(const double * a, const double * b, s
return avx::euclideanDistanceSelectAlignment<double, 64>(a, b, sz);
}
+// ANDs the sources together by delegating to helper::andChunks, using
+// a single 64-byte chunk per call.
+void
+Avx512Accelrator::and64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const
+{
+    constexpr unsigned chunkBytes = 64;
+    constexpr unsigned chunkCount = 1;
+    helper::andChunks<chunkBytes, chunkCount>(offset, src, dest);
+}
+
+// ORs the sources together by delegating to helper::orChunks, using
+// a single 64-byte chunk per call.
+void
+Avx512Accelrator::or64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const
+{
+    constexpr unsigned chunkBytes = 64;
+    constexpr unsigned chunkCount = 1;
+    helper::orChunks<chunkBytes, chunkCount>(offset, src, dest);
+}
+
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
index 209ec06c857..ee422b57171 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
@@ -17,6 +17,8 @@ public:
size_t populationCount(const uint64_t *a, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
+ void and64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const override;
+ void or64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index f9684e88c63..460ae7e7388 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -165,4 +165,14 @@ GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b,
return euclideanDistanceT<double, 4>(a, b, sz);
}
+// ANDs the sources together by delegating to helper::andChunks, using
+// 4 chunks of 16 bytes each (64 bytes in total per call).
+void
+GenericAccelrator::and64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const
+{
+    constexpr unsigned chunkBytes = 16;
+    constexpr unsigned chunkCount = 4;
+    helper::andChunks<chunkBytes, chunkCount>(offset, src, dest);
+}
+
+// ORs the sources together by delegating to helper::orChunks, using
+// 4 chunks of 16 bytes each (64 bytes in total per call).
+void
+GenericAccelrator::or64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const
+{
+    constexpr unsigned chunkBytes = 16;
+    constexpr unsigned chunkCount = 4;
+    helper::orChunks<chunkBytes, chunkCount>(offset, src, dest);
+}
+
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
index 50a3d59d49d..8ce320cd4c4 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
@@ -25,6 +25,8 @@ public:
size_t populationCount(const uint64_t *a, size_t sz) const override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const override;
+ void and64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const override;
+ void or64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
index bb132165e53..b0215307f4c 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
@@ -46,7 +46,8 @@ std::vector<T> createAndFill(size_t sz) {
}
template<typename T>
-void verifyDotproduct(const IAccelrated & accel)
+void
+verifyDotproduct(const IAccelrated & accel)
{
const size_t testLength(255);
srand(1);
@@ -66,7 +67,8 @@ void verifyDotproduct(const IAccelrated & accel)
}
template<typename T>
-void verifyEuclideanDistance(const IAccelrated & accel) {
+void
+verifyEuclideanDistance(const IAccelrated & accel) {
const size_t testLength(255);
srand(1);
std::vector<T> a = createAndFill<T>(testLength);
@@ -84,7 +86,8 @@ void verifyEuclideanDistance(const IAccelrated & accel) {
}
}
-void verifyPopulationCount(const IAccelrated & accel)
+void
+verifyPopulationCount(const IAccelrated & accel)
{
const uint64_t words[7] = {0x123456789abcdef0L, // 32
0x0000000000000000L, // 0
@@ -101,6 +104,82 @@ void verifyPopulationCount(const IAccelrated & accel)
}
}
+// Appends n pseudo-random 64-bit words to v.
+//
+// random() delivers only 31 bits per call, so a single call would leave the
+// upper 33 bits of every word zero and the and64/or64 self-tests would never
+// exercise them. Three calls are therefore combined to cover all 64 bits.
+void
+fill(std::vector<uint64_t> & v, size_t n) {
+    // Reserve relative to the current size, since the words are appended.
+    v.reserve(v.size() + n);
+    for (size_t i(0); i < n; i++) {
+        uint64_t word = static_cast<uint64_t>(random());
+        word = (word << 31) ^ static_cast<uint64_t>(random());
+        word = (word << 31) ^ static_cast<uint64_t>(random());
+        v.emplace_back(word);
+    }
+}
+
+// Scalar reference implementation: ANDs src into dest word by word.
+// Every word of dest is updated, so src must hold at least dest.size() words.
+void
+simpleAndWith(std::vector<uint64_t> & dest, const std::vector<uint64_t> & src) {
+    auto from = src.begin();
+    for (uint64_t & word : dest) {
+        word &= *from;
+        ++from;
+    }
+}
+
+// Scalar reference implementation: ORs src into dest word by word.
+// Every word of dest is updated, so src must hold at least dest.size() words.
+void
+simpleOrWith(std::vector<uint64_t> & dest, const std::vector<uint64_t> & src) {
+    auto from = src.begin();
+    for (uint64_t & word : dest) {
+        word |= *from;
+        ++from;
+    }
+}
+
+// Self-test for IAccelrated::or64.
+//
+// Builds three vectors of 16 random words and, for every word offset 0..7 and
+// every number of sources from 1 up to all of them, compares the 8 words
+// produced by accel.or64 against a plain scalar OR of the same sources.
+// Logs to stderr and aborts on the first mismatch.
+void
+verifyOr64(const IAccelrated & accel) {
+    std::vector<uint64_t> vectors[3] ;
+    for (auto & v : vectors) {
+        fill(v, 16);
+    }
+    for (size_t offset = 0; offset < 8; offset++) {
+        // i is the number of sources combined; '<=' so that the last vector
+        // is included in the final round instead of never being used.
+        for (size_t i = 1; i <= VESPA_NELEMS(vectors); i++) {
+            std::vector<uint64_t> expected = vectors[0];
+            for (size_t j = 1; j < i; j++) {
+                simpleOrWith(expected, vectors[j]);
+            }
+            std::vector<std::pair<const uint64_t *, bool>> vRefs;
+            for (size_t j(0); j < i; j++) {
+                vRefs.emplace_back(&vectors[j][0], false);
+            }
+            // 64-byte aligned so that any implementation can store whole chunks.
+            uint64_t dest[8] __attribute((aligned(64)));
+            accel.or64(offset, vRefs, dest);
+            // offset + 8 <= 15, so the compared window stays inside the 16 words.
+            int diff = memcmp(&expected[offset], dest, sizeof(dest));
+            if (diff != 0) {
+                fprintf(stderr, "Accelrator is failing or64\n");
+                LOG_ABORT("should not be reached");
+            }
+        }
+    }
+}
+
+// Self-test for IAccelrated::and64.
+//
+// Builds three vectors of 16 random words and, for every word offset 0..7 and
+// every number of sources from 1 up to all of them, compares the 8 words
+// produced by accel.and64 against a plain scalar AND of the same sources.
+// Logs to stderr and aborts on the first mismatch.
+void
+verifyAnd64(const IAccelrated & accel) {
+    std::vector<uint64_t> vectors[3] ;
+    for (auto & v : vectors) {
+        fill(v, 16);
+    }
+    for (size_t offset = 0; offset < 8; offset++) {
+        // i is the number of sources combined; '<=' so that the last vector
+        // is included in the final round instead of never being used.
+        for (size_t i = 1; i <= VESPA_NELEMS(vectors); i++) {
+            std::vector<uint64_t> expected = vectors[0];
+            for (size_t j = 1; j < i; j++) {
+                simpleAndWith(expected, vectors[j]);
+            }
+            std::vector<std::pair<const uint64_t *, bool>> vRefs;
+            for (size_t j(0); j < i; j++) {
+                vRefs.emplace_back(&vectors[j][0], false);
+            }
+            // 64-byte aligned so that any implementation can store whole chunks.
+            uint64_t dest[8] __attribute((aligned(64)));
+            accel.and64(offset, vRefs, dest);
+            // offset + 8 <= 15, so the compared window stays inside the 16 words.
+            int diff = memcmp(&expected[offset], dest, sizeof(dest));
+            if (diff != 0) {
+                fprintf(stderr, "Accelrator is failing and64\n");
+                LOG_ABORT("should not be reached");
+            }
+        }
+    }
+}
+
class RuntimeVerificator
{
public:
@@ -114,6 +193,8 @@ private:
verifyEuclideanDistance<float>(accelrated);
verifyEuclideanDistance<double>(accelrated);
verifyPopulationCount(accelrated);
+ verifyAnd64(accelrated);
+ verifyOr64(accelrated);
}
};
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
index 0292ad14643..f352fb292ce 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
@@ -4,6 +4,7 @@
#include <memory>
#include <cstdint>
+#include <vector>
namespace vespalib::hwaccelrated {
@@ -29,6 +30,9 @@ public:
virtual size_t populationCount(const uint64_t *a, size_t sz) const = 0;
virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const = 0;
virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const = 0;
+ // Bitwise AND of 64 bytes (8 uint64_t words) taken from multiple sources.
+ // Each src entry is a (base pointer, invert) pair: words are read starting
+ // at base + offset, with offset counted in uint64_t words, and are
+ // complemented before being combined when invert is true.
+ // The combined 64 bytes are written to dest.
+ // NOTE(review): the implementations read src[0] unconditionally and store to
+ // dest through vector types, so src presumably must be non-empty and dest
+ // suitably aligned (the self-test uses 64-byte alignment) - confirm.
+ virtual void and64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const = 0;
+ // Same contract as and64, but the sources are combined with bitwise OR.
+ virtual void or64(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> &src, uint64_t *dest) const = 0;
static const IAccelrated & getAccelrator() __attribute__((noinline));
};
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index f5daf2b9081..2759cc35ba9 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -24,5 +24,46 @@ populationCount(const uint64_t *a, size_t sz) {
return count;
}
+// Loads a T from base via memcpy, so base does not have to be aligned for T.
+// Returns the bitwise complement of the loaded value when invert is set;
+// inversion is hinted to the compiler as the unlikely case.
+template<typename T>
+T get(const void * base, bool invert) {
+    T value;
+    memcpy(&value, base, sizeof(T));
+    if (__builtin_expect(invert, false)) {
+        return ~value;
+    }
+    return value;
+}
+
+// Bitwise AND of ChunkSize * Chunks bytes from every source into dest.
+//
+// Each src entry is (base pointer, invert); data is read starting at
+// base + offset (offset counted in uint64_t words) through get<Chunk>(), which
+// copies with memcpy and complements the value when invert is set.
+// dest is seeded from src[0], so src must not be empty.
+// NOTE(review): dest is written directly as Chunk, which presumably requires
+// it to be aligned to ChunkSize bytes - confirm against callers.
+template<unsigned ChunkSize, unsigned Chunks>
+void
+andChunks(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> & src, uint64_t * dest) {
+    typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
+    Chunk * out = reinterpret_cast<Chunk *>(dest);
+
+    // Seed the result with the first source.
+    const auto & first = src[0];
+    const Chunk * in = reinterpret_cast<const Chunk *>(first.first + offset);
+    for (unsigned c = 0; c < Chunks; ++c) {
+        out[c] = get<Chunk>(in + c, first.second);
+    }
+
+    // Fold the remaining sources into the result.
+    for (auto it = src.begin() + 1; it != src.end(); ++it) {
+        in = reinterpret_cast<const Chunk *>(it->first + offset);
+        for (unsigned c = 0; c < Chunks; ++c) {
+            out[c] &= get<Chunk>(in + c, it->second);
+        }
+    }
+}
+
+// Bitwise OR of ChunkSize * Chunks bytes from every source into dest.
+//
+// Each src entry is (base pointer, invert); data is read starting at
+// base + offset (offset counted in uint64_t words) through get<Chunk>(), which
+// copies with memcpy and complements the value when invert is set.
+// dest is seeded from src[0], so src must not be empty.
+// NOTE(review): dest is written directly as Chunk, which presumably requires
+// it to be aligned to ChunkSize bytes - confirm against callers.
+template<unsigned ChunkSize, unsigned Chunks>
+void
+orChunks(size_t offset, const std::vector<std::pair<const uint64_t *, bool>> & src, uint64_t * dest) {
+    typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
+    Chunk * out = reinterpret_cast<Chunk *>(dest);
+
+    // Seed the result with the first source.
+    const auto & first = src[0];
+    const Chunk * in = reinterpret_cast<const Chunk *>(first.first + offset);
+    for (unsigned c = 0; c < Chunks; ++c) {
+        out[c] = get<Chunk>(in + c, first.second);
+    }
+
+    // Fold the remaining sources into the result.
+    for (auto it = src.begin() + 1; it != src.end(); ++it) {
+        in = reinterpret_cast<const Chunk *>(it->first + offset);
+        for (unsigned c = 0; c < Chunks; ++c) {
+            out[c] |= get<Chunk>(in + c, it->second);
+        }
+    }
+}
+
}
}