about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2023-12-15 15:27:45 +0100
committer GitHub <noreply@github.com> 2023-12-15 15:27:45 +0100
commit 3a9f89fe60e3420eed435daee435a4f8534c9512 (patch)
tree eb1fad9d94f1494eb065cd84985a277dd6d5c48c
parent b8ba7d82cd08462a9a48b5acbd03c6869be9a9a3 (diff)
parent 06210a9350c7448d8ef22cb6308d17e75a3b1f2e (diff)
Merge pull request #29673 from vespa-engine/revert-29663-balder/separate-hot-cold-path-tomake-fast-path-faster
Revert "Balder/separate hot cold path tomake fast path faster"
-rw-r--r-- searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp 63
-rw-r--r-- searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h 12
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp 8
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx2.h 4
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp 8
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/avx512.h 4
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp 8
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/generic.h 4
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp 16
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h 8
-rw-r--r-- vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp 4
11 files changed, 66 insertions, 73 deletions
diff --git a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp
index 0681768db2b..fdf4ec950dd 100644
--- a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp
@@ -4,6 +4,7 @@
#include "andsearch.h"
#include "andnotsearch.h"
#include "sourceblendersearch.h"
+#include <vespa/searchlib/common/bitvectoriterator.h>
#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
namespace search::queryeval {
@@ -17,17 +18,17 @@ namespace {
struct And {
using Word = BitWord::Word;
void operator () (const IAccelrated & accel, size_t offset, const std::vector<Meta> & src, void *dest) noexcept {
- accel.and256(offset, src, dest);
+ accel.and64(offset, src, dest);
}
- static constexpr bool isAnd() noexcept { return true; }
+ static bool isAnd() noexcept { return true; }
};
struct Or {
using Word = BitWord::Word;
void operator () (const IAccelrated & accel, size_t offset, const std::vector<Meta> & src, void *dest) noexcept {
- accel.or256(offset, src, dest);
+ accel.or64(offset, src, dest);
}
- static constexpr bool isAnd() noexcept { return false; }
+ static bool isAnd() noexcept { return false; }
};
}
@@ -55,47 +56,43 @@ MultiBitVector<Update>::MultiBitVector(size_t reserved)
_accel(IAccelrated::getAccelerator()),
_lastWords()
{
- static_assert(sizeof(_lastWords) == 256, "Lastwords should have 256 byte size");
- static_assert(NumWordsInBatch == 32, "Batch size should be 32 words.");
+ static_assert(sizeof(_lastWords) == 64, "Lastwords should have 64 byte size");
+ static_assert(NumWordsInBatch == 8, "Batch size should be 8 words.");
memset(_lastWords, 0, sizeof(_lastWords));
}
template<typename Update>
bool
-MultiBitVector<Update>::updateLastValueCold(uint32_t docId) noexcept
+MultiBitVector<Update>::updateLastValue(uint32_t docId) noexcept
{
- if (__builtin_expect(isAtEnd(docId), false)) {
- return true;
- }
- const uint32_t index(BitWord::wordNum(docId));
- if (docId >= _lastMaxDocIdLimitRequireFetch) {
- fetchChunk(index);
+ if (docId >= _lastMaxDocIdLimit) {
+ if (__builtin_expect(isAtEnd(docId), false)) {
+ return true;
+ }
+ const uint32_t index(BitWord::wordNum(docId));
+ if (docId >= _lastMaxDocIdLimitRequireFetch) {
+ uint32_t baseIndex = index & ~(NumWordsInBatch - 1);
+ _update(_accel, baseIndex*sizeof(Word), _bvs, _lastWords);
+ _lastMaxDocIdLimitRequireFetch = (baseIndex + NumWordsInBatch) * BitWord::WordLen;
+ }
+ _lastValue = _lastWords[index % NumWordsInBatch];
+ _lastMaxDocIdLimit = (index + 1) * BitWord::WordLen;
}
- _lastValue = _lastWords[index % NumWordsInBatch];
- _lastMaxDocIdLimit = (index + 1) * BitWord::WordLen;
return false;
}
template<typename Update>
-void
-MultiBitVector<Update>::fetchChunk(uint32_t index) noexcept
-{
- uint32_t baseIndex = index & ~(NumWordsInBatch - 1);
- _update(_accel, baseIndex*sizeof(Word), _bvs, _lastWords);
- _lastMaxDocIdLimitRequireFetch = (baseIndex + NumWordsInBatch) * BitWord::WordLen;
-}
-
-template<typename Update>
uint32_t
MultiBitVector<Update>::strictSeek(uint32_t docId) noexcept
{
bool atEnd;
for (atEnd = updateLastValue(docId), _lastValue = _lastValue & BitWord::checkTab(docId);
- __builtin_expect(_lastValue == 0, Update::isAnd()) && __builtin_expect(! atEnd, true); // And is likely to have few bits, while Or has many.
+ (_lastValue == 0) && __builtin_expect(! atEnd, true);
atEnd = updateLastValue(_lastMaxDocIdLimit));
- return (__builtin_expect(!atEnd, true))
- ? _lastMaxDocIdLimit - BitWord::WordLen + vespalib::Optimized::lsbIdx(_lastValue)
- : _numDocs;
+ if (__builtin_expect(!atEnd, true)) {
+ return _lastMaxDocIdLimit - BitWord::WordLen + vespalib::Optimized::lsbIdx(_lastValue);
+ }
+ return _numDocs;
}
template<typename Update>
@@ -103,8 +100,12 @@ bool
MultiBitVector<Update>::seek(uint32_t docId) noexcept
{
bool atEnd = updateLastValue(docId);
- return __builtin_expect( ! atEnd, true) &&
- __builtin_expect(_lastValue & BitWord::mask(docId), false);
+ if (__builtin_expect( ! atEnd, true)) {
+ if (_lastValue & BitWord::mask(docId)) {
+ return true;
+ }
+ }
+ return false;
}
namespace {
@@ -159,7 +160,7 @@ template<typename Update>
void
MultiBitVectorIterator<Update>::doSeek(uint32_t docId)
{
- if (_mbv.seek(docId)) [[unlikely]] {
+ if (_mbv.seek(docId)) {
setDocId(docId);
}
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h
index 5f9a3b20b50..2b4f90544ac 100644
--- a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h
+++ b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h
@@ -37,20 +37,12 @@ public:
bool seek(uint32_t docId) noexcept;
bool acceptExtraFilter() const noexcept { return Update::isAnd(); }
private:
- bool updateLastValue(uint32_t docId) noexcept {
- if (docId >= _lastMaxDocIdLimit) {
- return updateLastValueCold(docId);
- }
- return false;
- }
- VESPA_DLL_LOCAL bool updateLastValueCold(uint32_t docId) noexcept __attribute__((noinline));
- VESPA_DLL_LOCAL void fetchChunk(uint32_t docId) noexcept __attribute__((noinline));
-
+ bool updateLastValue(uint32_t docId) noexcept;
using IAccelrated = vespalib::hwaccelrated::IAccelrated;
Update _update;
const IAccelrated & _accel;
- alignas(64) Word _lastWords[32];
+ alignas(64) Word _lastWords[8];
static constexpr size_t NumWordsInBatch = sizeof(_lastWords) / sizeof(Word);
};
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
index c6a9cc7ae9e..bbba4109fc2 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp
@@ -26,13 +26,13 @@ Avx2Accelrator::squaredEuclideanDistance(const double * a, const double * b, siz
}
void
-Avx2Accelrator::and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
- helper::andChunks<32u, 8u>(offset, src, dest);
+Avx2Accelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
+ helper::andChunks<32u, 2u>(offset, src, dest);
}
void
-Avx2Accelrator::or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
- helper::orChunks<32u, 8u>(offset, src, dest);
+Avx2Accelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
+ helper::orChunks<32u, 2u>(offset, src, dest);
}
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
index 61ca1573601..934d815d67b 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h
@@ -16,8 +16,8 @@ public:
double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override;
- void and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
- void or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
+ void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
+ void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
index 5b3a7deb564..035f33cb25e 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp
@@ -36,13 +36,13 @@ Avx512Accelrator::squaredEuclideanDistance(const double * a, const double * b, s
}
void
-Avx512Accelrator::and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
- helper::andChunks<64, 4>(offset, src, dest);
+Avx512Accelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
+ helper::andChunks<64, 1>(offset, src, dest);
}
void
-Avx512Accelrator::or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
- helper::orChunks<64, 4>(offset, src, dest);
+Avx512Accelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
+ helper::orChunks<64, 1>(offset, src, dest);
}
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
index fbfdd021619..38eab0a2549 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h
@@ -18,8 +18,8 @@ public:
double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override;
- void and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
- void or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
+ void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
+ void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
index b6b8436a389..a8e5535cc21 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp
@@ -173,13 +173,13 @@ GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b,
}
void
-GenericAccelrator::and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
- helper::andChunks<16, 16>(offset, src, dest);
+GenericAccelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
+ helper::andChunks<16, 4>(offset, src, dest);
}
void
-GenericAccelrator::or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
- helper::orChunks<16, 16>(offset, src, dest);
+GenericAccelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept {
+ helper::orChunks<16,4>(offset, src, dest);
}
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
index 5cbabc3de53..16c8bab71da 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h
@@ -26,8 +26,8 @@ public:
double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override;
double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override;
double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override;
- void and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
- void or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
+ void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
+ void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override;
};
}
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
index 77d168a2c5d..d707553b504 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp
@@ -153,8 +153,8 @@ verifyOr64(const IAccelrated & accel, const std::vector<std::vector<uint64_t>> &
simpleOrWith(expected, optionallyInvert(vRefs[j].second, vectors[j]));
}
- uint64_t dest[32] __attribute((aligned(64)));
- accel.or256(offset * sizeof(uint64_t), vRefs, dest);
+ uint64_t dest[8] __attribute((aligned(64)));
+ accel.or64(offset*sizeof(uint64_t), vRefs, dest);
int diff = memcmp(&expected[offset], dest, sizeof(dest));
if (diff != 0) {
LOG_ABORT("Accelerator fails to compute correct 64 bytes OR");
@@ -174,8 +174,8 @@ verifyAnd64(const IAccelrated & accel, const std::vector<std::vector<uint64_t>>
simpleAndWith(expected, optionallyInvert(vRefs[j].second, vectors[j]));
}
- uint64_t dest[32] __attribute((aligned(64)));
- accel.and256(offset * sizeof(uint64_t), vRefs, dest);
+ uint64_t dest[8] __attribute((aligned(64)));
+ accel.and64(offset*sizeof(uint64_t), vRefs, dest);
int diff = memcmp(&expected[offset], dest, sizeof(dest));
if (diff != 0) {
LOG_ABORT("Accelerator fails to compute correct 64 bytes AND");
@@ -186,9 +186,9 @@ void
verifyOr64(const IAccelrated & accel) {
std::vector<std::vector<uint64_t>> vectors(3) ;
for (auto & v : vectors) {
- fill(v, 64);
+ fill(v, 16);
}
- for (size_t offset = 0; offset < 32; offset++) {
+ for (size_t offset = 0; offset < 8; offset++) {
for (size_t i = 1; i < vectors.size(); i++) {
verifyOr64(accel, vectors, offset, i, false);
verifyOr64(accel, vectors, offset, i, true);
@@ -200,9 +200,9 @@ void
verifyAnd64(const IAccelrated & accel) {
std::vector<std::vector<uint64_t>> vectors(3);
for (auto & v : vectors) {
- fill(v, 64);
+ fill(v, 16);
}
- for (size_t offset = 0; offset < 32; offset++) {
+ for (size_t offset = 0; offset < 8; offset++) {
for (size_t i = 1; i < vectors.size(); i++) {
verifyAnd64(accel, vectors, offset, i, false);
verifyAnd64(accel, vectors, offset, i, true);
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
index e6bd86957db..806e77caced 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h
@@ -31,10 +31,10 @@ public:
virtual double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept = 0;
virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept = 0;
virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept = 0;
- // AND 256 bytes from multiple, optionally inverted sources
- virtual void and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept = 0;
- // OR 256 bytes from multiple, optionally inverted sources
- virtual void or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept = 0;
+ // AND 64 bytes from multiple, optionally inverted sources
+ virtual void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept = 0;
+ // OR 64 bytes from multiple, optionally inverted sources
+ virtual void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept = 0;
static const IAccelrated & getAccelerator() __attribute__((noinline));
};
diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
index 3185d6e77cd..c884f0d7bb9 100644
--- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
+++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp
@@ -43,7 +43,7 @@ void
andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize");
- static_assert(ChunkSize*Chunks == 256, "ChunkSize*Chunks == 256");
+ static_assert(ChunkSize*Chunks == 64, "ChunkSize*Chunks == 64");
Chunk * chunk = static_cast<Chunk *>(dest);
const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset);
for (size_t n=0; n < Chunks; n++) {
@@ -62,7 +62,7 @@ void
orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) {
typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize)));
static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize");
- static_assert(ChunkSize*Chunks == 256, "ChunkSize*Chunks == 256");
+ static_assert(ChunkSize*Chunks == 64, "ChunkSize*Chunks == 64");
Chunk * chunk = static_cast<Chunk *>(dest);
const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset);
for (size_t n=0; n < Chunks; n++) {