diff options
11 files changed, 66 insertions, 73 deletions
diff --git a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp index 0681768db2b..fdf4ec950dd 100644 --- a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp @@ -4,6 +4,7 @@ #include "andsearch.h" #include "andnotsearch.h" #include "sourceblendersearch.h" +#include <vespa/searchlib/common/bitvectoriterator.h> #include <vespa/vespalib/hwaccelrated/iaccelrated.h> namespace search::queryeval { @@ -17,17 +18,17 @@ namespace { struct And { using Word = BitWord::Word; void operator () (const IAccelrated & accel, size_t offset, const std::vector<Meta> & src, void *dest) noexcept { - accel.and256(offset, src, dest); + accel.and64(offset, src, dest); } - static constexpr bool isAnd() noexcept { return true; } + static bool isAnd() noexcept { return true; } }; struct Or { using Word = BitWord::Word; void operator () (const IAccelrated & accel, size_t offset, const std::vector<Meta> & src, void *dest) noexcept { - accel.or256(offset, src, dest); + accel.or64(offset, src, dest); } - static constexpr bool isAnd() noexcept { return false; } + static bool isAnd() noexcept { return false; } }; } @@ -55,47 +56,43 @@ MultiBitVector<Update>::MultiBitVector(size_t reserved) _accel(IAccelrated::getAccelerator()), _lastWords() { - static_assert(sizeof(_lastWords) == 256, "Lastwords should have 256 byte size"); - static_assert(NumWordsInBatch == 32, "Batch size should be 32 words."); + static_assert(sizeof(_lastWords) == 64, "Lastwords should have 64 byte size"); + static_assert(NumWordsInBatch == 8, "Batch size should be 8 words."); memset(_lastWords, 0, sizeof(_lastWords)); } template<typename Update> bool -MultiBitVector<Update>::updateLastValueCold(uint32_t docId) noexcept +MultiBitVector<Update>::updateLastValue(uint32_t docId) noexcept { - if (__builtin_expect(isAtEnd(docId), false)) { - return true; - } - const uint32_t index(BitWord::wordNum(docId)); - if (docId >= _lastMaxDocIdLimitRequireFetch) { - fetchChunk(index); + if (docId >= _lastMaxDocIdLimit) { + if (__builtin_expect(isAtEnd(docId), false)) { + return true; + } + const uint32_t index(BitWord::wordNum(docId)); + if (docId >= _lastMaxDocIdLimitRequireFetch) { + uint32_t baseIndex = index & ~(NumWordsInBatch - 1); + _update(_accel, baseIndex*sizeof(Word), _bvs, _lastWords); + _lastMaxDocIdLimitRequireFetch = (baseIndex + NumWordsInBatch) * BitWord::WordLen; + } + _lastValue = _lastWords[index % NumWordsInBatch]; + _lastMaxDocIdLimit = (index + 1) * BitWord::WordLen; } - _lastValue = _lastWords[index % NumWordsInBatch]; - _lastMaxDocIdLimit = (index + 1) * BitWord::WordLen; return false; } template<typename Update> -void -MultiBitVector<Update>::fetchChunk(uint32_t index) noexcept -{ - uint32_t baseIndex = index & ~(NumWordsInBatch - 1); - _update(_accel, baseIndex*sizeof(Word), _bvs, _lastWords); - _lastMaxDocIdLimitRequireFetch = (baseIndex + NumWordsInBatch) * BitWord::WordLen; -} - -template<typename Update> uint32_t MultiBitVector<Update>::strictSeek(uint32_t docId) noexcept { bool atEnd; for (atEnd = updateLastValue(docId), _lastValue = _lastValue & BitWord::checkTab(docId); - __builtin_expect(_lastValue == 0, Update::isAnd()) && __builtin_expect(! atEnd, true); // And is likely to have few bits, while Or has many. + (_lastValue == 0) && __builtin_expect(! atEnd, true); atEnd = updateLastValue(_lastMaxDocIdLimit)); - return (__builtin_expect(!atEnd, true)) - ? _lastMaxDocIdLimit - BitWord::WordLen + vespalib::Optimized::lsbIdx(_lastValue) - : _numDocs; + if (__builtin_expect(!atEnd, true)) { + return _lastMaxDocIdLimit - BitWord::WordLen + vespalib::Optimized::lsbIdx(_lastValue); + } + return _numDocs; } template<typename Update> @@ -103,8 +100,12 @@ bool MultiBitVector<Update>::seek(uint32_t docId) noexcept { bool atEnd = updateLastValue(docId); - return __builtin_expect( ! atEnd, true) && - __builtin_expect(_lastValue & BitWord::mask(docId), false); + if (__builtin_expect( ! atEnd, true)) { + if (_lastValue & BitWord::mask(docId)) { + return true; + } + } + return false; } namespace { @@ -159,7 +160,7 @@ template<typename Update> void MultiBitVectorIterator<Update>::doSeek(uint32_t docId) { - if (_mbv.seek(docId)) [[unlikely]] { + if (_mbv.seek(docId)) { setDocId(docId); } } diff --git a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h index 5f9a3b20b50..2b4f90544ac 100644 --- a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h +++ b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h @@ -37,20 +37,12 @@ public: bool seek(uint32_t docId) noexcept; bool acceptExtraFilter() const noexcept { return Update::isAnd(); } private: - bool updateLastValue(uint32_t docId) noexcept { - if (docId >= _lastMaxDocIdLimit) { - return updateLastValueCold(docId); - } - return false; - } - VESPA_DLL_LOCAL bool updateLastValueCold(uint32_t docId) noexcept __attribute__((noinline)); - VESPA_DLL_LOCAL void fetchChunk(uint32_t docId) noexcept __attribute__((noinline)); - + bool updateLastValue(uint32_t docId) noexcept; using IAccelrated = vespalib::hwaccelrated::IAccelrated; Update _update; const IAccelrated & _accel; - alignas(64) Word _lastWords[32]; + alignas(64) Word _lastWords[8]; static constexpr size_t NumWordsInBatch = sizeof(_lastWords) / sizeof(Word); }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index c6a9cc7ae9e..bbba4109fc2 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -26,13 +26,13 @@ Avx2Accelrator::squaredEuclideanDistance(const double * a, const double * b, siz } void -Avx2Accelrator::and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { - helper::andChunks<32u, 8u>(offset, src, dest); +Avx2Accelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { + helper::andChunks<32u, 2u>(offset, src, dest); } void -Avx2Accelrator::or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { - helper::orChunks<32u, 8u>(offset, src, dest); +Avx2Accelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { + helper::orChunks<32u, 2u>(offset, src, dest); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index 61ca1573601..934d815d67b 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -16,8 +16,8 @@ public: double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override; - void and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; - void or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; + void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; + void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp index 5b3a7deb564..035f33cb25e 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -36,13 +36,13 @@ Avx512Accelrator::squaredEuclideanDistance(const double * a, const double * b, s } void -Avx512Accelrator::and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { - helper::andChunks<64, 4>(offset, src, dest); +Avx512Accelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { + helper::andChunks<64, 1>(offset, src, dest); } void -Avx512Accelrator::or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { - helper::orChunks<64, 4>(offset, src, dest); +Avx512Accelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { + helper::orChunks<64, 1>(offset, src, dest); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h index fbfdd021619..38eab0a2549 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -18,8 +18,8 @@ public: double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override; - void and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; - void or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; + void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; + void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index b6b8436a389..a8e5535cc21 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -173,13 +173,13 @@ GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, } void -GenericAccelrator::and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { - helper::andChunks<16, 16>(offset, src, dest); +GenericAccelrator::and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { + helper::andChunks<16, 4>(offset, src, dest); } void -GenericAccelrator::or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { - helper::orChunks<16, 16>(offset, src, dest); +GenericAccelrator::or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept { + helper::orChunks<16,4>(offset, src, dest); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h index 5cbabc3de53..16c8bab71da 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h @@ -26,8 +26,8 @@ public: double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override; - void and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; - void or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; + void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; + void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index 77d168a2c5d..d707553b504 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -153,8 +153,8 @@ verifyOr64(const IAccelrated & accel, const std::vector<std::vector<uint64_t>> & simpleOrWith(expected, optionallyInvert(vRefs[j].second, vectors[j])); } - uint64_t dest[32] __attribute((aligned(64))); - accel.or256(offset * sizeof(uint64_t), vRefs, dest); + uint64_t dest[8] __attribute((aligned(64))); + accel.or64(offset*sizeof(uint64_t), vRefs, dest); int diff = memcmp(&expected[offset], dest, sizeof(dest)); if (diff != 0) { LOG_ABORT("Accelerator fails to compute correct 64 bytes OR"); @@ -174,8 +174,8 @@ verifyAnd64(const IAccelrated & accel, const std::vector<std::vector<uint64_t>> simpleAndWith(expected, optionallyInvert(vRefs[j].second, vectors[j])); } - uint64_t dest[32] __attribute((aligned(64))); - accel.and256(offset * sizeof(uint64_t), vRefs, dest); + uint64_t dest[8] __attribute((aligned(64))); + accel.and64(offset*sizeof(uint64_t), vRefs, dest); int diff = memcmp(&expected[offset], dest, sizeof(dest)); if (diff != 0) { LOG_ABORT("Accelerator fails to compute correct 64 bytes AND"); @@ -186,9 +186,9 @@ void verifyOr64(const IAccelrated & accel) { std::vector<std::vector<uint64_t>> vectors(3) ; for (auto & v : vectors) { - fill(v, 64); + fill(v, 16); } - for (size_t offset = 0; offset < 32; offset++) { + for (size_t offset = 0; offset < 8; offset++) { for (size_t i = 1; i < vectors.size(); i++) { verifyOr64(accel, vectors, offset, i, false); verifyOr64(accel, vectors, offset, i, true); @@ -200,9 +200,9 @@ void verifyAnd64(const IAccelrated & accel) { std::vector<std::vector<uint64_t>> vectors(3); for (auto & v : vectors) { - fill(v, 64); + fill(v, 16); } - for (size_t offset = 0; offset < 32; offset++) { + for (size_t offset = 0; offset < 8; offset++) { for (size_t i = 1; i < vectors.size(); i++) { verifyAnd64(accel, vectors, offset, i, false); verifyAnd64(accel, vectors, offset, i, true); diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h index e6bd86957db..806e77caced 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h @@ -31,10 +31,10 @@ public: virtual double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept = 0; virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept = 0; virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept = 0; - // AND 256 bytes from multiple, optionally inverted sources - virtual void and256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept = 0; - // OR 256 bytes from multiple, optionally inverted sources - virtual void or256(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept = 0; + // AND 64 bytes from multiple, optionally inverted sources + virtual void and64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept = 0; + // OR 64 bytes from multiple, optionally inverted sources + virtual void or64(size_t offset, const std::vector<std::pair<const void *, bool>> &src, void *dest) const noexcept = 0; static const IAccelrated & getAccelerator() __attribute__((noinline)); }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp index 3185d6e77cd..c884f0d7bb9 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp @@ -43,7 +43,7 @@ void andChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) { typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize))); static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize"); - static_assert(ChunkSize*Chunks == 256, "ChunkSize*Chunks == 256"); + static_assert(ChunkSize*Chunks == 64, "ChunkSize*Chunks == 64"); Chunk * chunk = static_cast<Chunk *>(dest); const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset); for (size_t n=0; n < Chunks; n++) { @@ -62,7 +62,7 @@ void orChunks(size_t offset, const std::vector<std::pair<const void *, bool>> & src, void * dest) { typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize))); static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize"); - static_assert(ChunkSize*Chunks == 256, "ChunkSize*Chunks == 256"); + static_assert(ChunkSize*Chunks == 64, "ChunkSize*Chunks == 64"); Chunk * chunk = static_cast<Chunk *>(dest); const Chunk * tmp = cast<Chunk, ChunkSize>(src[0].first, offset); for (size_t n=0; n < Chunks; n++) { |