From 48b1bae2a6cdf58a237aa7be59632a06aba86861 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Thu, 14 Dec 2023 16:00:46 +0000 Subject: Slice the vectors and use 1 thread per slice when computing the OR. --- .../src/tests/common/bitvector/bitvector_test.cpp | 54 ++++++++++++++++++++ .../attribute/postinglistsearchcontext.hpp | 8 ++- searchlib/src/vespa/searchlib/common/bitvector.cpp | 57 ++++++++++++++++++++++ searchlib/src/vespa/searchlib/common/bitvector.h | 8 +++ 4 files changed, 125 insertions(+), 2 deletions(-) diff --git a/searchlib/src/tests/common/bitvector/bitvector_test.cpp b/searchlib/src/tests/common/bitvector/bitvector_test.cpp index 571ea80ef36..2ac9fb738f8 100644 --- a/searchlib/src/tests/common/bitvector/bitvector_test.cpp +++ b/searchlib/src/tests/common/bitvector/bitvector_test.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include using namespace search; @@ -744,4 +745,57 @@ TEST("require that creating partial overlapping vector is properly copied") { EXPECT_EQUAL(100u, before->countTrueBits()); } +void fill(BitVector & bv) { + uint32_t numBitsSet = bv.size()/10; + for (uint32_t i(0); i < numBitsSet; i++) { + bv.setBit(rand()%bv.size()); + } +} + +BitVector::UP +orSerial(const std::vector & bvs) { + BitVector::UP master = BitVector::create(*bvs[0]); + for (uint32_t i(1); i < bvs.size(); i++) { + master->orWith(*bvs[i]); + } + return master; +} + +BitVector::UP +orParallell(vespalib::ThreadBundle & thread_bundle, const std::vector & bvs) { + BitVector::UP master = BitVector::create(*bvs[0]); + std::vector vectors; + vectors.reserve(bvs.size()); + vectors.push_back(master.get()); + for (uint32_t i(1); i < bvs.size(); i++) { + vectors.push_back(bvs[i].get()); + } + BitVector::parallellOr(thread_bundle, vectors); + return master; +} + +void verifyParallellOr(vespalib::ThreadBundle & thread_bundle, uint32_t numVectors, uint32_t numBits) { + std::vector bvs; + bvs.reserve(numVectors); + for (uint32_t i(0); i < numVectors; i++) { + bvs.push_back(BitVector::create(numBits)); + fill(*bvs.back()); + } + auto serial = orSerial(bvs); + auto parallell = orParallell(thread_bundle, bvs); + EXPECT_TRUE(*serial == *parallell); +} + +TEST("Require that parallell OR computes same result as serial") { + srand(7); + for (uint32_t numThreads : {1, 3, 7}) { + vespalib::SimpleThreadBundle thread_bundle(numThreads); + for (uint32_t numVectors : {1, 2, 5}) { + for (uint32_t numBits : {1117, 11117, 111117, 1111117, 11111117}) { + verifyParallellOr(thread_bundle, numVectors, numBits); + } + } + } +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp index 68d78619cb5..65ed15a866f 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp @@ -121,9 +121,12 @@ PostingListSearchContextT::fillBitVector(vespalib::ThreadBundle & thread_ parts.emplace_back(_posting_store, parts[i-1]._to, num_this_thread, _merger.getDocIdLimit()); } thread_bundle.run(parts); - for (size_t i(1); i < parts.size(); i++) { - master->orWith(*parts[i]._bv); + std::vector vectors; + vectors.reserve(parts.size()); + for (const auto & part : parts) { + vectors.push_back(part._bv); } + BitVector::parallellOr(thread_bundle, vectors); } template @@ -161,6 +164,7 @@ PostingListSearchContextT::fetchPostings(const queryeval::ExecuteInfo & e if (!_merger.merge_done() && _uniqueValues >= 2u && this->_dictionary.get_has_btree_dictionary()) { if (execInfo.is_strict() || use_posting_lists_when_non_strict(execInfo)) { size_t sum = estimated_hits_in_range(); + //TODO Honour soft_doom and forward it to merge code if (sum < (_docIdLimit * threshold_for_using_array)) { _merger.reserveArray(_uniqueValues, sum); fillArray(); diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp index b79703a8e5c..898a21d860e 100644 --- a/searchlib/src/vespa/searchlib/common/bitvector.cpp +++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,62 @@ using vespalib::nbostream; bool BitVector::_enable_range_check = false; + +struct BitVector::OrParts : vespalib::Runnable +{ + OrParts(vespalib::ConstArrayRef vectors, BitVector::Index offset, BitVector::Index size) noexcept + : _vectors(vectors), + _offset(offset), + _byte_size((size + 7)/8) + {} + void run() override { + const auto & accelrator = IAccelrated::getAccelerator(); + BitVector * master = _vectors[0]; + Word * destination = master->getWordIndex(_offset); + for (uint32_t i(1); i < _vectors.size(); i++) { + accelrator.orBit(destination, _vectors[i]->getWordIndex(_offset), _byte_size); + } + } + vespalib::ConstArrayRef _vectors; + BitVector::Index _offset; + BitVector::Index _byte_size; +}; + + +void +BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef vectors) { + constexpr uint32_t MIN_BITS_PER_THREAD = 128_Ki; + constexpr uint32_t ALIGNMENT_BITS = 8_Ki; + if (vectors.size() < 2) return; + BitVector * master = vectors[0]; + Index size = master->size(); + uint32_t bits_per_thread = size/thread_bundle.size(); + if ((bits_per_thread < MIN_BITS_PER_THREAD) || (thread_bundle.size() < 2)) { + for (uint32_t i(1); i < vectors.size(); i++) { + master->orWith(*vectors[i]); + } + } else { + Index startIndex = master->getStartIndex(); + for (const BitVector *bv: vectors) { + assert(bv->getStartIndex() == startIndex); + assert(bv->size() == size); + } + std::vector parts; + parts.reserve(thread_bundle.size()); + bits_per_thread = (bits_per_thread/ALIGNMENT_BITS) * ALIGNMENT_BITS; + parts.emplace_back(vectors, 0, bits_per_thread); + BitVector::Index offset = bits_per_thread; + for (uint32_t i(1); (i + 1) < thread_bundle.size(); i++) { + parts.emplace_back(vectors, offset, bits_per_thread); + offset += bits_per_thread; + } + parts.emplace_back(vectors, offset, size - offset); + thread_bundle.run(parts); + master->repairEnds(); + } + +} + Alloc BitVector::allocatePaddedAndAligned(Index start, Index end, Index capacity, const Alloc* init_alloc) { diff --git a/searchlib/src/vespa/searchlib/common/bitvector.h b/searchlib/src/vespa/searchlib/common/bitvector.h index 943db5f06ba..0312e37a33c 100644 --- a/searchlib/src/vespa/searchlib/common/bitvector.h +++ b/searchlib/src/vespa/searchlib/common/bitvector.h @@ -5,6 +5,7 @@ #include "bitword.h" #include #include +#include "vespa/vespalib/util/arrayref.h" #include #if VESPA_ENABLE_BITVECTOR_RANGE_CHECK #include @@ -12,6 +13,7 @@ namespace vespalib { class nbostream; + struct ThreadBundle; } class FastOS_FileInterface; @@ -281,6 +283,11 @@ public: static UP create(Index numberOfElements); static UP create(const BitVector & rhs); static void consider_enable_range_check(); + /** + * Will slice the vectors and if possible use the thread bundle do the operation in parallell + * TODO: Extend to handle both AND/OR + */ + static void parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef vectors); protected: using Alloc = vespalib::alloc::Alloc; VESPA_DLL_LOCAL BitVector(void * buf, Index start, Index end) noexcept; @@ -306,6 +313,7 @@ protected: static Alloc allocatePaddedAndAligned(Index start, Index end, Index capacity, const Alloc* init_alloc = nullptr); private: + struct OrParts; static Word load(const Word &word) noexcept { return vespalib::atomic::load_ref_relaxed(word); } VESPA_DLL_LOCAL void store(Word &word, Word value); static void store_unchecked(Word &word, Word value) noexcept { -- cgit v1.2.3 From bc751bbc305da60c730f0198a4f024b7d7dc0dd4 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Mon, 15 Jan 2024 11:51:17 +0000 Subject: No need to special handle the first. --- searchlib/src/vespa/searchlib/common/bitvector.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp index cee5801beb9..9deadf2e0bb 100644 --- a/searchlib/src/vespa/searchlib/common/bitvector.cpp +++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp @@ -78,9 +78,8 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr std::vector parts; parts.reserve(thread_bundle.size()); bits_per_thread = (bits_per_thread/ALIGNMENT_BITS) * ALIGNMENT_BITS; - parts.emplace_back(vectors, 0, bits_per_thread); - BitVector::Index offset = bits_per_thread; - for (uint32_t i(1); (i + 1) < thread_bundle.size(); i++) { + BitVector::Index offset = 0; + for (uint32_t i(0); (i + 1) < thread_bundle.size(); i++) { parts.emplace_back(vectors, offset, bits_per_thread); offset += bits_per_thread; } -- cgit v1.2.3 From 3583d425668e648ba0fd2702d2e011bae7b7a55d Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 16 Jan 2024 09:45:00 +0000 Subject: Allow to use only some threads. --- searchlib/src/vespa/searchlib/common/bitvector.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp index 9deadf2e0bb..2297e886916 100644 --- a/searchlib/src/vespa/searchlib/common/bitvector.cpp +++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp @@ -56,7 +56,6 @@ struct BitVector::OrParts : vespalib::Runnable BitVector::Index _byte_size; }; - void BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef vectors) { constexpr uint32_t MIN_BITS_PER_THREAD = 128_Ki; @@ -64,8 +63,11 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr if (vectors.size() < 2) return; BitVector * master = vectors[0]; Index size = master->size(); - uint32_t bits_per_thread = size/thread_bundle.size(); - if ((bits_per_thread < MIN_BITS_PER_THREAD) || (thread_bundle.size() < 2)) { + size_t max_num_chunks = (size + (MIN_BITS_PER_THREAD - 1)) / MIN_BITS_PER_THREAD; + size_t max_threads = std::max(1ul, std::min(thread_bundle.size(), max_num_chunks)); + + uint32_t bits_per_thread = size/max_threads; + if (max_threads < 2) { for (uint32_t i(1); i < vectors.size(); i++) { master->orWith(*vectors[i]); } @@ -75,11 +77,11 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr assert(bv->getStartIndex() == startIndex); assert(bv->size() == size); } - std::vector parts; - parts.reserve(thread_bundle.size()); + std::vector parts; + parts.reserve(max_threads); bits_per_thread = (bits_per_thread/ALIGNMENT_BITS) * ALIGNMENT_BITS; - BitVector::Index offset = 0; - for (uint32_t i(0); (i + 1) < thread_bundle.size(); i++) { + Index offset = 0; + for (uint32_t i(0); (i + 1) < max_threads; i++) { parts.emplace_back(vectors, offset, bits_per_thread); offset += bits_per_thread; } @@ -87,7 +89,6 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr thread_bundle.run(parts); master->repairEnds(); } - } Alloc -- cgit v1.2.3 From 802cc7f81943675a1f0585645a21efc7340b84fb Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 16 Jan 2024 17:07:05 +0000 Subject: Improve comment and make assert more explicit. --- searchlib/src/vespa/searchlib/common/bitvector.cpp | 6 ++---- searchlib/src/vespa/searchlib/common/bitvector.h | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp index 2297e886916..4f1d3a3a72c 100644 --- a/searchlib/src/vespa/searchlib/common/bitvector.cpp +++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp @@ -66,20 +66,18 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr size_t max_num_chunks = (size + (MIN_BITS_PER_THREAD - 1)) / MIN_BITS_PER_THREAD; size_t max_threads = std::max(1ul, std::min(thread_bundle.size(), max_num_chunks)); - uint32_t bits_per_thread = size/max_threads; if (max_threads < 2) { for (uint32_t i(1); i < vectors.size(); i++) { master->orWith(*vectors[i]); } } else { - Index startIndex = master->getStartIndex(); for (const BitVector *bv: vectors) { - assert(bv->getStartIndex() == startIndex); + assert(bv->getStartIndex() == 0u); assert(bv->size() == size); } std::vector parts; parts.reserve(max_threads); - bits_per_thread = (bits_per_thread/ALIGNMENT_BITS) * ALIGNMENT_BITS; + uint32_t bits_per_thread = ((size/max_threads)/ALIGNMENT_BITS) * ALIGNMENT_BITS; Index offset = 0; for (uint32_t i(0); (i + 1) < max_threads; i++) { parts.emplace_back(vectors, offset, bits_per_thread); diff --git a/searchlib/src/vespa/searchlib/common/bitvector.h b/searchlib/src/vespa/searchlib/common/bitvector.h index c403ce0d8a9..30d01c4a58b 100644 --- a/searchlib/src/vespa/searchlib/common/bitvector.h +++ b/searchlib/src/vespa/searchlib/common/bitvector.h @@ -285,6 +285,7 @@ public: static void consider_enable_range_check(); /** * Will slice the vectors and if possible use the thread bundle do the operation in parallell + * The result of the operation ends up in the first vector. * TODO: Extend to handle both AND/OR */ static void parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef vectors); -- cgit v1.2.3