aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2024-01-16 20:05:40 +0100
committerGitHub <noreply@github.com>2024-01-16 20:05:40 +0100
commita7b8a285c1075f5f5e6160ede3d53a3c3b9241fd (patch)
tree25850b2ef1c5e03e791225e86c73d5df015cbab1
parentdf5d011fffe330efc8321b0f0050cae03d45cf91 (diff)
parent802cc7f81943675a1f0585645a21efc7340b84fb (diff)
Merge pull request #29660 from vespa-engine/balder/sliced-parallell-orv8.288.15
Slice the vectors and use 1 thread per slice when computing the OR.
-rw-r--r--searchlib/src/tests/common/bitvector/bitvector_test.cpp54
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp8
-rw-r--r--searchlib/src/vespa/searchlib/common/bitvector.cpp55
-rw-r--r--searchlib/src/vespa/searchlib/common/bitvector.h9
4 files changed, 124 insertions, 2 deletions
diff --git a/searchlib/src/tests/common/bitvector/bitvector_test.cpp b/searchlib/src/tests/common/bitvector/bitvector_test.cpp
index 351890e94bf..758f44cdba2 100644
--- a/searchlib/src/tests/common/bitvector/bitvector_test.cpp
+++ b/searchlib/src/tests/common/bitvector/bitvector_test.cpp
@@ -11,6 +11,7 @@
#include <vespa/vespalib/test/memory_allocator_observer.h>
#include <vespa/vespalib/util/rand48.h>
#include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/simple_thread_bundle.h>
#include <algorithm>
using namespace search;
@@ -744,4 +745,57 @@ TEST("require that creating partial overlapping vector is properly copied") {
EXPECT_EQUAL(100u, before->countTrueBits());
}
+void fill(BitVector & bv) {
+ uint32_t numBitsSet = bv.size()/10;
+ for (uint32_t i(0); i < numBitsSet; i++) {
+ bv.setBit(rand()%bv.size());
+ }
+}
+
+BitVector::UP
+orSerial(const std::vector<BitVector::UP> & bvs) {
+ BitVector::UP master = BitVector::create(*bvs[0]);
+ for (uint32_t i(1); i < bvs.size(); i++) {
+ master->orWith(*bvs[i]);
+ }
+ return master;
+}
+
+BitVector::UP
+orParallell(vespalib::ThreadBundle & thread_bundle, const std::vector<BitVector::UP> & bvs) {
+ BitVector::UP master = BitVector::create(*bvs[0]);
+ std::vector<BitVector *> vectors;
+ vectors.reserve(bvs.size());
+ vectors.push_back(master.get());
+ for (uint32_t i(1); i < bvs.size(); i++) {
+ vectors.push_back(bvs[i].get());
+ }
+ BitVector::parallellOr(thread_bundle, vectors);
+ return master;
+}
+
+void verifyParallellOr(vespalib::ThreadBundle & thread_bundle, uint32_t numVectors, uint32_t numBits) {
+ std::vector<BitVector::UP> bvs;
+ bvs.reserve(numVectors);
+ for (uint32_t i(0); i < numVectors; i++) {
+ bvs.push_back(BitVector::create(numBits));
+ fill(*bvs.back());
+ }
+ auto serial = orSerial(bvs);
+ auto parallell = orParallell(thread_bundle, bvs);
+ EXPECT_TRUE(*serial == *parallell);
+}
+
+TEST("Require that parallell OR computes same result as serial") {
+ srand(7);
+ for (uint32_t numThreads : {1, 3, 7}) {
+ vespalib::SimpleThreadBundle thread_bundle(numThreads);
+ for (uint32_t numVectors : {1, 2, 5}) {
+ for (uint32_t numBits : {1117, 11117, 111117, 1111117, 11111117}) {
+ verifyParallellOr(thread_bundle, numVectors, numBits);
+ }
+ }
+ }
+}
+
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
index 65c103152d3..f937d567588 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
@@ -127,9 +127,12 @@ PostingListSearchContextT<DataT>::fillBitVector(const ExecuteInfo & exec_info)
parts.emplace_back(exec_info.doom(), _posting_store, parts[i-1]._to, num_this_thread, _merger.getDocIdLimit());
}
thread_bundle.run(parts);
- for (size_t i(1); i < parts.size(); i++) {
- master->orWith(*parts[i]._bv);
+ std::vector<BitVector *> vectors;
+ vectors.reserve(parts.size());
+ for (const auto & part : parts) {
+ vectors.push_back(part._bv);
}
+ BitVector::parallellOr(thread_bundle, vectors);
}
template <typename DataT>
@@ -167,6 +170,7 @@ PostingListSearchContextT<DataT>::fetchPostings(const ExecuteInfo & exec_info)
if (!_merger.merge_done() && _uniqueValues >= 2u && this->_dictionary.get_has_btree_dictionary()) {
if (exec_info.is_strict() || use_posting_lists_when_non_strict(exec_info)) {
size_t sum = estimated_hits_in_range();
+ //TODO Honour soft_doom and forward it to merge code
if (sum < (_docIdLimit * threshold_for_using_array)) {
_merger.reserveArray(_uniqueValues, sum);
fillArray();
diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp
index ab46ac348e6..4f1d3a3a72c 100644
--- a/searchlib/src/vespa/searchlib/common/bitvector.cpp
+++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp
@@ -6,6 +6,7 @@
#include <vespa/searchlib/util/file_settings.h>
#include <vespa/vespalib/hwaccelrated/iaccelrated.h>
#include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/thread_bundle.h>
#include <vespa/vespalib/util/size_literals.h>
#include <vespa/vespalib/objects/nbostream.h>
#include <vespa/fastos/file.h>
@@ -34,6 +35,60 @@ using vespalib::nbostream;
bool BitVector::_enable_range_check = false;
+
+struct BitVector::OrParts : vespalib::Runnable
+{
+ OrParts(vespalib::ConstArrayRef<BitVector *> vectors, BitVector::Index offset, BitVector::Index size) noexcept
+ : _vectors(vectors),
+ _offset(offset),
+ _byte_size((size + 7)/8)
+ {}
+ void run() override {
+ const auto & accelrator = IAccelrated::getAccelerator();
+ BitVector * master = _vectors[0];
+ Word * destination = master->getWordIndex(_offset);
+ for (uint32_t i(1); i < _vectors.size(); i++) {
+ accelrator.orBit(destination, _vectors[i]->getWordIndex(_offset), _byte_size);
+ }
+ }
+ vespalib::ConstArrayRef<BitVector *> _vectors;
+ BitVector::Index _offset;
+ BitVector::Index _byte_size;
+};
+
+void
+BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef<BitVector *> vectors) {
+ constexpr uint32_t MIN_BITS_PER_THREAD = 128_Ki;
+ constexpr uint32_t ALIGNMENT_BITS = 8_Ki;
+ if (vectors.size() < 2) return;
+ BitVector * master = vectors[0];
+ Index size = master->size();
+ size_t max_num_chunks = (size + (MIN_BITS_PER_THREAD - 1)) / MIN_BITS_PER_THREAD;
+ size_t max_threads = std::max(1ul, std::min(thread_bundle.size(), max_num_chunks));
+
+ if (max_threads < 2) {
+ for (uint32_t i(1); i < vectors.size(); i++) {
+ master->orWith(*vectors[i]);
+ }
+ } else {
+ for (const BitVector *bv: vectors) {
+ assert(bv->getStartIndex() == 0u);
+ assert(bv->size() == size);
+ }
+ std::vector<BitVector::OrParts> parts;
+ parts.reserve(max_threads);
+ uint32_t bits_per_thread = ((size/max_threads)/ALIGNMENT_BITS) * ALIGNMENT_BITS;
+ Index offset = 0;
+ for (uint32_t i(0); (i + 1) < max_threads; i++) {
+ parts.emplace_back(vectors, offset, bits_per_thread);
+ offset += bits_per_thread;
+ }
+ parts.emplace_back(vectors, offset, size - offset);
+ thread_bundle.run(parts);
+ master->repairEnds();
+ }
+}
+
Alloc
BitVector::allocatePaddedAndAligned(Index start, Index end, Index capacity, const Alloc* init_alloc)
{
diff --git a/searchlib/src/vespa/searchlib/common/bitvector.h b/searchlib/src/vespa/searchlib/common/bitvector.h
index d69f2ac845b..30d01c4a58b 100644
--- a/searchlib/src/vespa/searchlib/common/bitvector.h
+++ b/searchlib/src/vespa/searchlib/common/bitvector.h
@@ -5,6 +5,7 @@
#include "bitword.h"
#include <vespa/vespalib/util/alloc.h>
#include <vespa/vespalib/util/atomic.h>
+#include "vespa/vespalib/util/arrayref.h"
#include <algorithm>
#if VESPA_ENABLE_BITVECTOR_RANGE_CHECK
#include <cassert>
@@ -12,6 +13,7 @@
namespace vespalib {
class nbostream;
+ struct ThreadBundle;
}
class FastOS_FileInterface;
@@ -281,6 +283,12 @@ public:
static UP create(Index numberOfElements);
static UP create(const BitVector & rhs);
static void consider_enable_range_check();
+ /**
+ * Will slice the vectors and if possible use the thread bundle do the operation in parallell
+ * The result of the operation ends up in the first vector.
+ * TODO: Extend to handle both AND/OR
+ */
+ static void parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef<BitVector *> vectors);
static Index numWords(Index bits) noexcept { return wordNum(bits + 1 + (WordLen - 1)); }
static Index numBytes(Index bits) noexcept { return numWords(bits) * sizeof(Word); }
protected:
@@ -306,6 +314,7 @@ protected:
static Alloc allocatePaddedAndAligned(Index start, Index end, Index capacity, const Alloc* init_alloc = nullptr);
private:
+ struct OrParts;
static Word load(const Word &word) noexcept { return vespalib::atomic::load_ref_relaxed(word); }
VESPA_DLL_LOCAL void store(Word &word, Word value);
static void store_unchecked(Word &word, Word value) noexcept {