From 48b1bae2a6cdf58a237aa7be59632a06aba86861 Mon Sep 17 00:00:00 2001
From: Henning Baldersheim <balder@yahoo-inc.com>
Date: Thu, 14 Dec 2023 16:00:46 +0000
Subject: Slice the vectors and use 1 thread per slice when computing the OR.

---
 .../src/tests/common/bitvector/bitvector_test.cpp  | 54 ++++++++++++++++++++
 .../attribute/postinglistsearchcontext.hpp         |  8 ++-
 searchlib/src/vespa/searchlib/common/bitvector.cpp | 57 ++++++++++++++++++++++
 searchlib/src/vespa/searchlib/common/bitvector.h   |  8 +++
 4 files changed, 125 insertions(+), 2 deletions(-)
diff --git a/searchlib/src/tests/common/bitvector/bitvector_test.cpp b/searchlib/src/tests/common/bitvector/bitvector_test.cpp
index 571ea80ef36..2ac9fb738f8 100644
--- a/searchlib/src/tests/common/bitvector/bitvector_test.cpp
+++ b/searchlib/src/tests/common/bitvector/bitvector_test.cpp
@@ -11,6 +11,7 @@
 #include <vespa/vespalib/test/memory_allocator_observer.h>
 #include <vespa/vespalib/util/rand48.h>
 #include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/simple_thread_bundle.h>
 #include <algorithm>
 
 using namespace search;
@@ -744,4 +745,57 @@ TEST("require that creating partial overlapping vector is properly copied") {
     EXPECT_EQUAL(100u, before->countTrueBits());
 }
 
+void fill(BitVector & bv) {
+    uint32_t numBitsSet = bv.size()/10;
+    for (uint32_t i(0); i < numBitsSet; i++) {
+        bv.setBit(rand()%bv.size());
+    }
+}
+
+BitVector::UP
+orSerial(const std::vector<BitVector::UP> & bvs) {
+    BitVector::UP master = BitVector::create(*bvs[0]);
+    for (uint32_t i(1); i < bvs.size(); i++) {
+        master->orWith(*bvs[i]);
+    }
+    return master;
+}
+
+BitVector::UP
+orParallell(vespalib::ThreadBundle & thread_bundle, const std::vector<BitVector::UP> & bvs) {
+    BitVector::UP master = BitVector::create(*bvs[0]);
+    std::vector<BitVector *> vectors;
+    vectors.reserve(bvs.size());
+    vectors.push_back(master.get());
+    for (uint32_t i(1); i < bvs.size(); i++) {
+        vectors.push_back(bvs[i].get());
+    }
+    BitVector::parallellOr(thread_bundle, vectors);
+    return master;
+}
+
+void verifyParallellOr(vespalib::ThreadBundle & thread_bundle, uint32_t numVectors, uint32_t numBits) {
+    std::vector<BitVector::UP> bvs;
+    bvs.reserve(numVectors);
+    for (uint32_t i(0); i < numVectors; i++) {
+        bvs.push_back(BitVector::create(numBits));
+        fill(*bvs.back());
+    }
+    auto serial = orSerial(bvs);
+    auto parallell = orParallell(thread_bundle, bvs);
+    EXPECT_TRUE(*serial == *parallell);
+}
+
+TEST("Require that parallell OR computes same result as serial") {
+    srand(7);
+    for (uint32_t numThreads : {1, 3, 7}) {
+        vespalib::SimpleThreadBundle thread_bundle(numThreads);
+        for (uint32_t numVectors : {1, 2, 5}) {
+            for (uint32_t numBits : {1117, 11117, 111117, 1111117, 11111117}) {
+                verifyParallellOr(thread_bundle, numVectors, numBits);
+            }
+        }
+    }
+}
+
 TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
index 68d78619cb5..65ed15a866f 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
@@ -121,9 +121,12 @@ PostingListSearchContextT<DataT>::fillBitVector(vespalib::ThreadBundle & thread_
         parts.emplace_back(_posting_store, parts[i-1]._to, num_this_thread, _merger.getDocIdLimit());
     }
     thread_bundle.run(parts);
-    for (size_t i(1); i < parts.size(); i++) {
-        master->orWith(*parts[i]._bv);
+    std::vector<BitVector *> vectors;
+    vectors.reserve(parts.size());
+    for (const auto & part : parts) {
+        vectors.push_back(part._bv);
     }
+    BitVector::parallellOr(thread_bundle, vectors);
 }
 
 template <typename DataT>
@@ -161,6 +164,7 @@ PostingListSearchContextT<DataT>::fetchPostings(const queryeval::ExecuteInfo & e
     if (!_merger.merge_done() && _uniqueValues >= 2u && this->_dictionary.get_has_btree_dictionary()) {
         if (execInfo.is_strict() || use_posting_lists_when_non_strict(execInfo)) {
             size_t sum = estimated_hits_in_range();
+            //TODO Honour soft_doom and forward it to merge code
             if (sum < (_docIdLimit * threshold_for_using_array)) {
                 _merger.reserveArray(_uniqueValues, sum);
                 fillArray();
diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp
index b79703a8e5c..898a21d860e 100644
--- a/searchlib/src/vespa/searchlib/common/bitvector.cpp
+++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp
@@ -6,6 +6,7 @@
 #include <vespa/searchlib/util/file_settings.h>
 #include <vespa/vespalib/hwaccelrated/iaccelrated.h>
 #include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/thread_bundle.h>
 #include <vespa/vespalib/util/size_literals.h>
 #include <vespa/vespalib/objects/nbostream.h>
 #include <vespa/fastos/file.h>
@@ -34,6 +35,62 @@ using vespalib::nbostream;
 
 bool BitVector::_enable_range_check = false;
 
+
+struct BitVector::OrParts : vespalib::Runnable
+{
+    OrParts(vespalib::ConstArrayRef<BitVector *> vectors, BitVector::Index offset, BitVector::Index size) noexcept
+        : _vectors(vectors),
+          _offset(offset),
+          _byte_size((size + 7)/8)
+    {}
+    void run() override {
+        const auto & accelrator = IAccelrated::getAccelerator();
+        BitVector * master = _vectors[0];
+        Word * destination = master->getWordIndex(_offset);
+        for (uint32_t i(1); i < _vectors.size(); i++) {
+            accelrator.orBit(destination, _vectors[i]->getWordIndex(_offset), _byte_size);
+        }
+    }
+    vespalib::ConstArrayRef<BitVector *> _vectors;
+    BitVector::Index _offset;
+    BitVector::Index _byte_size;
+};
+
+
+void
+BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef<BitVector *> vectors) {
+    constexpr uint32_t MIN_BITS_PER_THREAD = 128_Ki;
+    constexpr uint32_t ALIGNMENT_BITS = 8_Ki;
+    if (vectors.size() < 2) return;
+    BitVector * master = vectors[0];
+    Index size = master->size();
+    uint32_t bits_per_thread = size/thread_bundle.size();
+    if ((bits_per_thread < MIN_BITS_PER_THREAD) || (thread_bundle.size() < 2)) {
+        for (uint32_t i(1); i < vectors.size(); i++) {
+            master->orWith(*vectors[i]);
+        }
+    } else {
+        Index startIndex = master->getStartIndex();
+        for (const BitVector *bv: vectors) {
+            assert(bv->getStartIndex() == startIndex);
+            assert(bv->size() == size);
+        }
+        std::vector<OrParts> parts;
+        parts.reserve(thread_bundle.size());
+        bits_per_thread = (bits_per_thread/ALIGNMENT_BITS) * ALIGNMENT_BITS;
+        parts.emplace_back(vectors, 0, bits_per_thread);
+        BitVector::Index offset = bits_per_thread;
+        for (uint32_t i(1); (i + 1) < thread_bundle.size(); i++) {
+            parts.emplace_back(vectors, offset, bits_per_thread);
+            offset += bits_per_thread;
+        }
+        parts.emplace_back(vectors, offset, size - offset);
+        thread_bundle.run(parts);
+        master->repairEnds();
+    }
+
+}
+
 Alloc
 BitVector::allocatePaddedAndAligned(Index start, Index end, Index capacity, const Alloc* init_alloc)
 {
diff --git a/searchlib/src/vespa/searchlib/common/bitvector.h b/searchlib/src/vespa/searchlib/common/bitvector.h
index 943db5f06ba..0312e37a33c 100644
--- a/searchlib/src/vespa/searchlib/common/bitvector.h
+++ b/searchlib/src/vespa/searchlib/common/bitvector.h
@@ -5,6 +5,7 @@
 #include "bitword.h"
 #include <vespa/vespalib/util/alloc.h>
 #include <vespa/vespalib/util/atomic.h>
+#include "vespa/vespalib/util/arrayref.h"
 #include <algorithm>
 #if VESPA_ENABLE_BITVECTOR_RANGE_CHECK
 #include <cassert>
@@ -12,6 +13,7 @@
 
 namespace vespalib {
     class nbostream;
+    struct ThreadBundle;
 }
 
 class FastOS_FileInterface;
@@ -281,6 +283,11 @@ public:
     static UP create(Index numberOfElements);
     static UP create(const BitVector & rhs);
     static void consider_enable_range_check();
+    /**
+     * Will slice the vectors and if possible use the thread bundle do the operation in parallell
+     * TODO: Extend to handle both AND/OR
+     */
+    static void parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef<BitVector *> vectors);
 protected:
     using Alloc = vespalib::alloc::Alloc;
     VESPA_DLL_LOCAL BitVector(void * buf, Index start, Index end) noexcept;
@@ -306,6 +313,7 @@ protected:
     static Alloc allocatePaddedAndAligned(Index start, Index end, Index capacity, const Alloc* init_alloc = nullptr);
 
 private:
+    struct OrParts;
     static Word load(const Word &word) noexcept { return vespalib::atomic::load_ref_relaxed(word); }
     VESPA_DLL_LOCAL void store(Word &word, Word value);
     static void store_unchecked(Word &word, Word value) noexcept {
-- 
cgit v1.2.3


From bc751bbc305da60c730f0198a4f024b7d7dc0dd4 Mon Sep 17 00:00:00 2001
From: Henning Baldersheim <balder@yahoo-inc.com>
Date: Mon, 15 Jan 2024 11:51:17 +0000
Subject: No need to special handle the first.

---
 searchlib/src/vespa/searchlib/common/bitvector.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp
index cee5801beb9..9deadf2e0bb 100644
--- a/searchlib/src/vespa/searchlib/common/bitvector.cpp
+++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp
@@ -78,9 +78,8 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr
         std::vector<OrParts> parts;
         parts.reserve(thread_bundle.size());
         bits_per_thread = (bits_per_thread/ALIGNMENT_BITS) * ALIGNMENT_BITS;
-        parts.emplace_back(vectors, 0, bits_per_thread);
-        BitVector::Index offset = bits_per_thread;
-        for (uint32_t i(1); (i + 1) < thread_bundle.size(); i++) {
+        BitVector::Index offset = 0;
+        for (uint32_t i(0); (i + 1) < thread_bundle.size(); i++) {
             parts.emplace_back(vectors, offset, bits_per_thread);
             offset += bits_per_thread;
         }
-- 
cgit v1.2.3


From 3583d425668e648ba0fd2702d2e011bae7b7a55d Mon Sep 17 00:00:00 2001
From: Henning Baldersheim <balder@yahoo-inc.com>
Date: Tue, 16 Jan 2024 09:45:00 +0000
Subject: Allow to use only some threads.

---
 searchlib/src/vespa/searchlib/common/bitvector.cpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp
index 9deadf2e0bb..2297e886916 100644
--- a/searchlib/src/vespa/searchlib/common/bitvector.cpp
+++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp
@@ -56,7 +56,6 @@ struct BitVector::OrParts : vespalib::Runnable
     BitVector::Index _byte_size;
 };
 
-
 void
 BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef<BitVector *> vectors) {
     constexpr uint32_t MIN_BITS_PER_THREAD = 128_Ki;
@@ -64,8 +63,11 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr
     if (vectors.size() < 2) return;
     BitVector * master = vectors[0];
     Index size = master->size();
-    uint32_t bits_per_thread = size/thread_bundle.size();
-    if ((bits_per_thread < MIN_BITS_PER_THREAD) || (thread_bundle.size() < 2)) {
+    size_t max_num_chunks = (size + (MIN_BITS_PER_THREAD - 1)) / MIN_BITS_PER_THREAD;
+    size_t max_threads = std::max(1ul, std::min(thread_bundle.size(), max_num_chunks));
+
+    uint32_t bits_per_thread = size/max_threads;
+    if (max_threads < 2) {
         for (uint32_t i(1); i < vectors.size(); i++) {
             master->orWith(*vectors[i]);
         }
@@ -75,11 +77,11 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr
             assert(bv->getStartIndex() == startIndex);
             assert(bv->size() == size);
         }
-        std::vector<OrParts> parts;
-        parts.reserve(thread_bundle.size());
+        std::vector<BitVector::OrParts> parts;
+        parts.reserve(max_threads);
         bits_per_thread = (bits_per_thread/ALIGNMENT_BITS) * ALIGNMENT_BITS;
-        BitVector::Index offset = 0;
-        for (uint32_t i(0); (i + 1) < thread_bundle.size(); i++) {
+        Index offset = 0;
+        for (uint32_t i(0); (i + 1) < max_threads; i++) {
             parts.emplace_back(vectors, offset, bits_per_thread);
             offset += bits_per_thread;
         }
@@ -87,7 +89,6 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr
         thread_bundle.run(parts);
         master->repairEnds();
     }
-
 }
 
 Alloc
-- 
cgit v1.2.3


From 802cc7f81943675a1f0585645a21efc7340b84fb Mon Sep 17 00:00:00 2001
From: Henning Baldersheim <balder@yahoo-inc.com>
Date: Tue, 16 Jan 2024 17:07:05 +0000
Subject: Improve comment and make assert more explicit.

---
 searchlib/src/vespa/searchlib/common/bitvector.cpp | 6 ++----
 searchlib/src/vespa/searchlib/common/bitvector.h   | 1 +
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp
index 2297e886916..4f1d3a3a72c 100644
--- a/searchlib/src/vespa/searchlib/common/bitvector.cpp
+++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp
@@ -66,20 +66,18 @@ BitVector::parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstAr
     size_t max_num_chunks = (size + (MIN_BITS_PER_THREAD - 1)) / MIN_BITS_PER_THREAD;
     size_t max_threads = std::max(1ul, std::min(thread_bundle.size(), max_num_chunks));
 
-    uint32_t bits_per_thread = size/max_threads;
     if (max_threads < 2) {
         for (uint32_t i(1); i < vectors.size(); i++) {
             master->orWith(*vectors[i]);
         }
     } else {
-        Index startIndex = master->getStartIndex();
         for (const BitVector *bv: vectors) {
-            assert(bv->getStartIndex() == startIndex);
+            assert(bv->getStartIndex() == 0u);
             assert(bv->size() == size);
         }
         std::vector<BitVector::OrParts> parts;
         parts.reserve(max_threads);
-        bits_per_thread = (bits_per_thread/ALIGNMENT_BITS) * ALIGNMENT_BITS;
+        uint32_t bits_per_thread = ((size/max_threads)/ALIGNMENT_BITS) * ALIGNMENT_BITS;
         Index offset = 0;
         for (uint32_t i(0); (i + 1) < max_threads; i++) {
             parts.emplace_back(vectors, offset, bits_per_thread);
diff --git a/searchlib/src/vespa/searchlib/common/bitvector.h b/searchlib/src/vespa/searchlib/common/bitvector.h
index c403ce0d8a9..30d01c4a58b 100644
--- a/searchlib/src/vespa/searchlib/common/bitvector.h
+++ b/searchlib/src/vespa/searchlib/common/bitvector.h
@@ -285,6 +285,7 @@ public:
     static void consider_enable_range_check();
     /**
      * Will slice the vectors and if possible use the thread bundle do the operation in parallell
+     * The result of the operation ends up in the first vector.
      * TODO: Extend to handle both AND/OR
      */
     static void parallellOr(vespalib::ThreadBundle & thread_bundle, vespalib::ConstArrayRef<BitVector *> vectors);
-- 
cgit v1.2.3