summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorHåvard Pettersen <havardpe@yahooinc.com>2024-01-31 13:32:10 +0000
committerHåvard Pettersen <havardpe@yahooinc.com>2024-01-31 13:32:10 +0000
commit8aec9ee9f53591991e7706c5adc920540a45fe2d (patch)
treedd92c656aa6d59bbe7b9bf905b76457db829263f /searchlib
parentad797f8f6b74d9e1079e08e35d11f1386d5d7660 (diff)
simple heap-based strict OR implementation
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/queryeval/or_speed/or_speed_test.cpp61
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/orlikesearch.h81
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/orsearch.cpp42
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/orsearch.h4
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/unpackinfo.h1
5 files changed, 161 insertions, 28 deletions
diff --git a/searchlib/src/tests/queryeval/or_speed/or_speed_test.cpp b/searchlib/src/tests/queryeval/or_speed/or_speed_test.cpp
index 8662281f2ef..543441565a2 100644
--- a/searchlib/src/tests/queryeval/or_speed/or_speed_test.cpp
+++ b/searchlib/src/tests/queryeval/or_speed/or_speed_test.cpp
@@ -20,12 +20,22 @@ using search::queryeval::OrSearch;
using search::queryeval::UnpackInfo;
using TMD = search::fef::TermFieldMatchData;
using vespalib::make_string_short::fmt;
+using Impl = OrSearch::StrictImpl;
double budget = 0.25;
size_t bench_docs = 1000;
constexpr uint32_t default_seed = 5489u;
std::mt19937 gen(default_seed);
+const char *impl_str(Impl impl) {
+ if (impl == Impl::PLAIN) { return "plain"; }
+ if (impl == Impl::HEAP) { return " heap"; }
+ return "unknown";
+}
+const char *bool_str(bool bit) { return bit ? "true" : "false"; }
+const char *leaf_str(bool array) { return array ? "A" : "B"; }
+const char *opt_str(bool optimize) { return optimize ? "OPT" : "std"; }
+
BitVector::UP make_bitvector(size_t size, size_t num_bits) {
EXPECT_GT(size, num_bits);
auto bv = BitVector::create(size);
@@ -120,7 +130,7 @@ struct OrSetup {
return BitVectorIterator::create(child_hits[i].get(), *match_data[i], true);
}
}
- SearchIterator::UP make_or(bool optimize = false) {
+ SearchIterator::UP make_or(Impl impl, bool optimize) {
assert(!child_hits.empty());
if (child_hits.size() == 1) {
// use child directly if there is only one
@@ -140,7 +150,7 @@ struct OrSetup {
}
}
}
- auto result = OrSearch::create(std::move(children), true, unpack);
+ auto result = OrSearch::create(std::move(children), true, unpack, impl);
if (optimize) {
result = queryeval::MultiBitVectorIteratorBase::optimize(std::move(result));
}
@@ -152,8 +162,8 @@ struct OrSetup {
}
return *this;
}
- std::pair<size_t,double> bm_search_ms(bool optimized = false) {
- auto search_up = make_or(optimized);
+ std::pair<size_t,double> bm_search_ms(Impl impl, bool optimized) {
+ auto search_up = make_or(impl, optimized);
SearchIterator &search = *search_up;
size_t hits = 0;
BenchmarkTimer timer(budget);
@@ -205,8 +215,8 @@ struct OrSetup {
tmd->resetOnlyDocId(0);
}
}
- void verify_seek_unpack(bool check_skipped_unpack = false, bool optimized = false) {
- auto search_up = make_or(optimized);
+ void verify_seek_unpack(Impl impl, bool check_skipped_unpack, bool optimized) {
+ auto search_up = make_or(impl, optimized);
SearchIterator &search = *search_up;
for (size_t unpack_nth: {1, 3}) {
for (size_t skip: {1, 31}) {
@@ -241,7 +251,7 @@ OrSetup::~OrSetup() = default;
TEST(OrSpeed, array_iterator_seek_unpack) {
OrSetup setup(100);
setup.add(10, true, true);
- setup.verify_seek_unpack();
+ setup.verify_seek_unpack(Impl::PLAIN, true, false);
}
TEST(OrSpeed, or_seek_unpack) {
@@ -250,8 +260,6 @@ TEST(OrSpeed, or_seek_unpack) {
for (int unpack: {0,1,2}) {
OrSetup setup(1000);
size_t part = setup.per_child(target, 13);
- SCOPED_TRACE(fmt("optimize: %s, part: %zu, unpack: %d",
- optimize ? "true" : "false", part, unpack));
for (size_t i = 0; i < 13; ++i) {
bool use_array = (i/2)%2 == 0;
bool need_unpack = unpack > 0;
@@ -260,7 +268,11 @@ TEST(OrSpeed, or_seek_unpack) {
}
setup.add(part, use_array, need_unpack);
}
- setup.verify_seek_unpack(true, optimize);
+ for (auto impl: {Impl::PLAIN, Impl::HEAP}) {
+ SCOPED_TRACE(fmt("impl: %s, optimize: %s, part: %zu, unpack: %d",
+ impl_str(impl), bool_str(optimize), part, unpack));
+ setup.verify_seek_unpack(impl, true, optimize);
+ }
}
}
}
@@ -274,23 +286,30 @@ TEST(OrSpeed, bm_array_vs_bitvector) {
setup.add(hits, false, false);
for (bool use_array: {false, true}) {
setup.use_array[0] = use_array;
- auto result = setup.bm_search_ms();
- fprintf(stderr, "LEAF(%s): (one of %4zu) hits: %8zu, time: %10.3f ms, time per hits: %10.3f ns\n", use_array
- ? " array"
- : "bitvector", one_of, result.first, result.second, (result.second * 1000.0 * 1000.0) / result.first);
+ auto result = setup.bm_search_ms(Impl::PLAIN, false);
+ fprintf(stderr, "LEAF(%s): (one of %4zu) hits: %8zu, time: %10.3f ms, time per hits: %10.3f ns\n",
+ leaf_str(use_array), one_of, result.first, result.second, (result.second * 1000.0 * 1000.0) / result.first);
}
}
}
TEST(OrSpeed, bm_strict_or) {
for (double target: {0.001, 0.01, 0.1, 1.0, 10.0}) {
- for (size_t child_cnt: {5, 10, 100, 1000}) {
- OrSetup setup(bench_docs);
- size_t part = setup.per_child(target, child_cnt);
- if (part > 0) {
- auto result = setup.prepare_bm(child_cnt, part).bm_search_ms();
- fprintf(stderr, "OR bench(children: %4zu, hits_per_child: %8zu): total_hits: %8zu, time: %10.3f ms, time per hits: %10.3f ns\n",
- child_cnt, part, result.first, result.second, (result.second * 1000.0 * 1000.0) / result.first);
+ for (size_t child_cnt: {2, 5, 10, 100, 1000}) {
+ for (bool optimize: {false, true}) {
+ OrSetup setup(bench_docs);
+ size_t part = setup.per_child(target, child_cnt);
+ bool use_array = setup.should_use_array(part);
+ if (part > 0 && (!use_array || !optimize)) {
+ setup.prepare_bm(child_cnt, part);
+ for (auto impl: {Impl::PLAIN, Impl::HEAP}) {
+ auto result = setup.bm_search_ms(impl, optimize);
+ fprintf(stderr, "OR bench(%s, %s, children: %4zu, hits_per_child: %8zu %s): "
+ "total_hits: %8zu, time: %10.3f ms, time per hits: %10.3f ns\n",
+ impl_str(impl), opt_str(optimize), child_cnt, part, leaf_str(use_array),
+ result.first, result.second, (result.second * 1000.0 * 1000.0) / result.first);
+ }
+ }
}
}
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/orlikesearch.h b/searchlib/src/vespa/searchlib/queryeval/orlikesearch.h
index 9c67d2c6a01..906f51d5bec 100644
--- a/searchlib/src/vespa/searchlib/queryeval/orlikesearch.h
+++ b/searchlib/src/vespa/searchlib/queryeval/orlikesearch.h
@@ -66,5 +66,86 @@ private:
Unpack _unpacker;
};
+template <typename Unpack, typename HEAP, typename ref_t>
+class StrictHeapOrSearch : public OrSearch
+{
+private:
+ struct Less {
+ const uint32_t *child_docid;
+ explicit Less(const std::vector<uint32_t> &cd) : child_docid(cd.data()) {}
+ bool operator()(const ref_t &a, const ref_t &b) const {
+ return (child_docid[a] < child_docid[b]);
+ }
+ };
+
+ std::vector<uint32_t> _child_docid;
+ std::vector<ref_t> _data_space;
+ ref_t *_data_begin;
+ ref_t *_data_end;
+ Unpack _unpacker;
+
+ void init_data() {
+ _data_space.resize(getChildren().size());
+ for (size_t i = 0; i < getChildren().size(); ++i) {
+ _data_space[i] = i;
+ }
+ _data_begin = _data_space.data();
+ _data_end = _data_begin + _data_space.size();
+ }
+ void onRemove(size_t index) final {
+ _unpacker.onRemove(index);
+ _child_docid.erase(_child_docid.begin() + index);
+ init_data();
+ }
+ void onInsert(size_t index) final {
+ _unpacker.onInsert(index);
+ _child_docid.insert(_child_docid.begin() + index, getChildren()[index]->getDocId());
+ init_data();
+ }
+ void seek_child(ref_t child, uint32_t docid) {
+ getChildren()[child]->doSeek(docid);
+ _child_docid[child] = getChildren()[child]->getDocId();
+ }
+
+public:
+ StrictHeapOrSearch(Children children, const Unpack &unpacker)
+ : OrSearch(std::move(children)),
+ _child_docid(getChildren().size()),
+ _data_space(),
+ _data_begin(nullptr),
+ _data_end(nullptr),
+ _unpacker(unpacker)
+ {
+ HEAP::require_left_heap();
+ init_data();
+ }
+ void initRange(uint32_t begin, uint32_t end) final {
+ OrSearch::initRange(begin, end);
+ for (size_t i = 0; i < getChildren().size(); ++i) {
+ _child_docid[i] = getChildren()[i]->getDocId();
+ }
+ for (size_t i = 2; i <= _data_space.size(); ++i) {
+ HEAP::push(_data_begin, _data_begin + i, Less(_child_docid));
+ }
+ }
+ void doSeek(uint32_t docid) final {
+ while (_child_docid[HEAP::front(_data_begin, _data_end)] < docid) {
+ seek_child(HEAP::front(_data_begin, _data_end), docid);
+ HEAP::adjust(_data_begin, _data_end, Less(_child_docid));
+ }
+ setDocId(_child_docid[HEAP::front(_data_begin, _data_end)]);
+ }
+ void doUnpack(uint32_t docid) final {
+ _unpacker.each([&](ref_t child) {
+ if (__builtin_expect(_child_docid[child] == docid, false)) {
+ getChildren()[child]->doUnpack(docid);
+ }
+ }, getChildren().size());
+ }
+ bool needUnpack(size_t index) const final {
+ return _unpacker.needUnpack(index);
+ }
+ Trinary is_strict() const final { return Trinary::True; }
+};
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/orsearch.cpp b/searchlib/src/vespa/searchlib/queryeval/orsearch.cpp
index 7cdb13fc159..f7ab9af072f 100644
--- a/searchlib/src/vespa/searchlib/queryeval/orsearch.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/orsearch.cpp
@@ -4,11 +4,15 @@
#include "orlikesearch.h"
#include "termwise_helper.h"
#include <vespa/searchlib/common/bitvector.h>
+#include <vespa/vespalib/util/left_right_heap.h>
namespace search::queryeval {
namespace {
+using vespalib::LeftArrayHeap;
+using vespalib::LeftHeap;
+
class FullUnpack
{
public:
@@ -24,6 +28,11 @@ public:
}
}
}
+ void each(auto &&f, size_t n) {
+ for (size_t i = 0; i < n; ++i) {
+ f(i);
+ }
+ }
void onRemove(size_t index) { (void) index; }
void onInsert(size_t index) { (void) index; }
bool needUnpack(size_t index) const { (void) index; return true; }
@@ -47,6 +56,9 @@ public:
}
}, children.size());
}
+ void each(auto &&f, size_t n) {
+ _unpackInfo.each(std::forward<decltype(f)>(f), n);
+ }
void onRemove(size_t index) {
_unpackInfo.remove(index);
}
@@ -60,6 +72,20 @@ private:
UnpackInfo _unpackInfo;
};
+template <typename Unpack>
+SearchIterator::UP create_strict_or(std::vector<SearchIterator::UP> children, const Unpack &unpack, OrSearch::StrictImpl strict_impl) {
+ if (strict_impl == OrSearch::StrictImpl::HEAP) {
+ if (children.size() <= 0xff) {
+ return std::make_unique<StrictHeapOrSearch<Unpack,LeftArrayHeap,uint8_t>>(std::move(children), unpack);
+ } else if (children.size() <= 0xffff) {
+ return std::make_unique<StrictHeapOrSearch<Unpack,LeftHeap,uint16_t>>(std::move(children), unpack);
+ } else {
+ return std::make_unique<StrictHeapOrSearch<Unpack,LeftHeap,uint32_t>>(std::move(children), unpack);
+ }
+ }
+ return std::make_unique<OrLikeSearch<true,Unpack>>(std::move(children), unpack);
+}
+
}
BitVector::UP
@@ -82,21 +108,23 @@ SearchIterator::UP
OrSearch::create(ChildrenIterators children, bool strict) {
UnpackInfo unpackInfo;
unpackInfo.forceAll();
- return create(std::move(children), strict, unpackInfo);
+ return create(std::move(children), strict, unpackInfo, StrictImpl::PLAIN);
}
SearchIterator::UP
OrSearch::create(ChildrenIterators children, bool strict, const UnpackInfo & unpackInfo) {
+ return create(std::move(children), strict, unpackInfo, StrictImpl::PLAIN);
+}
+
+SearchIterator::UP
+OrSearch::create(ChildrenIterators children, bool strict, const UnpackInfo & unpackInfo, StrictImpl strict_impl) {
if (strict) {
if (unpackInfo.unpackAll()) {
- using MyOr = OrLikeSearch<true, FullUnpack>;
- return std::make_unique<MyOr>(std::move(children), FullUnpack());
+ return create_strict_or(std::move(children), FullUnpack(), strict_impl);
} else if(unpackInfo.empty()) {
- using MyOr = OrLikeSearch<true, NoUnpack>;
- return std::make_unique<MyOr>(std::move(children), NoUnpack());
+ return create_strict_or(std::move(children), NoUnpack(), strict_impl);
} else {
- using MyOr = OrLikeSearch<true, SelectiveUnpack>;
- return std::make_unique<MyOr>(std::move(children), SelectiveUnpack(unpackInfo));
+ return create_strict_or(std::move(children), SelectiveUnpack(unpackInfo), strict_impl);
}
} else {
if (unpackInfo.unpackAll()) {
diff --git a/searchlib/src/vespa/searchlib/queryeval/orsearch.h b/searchlib/src/vespa/searchlib/queryeval/orsearch.h
index d56fb0e99b4..02ee80f1bd8 100644
--- a/searchlib/src/vespa/searchlib/queryeval/orsearch.h
+++ b/searchlib/src/vespa/searchlib/queryeval/orsearch.h
@@ -15,8 +15,12 @@ class OrSearch : public MultiSearch
public:
using Children = MultiSearch::Children;
+ enum class StrictImpl { PLAIN, HEAP };
+
static SearchIterator::UP create(ChildrenIterators children, bool strict);
static SearchIterator::UP create(ChildrenIterators children, bool strict, const UnpackInfo & unpackInfo);
+ static SearchIterator::UP create(ChildrenIterators children, bool strict, const UnpackInfo & unpackInfo,
+ StrictImpl strict_impl);
std::unique_ptr<BitVector> get_hits(uint32_t begin_id) override;
void or_hits_into(BitVector &result, uint32_t begin_id) override;
diff --git a/searchlib/src/vespa/searchlib/queryeval/unpackinfo.h b/searchlib/src/vespa/searchlib/queryeval/unpackinfo.h
index 0ec8d07e19e..64efd66b8c0 100644
--- a/searchlib/src/vespa/searchlib/queryeval/unpackinfo.h
+++ b/searchlib/src/vespa/searchlib/queryeval/unpackinfo.h
@@ -59,6 +59,7 @@ struct NoUnpack {
(void) docid;
(void) search;
}
+ void each(auto &&f, size_t n) { (void) f; (void) n; }
void onRemove(size_t index) { (void) index; }
void onInsert(size_t index) { (void) index; }
bool needUnpack(size_t index) const { (void) index; return false; }