diff options
author | Geir Storli <geirst@yahooinc.com> | 2024-01-18 10:55:54 +0000 |
---|---|---|
committer | Geir Storli <geirst@yahooinc.com> | 2024-01-18 10:55:54 +0000 |
commit | 76ae65285e66c316e8c2892bc531e183c56a691c (patch) | |
tree | fd1cf3136974e083899b1f39f5a11054038ae39b /searchlib | |
parent | 282d9ecb5381317e4f56dd799fce52b45147414d (diff) |
Stop using MultiTermOrFilterSearch for InTerm and WeightedSetTerm.
Benchmarking the IN operator (https://github.com/vespa-engine/system-test/tree/master/tests/performance/in_operator)
has shown that the heap-based implementation in WeightedSetTermSearchImpl
is on par when having fewer than 10 tokens/terms, and significantly better
the more tokens/terms the operator contains.
Diffstat (limited to 'searchlib')
7 files changed, 93 insertions, 80 deletions
diff --git a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp index 899ddaa3cc0..a94ca9d890e 100644 --- a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp +++ b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp @@ -156,12 +156,17 @@ using MultiInBlueprintType = DirectMultiTermBlueprint<IDocidWithWeightPostingSto using SingleWSetBlueprintType = DirectMultiTermBlueprint<IDocidPostingStore, WeightedSetTermSearch>; using MultiWSetBlueprintType = DirectMultiTermBlueprint<IDocidWithWeightPostingStore, WeightedSetTermSearch>; +vespalib::string iterator_unpack_docid_and_weights = "search::queryeval::WeightedSetTermSearchImpl<(search::queryeval::UnpackType)0"; +vespalib::string iterator_unpack_docid = "search::queryeval::WeightedSetTermSearchImpl<(search::queryeval::UnpackType)1"; +vespalib::string iterator_unpack_none = "search::queryeval::WeightedSetTermSearchImpl<(search::queryeval::UnpackType)2"; + class DirectMultiTermBlueprintTest : public ::testing::TestWithParam<TestParam> { public: std::shared_ptr<AttributeVector> attr; bool in_operator; bool single_type; bool integer_type; + bool field_is_filter; std::shared_ptr<ComplexLeafBlueprint> blueprint; Blueprint::HitEstimate estimate; fef::TermFieldMatchData tfmd; @@ -171,6 +176,7 @@ public: in_operator(true), single_type(true), integer_type(true), + field_is_filter(false), blueprint(), tfmd(), tfmda() @@ -178,7 +184,8 @@ public: tfmda.add(&tfmd); } ~DirectMultiTermBlueprintTest() {} - void setup(bool field_is_filter, bool need_term_field_match_data) { + void setup(bool field_is_filter_in, bool need_term_field_match_data) { + field_is_filter = field_is_filter_in; attr = make_attribute(GetParam().col_type, GetParam().type, field_is_filter); in_operator = GetParam().op_type == OperatorType::In; single_type = GetParam().col_type == CollectionType::SINGLE; @@ -242,8 +249,11 @@ public: std::unique_ptr<SearchIterator> create_leaf_search() const { return blueprint->createLeafSearch(tfmda, true); } - vespalib::string multi_term_iterator() const { - return in_operator ? "search::attribute::MultiTermOrFilterSearchImpl" : "search::queryeval::WeightedSetTermSearchImpl"; + vespalib::string resolve_iterator_with_unpack() const { + if (in_operator) { + return iterator_unpack_docid; + } + return field_is_filter ? iterator_unpack_docid : iterator_unpack_docid_and_weights; } }; @@ -287,11 +297,11 @@ TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_none_filter_field) add_term(1); add_term(3); auto itr = create_leaf_search(); - EXPECT_THAT(itr->asString(), StartsWith(multi_term_iterator())); + EXPECT_THAT(itr->asString(), StartsWith(resolve_iterator_with_unpack())); expect_hits({10, 30, 31}, *itr); } -TEST_P(DirectMultiTermBlueprintTest, bitvectors_used_instead_of_btree_iterators_for_none_filter_field) +TEST_P(DirectMultiTermBlueprintTest, bitvectors_used_instead_of_btree_iterators_for_in_operator) { setup(false, true); if (!in_operator) { @@ -302,11 +312,11 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_used_instead_of_btree_iterators_ auto itr = create_leaf_search(); expect_or_iterator(*itr, 2); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); - expect_or_child(*itr, 1, multi_term_iterator()); + expect_or_child(*itr, 1, iterator_unpack_docid); expect_hits(concat({10}, range(100, 128)), *itr); } -TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_instead_of_bitvectors_for_none_filter_field) +TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_instead_of_bitvectors_for_wset_operator) { setup(false, true); if (in_operator) { @@ -315,7 +325,7 @@ TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_instead_of_bitvectors_ add_term(1); add_term(100); auto itr = create_leaf_search(); - EXPECT_THAT(itr->asString(), StartsWith(multi_term_iterator())); + EXPECT_THAT(itr->asString(), StartsWith(iterator_unpack_docid_and_weights)); expect_hits(concat({10}, range(100, 128)), *itr); } @@ -330,7 +340,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_fil expect_or_iterator(*itr, 3); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); expect_or_child(*itr, 1, "search::BitVectorIteratorStrictT"); - expect_or_child(*itr, 2, multi_term_iterator()); + expect_or_child(*itr, 2, iterator_unpack_docid); expect_hits(concat({10, 30, 31}, concat(range(100, 128), range(300, 128))), *itr); } @@ -346,17 +356,17 @@ TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field) expect_hits(concat(range(100, 128), range(300, 128)), *itr); } -TEST_P(DirectMultiTermBlueprintTest, or_filter_iterator_used_for_filter_field_when_ranking_not_needed) +TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_filter_field_when_ranking_not_needed) { setup(true, false); add_term(1); add_term(3); auto itr = create_leaf_search(); - EXPECT_THAT(itr->asString(), StartsWith("search::attribute::MultiTermOrFilterSearchImpl")); + EXPECT_THAT(itr->asString(), StartsWith(iterator_unpack_none)); expect_hits({10, 30, 31}, *itr); } -TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_or_filter_iterator_used_for_filter_field_when_ranking_not_needed) +TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_filter_field_when_ranking_not_needed) { setup(true, false); add_term(1); @@ -367,7 +377,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_or_filter_iterator_used_for_ expect_or_iterator(*itr, 3); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); expect_or_child(*itr, 1, "search::BitVectorIteratorStrictT"); - expect_or_child(*itr, 2, "search::attribute::MultiTermOrFilterSearchImpl"); + expect_or_child(*itr, 2, iterator_unpack_none); expect_hits(concat({10, 30, 31}, concat(range(100, 128), range(300, 128))), *itr); } diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h index 066b70481dc..42651854599 100644 --- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h +++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h @@ -44,7 +44,7 @@ private: std::vector<std::unique_ptr<queryeval::SearchIterator>>&& bitvectors, bool strict) const; - template <bool filter_search, bool need_match_data> + template <bool filter_search> std::unique_ptr<queryeval::SearchIterator> create_search_helper(const fef::TermFieldMatchDataArray& tfmda, bool strict) const; diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp index f195e97fee0..6fc8ac63026 100644 --- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp +++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp @@ -88,7 +88,7 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::combine_iterators(std::u } template <typename PostingStoreType, typename SearchType> -template <bool filter_search, bool need_match_data> +template <bool filter_search> std::unique_ptr<queryeval::SearchIterator> DirectMultiTermBlueprint<PostingStoreType, SearchType>::create_search_helper(const fef::TermFieldMatchDataArray& tfmda, bool strict) const @@ -103,27 +103,18 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::create_search_helper(con auto& tfmd = *tfmda[0]; bool use_bit_vector_when_available = filter_search || !_attr.has_always_btree_iterator(); auto weights = create_iterators(btree_iterators, bitvectors, use_bit_vector_when_available, tfmd, strict); - if constexpr (filter_search || (!need_match_data && !SearchType::require_btree_iterators)) { - auto filter = !btree_iterators.empty() ? - (need_match_data ? - attribute::MultiTermOrFilterSearch::create(std::move(btree_iterators), tfmd) : - attribute::MultiTermOrFilterSearch::create(std::move(btree_iterators))) : - std::unique_ptr<SearchIterator>(); - return combine_iterators(std::move(filter), std::move(bitvectors), strict); - } bool field_is_filter = getState().fields()[0].isFilter(); - if constexpr (!filter_search && !SearchType::require_btree_iterators) { + if constexpr (!SearchType::require_btree_iterators) { auto multi_term = !btree_iterators.empty() ? - SearchType::create(tfmd, field_is_filter, std::move(weights), std::move(btree_iterators)) + SearchType::create(tfmd, (filter_search || field_is_filter), std::move(weights), std::move(btree_iterators)) : std::unique_ptr<SearchIterator>(); return combine_iterators(std::move(multi_term), std::move(bitvectors), strict); - } else if constexpr (SearchType::require_btree_iterators) { + } else { // In this case we should only have btree iterators. assert(btree_iterators.size() == _terms.size()); assert(weights.index() == 0); return SearchType::create(tfmd, field_is_filter, std::get<0>(weights).get(), std::move(btree_iterators)); } - return std::make_unique<queryeval::EmptySearch>(); } template <typename PostingStoreType, typename SearchType> @@ -132,12 +123,7 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::createLeafSearch(const f { assert(tfmda.size() == 1); assert(getState().numFields() == 1); - bool need_match_data = !tfmda[0]->isNotNeeded(); - if (need_match_data) { - return create_search_helper<SearchType::filter_search, true>(tfmda, strict); - } else { - return create_search_helper<SearchType::filter_search, false>(tfmda, strict); - } + return create_search_helper<SearchType::filter_search>(tfmda, strict); } template <typename PostingStoreType, typename SearchType> @@ -146,7 +132,7 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::createFilterSearch(bool { assert(getState().numFields() == 1); auto wrapper = std::make_unique<FilterWrapper>(getState().numFields()); - wrapper->wrap(create_search_helper<true, false>(wrapper->tfmda(), strict)); + wrapper->wrap(create_search_helper<true>(wrapper->tfmda(), strict)); return wrapper; } diff --git a/searchlib/src/vespa/searchlib/attribute/in_term_search.h b/searchlib/src/vespa/searchlib/attribute/in_term_search.h index 36776499e51..f9a48af2aba 100644 --- a/searchlib/src/vespa/searchlib/attribute/in_term_search.h +++ b/searchlib/src/vespa/searchlib/attribute/in_term_search.h @@ -2,14 +2,20 @@ #pragma once +#include <vespa/searchlib/queryeval/weighted_set_term_search.h> + namespace search::attribute { /** - * Class used as template argument in DirectMultiTermBlueprint to configure it for the InTerm query operator. + * Search iterator for an InTerm, sharing the implementation with WeightedSetTerm. + * + * The only difference is that an InTerm never requires unpacking of weights. */ -struct InTermSearch { +class InTermSearch : public queryeval::WeightedSetTermSearch { +public: + // Whether this iterator is considered a filter, independent of attribute vector settings (ref. rank: filter). + // Setting this to true ensures that weights are never unpacked. static constexpr bool filter_search = true; - static constexpr bool require_btree_iterators = false; }; } diff --git a/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.h b/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.h index 1e8227c3007..f9357081d74 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.h +++ b/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.h @@ -9,8 +9,7 @@ namespace search::attribute { /** * Filter iterator on top of low-level posting list iterators or regular search iterators with OR semantics. * - * Used during calculation of global filter for InTerm, WeightedSetTerm, DotProduct and WandTerm, - * or when ranking is not needed for InTerm and WeightedSetTerm. + * Used during calculation of global filter for DotProduct and WandTerm. */ class MultiTermOrFilterSearch : public queryeval::SearchIterator { diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp index 0929f80a8f0..0c57b21aba6 100644 --- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp @@ -14,7 +14,13 @@ using vespalib::ObjectVisitor; namespace search::queryeval { -template <typename HEAP, typename IteratorPack> +enum class UnpackType { + DocidAndWeights, + Docid, + None +}; + +template <UnpackType unpack_type, typename HEAP, typename IteratorPack> class WeightedSetTermSearchImpl : public WeightedSetTermSearch { private: @@ -47,7 +53,6 @@ private: ref_t *_data_stash; ref_t *_data_end; IteratorPack _children; - bool _need_match_data; void seek_child(ref_t child, uint32_t docId) { _termPos[child] = _children.seek(child, docId); @@ -64,7 +69,6 @@ private: public: WeightedSetTermSearchImpl(fef::TermFieldMatchData &tmd, - bool field_is_filter, std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, IteratorPack &&iteratorPack) : _tmd(tmd), @@ -77,8 +81,7 @@ public: _data_begin(nullptr), _data_stash(nullptr), _data_end(nullptr), - _children(std::move(iteratorPack)), - _need_match_data(!field_is_filter && !_tmd.isNotNeeded()) + _children(std::move(iteratorPack)) { HEAP::require_left_heap(); assert(_children.size() > 0); @@ -89,7 +92,7 @@ public: } _data_begin = &_data_space[0]; _data_end = _data_begin + _data_space.size(); - if (_need_match_data) { + if constexpr (unpack_type == UnpackType::DocidAndWeights) { _tmd.reservePositions(_children.size()); } } @@ -115,7 +118,7 @@ public: } void doUnpack(uint32_t docId) override { - if (_need_match_data) { + if constexpr (unpack_type == UnpackType::DocidAndWeights) { _tmd.reset(docId); pop_matching_children(docId); std::sort(_data_stash, _data_end, _cmpWeight); @@ -124,7 +127,7 @@ public: pos.setElementWeight(_weights[*ptr]); _tmd.appendPosition(pos); } - } else { + } else if constexpr (unpack_type == UnpackType::Docid) { _tmd.resetOnlyDocId(docId); } } @@ -162,68 +165,76 @@ public: } }; -//----------------------------------------------------------------------------- +template <typename HeapType, typename IteratorPackType> +SearchIterator::UP +create_helper(fef::TermFieldMatchData& tmd, + bool is_filter_search, + std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, + IteratorPackType&& pack) +{ + bool match_data_needed = !tmd.isNotNeeded(); + if (is_filter_search && match_data_needed) { + return std::make_unique<WeightedSetTermSearchImpl<UnpackType::Docid, HeapType, IteratorPackType>> + (tmd, std::move(weights), std::move(pack)); + } else if (!is_filter_search && match_data_needed) { + return std::make_unique<WeightedSetTermSearchImpl<UnpackType::DocidAndWeights, HeapType, IteratorPackType>> + (tmd, std::move(weights), std::move(pack)); + } else { + return std::make_unique<WeightedSetTermSearchImpl<UnpackType::None, HeapType, IteratorPackType>> + (tmd, std::move(weights), std::move(pack)); + } +} SearchIterator::UP WeightedSetTermSearch::create(const std::vector<SearchIterator *> &children, TermFieldMatchData &tmd, - bool field_is_filter, + bool is_filter_search, const std::vector<int32_t> &weights, fef::MatchData::UP match_data) { - using ArrayHeapImpl = WeightedSetTermSearchImpl<vespalib::LeftArrayHeap, SearchIteratorPack>; - using HeapImpl = WeightedSetTermSearchImpl<vespalib::LeftHeap, SearchIteratorPack>; - - if (tmd.isNotNeeded()) { - return attribute::MultiTermOrFilterSearch::create(children, std::move(match_data)); - } - if (children.size() < 128) { - return SearchIterator::UP(new ArrayHeapImpl(tmd, field_is_filter, std::cref(weights), SearchIteratorPack(children, std::move(match_data)))); + return create_helper<vespalib::LeftArrayHeap, SearchIteratorPack>(tmd, is_filter_search, std::cref(weights), + SearchIteratorPack(children, std::move(match_data))); } - return SearchIterator::UP(new HeapImpl(tmd, field_is_filter, std::cref(weights), SearchIteratorPack(children, std::move(match_data)))); + return create_helper<vespalib::LeftHeap, SearchIteratorPack>(tmd, is_filter_search, std::cref(weights), + SearchIteratorPack(children, std::move(match_data))); } -//----------------------------------------------------------------------------- - namespace { template <typename IteratorType, typename IteratorPackType> SearchIterator::UP -create_helper(fef::TermFieldMatchData& tmd, - bool field_is_filter, - std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, - std::vector<IteratorType>&& iterators) +create_helper_resolve_pack(fef::TermFieldMatchData& tmd, + bool is_filter_search, + std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, + std::vector<IteratorType>&& iterators) { - using ArrayHeapImpl = WeightedSetTermSearchImpl<vespalib::LeftArrayHeap, IteratorPackType>; - using HeapImpl = WeightedSetTermSearchImpl<vespalib::LeftHeap, IteratorPackType>; - if (iterators.size() < 128) { - return SearchIterator::UP(new ArrayHeapImpl(tmd, field_is_filter, std::move(weights), IteratorPackType(std::move(iterators)))); + return create_helper<vespalib::LeftArrayHeap, IteratorPackType>(tmd, is_filter_search, std::move(weights), + IteratorPackType(std::move(iterators))); } - return SearchIterator::UP(new HeapImpl(tmd, field_is_filter, std::move(weights), IteratorPackType(std::move(iterators)))); + return create_helper<vespalib::LeftHeap, IteratorPackType>(tmd, is_filter_search, std::move(weights), + IteratorPackType(std::move(iterators))); } } SearchIterator::UP WeightedSetTermSearch::create(fef::TermFieldMatchData& tmd, - bool field_is_filter, + bool is_filter_search, std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, std::vector<DocidIterator>&& iterators) { - return create_helper<DocidIterator, DocidIteratorPack>(tmd, field_is_filter, std::move(weights), std::move(iterators)); + return create_helper_resolve_pack<DocidIterator, DocidIteratorPack>(tmd, is_filter_search, std::move(weights), std::move(iterators)); } SearchIterator::UP WeightedSetTermSearch::create(fef::TermFieldMatchData &tmd, - bool field_is_filter, + bool is_filter_search, std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, std::vector<DocidWithWeightIterator> &&iterators) { - return create_helper<DocidWithWeightIterator, DocidWithWeightIteratorPack>(tmd, field_is_filter, std::move(weights), std::move(iterators)); + return create_helper_resolve_pack<DocidWithWeightIterator, DocidWithWeightIteratorPack>(tmd, is_filter_search, std::move(weights), std::move(iterators)); } -//----------------------------------------------------------------------------- - } diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h index a9ab86e2c5f..7e47928fb90 100644 --- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h @@ -18,8 +18,7 @@ namespace search::queryeval { class Blueprint; /** - * Search iterator for a weighted set, based on a set of child search - * iterators. + * Search iterator for a WeightedSetTerm, based on a set of child search iterators. */ class WeightedSetTermSearch : public SearchIterator { @@ -27,23 +26,25 @@ protected: WeightedSetTermSearch() = default; public: + // Whether this iterator is considered a filter, independent of attribute vector settings (ref rank: filter). static constexpr bool filter_search = false; + // Whether this iterator requires btree iterators for all tokens/terms used by the operator. static constexpr bool require_btree_iterators = false; // TODO: pass ownership with unique_ptr static SearchIterator::UP create(const std::vector<SearchIterator *> &children, search::fef::TermFieldMatchData &tmd, - bool field_is_filter, + bool is_filter_search, const std::vector<int32_t> &weights, fef::MatchData::UP match_data); static SearchIterator::UP create(search::fef::TermFieldMatchData& tmd, - bool field_is_filter, + bool is_filter_search, std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, std::vector<DocidIterator>&& iterators); static SearchIterator::UP create(search::fef::TermFieldMatchData &tmd, - bool field_is_filter, + bool is_filter_search, std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, std::vector<DocidWithWeightIterator> &&iterators); |