From 29fa616713a0b2be72707ad0193bf5112b55b91e Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Fri, 5 Jan 2024 13:51:54 +0000 Subject: Support the IN query operator in DirectMultiTermBlueprint. --- .../direct_multi_term_blueprint_test.cpp | 114 +++++++++++++++------ .../attribute/direct_multi_term_blueprint.cpp | 3 + .../attribute/direct_multi_term_blueprint.h | 6 +- .../attribute/direct_multi_term_blueprint.hpp | 39 ++++--- .../src/vespa/searchlib/attribute/in_term_search.h | 15 +++ .../vespa/searchlib/queryeval/dot_product_search.h | 3 + .../searchlib/queryeval/weighted_set_term_search.h | 3 + 7 files changed, 136 insertions(+), 47 deletions(-) create mode 100644 searchlib/src/vespa/searchlib/attribute/in_term_search.h diff --git a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp index 67b73f459c9..899ddaa3cc0 100644 --- a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp +++ b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -130,58 +131,77 @@ validate_posting_lists(const IDirectPostingStore& store) expect_has_bitvector_iterator(store, LookupKeyType(300)); } +enum OperatorType { + In, + WSet +}; + struct TestParam { + OperatorType op_type; CollectionType col_type; BasicType type; - TestParam(CollectionType col_type_in, BasicType type_in) : col_type(col_type_in), type(type_in) {} + TestParam(OperatorType op_type_in, CollectionType col_type_in, BasicType type_in) + : op_type(op_type_in), col_type(col_type_in), type(type_in) {} ~TestParam() = default; }; std::ostream& operator<<(std::ostream& os, const TestParam& param) { - os << param.col_type.asString() << "_" << param.type.asString(); + os << (param.op_type == OperatorType::In ? "in_" : "wset_") << param.col_type.asString() << "_" << param.type.asString(); return os; } +using SingleInBlueprintType = DirectMultiTermBlueprint; +using MultiInBlueprintType = DirectMultiTermBlueprint; +using SingleWSetBlueprintType = DirectMultiTermBlueprint; +using MultiWSetBlueprintType = DirectMultiTermBlueprint; + class DirectMultiTermBlueprintTest : public ::testing::TestWithParam { public: - using SingleValueBlueprintType = DirectMultiTermBlueprint; - using MultiValueBlueprintType = DirectMultiTermBlueprint; std::shared_ptr attr; + bool in_operator; + bool single_type; bool integer_type; - std::shared_ptr single_blueprint; - std::shared_ptr multi_blueprint; - queryeval::ComplexLeafBlueprint* blueprint; + std::shared_ptr blueprint; Blueprint::HitEstimate estimate; fef::TermFieldMatchData tfmd; fef::TermFieldMatchDataArray tfmda; DirectMultiTermBlueprintTest() : attr(), + in_operator(true), + single_type(true), integer_type(true), - single_blueprint(), - multi_blueprint(), blueprint(), tfmd(), tfmda() { tfmda.add(&tfmd); } + ~DirectMultiTermBlueprintTest() {} void setup(bool field_is_filter, bool need_term_field_match_data) { attr = make_attribute(GetParam().col_type, GetParam().type, field_is_filter); + in_operator = GetParam().op_type == OperatorType::In; + single_type = GetParam().col_type == CollectionType::SINGLE; integer_type = GetParam().type != BasicType::STRING; FieldSpec spec(field_name, field_id, fef::TermFieldHandle(), field_is_filter); const IDirectPostingStore* store; - if (GetParam().col_type == CollectionType::SINGLE) { + if (single_type) { auto real_store = attr->as_docid_posting_store(); ASSERT_TRUE(real_store); - single_blueprint = std::make_shared(spec, *attr, *real_store, 2); - blueprint = single_blueprint.get(); + if (in_operator) { + blueprint = std::make_shared(spec, *attr, *real_store, 2); + } else { + blueprint = std::make_shared(spec, *attr, *real_store, 2); + } store = real_store; } else { auto real_store = attr->as_docid_with_weight_posting_store(); ASSERT_TRUE(real_store); - multi_blueprint = std::make_shared(spec, *attr, *real_store, 2); - blueprint = multi_blueprint.get(); + if (in_operator) { + blueprint = std::make_shared(spec, *attr, *real_store, 2); + } else { + blueprint = std::make_shared(spec, *attr, *real_store, 2); + } store = real_store; } if (integer_type) { @@ -205,15 +225,26 @@ public: } } void add_term(int64_t term_value) { - if (single_blueprint) { - add_term_helper(*single_blueprint, term_value); + if (single_type) { + if (in_operator) { + add_term_helper(dynamic_cast(*blueprint), term_value); + } else { + add_term_helper(dynamic_cast(*blueprint), term_value); + } } else { - add_term_helper(*multi_blueprint, term_value); + if (in_operator) { + add_term_helper(dynamic_cast(*blueprint), term_value); + } else { + add_term_helper(dynamic_cast(*blueprint), term_value); + } } } std::unique_ptr create_leaf_search() const { return blueprint->createLeafSearch(tfmda, true); } + vespalib::string multi_term_iterator() const { + return in_operator ? "search::attribute::MultiTermOrFilterSearchImpl" : "search::queryeval::WeightedSetTermSearchImpl"; + } }; void @@ -241,33 +272,54 @@ expect_or_child(SearchIterator& itr, size_t child, const vespalib::string& exp_c INSTANTIATE_TEST_SUITE_P(DefaultInstantiation, DirectMultiTermBlueprintTest, - testing::Values(TestParam(CollectionType::SINGLE, BasicType::INT64), - TestParam(CollectionType::SINGLE, BasicType::STRING), - TestParam(CollectionType::WSET, BasicType::INT64), - TestParam(CollectionType::WSET, BasicType::STRING)), + testing::Values(TestParam(OperatorType::In, CollectionType::SINGLE, BasicType::INT64), + TestParam(OperatorType::In, CollectionType::SINGLE, BasicType::STRING), + TestParam(OperatorType::In, CollectionType::WSET, BasicType::INT64), + TestParam(OperatorType::In, CollectionType::WSET, BasicType::STRING), + TestParam(OperatorType::WSet, CollectionType::SINGLE, BasicType::INT64), + TestParam(OperatorType::WSet, CollectionType::SINGLE, BasicType::STRING), + TestParam(OperatorType::WSet, CollectionType::WSET, BasicType::INT64), + TestParam(OperatorType::WSet, CollectionType::WSET, BasicType::STRING)), testing::PrintToStringParamName()); -TEST_P(DirectMultiTermBlueprintTest, weight_iterators_used_for_none_filter_field) -{ +TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_none_filter_field) { setup(false, true); add_term(1); add_term(3); auto itr = create_leaf_search(); - EXPECT_THAT(itr->asString(), StartsWith("search::queryeval::WeightedSetTermSearchImpl")); + EXPECT_THAT(itr->asString(), StartsWith(multi_term_iterator())); expect_hits({10, 30, 31}, *itr); } -TEST_P(DirectMultiTermBlueprintTest, weight_iterators_used_instead_of_bitvectors_for_none_filter_field) +TEST_P(DirectMultiTermBlueprintTest, bitvectors_used_instead_of_btree_iterators_for_none_filter_field) { setup(false, true); + if (!in_operator) { + return; + } + add_term(1); + add_term(100); + auto itr = create_leaf_search(); + expect_or_iterator(*itr, 2); + expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); + expect_or_child(*itr, 1, multi_term_iterator()); + expect_hits(concat({10}, range(100, 128)), *itr); +} + +TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_instead_of_bitvectors_for_none_filter_field) +{ + setup(false, true); + if (in_operator) { + return; + } add_term(1); add_term(100); auto itr = create_leaf_search(); - EXPECT_THAT(itr->asString(), StartsWith("search::queryeval::WeightedSetTermSearchImpl")); + EXPECT_THAT(itr->asString(), StartsWith(multi_term_iterator())); expect_hits(concat({10}, range(100, 128)), *itr); } -TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_weight_iterators_used_for_filter_field) +TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_filter_field) { setup(true, true); add_term(1); @@ -278,7 +330,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_weight_iterators_used_for_fi expect_or_iterator(*itr, 3); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); expect_or_child(*itr, 1, "search::BitVectorIteratorStrictT"); - expect_or_child(*itr, 2, "search::queryeval::WeightedSetTermSearchImpl"); + expect_or_child(*itr, 2, multi_term_iterator()); expect_hits(concat({10, 30, 31}, concat(range(100, 128), range(300, 128))), *itr); } @@ -294,7 +346,7 @@ TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field) expect_hits(concat(range(100, 128), range(300, 128)), *itr); } -TEST_P(DirectMultiTermBlueprintTest, filter_iterator_used_for_filter_field_and_ranking_not_needed) +TEST_P(DirectMultiTermBlueprintTest, or_filter_iterator_used_for_filter_field_when_ranking_not_needed) { setup(true, false); add_term(1); @@ -304,7 +356,7 @@ TEST_P(DirectMultiTermBlueprintTest, filter_iterator_used_for_filter_field_and_r expect_hits({10, 30, 31}, *itr); } -TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_filter_iterator_used_for_filter_field_and_ranking_not_needed) +TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_or_filter_iterator_used_for_filter_field_when_ranking_not_needed) { setup(true, false); add_term(1); @@ -319,7 +371,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_filter_iterator_used_for_fil expect_hits(concat({10, 30, 31}, concat(range(100, 128), range(300, 128))), *itr); } -TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field_and_ranking_not_needed) +TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field_when_ranking_not_needed) { setup(true, false); add_term(100); diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.cpp b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.cpp index 12ae226895e..d7f9cd84d8d 100644 --- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.cpp @@ -4,12 +4,15 @@ #include "direct_multi_term_blueprint.hpp" #include "i_docid_posting_store.h" #include "i_docid_with_weight_posting_store.h" +#include "in_term_search.h" #include #include namespace search::attribute { +template class DirectMultiTermBlueprint; template class DirectMultiTermBlueprint; +template class DirectMultiTermBlueprint; template class DirectMultiTermBlueprint; template class DirectMultiTermBlueprint; diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h index 668034ecd3d..066b70481dc 100644 --- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h +++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h @@ -18,7 +18,7 @@ namespace search::attribute { /** * Blueprint used for multi-term query operators as InTerm, WeightedSetTerm or DotProduct - * over a multi-value attribute which supports the IDocidWithWeightPostingStore interface. + * over an attribute which supports the IDocidPostingStore or IDocidWithWeightPostingStore interface. * * This uses access to low-level posting lists, which speeds up query execution. */ @@ -44,7 +44,9 @@ private: std::vector>&& bitvectors, bool strict) const; - std::unique_ptr create_search_helper(const fef::TermFieldMatchDataArray& tfmda, bool strict, bool is_filter_search) const; + template + std::unique_ptr create_search_helper(const fef::TermFieldMatchDataArray& tfmda, + bool strict) const; public: DirectMultiTermBlueprint(const queryeval::FieldSpec &field, const IAttributeVector &iattr, const PostingStoreType &attr, size_t size_hint); diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp index bb6804f22f1..f195e97fee0 100644 --- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp +++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp @@ -88,8 +88,10 @@ DirectMultiTermBlueprint::combine_iterators(std::u } template +template std::unique_ptr -DirectMultiTermBlueprint::create_search_helper(const fef::TermFieldMatchDataArray& tfmda, bool strict, bool is_filter_search) const +DirectMultiTermBlueprint::create_search_helper(const fef::TermFieldMatchDataArray& tfmda, + bool strict) const { if (_terms.empty()) { return std::make_unique(); @@ -98,24 +100,30 @@ DirectMultiTermBlueprint::create_search_helper(con std::vector bitvectors; const size_t num_children = _terms.size(); btree_iterators.reserve(num_children); - bool use_bit_vector_when_available = is_filter_search || !_attr.has_always_btree_iterator(); - auto weights = create_iterators(btree_iterators, bitvectors, use_bit_vector_when_available, *tfmda[0], strict); - if (is_filter_search) { - auto filter = !btree_iterators.empty() ? attribute::MultiTermOrFilterSearch::create(std::move(btree_iterators)) : std::unique_ptr(); + auto& tfmd = *tfmda[0]; + bool use_bit_vector_when_available = filter_search || !_attr.has_always_btree_iterator(); + auto weights = create_iterators(btree_iterators, bitvectors, use_bit_vector_when_available, tfmd, strict); + if constexpr (filter_search || (!need_match_data && !SearchType::require_btree_iterators)) { + auto filter = !btree_iterators.empty() ? + (need_match_data ? + attribute::MultiTermOrFilterSearch::create(std::move(btree_iterators), tfmd) : + attribute::MultiTermOrFilterSearch::create(std::move(btree_iterators))) : + std::unique_ptr(); return combine_iterators(std::move(filter), std::move(bitvectors), strict); } bool field_is_filter = getState().fields()[0].isFilter(); - if constexpr (std::is_same_v) { + if constexpr (!filter_search && !SearchType::require_btree_iterators) { auto multi_term = !btree_iterators.empty() ? - SearchType::create(*tfmda[0], field_is_filter, std::move(weights), std::move(btree_iterators)) + SearchType::create(tfmd, field_is_filter, std::move(weights), std::move(btree_iterators)) : std::unique_ptr(); return combine_iterators(std::move(multi_term), std::move(bitvectors), strict); - } else { - // In this case we should only have weight iterators. + } else if constexpr (SearchType::require_btree_iterators) { + // In this case we should only have btree iterators. assert(btree_iterators.size() == _terms.size()); assert(weights.index() == 0); - return SearchType::create(*tfmda[0], field_is_filter, std::get<0>(weights).get(), std::move(btree_iterators)); + return SearchType::create(tfmd, field_is_filter, std::get<0>(weights).get(), std::move(btree_iterators)); } + return std::make_unique(); } template @@ -124,9 +132,12 @@ DirectMultiTermBlueprint::createLeafSearch(const f { assert(tfmda.size() == 1); assert(getState().numFields() == 1); - bool field_is_filter = getState().fields()[0].isFilter(); - bool is_filter_search = field_is_filter && tfmda[0]->isNotNeeded(); - return create_search_helper(tfmda, strict, is_filter_search); + bool need_match_data = !tfmda[0]->isNotNeeded(); + if (need_match_data) { + return create_search_helper(tfmda, strict); + } else { + return create_search_helper(tfmda, strict); + } } template @@ -135,7 +146,7 @@ DirectMultiTermBlueprint::createFilterSearch(bool { assert(getState().numFields() == 1); auto wrapper = std::make_unique(getState().numFields()); - wrapper->wrap(create_search_helper(wrapper->tfmda(), strict, true)); + wrapper->wrap(create_search_helper(wrapper->tfmda(), strict)); return wrapper; } diff --git a/searchlib/src/vespa/searchlib/attribute/in_term_search.h b/searchlib/src/vespa/searchlib/attribute/in_term_search.h new file mode 100644 index 00000000000..36776499e51 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/in_term_search.h @@ -0,0 +1,15 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +namespace search::attribute { + +/** + * Class used as template argument in DirectMultiTermBlueprint to configure it for the InTerm query operator. + */ +struct InTermSearch { + static constexpr bool filter_search = true; + static constexpr bool require_btree_iterators = false; +}; + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h b/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h index 2f1a4386a95..e49fcbcb5bc 100644 --- a/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h @@ -25,6 +25,9 @@ protected: DotProductSearch() {} public: + static constexpr bool filter_search = false; + static constexpr bool require_btree_iterators = true; + // TODO: use MultiSearch::Children to pass ownership static SearchIterator::UP create(const std::vector &children, search::fef::TermFieldMatchData &tmd, diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h index a497a647ac6..a9ab86e2c5f 100644 --- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h @@ -27,6 +27,9 @@ protected: WeightedSetTermSearch() = default; public: + static constexpr bool filter_search = false; + static constexpr bool require_btree_iterators = false; + // TODO: pass ownership with unique_ptr static SearchIterator::UP create(const std::vector &children, search::fef::TermFieldMatchData &tmd, -- cgit v1.2.3