diff options
Diffstat (limited to 'searchlib')
5 files changed, 133 insertions, 33 deletions
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp index e776b6a9379..6a2b306522d 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp @@ -25,13 +25,13 @@ populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hi docids->foreach_truebit([&](uint32_t docid) { if constexpr (is_string) { if constexpr (is_multivalue) { - attr.append(docid, std::to_string(spec.term_value), 1); + attr.append(docid, std::to_string(spec.term_value), random_int(1, 100)); } else { attr.update(docid, std::to_string(spec.term_value)); } } else { if constexpr (is_multivalue) { - attr.append(docid, spec.term_value, 1); + attr.append(docid, spec.term_value, random_int(1, 100)); } else { attr.update(docid, spec.term_value); } diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp index 15690fd71d5..516a819aae8 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp @@ -18,6 +18,7 @@ using search::query::Node; using search::query::SimpleDotProduct; using search::query::SimpleInTerm; using search::query::SimpleStringTerm; +using search::query::SimpleWandTerm; using search::query::SimpleWeightedSetTerm; using search::query::Weight; @@ -83,6 +84,17 @@ make_query_node(QueryOperator query_op, const TermVector& terms) res->addTerm(term, Weight(1)); } return res; + } else if (query_op == QueryOperator::ParallelWeakAnd) { + // These config values match the defaults (see WandItem.java): + uint32_t target_hits = 100; + int64_t score_threshold = 0; + double threshold_boost_factor = 1.0; + auto res = std::make_unique<SimpleWandTerm>(terms.size(), field_name, 0, Weight(1), + target_hits, score_threshold, threshold_boost_factor); + for (auto term : terms) { + res->addTerm(term, Weight(random_int(1, 100))); + } + return res; } return {}; } @@ -97,15 +109,18 @@ make_leaf_blueprint(const Node& node, BenchmarkSearchable& searchable, uint32_t return blueprint; } -template <typename BlueprintType> Blueprint::UP -make_intermediate_blueprint(BenchmarkSearchable& searchable, const TermVector& terms, uint32_t docid_limit) +make_intermediate_blueprint(std::unique_ptr<IntermediateBlueprint> blueprint, BenchmarkSearchable& searchable, const TermVector& terms, uint32_t docid_limit) { - auto blueprint = std::make_unique<BlueprintType>(); + auto* weak_and = blueprint->asWeakAnd(); for (auto term : terms) { SimpleStringTerm sterm(std::to_string(term), field_name, 0, Weight(1)); auto child = make_leaf_blueprint(sterm, searchable, docid_limit); - blueprint->addChild(std::move(child)); + if (weak_and != nullptr) { + weak_and->addTerm(std::move(child), random_int(1, 100)); + } else { + blueprint->addChild(std::move(child)); + } } blueprint->setDocIdLimit(docid_limit); blueprint->update_flow_stats(docid_limit); @@ -116,9 +131,12 @@ Blueprint::UP make_blueprint_helper(BenchmarkSearchable& searchable, QueryOperator query_op, const TermVector& terms, uint32_t docid_limit) { if (query_op == QueryOperator::And) { - return make_intermediate_blueprint<AndBlueprint>(searchable, terms, docid_limit); + return make_intermediate_blueprint(std::make_unique<AndBlueprint>(), searchable, terms, docid_limit); } else if (query_op == QueryOperator::Or) { - return make_intermediate_blueprint<OrBlueprint>(searchable, terms, docid_limit); + return make_intermediate_blueprint(std::make_unique<OrBlueprint>(), searchable, terms, docid_limit); + } else if (query_op == QueryOperator::WeakAnd) { + uint32_t target_hits = 100; + return make_intermediate_blueprint(std::make_unique<WeakAndBlueprint>(target_hits), searchable, terms, docid_limit); } else { auto query_node = make_query_node(query_op, terms); return make_leaf_blueprint(*query_node, searchable, docid_limit); diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp index b937f6a2f00..f17403bd33a 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp @@ -35,6 +35,8 @@ to_string(QueryOperator query_op) case QueryOperator::DotProduct: return "DotProduct"; case QueryOperator::And: return "And"; case QueryOperator::Or: return "Or"; + case QueryOperator::WeakAnd: return "WeakAnd"; + case QueryOperator::ParallelWeakAnd: return "ParallelWeakAnd"; } return "unknown"; } @@ -71,4 +73,11 @@ random_docids(uint32_t docid_limit, uint32_t count) return res; } +int32_t +random_int(int32_t a, int32_t b) +{ + std::uniform_int_distribution<int32_t> distr(a, b); + return distr(gen); +} + }
\ No newline at end of file diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.h b/searchlib/src/tests/queryeval/iterator_benchmark/common.h index 6d890910271..45fd82b091c 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/common.h +++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.h @@ -39,7 +39,9 @@ enum class QueryOperator { WeightedSet, DotProduct, And, - Or + Or, + WeakAnd, + ParallelWeakAnd }; vespalib::string to_string(QueryOperator query_op); @@ -78,4 +80,6 @@ public: BitVector::UP random_docids(uint32_t docid_limit, uint32_t count); +int32_t random_int(int32_t a, int32_t b); + } diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp index 33c2e067255..5b061697220 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp @@ -153,40 +153,87 @@ get_class_name(const auto& obj) return res; } +struct MatchLoopContext { + Blueprint::UP blueprint; + MatchData::UP match_data; + SearchIterator::UP iterator; + MatchLoopContext() : blueprint(), match_data(), iterator() {} + MatchLoopContext(Blueprint::UP blueprint_in, + MatchData::UP match_data_in, + SearchIterator::UP iterator_in) + : blueprint(std::move(blueprint_in)), + match_data(std::move(match_data_in)), + iterator(std::move(iterator_in)) + {} + void operator=(MatchLoopContext&& rhs) { + blueprint = std::move(rhs.blueprint); + match_data = std::move(rhs.match_data); + iterator = std::move(rhs.iterator); + } + ~MatchLoopContext(); +}; + +MatchLoopContext::~MatchLoopContext() = default; + +MatchLoopContext +make_match_loop_context(BenchmarkBlueprintFactory& factory, bool strict, uint32_t docid_limit) +{ + auto blueprint = factory.make_blueprint(); + assert(blueprint); + blueprint->basic_plan(strict, docid_limit); + blueprint->fetchPostings(ExecuteInfo::FULL); + // Note: All blueprints get the same TermFieldMatchData instance. + // This is OK as long as we don't do unpacking and only use 1 thread. + auto md = MatchData::makeTestInstance(1, 1); + auto itr = blueprint->createSearch(*md); + assert(itr); + return {std::move(blueprint), std::move(md), std::move(itr)}; +} + +template <bool do_unpack> BenchmarkResult -strict_search(Blueprint& blueprint, MatchData& md, uint32_t docid_limit) +strict_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit) { - auto itr = blueprint.createSearch(md); - assert(itr.get()); BenchmarkTimer timer(budget_sec); uint32_t hits = 0; + MatchLoopContext ctx; while (timer.has_budget()) { + ctx = make_match_loop_context(factory, true, docid_limit); + auto* itr = ctx.iterator.get(); timer.before(); hits = 0; itr->initRange(1, docid_limit); uint32_t docid = itr->seekFirst(1); + if constexpr (do_unpack) { + itr->unpack(docid); + } while (docid < docid_limit) { ++hits; docid = itr->seekNext(docid + 1); + if constexpr (do_unpack) { + itr->unpack(docid); + } } timer.after(); } - FlowStats flow(blueprint.estimate(), blueprint.cost(), blueprint.strict_cost()); - return {timer.min_time() * 1000.0, hits + 1, hits, flow, flow.strict_cost, flow.strict_cost, get_class_name(*itr), get_class_name(blueprint)}; + FlowStats flow(ctx.blueprint->estimate(), ctx.blueprint->cost(), ctx.blueprint->strict_cost()); + return {timer.min_time() * 1000.0, hits + 1, hits, flow, flow.strict_cost, flow.strict_cost, get_class_name(*ctx.iterator), get_class_name(*ctx.blueprint)}; } +template <bool do_unpack> BenchmarkResult -non_strict_search(Blueprint& blueprint, MatchData& md, uint32_t docid_limit, double filter_hit_ratio) +non_strict_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit, double filter_hit_ratio, bool force_strict) { - auto itr = blueprint.createSearch(md); - assert(itr.get()); BenchmarkTimer timer(budget_sec); uint32_t seeks = 0; uint32_t hits = 0; // This simulates a filter that is evaluated before this iterator. // The filter returns 'filter_hit_ratio' amount of the document corpus. uint32_t docid_skip = 1.0 / filter_hit_ratio; + MatchLoopContext ctx; while (timer.has_budget()) { + ctx = make_match_loop_context(factory, force_strict, docid_limit); + auto* itr = ctx.iterator.get(); timer.before(); seeks = 0; hits = 0; @@ -195,29 +242,35 @@ non_strict_search(Blueprint& blueprint, MatchData& md, uint32_t docid_limit, dou ++seeks; if (itr->seek(docid)) { ++hits; + if constexpr (do_unpack) { + itr->unpack(docid); + } } } timer.after(); } - FlowStats flow(blueprint.estimate(), blueprint.cost(), blueprint.strict_cost()); + FlowStats flow(ctx.blueprint->estimate(), ctx.blueprint->cost(), ctx.blueprint->strict_cost()); double actual_cost = flow.cost * filter_hit_ratio; // This is an attempt to calculate an alternative actual cost for strict / posting list iterators that are used in a non-strict context. double alt_cost = flow.strict_cost + 0.5 * filter_hit_ratio; - return {timer.min_time() * 1000.0, seeks, hits, flow, actual_cost, alt_cost, get_class_name(*itr), get_class_name(blueprint)}; + return {timer.min_time() * 1000.0, seeks, hits, flow, actual_cost, alt_cost, get_class_name(*ctx.iterator), get_class_name(*ctx.blueprint)}; } BenchmarkResult -benchmark_search(Blueprint::UP blueprint, uint32_t docid_limit, bool strict_context, bool force_strict, double filter_hit_ratio) +benchmark_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit, bool strict_context, bool force_strict, bool unpack_iterator, double filter_hit_ratio) { - blueprint->basic_plan(strict_context || force_strict, docid_limit); - blueprint->fetchPostings(ExecuteInfo::FULL); - // Note: All blueprints get the same TermFieldMatchData instance. - // This is OK as long as we don't do unpacking and only use 1 thread. - auto md = MatchData::makeTestInstance(1, 1); if (strict_context) { - return strict_search(*blueprint, *md, docid_limit); + if (unpack_iterator) { + return strict_search<true>(factory, docid_limit); + } else { + return strict_search<false>(factory, docid_limit); + } } else { - return non_strict_search(*blueprint, *md, docid_limit, filter_hit_ratio); + if (unpack_iterator) { + return non_strict_search<true>(factory, docid_limit, filter_hit_ratio, force_strict); + } else { + return non_strict_search<false>(factory, docid_limit, filter_hit_ratio, force_strict); + } } } @@ -230,18 +283,19 @@ to_string(bool val) void print_result_header() { - std::cout << "| chn | f_ratio | o_ratio | a_ratio | f.est | f.cost | f.scost | hits | seeks | time_ms | act_cost | alt_cost | ns_per_seek | ms_per_act_cost | ms_per_alt_cost | iterator | blueprint |" << std::endl; + std::cout << "| chn | f_ratio | o_ratio | a_ratio | f.est | f.cost | f.scost | hits | seeks | time_ms | act_cost | alt_cost | ns_per_seek | ms_per_act_cost | ms_per_alt_cost | iterator | blueprint |" << std::endl; } void print_result(const BenchmarkResult& res, uint32_t children, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs) { - std::cout << std::fixed << std::setprecision(4) + std::cout << std::fixed << std::setprecision(5) << "| " << std::setw(4) << children << " | " << std::setw(7) << filter_hit_ratio << " | " << std::setw(7) << op_hit_ratio << " | " << std::setw(7) << ((double) res.hits / (double) num_docs) << " | " << std::setw(6) << res.flow.estimate + << std::setprecision(4) << " | " << std::setw(7) << res.flow.cost << " | " << std::setw(7) << res.flow.strict_cost << " | " << std::setw(8) << res.hits @@ -274,11 +328,13 @@ struct BenchmarkCase { QueryOperator query_op; bool strict_context; bool force_strict; + bool unpack_iterator; BenchmarkCase(const FieldConfig& field_cfg_in, QueryOperator query_op_in, bool strict_context_in) : field_cfg(field_cfg_in), query_op(query_op_in), strict_context(strict_context_in), - force_strict(false) + force_strict(false), + unpack_iterator(false) {} vespalib::string to_string() const { return "op=" + search::queryeval::test::to_string(query_op) + ", cfg=" + field_cfg.to_string() + @@ -359,7 +415,7 @@ struct BenchmarkCaseSetup { child_counts(child_counts_in), filter_hit_ratios({1.0}), default_values_per_document(0), - filter_crossover_factor(1.0) + filter_crossover_factor(0.0) {} ~BenchmarkCaseSetup() {} }; @@ -373,6 +429,7 @@ struct BenchmarkSetup { std::vector<uint32_t> child_counts; std::vector<double> filter_hit_ratios; bool force_strict; + bool unpack_iterator; uint32_t default_values_per_document; double filter_crossover_factor; BenchmarkSetup(uint32_t num_docs_in, @@ -389,8 +446,9 @@ struct BenchmarkSetup { child_counts(child_counts_in), filter_hit_ratios({1.0}), force_strict(false), + unpack_iterator(false), default_values_per_document(0), - filter_crossover_factor(1.0) + filter_crossover_factor(0.0) {} BenchmarkSetup(uint32_t num_docs_in, const std::vector<FieldConfig>& field_cfgs_in, @@ -402,6 +460,7 @@ struct BenchmarkSetup { BenchmarkCaseSetup make_case_setup(const BenchmarkCase& bcase) const { BenchmarkCaseSetup res(num_docs, bcase, op_hit_ratios, child_counts); res.bcase.force_strict = force_strict; + res.bcase.unpack_iterator = unpack_iterator; res.default_values_per_document = default_values_per_document; if (!bcase.strict_context) { // Simulation of a filter is only relevant in a non-strict context. @@ -431,7 +490,8 @@ run_benchmark_case(const BenchmarkCaseSetup& setup) op_hit_ratio, children); for (double filter_hit_ratio : setup.filter_hit_ratios) { if (filter_hit_ratio * setup.filter_crossover_factor <= op_hit_ratio) { - auto res = benchmark_search(factory->make_blueprint(), setup.num_docs + 1, setup.bcase.strict_context, setup.bcase.force_strict, filter_hit_ratio); + auto res = benchmark_search(*factory, setup.num_docs + 1, + setup.bcase.strict_context, setup.bcase.force_strict, setup.bcase.unpack_iterator, filter_hit_ratio); print_result(res, children, op_hit_ratio, filter_hit_ratio, setup.num_docs); result.add(res); } @@ -545,6 +605,15 @@ TEST(IteratorBenchmark, analyze_complex_leaf_operators) run_benchmarks(setup); } +TEST(IteratorBenchmark, analyze_weak_and_operators) +{ + std::vector<FieldConfig> field_cfgs = {int32_wset_fs}; + std::vector<QueryOperator> query_ops = {QueryOperator::WeakAnd, QueryOperator::ParallelWeakAnd}; + BenchmarkSetup setup(num_docs, field_cfgs, query_ops, {true, false}, base_hit_ratios, {1, 2, 10, 100}); + setup.unpack_iterator = true; + run_benchmarks(setup); +} + TEST(IteratorBenchmark, term_benchmark) { BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::Term}, {true, false}, base_hit_ratios); |