diff options
author | Geir Storli <geirst@vespa.ai> | 2024-04-11 16:19:36 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-11 16:19:36 +0200 |
commit | 3e1d610373d83625ca5c6dc5b2d19e6fe0fe89c4 (patch) | |
tree | 6d2c965ba4c11c830ead1cec83bafbc4e3f0193d /searchlib/src/tests | |
parent | 9c2fd2e3d72bac09e5f5ca772ac144b46bbf88dd (diff) | |
parent | cccae3743c8e9e0003f79c68887ae6e2b879f332 (diff) |
Merge pull request #30872 from vespa-engine/geirst/more-low-level-in-operator-perf-tests
Support populating attribute where docid sets matching terms/children…
Diffstat (limited to 'searchlib/src/tests')
7 files changed, 119 insertions, 56 deletions
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp index 6a2b306522d..5a0bda49b98 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp @@ -18,30 +18,71 @@ namespace { template <typename AttributeType, bool is_string, bool is_multivalue> void +update_attribute(AttributeType& attr, uint32_t docid, uint32_t value) +{ + if constexpr (is_string) { + if constexpr (is_multivalue) { + attr.append(docid, std::to_string(value), random_int(1, 100)); + } else { + attr.update(docid, std::to_string(value)); + } + } else { + if constexpr (is_multivalue) { + attr.append(docid, value, random_int(1, 100)); + } else { + attr.update(docid, value); + } + } +} + +template <typename AttributeType, bool is_string, bool is_multivalue> +void populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hit_specs) { for (auto spec : hit_specs) { auto docids = random_docids(docid_limit, spec.num_hits); docids->foreach_truebit([&](uint32_t docid) { - if constexpr (is_string) { - if constexpr (is_multivalue) { - attr.append(docid, std::to_string(spec.term_value), random_int(1, 100)); - } else { - attr.update(docid, std::to_string(spec.term_value)); - } - } else { - if constexpr (is_multivalue) { - attr.append(docid, spec.term_value, random_int(1, 100)); - } else { - attr.update(docid, spec.term_value); - } - } + update_attribute<AttributeType, is_string, is_multivalue>(attr, docid, spec.term_value); }); } } +template <typename AttributeType, bool is_string, bool is_multivalue> +void +populate_attribute(AttributeType& attr, const std::vector<uint32_t>& values) +{ + for (uint32_t docid = 1; docid < values.size(); ++docid) { + uint32_t value = values[docid]; + if (value == 0) { + continue; + } + update_attribute<AttributeType, is_string, is_multivalue>(attr, docid, value); + } +} + +template <typename AttributeType, bool is_string, bool is_multivalue> +void +populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hit_specs, bool disjunct_terms) +{ + if (disjunct_terms) { + // Ensure that each term in HitSpecs is matched by a disjunct (random) subset of docids. + std::vector<uint32_t> values(docid_limit, 0); + uint32_t docid = 1; + for (auto spec : hit_specs) { + assert((docid + spec.num_hits) <= docid_limit); + std::fill_n(values.begin() + docid, spec.num_hits, spec.term_value); + docid += spec.num_hits; + } + std::shuffle(values.begin() + 1, values.end(), get_gen()); + populate_attribute<AttributeType, is_string, is_multivalue>(attr, values); + } else { + // For each term in HitSpecs we draw a new random set of docids that will match this term value. + populate_attribute<AttributeType, is_string, is_multivalue>(attr, docid_limit, hit_specs); + } +} + AttributeVector::SP -make_attribute(const Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs) +make_attribute(const Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs, bool disjunct_terms) { auto attr = AttributeFactory::createAttribute(field_name, cfg); attr->addReservedDoc(); @@ -52,16 +93,16 @@ make_attribute(const Config& cfg, vespalib::stringref field_name, uint32_t num_d if (attr->isStringType()) { auto& real = dynamic_cast<StringAttribute&>(*attr); if (is_multivalue) { - populate_attribute<StringAttribute, true, true>(real, docid_limit, hit_specs); + populate_attribute<StringAttribute, true, true>(real, docid_limit, hit_specs, disjunct_terms); } else { - populate_attribute<StringAttribute, true, false>(real, docid_limit, hit_specs); + populate_attribute<StringAttribute, true, false>(real, docid_limit, hit_specs, disjunct_terms); } } else { auto& real = dynamic_cast<IntegerAttribute&>(*attr); if (is_multivalue) { - populate_attribute<IntegerAttribute, false, true>(real, docid_limit, hit_specs); + populate_attribute<IntegerAttribute, false, true>(real, docid_limit, hit_specs, disjunct_terms); } else { - populate_attribute<IntegerAttribute, false, false>(real, docid_limit, hit_specs); + populate_attribute<IntegerAttribute, false, false>(real, docid_limit, hit_specs, disjunct_terms); } } attr->commit(true); @@ -90,9 +131,9 @@ AttributeContextBuilder::AttributeContextBuilder() } void -AttributeContextBuilder::add(const search::attribute::Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs) +AttributeContextBuilder::add(const search::attribute::Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs, bool disjunct_terms) { - auto attr = make_attribute(cfg, field_name, num_docs, hit_specs); + auto attr = make_attribute(cfg, field_name, num_docs, hit_specs, disjunct_terms); _ctx->add(std::move(attr)); } diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h index e4a58c91668..7e5236e43be 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h +++ b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h @@ -19,7 +19,7 @@ private: public: AttributeContextBuilder(); - void add(const search::attribute::Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs); + void add(const search::attribute::Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs, bool disjunct_terms); std::unique_ptr<BenchmarkSearchable> build(); }; diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp index 516a819aae8..0496a0e6dc8 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp @@ -42,11 +42,11 @@ calc_hits_per_term(uint32_t num_docs, double op_hit_ratio, uint32_t children, Qu } std::unique_ptr<BenchmarkSearchable> -make_searchable(const FieldConfig& cfg, uint32_t num_docs, const HitSpecs& hit_specs) +make_searchable(const FieldConfig& cfg, uint32_t num_docs, const HitSpecs& hit_specs, bool disjunct_terms) { if (cfg.is_attr()) { AttributeContextBuilder builder; - builder.add(cfg.attr_cfg(), field_name, num_docs, hit_specs); + builder.add(cfg.attr_cfg(), field_name, num_docs, hit_specs, disjunct_terms); return builder.build(); } else { uint32_t docid_limit = num_docs + 1; @@ -159,14 +159,14 @@ private: public: MyFactory(const FieldConfig& field_cfg, QueryOperator query_op, uint32_t num_docs, uint32_t default_values_per_document, - double op_hit_ratio, uint32_t children); + double op_hit_ratio, uint32_t children, bool disjunct_children); std::unique_ptr<Blueprint> make_blueprint() override; }; MyFactory::MyFactory(const FieldConfig& field_cfg, QueryOperator query_op, uint32_t num_docs, uint32_t default_values_per_document, - double op_hit_ratio, uint32_t children) + double op_hit_ratio, uint32_t children, bool disjunct_children) : _query_op(query_op), _docid_limit(num_docs + 1), _terms(), @@ -174,9 +174,17 @@ MyFactory::MyFactory(const FieldConfig& field_cfg, QueryOperator query_op, { uint32_t hits_per_term = calc_hits_per_term(num_docs, op_hit_ratio, children, query_op); HitSpecs hit_specs(55555); - hit_specs.add(default_values_per_document, num_docs); + if (!disjunct_children) { + hit_specs.add(default_values_per_document, num_docs); + } _terms = hit_specs.add(children, hits_per_term); - _searchable = make_searchable(field_cfg, num_docs, hit_specs); + if (disjunct_children && default_values_per_document != 0) { + // This ensures that the remaining docids are populated with a "default value". + // Only a single default value is supported. + uint32_t op_num_hits = num_docs * op_hit_ratio; + hit_specs.add(1, num_docs - op_num_hits); + } + _searchable = make_searchable(field_cfg, num_docs, hit_specs, disjunct_children); } std::unique_ptr<Blueprint> @@ -190,9 +198,9 @@ MyFactory::make_blueprint() std::unique_ptr<BenchmarkBlueprintFactory> make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op, uint32_t num_docs, uint32_t default_values_per_document, - double op_hit_ratio, uint32_t children) + double op_hit_ratio, uint32_t children, bool disjunct_children) { - return std::make_unique<MyFactory>(field_cfg, query_op, num_docs, default_values_per_document, op_hit_ratio, children); + return std::make_unique<MyFactory>(field_cfg, query_op, num_docs, default_values_per_document, op_hit_ratio, children, disjunct_children); } } diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h index d3e529fcd65..423f517ffb0 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h +++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h @@ -21,6 +21,6 @@ public: std::unique_ptr<BenchmarkBlueprintFactory> make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op, uint32_t num_docs, uint32_t default_values_per_document, - double op_hit_ratio, uint32_t children); + double op_hit_ratio, uint32_t children, bool disjunct_children); } diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp index f17403bd33a..c67a5ee1074 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp @@ -1,7 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "common.h" -#include <random> #include <sstream> using search::attribute::CollectionType; @@ -49,6 +48,8 @@ std::mt19937 gen(default_seed); } +std::mt19937& get_gen() { return gen; } + BitVector::UP random_docids(uint32_t docid_limit, uint32_t count) { diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.h b/searchlib/src/tests/queryeval/iterator_benchmark/common.h index 45fd82b091c..bf16e6f51d7 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/common.h +++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.h @@ -5,6 +5,7 @@ #include <vespa/searchcommon/attribute/config.h> #include <vespa/searchcommon/common/schema.h> #include <vespa/searchlib/common/bitvector.h> +#include <random> #include <variant> namespace search::queryeval::test { @@ -78,6 +79,8 @@ public: auto end() const { return _specs.end(); } }; +std::mt19937& get_gen(); + BitVector::UP random_docids(uint32_t docid_limit, uint32_t count); int32_t random_int(int32_t a, int32_t b); diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp index b08fde50d7c..c6dae52fd69 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp @@ -31,25 +31,22 @@ struct BenchmarkResult { uint32_t hits; FlowStats flow; double actual_cost; - double alt_cost; vespalib::string iterator_name; vespalib::string blueprint_name; - BenchmarkResult() : BenchmarkResult(0, 0, 0, {0, 0, 0}, 0, 0, "", "") {} - BenchmarkResult(double time_ms_in, uint32_t seeks_in, uint32_t hits_in, FlowStats flow_in, double actual_cost_in, double alt_cost_in, + BenchmarkResult() : BenchmarkResult(0, 0, 0, {0, 0, 0}, 0, "", "") {} + BenchmarkResult(double time_ms_in, uint32_t seeks_in, uint32_t hits_in, FlowStats flow_in, double actual_cost_in, const vespalib::string& iterator_name_in, const vespalib::string& blueprint_name_in) : time_ms(time_ms_in), seeks(seeks_in), hits(hits_in), flow(flow_in), actual_cost(actual_cost_in), - alt_cost(alt_cost_in), iterator_name(iterator_name_in), blueprint_name(blueprint_name_in) {} ~BenchmarkResult(); double ns_per_seek() const { return (time_ms / seeks) * 1000.0 * 1000.0; } double ms_per_actual_cost() const { return (time_ms / actual_cost); } - double ms_per_alt_cost() const { return (time_ms / alt_cost); } }; BenchmarkResult::~BenchmarkResult() = default; @@ -128,9 +125,6 @@ public: Stats ms_per_actual_cost_stats() const { return calc_stats([](const auto& res){ return res.ms_per_actual_cost(); }); } - Stats ms_per_alt_cost_stats() const { - return calc_stats([](const auto& res){ return res.ms_per_alt_cost(); }); - } }; std::string @@ -222,7 +216,7 @@ strict_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit) timer.after(); } FlowStats flow(ctx.blueprint->estimate(), ctx.blueprint->cost(), ctx.blueprint->strict_cost()); - return {timer.min_time() * 1000.0, hits + 1, hits, flow, flow.strict_cost, flow.strict_cost, get_class_name(*ctx.iterator), get_class_name(*ctx.blueprint)}; + return {timer.min_time() * 1000.0, hits + 1, hits, flow, flow.strict_cost, get_class_name(*ctx.iterator), get_class_name(*ctx.blueprint)}; } template <bool do_unpack> @@ -256,9 +250,7 @@ non_strict_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit, doub } FlowStats flow(ctx.blueprint->estimate(), ctx.blueprint->cost(), ctx.blueprint->strict_cost()); double actual_cost = flow.cost * filter_hit_ratio; - // This is an attempt to calculate an alternative actual cost for strict / posting list iterators that are used in a non-strict context. - double alt_cost = flow.strict_cost + 0.5 * filter_hit_ratio; - return {timer.min_time() * 1000.0, seeks, hits, flow, actual_cost, alt_cost, get_class_name(*ctx.iterator), get_class_name(*ctx.blueprint)}; + return {timer.min_time() * 1000.0, seeks, hits, flow, actual_cost, get_class_name(*ctx.iterator), get_class_name(*ctx.blueprint)}; } BenchmarkResult @@ -413,32 +405,30 @@ to_string(bool val) void print_result_header() { - std::cout << "| chn | f_ratio | o_ratio | a_ratio | f.est | f.cost | f.scost | hits | seeks | time_ms | act_cost | alt_cost | ns_per_seek | ms_per_act_cost | ms_per_alt_cost | iterator | blueprint |" << std::endl; + std::cout << "| chn | f_ratio | o_ratio | a_ratio | f.est | f.cost | f.scost | hits | seeks | time_ms | act_cost | ns_per_seek | ms_per_act_cost | iterator | blueprint |" << std::endl; } void print_result(const BenchmarkResult& res, uint32_t children, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs) { std::cout << std::fixed << std::setprecision(5) - << "| " << std::setw(4) << children + << "| " << std::setw(5) << children << " | " << std::setw(7) << filter_hit_ratio << " | " << std::setw(7) << op_hit_ratio << " | " << std::setw(7) << ((double) res.hits / (double) num_docs) << " | " << std::setw(6) << res.flow.estimate << std::setprecision(4) - << " | " << std::setw(7) << res.flow.cost + << " | " << std::setw(9) << res.flow.cost << " | " << std::setw(7) << res.flow.strict_cost << " | " << std::setw(8) << res.hits << " | " << std::setw(8) << res.seeks << std::setprecision(3) << " | " << std::setw(8) << res.time_ms << std::setprecision(4) - << " | " << std::setw(8) << res.actual_cost - << " | " << std::setw(8) << res.alt_cost + << " | " << std::setw(9) << res.actual_cost << std::setprecision(2) << " | " << std::setw(11) << res.ns_per_seek() << " | " << std::setw(15) << res.ms_per_actual_cost() - << " | " << std::setw(15) << res.ms_per_alt_cost() << " | " << res.iterator_name << " | " << res.blueprint_name << " |" << std::endl; } @@ -449,8 +439,7 @@ print_result(const BenchmarkCaseResult& result) std::cout << std::fixed << std::setprecision(3) << "summary: time_ms=" << result.time_ms_stats().to_string() << std::endl << " ns_per_seek=" << result.ns_per_seek_stats().to_string() << std::endl - << " ms_per_act_cost=" << result.ms_per_actual_cost_stats().to_string() << std::endl - << " ms_per_alt_cost=" << result.ms_per_alt_cost_stats().to_string() << std::endl << std::endl; + << " ms_per_act_cost=" << result.ms_per_actual_cost_stats().to_string() << std::endl << std::endl; } struct BenchmarkCase { @@ -534,6 +523,7 @@ struct BenchmarkCaseSetup { std::vector<uint32_t> child_counts; std::vector<double> filter_hit_ratios; uint32_t default_values_per_document; + bool disjunct_children; double filter_crossover_factor; BenchmarkCaseSetup(uint32_t num_docs_in, const BenchmarkCase& bcase_in, @@ -545,6 +535,7 @@ struct BenchmarkCaseSetup { child_counts(child_counts_in), filter_hit_ratios({1.0}), default_values_per_document(0), + disjunct_children(false), filter_crossover_factor(0.0) {} ~BenchmarkCaseSetup() {} @@ -561,6 +552,7 @@ struct BenchmarkSetup { bool force_strict; bool unpack_iterator; uint32_t default_values_per_document; + bool disjunct_children; double filter_crossover_factor; BenchmarkSetup(uint32_t num_docs_in, const std::vector<FieldConfig>& field_cfgs_in, @@ -578,6 +570,7 @@ struct BenchmarkSetup { force_strict(false), unpack_iterator(false), default_values_per_document(0), + disjunct_children(false), filter_crossover_factor(0.0) {} BenchmarkSetup(uint32_t num_docs_in, @@ -592,6 +585,7 @@ struct BenchmarkSetup { res.bcase.force_strict = force_strict; res.bcase.unpack_iterator = unpack_iterator; res.default_values_per_document = default_values_per_document; + res.disjunct_children = disjunct_children; if (!bcase.strict_context) { // Simulation of a filter is only relevant in a non-strict context. res.filter_hit_ratios = filter_hit_ratios; @@ -617,7 +611,7 @@ run_benchmark_case(const BenchmarkCaseSetup& setup) for (uint32_t children : setup.child_counts) { auto factory = make_blueprint_factory(setup.bcase.field_cfg, setup.bcase.query_op, setup.num_docs, setup.default_values_per_document, - op_hit_ratio, children); + op_hit_ratio, children, setup.disjunct_children); for (double filter_hit_ratio : setup.filter_hit_ratios) { if (filter_hit_ratio * setup.filter_crossover_factor <= op_hit_ratio) { auto res = benchmark_search(*factory, setup.num_docs + 1, @@ -726,6 +720,22 @@ TEST(IteratorBenchmark, analyze_term_search_in_fast_search_attributes) run_benchmarks(setup, global_summary); } +TEST(IteratorBenchmark, analyze_in_operator_non_strict) +{ + const std::vector<double> hit_ratios = {0.001, 0.01, 0.1, 0.2, 0.4, 0.6, 0.8}; + BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::In}, {false}, hit_ratios, {5, 9, 10, 100, 1000, 10000}); + setup.disjunct_children = true; + run_benchmarks(setup); +} + +TEST(IteratorBenchmark, analyze_in_operator_strict) +{ + const std::vector<double> hit_ratios = {0.001, 0.01, 0.1, 0.2, 0.4, 0.6, 0.8}; + BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::In}, {true}, hit_ratios, {5, 9, 10, 100, 1000, 10000}); + setup.disjunct_children = true; + run_benchmarks(setup); +} + TEST(IteratorBenchmark, analyze_complex_leaf_operators) { std::vector<FieldConfig> field_cfgs = {int32_array_fs}; @@ -764,18 +774,18 @@ TEST(IteratorBenchmark, or_benchmark) TEST(IteratorBenchmark, or_vs_filter_crossover) { - auto fixed_or = make_blueprint_factory(int32_array_fs, QueryOperator::Or, num_docs, 0, 0.1, 100); + auto fixed_or = make_blueprint_factory(int32_array_fs, QueryOperator::Or, num_docs, 0, 0.1, 100, false); auto variable_term = [](double rate) { - return make_blueprint_factory(int32_array_fs, QueryOperator::Term, num_docs, 0, rate, 1); + return make_blueprint_factory(int32_array_fs, QueryOperator::Term, num_docs, 0, rate, 1, false); }; analyze_crossover(*fixed_or, variable_term, num_docs + 1, false, 0.0001); } TEST(IteratorBenchmark, or_vs_filter_crossover_with_allow_force_strict) { - auto fixed_or = make_blueprint_factory(int32_array_fs, QueryOperator::Or, num_docs, 0, 0.1, 100); + auto fixed_or = make_blueprint_factory(int32_array_fs, QueryOperator::Or, num_docs, 0, 0.1, 100, false); auto variable_term = [](double rate) { - return make_blueprint_factory(int32_array_fs, QueryOperator::Term, num_docs, 0, rate, 1); + return make_blueprint_factory(int32_array_fs, QueryOperator::Term, num_docs, 0, rate, 1, false); }; analyze_crossover(*fixed_or, variable_term, num_docs + 1, true, 0.0001); } |