diff options
author | Geir Storli <geirst@yahooinc.com> | 2024-03-19 13:26:39 +0000 |
---|---|---|
committer | Geir Storli <geirst@yahooinc.com> | 2024-03-19 13:26:39 +0000 |
commit | 49f89adc8dd08e2f322fc2313064042a03b9b1bb (patch) | |
tree | e7c81c2184f97c6f84ef444aa0fca3c2e620c901 /searchlib | |
parent | b6a88a3cc4dc6dd52d2ce4ffe910a4e5e5722bb4 (diff) |
Add factory for creating a blueprint for a given benchmark setup.
Diffstat (limited to 'searchlib')
5 files changed, 219 insertions, 125 deletions
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt b/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt index c35b8211897..872fb4ca6ca 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt +++ b/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt @@ -2,6 +2,7 @@ vespa_add_executable(searchlib_iterator_benchmark_test_app TEST SOURCES attribute_ctx_builder.cpp + benchmark_blueprint_factory.cpp common.cpp disk_index_builder.cpp iterator_benchmark_test.cpp diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp new file mode 100644 index 00000000000..15690fd71d5 --- /dev/null +++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp @@ -0,0 +1,181 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "attribute_ctx_builder.h" +#include "benchmark_blueprint_factory.h" +#include "benchmark_searchable.h" +#include "disk_index_builder.h" +#include <vespa/searchlib/diskindex/diskindex.h> +#include <vespa/searchlib/query/tree/integer_term_vector.h> +#include <vespa/searchlib/query/tree/node.h> +#include <vespa/searchlib/query/tree/simplequery.h> +#include <vespa/searchlib/queryeval/blueprint.h> +#include <vespa/searchlib/queryeval/intermediate_blueprints.h> +#include <cmath> + +using search::query::IntegerTermVector; +using search::query::MultiTerm; +using search::query::Node; +using search::query::SimpleDotProduct; +using search::query::SimpleInTerm; +using search::query::SimpleStringTerm; +using search::query::SimpleWeightedSetTerm; +using search::query::Weight; + +namespace search::queryeval::test { + +namespace { + +const vespalib::string field_name = "myfield"; +const vespalib::string index_dir = "indexdir"; + +uint32_t +calc_hits_per_term(uint32_t num_docs, double op_hit_ratio, uint32_t children, QueryOperator query_op) +{ + if (query_op == QueryOperator::And) { + double child_hit_ratio = std::pow(op_hit_ratio, (1.0/(double)children)); + return num_docs * child_hit_ratio; + } else { + uint32_t op_num_hits = num_docs * op_hit_ratio; + return op_num_hits / children; + } +} + +std::unique_ptr<BenchmarkSearchable> +make_searchable(const FieldConfig& cfg, uint32_t num_docs, const HitSpecs& hit_specs) +{ + if (cfg.is_attr()) { + AttributeContextBuilder builder; + builder.add(cfg.attr_cfg(), field_name, num_docs, hit_specs); + return builder.build(); + } else { + uint32_t docid_limit = num_docs + 1; + DiskIndexBuilder builder(cfg.index_cfg(), index_dir, docid_limit, hit_specs.size()); + for (auto spec : hit_specs) { + // TODO: make number of occurrences configurable. + uint32_t num_occs = 1; + builder.add_word(std::to_string(spec.term_value), *random_docids(docid_limit, spec.num_hits), num_occs); + } + return builder.build(); + } +} + +std::unique_ptr<Node> +make_query_node(QueryOperator query_op, const TermVector& terms) +{ + if (query_op == QueryOperator::Term) { + assert(terms.size() == 1); + return std::make_unique<SimpleStringTerm>(std::to_string(terms[0]), field_name, 0, Weight(1)); + } else if (query_op == QueryOperator::In) { + auto termv = std::make_unique<IntegerTermVector>(terms.size()); + for (auto term : terms) { + termv->addTerm(term); + } + return std::make_unique<SimpleInTerm>(std::move(termv), MultiTerm::Type::INTEGER, field_name, 0, Weight(1)); + } else if (query_op == QueryOperator::WeightedSet) { + auto res = std::make_unique<SimpleWeightedSetTerm>(terms.size(), field_name, 0, Weight(1)); + for (auto term : terms) { + res->addTerm(term, Weight(1)); + } + return res; + } else if (query_op == QueryOperator::DotProduct) { + auto res = std::make_unique<SimpleDotProduct>(terms.size(), field_name, 0, Weight(1)); + for (auto term : terms) { + res->addTerm(term, Weight(1)); + } + return res; + } + return {}; +} + +Blueprint::UP +make_leaf_blueprint(const Node& node, BenchmarkSearchable& searchable, uint32_t docid_limit) +{ + auto blueprint = searchable.create_blueprint(FieldSpec(field_name, 0, 0), node); + assert(blueprint.get()); + blueprint->setDocIdLimit(docid_limit); + blueprint->update_flow_stats(docid_limit); + return blueprint; +} + +template <typename BlueprintType> +Blueprint::UP +make_intermediate_blueprint(BenchmarkSearchable& searchable, const TermVector& terms, uint32_t docid_limit) +{ + auto blueprint = std::make_unique<BlueprintType>(); + for (auto term : terms) { + SimpleStringTerm sterm(std::to_string(term), field_name, 0, Weight(1)); + auto child = make_leaf_blueprint(sterm, searchable, docid_limit); + blueprint->addChild(std::move(child)); + } + blueprint->setDocIdLimit(docid_limit); + blueprint->update_flow_stats(docid_limit); + return blueprint; +} + +Blueprint::UP +make_blueprint_helper(BenchmarkSearchable& searchable, QueryOperator query_op, const TermVector& terms, uint32_t docid_limit) +{ + if (query_op == QueryOperator::And) { + return make_intermediate_blueprint<AndBlueprint>(searchable, terms, docid_limit); + } else if (query_op == QueryOperator::Or) { + return make_intermediate_blueprint<OrBlueprint>(searchable, terms, docid_limit); + } else { + auto query_node = make_query_node(query_op, terms); + return make_leaf_blueprint(*query_node, searchable, docid_limit); + } +} + +/** + * Factory for creating a Blueprint for a given benchmark setup. + * + * This populates an attribute or disk index field such that the query operator hits + * the given ratio of the total document corpus. + */ +class MyFactory : public BenchmarkBlueprintFactory { +private: + QueryOperator _query_op; + uint32_t _docid_limit; + TermVector _terms; + std::unique_ptr<BenchmarkSearchable> _searchable; + +public: + MyFactory(const FieldConfig& field_cfg, QueryOperator query_op, + uint32_t num_docs, uint32_t default_values_per_document, + double op_hit_ratio, uint32_t children); + + std::unique_ptr<Blueprint> make_blueprint() override; +}; + +MyFactory::MyFactory(const FieldConfig& field_cfg, QueryOperator query_op, + uint32_t num_docs, uint32_t default_values_per_document, + double op_hit_ratio, uint32_t children) + : _query_op(query_op), + _docid_limit(num_docs + 1), + _terms(), + _searchable() +{ + uint32_t hits_per_term = calc_hits_per_term(num_docs, op_hit_ratio, children, query_op); + HitSpecs hit_specs(55555); + hit_specs.add(default_values_per_document, num_docs); + _terms = hit_specs.add(children, hits_per_term); + _searchable = make_searchable(field_cfg, num_docs, hit_specs); +} + +std::unique_ptr<Blueprint> +MyFactory::make_blueprint() +{ + return make_blueprint_helper(*_searchable, _query_op, _terms, _docid_limit); +} + +} + +std::unique_ptr<BenchmarkBlueprintFactory> +make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op, + uint32_t num_docs, uint32_t default_values_per_document, + double op_hit_ratio, uint32_t children) +{ + return std::make_unique<MyFactory>(field_cfg, query_op, num_docs, default_values_per_document, op_hit_ratio, children); +} + +} + diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h new file mode 100644 index 00000000000..1459cbfe856 --- /dev/null +++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h @@ -0,0 +1,27 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "common.h" +#include <memory> + +namespace search::queryeval { class Blueprint; } + +namespace search::queryeval::test { + +/** + * Interface for creating a Blueprint. + */ +class BenchmarkBlueprintFactory { +public: + virtual ~BenchmarkBlueprintFactory() = default; + virtual std::unique_ptr<Blueprint> make_blueprint() = 0; +}; + +std::unique_ptr<BenchmarkBlueprintFactory> +make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op, + uint32_t num_docs, uint32_t default_values_per_document, + double op_hit_ratio, uint32_t children); + +} + diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.h b/searchlib/src/tests/queryeval/iterator_benchmark/common.h index 7d91db56d93..6d890910271 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/common.h +++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.h @@ -50,9 +50,7 @@ struct HitSpec { HitSpec(uint32_t term_value_in, uint32_t num_hits_in) : term_value(term_value_in), num_hits(num_hits_in) {} }; -namespace benchmark { using TermVector = std::vector<uint32_t>; -} class HitSpecs { private: @@ -64,8 +62,8 @@ public: : _specs(), _next_term_value(first_term_value) { } - benchmark::TermVector add(uint32_t num_terms, uint32_t hits_per_term) { - benchmark::TermVector res; + TermVector add(uint32_t num_terms, uint32_t hits_per_term) { + TermVector res; for (uint32_t i = 0; i < num_terms; ++i) { uint32_t term_value = _next_term_value++; _specs.push_back({term_value, hits_per_term}); diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp index cb830920ced..202ba8c180e 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp @@ -1,18 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "attribute_ctx_builder.h" -#include "benchmark_searchable.h" +#include "benchmark_blueprint_factory.h" #include "common.h" -#include "disk_index_builder.h" -#include <vespa/searchlib/diskindex/diskindex.h> #include <vespa/searchlib/fef/matchdata.h> -#include <vespa/searchlib/index/docidandfeatures.h> -#include <vespa/searchlib/query/tree/integer_term_vector.h> -#include <vespa/searchlib/query/tree/node.h> -#include <vespa/searchlib/query/tree/simplequery.h> #include <vespa/searchlib/queryeval/blueprint.h> -#include <vespa/searchlib/queryeval/field_spec.h> -#include <vespa/searchlib/queryeval/intermediate_blueprints.h> #include <vespa/vespalib/gtest/gtest.h> #include <vespa/vespalib/util/benchmark_timer.h> #include <cmath> @@ -21,38 +12,16 @@ using namespace search::attribute; using namespace search::fef; -using namespace search::query; using namespace search::queryeval::test; using namespace search::queryeval; using namespace search; using namespace vespalib; -using search::index::DocIdAndFeatures; using search::index::Schema; const vespalib::string field_name = "myfield"; -const vespalib::string index_dir = "indexdir"; double budget_sec = 1.0; -std::unique_ptr<BenchmarkSearchable> -make_searchable(const FieldConfig& cfg, uint32_t num_docs, const HitSpecs& hit_specs) -{ - if (cfg.is_attr()) { - AttributeContextBuilder builder; - builder.add(cfg.attr_cfg(), field_name, num_docs, hit_specs); - return builder.build(); - } else { - uint32_t docid_limit = num_docs + 1; - DiskIndexBuilder builder(cfg.index_cfg(), index_dir, docid_limit, hit_specs.size()); - for (auto spec : hit_specs) { - // TODO: make number of occurrences configurable. - uint32_t num_occs = 1; - builder.add_word(std::to_string(spec.term_value), *random_docids(docid_limit, spec.num_hits), num_occs); - } - return builder.build(); - } -} - struct BenchmarkResult { double time_ms; uint32_t seeks; @@ -253,79 +222,12 @@ benchmark_search(Blueprint::UP blueprint, uint32_t docid_limit, bool strict_cont } } -Blueprint::UP -make_leaf_blueprint(const Node& node, BenchmarkSearchable& searchable, uint32_t docid_limit) -{ - auto blueprint = searchable.create_blueprint(FieldSpec(field_name, 0, 0), node); - assert(blueprint.get()); - blueprint->setDocIdLimit(docid_limit); - blueprint->update_flow_stats(docid_limit); - return blueprint; -} - vespalib::string to_string(bool val) { return val ? "true" : "false"; } -std::unique_ptr<Node> -make_query_node(QueryOperator query_op, const benchmark::TermVector& terms) -{ - if (query_op == QueryOperator::Term) { - assert(terms.size() == 1); - return std::make_unique<SimpleStringTerm>(std::to_string(terms[0]), field_name, 0, Weight(1)); - } else if (query_op == QueryOperator::In) { - auto termv = std::make_unique<IntegerTermVector>(terms.size()); - for (auto term : terms) { - termv->addTerm(term); - } - return std::make_unique<SimpleInTerm>(std::move(termv), MultiTerm::Type::INTEGER, field_name, 0, Weight(1)); - } else if (query_op == QueryOperator::WeightedSet) { - auto res = std::make_unique<SimpleWeightedSetTerm>(terms.size(), field_name, 0, Weight(1)); - for (auto term : terms) { - res->addTerm(term, Weight(1)); - } - return res; - } else if (query_op == QueryOperator::DotProduct) { - auto res = std::make_unique<SimpleDotProduct>(terms.size(), field_name, 0, Weight(1)); - for (auto term : terms) { - res->addTerm(term, Weight(1)); - } - return res; - } - return {}; -} - -template <typename BlueprintType> -Blueprint::UP -make_intermediate_blueprint(BenchmarkSearchable& searchable, const benchmark::TermVector& terms, uint32_t docid_limit) -{ - auto blueprint = std::make_unique<BlueprintType>(); - for (auto term : terms) { - SimpleStringTerm sterm(std::to_string(term), field_name, 0, Weight(1)); - auto child = make_leaf_blueprint(sterm, searchable, docid_limit); - blueprint->addChild(std::move(child)); - } - blueprint->setDocIdLimit(docid_limit); - blueprint->update_flow_stats(docid_limit); - return blueprint; -} - -BenchmarkResult -run_benchmark(BenchmarkSearchable& searchable, QueryOperator query_op, const benchmark::TermVector& terms, uint32_t docid_limit, bool strict_context, bool force_strict, double filter_hit_ratio) -{ - if (query_op == QueryOperator::And) { - return benchmark_search(make_intermediate_blueprint<AndBlueprint>(searchable, terms, docid_limit), docid_limit, strict_context, force_strict, filter_hit_ratio); - } else if (query_op == QueryOperator::Or) { - return benchmark_search(make_intermediate_blueprint<OrBlueprint>(searchable, terms, docid_limit), docid_limit, strict_context, force_strict, filter_hit_ratio); - } else { - auto query_node = make_query_node(query_op, terms); - auto blueprint = make_leaf_blueprint(*query_node, searchable, docid_limit); - return benchmark_search(std::move(blueprint), docid_limit, strict_context, force_strict, filter_hit_ratio); - } -} - void print_result_header() { @@ -333,10 +235,10 @@ print_result_header() } void -print_result(const BenchmarkResult& res, const benchmark::TermVector& terms, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs) +print_result(const BenchmarkResult& res, uint32_t children, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs) { std::cout << std::fixed << std::setprecision(4) - << "| " << std::setw(4) << terms.size() + << "| " << std::setw(4) << children << " | " << std::setw(7) << filter_hit_ratio << " | " << std::setw(7) << op_hit_ratio << " | " << std::setw(7) << ((double) res.hits / (double) num_docs) @@ -517,18 +419,6 @@ struct BenchmarkSetup { BenchmarkSetup::~BenchmarkSetup() = default; -uint32_t -calc_hits_per_term(uint32_t num_docs, double op_hit_ratio, uint32_t children, QueryOperator query_op) -{ - if (query_op == QueryOperator::And) { - double child_hit_ratio = std::pow(op_hit_ratio, (1.0/(double)children)); - return num_docs * child_hit_ratio; - } else { - uint32_t op_num_hits = num_docs * op_hit_ratio; - return op_num_hits / children; - } -} - BenchmarkCaseResult run_benchmark_case(const BenchmarkCaseSetup& setup) { @@ -537,16 +427,13 @@ run_benchmark_case(const BenchmarkCaseSetup& setup) print_result_header(); for (double op_hit_ratio : setup.op_hit_ratios) { for (uint32_t children : setup.child_counts) { - uint32_t hits_per_term = calc_hits_per_term(setup.num_docs, op_hit_ratio, children, setup.bcase.query_op); - HitSpecs hit_specs(55555); - hit_specs.add(setup.default_values_per_document, setup.num_docs); - auto terms = hit_specs.add(children, hits_per_term); - auto searchable = make_searchable(setup.bcase.field_cfg, setup.num_docs, hit_specs); + auto factory = make_blueprint_factory(setup.bcase.field_cfg, setup.bcase.query_op, + setup.num_docs, setup.default_values_per_document, + op_hit_ratio, children); for (double filter_hit_ratio : setup.filter_hit_ratios) { if (filter_hit_ratio * setup.filter_crossover_factor <= op_hit_ratio) { - auto res = run_benchmark(*searchable, setup.bcase.query_op, terms, setup.num_docs + 1, - setup.bcase.strict_context, setup.bcase.force_strict, filter_hit_ratio); - print_result(res, terms, op_hit_ratio, filter_hit_ratio, setup.num_docs); + auto res = benchmark_search(factory->make_blueprint(), setup.num_docs + 1, setup.bcase.strict_context, setup.bcase.force_strict, filter_hit_ratio); + print_result(res, children, op_hit_ratio, filter_hit_ratio, setup.num_docs); result.add(res); } } |