diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-03-19 16:27:22 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-19 16:27:22 +0100 |
commit | 45d8663b069aaeba8d00d037a7cafdfaec95ca54 (patch) | |
tree | 70618331c488c1cbf43e5448f755b21f5efeaa8b /searchlib/src | |
parent | f48096c672bc54ae962a642353d34fdb3067ddff (diff) | |
parent | 49f89adc8dd08e2f322fc2313064042a03b9b1bb (diff) |
Merge pull request #30681 from vespa-engine/geirst/iterator-benchmark-blueprint-factory
Refactor iterator benchmark to have a factory for creating blueprints
Diffstat (limited to 'searchlib/src')
8 files changed, 508 insertions, 321 deletions
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt b/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt index 34c5928c123..872fb4ca6ca 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt +++ b/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt @@ -1,6 +1,9 @@ # Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_executable(searchlib_iterator_benchmark_test_app TEST SOURCES + attribute_ctx_builder.cpp + benchmark_blueprint_factory.cpp + common.cpp disk_index_builder.cpp iterator_benchmark_test.cpp DEPENDS diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp new file mode 100644 index 00000000000..e776b6a9379 --- /dev/null +++ b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp @@ -0,0 +1,105 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "attribute_ctx_builder.h" +#include <vespa/searchlib/attribute/attribute_blueprint_factory.h> +#include <vespa/searchlib/attribute/attributefactory.h> +#include <vespa/searchlib/attribute/attributevector.h> +#include <vespa/searchlib/attribute/integerbase.h> +#include <vespa/searchlib/attribute/stringbase.h> +#include <vespa/searchlib/queryeval/blueprint.h> +#include <vespa/searchlib/queryeval/fake_requestcontext.h> + +using namespace search::attribute; +using namespace search::attribute::test; + +namespace search::queryeval::test { + +namespace { + +template <typename AttributeType, bool is_string, bool is_multivalue> +void +populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hit_specs) +{ + for (auto spec : hit_specs) { + auto docids = random_docids(docid_limit, spec.num_hits); + docids->foreach_truebit([&](uint32_t docid) { + if constexpr (is_string) { + if constexpr (is_multivalue) { + attr.append(docid, std::to_string(spec.term_value), 1); + } else { + attr.update(docid, std::to_string(spec.term_value)); + } + } else { + if constexpr (is_multivalue) { + attr.append(docid, spec.term_value, 1); + } else { + attr.update(docid, spec.term_value); + } + } + }); + } +} + +AttributeVector::SP +make_attribute(const Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs) +{ + auto attr = AttributeFactory::createAttribute(field_name, cfg); + attr->addReservedDoc(); + attr->addDocs(num_docs); + uint32_t docid_limit = attr->getNumDocs(); + assert(docid_limit == (num_docs + 1)); + bool is_multivalue = cfg.collectionType() != CollectionType::SINGLE; + if (attr->isStringType()) { + auto& real = dynamic_cast<StringAttribute&>(*attr); + if (is_multivalue) { + populate_attribute<StringAttribute, true, true>(real, docid_limit, hit_specs); + } else { + populate_attribute<StringAttribute, true, false>(real, docid_limit, hit_specs); + } + } else { + auto& real = dynamic_cast<IntegerAttribute&>(*attr); + if (is_multivalue) { + populate_attribute<IntegerAttribute, false, true>(real, docid_limit, hit_specs); + } else { + populate_attribute<IntegerAttribute, false, false>(real, docid_limit, hit_specs); + } + } + attr->commit(true); + return attr; +} + +class AttributeSearchable : public BenchmarkSearchable { +private: + std::unique_ptr<MockAttributeContext> _attr_ctx; + +public: + AttributeSearchable(std::unique_ptr<MockAttributeContext> attr_ctx) : _attr_ctx(std::move(attr_ctx)) {} + std::unique_ptr<Blueprint> create_blueprint(const FieldSpec& field_spec, + const search::query::Node& term) override { + AttributeBlueprintFactory factory; + FakeRequestContext req_ctx(_attr_ctx.get()); + return factory.createBlueprint(req_ctx, field_spec, term); + } +}; + +} + +AttributeContextBuilder::AttributeContextBuilder() + : _ctx(std::make_unique<MockAttributeContext>()) +{ +} + +void +AttributeContextBuilder::add(const search::attribute::Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs) +{ + auto attr = make_attribute(cfg, field_name, num_docs, hit_specs); + _ctx->add(std::move(attr)); +} + +std::unique_ptr<BenchmarkSearchable> +AttributeContextBuilder::build() +{ + return std::make_unique<AttributeSearchable>(std::move(_ctx)); +} + +} diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h new file mode 100644 index 00000000000..e4a58c91668 --- /dev/null +++ b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h @@ -0,0 +1,26 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "benchmark_searchable.h" +#include "common.h" +#include <vespa/searchcommon/attribute/config.h> +#include <vespa/searchlib/test/mock_attribute_context.h> +#include <memory> + +namespace search::queryeval::test { + +/** + * Class used to build attribute(s), used for benchmarking. + */ +class AttributeContextBuilder { +private: + std::unique_ptr<search::attribute::test::MockAttributeContext> _ctx; + +public: + AttributeContextBuilder(); + void add(const search::attribute::Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs); + std::unique_ptr<BenchmarkSearchable> build(); +}; + +} diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp new file mode 100644 index 00000000000..15690fd71d5 --- /dev/null +++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp @@ -0,0 +1,181 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "attribute_ctx_builder.h" +#include "benchmark_blueprint_factory.h" +#include "benchmark_searchable.h" +#include "disk_index_builder.h" +#include <vespa/searchlib/diskindex/diskindex.h> +#include <vespa/searchlib/query/tree/integer_term_vector.h> +#include <vespa/searchlib/query/tree/node.h> +#include <vespa/searchlib/query/tree/simplequery.h> +#include <vespa/searchlib/queryeval/blueprint.h> +#include <vespa/searchlib/queryeval/intermediate_blueprints.h> +#include <cmath> + +using search::query::IntegerTermVector; +using search::query::MultiTerm; +using search::query::Node; +using search::query::SimpleDotProduct; +using search::query::SimpleInTerm; +using search::query::SimpleStringTerm; +using search::query::SimpleWeightedSetTerm; +using search::query::Weight; + +namespace search::queryeval::test { + +namespace { + +const vespalib::string field_name = "myfield"; +const vespalib::string index_dir = "indexdir"; + +uint32_t +calc_hits_per_term(uint32_t num_docs, double op_hit_ratio, uint32_t children, QueryOperator query_op) +{ + if (query_op == QueryOperator::And) { + double child_hit_ratio = std::pow(op_hit_ratio, (1.0/(double)children)); + return num_docs * child_hit_ratio; + } else { + uint32_t op_num_hits = num_docs * op_hit_ratio; + return op_num_hits / children; + } +} + +std::unique_ptr<BenchmarkSearchable> +make_searchable(const FieldConfig& cfg, uint32_t num_docs, const HitSpecs& hit_specs) +{ + if (cfg.is_attr()) { + AttributeContextBuilder builder; + builder.add(cfg.attr_cfg(), field_name, num_docs, hit_specs); + return builder.build(); + } else { + uint32_t docid_limit = num_docs + 1; + DiskIndexBuilder builder(cfg.index_cfg(), index_dir, docid_limit, hit_specs.size()); + for (auto spec : hit_specs) { + // TODO: make number of occurrences configurable. + uint32_t num_occs = 1; + builder.add_word(std::to_string(spec.term_value), *random_docids(docid_limit, spec.num_hits), num_occs); + } + return builder.build(); + } +} + +std::unique_ptr<Node> +make_query_node(QueryOperator query_op, const TermVector& terms) +{ + if (query_op == QueryOperator::Term) { + assert(terms.size() == 1); + return std::make_unique<SimpleStringTerm>(std::to_string(terms[0]), field_name, 0, Weight(1)); + } else if (query_op == QueryOperator::In) { + auto termv = std::make_unique<IntegerTermVector>(terms.size()); + for (auto term : terms) { + termv->addTerm(term); + } + return std::make_unique<SimpleInTerm>(std::move(termv), MultiTerm::Type::INTEGER, field_name, 0, Weight(1)); + } else if (query_op == QueryOperator::WeightedSet) { + auto res = std::make_unique<SimpleWeightedSetTerm>(terms.size(), field_name, 0, Weight(1)); + for (auto term : terms) { + res->addTerm(term, Weight(1)); + } + return res; + } else if (query_op == QueryOperator::DotProduct) { + auto res = std::make_unique<SimpleDotProduct>(terms.size(), field_name, 0, Weight(1)); + for (auto term : terms) { + res->addTerm(term, Weight(1)); + } + return res; + } + return {}; +} + +Blueprint::UP +make_leaf_blueprint(const Node& node, BenchmarkSearchable& searchable, uint32_t docid_limit) +{ + auto blueprint = searchable.create_blueprint(FieldSpec(field_name, 0, 0), node); + assert(blueprint.get()); + blueprint->setDocIdLimit(docid_limit); + blueprint->update_flow_stats(docid_limit); + return blueprint; +} + +template <typename BlueprintType> +Blueprint::UP +make_intermediate_blueprint(BenchmarkSearchable& searchable, const TermVector& terms, uint32_t docid_limit) +{ + auto blueprint = std::make_unique<BlueprintType>(); + for (auto term : terms) { + SimpleStringTerm sterm(std::to_string(term), field_name, 0, Weight(1)); + auto child = make_leaf_blueprint(sterm, searchable, docid_limit); + blueprint->addChild(std::move(child)); + } + blueprint->setDocIdLimit(docid_limit); + blueprint->update_flow_stats(docid_limit); + return blueprint; +} + +Blueprint::UP +make_blueprint_helper(BenchmarkSearchable& searchable, QueryOperator query_op, const TermVector& terms, uint32_t docid_limit) +{ + if (query_op == QueryOperator::And) { + return make_intermediate_blueprint<AndBlueprint>(searchable, terms, docid_limit); + } else if (query_op == QueryOperator::Or) { + return make_intermediate_blueprint<OrBlueprint>(searchable, terms, docid_limit); + } else { + auto query_node = make_query_node(query_op, terms); + return make_leaf_blueprint(*query_node, searchable, docid_limit); + } +} + +/** + * Factory for creating a Blueprint for a given benchmark setup. + * + * This populates an attribute or disk index field such that the query operator hits + * the given ratio of the total document corpus. + */ +class MyFactory : public BenchmarkBlueprintFactory { +private: + QueryOperator _query_op; + uint32_t _docid_limit; + TermVector _terms; + std::unique_ptr<BenchmarkSearchable> _searchable; + +public: + MyFactory(const FieldConfig& field_cfg, QueryOperator query_op, + uint32_t num_docs, uint32_t default_values_per_document, + double op_hit_ratio, uint32_t children); + + std::unique_ptr<Blueprint> make_blueprint() override; +}; + +MyFactory::MyFactory(const FieldConfig& field_cfg, QueryOperator query_op, + uint32_t num_docs, uint32_t default_values_per_document, + double op_hit_ratio, uint32_t children) + : _query_op(query_op), + _docid_limit(num_docs + 1), + _terms(), + _searchable() +{ + uint32_t hits_per_term = calc_hits_per_term(num_docs, op_hit_ratio, children, query_op); + HitSpecs hit_specs(55555); + hit_specs.add(default_values_per_document, num_docs); + _terms = hit_specs.add(children, hits_per_term); + _searchable = make_searchable(field_cfg, num_docs, hit_specs); +} + +std::unique_ptr<Blueprint> +MyFactory::make_blueprint() +{ + return make_blueprint_helper(*_searchable, _query_op, _terms, _docid_limit); +} + +} + +std::unique_ptr<BenchmarkBlueprintFactory> +make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op, + uint32_t num_docs, uint32_t default_values_per_document, + double op_hit_ratio, uint32_t children) +{ + return std::make_unique<MyFactory>(field_cfg, query_op, num_docs, default_values_per_document, op_hit_ratio, children); +} + +} + diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h new file mode 100644 index 00000000000..1459cbfe856 --- /dev/null +++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h @@ -0,0 +1,27 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "common.h" +#include <memory> + +namespace search::queryeval { class Blueprint; } + +namespace search::queryeval::test { + +/** + * Interface for creating a Blueprint. + */ +class BenchmarkBlueprintFactory { +public: + virtual ~BenchmarkBlueprintFactory() = default; + virtual std::unique_ptr<Blueprint> make_blueprint() = 0; +}; + +std::unique_ptr<BenchmarkBlueprintFactory> +make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op, + uint32_t num_docs, uint32_t default_values_per_document, + double op_hit_ratio, uint32_t children); + +} + diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp new file mode 100644 index 00000000000..b937f6a2f00 --- /dev/null +++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp @@ -0,0 +1,74 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "common.h" +#include <random> +#include <sstream> + +using search::attribute::CollectionType; + +namespace search::queryeval::test { + +vespalib::string +to_string(const Config& attr_config) +{ + std::ostringstream oss; + auto col_type = attr_config.collectionType(); + auto basic_type = attr_config.basicType(); + if (col_type == CollectionType::SINGLE) { + oss << basic_type.asString(); + } else { + oss << col_type.asString() << "<" << basic_type.asString() << ">"; + } + if (attr_config.fastSearch()) { + oss << "(fs)"; + } + return oss.str(); +} + +vespalib::string +to_string(QueryOperator query_op) +{ + switch (query_op) { + case QueryOperator::Term: return "Term"; + case QueryOperator::In: return "In"; + case QueryOperator::WeightedSet: return "WeightedSet"; + case QueryOperator::DotProduct: return "DotProduct"; + case QueryOperator::And: return "And"; + case QueryOperator::Or: return "Or"; + } + return "unknown"; +} + +namespace { + +// TODO: Make seed configurable. +constexpr uint32_t default_seed = 1234; +std::mt19937 gen(default_seed); + +} + +BitVector::UP +random_docids(uint32_t docid_limit, uint32_t count) +{ + auto res = BitVector::create(docid_limit); + if ((count + 1) == docid_limit) { + res->notSelf(); + res->clearBit(0); + return res; + } + uint32_t docids_left = count; + // Bit 0 is never set since it is reserved as docid 0. + // All other docids have equal probability to be set. + for (uint32_t docid = 1; docid < docid_limit; ++docid) { + std::uniform_int_distribution<uint32_t> distr(0, docid_limit - docid - 1); + if (distr(gen) < docids_left) { + res->setBit(docid); + --docids_left; + } + } + res->invalidateCachedCount(); + assert(res->countTrueBits() == count); + return res; +} + +}
\ No newline at end of file diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.h b/searchlib/src/tests/queryeval/iterator_benchmark/common.h new file mode 100644 index 00000000000..6d890910271 --- /dev/null +++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.h @@ -0,0 +1,81 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchcommon/attribute/config.h> +#include <vespa/searchcommon/common/schema.h> +#include <vespa/searchlib/common/bitvector.h> +#include <variant> + +namespace search::queryeval::test { + +using search::attribute::Config; +using search::index::Schema; + +vespalib::string to_string(const Config& attr_config); + +class FieldConfig { +private: + std::variant<Config, Schema::IndexField> _cfg; + +public: + FieldConfig(const Config& attr_cfg_in) : _cfg(attr_cfg_in) {} + FieldConfig(const Schema::IndexField& index_cfg_in) : _cfg(index_cfg_in) {} + bool is_attr() const { return _cfg.index() == 0; } + const Config& attr_cfg() const { return std::get<0>(_cfg); } + Schema index_cfg() const { + Schema res; + res.addIndexField(std::get<1>(_cfg)); + return res; + } + vespalib::string to_string() const { + return is_attr() ? search::queryeval::test::to_string(attr_cfg()) : "diskindex"; + } +}; + +enum class QueryOperator { + Term, + In, + WeightedSet, + DotProduct, + And, + Or +}; + +vespalib::string to_string(QueryOperator query_op); + +struct HitSpec { + uint32_t term_value; + uint32_t num_hits; + HitSpec(uint32_t term_value_in, uint32_t num_hits_in) : term_value(term_value_in), num_hits(num_hits_in) {} +}; + +using TermVector = std::vector<uint32_t>; + +class HitSpecs { +private: + std::vector<HitSpec> _specs; + uint32_t _next_term_value; + +public: + HitSpecs(uint32_t first_term_value) + : _specs(), _next_term_value(first_term_value) + { + } + TermVector add(uint32_t num_terms, uint32_t hits_per_term) { + TermVector res; + for (uint32_t i = 0; i < num_terms; ++i) { + uint32_t term_value = _next_term_value++; + _specs.push_back({term_value, hits_per_term}); + res.push_back(term_value); + } + return res; + } + size_t size() const { return _specs.size(); } + auto begin() const { return _specs.begin(); } + auto end() const { return _specs.end(); } +}; + +BitVector::UP random_docids(uint32_t docid_limit, uint32_t count); + +} diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp index 5134ca575ca..202ba8c180e 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp @@ -1,232 +1,27 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "benchmark_searchable.h" -#include "disk_index_builder.h" -#include <vespa/searchcommon/attribute/config.h> -#include <vespa/searchcommon/attribute/iattributecontext.h> -#include <vespa/searchlib/attribute/attribute_blueprint_factory.h> -#include <vespa/searchlib/attribute/attributefactory.h> -#include <vespa/searchlib/attribute/attributevector.h> -#include <vespa/searchlib/attribute/integerbase.h> -#include <vespa/searchlib/attribute/stringbase.h> -#include <vespa/searchlib/diskindex/diskindex.h> +#include "benchmark_blueprint_factory.h" +#include "common.h" #include <vespa/searchlib/fef/matchdata.h> -#include <vespa/searchlib/index/docidandfeatures.h> -#include <vespa/searchlib/query/tree/integer_term_vector.h> -#include <vespa/searchlib/query/tree/node.h> -#include <vespa/searchlib/query/tree/simplequery.h> #include <vespa/searchlib/queryeval/blueprint.h> -#include <vespa/searchlib/queryeval/fake_requestcontext.h> -#include <vespa/searchlib/queryeval/field_spec.h> -#include <vespa/searchlib/queryeval/intermediate_blueprints.h> -#include <vespa/searchlib/test/mock_attribute_context.h> #include <vespa/vespalib/gtest/gtest.h> #include <vespa/vespalib/util/benchmark_timer.h> #include <cmath> #include <numeric> -#include <random> #include <vector> -using namespace search::attribute::test; using namespace search::attribute; using namespace search::fef; -using namespace search::query; +using namespace search::queryeval::test; using namespace search::queryeval; using namespace search; using namespace vespalib; -using search::index::DocIdAndFeatures; using search::index::Schema; -using search::queryeval::test::BenchmarkSearchable; -using search::queryeval::test::DiskIndexBuilder; -// TODO: Re-seed for each benchmark setup -constexpr uint32_t default_seed = 1234; -std::mt19937 gen(default_seed); const vespalib::string field_name = "myfield"; -const vespalib::string index_dir = "indexdir"; double budget_sec = 1.0; -BitVector::UP -random_docids(uint32_t docid_limit, uint32_t count) -{ - auto res = BitVector::create(docid_limit); - if ((count + 1) == docid_limit) { - res->notSelf(); - res->clearBit(0); - return res; - } - uint32_t docids_left = count; - // Bit 0 is never set since it is reserved as docid 0. - // All other docids have equal probability to be set. - for (uint32_t docid = 1; docid < docid_limit; ++docid) { - std::uniform_int_distribution<uint32_t> distr(0, docid_limit - docid - 1); - if (distr(gen) < docids_left) { - res->setBit(docid); - --docids_left; - } - } - res->invalidateCachedCount(); - assert(res->countTrueBits() == count); - return res; -} - -struct HitSpec { - uint32_t term_value; - uint32_t num_hits; - HitSpec(uint32_t term_value_in, uint32_t num_hits_in) : term_value(term_value_in), num_hits(num_hits_in) {} -}; - -namespace benchmark { -using TermVector = std::vector<uint32_t>; -} - -class HitSpecs { -private: - std::vector<HitSpec> _specs; - uint32_t _next_term_value; - -public: - HitSpecs(uint32_t first_term_value) - : _specs(), _next_term_value(first_term_value) - { - } - benchmark::TermVector add(uint32_t num_terms, uint32_t hits_per_term) { - benchmark::TermVector res; - for (uint32_t i = 0; i < num_terms; ++i) { - uint32_t term_value = _next_term_value++; - _specs.push_back({term_value, hits_per_term}); - res.push_back(term_value); - } - return res; - } - size_t size() const { return _specs.size(); } - auto begin() const { return _specs.begin(); } - auto end() const { return _specs.end(); } -}; - -vespalib::string -to_string(const Config& attr_config) -{ - std::ostringstream oss; - auto col_type = attr_config.collectionType(); - auto basic_type = attr_config.basicType(); - if (col_type == CollectionType::SINGLE) { - oss << basic_type.asString(); - } else { - oss << col_type.asString() << "<" << basic_type.asString() << ">"; - } - if (attr_config.fastSearch()) { - oss << "(fs)"; - } - return oss.str(); -} - -class FieldConfig { -private: - std::variant<Config, Schema::IndexField> _cfg; - -public: - FieldConfig(const Config& attr_cfg_in) : _cfg(attr_cfg_in) {} - FieldConfig(const Schema::IndexField& index_cfg_in) : _cfg(index_cfg_in) {} - bool is_attr() const { return _cfg.index() == 0; } - const Config& attr_cfg() const { return std::get<0>(_cfg); } - Schema index_cfg() const { - Schema res; - res.addIndexField(std::get<1>(_cfg)); - return res; - } - vespalib::string to_string() const { - return is_attr() ? ::to_string(attr_cfg()) : "diskindex"; - } -}; - -template <typename AttributeType, bool is_string, bool is_multivalue> -void -populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hit_specs) -{ - for (auto spec : hit_specs) { - auto docids = random_docids(docid_limit, spec.num_hits); - docids->foreach_truebit([&](uint32_t docid) { - if constexpr (is_string) { - if constexpr (is_multivalue) { - attr.append(docid, std::to_string(spec.term_value), 1); - } else { - attr.update(docid, std::to_string(spec.term_value)); - } - } else { - if constexpr (is_multivalue) { - attr.append(docid, spec.term_value, 1); - } else { - attr.update(docid, spec.term_value); - } - } - }); - } -} - -AttributeVector::SP -make_attribute(const Config& cfg, uint32_t num_docs, const HitSpecs& hit_specs) -{ - auto attr = AttributeFactory::createAttribute(field_name, cfg); - attr->addReservedDoc(); - attr->addDocs(num_docs); - uint32_t docid_limit = attr->getNumDocs(); - assert(docid_limit == (num_docs + 1)); - bool is_multivalue = cfg.collectionType() != CollectionType::SINGLE; - if (attr->isStringType()) { - auto& real = dynamic_cast<StringAttribute&>(*attr); - if (is_multivalue) { - populate_attribute<StringAttribute, true, true>(real, docid_limit, hit_specs); - } else { - populate_attribute<StringAttribute, true, false>(real, docid_limit, hit_specs); - } - } else { - auto& real = dynamic_cast<IntegerAttribute&>(*attr); - if (is_multivalue) { - populate_attribute<IntegerAttribute, false, true>(real, docid_limit, hit_specs); - } else { - populate_attribute<IntegerAttribute, false, false>(real, docid_limit, hit_specs); - } - } - attr->commit(true); - return attr; -} - -class AttributeSearchable : public BenchmarkSearchable { -private: - std::unique_ptr<MockAttributeContext> _attr_ctx; - -public: - AttributeSearchable(std::unique_ptr<MockAttributeContext> attr_ctx) : _attr_ctx(std::move(attr_ctx)) {} - std::unique_ptr<Blueprint> create_blueprint(const FieldSpec& field_spec, - const search::query::Node& term) override { - AttributeBlueprintFactory factory; - FakeRequestContext req_ctx(_attr_ctx.get()); - return factory.createBlueprint(req_ctx, field_spec, term); - } -}; - -std::unique_ptr<BenchmarkSearchable> -make_searchable(const FieldConfig& cfg, uint32_t num_docs, const HitSpecs& hit_specs) -{ - if (cfg.is_attr()) { - auto attr = make_attribute(cfg.attr_cfg(), num_docs, hit_specs); - auto ctx = std::make_unique<MockAttributeContext>(); - ctx->add(std::move(attr)); - return std::make_unique<AttributeSearchable>(std::move(ctx)); - } else { - uint32_t docid_limit = num_docs + 1; - DiskIndexBuilder builder(cfg.index_cfg(), index_dir, docid_limit, hit_specs.size()); - for (auto spec : hit_specs) { - // TODO: make number of occurrences configurable. - uint32_t num_occs = 1; - builder.add_word(std::to_string(spec.term_value), *random_docids(docid_limit, spec.num_hits), num_occs); - } - return builder.build(); - } -} - struct BenchmarkResult { double time_ms; uint32_t seeks; @@ -427,102 +222,12 @@ benchmark_search(Blueprint::UP blueprint, uint32_t docid_limit, bool strict_cont } } -Blueprint::UP -make_leaf_blueprint(const Node& node, BenchmarkSearchable& searchable, uint32_t docid_limit) -{ - auto blueprint = searchable.create_blueprint(FieldSpec(field_name, 0, 0), node); - assert(blueprint.get()); - blueprint->setDocIdLimit(docid_limit); - blueprint->update_flow_stats(docid_limit); - return blueprint; -} - -enum class QueryOperator { - Term, - In, - WeightedSet, - DotProduct, - And, - Or -}; - -vespalib::string -to_string(QueryOperator query_op) -{ - switch (query_op) { - case QueryOperator::Term: return "Term"; - case QueryOperator::In: return "In"; - case QueryOperator::WeightedSet: return "WeightedSet"; - case QueryOperator::DotProduct: return "DotProduct"; - case QueryOperator::And: return "And"; - case QueryOperator::Or: return "Or"; - } - return "unknown"; -} - vespalib::string to_string(bool val) { return val ? "true" : "false"; } -std::unique_ptr<Node> -make_query_node(QueryOperator query_op, const benchmark::TermVector& terms) -{ - if (query_op == QueryOperator::Term) { - assert(terms.size() == 1); - return std::make_unique<SimpleStringTerm>(std::to_string(terms[0]), field_name, 0, Weight(1)); - } else if (query_op == QueryOperator::In) { - auto termv = std::make_unique<IntegerTermVector>(terms.size()); - for (auto term : terms) { - termv->addTerm(term); - } - return std::make_unique<SimpleInTerm>(std::move(termv), MultiTerm::Type::INTEGER, field_name, 0, Weight(1)); - } else if (query_op == QueryOperator::WeightedSet) { - auto res = std::make_unique<SimpleWeightedSetTerm>(terms.size(), field_name, 0, Weight(1)); - for (auto term : terms) { - res->addTerm(term, Weight(1)); - } - return res; - } else if (query_op == QueryOperator::DotProduct) { - auto res = std::make_unique<SimpleDotProduct>(terms.size(), field_name, 0, Weight(1)); - for (auto term : terms) { - res->addTerm(term, Weight(1)); - } - return res; - } - return {}; -} - -template <typename BlueprintType> -Blueprint::UP -make_intermediate_blueprint(BenchmarkSearchable& searchable, const benchmark::TermVector& terms, uint32_t docid_limit) -{ - auto blueprint = std::make_unique<BlueprintType>(); - for (auto term : terms) { - SimpleStringTerm sterm(std::to_string(term), field_name, 0, Weight(1)); - auto child = make_leaf_blueprint(sterm, searchable, docid_limit); - blueprint->addChild(std::move(child)); - } - blueprint->setDocIdLimit(docid_limit); - blueprint->update_flow_stats(docid_limit); - return blueprint; -} - -BenchmarkResult -run_benchmark(BenchmarkSearchable& searchable, QueryOperator query_op, const benchmark::TermVector& terms, uint32_t docid_limit, bool strict_context, bool force_strict, double filter_hit_ratio) -{ - if (query_op == QueryOperator::And) { - return benchmark_search(make_intermediate_blueprint<AndBlueprint>(searchable, terms, docid_limit), docid_limit, strict_context, force_strict, filter_hit_ratio); - } else if (query_op == QueryOperator::Or) { - return benchmark_search(make_intermediate_blueprint<OrBlueprint>(searchable, terms, docid_limit), docid_limit, strict_context, force_strict, filter_hit_ratio); - } else { - auto query_node = make_query_node(query_op, terms); - auto blueprint = make_leaf_blueprint(*query_node, searchable, docid_limit); - return benchmark_search(std::move(blueprint), docid_limit, strict_context, force_strict, filter_hit_ratio); - } -} - void print_result_header() { @@ -530,10 +235,10 @@ print_result_header() } void -print_result(const BenchmarkResult& res, const benchmark::TermVector& terms, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs) +print_result(const BenchmarkResult& res, uint32_t children, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs) { std::cout << std::fixed << std::setprecision(4) - << "| " << std::setw(4) << terms.size() + << "| " << std::setw(4) << children << " | " << std::setw(7) << filter_hit_ratio << " | " << std::setw(7) << op_hit_ratio << " | " << std::setw(7) << ((double) res.hits / (double) num_docs) @@ -577,7 +282,7 @@ struct BenchmarkCase { force_strict(false) {} vespalib::string to_string() const { - return "op=" + ::to_string(query_op) + ", cfg=" + field_cfg.to_string() + + return "op=" + search::queryeval::test::to_string(query_op) + ", cfg=" + field_cfg.to_string() + ", strict_context=" + ::to_string(strict_context) + (force_strict ? (", force_strict=" + ::to_string(force_strict)) : ""); } }; @@ -714,18 +419,6 @@ struct BenchmarkSetup { BenchmarkSetup::~BenchmarkSetup() = default; -uint32_t -calc_hits_per_term(uint32_t num_docs, double op_hit_ratio, uint32_t children, QueryOperator query_op) -{ - if (query_op == QueryOperator::And) { - double child_hit_ratio = std::pow(op_hit_ratio, (1.0/(double)children)); - return num_docs * child_hit_ratio; - } else { - uint32_t op_num_hits = num_docs * op_hit_ratio; - return op_num_hits / children; - } -} - BenchmarkCaseResult run_benchmark_case(const BenchmarkCaseSetup& setup) { @@ -734,16 +427,13 @@ run_benchmark_case(const BenchmarkCaseSetup& setup) print_result_header(); for (double op_hit_ratio : setup.op_hit_ratios) { for (uint32_t children : setup.child_counts) { - uint32_t hits_per_term = calc_hits_per_term(setup.num_docs, op_hit_ratio, children, setup.bcase.query_op); - HitSpecs hit_specs(55555); - hit_specs.add(setup.default_values_per_document, setup.num_docs); - auto terms = hit_specs.add(children, hits_per_term); - auto searchable = make_searchable(setup.bcase.field_cfg, setup.num_docs, hit_specs); + auto factory = make_blueprint_factory(setup.bcase.field_cfg, setup.bcase.query_op, + setup.num_docs, setup.default_values_per_document, + op_hit_ratio, children); for (double filter_hit_ratio : setup.filter_hit_ratios) { if (filter_hit_ratio * setup.filter_crossover_factor <= op_hit_ratio) { - auto res = run_benchmark(*searchable, setup.bcase.query_op, terms, setup.num_docs + 1, - setup.bcase.strict_context, setup.bcase.force_strict, filter_hit_ratio); - print_result(res, terms, op_hit_ratio, filter_hit_ratio, setup.num_docs); + auto res = benchmark_search(factory->make_blueprint(), setup.num_docs + 1, setup.bcase.strict_context, setup.bcase.force_strict, filter_hit_ratio); + print_result(res, children, op_hit_ratio, filter_hit_ratio, setup.num_docs); result.add(res); } } |