aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2024-03-19 16:27:22 +0100
committer GitHub <noreply@github.com> 2024-03-19 16:27:22 +0100
commit 45d8663b069aaeba8d00d037a7cafdfaec95ca54 (patch)
tree 70618331c488c1cbf43e5448f755b21f5efeaa8b /searchlib/src
parent f48096c672bc54ae962a642353d34fdb3067ddff (diff)
parent 49f89adc8dd08e2f322fc2313064042a03b9b1bb (diff)
Merge pull request #30681 from vespa-engine/geirst/iterator-benchmark-blueprint-factory
Refactor iterator benchmark to have a factory for creating blueprints
Diffstat (limited to 'searchlib/src')
-rw-r--r-- searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt 3
-rw-r--r-- searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp 105
-rw-r--r-- searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h 26
-rw-r--r-- searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp 181
-rw-r--r-- searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h 27
-rw-r--r-- searchlib/src/tests/queryeval/iterator_benchmark/common.cpp 74
-rw-r--r-- searchlib/src/tests/queryeval/iterator_benchmark/common.h 81
-rw-r--r-- searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp 332
8 files changed, 508 insertions, 321 deletions
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt b/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt
index 34c5928c123..872fb4ca6ca 100644
--- a/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/CMakeLists.txt
@@ -1,6 +1,9 @@
# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
vespa_add_executable(searchlib_iterator_benchmark_test_app TEST
SOURCES
+ attribute_ctx_builder.cpp
+ benchmark_blueprint_factory.cpp
+ common.cpp
disk_index_builder.cpp
iterator_benchmark_test.cpp
DEPENDS
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp
new file mode 100644
index 00000000000..e776b6a9379
--- /dev/null
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.cpp
@@ -0,0 +1,105 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "attribute_ctx_builder.h"
+#include <vespa/searchlib/attribute/attribute_blueprint_factory.h>
+#include <vespa/searchlib/attribute/attributefactory.h>
+#include <vespa/searchlib/attribute/attributevector.h>
+#include <vespa/searchlib/attribute/integerbase.h>
+#include <vespa/searchlib/attribute/stringbase.h>
+#include <vespa/searchlib/queryeval/blueprint.h>
+#include <vespa/searchlib/queryeval/fake_requestcontext.h>
+
+using namespace search::attribute;
+using namespace search::attribute::test;
+
+namespace search::queryeval::test {
+
+namespace {
+
+/**
+ * Populates the given attribute with the terms described by hit_specs.
+ *
+ * For each hit spec, random_docids() picks spec.num_hits documents, and the term value
+ * of the spec is written to each of them: append(docid, value, 1) is used for multi-value
+ * attributes (the last argument is presumably the weight - TODO confirm) and
+ * update(docid, value) for single-value attributes.
+ * String attributes get the decimal string representation of the term value.
+ */
+template <typename AttributeType, bool is_string, bool is_multivalue>
+void
+populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hit_specs)
+{
+    for (const auto& spec : hit_specs) {
+        auto docids = random_docids(docid_limit, spec.num_hits);
+        if constexpr (is_string) {
+            // The string form of the term is the same for all documents of this spec,
+            // so build it once here instead of once per document inside the callback.
+            const auto term = std::to_string(spec.term_value);
+            docids->foreach_truebit([&](uint32_t docid) {
+                if constexpr (is_multivalue) {
+                    attr.append(docid, term, 1);
+                } else {
+                    attr.update(docid, term);
+                }
+            });
+        } else {
+            docids->foreach_truebit([&](uint32_t docid) {
+                if constexpr (is_multivalue) {
+                    attr.append(docid, spec.term_value, 1);
+                } else {
+                    attr.update(docid, spec.term_value);
+                }
+            });
+        }
+    }
+}
+
+/**
+ * Creates an attribute vector named field_name from the given config, with room for
+ * num_docs documents in addition to the reserved document, and fills it according to hit_specs.
+ *
+ * String attributes are populated through StringAttribute; everything else is
+ * treated as an IntegerAttribute. The attribute is committed before it is returned.
+ */
+AttributeVector::SP
+make_attribute(const Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs)
+{
+    auto result = AttributeFactory::createAttribute(field_name, cfg);
+    result->addReservedDoc();
+    result->addDocs(num_docs);
+    const uint32_t docid_limit = result->getNumDocs();
+    assert(docid_limit == (num_docs + 1));
+    const bool single_value = (cfg.collectionType() == CollectionType::SINGLE);
+    if (!result->isStringType()) {
+        auto& integers = dynamic_cast<IntegerAttribute&>(*result);
+        if (single_value) {
+            populate_attribute<IntegerAttribute, false, false>(integers, docid_limit, hit_specs);
+        } else {
+            populate_attribute<IntegerAttribute, false, true>(integers, docid_limit, hit_specs);
+        }
+    } else {
+        auto& strings = dynamic_cast<StringAttribute&>(*result);
+        if (single_value) {
+            populate_attribute<StringAttribute, true, false>(strings, docid_limit, hit_specs);
+        } else {
+            populate_attribute<StringAttribute, true, true>(strings, docid_limit, hit_specs);
+        }
+    }
+    result->commit(true);
+    return result;
+}
+
+/**
+ * BenchmarkSearchable that creates blueprints over the attributes
+ * registered in the MockAttributeContext it owns.
+ */
+class AttributeSearchable final : public BenchmarkSearchable {
+private:
+    std::unique_ptr<MockAttributeContext> _attr_ctx;
+
+public:
+    // explicit: taking ownership of a context should never happen through an implicit conversion.
+    explicit AttributeSearchable(std::unique_ptr<MockAttributeContext> attr_ctx) : _attr_ctx(std::move(attr_ctx)) {}
+    /**
+     * Creates a blueprint for the given query term over the given field, using an
+     * AttributeBlueprintFactory and a request context wrapping the owned attribute context.
+     */
+    std::unique_ptr<Blueprint> create_blueprint(const FieldSpec& field_spec,
+                                                const search::query::Node& term) override {
+        AttributeBlueprintFactory factory;
+        FakeRequestContext req_ctx(_attr_ctx.get());
+        return factory.createBlueprint(req_ctx, field_spec, term);
+    }
+};
+
+}
+
+// Starts out with an empty mock attribute context; attributes are registered through add().
+AttributeContextBuilder::AttributeContextBuilder()
+    : _ctx{std::make_unique<MockAttributeContext>()}
+{
+}
+
+// Creates and populates an attribute (see make_attribute()) and registers it in the context being built.
+void
+AttributeContextBuilder::add(const search::attribute::Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs)
+{
+    _ctx->add(make_attribute(cfg, field_name, num_docs, hit_specs));
+}
+
+/**
+ * Hands the attribute context built so far over to a new searchable and returns it.
+ *
+ * The builder is re-armed with a fresh, empty context afterwards, so that calling
+ * add() after build() does not dereference a moved-from (null) pointer.
+ */
+std::unique_ptr<BenchmarkSearchable>
+AttributeContextBuilder::build()
+{
+    auto ctx = std::move(_ctx);
+    _ctx = std::make_unique<MockAttributeContext>();
+    return std::make_unique<AttributeSearchable>(std::move(ctx));
+}
+
+}
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h
new file mode 100644
index 00000000000..e4a58c91668
--- /dev/null
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/attribute_ctx_builder.h
@@ -0,0 +1,26 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "benchmark_searchable.h"
+#include "common.h"
+#include <vespa/searchcommon/attribute/config.h>
+#include <vespa/searchlib/test/mock_attribute_context.h>
+#include <memory>
+
+namespace search::queryeval::test {
+
+/**
+ * Class used to build attribute(s), used for benchmarking.
+ *
+ * Attributes are registered in a mock attribute context with add(), and build()
+ * wraps that context in a BenchmarkSearchable that creates blueprints over them.
+ */
+class AttributeContextBuilder {
+private:
+ // The attribute context that add() registers attributes in.
+ std::unique_ptr<search::attribute::test::MockAttributeContext> _ctx;
+
+public:
+ AttributeContextBuilder();
+ // Creates an attribute named field_name with num_docs documents (plus the reserved document),
+ // populates it according to hit_specs and registers it in the context.
+ void add(const search::attribute::Config& cfg, vespalib::stringref field_name, uint32_t num_docs, const HitSpecs& hit_specs);
+ // Transfers the context built so far into the returned searchable.
+ std::unique_ptr<BenchmarkSearchable> build();
+};
+
+}
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp
new file mode 100644
index 00000000000..15690fd71d5
--- /dev/null
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.cpp
@@ -0,0 +1,181 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "attribute_ctx_builder.h"
+#include "benchmark_blueprint_factory.h"
+#include "benchmark_searchable.h"
+#include "disk_index_builder.h"
+#include <vespa/searchlib/diskindex/diskindex.h>
+#include <vespa/searchlib/query/tree/integer_term_vector.h>
+#include <vespa/searchlib/query/tree/node.h>
+#include <vespa/searchlib/query/tree/simplequery.h>
+#include <vespa/searchlib/queryeval/blueprint.h>
+#include <vespa/searchlib/queryeval/intermediate_blueprints.h>
+#include <cmath>
+
+using search::query::IntegerTermVector;
+using search::query::MultiTerm;
+using search::query::Node;
+using search::query::SimpleDotProduct;
+using search::query::SimpleInTerm;
+using search::query::SimpleStringTerm;
+using search::query::SimpleWeightedSetTerm;
+using search::query::Weight;
+
+namespace search::queryeval::test {
+
+namespace {
+
+// Name of the single field that is populated and queried by the blueprints created in this file.
+const vespalib::string field_name = "myfield";
+// Directory handed to DiskIndexBuilder when the field is a disk index field.
+const vespalib::string index_dir = "indexdir";
+
+/**
+ * Calculates how many documents each child term should hit in order for the
+ * query operator as a whole to hit roughly op_hit_ratio of the num_docs documents.
+ *
+ * And:   every child gets the hit ratio op_hit_ratio^(1/children), i.e. the ratio that
+ *        gives op_hit_ratio when multiplied together 'children' times.
+ * Other: the total number of operator hits is split evenly among the children.
+ *
+ * Returns 0 when there are no children (instead of dividing by zero).
+ * Fractional hit counts are truncated.
+ */
+uint32_t
+calc_hits_per_term(uint32_t num_docs, double op_hit_ratio, uint32_t children, QueryOperator query_op)
+{
+    if (children == 0) {
+        return 0;
+    }
+    if (query_op == QueryOperator::And) {
+        const double child_hit_ratio = std::pow(op_hit_ratio, 1.0 / static_cast<double>(children));
+        return static_cast<uint32_t>(num_docs * child_hit_ratio);
+    }
+    const auto op_num_hits = static_cast<uint32_t>(num_docs * op_hit_ratio);
+    return op_num_hits / children;
+}
+
+/**
+ * Creates the searchable (attribute or disk index, depending on cfg) for the field
+ * field_name, populated with num_docs documents according to hit_specs.
+ *
+ * For a disk index, one word is added per hit spec: the decimal string of the
+ * term value, occurring in spec.num_hits randomly chosen documents.
+ */
+std::unique_ptr<BenchmarkSearchable>
+make_searchable(const FieldConfig& cfg, uint32_t num_docs, const HitSpecs& hit_specs)
+{
+    if (!cfg.is_attr()) {
+        const uint32_t docid_limit = num_docs + 1;
+        // TODO: make number of occurrences configurable.
+        const uint32_t occurrences = 1;
+        DiskIndexBuilder index_builder(cfg.index_cfg(), index_dir, docid_limit, hit_specs.size());
+        for (const auto& hit_spec : hit_specs) {
+            auto docids = random_docids(docid_limit, hit_spec.num_hits);
+            index_builder.add_word(std::to_string(hit_spec.term_value), *docids, occurrences);
+        }
+        return index_builder.build();
+    }
+    AttributeContextBuilder attr_builder;
+    attr_builder.add(cfg.attr_cfg(), field_name, num_docs, hit_specs);
+    return attr_builder.build();
+}
+
+/**
+ * Creates the query tree node for a leaf query operator searching field_name for the given terms.
+ *
+ * Term requires exactly one term. In, WeightedSet and DotProduct get all the terms
+ * (the latter two with weight 1 per term). An empty pointer is returned for
+ * operators that are not handled here (And, Or).
+ */
+std::unique_ptr<Node>
+make_query_node(QueryOperator query_op, const TermVector& terms)
+{
+    switch (query_op) {
+    case QueryOperator::Term: {
+        assert(terms.size() == 1);
+        return std::make_unique<SimpleStringTerm>(std::to_string(terms[0]), field_name, 0, Weight(1));
+    }
+    case QueryOperator::In: {
+        auto values = std::make_unique<IntegerTermVector>(terms.size());
+        for (auto value : terms) {
+            values->addTerm(value);
+        }
+        return std::make_unique<SimpleInTerm>(std::move(values), MultiTerm::Type::INTEGER, field_name, 0, Weight(1));
+    }
+    case QueryOperator::WeightedSet: {
+        auto wset = std::make_unique<SimpleWeightedSetTerm>(terms.size(), field_name, 0, Weight(1));
+        for (auto value : terms) {
+            wset->addTerm(value, Weight(1));
+        }
+        return wset;
+    }
+    case QueryOperator::DotProduct: {
+        auto dot_product = std::make_unique<SimpleDotProduct>(terms.size(), field_name, 0, Weight(1));
+        for (auto value : terms) {
+            dot_product->addTerm(value, Weight(1));
+        }
+        return dot_product;
+    }
+    default:
+        break;
+    }
+    return {};
+}
+
+/**
+ * Lets the searchable create a blueprint for the given query node over field_name,
+ * and prepares it by setting the docid limit and updating its flow stats.
+ */
+Blueprint::UP
+make_leaf_blueprint(const Node& node, BenchmarkSearchable& searchable, uint32_t docid_limit)
+{
+    const FieldSpec field_spec(field_name, 0, 0);
+    auto leaf = searchable.create_blueprint(field_spec, node);
+    assert(leaf.get() != nullptr);
+    leaf->setDocIdLimit(docid_limit);
+    leaf->update_flow_stats(docid_limit);
+    return leaf;
+}
+
+/**
+ * Creates an intermediate blueprint of the given type with one leaf child per term.
+ *
+ * Each child is a string term blueprint over field_name (see make_leaf_blueprint()).
+ * The docid limit and flow stats of the parent are set after all children are added.
+ */
+template <typename BlueprintType>
+Blueprint::UP
+make_intermediate_blueprint(BenchmarkSearchable& searchable, const TermVector& terms, uint32_t docid_limit)
+{
+    auto parent = std::make_unique<BlueprintType>();
+    for (const uint32_t term_value : terms) {
+        const SimpleStringTerm child_term(std::to_string(term_value), field_name, 0, Weight(1));
+        parent->addChild(make_leaf_blueprint(child_term, searchable, docid_limit));
+    }
+    parent->setDocIdLimit(docid_limit);
+    parent->update_flow_stats(docid_limit);
+    return parent;
+}
+
+/**
+ * Creates the blueprint for the given query operator over the given terms.
+ *
+ * And / Or become intermediate blueprints with one string term child per term.
+ * All other operators become a single leaf blueprint created from the query node
+ * returned by make_query_node().
+ */
+Blueprint::UP
+make_blueprint_helper(BenchmarkSearchable& searchable, QueryOperator query_op, const TermVector& terms, uint32_t docid_limit)
+{
+    if (query_op == QueryOperator::And) {
+        return make_intermediate_blueprint<AndBlueprint>(searchable, terms, docid_limit);
+    }
+    if (query_op == QueryOperator::Or) {
+        return make_intermediate_blueprint<OrBlueprint>(searchable, terms, docid_limit);
+    }
+    auto query_node = make_query_node(query_op, terms);
+    // make_query_node() returns an empty pointer for operators it cannot build a node for;
+    // catch that here instead of dereferencing a null pointer below.
+    assert(query_node.get() != nullptr);
+    return make_leaf_blueprint(*query_node, searchable, docid_limit);
+}
+
+/**
+ * Factory for creating a Blueprint for a given benchmark setup.
+ *
+ * This populates an attribute or disk index field such that the query operator hits
+ * the given ratio of the total document corpus.
+ *
+ * The field is populated once, in the constructor. Each call to make_blueprint()
+ * creates a new blueprint over that same populated field.
+ */
+class MyFactory : public BenchmarkBlueprintFactory {
+private:
+ // The query operator to create blueprints for.
+ QueryOperator _query_op;
+ // Number of documents + 1.
+ uint32_t _docid_limit;
+ // The term values the query operator searches for.
+ TermVector _terms;
+ // The populated attribute or disk index field.
+ std::unique_ptr<BenchmarkSearchable> _searchable;
+
+public:
+ MyFactory(const FieldConfig& field_cfg, QueryOperator query_op,
+ uint32_t num_docs, uint32_t default_values_per_document,
+ double op_hit_ratio, uint32_t children);
+
+ std::unique_ptr<Blueprint> make_blueprint() override;
+};
+
+/**
+ * Populates the field described by field_cfg with num_docs documents.
+ *
+ * The field first gets default_values_per_document filler terms that each hit num_docs
+ * documents, and then 'children' query terms that each hit the number of documents given
+ * by calc_hits_per_term(). Only the query terms are remembered for make_blueprint().
+ */
+MyFactory::MyFactory(const FieldConfig& field_cfg, QueryOperator query_op,
+                     uint32_t num_docs, uint32_t default_values_per_document,
+                     double op_hit_ratio, uint32_t children)
+    : _query_op(query_op),
+      _docid_limit(num_docs + 1),
+      _terms(),
+      _searchable()
+{
+    // Term values are handed out consecutively, starting from this value.
+    constexpr uint32_t first_term_value = 55555;
+    HitSpecs specs(first_term_value);
+    specs.add(default_values_per_document, num_docs);
+    _terms = specs.add(children, calc_hits_per_term(num_docs, op_hit_ratio, children, query_op));
+    _searchable = make_searchable(field_cfg, num_docs, specs);
+}
+
+// Creates a new blueprint for the configured query operator over the field populated in the constructor.
+std::unique_ptr<Blueprint>
+MyFactory::make_blueprint()
+{
+    BenchmarkSearchable& searchable = *_searchable;
+    return make_blueprint_helper(searchable, _query_op, _terms, _docid_limit);
+}
+
+}
+
+// Creates the factory implementation in this file; all arguments are forwarded to its constructor.
+std::unique_ptr<BenchmarkBlueprintFactory>
+make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op,
+                       uint32_t num_docs, uint32_t default_values_per_document,
+                       double op_hit_ratio, uint32_t children)
+{
+    auto factory = std::make_unique<MyFactory>(field_cfg, query_op,
+                                               num_docs, default_values_per_document,
+                                               op_hit_ratio, children);
+    return factory;
+}
+
+}
+
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h
new file mode 100644
index 00000000000..1459cbfe856
--- /dev/null
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/benchmark_blueprint_factory.h
@@ -0,0 +1,27 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "common.h"
+#include <memory>
+
+namespace search::queryeval { class Blueprint; }
+
+namespace search::queryeval::test {
+
+/**
+ * Interface for creating a Blueprint.
+ *
+ * Instances are created with make_blueprint_factory().
+ */
+class BenchmarkBlueprintFactory {
+public:
+ virtual ~BenchmarkBlueprintFactory() = default;
+ // Creates a blueprint; ownership is passed to the caller.
+ virtual std::unique_ptr<Blueprint> make_blueprint() = 0;
+};
+
+/**
+ * Creates a factory making blueprints for the given query operator over a field
+ * (attribute or disk index, as given by field_cfg) populated with num_docs documents.
+ *
+ * @param default_values_per_document number of extra terms that are added to the field
+ *                                    with num_docs hits each, in addition to the query terms.
+ * @param op_hit_ratio the ratio of the documents the query operator as a whole should hit.
+ * @param children     the number of terms searched for by the query operator.
+ */
+std::unique_ptr<BenchmarkBlueprintFactory>
+make_blueprint_factory(const FieldConfig& field_cfg, QueryOperator query_op,
+ uint32_t num_docs, uint32_t default_values_per_document,
+ double op_hit_ratio, uint32_t children);
+
+}
+
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp
new file mode 100644
index 00000000000..b937f6a2f00
--- /dev/null
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.cpp
@@ -0,0 +1,74 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "common.h"
+#include <random>
+#include <sstream>
+
+using search::attribute::CollectionType;
+
+namespace search::queryeval::test {
+
+/**
+ * Formats an attribute config as "<basic type>" for single-value attributes and as
+ * "<collection type><<basic type>>" otherwise, with "(fs)" appended when fast-search is enabled.
+ */
+vespalib::string
+to_string(const Config& attr_config)
+{
+    auto collection = attr_config.collectionType();
+    auto basic = attr_config.basicType();
+    std::ostringstream out;
+    if (collection != CollectionType::SINGLE) {
+        out << collection.asString() << "<" << basic.asString() << ">";
+    } else {
+        out << basic.asString();
+    }
+    if (attr_config.fastSearch()) {
+        out << "(fs)";
+    }
+    return out.str();
+}
+
+// Returns the name of the query operator, or "unknown" for a value outside the enum.
+vespalib::string
+to_string(QueryOperator query_op)
+{
+    const char* name = "unknown";
+    switch (query_op) {
+    case QueryOperator::Term:        name = "Term"; break;
+    case QueryOperator::In:          name = "In"; break;
+    case QueryOperator::WeightedSet: name = "WeightedSet"; break;
+    case QueryOperator::DotProduct:  name = "DotProduct"; break;
+    case QueryOperator::And:         name = "And"; break;
+    case QueryOperator::Or:          name = "Or"; break;
+    }
+    return name;
+}
+
+namespace {
+
+// TODO: Make seed configurable.
+constexpr uint32_t default_seed = 1234;
+// Generator shared by all calls to random_docids(). It is seeded once, so the
+// docids picked by a call depend on all the calls made before it.
+std::mt19937 gen(default_seed);
+
+}
+
+/**
+ * Returns a bit vector created with docid_limit as size, where 'count' randomly
+ * chosen bits among [1, docid_limit> are set. Bit 0 is never set.
+ *
+ * count must not exceed docid_limit - 1; this is only verified by the assert at the end.
+ * Random numbers are drawn from the shared generator 'gen'.
+ */
+BitVector::UP
+random_docids(uint32_t docid_limit, uint32_t count)
+{
+ auto res = BitVector::create(docid_limit);
+ // Special case: every docid except 0 should be set. No random numbers are drawn here.
+ // NOTE(review): notSelf() presumably inverts the newly created (empty) vector - confirm.
+ if ((count + 1) == docid_limit) {
+ res->notSelf();
+ res->clearBit(0);
+ return res;
+ }
+ uint32_t docids_left = count;
+ // Bit 0 is never set since it is reserved as docid 0.
+ // All other docids have equal probability to be set.
+ for (uint32_t docid = 1; docid < docid_limit; ++docid) {
+ // (docid_limit - docid) candidates remain, including this one. The draw is uniform over
+ // that many values, so this docid is picked with probability docids_left / remaining,
+ // and always picked once the remaining candidates are all needed.
+ std::uniform_int_distribution<uint32_t> distr(0, docid_limit - docid - 1);
+ if (distr(gen) < docids_left) {
+ res->setBit(docid);
+ --docids_left;
+ }
+ }
+ res->invalidateCachedCount();
+ assert(res->countTrueBits() == count);
+ return res;
+}
+
+}
\ No newline at end of file
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/common.h b/searchlib/src/tests/queryeval/iterator_benchmark/common.h
new file mode 100644
index 00000000000..6d890910271
--- /dev/null
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/common.h
@@ -0,0 +1,81 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchcommon/attribute/config.h>
+#include <vespa/searchcommon/common/schema.h>
+#include <vespa/searchlib/common/bitvector.h>
+#include <variant>
+
+namespace search::queryeval::test {
+
+using search::attribute::Config;
+using search::index::Schema;
+
+// Returns a short textual description of an attribute config (basic type, collection type, fast-search).
+vespalib::string to_string(const Config& attr_config);
+
+/**
+ * Config of the benchmarked field: either an attribute config or a disk index field.
+ *
+ * The constructors are intentionally implicit, such that both kinds of config
+ * can be passed directly where a FieldConfig is expected.
+ */
+class FieldConfig {
+private:
+    std::variant<Config, Schema::IndexField> _cfg;
+
+public:
+    FieldConfig(const Config& attr_cfg_in) : _cfg(attr_cfg_in) {}
+    FieldConfig(const Schema::IndexField& index_cfg_in) : _cfg(index_cfg_in) {}
+    // True when this holds an attribute config, false when it holds an index field.
+    bool is_attr() const { return std::holds_alternative<Config>(_cfg); }
+    // Only valid when is_attr() is true.
+    const Config& attr_cfg() const { return std::get<Config>(_cfg); }
+    // Returns a schema containing only the index field. Only valid when is_attr() is false.
+    Schema index_cfg() const {
+        Schema schema;
+        schema.addIndexField(std::get<Schema::IndexField>(_cfg));
+        return schema;
+    }
+    vespalib::string to_string() const {
+        if (is_attr()) {
+            return search::queryeval::test::to_string(attr_cfg());
+        }
+        return "diskindex";
+    }
+};
+
+// The query operators that can be benchmarked.
+enum class QueryOperator {
+ Term,
+ In,
+ WeightedSet,
+ DotProduct,
+ And,
+ Or
+};
+
+// Returns the name of the query operator.
+vespalib::string to_string(QueryOperator query_op);
+
+// A term value and the number of documents in which that term should occur.
+struct HitSpec {
+    uint32_t term_value;
+    uint32_t num_hits;
+    HitSpec(uint32_t term_value_in, uint32_t num_hits_in)
+        : term_value{term_value_in},
+          num_hits{num_hits_in}
+    {}
+};
+
+// The term values of a query.
+using TermVector = std::vector<uint32_t>;
+
+/**
+ * An ordered collection of hit specs, where the term values
+ * are handed out consecutively starting from a given first value.
+ */
+class HitSpecs {
+private:
+    std::vector<HitSpec> _specs;
+    uint32_t _next_term_value;
+
+public:
+    // explicit: a plain integer should not silently convert into a collection of hit specs.
+    explicit HitSpecs(uint32_t first_term_value)
+        : _specs(), _next_term_value(first_term_value)
+    {
+    }
+    /**
+     * Adds num_terms new hit specs, each with hits_per_term hits and the next unused term value.
+     * Returns the term values that were assigned, in the order they were added.
+     */
+    TermVector add(uint32_t num_terms, uint32_t hits_per_term) {
+        TermVector res;
+        res.reserve(num_terms);
+        for (uint32_t i = 0; i < num_terms; ++i) {
+            uint32_t term_value = _next_term_value++;
+            _specs.emplace_back(term_value, hits_per_term);
+            res.push_back(term_value);
+        }
+        return res;
+    }
+    size_t size() const { return _specs.size(); }
+    auto begin() const { return _specs.begin(); }
+    auto end() const { return _specs.end(); }
+};
+
+BitVector::UP random_docids(uint32_t docid_limit, uint32_t count);
+
+}
diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
index 5134ca575ca..202ba8c180e 100644
--- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
+++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp
@@ -1,232 +1,27 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include "benchmark_searchable.h"
-#include "disk_index_builder.h"
-#include <vespa/searchcommon/attribute/config.h>
-#include <vespa/searchcommon/attribute/iattributecontext.h>
-#include <vespa/searchlib/attribute/attribute_blueprint_factory.h>
-#include <vespa/searchlib/attribute/attributefactory.h>
-#include <vespa/searchlib/attribute/attributevector.h>
-#include <vespa/searchlib/attribute/integerbase.h>
-#include <vespa/searchlib/attribute/stringbase.h>
-#include <vespa/searchlib/diskindex/diskindex.h>
+#include "benchmark_blueprint_factory.h"
+#include "common.h"
#include <vespa/searchlib/fef/matchdata.h>
-#include <vespa/searchlib/index/docidandfeatures.h>
-#include <vespa/searchlib/query/tree/integer_term_vector.h>
-#include <vespa/searchlib/query/tree/node.h>
-#include <vespa/searchlib/query/tree/simplequery.h>
#include <vespa/searchlib/queryeval/blueprint.h>
-#include <vespa/searchlib/queryeval/fake_requestcontext.h>
-#include <vespa/searchlib/queryeval/field_spec.h>
-#include <vespa/searchlib/queryeval/intermediate_blueprints.h>
-#include <vespa/searchlib/test/mock_attribute_context.h>
#include <vespa/vespalib/gtest/gtest.h>
#include <vespa/vespalib/util/benchmark_timer.h>
#include <cmath>
#include <numeric>
-#include <random>
#include <vector>
-using namespace search::attribute::test;
using namespace search::attribute;
using namespace search::fef;
-using namespace search::query;
+using namespace search::queryeval::test;
using namespace search::queryeval;
using namespace search;
using namespace vespalib;
-using search::index::DocIdAndFeatures;
using search::index::Schema;
-using search::queryeval::test::BenchmarkSearchable;
-using search::queryeval::test::DiskIndexBuilder;
-// TODO: Re-seed for each benchmark setup
-constexpr uint32_t default_seed = 1234;
-std::mt19937 gen(default_seed);
const vespalib::string field_name = "myfield";
-const vespalib::string index_dir = "indexdir";
double budget_sec = 1.0;
-BitVector::UP
-random_docids(uint32_t docid_limit, uint32_t count)
-{
- auto res = BitVector::create(docid_limit);
- if ((count + 1) == docid_limit) {
- res->notSelf();
- res->clearBit(0);
- return res;
- }
- uint32_t docids_left = count;
- // Bit 0 is never set since it is reserved as docid 0.
- // All other docids have equal probability to be set.
- for (uint32_t docid = 1; docid < docid_limit; ++docid) {
- std::uniform_int_distribution<uint32_t> distr(0, docid_limit - docid - 1);
- if (distr(gen) < docids_left) {
- res->setBit(docid);
- --docids_left;
- }
- }
- res->invalidateCachedCount();
- assert(res->countTrueBits() == count);
- return res;
-}
-
-struct HitSpec {
- uint32_t term_value;
- uint32_t num_hits;
- HitSpec(uint32_t term_value_in, uint32_t num_hits_in) : term_value(term_value_in), num_hits(num_hits_in) {}
-};
-
-namespace benchmark {
-using TermVector = std::vector<uint32_t>;
-}
-
-class HitSpecs {
-private:
- std::vector<HitSpec> _specs;
- uint32_t _next_term_value;
-
-public:
- HitSpecs(uint32_t first_term_value)
- : _specs(), _next_term_value(first_term_value)
- {
- }
- benchmark::TermVector add(uint32_t num_terms, uint32_t hits_per_term) {
- benchmark::TermVector res;
- for (uint32_t i = 0; i < num_terms; ++i) {
- uint32_t term_value = _next_term_value++;
- _specs.push_back({term_value, hits_per_term});
- res.push_back(term_value);
- }
- return res;
- }
- size_t size() const { return _specs.size(); }
- auto begin() const { return _specs.begin(); }
- auto end() const { return _specs.end(); }
-};
-
-vespalib::string
-to_string(const Config& attr_config)
-{
- std::ostringstream oss;
- auto col_type = attr_config.collectionType();
- auto basic_type = attr_config.basicType();
- if (col_type == CollectionType::SINGLE) {
- oss << basic_type.asString();
- } else {
- oss << col_type.asString() << "<" << basic_type.asString() << ">";
- }
- if (attr_config.fastSearch()) {
- oss << "(fs)";
- }
- return oss.str();
-}
-
-class FieldConfig {
-private:
- std::variant<Config, Schema::IndexField> _cfg;
-
-public:
- FieldConfig(const Config& attr_cfg_in) : _cfg(attr_cfg_in) {}
- FieldConfig(const Schema::IndexField& index_cfg_in) : _cfg(index_cfg_in) {}
- bool is_attr() const { return _cfg.index() == 0; }
- const Config& attr_cfg() const { return std::get<0>(_cfg); }
- Schema index_cfg() const {
- Schema res;
- res.addIndexField(std::get<1>(_cfg));
- return res;
- }
- vespalib::string to_string() const {
- return is_attr() ? ::to_string(attr_cfg()) : "diskindex";
- }
-};
-
-template <typename AttributeType, bool is_string, bool is_multivalue>
-void
-populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hit_specs)
-{
- for (auto spec : hit_specs) {
- auto docids = random_docids(docid_limit, spec.num_hits);
- docids->foreach_truebit([&](uint32_t docid) {
- if constexpr (is_string) {
- if constexpr (is_multivalue) {
- attr.append(docid, std::to_string(spec.term_value), 1);
- } else {
- attr.update(docid, std::to_string(spec.term_value));
- }
- } else {
- if constexpr (is_multivalue) {
- attr.append(docid, spec.term_value, 1);
- } else {
- attr.update(docid, spec.term_value);
- }
- }
- });
- }
-}
-
-AttributeVector::SP
-make_attribute(const Config& cfg, uint32_t num_docs, const HitSpecs& hit_specs)
-{
- auto attr = AttributeFactory::createAttribute(field_name, cfg);
- attr->addReservedDoc();
- attr->addDocs(num_docs);
- uint32_t docid_limit = attr->getNumDocs();
- assert(docid_limit == (num_docs + 1));
- bool is_multivalue = cfg.collectionType() != CollectionType::SINGLE;
- if (attr->isStringType()) {
- auto& real = dynamic_cast<StringAttribute&>(*attr);
- if (is_multivalue) {
- populate_attribute<StringAttribute, true, true>(real, docid_limit, hit_specs);
- } else {
- populate_attribute<StringAttribute, true, false>(real, docid_limit, hit_specs);
- }
- } else {
- auto& real = dynamic_cast<IntegerAttribute&>(*attr);
- if (is_multivalue) {
- populate_attribute<IntegerAttribute, false, true>(real, docid_limit, hit_specs);
- } else {
- populate_attribute<IntegerAttribute, false, false>(real, docid_limit, hit_specs);
- }
- }
- attr->commit(true);
- return attr;
-}
-
-class AttributeSearchable : public BenchmarkSearchable {
-private:
- std::unique_ptr<MockAttributeContext> _attr_ctx;
-
-public:
- AttributeSearchable(std::unique_ptr<MockAttributeContext> attr_ctx) : _attr_ctx(std::move(attr_ctx)) {}
- std::unique_ptr<Blueprint> create_blueprint(const FieldSpec& field_spec,
- const search::query::Node& term) override {
- AttributeBlueprintFactory factory;
- FakeRequestContext req_ctx(_attr_ctx.get());
- return factory.createBlueprint(req_ctx, field_spec, term);
- }
-};
-
-std::unique_ptr<BenchmarkSearchable>
-make_searchable(const FieldConfig& cfg, uint32_t num_docs, const HitSpecs& hit_specs)
-{
- if (cfg.is_attr()) {
- auto attr = make_attribute(cfg.attr_cfg(), num_docs, hit_specs);
- auto ctx = std::make_unique<MockAttributeContext>();
- ctx->add(std::move(attr));
- return std::make_unique<AttributeSearchable>(std::move(ctx));
- } else {
- uint32_t docid_limit = num_docs + 1;
- DiskIndexBuilder builder(cfg.index_cfg(), index_dir, docid_limit, hit_specs.size());
- for (auto spec : hit_specs) {
- // TODO: make number of occurrences configurable.
- uint32_t num_occs = 1;
- builder.add_word(std::to_string(spec.term_value), *random_docids(docid_limit, spec.num_hits), num_occs);
- }
- return builder.build();
- }
-}
-
struct BenchmarkResult {
double time_ms;
uint32_t seeks;
@@ -427,102 +222,12 @@ benchmark_search(Blueprint::UP blueprint, uint32_t docid_limit, bool strict_cont
}
}
-Blueprint::UP
-make_leaf_blueprint(const Node& node, BenchmarkSearchable& searchable, uint32_t docid_limit)
-{
- auto blueprint = searchable.create_blueprint(FieldSpec(field_name, 0, 0), node);
- assert(blueprint.get());
- blueprint->setDocIdLimit(docid_limit);
- blueprint->update_flow_stats(docid_limit);
- return blueprint;
-}
-
-enum class QueryOperator {
- Term,
- In,
- WeightedSet,
- DotProduct,
- And,
- Or
-};
-
-vespalib::string
-to_string(QueryOperator query_op)
-{
- switch (query_op) {
- case QueryOperator::Term: return "Term";
- case QueryOperator::In: return "In";
- case QueryOperator::WeightedSet: return "WeightedSet";
- case QueryOperator::DotProduct: return "DotProduct";
- case QueryOperator::And: return "And";
- case QueryOperator::Or: return "Or";
- }
- return "unknown";
-}
-
vespalib::string
to_string(bool val)
{
return val ? "true" : "false";
}
-std::unique_ptr<Node>
-make_query_node(QueryOperator query_op, const benchmark::TermVector& terms)
-{
- if (query_op == QueryOperator::Term) {
- assert(terms.size() == 1);
- return std::make_unique<SimpleStringTerm>(std::to_string(terms[0]), field_name, 0, Weight(1));
- } else if (query_op == QueryOperator::In) {
- auto termv = std::make_unique<IntegerTermVector>(terms.size());
- for (auto term : terms) {
- termv->addTerm(term);
- }
- return std::make_unique<SimpleInTerm>(std::move(termv), MultiTerm::Type::INTEGER, field_name, 0, Weight(1));
- } else if (query_op == QueryOperator::WeightedSet) {
- auto res = std::make_unique<SimpleWeightedSetTerm>(terms.size(), field_name, 0, Weight(1));
- for (auto term : terms) {
- res->addTerm(term, Weight(1));
- }
- return res;
- } else if (query_op == QueryOperator::DotProduct) {
- auto res = std::make_unique<SimpleDotProduct>(terms.size(), field_name, 0, Weight(1));
- for (auto term : terms) {
- res->addTerm(term, Weight(1));
- }
- return res;
- }
- return {};
-}
-
-template <typename BlueprintType>
-Blueprint::UP
-make_intermediate_blueprint(BenchmarkSearchable& searchable, const benchmark::TermVector& terms, uint32_t docid_limit)
-{
- auto blueprint = std::make_unique<BlueprintType>();
- for (auto term : terms) {
- SimpleStringTerm sterm(std::to_string(term), field_name, 0, Weight(1));
- auto child = make_leaf_blueprint(sterm, searchable, docid_limit);
- blueprint->addChild(std::move(child));
- }
- blueprint->setDocIdLimit(docid_limit);
- blueprint->update_flow_stats(docid_limit);
- return blueprint;
-}
-
-BenchmarkResult
-run_benchmark(BenchmarkSearchable& searchable, QueryOperator query_op, const benchmark::TermVector& terms, uint32_t docid_limit, bool strict_context, bool force_strict, double filter_hit_ratio)
-{
- if (query_op == QueryOperator::And) {
- return benchmark_search(make_intermediate_blueprint<AndBlueprint>(searchable, terms, docid_limit), docid_limit, strict_context, force_strict, filter_hit_ratio);
- } else if (query_op == QueryOperator::Or) {
- return benchmark_search(make_intermediate_blueprint<OrBlueprint>(searchable, terms, docid_limit), docid_limit, strict_context, force_strict, filter_hit_ratio);
- } else {
- auto query_node = make_query_node(query_op, terms);
- auto blueprint = make_leaf_blueprint(*query_node, searchable, docid_limit);
- return benchmark_search(std::move(blueprint), docid_limit, strict_context, force_strict, filter_hit_ratio);
- }
-}
-
void
print_result_header()
{
@@ -530,10 +235,10 @@ print_result_header()
}
void
-print_result(const BenchmarkResult& res, const benchmark::TermVector& terms, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs)
+print_result(const BenchmarkResult& res, uint32_t children, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs)
{
std::cout << std::fixed << std::setprecision(4)
- << "| " << std::setw(4) << terms.size()
+ << "| " << std::setw(4) << children
<< " | " << std::setw(7) << filter_hit_ratio
<< " | " << std::setw(7) << op_hit_ratio
<< " | " << std::setw(7) << ((double) res.hits / (double) num_docs)
@@ -577,7 +282,7 @@ struct BenchmarkCase {
force_strict(false)
{}
vespalib::string to_string() const {
- return "op=" + ::to_string(query_op) + ", cfg=" + field_cfg.to_string() +
+ return "op=" + search::queryeval::test::to_string(query_op) + ", cfg=" + field_cfg.to_string() +
", strict_context=" + ::to_string(strict_context) + (force_strict ? (", force_strict=" + ::to_string(force_strict)) : "");
}
};
@@ -714,18 +419,6 @@ struct BenchmarkSetup {
BenchmarkSetup::~BenchmarkSetup() = default;
-uint32_t
-calc_hits_per_term(uint32_t num_docs, double op_hit_ratio, uint32_t children, QueryOperator query_op)
-{
- if (query_op == QueryOperator::And) {
- double child_hit_ratio = std::pow(op_hit_ratio, (1.0/(double)children));
- return num_docs * child_hit_ratio;
- } else {
- uint32_t op_num_hits = num_docs * op_hit_ratio;
- return op_num_hits / children;
- }
-}
-
BenchmarkCaseResult
run_benchmark_case(const BenchmarkCaseSetup& setup)
{
@@ -734,16 +427,13 @@ run_benchmark_case(const BenchmarkCaseSetup& setup)
print_result_header();
for (double op_hit_ratio : setup.op_hit_ratios) {
for (uint32_t children : setup.child_counts) {
- uint32_t hits_per_term = calc_hits_per_term(setup.num_docs, op_hit_ratio, children, setup.bcase.query_op);
- HitSpecs hit_specs(55555);
- hit_specs.add(setup.default_values_per_document, setup.num_docs);
- auto terms = hit_specs.add(children, hits_per_term);
- auto searchable = make_searchable(setup.bcase.field_cfg, setup.num_docs, hit_specs);
+ auto factory = make_blueprint_factory(setup.bcase.field_cfg, setup.bcase.query_op,
+ setup.num_docs, setup.default_values_per_document,
+ op_hit_ratio, children);
for (double filter_hit_ratio : setup.filter_hit_ratios) {
if (filter_hit_ratio * setup.filter_crossover_factor <= op_hit_ratio) {
- auto res = run_benchmark(*searchable, setup.bcase.query_op, terms, setup.num_docs + 1,
- setup.bcase.strict_context, setup.bcase.force_strict, filter_hit_ratio);
- print_result(res, terms, op_hit_ratio, filter_hit_ratio, setup.num_docs);
+ auto res = benchmark_search(factory->make_blueprint(), setup.num_docs + 1, setup.bcase.strict_context, setup.bcase.force_strict, filter_hit_ratio);
+ print_result(res, children, op_hit_ratio, filter_hit_ratio, setup.num_docs);
result.add(res);
}
}