From 727611cc8b092e3b851806b7d1a282d3e738c062 Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Fri, 9 Feb 2024 13:55:15 +0000 Subject: Support AND operator and reduce the time for populating attributes. --- .../iterator_benchmark/iterator_benchmark_test.cpp | 113 +++++++++++++-------- 1 file changed, 72 insertions(+), 41 deletions(-) diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp index 0d4cdaed446..48fd53300b2 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp @@ -38,19 +38,23 @@ const vespalib::string field = "myfield"; double budget_sec = 1.0; using DocidVector = std::vector; -DocidVector +BitVector::UP random_docids(uint32_t docid_limit, uint32_t count) { - std::uniform_int_distribution distr(1, docid_limit - 1); - vespalib::hash_set unique_docids; - while (unique_docids.size() < count) { - unique_docids.insert(distr(gen)); + auto res = BitVector::create(docid_limit); + uint32_t docids_left = count; + // Bit 0 is never set since it is reserved as docid 0. + // All other docids have equal probability to be set. + for (uint32_t docid = 1; docid < docid_limit; ++docid) { + std::uniform_int_distribution distr(0, docid_limit - docid - 1); + if (distr(gen) < docids_left) { + res->setBit(docid); + --docids_left; + } } - DocidVector result; - result.reserve(count); - std::copy_n(unique_docids.begin(), count, std::back_inserter(result)); - std::sort(result.begin(), result.end()); - return result; + res->invalidateCachedCount(); + assert(res->countTrueBits() == count); + return res; } struct HitSpec { @@ -75,10 +79,9 @@ template void populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hit_specs) { - size_t updated = 0; - constexpr uint32_t commit_freq = 10000; for (auto spec : hit_specs) { - for (uint32_t docid : random_docids(docid_limit, spec.num_hits)) { + auto docids = random_docids(docid_limit, spec.num_hits); + docids->foreach_truebit([&](uint32_t docid) { if constexpr (is_string) { if constexpr (is_multivalue) { attr.append(docid, std::to_string(spec.term_value), 1); @@ -92,13 +95,9 @@ populate_attribute(AttributeType& attr, uint32_t docid_limit, const HitSpecs& hi attr.update(docid, spec.term_value); } } - ++updated; - if (updated % commit_freq == 0) { - attr.commit(true); - } - } + }); + attr.commit(true); } - attr.commit(true); } AttributeVector::SP @@ -211,7 +210,7 @@ benchmark_search(Blueprint::UP blueprint, uint32_t docid_limit, bool strict) } Blueprint::UP -make_blueprint(const Node& node, IAttributeContext& attr_ctx, uint32_t docid_limit) +make_leaf_blueprint(const Node& node, IAttributeContext& attr_ctx, uint32_t docid_limit) { FakeRequestContext request_ctx(&attr_ctx); AttributeBlueprintFactory source; @@ -227,6 +226,7 @@ enum class QueryOperator { In, WeightedSet, DotProduct, + And, Or }; @@ -238,6 +238,7 @@ to_string(QueryOperator query_op) case QueryOperator::In: return "In"; case QueryOperator::WeightedSet: return "WeightedSet"; case QueryOperator::DotProduct: return "DotProduct"; + case QueryOperator::And: return "And"; case QueryOperator::Or: return "Or"; } return "unknown"; @@ -251,19 +252,19 @@ make_query_node(QueryOperator query_op, const HitSpecs& hit_specs) return std::make_unique(std::to_string(hit_specs[0].term_value), field, 0, Weight(1)); } else if (query_op == QueryOperator::In) { auto terms = std::make_unique(hit_specs.size()); - for (auto spec: hit_specs) { + for (auto spec : hit_specs) { terms->addTerm(spec.term_value); } return std::make_unique(std::move(terms), MultiTerm::Type::INTEGER, field, 0, Weight(1)); } else if (query_op == QueryOperator::WeightedSet) { auto res = std::make_unique(hit_specs.size(), field, 0, Weight(1)); - for (auto spec: hit_specs) { + for (auto spec : hit_specs) { res->addTerm(spec.term_value, Weight(1)); } return res; } else if (query_op == QueryOperator::DotProduct) { auto res = std::make_unique(hit_specs.size(), field, 0, Weight(1)); - for (auto spec: hit_specs) { + for (auto spec : hit_specs) { res->addTerm(spec.term_value, Weight(1)); } return res; @@ -271,22 +272,31 @@ make_query_node(QueryOperator query_op, const HitSpecs& hit_specs) return {}; } +template +Blueprint::UP +make_intermediate_blueprint(IAttributeContext& attr_ctx, const HitSpecs& hit_specs, uint32_t docid_limit) +{ + auto blueprint = std::make_unique(); + for (auto spec : hit_specs) { + SimpleNumberTerm term(std::to_string(spec.term_value), field, 0, Weight(1)); + auto child = make_leaf_blueprint(term, attr_ctx, docid_limit); + blueprint->addChild(std::move(child)); + } + blueprint->setDocIdLimit(docid_limit); + blueprint->update_flow_stats(docid_limit); + return blueprint; +} + BenchmarkResult run_benchmark(IAttributeContext& attr_ctx, QueryOperator query_op, const HitSpecs& hit_specs, uint32_t docid_limit, bool strict) { - if (query_op == QueryOperator::Or) { - auto blueprint = std::make_unique(); - for (auto spec: hit_specs) { - SimpleNumberTerm term(std::to_string(spec.term_value), field, 0, Weight(1)); - auto child = make_blueprint(term, attr_ctx, docid_limit); - blueprint->addChild(std::move(child)); - } - blueprint->setDocIdLimit(docid_limit); - blueprint->update_flow_stats(docid_limit); - return benchmark_search(std::move(blueprint), docid_limit, strict); + if (query_op == QueryOperator::And) { + return benchmark_search(make_intermediate_blueprint(attr_ctx, hit_specs, docid_limit), docid_limit, strict); + } else if (query_op == QueryOperator::Or) { + return benchmark_search(make_intermediate_blueprint(attr_ctx, hit_specs, docid_limit), docid_limit, strict); } else { auto query_node = make_query_node(query_op, hit_specs); - auto blueprint = make_blueprint(*query_node, attr_ctx, docid_limit); + auto blueprint = make_leaf_blueprint(*query_node, attr_ctx, docid_limit); return benchmark_search(std::move(blueprint), docid_limit, strict); } } @@ -305,10 +315,13 @@ run_benchmark(const Config& cfg, uint32_t num_docs, const HitSpecs& hit_specs, d << ", est=" << std::setw(5) << res.estimate << ", hits=" << std::setw(7) << res.hits << ", seeks=" << std::setw(8) << res.seeks - << ", time_ms:" << std::setw(9) << res.time_ms + << std::setprecision(2) + << ", time_ms:" << std::setw(8) << res.time_ms + << std::setprecision(3) << ", cost=" << std::setw(8) << res.cost + << std::setprecision(2) << ", ns_per_seek=" << std::setw(8) << res.time_per_seek_ns() - << ", ms_per_cost=" << std::setw(8) << res.time_per_cost_ms() << std::endl; + << ", ms_per_cost=" << std::setw(7) << res.time_per_cost_ms() << std::endl; } struct BenchmarkSetup { @@ -334,15 +347,26 @@ struct BenchmarkSetup { ~BenchmarkSetup() {} }; +uint32_t +calc_hits_per_term(uint32_t num_docs, double target_hit_ratio, uint32_t children, QueryOperator query_op) +{ + if (query_op == QueryOperator::And) { + double child_hit_ratio = std::pow(target_hit_ratio, (1.0/(double)children)); + return num_docs * child_hit_ratio; + } else { + uint32_t target_num_hits = num_docs * target_hit_ratio; + return target_num_hits / children; + } +} + void run_benchmarks(const BenchmarkSetup& setup) { std::cout << "-------- run_benchmarks: " << to_string(setup.query_op) << " --------" << std::endl; for (bool strict : setup.strictness) { for (double hit_ratio : setup.hit_ratios) { - uint32_t num_hits = setup.num_docs * hit_ratio; for (uint32_t children : setup.child_counts) { - uint32_t hits_per_term = num_hits / children; + uint32_t hits_per_term = calc_hits_per_term(setup.num_docs, hit_ratio, children, setup.query_op); run_benchmark(setup.attr_cfg, setup.num_docs, make_hit_specs(children, hits_per_term, 55555), hit_ratio, setup.query_op, strict); } } @@ -361,6 +385,7 @@ constexpr uint32_t num_docs = 10'000'000; const std::vector hit_ratios = {0.001, 0.01, 0.1, 0.5}; const std::vector child_counts = {1, 10, 100, 1000}; const Config int32_fs = make_config(BasicType::INT32, CollectionType::SINGLE, true); +const Config int32_array_fs = make_config(BasicType::INT32, CollectionType::ARRAY, true); const Config int32_wset_fs = make_config(BasicType::INT32, CollectionType::WSET, true); TEST(IteratorBenchmark, term_benchmark) @@ -371,13 +396,13 @@ TEST(IteratorBenchmark, term_benchmark) TEST(IteratorBenchmark, in_benchmark) { - BenchmarkSetup setup(num_docs, int32_fs, QueryOperator::In, hit_ratios, child_counts, {true, false}); + BenchmarkSetup setup(num_docs, int32_array_fs, QueryOperator::In, hit_ratios, child_counts, {true, false}); run_benchmarks(setup); } TEST(IteratorBenchmark, weighted_set_benchmark) { - BenchmarkSetup setup(num_docs, int32_fs, QueryOperator::WeightedSet, hit_ratios, child_counts, {true, false}); + BenchmarkSetup setup(num_docs, int32_array_fs, QueryOperator::WeightedSet, hit_ratios, child_counts, {true, false}); run_benchmarks(setup); } @@ -387,9 +412,15 @@ TEST(IteratorBenchmark, dot_product_benchmark) run_benchmarks(setup); } +TEST(IteratorBenchmark, and_benchmark) +{ + BenchmarkSetup setup(num_docs, int32_array_fs, QueryOperator::And, hit_ratios, {1, 2, 4, 8}, {true, false}); + run_benchmarks(setup); +} + TEST(IteratorBenchmark, or_benchmark) { - BenchmarkSetup setup(num_docs, int32_fs, QueryOperator::Or, hit_ratios, child_counts, {true, false}); + BenchmarkSetup setup(num_docs, int32_array_fs, QueryOperator::Or, hit_ratios, child_counts, {true, false}); run_benchmarks(setup); } -- cgit v1.2.3