summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHåvard Pettersen <havardpe@oath.com>2021-10-01 10:30:09 +0000
committerHåvard Pettersen <havardpe@oath.com>2021-10-01 10:51:39 +0000
commit0fad251ccd7241f9e4b905935398891ef1800d72 (patch)
treea3fc5d9cf43008ba29923df089adb66d93030eda
parent284a23842b642917a4f380f36a9416af1e9f8ec7 (diff)
use saturated sum as hit estimate for OR
The upper bound for the hit estimate is the docid limit. The lower bound for the docid limit is the maximum child estimate. In the future we probably want a solution where everyone knows the docid limit from the start, which might also require us to have a separate way of estimating the document frequency if we want to avoid ranking score regression.
-rw-r--r--searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp16
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/blueprint.cpp27
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/blueprint.h8
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp7
4 files changed, 48 insertions, 10 deletions
diff --git a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp
index eb6e49747a1..802aee8dee0 100644
--- a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp
+++ b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp
@@ -1453,4 +1453,20 @@ TEST("require that intermediate cost tier is minimum cost tier of children") {
EXPECT_EQUAL(bp2->getState().cost_tier(), 2u);
}
+void verify_or_est(const std::vector<Blueprint::HitEstimate> &child_estimates, Blueprint::HitEstimate expect) {
+ OrBlueprint my_or;
+ my_or.setDocIdLimit(32);
+ auto my_est = my_or.combine(child_estimates);
+ EXPECT_EQUAL(my_est.empty, expect.empty);
+ EXPECT_EQUAL(my_est.estHits, expect.estHits);
+}
+
+TEST("require that OR blueprint use saturated sum as estimate") {
+ TEST_DO(verify_or_est({{0, true},{0, true},{0, true}}, {0, true}));
+ TEST_DO(verify_or_est({{0, true},{0, false},{0, true}}, {0, false}));
+ TEST_DO(verify_or_est({{4, false},{6, false},{5, false}}, {15, false}));
+ TEST_DO(verify_or_est({{5, false},{20, false},{10, false}}, {32, false}));
+ TEST_DO(verify_or_est({{100, false},{300, false},{200, false}}, {300, false}));
+}
+
TEST_MAIN() { TEST_DEBUG("lhs.out", "rhs.out"); TEST_RUN_ALL(); }
diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp
index 897d36150d9..c68bccc451b 100644
--- a/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp
@@ -27,12 +27,11 @@ namespace search::queryeval {
void maybe_eliminate_self(Blueprint* &self, Blueprint::UP replacement) {
// replace with replacement
if (replacement) {
- Blueprint *tmp = replacement.release();
- tmp->setParent(self->getParent());
- tmp->setSourceId(self->getSourceId());
- self->setParent(0);
- replacement.reset(self);
- self = tmp;
+ Blueprint::UP discard(self);
+ self = replacement.release();
+ self->setParent(discard->getParent());
+ self->setSourceId(discard->getSourceId());
+ discard->setParent(nullptr);
}
// replace with empty blueprint if empty
if (self->getState().estimate().empty) {
@@ -40,6 +39,8 @@ void maybe_eliminate_self(Blueprint* &self, Blueprint::UP replacement) {
self = new EmptyBlueprint(discard->getState().fields());
self->setParent(discard->getParent());
self->setSourceId(discard->getSourceId());
+ self->setDocIdLimit(discard->get_docid_limit());
+ discard->setParent(nullptr);
}
}
@@ -69,6 +70,20 @@ Blueprint::min(const std::vector<HitEstimate> &data)
return est;
}
+Blueprint::HitEstimate
+Blueprint::sat_sum(const std::vector<HitEstimate> &data, uint32_t docid_limit)
+{
+ uint64_t sum = 0;
+ bool empty = true;
+ uint32_t limit = docid_limit;
+ for (const auto &est: data) {
+ sum += est.estHits;
+ empty = (empty && est.empty);
+ limit = std::max(limit, est.estHits);
+ }
+ return { uint32_t(std::min(sum, uint64_t(limit))), empty };
+}
+
Blueprint::State::State(const FieldSpecBaseList &fields_in)
: _fields(fields_in),
_estimate(),
diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.h b/searchlib/src/vespa/searchlib/queryeval/blueprint.h
index b75c903234f..38b02c4d927 100644
--- a/searchlib/src/vespa/searchlib/queryeval/blueprint.h
+++ b/searchlib/src/vespa/searchlib/queryeval/blueprint.h
@@ -101,7 +101,7 @@ public:
double hit_ratio(uint32_t docid_limit) const {
uint32_t total_hits = _estimate.estHits;
uint32_t total_docs = std::max(total_hits, docid_limit);
- return double(total_hits) / double(total_docs);
+ return (total_docs == 0) ? 0.0 : double(total_hits) / double(total_docs);
}
void tree_size(uint32_t value) { _tree_size = value; }
uint32_t tree_size() const { return _tree_size; }
@@ -119,6 +119,12 @@ public:
// utility that just takes minium estimate
static HitEstimate min(const std::vector<HitEstimate> &data);
+ // utility that calculates saturated sum
+ //
+ // upper limit for estimate: docid_limit
+ // lower limit for docid_limit: max child estimate
+ static HitEstimate sat_sum(const std::vector<HitEstimate> &data, uint32_t docid_limit);
+
// utility to get the greater estimate to sort first, higher tiers last
struct TieredGreaterEstimate {
bool operator () (Blueprint * const &a, Blueprint * const &b) const {
diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp
index 561c532430b..eb16025f175 100644
--- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp
@@ -19,7 +19,7 @@ namespace search::queryeval {
namespace {
template <typename CombineType>
-size_t lookup_create_source(std::vector<std::unique_ptr<CombineType> > &sources, uint32_t child_source) {
+size_t lookup_create_source(std::vector<std::unique_ptr<CombineType> > &sources, uint32_t child_source, uint32_t docid_limit) {
for (size_t i = 0; i < sources.size(); ++i) {
if (sources[i]->getSourceId() == child_source) {
return i;
@@ -27,6 +27,7 @@ size_t lookup_create_source(std::vector<std::unique_ptr<CombineType> > &sources,
}
sources.push_back(std::unique_ptr<CombineType>(new CombineType()));
sources.back()->setSourceId(child_source);
+ sources.back()->setDocIdLimit(docid_limit);
return (sources.size() - 1);
}
@@ -53,7 +54,7 @@ void optimize_source_blenders(IntermediateBlueprint &self, size_t begin_idx) {
auto *blender = static_cast<SourceBlenderBlueprint *>(blender_up.get());
while (blender->childCnt() > 0) {
Blueprint::UP child_up = blender->removeChild(blender->childCnt() - 1);
- size_t source_idx = lookup_create_source(sources, child_up->getSourceId());
+ size_t source_idx = lookup_create_source(sources, child_up->getSourceId(), self.get_docid_limit());
sources[source_idx]->addChild(std::move(child_up));
}
}
@@ -298,7 +299,7 @@ AndBlueprint::computeNextHitRate(const Blueprint & child, double hitRate) const
Blueprint::HitEstimate
OrBlueprint::combine(const std::vector<HitEstimate> &data) const
{
- return max(data);
+ return sat_sum(data, get_docid_limit());
}
FieldSpecBaseList