diff options
author | Tor Egge <Tor.Egge@online.no> | 2022-11-14 12:37:47 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2022-11-14 12:37:47 +0100 |
commit | 3cd84044662d3770bb580e89e39e7c699b19823a (patch) | |
tree | 2234a662e74f1d99e7fc8df1839c3caf28a2b255 | |
parent | 64aa97631e1ac9c2b968efcfeb84be89a15666fc (diff) |
Improve hit estimate for ImportedSearchContext.
3 files changed, 114 insertions, 4 deletions
diff --git a/searchlib/src/tests/attribute/imported_search_context/imported_search_context_test.cpp b/searchlib/src/tests/attribute/imported_search_context/imported_search_context_test.cpp index 6cb0dec3c1f..847a992d241 100644 --- a/searchlib/src/tests/attribute/imported_search_context/imported_search_context_test.cpp +++ b/searchlib/src/tests/attribute/imported_search_context/imported_search_context_test.cpp @@ -6,6 +6,7 @@ #include <vespa/searchlib/query/query_term_ucs4.h> #include <vespa/searchlib/queryeval/simpleresult.h> #include <vespa/searchlib/test/imported_attribute_fixture.h> +#include <vespa/searchlib/test/mock_gid_to_lid_mapping.h> #include <vespa/vespalib/test/insertion_operators.h> #include <vespa/searchlib/queryeval/executeinfo.h> @@ -75,13 +76,57 @@ bool is_strict_hit_with_weight(Iterator& iter, TermFieldMatchData& match, EXPECT_EQUAL(weight, match.getWeight())); } -TEST_F("approximateHits() returns document count of reference attribute", Fixture) { +TEST_F("approximateHits() returns document count of reference attribute when not using fast-search target attribute", Fixture) { + add_n_docs_with_undefined_values(*f.target_attr, 10); add_n_docs_with_undefined_values(*f.reference_attr, 101); auto ctx = f.create_context(word_term("foo")); EXPECT_EQUAL(101u, ctx->approximateHits()); } +TEST_F("approximateHits() estimates hits when using fast-search target attribute", Fixture(false, FastSearchConfig::ExplicitlyEnabled)) +{ + constexpr uint32_t target_docs = 1000; + constexpr uint32_t docs = 10000; + constexpr uint32_t target_gids = 200; + f.target_attr->addReservedDoc(); + add_n_docs_with_undefined_values(*f.target_attr, target_docs); + f.reference_attr->addReservedDoc(); + add_n_docs_with_undefined_values(*f.reference_attr, docs); + auto target_attr = dynamic_cast<IntegerAttribute *>(f.target_attr.get()); + // 2 documents with value 20, 110 documents with value 30. + for (uint32_t i = 1; i < 3; ++i) { + target_attr->update(i, 20); + } + for (uint32_t i = 10; i < 120; ++i) { + target_attr->update(i, 30); + } + f.target_attr->commit(); + // Assign target gids + for (uint32_t i = 1; i <= target_gids; ++i) { + auto target_gid = dummy_gid(i); + f.mapper_factory->_map[target_gid] = i; + f.reference_attr->notifyReferencedPut(target_gid, i); + } + // Add 2 references to each target gid + for (uint32_t i = 1; i <= target_gids * 2; ++i) { + f.reference_attr->update(i, dummy_gid(((i - 1) % target_gids) + 1)); + } + f.reference_attr->commit(); + auto ctx = f.create_context(word_term("10")); + // Exact count: 0 target hits => 0 + EXPECT_EQUAL(0u, ctx->approximateHits()); + TermFieldMatchData match; + auto iter = f.create_iterator(*ctx, match, false); + EXPECT_TRUE(iter->matches_any() == Trinary::False); + ctx = f.create_context(word_term("20")); + // Exact count: 2 target hits, 2 docs / target doc => 2 * 2 = 4 + EXPECT_EQUAL(4u, ctx->approximateHits()); + ctx = f.create_context(word_term("30")); + // Approximation: 110 target hits => 110 * 10001 / 1001 = 1099 + EXPECT_EQUAL(1099u, ctx->approximateHits()); +} + TEST_F("attributeName() returns imported attribute name", Fixture) { auto ctx = f.create_context(word_term("foo")); EXPECT_EQUAL(f.default_imported_attr_name(), ctx->attributeName()); diff --git a/searchlib/src/vespa/searchlib/attribute/imported_search_context.cpp b/searchlib/src/vespa/searchlib/attribute/imported_search_context.cpp index 9c4f8395fec..1e8adc3922e 100644 --- a/searchlib/src/vespa/searchlib/attribute/imported_search_context.cpp +++ b/searchlib/src/vespa/searchlib/attribute/imported_search_context.cpp @@ -44,18 +44,78 @@ ImportedSearchContext::ImportedSearchContext( _target_search_context(_target_attribute.createSearchContext(std::move(term), params)), _targetLids(_reference_attribute.getTargetLids()), _merger(_reference_attribute.getCommittedDocIdLimit()), - _params(params) + _params(params), + _zero_hits(false) { } ImportedSearchContext::~ImportedSearchContext() = default; -unsigned int ImportedSearchContext::approximateHits() const { - return _reference_attribute.getNumDocs(); +uint32_t +ImportedSearchContext::calc_approx_hits(uint32_t target_approx_hits) const +{ + uint32_t docid_limit = _targetLids.size(); + uint32_t target_docid_limit = _target_attribute.getCommittedDocIdLimit(); + double approx_hits_multiplier = static_cast<double>(docid_limit) / target_docid_limit; + if (approx_hits_multiplier < 1.0) { + approx_hits_multiplier = 1.0; + } + uint64_t approx_hits = target_approx_hits * approx_hits_multiplier; + if (approx_hits > docid_limit) { + approx_hits = docid_limit; + } + return approx_hits; +} + +uint32_t +ImportedSearchContext::calc_exact_hits() const +{ + uint32_t docid_limit = _targetLids.size(); + uint32_t target_docid_limit = _target_attribute.getCommittedDocIdLimit(); + auto reverse_mapping_refs = _reference_attribute.getReverseMappingRefs(); + auto& reverse_mapping = _reference_attribute.getReverseMapping(); + if (target_docid_limit > reverse_mapping_refs.size()) { + target_docid_limit = reverse_mapping_refs.size(); + } + fef::TermFieldMatchData matchData; + auto it = _target_search_context->createIterator(&matchData, true); + uint64_t sum_hits = 0; + it->initRange(1, target_docid_limit); + for (uint32_t lid = it->seekFirst(1); !it->isAtEnd(); lid = it->seekNext(lid + 1)) { + EntryRef ref = reverse_mapping_refs[lid].load_acquire(); + if (__builtin_expect(ref.valid(), true)) { + sum_hits += reverse_mapping.frozenSize(ref); + } + } + if (sum_hits > docid_limit) { + sum_hits = docid_limit; + } + return sum_hits; +} + +unsigned int +ImportedSearchContext::approximateHits() const +{ + uint32_t target_approx_hits = _target_search_context->approximateHits(); + if (target_approx_hits == 0) { + _zero_hits.store(true, std::memory_order_relaxed); + return 0; + } + if (!_target_attribute.getIsFastSearch()) { + return _reference_attribute.getNumDocs(); + } + if (target_approx_hits >= MIN_TARGET_HITS_FOR_APPROXIMATION) { + return calc_approx_hits(target_approx_hits); + } else { + return calc_exact_hits(); + } } std::unique_ptr<queryeval::SearchIterator> ImportedSearchContext::createIterator(fef::TermFieldMatchData* matchData, bool strict) { + if (_zero_hits.load(std::memory_order_relaxed)) { + return std::make_unique<EmptySearch>(); + } if (_searchCacheLookup) { return BitVectorIterator::create(_searchCacheLookup->bitVector.get(), _searchCacheLookup->docIdLimit, *matchData, strict); } diff --git a/searchlib/src/vespa/searchlib/attribute/imported_search_context.h b/searchlib/src/vespa/searchlib/attribute/imported_search_context.h index c77799ef7a5..0f55bd85a39 100644 --- a/searchlib/src/vespa/searchlib/attribute/imported_search_context.h +++ b/searchlib/src/vespa/searchlib/attribute/imported_search_context.h @@ -41,6 +41,9 @@ class ImportedSearchContext : public ISearchContext { TargetLids _targetLids; PostingListMerger<int32_t> _merger; SearchContextParams _params; + mutable std::atomic<bool> _zero_hits; + + static constexpr uint32_t MIN_TARGET_HITS_FOR_APPROXIMATION = 100; uint32_t getTargetLid(uint32_t lid) const { // Check range to avoid reading memory beyond end of mapping array @@ -49,6 +52,8 @@ class ImportedSearchContext : public ISearchContext { void makeMergedPostings(bool isFilter); void considerAddSearchCacheEntry(); + uint32_t calc_approx_hits(uint32_t target_approx_hits) const; + uint32_t calc_exact_hits() const; public: ImportedSearchContext(std::unique_ptr<QueryTermSimple> term, const SearchContextParams& params, |