diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-02-13 06:49:42 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-13 06:49:42 +0100 |
commit | 379ef849bc37c1b0509bb28d4c9162557bdb5ffd (patch) | |
tree | 5f0bb1c25d9ce73a8a225475408c5fcf1e77ccb7 | |
parent | 9e992553a1defb68c945bc169999742975f42fea (diff) | |
parent | 5512e92e417f13d031f0bf0c6fc18ddf76511bd9 (diff) |
Merge pull request #30250 from vespa-engine/revert-30249-revert-30240-balder/use-wanted-hitcount
Use explicit given wanted hit count.
9 files changed, 80 insertions, 59 deletions
diff --git a/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp b/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp index 131e39b1d5c..5fc8fae181a 100644 --- a/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp +++ b/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp @@ -118,7 +118,7 @@ HitCollectorTest::addHit(HitCollector &hc, uint32_t docId, double score, const c TEST_F(HitCollectorTest, simple) { - HitCollector hc(5); + HitCollector hc(5, false); // add hits to hit collector for (uint32_t i = 0; i < 5; ++i) { @@ -139,7 +139,7 @@ TEST_F(HitCollectorTest, simple) TEST_F(HitCollectorTest, gaps_in_docid) { - HitCollector hc(5); + HitCollector hc(5, false); // add hits to hit collector for (uint32_t i = 0; i < 5; ++i) { @@ -161,7 +161,7 @@ TEST_F(HitCollectorTest, gaps_in_docid) TEST_F(HitCollectorTest, heap_property) { { - HitCollector hc(3); + HitCollector hc(3, false); // add hits (low to high) for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, i + 10); @@ -174,7 +174,7 @@ TEST_F(HitCollectorTest, heap_property) assertHit(15, 5, 2, sr); } { - HitCollector hc(3); + HitCollector hc(3, false); // add hits (high to low) for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, 10 - i); @@ -187,7 +187,7 @@ TEST_F(HitCollectorTest, heap_property) assertHit(8, 2, 2, sr); } { - HitCollector hc(3); + HitCollector hc(3, false); // add hits (same rank score) for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, 10); @@ -211,7 +211,7 @@ TEST_F(HitCollectorTest, heap_property_with_sorting) sortData.push_back('e'); sortData.push_back('f'); { - HitCollector hc(3); + HitCollector hc(3, true); // add hits ('a' is sorted/ranked better than 'b') for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, i + 10, &sortData[i], 1); @@ -224,7 +224,7 @@ TEST_F(HitCollectorTest, heap_property_with_sorting) assertHit(12, 2, 2, sr); } { - HitCollector hc(3); + HitCollector hc(3, true); // add hits ('a' is sorted/ranked better than 'b') for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, i + 10, &sortData[5 - i], 1); @@ -237,7 +237,7 @@ TEST_F(HitCollectorTest, heap_property_with_sorting) assertHit(15, 5, 2, sr); } { - HitCollector hc(3); + HitCollector hc(3, true); // add hits (same sort blob) for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, 10, &sortData[0], 1); @@ -253,7 +253,7 @@ TEST_F(HitCollectorTest, heap_property_with_sorting) TEST_F(HitCollectorTest, empty) { - HitCollector hc(0); + HitCollector hc(0, false); addHit(hc, 0, 0); SearchResult rs; hc.fillSearchResult(rs); @@ -298,7 +298,7 @@ MyRankProgram::~MyRankProgram() = default; TEST_F(HitCollectorTest, feature_set) { - HitCollector hc(3); + HitCollector hc(3, false); addHit(hc, 0, 10); addHit(hc, 1, 50); // on heap @@ -361,7 +361,7 @@ TEST_F(HitCollectorTest, feature_set) TEST_F(HitCollectorTest, match_features) { - HitCollector hc(3); + HitCollector hc(3, false); addHit(hc, 0, 10); addHit(hc, 1, 50); // on heap diff --git a/streamingvisitors/src/tests/matching_elements_filler/matching_elements_filler_test.cpp b/streamingvisitors/src/tests/matching_elements_filler/matching_elements_filler_test.cpp index 09f50afeb77..4f57b5a0b92 100644 --- a/streamingvisitors/src/tests/matching_elements_filler/matching_elements_filler_test.cpp +++ b/streamingvisitors/src/tests/matching_elements_filler/matching_elements_filler_test.cpp @@ -299,7 +299,7 @@ MatchingElementsFillerTest::MatchingElementsFillerTest() _env(), _field_searcher_map(make_field_searcher_map()), _index_to_field_ids(make_index_to_field_ids()), - _hit_collector(10), + _hit_collector(10, false), _search_result(), _query(), _matching_elements_filler(), diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp index 96bd3c733ec..e1ee72b0152 100644 --- a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp @@ -33,8 +33,9 @@ HitCollector::Hit::Hit(const vsm::StorageDocument * doc, uint32_t docId, const HitCollector::Hit::~Hit() = default; -HitCollector::HitCollector(size_t wantedHits) : +HitCollector::HitCollector(size_t wantedHits, bool use_sort_blob) : _hits(), + _use_sort_blob(use_sort_blob), _sortedByDocId(true) { _hits.reserve(wantedHits); @@ -77,17 +78,44 @@ bool HitCollector::addHitToHeap(const Hit & hit) const { // return true if the given hit is better than the current worst one. - return (hit.getSortBlob().empty()) - ? (hit.cmpRank(_hits[0]) < 0) - : (hit.cmpSort(_hits[0]) < 0); + return (_use_sort_blob) + ? (hit.cmpSort(_hits[0]) < 0) + : (hit.cmpRank(_hits[0]) < 0); +} + +void +HitCollector::make_heap() { + if (_use_sort_blob) { + std::make_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + } else { + std::make_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + } +} + +void +HitCollector::pop_heap() { + if (_use_sort_blob) { + std::pop_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + } else { + std::pop_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + } +} + +void +HitCollector::push_heap() { + if (_use_sort_blob) { + std::push_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + } else { + std::push_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + } } bool HitCollector::addHit(Hit && hit) { bool amongTheBest(false); - ssize_t avail = (_hits.capacity() - _hits.size()); - bool useSortBlob( ! hit.getSortBlob().empty() ); + size_t avail = (_hits.capacity() - _hits.size()); + assert(_use_sort_blob != hit.getSortBlob().empty() ); if (avail > 1) { // No heap yet. _hits.emplace_back(std::move(hit)); @@ -96,28 +124,14 @@ HitCollector::addHit(Hit && hit) // this happens when wantedHitCount = 0 // in this case we shall not put anything on the heap (which is empty) } else if ( avail == 0 && addHitToHeap(hit)) { // already a heap - if (useSortBlob) { - std::pop_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); - } else { - std::pop_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); - } - + pop_heap(); _hits.back() = std::move(hit); amongTheBest = true; - - if (useSortBlob) { - std::push_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); - } else { - std::push_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); - } + push_heap(); } else if (avail == 1) { // make a heap of the hit vector _hits.emplace_back(std::move(hit)); amongTheBest = true; - if (useSortBlob) { - std::make_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); - } else { - std::make_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); - } + make_heap(); _sortedByDocId = false; // the hit vector is no longer sorted by docId } return amongTheBest; @@ -127,7 +141,9 @@ void HitCollector::fillSearchResult(vdslib::SearchResult & searchResult, FeatureValues&& match_features) { sortByDocId(); - for (const Hit & hit : _hits) { + size_t count = std::min(_hits.size(), searchResult.getWantedHitCount()); + for (size_t i(0); i < count; i++) { + const Hit & hit = _hits[i]; vespalib::string documentId(hit.getDocument().docDoc().getId().toString()); search::DocumentIdT docId = hit.getDocId(); SearchResult::RankType rank = hit.getRankScore(); @@ -161,8 +177,8 @@ HitCollector::getFeatureSet(IRankProgram &rankProgram, auto names = FefUtils::extract_feature_names(resolver, feature_rename_map); FeatureSet::SP retval = std::make_shared<FeatureSet>(names, _hits.size()); for (const Hit & hit : _hits) { - rankProgram.run(hit.getDocId(), hit.getMatchData()); uint32_t docId = hit.getDocId(); + rankProgram.run(docId, hit.getMatchData()); auto * f = retval->getFeaturesByIndex(retval->addDocId(docId)); FefUtils::extract_feature_values(resolver, docId, f); } diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.h b/streamingvisitors/src/vespa/searchvisitor/hitcollector.h index 4c3a49fa5cc..50a233bfcef 100644 --- a/streamingvisitors/src/vespa/searchvisitor/hitcollector.h +++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.h @@ -73,11 +73,15 @@ private: }; using HitVector = std::vector<Hit>; HitVector _hits; + bool _use_sort_blob; bool _sortedByDocId; // flag for whether the hit vector is sorted on docId void sortByDocId(); bool addHitToHeap(const Hit & hit) const; bool addHit(Hit && hit); + void make_heap(); + void pop_heap(); + void push_heap(); public: using UP = std::unique_ptr<HitCollector>; @@ -87,7 +91,7 @@ public: virtual void run(uint32_t docid, const std::vector<TermFieldMatchData> &matchData) = 0; }; - explicit HitCollector(size_t wantedHits); + HitCollector(size_t wantedHits, bool use_sort_blob); virtual const vsm::Document & getDocSum(const search::DocumentIdT & docId) const override; diff --git a/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.cpp b/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.cpp index 79bacda3f3b..35a3f1966b6 100644 --- a/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/matching_elements_filler.cpp @@ -178,12 +178,13 @@ MatchingElementsFiller::fill_matching_elements(const MatchingElementsFields& fie return result; } // Scan documents that will be returned as hits - for (size_t i(0), m(_search_result.getHitCount()); (i < m) && (i < _search_result.getWantedHitCount()); i++ ) { + size_t count = std::min(_search_result.getHitCount(), _search_result.getWantedHitCount()); + for (size_t i(0); i < count; i++ ) { const char* doc_id(nullptr); SearchResult::RankType rank(0); uint32_t lid = _search_result.getHit(i, doc_id, rank); const vsm::Document& vsm_doc = _hit_collector.getDocSum(lid); - const StorageDocument& doc = dynamic_cast<const StorageDocument&>(vsm_doc); + const auto& doc = dynamic_cast<const StorageDocument&>(vsm_doc); matcher.find_matching_elements(doc, lid, *result); _query.reset(); } diff --git a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp index 7db1542ca53..167d5ecde4c 100644 --- a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp @@ -133,9 +133,9 @@ RankProcessor::initQueryEnvironment() } void -RankProcessor::initHitCollector(size_t wantedHitCount) +RankProcessor::initHitCollector(size_t wantedHitCount, bool use_sort_blob) { - _hitCollector = std::make_unique<HitCollector>(wantedHitCount); + _hitCollector = std::make_unique<HitCollector>(wantedHitCount, use_sort_blob); } void @@ -145,7 +145,7 @@ RankProcessor::setupRankProgram(RankProgram &program) } void -RankProcessor::init(bool forRanking, size_t wantedHitCount) +RankProcessor::init(bool forRanking, size_t wantedHitCount, bool use_sort_blob) { initQueryEnvironment(); if (forRanking) { @@ -167,7 +167,7 @@ RankProcessor::init(bool forRanking, size_t wantedHitCount) _rankProgram = _rankSetup.create_dump_program(); setupRankProgram(*_rankProgram); } - initHitCollector(wantedHitCount); + initHitCollector(wantedHitCount, use_sort_blob); } RankProcessor::RankProcessor(std::shared_ptr<const RankManager::Snapshot> snapshot, @@ -197,15 +197,15 @@ RankProcessor::RankProcessor(std::shared_ptr<const RankManager::Snapshot> snapsh } void -RankProcessor::initForRanking(size_t wantedHitCount) +RankProcessor::initForRanking(size_t wantedHitCount, bool use_sort_blob) { - return init(true, wantedHitCount); + return init(true, wantedHitCount, use_sort_blob); } void -RankProcessor::initForDumping(size_t wantedHitCount) +RankProcessor::initForDumping(size_t wantedHitCount, bool use_sort_blob) { - return init(false, wantedHitCount); + return init(false, wantedHitCount, use_sort_blob); } void diff --git a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h index bec70beca77..b9ed07f1170 100644 --- a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h +++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h @@ -48,7 +48,7 @@ private: void resolve_fields_from_children(QueryTermData& qtd, search::streaming::MultiTerm& mt); void resolve_fields_from_term(QueryTermData& qtd, search::streaming::QueryTerm& term); void initQueryEnvironment(); - void initHitCollector(size_t wantedHitCount); + void initHitCollector(size_t wantedHitCount, bool use_sort_blob); void setupRankProgram(search::fef::RankProgram &program); FeatureValues calculate_match_features(); @@ -58,7 +58,7 @@ private: * @param wantedHitCount the number of hits we want to return from the hit collector. * @return whether the rank processor was initialized or not. **/ - void init(bool forRanking, size_t wantedHitCount); + void init(bool forRanking, size_t wantedHitCount, bool use_sort_blob); public: using UP = std::unique_ptr<RankProcessor>; @@ -71,8 +71,8 @@ public: const search::fef::Properties & featureOverrides, const search::IAttributeManager * attrMgr); - void initForRanking(size_t wantedHitCount); - void initForDumping(size_t wantedHitCount); + void initForRanking(size_t wantedHitCount, bool use_sort_blob); + void initForDumping(size_t wantedHitCount, bool use_sort_blob); void unpackMatchData(uint32_t docId); static void unpack_match_data(uint32_t docid, search::fef::MatchData& matchData, QueryWrapper& query); void runRankProgram(uint32_t docId); diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp index 9755f20588f..f8261e499dc 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp @@ -500,7 +500,7 @@ SearchVisitor::init(const Parameters & params) setupAttributeVectorsForSorting(_sortSpec); _rankController.setRankManagerSnapshot(_env->get_rank_manager_snapshot()); - _rankController.setupRankProcessors(_query, location, wantedSummaryCount, _attrMan, _attributeFields); + _rankController.setupRankProcessors(_query, location, wantedSummaryCount, ! _sortSpec.empty(), _attrMan, _attributeFields); // This depends on _fieldPathMap (from setupScratchDocument), // and IQueryEnvironment (from setupRankProcessors). @@ -668,8 +668,8 @@ SearchVisitor::RankController::RankController() : _queryProperties(), _featureOverrides(), _hasRanking(false), - _rankProcessor(), _dumpFeatures(false), + _rankProcessor(), _dumpProcessor() { } @@ -679,20 +679,20 @@ SearchVisitor::RankController::~RankController() = default; void SearchVisitor::RankController::setupRankProcessors(Query & query, const vespalib::string & location, - size_t wantedHitCount, + size_t wantedHitCount, bool use_sort_blob, const search::IAttributeManager & attrMan, std::vector<AttrInfo> & attributeFields) { _rankSetup = &_rankManagerSnapshot->getRankSetup(_rankProfile); _rankProcessor = std::make_unique<RankProcessor>(_rankManagerSnapshot, _rankProfile, query, location, _queryProperties, _featureOverrides, &attrMan); - _rankProcessor->initForRanking(wantedHitCount); + _rankProcessor->initForRanking(wantedHitCount, use_sort_blob); // register attribute vectors needed for ranking processAccessedAttributes(_rankProcessor->get_real_query_env(), true, attrMan, attributeFields); if (_dumpFeatures) { _dumpProcessor = std::make_unique<RankProcessor>(_rankManagerSnapshot, _rankProfile, query, location, _queryProperties, _featureOverrides, &attrMan); LOG(debug, "Initialize dump processor"); - _dumpProcessor->initForDumping(wantedHitCount); + _dumpProcessor->initForDumping(wantedHitCount, use_sort_blob); // register attribute vectors needed for dumping processAccessedAttributes(_dumpProcessor->get_real_query_env(), false, attrMan, attributeFields); } diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h index 21f89da3d18..dfd48736e89 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h @@ -131,8 +131,8 @@ private: search::fef::Properties _queryProperties; search::fef::Properties _featureOverrides; bool _hasRanking; - RankProcessor::UP _rankProcessor; bool _dumpFeatures; + RankProcessor::UP _rankProcessor; RankProcessor::UP _dumpProcessor; /** @@ -168,7 +168,7 @@ private: **/ void setupRankProcessors(search::streaming::Query & query, const vespalib::string & location, - size_t wantedHitCount, + size_t wantedHitCount, bool use_sort_blob, const search::IAttributeManager & attrMan, std::vector<AttrInfo> & attributeFields); /** |