diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2024-02-12 11:10:08 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2024-02-13 11:58:17 +0000 |
commit | 66744e0891dcc1681fe35b010503850977396ca5 (patch) | |
tree | 27ec303749b0dc5670db9ee69d47466e2097af89 | |
parent | 45b56e769811a58807a6c59e534473cd5f4be180 (diff) |
- Add all hits to the hit collector.
- Maintain a heap on the side, and keep heap property when producing results and features.
- Drop teh pointer to the document once it drops off the heap.
4 files changed, 115 insertions, 95 deletions
diff --git a/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp b/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp index 5fc8fae181a..56c8b93052b 100644 --- a/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp +++ b/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp @@ -139,7 +139,9 @@ TEST_F(HitCollectorTest, simple) TEST_F(HitCollectorTest, gaps_in_docid) { - HitCollector hc(5, false); + SearchResult sr; + sr.setWantedHitCount(5); + HitCollector hc(sr.getWantedHitCount(), false); // add hits to hit collector for (uint32_t i = 0; i < 5; ++i) { @@ -147,7 +149,6 @@ TEST_F(HitCollectorTest, gaps_in_docid) } // merge from heap into search result - SearchResult sr; hc.fillSearchResult(sr); ASSERT_TRUE(sr.getHitCount() == 5); @@ -161,12 +162,13 @@ TEST_F(HitCollectorTest, gaps_in_docid) TEST_F(HitCollectorTest, heap_property) { { - HitCollector hc(3, false); + SearchResult sr; + sr.setWantedHitCount(3); + HitCollector hc(sr.getWantedHitCount(), false); // add hits (low to high) for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, i + 10); } - SearchResult sr; hc.fillSearchResult(sr); ASSERT_TRUE(sr.getHitCount() == 3); assertHit(13, 3, 0, sr); @@ -174,12 +176,13 @@ TEST_F(HitCollectorTest, heap_property) assertHit(15, 5, 2, sr); } { - HitCollector hc(3, false); + SearchResult sr; + sr.setWantedHitCount(3); + HitCollector hc(sr.getWantedHitCount(), false); // add hits (high to low) for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, 10 - i); } - SearchResult sr; hc.fillSearchResult(sr); ASSERT_TRUE(sr.getHitCount() == 3); assertHit(10, 0, 0, sr); @@ -187,12 +190,13 @@ TEST_F(HitCollectorTest, heap_property) assertHit(8, 2, 2, sr); } { - HitCollector hc(3, false); + SearchResult sr; + sr.setWantedHitCount(3); + HitCollector hc(sr.getWantedHitCount(), false); // add hits (same rank score) for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, 10); } - SearchResult sr; hc.fillSearchResult(sr); ASSERT_TRUE(sr.getHitCount() == 3); assertHit(10, 0, 0, sr); @@ -211,12 +215,13 @@ TEST_F(HitCollectorTest, heap_property_with_sorting) sortData.push_back('e'); sortData.push_back('f'); { - HitCollector hc(3, true); + SearchResult sr; + sr.setWantedHitCount(3); + HitCollector hc(sr.getWantedHitCount(), true); // add hits ('a' is sorted/ranked better than 'b') for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, i + 10, &sortData[i], 1); } - SearchResult sr; hc.fillSearchResult(sr); ASSERT_TRUE(sr.getHitCount() == 3); assertHit(10, 0, 0, sr); @@ -224,12 +229,13 @@ TEST_F(HitCollectorTest, heap_property_with_sorting) assertHit(12, 2, 2, sr); } { - HitCollector hc(3, true); + SearchResult sr; + sr.setWantedHitCount(3); + HitCollector hc(sr.getWantedHitCount(), true); // add hits ('a' is sorted/ranked better than 'b') for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, i + 10, &sortData[5 - i], 1); } - SearchResult sr; hc.fillSearchResult(sr); ASSERT_TRUE(sr.getHitCount() == 3); assertHit(13, 3, 0, sr); @@ -237,12 +243,13 @@ TEST_F(HitCollectorTest, heap_property_with_sorting) assertHit(15, 5, 2, sr); } { - HitCollector hc(3, true); + SearchResult sr; + sr.setWantedHitCount(3); + HitCollector hc(sr.getWantedHitCount(), true); // add hits (same sort blob) for (uint32_t i = 0; i < 6; ++i) { addHit(hc, i, 10, &sortData[0], 1); } - SearchResult sr; hc.fillSearchResult(sr); ASSERT_TRUE(sr.getHitCount() == 3); assertHit(10, 0, 0, sr); diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp index e1ee72b0152..305e76477ca 100644 --- a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp @@ -18,12 +18,12 @@ using FefUtils = search::fef::Utils; namespace streaming { HitCollector::Hit::Hit(const vsm::StorageDocument * doc, uint32_t docId, const MatchData & matchData, - double score, const void * sortData, size_t sortDataLen) : - _docid(docId), - _score(score), - _document(doc), - _matchData(), - _sortBlob(sortData, sortDataLen) + double score, const void * sortData, size_t sortDataLen) + : _docid(docId), + _score(score), + _document(doc), + _matchData(), + _sortBlob(sortData, sortDataLen) { _matchData.reserve(matchData.getNumTermFields()); for (search::fef::TermFieldHandle handle = 0; handle < matchData.getNumTermFields(); ++handle) { @@ -35,12 +35,15 @@ HitCollector::Hit::~Hit() = default; HitCollector::HitCollector(size_t wantedHits, bool use_sort_blob) : _hits(), - _use_sort_blob(use_sort_blob), - _sortedByDocId(true) + _heap(), + _use_sort_blob(use_sort_blob) { - _hits.reserve(wantedHits); + _hits.reserve(16); + _heap.reserve(wantedHits); } +HitCollector::~HitCollector() = default; + const vsm::Document & HitCollector::getDocSum(const search::DocumentIdT & docId) const { @@ -65,85 +68,87 @@ HitCollector::addHit(const vsm::StorageDocument * doc, uint32_t docId, const Mat return addHit(Hit(doc, docId, data, score, sortData, sortDataLen)); } -void -HitCollector::sortByDocId() -{ - if (!_sortedByDocId) { - std::sort(_hits.begin(), _hits.end()); // sort on docId - _sortedByDocId = true; - } -} - bool -HitCollector::addHitToHeap(const Hit & hit) const +HitCollector::addHitToHeap(uint32_t index) const { + if (_heap.capacity() == 0) return false; // return true if the given hit is better than the current worst one. - return (_use_sort_blob) - ? (hit.cmpSort(_hits[0]) < 0) - : (hit.cmpRank(_hits[0]) < 0); + const Hit & hit = _hits[index]; + return _use_sort_blob + ? (hit.cmpSort(_hits[_heap[0]]) < 0) + : (hit.cmpRank(_hits[_heap[0]]) < 0); } void HitCollector::make_heap() { if (_use_sort_blob) { - std::make_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + std::make_heap(_heap.begin(), _heap.end(), SortComparator(_hits)); } else { - std::make_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + std::make_heap(_heap.begin(), _heap.end(), RankComparator(_hits)); } } void HitCollector::pop_heap() { if (_use_sort_blob) { - std::pop_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + std::pop_heap(_heap.begin(), _heap.end(), SortComparator(_hits)); } else { - std::pop_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + std::pop_heap(_heap.begin(), _heap.end(), RankComparator(_hits)); } } void HitCollector::push_heap() { if (_use_sort_blob) { - std::push_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + std::push_heap(_heap.begin(), _heap.end(), SortComparator(_hits)); } else { - std::push_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + std::push_heap(_heap.begin(), _heap.end(), RankComparator(_hits)); } } bool HitCollector::addHit(Hit && hit) { - bool amongTheBest(false); - size_t avail = (_hits.capacity() - _hits.size()); + size_t avail = (_heap.capacity() - _heap.size()); assert(_use_sort_blob != hit.getSortBlob().empty() ); + assert(_hits.size() <= hit.getDocId()); + _hits.emplace_back(std::move(hit)); + uint32_t index = _hits.size() - 1; if (avail > 1) { // No heap yet. - _hits.emplace_back(std::move(hit)); - amongTheBest = true; - } else if (_hits.capacity() == 0) { - // this happens when wantedHitCount = 0 - // in this case we shall not put anything on the heap (which is empty) - } else if ( avail == 0 && addHitToHeap(hit)) { // already a heap + _heap.emplace_back(index); + } else if ((avail == 0) && addHitToHeap(index)) { // already a heap pop_heap(); - _hits.back() = std::move(hit); - amongTheBest = true; + uint32_t toDrop = _heap.back(); + _heap.back() = index; push_heap(); + // Signal that it is not among the best and should hence never be accessed. + // Drop early to catch logic flaws. + _hits[toDrop].dropDocument(); } else if (avail == 1) { // make a heap of the hit vector - _hits.emplace_back(std::move(hit)); - amongTheBest = true; + _heap.emplace_back(index); make_heap(); - _sortedByDocId = false; // the hit vector is no longer sorted by docId + } else { + // The document might be invalid if it did not make the cut, + // so clear the reference here to catch logic flaws early. + _hits.back().dropDocument(); + return false; } - return amongTheBest; + return true; +} + +std::vector<uint32_t> +HitCollector::bestLids() const { + std::vector<uint32_t> hitsOnHeap = _heap; + std::sort(hitsOnHeap.begin(), hitsOnHeap.end()); + return hitsOnHeap; } void -HitCollector::fillSearchResult(vdslib::SearchResult & searchResult, FeatureValues&& match_features) +HitCollector::fillSearchResult(vdslib::SearchResult & searchResult, FeatureValues&& match_features) const { - sortByDocId(); - size_t count = std::min(_hits.size(), searchResult.getWantedHitCount()); - for (size_t i(0); i < count; i++) { - const Hit & hit = _hits[i]; + for (uint32_t lid : bestLids()) { + const Hit & hit = _hits[lid]; vespalib::string documentId(hit.getDocument().docDoc().getId().toString()); search::DocumentIdT docId = hit.getDocId(); SearchResult::RankType rank = hit.getRankScore(); @@ -160,7 +165,7 @@ HitCollector::fillSearchResult(vdslib::SearchResult & searchResult, FeatureValue } void -HitCollector::fillSearchResult(vdslib::SearchResult & searchResult) +HitCollector::fillSearchResult(vdslib::SearchResult & searchResult) const { fillSearchResult(searchResult, FeatureValues()); } @@ -168,15 +173,15 @@ HitCollector::fillSearchResult(vdslib::SearchResult & searchResult) FeatureSet::SP HitCollector::getFeatureSet(IRankProgram &rankProgram, const FeatureResolver &resolver, - const search::StringStringMap &feature_rename_map) + const search::StringStringMap &feature_rename_map) const { - if (resolver.num_features() == 0 || _hits.empty()) { + if (resolver.num_features() == 0 || _heap.empty()) { return std::make_shared<FeatureSet>(); } - sortByDocId(); auto names = FefUtils::extract_feature_names(resolver, feature_rename_map); - FeatureSet::SP retval = std::make_shared<FeatureSet>(names, _hits.size()); - for (const Hit & hit : _hits) { + FeatureSet::SP retval = std::make_shared<FeatureSet>(names, _heap.size()); + for (uint32_t lid : bestLids()) { + const Hit & hit = _hits[lid]; uint32_t docId = hit.getDocId(); rankProgram.run(docId, hit.getMatchData()); auto * f = retval->getFeaturesByIndex(retval->addDocId(docId)); @@ -188,17 +193,17 @@ HitCollector::getFeatureSet(IRankProgram &rankProgram, FeatureValues HitCollector::get_match_features(IRankProgram& rank_program, const FeatureResolver& resolver, - const search::StringStringMap& feature_rename_map) + const search::StringStringMap& feature_rename_map) const { FeatureValues match_features; - if (resolver.num_features() == 0 || _hits.empty()) { + if (resolver.num_features() == 0 || _heap.empty()) { return match_features; } - sortByDocId(); match_features.names = FefUtils::extract_feature_names(resolver, feature_rename_map); - match_features.values.resize(resolver.num_features() * _hits.size()); + match_features.values.resize(resolver.num_features() * _heap.size()); auto f = match_features.values.data(); - for (const Hit & hit : _hits) { + for (uint32_t lid : bestLids()) { + const Hit & hit = _hits[lid]; auto docid = hit.getDocId(); rank_program.run(docid, hit.getMatchData()); FefUtils::extract_feature_values(resolver, docid, f); diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.h b/streamingvisitors/src/vespa/searchvisitor/hitcollector.h index 50a233bfcef..43ffbbe30c6 100644 --- a/streamingvisitors/src/vespa/searchvisitor/hitcollector.h +++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.h @@ -51,18 +51,8 @@ private: int diff = _sortBlob.compare(b._sortBlob.c_str(), b._sortBlob.size()); return (diff == 0) ? cmpDocId(b) : diff; } - class RankComparator { - public: - bool operator() (const Hit & lhs, const Hit & rhs) const noexcept { - return lhs.cmpRank(rhs) < 0; - } - }; - class SortComparator { - public: - bool operator() (const Hit & lhs, const Hit & rhs) const noexcept { - return lhs.cmpSort(rhs) < 0; - } - }; + + void dropDocument() noexcept { _document = nullptr; } private: uint32_t _docid; @@ -72,16 +62,35 @@ private: vespalib::string _sortBlob; }; using HitVector = std::vector<Hit>; + using Lids = std::vector<uint32_t>; HitVector _hits; + Lids _heap; bool _use_sort_blob; - bool _sortedByDocId; // flag for whether the hit vector is sorted on docId - void sortByDocId(); - bool addHitToHeap(const Hit & hit) const; + Lids bestLids() const; + bool addHitToHeap(uint32_t index) const; bool addHit(Hit && hit); void make_heap(); void pop_heap(); void push_heap(); + class RankComparator { + public: + explicit RankComparator(const HitVector & hits) noexcept : _hits(hits) {} + bool operator() (uint32_t lhs, uint32_t rhs) const noexcept { + return _hits[lhs].cmpRank(_hits[rhs]) < 0; + } + private: + const HitVector & _hits; + }; + class SortComparator { + public: + explicit SortComparator(const HitVector & hits) noexcept : _hits(hits) {} + bool operator() (uint32_t lhs, uint32_t rhs) const noexcept { + return _hits[lhs].cmpSort(_hits[rhs]) < 0; + } + private: + const HitVector & _hits; + }; public: using UP = std::unique_ptr<HitCollector>; @@ -92,8 +101,9 @@ public: }; HitCollector(size_t wantedHits, bool use_sort_blob); + ~HitCollector() override; - virtual const vsm::Document & getDocSum(const search::DocumentIdT & docId) const override; + const vsm::Document & getDocSum(const search::DocumentIdT & docId) const override; /** * Adds a hit to this hit collector. @@ -124,14 +134,12 @@ public: /** * Fills the given search result with the m best hits from the hit heap. - * Invoking this method will destroy the heap property of the hit heap. **/ - void fillSearchResult(vdslib::SearchResult & searchResult, vespalib::FeatureValues&& match_features); - void fillSearchResult(vdslib::SearchResult & searchResult); + void fillSearchResult(vdslib::SearchResult & searchResult, vespalib::FeatureValues&& match_features) const; + void fillSearchResult(vdslib::SearchResult & searchResult) const; /** * Extract features from the hits stored in the hit heap. - * Invoking this method will destroy the heap property of the hit heap. * Note that this method will calculate any additional features. * * @return features for all hits on the heap. @@ -140,11 +148,11 @@ public: **/ vespalib::FeatureSet::SP getFeatureSet(IRankProgram &rankProgram, const FeatureResolver &resolver, - const search::StringStringMap &feature_rename_map); + const search::StringStringMap &feature_rename_map) const; vespalib::FeatureValues get_match_features(IRankProgram& rank_program, const FeatureResolver& resolver, - const search::StringStringMap& feature_rename_map); + const search::StringStringMap& feature_rename_map) const; }; } // namespace streaming diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h index dfd48736e89..bb16655193f 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h @@ -499,7 +499,7 @@ class SearchVisitorFactory : public storage::VisitorFactory { std::shared_ptr<storage::VisitorEnvironment> makeVisitorEnvironment(storage::StorageComponent&) override; storage::Visitor* makeVisitor(storage::StorageComponent&, storage::VisitorEnvironment&env, - const vdslib::Parameters& params) override; + const vdslib::Parameters& params) override; public: explicit SearchVisitorFactory(const config::ConfigUri & configUri, FNET_Transport* transport, const vespalib::string& file_distributor_connection_spec); ~SearchVisitorFactory() override; |