diff options
author | Tor Egge <Tor.Egge@online.no> | 2024-05-29 13:44:55 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2024-05-29 13:44:55 +0200 |
commit | d495abf7a078c1109b9be7bae12fe2b225fa9327 (patch) | |
tree | 4fdb16827d602601eb000076ef19b6d253056b2d /searchlib | |
parent | 8276854275b3daaf79d6774062bce6279981c4ba (diff) |
Factor out FirstPhaseRescorer from HitCollector.
Diffstat (limited to 'searchlib')
5 files changed, 113 insertions, 66 deletions
diff --git a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt index 51fe2d12637..126ecd56dc2 100644 --- a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt @@ -21,6 +21,7 @@ vespa_add_library(searchlib_queryeval OBJECT fake_searchable.cpp field_spec.cpp filter_wrapper.cpp + first_phase_rescorer.cpp flow.cpp full_search.cpp get_weight_from_node.cpp diff --git a/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.cpp b/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.cpp new file mode 100644 index 00000000000..a7b1e3a7c92 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.cpp @@ -0,0 +1,38 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "first_phase_rescorer.h" + +namespace search::queryeval { + +FirstPhaseRescorer::FirstPhaseRescorer(const std::pair<Scores,Scores>& ranges) + : _scale(1.0), + _adjust(0.0) +{ + if (need_rescore(ranges)) { + auto& first_phase_scores = ranges.first; + auto& second_phase_scores = ranges.second; + // scale and adjust the first phase score according to the + // first phase and second phase heap score values to avoid that + // a score from the first phase is larger than second_phase_scores.low + double first_phase_range = first_phase_scores.high - first_phase_scores.low; + if (first_phase_range < 1.0) { + first_phase_range = 1.0; + } + double second_phase_range = second_phase_scores.high - second_phase_scores.low; + if (second_phase_range < 1.0) { + second_phase_range = 1.0; + } + _scale = second_phase_range / first_phase_range; + _adjust = first_phase_scores.low * _scale - second_phase_scores.low; + } +} + +bool +FirstPhaseRescorer::need_rescore(const std::pair<Scores,Scores>& ranges) +{ + auto& first_phase_scores = ranges.first; + auto& second_phase_scores = ranges.second; + return (first_phase_scores.low > second_phase_scores.low); +} + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.h b/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.h new file mode 100644 index 00000000000..4d3fe40a4dc --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.h @@ -0,0 +1,24 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "scores.h" + +namespace search::queryeval { + +/* + * Rescore hits not selected for second phase to prevent them from getting + * a better score than hits selected for second phase ranking. + */ +class FirstPhaseRescorer { + double _scale; + double _adjust; +public: + FirstPhaseRescorer(const std::pair<Scores,Scores>& ranges); + static bool need_rescore(const std::pair<Scores,Scores>& ranges); + double rescore(double score) const noexcept { + return ((score * _scale) - _adjust); + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp index bf7f44f0e7a..ad6c1342900 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "hitcollector.h" +#include "first_phase_rescorer.h" #include <vespa/searchlib/common/bitvector.h> #include <vespa/searchlib/common/sort.h> #include <cassert> @@ -43,9 +44,7 @@ HitCollector::HitCollector(uint32_t numDocs, uint32_t maxHitsSize) _unordered(false), _docIdVector(), _bitVector(), - _reRankedHits(), - _scale(1.0), - _adjust(0) + _reRankedHits() { if (_maxHitsSize > 0) { _collector = std::make_unique<RankedHitCollector>(*this); @@ -71,7 +70,7 @@ HitCollector::RankedHitCollector::collect(uint32_t docId, feature_t score) } hc._hits.emplace_back(docId, score); } else { - collectAndChangeCollector(docId, score); + collectAndChangeCollector(docId, score); // note - self-destruct. } } @@ -101,11 +100,10 @@ HitCollector::RankedHitCollector::collectAndChangeCollector(uint32_t docId, feat if (hc._maxDocIdVectorSize > hc._maxHitsSize) { // start using docid vector hc._docIdVector.reserve(hc._maxDocIdVectorSize); - uint32_t iSize = hc._hits.size(); - for (uint32_t i = 0; i < iSize; ++i) { - hc._docIdVector.push_back(hc._hits[i].first); + for (const auto& hit : hc._hits) { + hc._docIdVector.push_back(hit.first); } - if ((iSize > 0) && (docId < hc._docIdVector.back())) { + if (!hc._docIdVector.empty() && (docId < hc._docIdVector.back())) { hc._unordered = true; } hc._docIdVector.push_back(docId); @@ -114,9 +112,8 @@ HitCollector::RankedHitCollector::collectAndChangeCollector(uint32_t docId, feat // start using bit vector hc._bitVector = BitVector::create(hc._numDocs); hc._bitVector->invalidateCachedCount(); - uint32_t iSize = hc._hits.size(); - for (uint32_t i = 0; i < iSize; ++i) { - hc._bitVector->setBit(hc._hits[i].first); + for (const auto& hit : _hc._hits) { + hc._bitVector->setBit(hit.first); } hc._bitVector->setBit(docId); newCollector = std::make_unique<BitVectorCollector<true>>(hc); @@ -125,7 +122,7 @@ HitCollector::RankedHitCollector::collectAndChangeCollector(uint32_t docId, feat std::make_heap(hc._hits.begin(), hc._hits.end(), ScoreComparator()); hc._hitsSortOrder = SortOrder::HEAP; this->considerForHitVector(docId, score); - hc._collector = std::move(newCollector); + hc._collector = std::move(newCollector); // note - self-destruct. } template<bool CollectRankedHit> @@ -145,7 +142,7 @@ HitCollector::DocIdCollector<CollectRankedHit>::collect(uint32_t docId, feature_ } hc._docIdVector.push_back(docId); } else { - collectAndChangeCollector(docId); + collectAndChangeCollector(docId); // note - self-destruct. } } @@ -157,9 +154,8 @@ HitCollector::DocIdCollector<CollectRankedHit>::collectAndChangeCollector(uint32 // start using bit vector instead of docid array. hc._bitVector = BitVector::create(hc._numDocs); hc._bitVector->invalidateCachedCount(); - uint32_t iSize = static_cast<uint32_t>(hc._docIdVector.size()); - for (uint32_t i = 0; i < iSize; ++i) { - hc._bitVector->setBit(hc._docIdVector[i]); + for (auto docid : hc._docIdVector) { + hc._bitVector->setBit(docid); } std::vector<uint32_t> emptyVector; emptyVector.swap(hc._docIdVector); @@ -191,6 +187,36 @@ HitCollector::setRanges(const std::pair<Scores, Scores> &ranges) namespace { +struct NoRescorer +{ + static double rescore(double score) noexcept { return score; } +}; + +template <typename Rescorer> +void +add_rescored_hits(ResultSet& rs, const std::vector<HitCollector::Hit>& hits, Rescorer rescorer) +{ + for (auto& hit : hits) { + rs.push_back({hit.first, rescorer.rescore(hit.second)}); + } +} + +template <typename Rescorer> +void +mixin_rescored_hits(ResultSet& rs, const std::vector<HitCollector::Hit>& hits, const std::vector<uint32_t>& docids, double default_value, Rescorer rescorer) +{ + auto hits_cur = hits.begin(); + auto hits_end = hits.end(); + for (auto docid : docids) { + if (hits_cur != hits_end && docid == hits_cur->first) { + rs.push_back({docid, rescorer.rescore(hits_cur->second)}); + ++hits_cur; + } else { + rs.push_back({docid, default_value}); + } + } +} + void mergeHitsIntoResultSet(const std::vector<HitCollector::Hit> &hits, ResultSet &result) { @@ -211,66 +237,29 @@ mergeHitsIntoResultSet(const std::vector<HitCollector::Hit> &hits, ResultSet &re std::unique_ptr<ResultSet> HitCollector::getResultSet(HitRank default_value) { - bool needReScore = false; - Scores &initHeapScores = _ranges.first; - Scores &finalHeapScores = _ranges.second; - if (initHeapScores.low > finalHeapScores.low) { - // scale and adjust the score according to the range - // of the initial and final heap score values to avoid that - // a score from the first phase is larger than finalHeapScores.low - feature_t initRange = initHeapScores.high - initHeapScores.low; - if (initRange < 1.0) initRange = 1.0f; - feature_t finalRange = finalHeapScores.high - finalHeapScores.low; - if (finalRange < 1.0) finalRange = 1.0f; - _scale = finalRange / initRange; - _adjust = initHeapScores.low * _scale - finalHeapScores.low; - needReScore = true; - } + bool needReScore = FirstPhaseRescorer::need_rescore(_ranges); + FirstPhaseRescorer rescorer(_ranges); // destroys the heap property or score sort order sortHitsByDocId(); auto rs = std::make_unique<ResultSet>(); if ( ! _collector->isDocIdCollector() ) { - unsigned int iSize = _hits.size(); - rs->allocArray(iSize); + rs->allocArray(_hits.size()); if (needReScore) { - for (uint32_t i = 0; i < iSize; ++i) { - rs->push_back(RankedHit(_hits[i].first, getReScore(_hits[i].second))); - } + add_rescored_hits(*rs, _hits, rescorer); } else { - for (uint32_t i = 0; i < iSize; ++i) { - rs->push_back(RankedHit(_hits[i].first, _hits[i].second)); - } + add_rescored_hits(*rs, _hits, NoRescorer()); } } else { if (_unordered) { std::sort(_docIdVector.begin(), _docIdVector.end()); } - unsigned int iSize = _hits.size(); - unsigned int jSize = _docIdVector.size(); - rs->allocArray(jSize); - uint32_t i = 0; + rs->allocArray(_docIdVector.size()); if (needReScore) { - for (uint32_t j = 0; j < jSize; ++j) { - uint32_t docId = _docIdVector[j]; - if (i < iSize && docId == _hits[i].first) { - rs->push_back(RankedHit(docId, getReScore(_hits[i].second))); - ++i; - } else { - rs->push_back(RankedHit(docId, default_value)); - } - } + mixin_rescored_hits(*rs, _hits, _docIdVector, default_value, rescorer); } else { - for (uint32_t j = 0; j < jSize; ++j) { - uint32_t docId = _docIdVector[j]; - if (i < iSize && docId == _hits[i].first) { - rs->push_back(RankedHit(docId, _hits[i].second)); - ++i; - } else { - rs->push_back(RankedHit(docId, default_value)); - } - } + mixin_rescored_hits(*rs, _hits, _docIdVector, default_value, NoRescorer()); } } diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h index 94ffe619bab..903c2ab5b13 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h @@ -35,8 +35,6 @@ private: std::vector<Hit> _reRankedHits; std::pair<Scores, Scores> _ranges; - feature_t _scale; - feature_t _adjust; struct ScoreComparator { bool operator() (const Hit & lhs, const Hit & rhs) const noexcept { @@ -120,9 +118,6 @@ private: void collect(uint32_t docId, feature_t score) override; }; - HitRank getReScore(feature_t score) const { - return ((score * _scale) - _adjust); - } VESPA_DLL_LOCAL void sortHitsByScore(size_t topn); VESPA_DLL_LOCAL void sortHitsByDocId(); |