diff options
author | Håvard Pettersen <havardpe@oath.com> | 2020-10-30 21:45:49 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2020-11-02 15:30:33 +0000 |
commit | 354bfd789c8ca637a7d719e6c17b494710ffd40b (patch) | |
tree | 756993e7e6543e3409bd1979de51f087cf05fc7a /searchlib | |
parent | fe688d87e4ff96dc63d797eaba5582d2594d7076 (diff) |
balance second phase ranking workload
... by first giving each thread the same number of results to re-rank
and then exchanging results back to the threads finding them during
matching/first phase ranking.
Diffstat (limited to 'searchlib')
4 files changed, 52 insertions, 63 deletions
diff --git a/searchlib/src/tests/hitcollector/hitcollector_test.cpp b/searchlib/src/tests/hitcollector/hitcollector_test.cpp index 2274314c7da..8aceb583b9c 100644 --- a/searchlib/src/tests/hitcollector/hitcollector_test.cpp +++ b/searchlib/src/tests/hitcollector/hitcollector_test.cpp @@ -14,26 +14,28 @@ using namespace search::queryeval; typedef std::map<uint32_t, feature_t> ScoreMap; -struct BasicScorer : public HitCollector::DocumentScorer +using Ranges = std::pair<Scores, Scores>; + +struct BasicScorer { feature_t _scoreDelta; explicit BasicScorer(feature_t scoreDelta) : _scoreDelta(scoreDelta) {} - feature_t score(uint32_t docId) override { - return docId + _scoreDelta; + feature_t score(uint32_t docid) const { + return (docid + _scoreDelta); } }; -struct PredefinedScorer : public HitCollector::DocumentScorer +struct PredefinedScorer { ScoreMap _scores; explicit PredefinedScorer(ScoreMap scores) : _scores(std::move(scores)) {} - feature_t score(uint32_t docId) override { - feature_t retval = default_rank_value; - auto itr = _scores.find(docId); + feature_t score(uint32_t docid) const { + feature_t my_score = default_rank_value; + auto itr = _scores.find(docid); if (itr != _scores.end()) { - retval = itr->second; + my_score = itr->second; } - return retval; + return my_score; } }; @@ -46,6 +48,20 @@ std::vector<HitCollector::Hit> extract(SortedHitSequence seq) { return ret; } +template <typename Scorer> +size_t do_reRank(const Scorer &scorer, HitCollector &hc, size_t count) { + Ranges ranges; + auto hits = extract(hc.getSortedHitSequence(count)); + for (auto &[docid, score]: hits) { + ranges.first.update(score); + score = scorer.score(docid); + ranges.second.update(score); + } + hc.setRanges(ranges); + hc.setReRankedHits(std::move(hits)); + return hc.getReRankedHits().size(); +} + void checkResult(const ResultSet & rs, const std::vector<RankedHit> & exp) { if ( ! exp.empty()) { @@ -156,7 +172,7 @@ struct Fixture { } } size_t reRank(size_t count) { - return hc.reRank(scorer, extract(hc.getSortedHitSequence(count))); + return do_reRank(scorer, hc, count); } size_t reRank() { return reRank(5); } }; @@ -295,7 +311,7 @@ void testScaling(const std::vector<feature_t> &initScores, PredefinedScorer scorer(std::move(finalScores)); // perform second phase ranking - EXPECT_EQUAL(2u, hc.reRank(scorer, extract(hc.getSortedHitSequence(2)))); + EXPECT_EQUAL(2u, do_reRank(scorer, hc, 2)); // check results std::unique_ptr<ResultSet> rs = hc.getResultSet(); @@ -462,7 +478,7 @@ TEST_F("require that result set is merged correctly with second phase ranking (d f.hc.addHit(i, i + 1000); addExpectedHitForMergeTest(f, expRh, i); } - EXPECT_EQUAL(f.maxHeapSize, f.hc.reRank(scorer, extract(f.hc.getSortedHitSequence(f.maxHeapSize)))); + EXPECT_EQUAL(f.maxHeapSize, do_reRank(scorer, f.hc, f.maxHeapSize)); std::unique_ptr<ResultSet> rs = f.hc.getResultSet(); TEST_DO(checkResult(*rs, expRh)); } diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp index 7002faa6451..3f41dc3553b 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp @@ -45,9 +45,7 @@ HitCollector::HitCollector(uint32_t numDocs, _bitVector(), _reRankedHits(), _scale(1.0), - _adjust(0), - _hasReRanked(false), - _needReScore(false) + _adjust(0) { if (_maxHitsSize > 0) { _collector = std::make_unique<RankedHitCollector>(*this); @@ -174,32 +172,12 @@ HitCollector::getSortedHitSequence(size_t max_hits) return SortedHitSequence(&_hits[0], &_scoreOrder[0], num_hits); } -size_t -HitCollector::reRank(DocumentScorer &scorer, std::vector<Hit> hits) { - if (hits.empty()) { return 0; } - - size_t hitsToReRank = hits.size(); - Scores &initScores = _ranges.first; - Scores &finalScores = _ranges.second; - initScores = Scores(hits.back().second, hits.front().second); - finalScores = Scores(std::numeric_limits<feature_t>::max(), - -std::numeric_limits<feature_t>::max()); - - std::sort(hits.begin(), hits.end()); // sort on docId - for (auto &hit : hits) { - hit.second = scorer.score(hit.first); - finalScores.low = std::min(finalScores.low, hit.second); - finalScores.high = std::max(finalScores.high, hit.second); - } - _reRankedHits = std::move(hits); - _hasReRanked = true; - return hitsToReRank; -} - -std::pair<Scores, Scores> -HitCollector::getRanges() const +void +HitCollector::setReRankedHits(std::vector<Hit> hits) { - return _ranges; + auto sort_on_docid = [](const Hit &a, const Hit &b){ return (a.first < b.first); }; + std::sort(hits.begin(), hits.end(), sort_on_docid); + _reRankedHits = std::move(hits); } void @@ -230,6 +208,7 @@ mergeHitsIntoResultSet(const std::vector<HitCollector::Hit> &hits, ResultSet &re std::unique_ptr<ResultSet> HitCollector::getResultSet(HitRank default_value) { + bool needReScore = false; Scores &initHeapScores = _ranges.first; Scores &finalHeapScores = _ranges.second; if (initHeapScores.low > finalHeapScores.low) { @@ -242,7 +221,7 @@ HitCollector::getResultSet(HitRank default_value) if (finalRange < 1.0) finalRange = 1.0f; _scale = finalRange / initRange; _adjust = initHeapScores.low * _scale - finalHeapScores.low; - _needReScore = true; + needReScore = true; } // destroys the heap property or score sort order @@ -252,7 +231,7 @@ HitCollector::getResultSet(HitRank default_value) if ( ! _collector->isDocIdCollector() ) { unsigned int iSize = _hits.size(); rs->allocArray(iSize); - if (_needReScore) { + if (needReScore) { for (uint32_t i = 0; i < iSize; ++i) { rs->push_back(RankedHit(_hits[i].first, getReScore(_hits[i].second))); } @@ -269,7 +248,7 @@ HitCollector::getResultSet(HitRank default_value) unsigned int jSize = _docIdVector.size(); rs->allocArray(jSize); uint32_t i = 0; - if (_needReScore) { + if (needReScore) { for (uint32_t j = 0; j < jSize; ++j) { uint32_t docId = _docIdVector[j]; if (i < iSize && docId == _hits[i].first) { @@ -292,7 +271,7 @@ HitCollector::getResultSet(HitRank default_value) } } - if (_hasReRanked) { + if (!_reRankedHits.empty()) { mergeHitsIntoResultSet(_reRankedHits, *rs); } diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h index 97beaa0fd55..a7a6af68e1f 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h @@ -20,14 +20,6 @@ class HitCollector { public: using Hit = std::pair<uint32_t, feature_t>; - /** - * Interface used to calculate the second phase score for the documents being re-ranked. - */ - struct DocumentScorer { - virtual ~DocumentScorer() {} - virtual feature_t score(uint32_t docId) = 0; - }; - private: enum class SortOrder { NONE, DOC_ID, HEAP }; @@ -47,9 +39,6 @@ private: feature_t _scale; feature_t _adjust; - bool _hasReRanked; - bool _needReScore; - struct ScoreComparator { bool operator() (const Hit & lhs, const Hit & rhs) const { if (lhs.second == rhs.second) { @@ -178,15 +167,9 @@ public: SortedHitSequence getSortedHitSequence(size_t max_hits); const std::vector<Hit> & getReRankedHits() const { return _reRankedHits; } + void setReRankedHits(std::vector<Hit> hits); - /** - * Re-ranks the given hits by invoking the score() method on the - * given document scorer. The hits are sorted on doc id so that - * score() is called in doc id order. - **/ - size_t reRank(DocumentScorer &scorer, std::vector<Hit> hits); - - std::pair<Scores, Scores> getRanges() const; + const std::pair<Scores, Scores> &getRanges() const { return _ranges; } void setRanges(const std::pair<Scores, Scores> &ranges); /** diff --git a/searchlib/src/vespa/searchlib/queryeval/scores.h b/searchlib/src/vespa/searchlib/queryeval/scores.h index e8dae898909..08106e0c750 100644 --- a/searchlib/src/vespa/searchlib/queryeval/scores.h +++ b/searchlib/src/vespa/searchlib/queryeval/scores.h @@ -3,6 +3,7 @@ #pragma once #include <vespa/searchlib/common/feature.h> +#include <algorithm> namespace search::queryeval { @@ -13,6 +14,16 @@ struct Scores { Scores(feature_t l, feature_t h) : low(l), high(h) {} bool isValid() const { return low <= high; } + + void update(feature_t score) { + if (isValid()) { + low = std::min(low, score); + high = std::max(high, score); + } else { + low = score; + high = score; + } + } }; } |