summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorHåvard Pettersen <havardpe@oath.com>2020-10-30 21:45:49 +0000
committerHåvard Pettersen <havardpe@oath.com>2020-11-02 15:30:33 +0000
commit354bfd789c8ca637a7d719e6c17b494710ffd40b (patch)
tree756993e7e6543e3409bd1979de51f087cf05fc7a /searchlib
parentfe688d87e4ff96dc63d797eaba5582d2594d7076 (diff)
balance second phase ranking workload
... by first giving each thread the same number of results to re-rank and then exchanging results back to the threads finding them during matching/first phase ranking.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/hitcollector/hitcollector_test.cpp40
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp43
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/hitcollector.h21
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/scores.h11
4 files changed, 52 insertions, 63 deletions
diff --git a/searchlib/src/tests/hitcollector/hitcollector_test.cpp b/searchlib/src/tests/hitcollector/hitcollector_test.cpp
index 2274314c7da..8aceb583b9c 100644
--- a/searchlib/src/tests/hitcollector/hitcollector_test.cpp
+++ b/searchlib/src/tests/hitcollector/hitcollector_test.cpp
@@ -14,26 +14,28 @@ using namespace search::queryeval;
typedef std::map<uint32_t, feature_t> ScoreMap;
-struct BasicScorer : public HitCollector::DocumentScorer
+using Ranges = std::pair<Scores, Scores>;
+
+struct BasicScorer
{
feature_t _scoreDelta;
explicit BasicScorer(feature_t scoreDelta) : _scoreDelta(scoreDelta) {}
- feature_t score(uint32_t docId) override {
- return docId + _scoreDelta;
+ feature_t score(uint32_t docid) const {
+ return (docid + _scoreDelta);
}
};
-struct PredefinedScorer : public HitCollector::DocumentScorer
+struct PredefinedScorer
{
ScoreMap _scores;
explicit PredefinedScorer(ScoreMap scores) : _scores(std::move(scores)) {}
- feature_t score(uint32_t docId) override {
- feature_t retval = default_rank_value;
- auto itr = _scores.find(docId);
+ feature_t score(uint32_t docid) const {
+ feature_t my_score = default_rank_value;
+ auto itr = _scores.find(docid);
if (itr != _scores.end()) {
- retval = itr->second;
+ my_score = itr->second;
}
- return retval;
+ return my_score;
}
};
@@ -46,6 +48,20 @@ std::vector<HitCollector::Hit> extract(SortedHitSequence seq) {
return ret;
}
+template <typename Scorer>
+size_t do_reRank(const Scorer &scorer, HitCollector &hc, size_t count) {
+ Ranges ranges;
+ auto hits = extract(hc.getSortedHitSequence(count));
+ for (auto &[docid, score]: hits) {
+ ranges.first.update(score);
+ score = scorer.score(docid);
+ ranges.second.update(score);
+ }
+ hc.setRanges(ranges);
+ hc.setReRankedHits(std::move(hits));
+ return hc.getReRankedHits().size();
+}
+
void checkResult(const ResultSet & rs, const std::vector<RankedHit> & exp)
{
if ( ! exp.empty()) {
@@ -156,7 +172,7 @@ struct Fixture {
}
}
size_t reRank(size_t count) {
- return hc.reRank(scorer, extract(hc.getSortedHitSequence(count)));
+ return do_reRank(scorer, hc, count);
}
size_t reRank() { return reRank(5); }
};
@@ -295,7 +311,7 @@ void testScaling(const std::vector<feature_t> &initScores,
PredefinedScorer scorer(std::move(finalScores));
// perform second phase ranking
- EXPECT_EQUAL(2u, hc.reRank(scorer, extract(hc.getSortedHitSequence(2))));
+ EXPECT_EQUAL(2u, do_reRank(scorer, hc, 2));
// check results
std::unique_ptr<ResultSet> rs = hc.getResultSet();
@@ -462,7 +478,7 @@ TEST_F("require that result set is merged correctly with second phase ranking (d
f.hc.addHit(i, i + 1000);
addExpectedHitForMergeTest(f, expRh, i);
}
- EXPECT_EQUAL(f.maxHeapSize, f.hc.reRank(scorer, extract(f.hc.getSortedHitSequence(f.maxHeapSize))));
+ EXPECT_EQUAL(f.maxHeapSize, do_reRank(scorer, f.hc, f.maxHeapSize));
std::unique_ptr<ResultSet> rs = f.hc.getResultSet();
TEST_DO(checkResult(*rs, expRh));
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp
index 7002faa6451..3f41dc3553b 100644
--- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp
@@ -45,9 +45,7 @@ HitCollector::HitCollector(uint32_t numDocs,
_bitVector(),
_reRankedHits(),
_scale(1.0),
- _adjust(0),
- _hasReRanked(false),
- _needReScore(false)
+ _adjust(0)
{
if (_maxHitsSize > 0) {
_collector = std::make_unique<RankedHitCollector>(*this);
@@ -174,32 +172,12 @@ HitCollector::getSortedHitSequence(size_t max_hits)
return SortedHitSequence(&_hits[0], &_scoreOrder[0], num_hits);
}
-size_t
-HitCollector::reRank(DocumentScorer &scorer, std::vector<Hit> hits) {
- if (hits.empty()) { return 0; }
-
- size_t hitsToReRank = hits.size();
- Scores &initScores = _ranges.first;
- Scores &finalScores = _ranges.second;
- initScores = Scores(hits.back().second, hits.front().second);
- finalScores = Scores(std::numeric_limits<feature_t>::max(),
- -std::numeric_limits<feature_t>::max());
-
- std::sort(hits.begin(), hits.end()); // sort on docId
- for (auto &hit : hits) {
- hit.second = scorer.score(hit.first);
- finalScores.low = std::min(finalScores.low, hit.second);
- finalScores.high = std::max(finalScores.high, hit.second);
- }
- _reRankedHits = std::move(hits);
- _hasReRanked = true;
- return hitsToReRank;
-}
-
-std::pair<Scores, Scores>
-HitCollector::getRanges() const
+void
+HitCollector::setReRankedHits(std::vector<Hit> hits)
{
- return _ranges;
+ auto sort_on_docid = [](const Hit &a, const Hit &b){ return (a.first < b.first); };
+ std::sort(hits.begin(), hits.end(), sort_on_docid);
+ _reRankedHits = std::move(hits);
}
void
@@ -230,6 +208,7 @@ mergeHitsIntoResultSet(const std::vector<HitCollector::Hit> &hits, ResultSet &re
std::unique_ptr<ResultSet>
HitCollector::getResultSet(HitRank default_value)
{
+ bool needReScore = false;
Scores &initHeapScores = _ranges.first;
Scores &finalHeapScores = _ranges.second;
if (initHeapScores.low > finalHeapScores.low) {
@@ -242,7 +221,7 @@ HitCollector::getResultSet(HitRank default_value)
if (finalRange < 1.0) finalRange = 1.0f;
_scale = finalRange / initRange;
_adjust = initHeapScores.low * _scale - finalHeapScores.low;
- _needReScore = true;
+ needReScore = true;
}
// destroys the heap property or score sort order
@@ -252,7 +231,7 @@ HitCollector::getResultSet(HitRank default_value)
if ( ! _collector->isDocIdCollector() ) {
unsigned int iSize = _hits.size();
rs->allocArray(iSize);
- if (_needReScore) {
+ if (needReScore) {
for (uint32_t i = 0; i < iSize; ++i) {
rs->push_back(RankedHit(_hits[i].first, getReScore(_hits[i].second)));
}
@@ -269,7 +248,7 @@ HitCollector::getResultSet(HitRank default_value)
unsigned int jSize = _docIdVector.size();
rs->allocArray(jSize);
uint32_t i = 0;
- if (_needReScore) {
+ if (needReScore) {
for (uint32_t j = 0; j < jSize; ++j) {
uint32_t docId = _docIdVector[j];
if (i < iSize && docId == _hits[i].first) {
@@ -292,7 +271,7 @@ HitCollector::getResultSet(HitRank default_value)
}
}
- if (_hasReRanked) {
+ if (!_reRankedHits.empty()) {
mergeHitsIntoResultSet(_reRankedHits, *rs);
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h
index 97beaa0fd55..a7a6af68e1f 100644
--- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h
+++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h
@@ -20,14 +20,6 @@ class HitCollector {
public:
using Hit = std::pair<uint32_t, feature_t>;
- /**
- * Interface used to calculate the second phase score for the documents being re-ranked.
- */
- struct DocumentScorer {
- virtual ~DocumentScorer() {}
- virtual feature_t score(uint32_t docId) = 0;
- };
-
private:
enum class SortOrder { NONE, DOC_ID, HEAP };
@@ -47,9 +39,6 @@ private:
feature_t _scale;
feature_t _adjust;
- bool _hasReRanked;
- bool _needReScore;
-
struct ScoreComparator {
bool operator() (const Hit & lhs, const Hit & rhs) const {
if (lhs.second == rhs.second) {
@@ -178,15 +167,9 @@ public:
SortedHitSequence getSortedHitSequence(size_t max_hits);
const std::vector<Hit> & getReRankedHits() const { return _reRankedHits; }
+ void setReRankedHits(std::vector<Hit> hits);
- /**
- * Re-ranks the given hits by invoking the score() method on the
- * given document scorer. The hits are sorted on doc id so that
- * score() is called in doc id order.
- **/
- size_t reRank(DocumentScorer &scorer, std::vector<Hit> hits);
-
- std::pair<Scores, Scores> getRanges() const;
+ const std::pair<Scores, Scores> &getRanges() const { return _ranges; }
void setRanges(const std::pair<Scores, Scores> &ranges);
/**
diff --git a/searchlib/src/vespa/searchlib/queryeval/scores.h b/searchlib/src/vespa/searchlib/queryeval/scores.h
index e8dae898909..08106e0c750 100644
--- a/searchlib/src/vespa/searchlib/queryeval/scores.h
+++ b/searchlib/src/vespa/searchlib/queryeval/scores.h
@@ -3,6 +3,7 @@
#pragma once
#include <vespa/searchlib/common/feature.h>
+#include <algorithm>
namespace search::queryeval {
@@ -13,6 +14,16 @@ struct Scores {
Scores(feature_t l, feature_t h) : low(l), high(h) {}
bool isValid() const { return low <= high; }
+
+ void update(feature_t score) {
+ if (isValid()) {
+ low = std::min(low, score);
+ high = std::max(high, score);
+ } else {
+ low = score;
+ high = score;
+ }
+ }
};
}