diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2018-08-08 20:41:22 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-08-08 20:41:22 +0200 |
commit | 269b9c2d80fb333f3bb049df9898437356cff784 (patch) | |
tree | 955d325723425a605e644670fc8845c50df7c81c | |
parent | 6c17b52cb68feeb35afa6cb77ead966756b391d6 (diff) | |
parent | 7bf3c8c42007d878cba06e7725df7588203c2619 (diff) |
Merge pull request #6485 from vespa-engine/balder/transfer-when-selecting-the-best
Balder/transfer when selecting the best
10 files changed, 135 insertions, 113 deletions
diff --git a/searchcore/src/tests/proton/matching/match_loop_communicator/match_loop_communicator_test.cpp b/searchcore/src/tests/proton/matching/match_loop_communicator/match_loop_communicator_test.cpp index 2bfd907b4a3..b64b2526f06 100644 --- a/searchcore/src/tests/proton/matching/match_loop_communicator/match_loop_communicator_test.cpp +++ b/searchcore/src/tests/proton/matching/match_loop_communicator/match_loop_communicator_test.cpp @@ -8,20 +8,21 @@ using namespace proton::matching; using vespalib::Box; using vespalib::make_box; -typedef MatchLoopCommunicator::Range Range; -typedef MatchLoopCommunicator::RangePair RangePair; -typedef MatchLoopCommunicator::feature_t feature_t; -typedef MatchLoopCommunicator::Matches Matches; +using Range = MatchLoopCommunicator::Range; +using RangePair = MatchLoopCommunicator::RangePair; +using Matches = MatchLoopCommunicator::Matches; +using Hit = MatchLoopCommunicator::Hit; +using Hits = MatchLoopCommunicator::Hits; -std::vector<feature_t> makeScores(size_t id) { +Hits makeScores(size_t id) { switch (id) { - case 0: return make_box<feature_t>(5.4, 4.4, 3.4, 2.4, 1.4); - case 1: return make_box<feature_t>(5.3, 4.3, 3.3, 2.3, 1.3); - case 2: return make_box<feature_t>(5.2, 4.2, 3.2, 2.2, 1.2); - case 3: return make_box<feature_t>(5.1, 4.1, 3.1, 2.1, 1.1); - case 4: return make_box<feature_t>(5.0, 4.0, 3.0, 2.0, 1.0); + case 0: return make_box<Hit>({1, 5.4}, {2, 4.4}, {3, 3.4}, {4, 2.4}, {5, 1.4}); + case 1: return make_box<Hit>({11, 5.3}, {12, 4.3}, {13, 3.3}, {14, 2.3}, {15, 1.3}); + case 2: return make_box<Hit>({21, 5.2}, {22, 4.2}, {23, 3.2}, {24, 2.2}, {25, 1.2}); + case 3: return make_box<Hit>({31, 5.1}, {32, 4.1}, {33, 3.1}, {34, 2.1}, {35, 1.1}); + case 4: return make_box<Hit>({41, 5.0}, {42, 4.0}, {43, 3.0}, {44, 2.0}, {45, 1.0}); } - return Box<feature_t>(); + return Box<Hit>(); } RangePair makeRanges(size_t id) { @@ -35,43 +36,65 @@ RangePair makeRanges(size_t id) { return std::make_pair(Range(-50, -60), Range(60, 50)); } +void equal(size_t count, const Hits & a, const Hits & b) { + EXPECT_EQUAL(count, b.size()); + for (size_t i(0); i < count; i++) { + EXPECT_EQUAL(a[i].first, b[i].first); + EXPECT_EQUAL(a[i].second , b[i].second); + } +} + +struct EveryOdd : public search::queryeval::IDiversifier { + bool accepted(uint32_t docId) override { + return docId & 0x01; + } +}; + TEST_F("require that selectBest gives appropriate results for single thread", MatchLoopCommunicator(num_threads, 3)) { - EXPECT_EQUAL(2u, f1.selectBest(make_box<feature_t>(5, 4))); - EXPECT_EQUAL(3u, f1.selectBest(make_box<feature_t>(5, 4, 3))); - EXPECT_EQUAL(3u, f1.selectBest(make_box<feature_t>(5, 4, 3, 2))); + TEST_DO(equal(2u, make_box<Hit>({1, 5}, {2, 4}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4})))); + TEST_DO(equal(3u, make_box<Hit>({1, 5}, {2, 4}, {3, 3}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4}, {3, 3})))); + TEST_DO(equal(3u, make_box<Hit>({1, 5}, {2, 4}, {3, 3}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4}, {3, 3}, {4, 2})))); +} + +TEST_F("require that selectBest gives appropriate results for single thread with filter", + MatchLoopCommunicator(num_threads, 3, std::make_unique<EveryOdd>())) +{ + TEST_DO(equal(1u, make_box<Hit>({1, 5}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4})))); + TEST_DO(equal(2u, make_box<Hit>({1, 5}, {3, 3}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4}, {3, 3})))); + TEST_DO(equal(3u, make_box<Hit>({1, 5}, {3, 3}, {5, 1}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4}, {3, 3}, {4, 2}, {5, 1}, {6, 0})))); } TEST_MT_F("require that selectBest works with no hits", 10, MatchLoopCommunicator(num_threads, 10)) { - EXPECT_EQUAL(0u, f1.selectBest(Box<feature_t>())); + EXPECT_TRUE(f1.selectBest(Box<Hit>()).empty()); } TEST_MT_F("require that selectBest works with too many hits from all threads", 5, MatchLoopCommunicator(num_threads, 13)) { if (thread_id < 3) { - EXPECT_EQUAL(3u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(3u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } else { - EXPECT_EQUAL(2u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(2u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } } TEST_MT_F("require that selectBest works with some exhausted threads", 5, MatchLoopCommunicator(num_threads, 22)) { if (thread_id < 2) { - EXPECT_EQUAL(5u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(5u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } else { - EXPECT_EQUAL(4u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(4u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } } TEST_MT_F("require that selectBest can select all hits from all threads", 5, MatchLoopCommunicator(num_threads, 100)) { - EXPECT_EQUAL(5u, f1.selectBest(makeScores(thread_id))); + EXPECT_EQUAL(5u, f1.selectBest(makeScores(thread_id)).size()); } TEST_MT_F("require that selectBest works with some empty threads", 10, MatchLoopCommunicator(num_threads, 7)) { if (thread_id < 2) { - EXPECT_EQUAL(2u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(2u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } else if (thread_id < 5) { - EXPECT_EQUAL(1u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(1u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } else { - EXPECT_EQUAL(0u, f1.selectBest(makeScores(thread_id))); + EXPECT_TRUE(f1.selectBest(makeScores(thread_id)).empty()); } } diff --git a/searchcore/src/vespa/searchcore/proton/matching/i_match_loop_communicator.h b/searchcore/src/vespa/searchcore/proton/matching/i_match_loop_communicator.h index c22232d47db..df24fa9e76b 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/i_match_loop_communicator.h +++ b/searchcore/src/vespa/searchcore/proton/matching/i_match_loop_communicator.h @@ -5,14 +5,16 @@ #include <vespa/searchlib/queryeval/scores.h> #include <utility> #include <cstddef> +#include <cstdint> #include <vector> namespace proton::matching { struct IMatchLoopCommunicator { - typedef search::feature_t feature_t; - typedef search::queryeval::Scores Range; - typedef std::pair<Range, Range> RangePair; + using Range = search::queryeval::Scores; + using RangePair = std::pair<Range, Range>; + using Hit = std::pair<uint32_t, search::feature_t>; + using Hits = std::vector<Hit>; struct Matches { size_t hits; size_t docs; @@ -24,7 +26,7 @@ struct IMatchLoopCommunicator { } }; virtual double estimate_match_frequency(const Matches &matches) = 0; - virtual size_t selectBest(const std::vector<feature_t> &sortedScores) = 0; + virtual Hits selectBest(Hits sortedHits) = 0; virtual RangePair rangeCover(const RangePair &ranges) = 0; virtual ~IMatchLoopCommunicator() {} }; diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.cpp index 95148ef56e8..54cffce7f40 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.cpp @@ -6,8 +6,11 @@ namespace proton:: matching { MatchLoopCommunicator::MatchLoopCommunicator(size_t threads, size_t topN) + : MatchLoopCommunicator(threads, topN, std::unique_ptr<IDiversifier>()) +{} +MatchLoopCommunicator::MatchLoopCommunicator(size_t threads, size_t topN, std::unique_ptr<IDiversifier> diversifier) : _estimate_match_frequency(threads), - _selectBest(threads, topN), + _selectBest(threads, topN, std::move(diversifier)), _rangeCover(threads) {} MatchLoopCommunicator::~MatchLoopCommunicator() = default; @@ -29,22 +32,47 @@ MatchLoopCommunicator::EstimateMatchFrequency::mingle() } } +MatchLoopCommunicator::SelectBest::SelectBest(size_t n, size_t topN_in, std::unique_ptr<IDiversifier> diversifier) + : vespalib::Rendezvous<Hits, Hits>(n), + topN(topN_in), + _indexes(n, 0), + _diversifier(std::move(diversifier)) +{} +MatchLoopCommunicator::SelectBest::~SelectBest() = default; + +template<typename Q, typename F> +void +MatchLoopCommunicator::SelectBest::mingle(Q & queue, F && accept) { + for (size_t picked = 0; picked < topN && !queue.empty(); ) { + uint32_t i = queue.front(); + const Hit & hit = in(i)[_indexes[i]]; + if (accept(hit.first)) { + out(i).push_back(hit); + ++picked; + } + if (in(i).size() > ++_indexes[i]) { + queue.adjust(); + } else { + queue.pop_front(); + } + } +} + void MatchLoopCommunicator::SelectBest::mingle() { vespalib::PriorityQueue<uint32_t, SelectCmp> queue(SelectCmp(*this)); for (size_t i = 0; i < size(); ++i) { if (!in(i).empty()) { + out(i).reserve(std::min(topN, in(i).size())); + _indexes[i] = 0; queue.push(i); } } - for (size_t picked = 0; picked < topN && !queue.empty(); ++picked) { - uint32_t i = queue.front(); - if (in(i).size() > ++out(i)) { - queue.adjust(); - } else { - queue.pop_front(); - } + if (_diversifier) { + mingle(queue, [diversifier=_diversifier.get()](uint32_t docId) { return diversifier->accepted(docId);}); + } else { + mingle(queue, [](uint32_t) { return true;}); } } diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.h b/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.h index c1eec37299f..e17efd66c78 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.h +++ b/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.h @@ -3,6 +3,7 @@ #pragma once #include "i_match_loop_communicator.h" +#include <vespa/searchlib/queryeval/idiversifier.h> #include <vespa/vespalib/util/rendezvous.h> namespace proton::matching { @@ -10,44 +11,49 @@ namespace proton::matching { class MatchLoopCommunicator : public IMatchLoopCommunicator { private: + using IDiversifier = search::queryeval::IDiversifier; struct EstimateMatchFrequency : vespalib::Rendezvous<Matches, double> { - EstimateMatchFrequency(size_t n) - : vespalib::Rendezvous<Matches, double>(n) {} + EstimateMatchFrequency(size_t n) : vespalib::Rendezvous<Matches, double>(n) {} void mingle() override; }; - struct SelectBest : vespalib::Rendezvous<std::vector<feature_t>, size_t> { + struct SelectBest : vespalib::Rendezvous<Hits, Hits> { size_t topN; - SelectBest(size_t n, size_t topN_in) - : vespalib::Rendezvous<std::vector<feature_t>, size_t>(n), topN(topN_in) {} + std::vector<uint32_t> _indexes; + std::unique_ptr<IDiversifier> _diversifier; + SelectBest(size_t n, size_t topN_in, std::unique_ptr<IDiversifier>); + ~SelectBest() override; void mingle() override; - bool cmp(const uint32_t &a, const uint32_t &b) { - return (in(a)[out(a)] > in(b)[out(b)]); + template<typename Q, typename F> + void mingle(Q & queue, F && accept); + bool cmp(uint32_t a, uint32_t b) { + return (in(a)[_indexes[a]].second > in(b)[_indexes[b]].second); } }; struct SelectCmp { SelectBest &sb; SelectCmp(SelectBest &sb_in) : sb(sb_in) {} - bool operator()(const uint32_t &a, const uint32_t &b) const { + bool operator()(uint32_t a, uint32_t b) const { return (sb.cmp(a, b)); } }; struct RangeCover : vespalib::Rendezvous<RangePair, RangePair> { - RangeCover(size_t n) - : vespalib::Rendezvous<RangePair, RangePair>(n) {}void mingle() override; + RangeCover(size_t n) : vespalib::Rendezvous<RangePair, RangePair>(n) {} + void mingle() override; }; - EstimateMatchFrequency _estimate_match_frequency; - SelectBest _selectBest; - RangeCover _rangeCover; + EstimateMatchFrequency _estimate_match_frequency; + SelectBest _selectBest; + RangeCover _rangeCover; public: MatchLoopCommunicator(size_t threads, size_t topN); + MatchLoopCommunicator(size_t threads, size_t topN, std::unique_ptr<IDiversifier>); ~MatchLoopCommunicator(); double estimate_match_frequency(const Matches &matches) override { return _estimate_match_frequency.rendezvous(matches); } - size_t selectBest(const std::vector<feature_t> &sortedScores) override { - return _selectBest.rendezvous(sortedScores); + Hits selectBest(Hits sortedHits) override { + return _selectBest.rendezvous(sortedHits); } RangePair rangeCover(const RangePair &ranges) override { return _rangeCover.rendezvous(ranges); diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp index 0eb49aec754..920f84a21b0 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp @@ -25,8 +25,8 @@ struct TimedMatchLoopCommunicator : IMatchLoopCommunicator { double estimate_match_frequency(const Matches &matches) override { return communicator.estimate_match_frequency(matches); } - size_t selectBest(const std::vector<feature_t> &sortedScores) override { - size_t result = communicator.selectBest(sortedScores); + Hits selectBest(Hits sortedHits) override { + Hits result = communicator.selectBest(std::move(sortedHits)); rerank_time.start(); return result; } diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp index 1efb74b96ba..9232a15043b 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp @@ -265,12 +265,15 @@ MatchThread::findMatches(MatchTools &tools) tools.setup_second_phase(); DocidRange docid_range = scheduler.total_span(thread_id); tools.search().initRange(docid_range.begin, docid_range.end); - auto sorted_scores = hits.getSortedHeapScores(); + auto sorted_hits = hits.getSortedHeapHits(); WaitTimer select_best_timer(wait_time_s); - size_t useHits = communicator.selectBest(sorted_scores); + auto kept_hits = communicator.selectBest(std::move(sorted_hits)); select_best_timer.done(); DocumentScorer scorer(tools.rank_program(), tools.search()); - uint32_t reRanked = hits.reRank(scorer, tools.getHardDoom().doom() ? 0 : useHits); + if (tools.getHardDoom().doom()) { + kept_hits.clear(); + } + uint32_t reRanked = hits.reRank(scorer, std::move(kept_hits)); thread_stats.docsReRanked(reRanked); } { // rank scaling diff --git a/searchlib/src/tests/hitcollector/hitcollector_test.cpp b/searchlib/src/tests/hitcollector/hitcollector_test.cpp index f025c09b6b9..f9f977f4093 100644 --- a/searchlib/src/tests/hitcollector/hitcollector_test.cpp +++ b/searchlib/src/tests/hitcollector/hitcollector_test.cpp @@ -149,10 +149,14 @@ struct Fixture { } } size_t reRank() { - return hc.reRank(scorer); + return hc.reRank(scorer, hc.getSortedHeapHits()); } size_t reRank(size_t count) { - return hc.reRank(scorer, count); + auto hits = hc.getSortedHeapHits(); + if (hits.size() > count) { + hits.resize(count); + } + return hc.reRank(scorer, std::move(hits)); } }; @@ -231,18 +235,6 @@ TEST_F("testReRank - partial", AscendingScoreFixture) TEST_DO(checkResult(*rs, f.expBv.get())); } -TEST_F("require that scores for 2nd phase candidates can be retrieved", DescendingScoreFixture) -{ - f.addHits(); - std::vector<feature_t> scores = f.hc.getSortedHeapScores(); - ASSERT_EQUAL(5u, scores.size()); - EXPECT_EQUAL(100, scores[0]); - EXPECT_EQUAL(99, scores[1]); - EXPECT_EQUAL(98, scores[2]); - EXPECT_EQUAL(97, scores[3]); - EXPECT_EQUAL(96, scores[4]); -} - TEST_F("require that hits for 2nd phase candidates can be retrieved", DescendingScoreFixture) { f.addHits(); @@ -302,7 +294,7 @@ void testScaling(const std::vector<feature_t> &initScores, PredefinedScorer scorer(std::move(finalScores)); // perform second phase ranking - EXPECT_EQUAL(2u, hc.reRank(scorer)); + EXPECT_EQUAL(2u, hc.reRank(scorer, hc.getSortedHeapHits())); // check results std::unique_ptr<ResultSet> rs = hc.getResultSet(); @@ -469,7 +461,7 @@ TEST_F("require that result set is merged correctly with second phase ranking (d f.hc.addHit(i, i + 1000); addExpectedHitForMergeTest(f, expRh, i); } - EXPECT_EQUAL(f.maxHeapSize, f.hc.reRank(scorer)); + EXPECT_EQUAL(f.maxHeapSize, f.hc.reRank(scorer, f.hc.getSortedHeapHits())); std::unique_ptr<ResultSet> rs = f.hc.getResultSet(); TEST_DO(checkResult(*rs, expRh)); } diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp index e0385af99bc..2a38d8bf27e 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp @@ -168,19 +168,6 @@ HitCollector::DocIdCollector<CollectRankedHit>::collectAndChangeCollector(uint32 hc._collector = std::make_unique<BitVectorCollector<CollectRankedHit>>(hc); // note - self-destruct. } -std::vector<feature_t> -HitCollector::getSortedHeapScores() -{ - std::vector<feature_t> scores; - size_t scoresToReturn = std::min(_hits.size(), static_cast<size_t>(_maxReRankHitsSize)); - scores.reserve(scoresToReturn); - sortHitsByScore(scoresToReturn); - for (size_t i = 0; i < scoresToReturn; ++i) { - scores.push_back(_hits[_scoreOrder[i]].second); - } - return scores; -} - std::vector<HitCollector::Hit> HitCollector::getSortedHeapHits() { @@ -194,31 +181,10 @@ HitCollector::getSortedHeapHits() return scores; } - -size_t -HitCollector::reRank(DocumentScorer &scorer) -{ - return reRank(scorer, _maxReRankHitsSize); -} - -size_t -HitCollector::reRank(DocumentScorer &scorer, size_t count) -{ - size_t hitsToReRank = std::min(_hits.size(), count); - if (_hasReRanked || hitsToReRank == 0) { - return 0; - } - sortHitsByScore(hitsToReRank); - std::vector<Hit> hits; - hits.reserve(hitsToReRank); - for (size_t i(0); i < hitsToReRank; i++) { - hits.push_back(_hits[_scoreOrder[i]]); - } - return reRank(scorer, std::move(hits)); -} - size_t HitCollector::reRank(DocumentScorer &scorer, std::vector<Hit> hits) { + if (hits.empty()) { return 0; } + size_t hitsToReRank = hits.size(); Scores &initScores = _ranges.first; Scores &finalScores = _ranges.second; diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h index bbd1e08c65a..ef4bf76b66b 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h @@ -167,10 +167,9 @@ public: } /** - * Returns a sorted vector of scores for the hits that are stored + * Returns a sorted vector of hits for the hits that are stored * in the heap. These are the candidates for re-ranking. */ - std::vector<feature_t> getSortedHeapScores(); std::vector<Hit> getSortedHeapHits(); /** @@ -178,8 +177,6 @@ public: * method on the given document scorer. The best m hits are sorted on doc id * so that score() is called in doc id order. **/ - size_t reRank(DocumentScorer &scorer); - size_t reRank(DocumentScorer &scorer, size_t count); size_t reRank(DocumentScorer &scorer, std::vector<Hit> hits); std::pair<Scores, Scores> getRanges() const; diff --git a/vespalib/src/vespa/vespalib/util/box.h b/vespalib/src/vespa/vespalib/util/box.h index 5fb81cc9d28..efa5d9653f1 100644 --- a/vespalib/src/vespa/vespalib/util/box.h +++ b/vespalib/src/vespa/vespalib/util/box.h @@ -44,11 +44,16 @@ Box<T> make_box(const T &t1, const T &t2, const T &t3, const T &t4) { } template <typename T> -Box<T> make_box(const T &t1, const T &t2, const T &t3, const T &t4, - const T &t5) +Box<T> make_box(const T &t1, const T &t2, const T &t3, const T &t4, const T &t5) { return Box<T>().add(t1).add(t2).add(t3).add(t4).add(t5); } +template <typename T> +Box<T> make_box(const T &t1, const T &t2, const T &t3, const T &t4, const T &t5, const T &t6) +{ + return Box<T>().add(t1).add(t2).add(t3).add(t4).add(t5).add(t6); +} + } // namespace vespalib |