diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2018-08-08 20:41:22 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-08-08 20:41:22 +0200 |
commit | 269b9c2d80fb333f3bb049df9898437356cff784 (patch) | |
tree | 955d325723425a605e644670fc8845c50df7c81c /searchcore | |
parent | 6c17b52cb68feeb35afa6cb77ead966756b391d6 (diff) | |
parent | 7bf3c8c42007d878cba06e7725df7588203c2619 (diff) |
Merge pull request #6485 from vespa-engine/balder/transfer-when-selecting-the-best
Balder/transfer when selecting the best
Diffstat (limited to 'searchcore')
6 files changed, 117 insertions, 55 deletions
diff --git a/searchcore/src/tests/proton/matching/match_loop_communicator/match_loop_communicator_test.cpp b/searchcore/src/tests/proton/matching/match_loop_communicator/match_loop_communicator_test.cpp index 2bfd907b4a3..b64b2526f06 100644 --- a/searchcore/src/tests/proton/matching/match_loop_communicator/match_loop_communicator_test.cpp +++ b/searchcore/src/tests/proton/matching/match_loop_communicator/match_loop_communicator_test.cpp @@ -8,20 +8,21 @@ using namespace proton::matching; using vespalib::Box; using vespalib::make_box; -typedef MatchLoopCommunicator::Range Range; -typedef MatchLoopCommunicator::RangePair RangePair; -typedef MatchLoopCommunicator::feature_t feature_t; -typedef MatchLoopCommunicator::Matches Matches; +using Range = MatchLoopCommunicator::Range; +using RangePair = MatchLoopCommunicator::RangePair; +using Matches = MatchLoopCommunicator::Matches; +using Hit = MatchLoopCommunicator::Hit; +using Hits = MatchLoopCommunicator::Hits; -std::vector<feature_t> makeScores(size_t id) { +Hits makeScores(size_t id) { switch (id) { - case 0: return make_box<feature_t>(5.4, 4.4, 3.4, 2.4, 1.4); - case 1: return make_box<feature_t>(5.3, 4.3, 3.3, 2.3, 1.3); - case 2: return make_box<feature_t>(5.2, 4.2, 3.2, 2.2, 1.2); - case 3: return make_box<feature_t>(5.1, 4.1, 3.1, 2.1, 1.1); - case 4: return make_box<feature_t>(5.0, 4.0, 3.0, 2.0, 1.0); + case 0: return make_box<Hit>({1, 5.4}, {2, 4.4}, {3, 3.4}, {4, 2.4}, {5, 1.4}); + case 1: return make_box<Hit>({11, 5.3}, {12, 4.3}, {13, 3.3}, {14, 2.3}, {15, 1.3}); + case 2: return make_box<Hit>({21, 5.2}, {22, 4.2}, {23, 3.2}, {24, 2.2}, {25, 1.2}); + case 3: return make_box<Hit>({31, 5.1}, {32, 4.1}, {33, 3.1}, {34, 2.1}, {35, 1.1}); + case 4: return make_box<Hit>({41, 5.0}, {42, 4.0}, {43, 3.0}, {44, 2.0}, {45, 1.0}); } - return Box<feature_t>(); + return Box<Hit>(); } RangePair makeRanges(size_t id) { @@ -35,43 +36,65 @@ RangePair makeRanges(size_t id) { return std::make_pair(Range(-50, -60), Range(60, 50)); } +void equal(size_t count, const Hits & a, const Hits & b) { + EXPECT_EQUAL(count, b.size()); + for (size_t i(0); i < count; i++) { + EXPECT_EQUAL(a[i].first, b[i].first); + EXPECT_EQUAL(a[i].second , b[i].second); + } +} + +struct EveryOdd : public search::queryeval::IDiversifier { + bool accepted(uint32_t docId) override { + return docId & 0x01; + } +}; + TEST_F("require that selectBest gives appropriate results for single thread", MatchLoopCommunicator(num_threads, 3)) { - EXPECT_EQUAL(2u, f1.selectBest(make_box<feature_t>(5, 4))); - EXPECT_EQUAL(3u, f1.selectBest(make_box<feature_t>(5, 4, 3))); - EXPECT_EQUAL(3u, f1.selectBest(make_box<feature_t>(5, 4, 3, 2))); + TEST_DO(equal(2u, make_box<Hit>({1, 5}, {2, 4}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4})))); + TEST_DO(equal(3u, make_box<Hit>({1, 5}, {2, 4}, {3, 3}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4}, {3, 3})))); + TEST_DO(equal(3u, make_box<Hit>({1, 5}, {2, 4}, {3, 3}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4}, {3, 3}, {4, 2})))); +} + +TEST_F("require that selectBest gives appropriate results for single thread with filter", + MatchLoopCommunicator(num_threads, 3, std::make_unique<EveryOdd>())) +{ + TEST_DO(equal(1u, make_box<Hit>({1, 5}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4})))); + TEST_DO(equal(2u, make_box<Hit>({1, 5}, {3, 3}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4}, {3, 3})))); + TEST_DO(equal(3u, make_box<Hit>({1, 5}, {3, 3}, {5, 1}), f1.selectBest(make_box<Hit>({1, 5}, {2, 4}, {3, 3}, {4, 2}, {5, 1}, {6, 0})))); } TEST_MT_F("require that selectBest works with no hits", 10, MatchLoopCommunicator(num_threads, 10)) { - EXPECT_EQUAL(0u, f1.selectBest(Box<feature_t>())); + EXPECT_TRUE(f1.selectBest(Box<Hit>()).empty()); } TEST_MT_F("require that selectBest works with too many hits from all threads", 5, MatchLoopCommunicator(num_threads, 13)) { if (thread_id < 3) { - EXPECT_EQUAL(3u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(3u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } else { - EXPECT_EQUAL(2u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(2u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } } TEST_MT_F("require that selectBest works with some exhausted threads", 5, MatchLoopCommunicator(num_threads, 22)) { if (thread_id < 2) { - EXPECT_EQUAL(5u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(5u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } else { - EXPECT_EQUAL(4u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(4u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } } TEST_MT_F("require that selectBest can select all hits from all threads", 5, MatchLoopCommunicator(num_threads, 100)) { - EXPECT_EQUAL(5u, f1.selectBest(makeScores(thread_id))); + EXPECT_EQUAL(5u, f1.selectBest(makeScores(thread_id)).size()); } TEST_MT_F("require that selectBest works with some empty threads", 10, MatchLoopCommunicator(num_threads, 7)) { if (thread_id < 2) { - EXPECT_EQUAL(2u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(2u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } else if (thread_id < 5) { - EXPECT_EQUAL(1u, f1.selectBest(makeScores(thread_id))); + TEST_DO(equal(1u, makeScores(thread_id), f1.selectBest(makeScores(thread_id)))); } else { - EXPECT_EQUAL(0u, f1.selectBest(makeScores(thread_id))); + EXPECT_TRUE(f1.selectBest(makeScores(thread_id)).empty()); } } diff --git a/searchcore/src/vespa/searchcore/proton/matching/i_match_loop_communicator.h b/searchcore/src/vespa/searchcore/proton/matching/i_match_loop_communicator.h index c22232d47db..df24fa9e76b 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/i_match_loop_communicator.h +++ b/searchcore/src/vespa/searchcore/proton/matching/i_match_loop_communicator.h @@ -5,14 +5,16 @@ #include <vespa/searchlib/queryeval/scores.h> #include <utility> #include <cstddef> +#include <cstdint> #include <vector> namespace proton::matching { struct IMatchLoopCommunicator { - typedef search::feature_t feature_t; - typedef search::queryeval::Scores Range; - typedef std::pair<Range, Range> RangePair; + using Range = search::queryeval::Scores; + using RangePair = std::pair<Range, Range>; + using Hit = std::pair<uint32_t, search::feature_t>; + using Hits = std::vector<Hit>; struct Matches { size_t hits; size_t docs; @@ -24,7 +26,7 @@ struct IMatchLoopCommunicator { } }; virtual double estimate_match_frequency(const Matches &matches) = 0; - virtual size_t selectBest(const std::vector<feature_t> &sortedScores) = 0; + virtual Hits selectBest(Hits sortedHits) = 0; virtual RangePair rangeCover(const RangePair &ranges) = 0; virtual ~IMatchLoopCommunicator() {} }; diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.cpp index 95148ef56e8..54cffce7f40 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.cpp @@ -6,8 +6,11 @@ namespace proton:: matching { MatchLoopCommunicator::MatchLoopCommunicator(size_t threads, size_t topN) + : MatchLoopCommunicator(threads, topN, std::unique_ptr<IDiversifier>()) +{} +MatchLoopCommunicator::MatchLoopCommunicator(size_t threads, size_t topN, std::unique_ptr<IDiversifier> diversifier) : _estimate_match_frequency(threads), - _selectBest(threads, topN), + _selectBest(threads, topN, std::move(diversifier)), _rangeCover(threads) {} MatchLoopCommunicator::~MatchLoopCommunicator() = default; @@ -29,22 +32,47 @@ MatchLoopCommunicator::EstimateMatchFrequency::mingle() } } +MatchLoopCommunicator::SelectBest::SelectBest(size_t n, size_t topN_in, std::unique_ptr<IDiversifier> diversifier) + : vespalib::Rendezvous<Hits, Hits>(n), + topN(topN_in), + _indexes(n, 0), + _diversifier(std::move(diversifier)) +{} +MatchLoopCommunicator::SelectBest::~SelectBest() = default; + +template<typename Q, typename F> +void +MatchLoopCommunicator::SelectBest::mingle(Q & queue, F && accept) { + for (size_t picked = 0; picked < topN && !queue.empty(); ) { + uint32_t i = queue.front(); + const Hit & hit = in(i)[_indexes[i]]; + if (accept(hit.first)) { + out(i).push_back(hit); + ++picked; + } + if (in(i).size() > ++_indexes[i]) { + queue.adjust(); + } else { + queue.pop_front(); + } + } +} + void MatchLoopCommunicator::SelectBest::mingle() { vespalib::PriorityQueue<uint32_t, SelectCmp> queue(SelectCmp(*this)); for (size_t i = 0; i < size(); ++i) { if (!in(i).empty()) { + out(i).reserve(std::min(topN, in(i).size())); + _indexes[i] = 0; queue.push(i); } } - for (size_t picked = 0; picked < topN && !queue.empty(); ++picked) { - uint32_t i = queue.front(); - if (in(i).size() > ++out(i)) { - queue.adjust(); - } else { - queue.pop_front(); - } + if (_diversifier) { + mingle(queue, [diversifier=_diversifier.get()](uint32_t docId) { return diversifier->accepted(docId);}); + } else { + mingle(queue, [](uint32_t) { return true;}); } } diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.h b/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.h index c1eec37299f..e17efd66c78 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.h +++ b/searchcore/src/vespa/searchcore/proton/matching/match_loop_communicator.h @@ -3,6 +3,7 @@ #pragma once #include "i_match_loop_communicator.h" +#include <vespa/searchlib/queryeval/idiversifier.h> #include <vespa/vespalib/util/rendezvous.h> namespace proton::matching { @@ -10,44 +11,49 @@ namespace proton::matching { class MatchLoopCommunicator : public IMatchLoopCommunicator { private: + using IDiversifier = search::queryeval::IDiversifier; struct EstimateMatchFrequency : vespalib::Rendezvous<Matches, double> { - EstimateMatchFrequency(size_t n) - : vespalib::Rendezvous<Matches, double>(n) {} + EstimateMatchFrequency(size_t n) : vespalib::Rendezvous<Matches, double>(n) {} void mingle() override; }; - struct SelectBest : vespalib::Rendezvous<std::vector<feature_t>, size_t> { + struct SelectBest : vespalib::Rendezvous<Hits, Hits> { size_t topN; - SelectBest(size_t n, size_t topN_in) - : vespalib::Rendezvous<std::vector<feature_t>, size_t>(n), topN(topN_in) {} + std::vector<uint32_t> _indexes; + std::unique_ptr<IDiversifier> _diversifier; + SelectBest(size_t n, size_t topN_in, std::unique_ptr<IDiversifier>); + ~SelectBest() override; void mingle() override; - bool cmp(const uint32_t &a, const uint32_t &b) { - return (in(a)[out(a)] > in(b)[out(b)]); + template<typename Q, typename F> + void mingle(Q & queue, F && accept); + bool cmp(uint32_t a, uint32_t b) { + return (in(a)[_indexes[a]].second > in(b)[_indexes[b]].second); } }; struct SelectCmp { SelectBest &sb; SelectCmp(SelectBest &sb_in) : sb(sb_in) {} - bool operator()(const uint32_t &a, const uint32_t &b) const { + bool operator()(uint32_t a, uint32_t b) const { return (sb.cmp(a, b)); } }; struct RangeCover : vespalib::Rendezvous<RangePair, RangePair> { - RangeCover(size_t n) - : vespalib::Rendezvous<RangePair, RangePair>(n) {}void mingle() override; + RangeCover(size_t n) : vespalib::Rendezvous<RangePair, RangePair>(n) {} + void mingle() override; }; - EstimateMatchFrequency _estimate_match_frequency; - SelectBest _selectBest; - RangeCover _rangeCover; + EstimateMatchFrequency _estimate_match_frequency; + SelectBest _selectBest; + RangeCover _rangeCover; public: MatchLoopCommunicator(size_t threads, size_t topN); + MatchLoopCommunicator(size_t threads, size_t topN, std::unique_ptr<IDiversifier>); ~MatchLoopCommunicator(); double estimate_match_frequency(const Matches &matches) override { return _estimate_match_frequency.rendezvous(matches); } - size_t selectBest(const std::vector<feature_t> &sortedScores) override { - return _selectBest.rendezvous(sortedScores); + Hits selectBest(Hits sortedHits) override { + return _selectBest.rendezvous(sortedHits); } RangePair rangeCover(const RangePair &ranges) override { return _rangeCover.rendezvous(ranges); diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp index 0eb49aec754..920f84a21b0 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_master.cpp @@ -25,8 +25,8 @@ struct TimedMatchLoopCommunicator : IMatchLoopCommunicator { double estimate_match_frequency(const Matches &matches) override { return communicator.estimate_match_frequency(matches); } - size_t selectBest(const std::vector<feature_t> &sortedScores) override { - size_t result = communicator.selectBest(sortedScores); + Hits selectBest(Hits sortedHits) override { + Hits result = communicator.selectBest(std::move(sortedHits)); rerank_time.start(); return result; } diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp index 1efb74b96ba..9232a15043b 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_thread.cpp @@ -265,12 +265,15 @@ MatchThread::findMatches(MatchTools &tools) tools.setup_second_phase(); DocidRange docid_range = scheduler.total_span(thread_id); tools.search().initRange(docid_range.begin, docid_range.end); - auto sorted_scores = hits.getSortedHeapScores(); + auto sorted_hits = hits.getSortedHeapHits(); WaitTimer select_best_timer(wait_time_s); - size_t useHits = communicator.selectBest(sorted_scores); + auto kept_hits = communicator.selectBest(std::move(sorted_hits)); select_best_timer.done(); DocumentScorer scorer(tools.rank_program(), tools.search()); - uint32_t reRanked = hits.reRank(scorer, tools.getHardDoom().doom() ? 0 : useHits); + if (tools.getHardDoom().doom()) { + kept_hits.clear(); + } + uint32_t reRanked = hits.reRank(scorer, std::move(kept_hits)); thread_stats.docsReRanked(reRanked); } { // rank scaling |