diff options
author | Håvard Pettersen <havardpe@oath.com> | 2018-08-10 14:32:56 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2018-08-14 09:47:43 +0000 |
commit | b5819c74e563b46a39e2949348b7de01d16b96ae (patch) | |
tree | 6708b4a4e39c0073e7c79146a5885e3e7c4b3271 /searchlib/src | |
parent | 4f9bd36ae68cafc82e010be1c5e3e8883b721bb8 (diff) |
expose 2nd phase candidates as a referencing sorted hit sequence
also stop keeping track of the re-rank count in the hit collector itself
Diffstat (limited to 'searchlib/src')
5 files changed, 47 insertions, 54 deletions
diff --git a/searchlib/src/tests/attribute/benchmark/attributesearcher.h b/searchlib/src/tests/attribute/benchmark/attributesearcher.h index 37f33803d27..adc63266489 100644 --- a/searchlib/src/tests/attribute/benchmark/attributesearcher.h +++ b/searchlib/src/tests/attribute/benchmark/attributesearcher.h @@ -15,7 +15,7 @@ namespace search { std::unique_ptr<ResultSet> performSearch(queryeval::SearchIterator & sb, uint32_t numDocs) { - queryeval::HitCollector hc(numDocs, numDocs, 0); + queryeval::HitCollector hc(numDocs, numDocs); // assume strict toplevel search object located at start for (sb.seek(1); ! sb.isAtEnd(); sb.seek(sb.getDocId() + 1)) { hc.addHit(sb.getDocId(), 0.0); diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext.cpp index 85492df7016..0f6f7bdade7 100644 --- a/searchlib/src/tests/attribute/searchcontext/searchcontext.cpp +++ b/searchlib/src/tests/attribute/searchcontext/searchcontext.cpp @@ -429,7 +429,7 @@ SearchContextTest::getSearch(const V & vec, const T & term, QueryTermSimple::Sea ResultSetPtr SearchContextTest::performSearch(SearchIterator & sb, uint32_t numDocs) { - HitCollector hc(numDocs, numDocs, 0); + HitCollector hc(numDocs, numDocs); sb.initRange(1, numDocs); // assume strict toplevel search object located at start for (sb.seek(1u); ! sb.isAtEnd(); sb.seek(sb.getDocId() + 1)) { diff --git a/searchlib/src/tests/hitcollector/hitcollector_test.cpp b/searchlib/src/tests/hitcollector/hitcollector_test.cpp index f9f977f4093..31a24d2a8f1 100644 --- a/searchlib/src/tests/hitcollector/hitcollector_test.cpp +++ b/searchlib/src/tests/hitcollector/hitcollector_test.cpp @@ -37,6 +37,15 @@ struct PredefinedScorer : public HitCollector::DocumentScorer } }; +std::vector<HitCollector::Hit> extract(SortedHitSequence seq) { + std::vector<HitCollector::Hit> ret; + while (seq.valid()) { + ret.push_back(seq.get()); + seq.next(); + } + return ret; +} + void checkResult(const ResultSet & rs, const std::vector<RankedHit> & exp) { if ( ! exp.empty()) { @@ -68,12 +77,12 @@ void checkResult(ResultSet & rs, BitVector * exp) } } -void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxHeapSize) +void testAddHit(uint32_t numDocs, uint32_t maxHitsSize) { LOG(info, "testAddHit: no hits"); { // no hits - HitCollector hc(numDocs, maxHitsSize, maxHeapSize); + HitCollector hc(numDocs, maxHitsSize); std::vector<RankedHit> expRh; std::unique_ptr<ResultSet> rs = hc.getResultSet(); @@ -83,7 +92,7 @@ void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxHeapSize) LOG(info, "testAddHit: only ranked hits"); { // only ranked hits - HitCollector hc(numDocs, maxHitsSize, maxHeapSize); + HitCollector hc(numDocs, maxHitsSize); std::vector<RankedHit> expRh; for (uint32_t i = 0; i < maxHitsSize; ++i) { @@ -102,7 +111,7 @@ void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxHeapSize) LOG(info, "testAddHit: both ranked hits and bit vector hits"); { // both ranked hits and bit vector hits - HitCollector hc(numDocs, maxHitsSize, maxHeapSize); + HitCollector hc(numDocs, maxHitsSize); std::vector<RankedHit> expRh; BitVector::UP expBv(BitVector::create(numDocs)); @@ -125,10 +134,8 @@ void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxHeapSize) } TEST("testAddHit") { - TEST_DO(testAddHit(30, 10, 5)); - TEST_DO(testAddHit(30, 10, 0)); - TEST_DO(testAddHit(400, 10, 5)); // 400/32 = 12 which is bigger than 10. - TEST_DO(testAddHit(400, 10, 0)); + TEST_DO(testAddHit(30, 10)); + TEST_DO(testAddHit(400, 10)); // 400/32 = 12 which is bigger than 10. } struct Fixture { @@ -137,7 +144,7 @@ struct Fixture { BasicScorer scorer; Fixture() - : hc(20, 10, 5), expBv(BitVector::create(20)), scorer(200) + : hc(20, 10), expBv(BitVector::create(20)), scorer(200) { } virtual ~Fixture() {} @@ -148,16 +155,10 @@ struct Fixture { expBv->setBit(i); } } - size_t reRank() { - return hc.reRank(scorer, hc.getSortedHeapHits()); - } size_t reRank(size_t count) { - auto hits = hc.getSortedHeapHits(); - if (hits.size() > count) { - hits.resize(count); - } - return hc.reRank(scorer, std::move(hits)); + return hc.reRank(scorer, extract(hc.getSortedHitSequence(count))); } + size_t reRank() { return reRank(5); } }; struct AscendingScoreFixture : Fixture { @@ -238,7 +239,7 @@ TEST_F("testReRank - partial", AscendingScoreFixture) TEST_F("require that hits for 2nd phase candidates can be retrieved", DescendingScoreFixture) { f.addHits(); - std::vector<HitCollector::Hit> scores = f.hc.getSortedHeapHits(); + std::vector<HitCollector::Hit> scores = extract(f.hc.getSortedHitSequence(5)); ASSERT_EQUAL(5u, scores.size()); EXPECT_EQUAL(100, scores[0].second); EXPECT_EQUAL(99, scores[1].second); @@ -249,7 +250,7 @@ TEST_F("require that hits for 2nd phase candidates can be retrieved", Descending TEST("require that score ranges can be read and set.") { std::pair<Scores, Scores> ranges = std::make_pair(Scores(1.0, 2.0), Scores(3.0, 4.0)); - HitCollector hc(20, 10, 5); + HitCollector hc(20, 10); hc.setRanges(ranges); EXPECT_EQUAL(ranges.first.low, hc.getRanges().first.low); EXPECT_EQUAL(ranges.first.high, hc.getRanges().first.high); @@ -263,7 +264,7 @@ TEST("testNoHitsToReRank") { LOG(info, "testNoMDHeap: test it"); { - HitCollector hc(numDocs, maxHitsSize, 0); + HitCollector hc(numDocs, maxHitsSize); std::vector<RankedHit> expRh; for (uint32_t i = 0; i < maxHitsSize; ++i) { @@ -285,7 +286,7 @@ void testScaling(const std::vector<feature_t> &initScores, ScoreMap finalScores, const std::vector<RankedHit> &expected) { - HitCollector hc(5, 5, 2); + HitCollector hc(5, 5); // first phase ranking for (uint32_t i = 0; i < 5; ++i) { @@ -294,7 +295,7 @@ void testScaling(const std::vector<feature_t> &initScores, PredefinedScorer scorer(std::move(finalScores)); // perform second phase ranking - EXPECT_EQUAL(2u, hc.reRank(scorer, hc.getSortedHeapHits())); + EXPECT_EQUAL(2u, hc.reRank(scorer, extract(hc.getSortedHitSequence(2)))); // check results std::unique_ptr<ResultSet> rs = hc.getResultSet(); @@ -394,7 +395,7 @@ TEST("testOnlyBitVector") { uint32_t numDocs = 20; LOG(info, "testOnlyBitVector: test it"); { - HitCollector hc(numDocs, 0, 0); + HitCollector hc(numDocs, 0); BitVector::UP expBv(BitVector::create(numDocs)); for (uint32_t i = 0; i < numDocs; i += 2) { @@ -416,7 +417,7 @@ struct MergeResultSetFixture { const uint32_t maxHeapSize; HitCollector hc; MergeResultSetFixture() - : numDocs(100), maxHitsSize(80), maxHeapSize(30), hc(numDocs * 32, maxHitsSize, maxHeapSize) + : numDocs(100), maxHitsSize(80), maxHeapSize(30), hc(numDocs * 32, maxHitsSize) {} }; @@ -461,13 +462,13 @@ TEST_F("require that result set is merged correctly with second phase ranking (d f.hc.addHit(i, i + 1000); addExpectedHitForMergeTest(f, expRh, i); } - EXPECT_EQUAL(f.maxHeapSize, f.hc.reRank(scorer, f.hc.getSortedHeapHits())); + EXPECT_EQUAL(f.maxHeapSize, f.hc.reRank(scorer, extract(f.hc.getSortedHitSequence(f.maxHeapSize)))); std::unique_ptr<ResultSet> rs = f.hc.getResultSet(); TEST_DO(checkResult(*rs, expRh)); } TEST("require that hits can be added out of order") { - HitCollector hc(1000, 100, 10); + HitCollector hc(1000, 100); std::vector<RankedHit> expRh; // produce expected result in normal order for (uint32_t i = 0; i < 5; ++i) { @@ -485,7 +486,7 @@ TEST("require that hits can be added out of order") { } TEST("require that hits can be added out of order when passing array limit") { - HitCollector hc(10000, 100, 10); + HitCollector hc(10000, 100); std::vector<RankedHit> expRh; // produce expected result in normal order const size_t numHits = 150; @@ -504,7 +505,7 @@ TEST("require that hits can be added out of order when passing array limit") { } TEST("require that hits can be added out of order only after passing array limit") { - HitCollector hc(10000, 100, 10); + HitCollector hc(10000, 100); std::vector<RankedHit> expRh; // produce expected result in normal order const size_t numHits = 150; diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp index 2a38d8bf27e..5c27b9d2067 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp @@ -34,11 +34,9 @@ HitCollector::sortHitsByDocId() } HitCollector::HitCollector(uint32_t numDocs, - uint32_t maxHitsSize, - uint32_t maxReRankHitsSize) + uint32_t maxHitsSize) : _numDocs(numDocs), _maxHitsSize(maxHitsSize), - _maxReRankHitsSize(maxReRankHitsSize), _maxDocIdVectorSize((numDocs + 31) / 32), _hits(), _hitsSortOrder(SortOrder::DOC_ID), @@ -168,17 +166,12 @@ HitCollector::DocIdCollector<CollectRankedHit>::collectAndChangeCollector(uint32 hc._collector = std::make_unique<BitVectorCollector<CollectRankedHit>>(hc); // note - self-destruct. } -std::vector<HitCollector::Hit> -HitCollector::getSortedHeapHits() +SortedHitSequence +HitCollector::getSortedHitSequence(size_t max_hits) { - std::vector<Hit> scores; - size_t scoresToReturn = std::min(_hits.size(), static_cast<size_t>(_maxReRankHitsSize)); - scores.reserve(scoresToReturn); - sortHitsByScore(scoresToReturn); - for (size_t i = 0; i < scoresToReturn; ++i) { - scores.push_back(_hits[_scoreOrder[i]]); - } - return scores; + size_t num_hits = std::min(_hits.size(), max_hits); + sortHitsByScore(num_hits); + return SortedHitSequence(&_hits[0], &_scoreOrder[0], num_hits); } size_t diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h index ef4bf76b66b..3577456e781 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h @@ -9,6 +9,7 @@ #include <vector> #include <vespa/vespalib/util/sort.h> #include <vespa/fastos/dynamiclibrary.h> +#include "sorted_hit_sequence.h" namespace search::queryeval { @@ -32,7 +33,6 @@ private: const uint32_t _numDocs; const uint32_t _maxHitsSize; - const uint32_t _maxReRankHitsSize; const uint32_t _maxDocIdVectorSize; std::vector<Hit> _hits; // used as a heap when _hits.size == _maxHitsSize @@ -144,14 +144,12 @@ public: /** * Creates a hit collector used to store hits for doc ids in the * range [0, numDocs>. Doc id and rank score are stored for the n - * (=maxHitsSize) best hits. The best m (=maxReRankHitsSize) hits are - * candidates for re-ranking. Note that n >= m. + * (=maxHitsSize) best hits. * * @param numDocs * @param maxHitsSize - * @param maxReRankHitsSize **/ - HitCollector(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxReRankHitsSize); + HitCollector(uint32_t numDocs, uint32_t maxHitsSize); ~HitCollector(); /** @@ -167,15 +165,16 @@ public: } /** - * Returns a sorted vector of hits for the hits that are stored - * in the heap. These are the candidates for re-ranking. + * Returns a sorted sequence of hits that reference internal data. + * + * @param max_hits maximum number of hits returned in the sequence. */ - std::vector<Hit> getSortedHeapHits(); + SortedHitSequence getSortedHitSequence(size_t max_hits); /** - * Re-ranks the m (=maxHeapSize) best hits by invoking the score() - * method on the given document scorer. The best m hits are sorted on doc id - * so that score() is called in doc id order. + * Re-ranks the given hits by invoking the score() method on the + * given document scorer. The hits are sorted on doc id so that + * score() is called in doc id order. **/ size_t reRank(DocumentScorer &scorer, std::vector<Hit> hits); |