summaryrefslogtreecommitdiffstats
path: root/searchlib/src
diff options
context:
space:
mode:
authorHåvard Pettersen <havardpe@oath.com>2018-08-10 14:32:56 +0000
committerHåvard Pettersen <havardpe@oath.com>2018-08-14 09:47:43 +0000
commitb5819c74e563b46a39e2949348b7de01d16b96ae (patch)
tree6708b4a4e39c0073e7c79146a5885e3e7c4b3271 /searchlib/src
parent4f9bd36ae68cafc82e010be1c5e3e8883b721bb8 (diff)
expose 2nd phase candidates as a referencing sorted hit sequence
also stop keeping track of the re-rank count in the hit collector itself
Diffstat (limited to 'searchlib/src')
-rw-r--r--searchlib/src/tests/attribute/benchmark/attributesearcher.h2
-rw-r--r--searchlib/src/tests/attribute/searchcontext/searchcontext.cpp2
-rw-r--r--searchlib/src/tests/hitcollector/hitcollector_test.cpp57
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp19
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/hitcollector.h21
5 files changed, 47 insertions, 54 deletions
diff --git a/searchlib/src/tests/attribute/benchmark/attributesearcher.h b/searchlib/src/tests/attribute/benchmark/attributesearcher.h
index 37f33803d27..adc63266489 100644
--- a/searchlib/src/tests/attribute/benchmark/attributesearcher.h
+++ b/searchlib/src/tests/attribute/benchmark/attributesearcher.h
@@ -15,7 +15,7 @@ namespace search {
std::unique_ptr<ResultSet>
performSearch(queryeval::SearchIterator & sb, uint32_t numDocs)
{
- queryeval::HitCollector hc(numDocs, numDocs, 0);
+ queryeval::HitCollector hc(numDocs, numDocs);
// assume strict toplevel search object located at start
for (sb.seek(1); ! sb.isAtEnd(); sb.seek(sb.getDocId() + 1)) {
hc.addHit(sb.getDocId(), 0.0);
diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext.cpp
index 85492df7016..0f6f7bdade7 100644
--- a/searchlib/src/tests/attribute/searchcontext/searchcontext.cpp
+++ b/searchlib/src/tests/attribute/searchcontext/searchcontext.cpp
@@ -429,7 +429,7 @@ SearchContextTest::getSearch(const V & vec, const T & term, QueryTermSimple::Sea
ResultSetPtr
SearchContextTest::performSearch(SearchIterator & sb, uint32_t numDocs)
{
- HitCollector hc(numDocs, numDocs, 0);
+ HitCollector hc(numDocs, numDocs);
sb.initRange(1, numDocs);
// assume strict toplevel search object located at start
for (sb.seek(1u); ! sb.isAtEnd(); sb.seek(sb.getDocId() + 1)) {
diff --git a/searchlib/src/tests/hitcollector/hitcollector_test.cpp b/searchlib/src/tests/hitcollector/hitcollector_test.cpp
index f9f977f4093..31a24d2a8f1 100644
--- a/searchlib/src/tests/hitcollector/hitcollector_test.cpp
+++ b/searchlib/src/tests/hitcollector/hitcollector_test.cpp
@@ -37,6 +37,15 @@ struct PredefinedScorer : public HitCollector::DocumentScorer
}
};
+std::vector<HitCollector::Hit> extract(SortedHitSequence seq) {
+ std::vector<HitCollector::Hit> ret;
+ while (seq.valid()) {
+ ret.push_back(seq.get());
+ seq.next();
+ }
+ return ret;
+}
+
void checkResult(const ResultSet & rs, const std::vector<RankedHit> & exp)
{
if ( ! exp.empty()) {
@@ -68,12 +77,12 @@ void checkResult(ResultSet & rs, BitVector * exp)
}
}
-void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxHeapSize)
+void testAddHit(uint32_t numDocs, uint32_t maxHitsSize)
{
LOG(info, "testAddHit: no hits");
{ // no hits
- HitCollector hc(numDocs, maxHitsSize, maxHeapSize);
+ HitCollector hc(numDocs, maxHitsSize);
std::vector<RankedHit> expRh;
std::unique_ptr<ResultSet> rs = hc.getResultSet();
@@ -83,7 +92,7 @@ void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxHeapSize)
LOG(info, "testAddHit: only ranked hits");
{ // only ranked hits
- HitCollector hc(numDocs, maxHitsSize, maxHeapSize);
+ HitCollector hc(numDocs, maxHitsSize);
std::vector<RankedHit> expRh;
for (uint32_t i = 0; i < maxHitsSize; ++i) {
@@ -102,7 +111,7 @@ void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxHeapSize)
LOG(info, "testAddHit: both ranked hits and bit vector hits");
{ // both ranked hits and bit vector hits
- HitCollector hc(numDocs, maxHitsSize, maxHeapSize);
+ HitCollector hc(numDocs, maxHitsSize);
std::vector<RankedHit> expRh;
BitVector::UP expBv(BitVector::create(numDocs));
@@ -125,10 +134,8 @@ void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxHeapSize)
}
TEST("testAddHit") {
- TEST_DO(testAddHit(30, 10, 5));
- TEST_DO(testAddHit(30, 10, 0));
- TEST_DO(testAddHit(400, 10, 5)); // 400/32 = 12 which is bigger than 10.
- TEST_DO(testAddHit(400, 10, 0));
+ TEST_DO(testAddHit(30, 10));
+ TEST_DO(testAddHit(400, 10)); // 400/32 = 12 which is bigger than 10.
}
struct Fixture {
@@ -137,7 +144,7 @@ struct Fixture {
BasicScorer scorer;
Fixture()
- : hc(20, 10, 5), expBv(BitVector::create(20)), scorer(200)
+ : hc(20, 10), expBv(BitVector::create(20)), scorer(200)
{
}
virtual ~Fixture() {}
@@ -148,16 +155,10 @@ struct Fixture {
expBv->setBit(i);
}
}
- size_t reRank() {
- return hc.reRank(scorer, hc.getSortedHeapHits());
- }
size_t reRank(size_t count) {
- auto hits = hc.getSortedHeapHits();
- if (hits.size() > count) {
- hits.resize(count);
- }
- return hc.reRank(scorer, std::move(hits));
+ return hc.reRank(scorer, extract(hc.getSortedHitSequence(count)));
}
+ size_t reRank() { return reRank(5); }
};
struct AscendingScoreFixture : Fixture {
@@ -238,7 +239,7 @@ TEST_F("testReRank - partial", AscendingScoreFixture)
TEST_F("require that hits for 2nd phase candidates can be retrieved", DescendingScoreFixture)
{
f.addHits();
- std::vector<HitCollector::Hit> scores = f.hc.getSortedHeapHits();
+ std::vector<HitCollector::Hit> scores = extract(f.hc.getSortedHitSequence(5));
ASSERT_EQUAL(5u, scores.size());
EXPECT_EQUAL(100, scores[0].second);
EXPECT_EQUAL(99, scores[1].second);
@@ -249,7 +250,7 @@ TEST_F("require that hits for 2nd phase candidates can be retrieved", Descending
TEST("require that score ranges can be read and set.") {
std::pair<Scores, Scores> ranges = std::make_pair(Scores(1.0, 2.0), Scores(3.0, 4.0));
- HitCollector hc(20, 10, 5);
+ HitCollector hc(20, 10);
hc.setRanges(ranges);
EXPECT_EQUAL(ranges.first.low, hc.getRanges().first.low);
EXPECT_EQUAL(ranges.first.high, hc.getRanges().first.high);
@@ -263,7 +264,7 @@ TEST("testNoHitsToReRank") {
LOG(info, "testNoMDHeap: test it");
{
- HitCollector hc(numDocs, maxHitsSize, 0);
+ HitCollector hc(numDocs, maxHitsSize);
std::vector<RankedHit> expRh;
for (uint32_t i = 0; i < maxHitsSize; ++i) {
@@ -285,7 +286,7 @@ void testScaling(const std::vector<feature_t> &initScores,
ScoreMap finalScores,
const std::vector<RankedHit> &expected)
{
- HitCollector hc(5, 5, 2);
+ HitCollector hc(5, 5);
// first phase ranking
for (uint32_t i = 0; i < 5; ++i) {
@@ -294,7 +295,7 @@ void testScaling(const std::vector<feature_t> &initScores,
PredefinedScorer scorer(std::move(finalScores));
// perform second phase ranking
- EXPECT_EQUAL(2u, hc.reRank(scorer, hc.getSortedHeapHits()));
+ EXPECT_EQUAL(2u, hc.reRank(scorer, extract(hc.getSortedHitSequence(2))));
// check results
std::unique_ptr<ResultSet> rs = hc.getResultSet();
@@ -394,7 +395,7 @@ TEST("testOnlyBitVector") {
uint32_t numDocs = 20;
LOG(info, "testOnlyBitVector: test it");
{
- HitCollector hc(numDocs, 0, 0);
+ HitCollector hc(numDocs, 0);
BitVector::UP expBv(BitVector::create(numDocs));
for (uint32_t i = 0; i < numDocs; i += 2) {
@@ -416,7 +417,7 @@ struct MergeResultSetFixture {
const uint32_t maxHeapSize;
HitCollector hc;
MergeResultSetFixture()
- : numDocs(100), maxHitsSize(80), maxHeapSize(30), hc(numDocs * 32, maxHitsSize, maxHeapSize)
+ : numDocs(100), maxHitsSize(80), maxHeapSize(30), hc(numDocs * 32, maxHitsSize)
{}
};
@@ -461,13 +462,13 @@ TEST_F("require that result set is merged correctly with second phase ranking (d
f.hc.addHit(i, i + 1000);
addExpectedHitForMergeTest(f, expRh, i);
}
- EXPECT_EQUAL(f.maxHeapSize, f.hc.reRank(scorer, f.hc.getSortedHeapHits()));
+ EXPECT_EQUAL(f.maxHeapSize, f.hc.reRank(scorer, extract(f.hc.getSortedHitSequence(f.maxHeapSize))));
std::unique_ptr<ResultSet> rs = f.hc.getResultSet();
TEST_DO(checkResult(*rs, expRh));
}
TEST("require that hits can be added out of order") {
- HitCollector hc(1000, 100, 10);
+ HitCollector hc(1000, 100);
std::vector<RankedHit> expRh;
// produce expected result in normal order
for (uint32_t i = 0; i < 5; ++i) {
@@ -485,7 +486,7 @@ TEST("require that hits can be added out of order") {
}
TEST("require that hits can be added out of order when passing array limit") {
- HitCollector hc(10000, 100, 10);
+ HitCollector hc(10000, 100);
std::vector<RankedHit> expRh;
// produce expected result in normal order
const size_t numHits = 150;
@@ -504,7 +505,7 @@ TEST("require that hits can be added out of order when passing array limit") {
}
TEST("require that hits can be added out of order only after passing array limit") {
- HitCollector hc(10000, 100, 10);
+ HitCollector hc(10000, 100);
std::vector<RankedHit> expRh;
// produce expected result in normal order
const size_t numHits = 150;
diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp
index 2a38d8bf27e..5c27b9d2067 100644
--- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp
@@ -34,11 +34,9 @@ HitCollector::sortHitsByDocId()
}
HitCollector::HitCollector(uint32_t numDocs,
- uint32_t maxHitsSize,
- uint32_t maxReRankHitsSize)
+ uint32_t maxHitsSize)
: _numDocs(numDocs),
_maxHitsSize(maxHitsSize),
- _maxReRankHitsSize(maxReRankHitsSize),
_maxDocIdVectorSize((numDocs + 31) / 32),
_hits(),
_hitsSortOrder(SortOrder::DOC_ID),
@@ -168,17 +166,12 @@ HitCollector::DocIdCollector<CollectRankedHit>::collectAndChangeCollector(uint32
hc._collector = std::make_unique<BitVectorCollector<CollectRankedHit>>(hc); // note - self-destruct.
}
-std::vector<HitCollector::Hit>
-HitCollector::getSortedHeapHits()
+SortedHitSequence
+HitCollector::getSortedHitSequence(size_t max_hits)
{
- std::vector<Hit> scores;
- size_t scoresToReturn = std::min(_hits.size(), static_cast<size_t>(_maxReRankHitsSize));
- scores.reserve(scoresToReturn);
- sortHitsByScore(scoresToReturn);
- for (size_t i = 0; i < scoresToReturn; ++i) {
- scores.push_back(_hits[_scoreOrder[i]]);
- }
- return scores;
+ size_t num_hits = std::min(_hits.size(), max_hits);
+ sortHitsByScore(num_hits);
+ return SortedHitSequence(&_hits[0], &_scoreOrder[0], num_hits);
}
size_t
diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h
index ef4bf76b66b..3577456e781 100644
--- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h
+++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h
@@ -9,6 +9,7 @@
#include <vector>
#include <vespa/vespalib/util/sort.h>
#include <vespa/fastos/dynamiclibrary.h>
+#include "sorted_hit_sequence.h"
namespace search::queryeval {
@@ -32,7 +33,6 @@ private:
const uint32_t _numDocs;
const uint32_t _maxHitsSize;
- const uint32_t _maxReRankHitsSize;
const uint32_t _maxDocIdVectorSize;
std::vector<Hit> _hits; // used as a heap when _hits.size == _maxHitsSize
@@ -144,14 +144,12 @@ public:
/**
* Creates a hit collector used to store hits for doc ids in the
* range [0, numDocs>. Doc id and rank score are stored for the n
- * (=maxHitsSize) best hits. The best m (=maxReRankHitsSize) hits are
- * candidates for re-ranking. Note that n >= m.
+ * (=maxHitsSize) best hits.
*
* @param numDocs
* @param maxHitsSize
- * @param maxReRankHitsSize
**/
- HitCollector(uint32_t numDocs, uint32_t maxHitsSize, uint32_t maxReRankHitsSize);
+ HitCollector(uint32_t numDocs, uint32_t maxHitsSize);
~HitCollector();
/**
@@ -167,15 +165,16 @@ public:
}
/**
- * Returns a sorted vector of hits for the hits that are stored
- * in the heap. These are the candidates for re-ranking.
+ * Returns a sorted sequence of hits that reference internal data.
+ *
+ * @param max_hits maximum number of hits returned in the sequence.
*/
- std::vector<Hit> getSortedHeapHits();
+ SortedHitSequence getSortedHitSequence(size_t max_hits);
/**
- * Re-ranks the m (=maxHeapSize) best hits by invoking the score()
- * method on the given document scorer. The best m hits are sorted on doc id
- * so that score() is called in doc id order.
+ * Re-ranks the given hits by invoking the score() method on the
+ * given document scorer. The hits are sorted on doc id so that
+ * score() is called in doc id order.
**/
size_t reRank(DocumentScorer &scorer, std::vector<Hit> hits);