summaryrefslogtreecommitdiffstats
path: root/streamingvisitors
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2024-02-12 11:10:08 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2024-02-13 11:58:17 +0000
commit66744e0891dcc1681fe35b010503850977396ca5 (patch)
tree27ec303749b0dc5670db9ee69d47466e2097af89 /streamingvisitors
parent45b56e769811a58807a6c59e534473cd5f4be180 (diff)
- Add all hits to the hit collector.
- Maintain a heap on the side, and keep heap property when producing results and features. - Drop teh pointer to the document once it drops off the heap.
Diffstat (limited to 'streamingvisitors')
-rw-r--r--streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp35
-rw-r--r--streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp121
-rw-r--r--streamingvisitors/src/vespa/searchvisitor/hitcollector.h52
-rw-r--r--streamingvisitors/src/vespa/searchvisitor/searchvisitor.h2
4 files changed, 115 insertions, 95 deletions
diff --git a/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp b/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp
index 5fc8fae181a..56c8b93052b 100644
--- a/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp
+++ b/streamingvisitors/src/tests/hitcollector/hitcollector_test.cpp
@@ -139,7 +139,9 @@ TEST_F(HitCollectorTest, simple)
TEST_F(HitCollectorTest, gaps_in_docid)
{
- HitCollector hc(5, false);
+ SearchResult sr;
+ sr.setWantedHitCount(5);
+ HitCollector hc(sr.getWantedHitCount(), false);
// add hits to hit collector
for (uint32_t i = 0; i < 5; ++i) {
@@ -147,7 +149,6 @@ TEST_F(HitCollectorTest, gaps_in_docid)
}
// merge from heap into search result
- SearchResult sr;
hc.fillSearchResult(sr);
ASSERT_TRUE(sr.getHitCount() == 5);
@@ -161,12 +162,13 @@ TEST_F(HitCollectorTest, gaps_in_docid)
TEST_F(HitCollectorTest, heap_property)
{
{
- HitCollector hc(3, false);
+ SearchResult sr;
+ sr.setWantedHitCount(3);
+ HitCollector hc(sr.getWantedHitCount(), false);
// add hits (low to high)
for (uint32_t i = 0; i < 6; ++i) {
addHit(hc, i, i + 10);
}
- SearchResult sr;
hc.fillSearchResult(sr);
ASSERT_TRUE(sr.getHitCount() == 3);
assertHit(13, 3, 0, sr);
@@ -174,12 +176,13 @@ TEST_F(HitCollectorTest, heap_property)
assertHit(15, 5, 2, sr);
}
{
- HitCollector hc(3, false);
+ SearchResult sr;
+ sr.setWantedHitCount(3);
+ HitCollector hc(sr.getWantedHitCount(), false);
// add hits (high to low)
for (uint32_t i = 0; i < 6; ++i) {
addHit(hc, i, 10 - i);
}
- SearchResult sr;
hc.fillSearchResult(sr);
ASSERT_TRUE(sr.getHitCount() == 3);
assertHit(10, 0, 0, sr);
@@ -187,12 +190,13 @@ TEST_F(HitCollectorTest, heap_property)
assertHit(8, 2, 2, sr);
}
{
- HitCollector hc(3, false);
+ SearchResult sr;
+ sr.setWantedHitCount(3);
+ HitCollector hc(sr.getWantedHitCount(), false);
// add hits (same rank score)
for (uint32_t i = 0; i < 6; ++i) {
addHit(hc, i, 10);
}
- SearchResult sr;
hc.fillSearchResult(sr);
ASSERT_TRUE(sr.getHitCount() == 3);
assertHit(10, 0, 0, sr);
@@ -211,12 +215,13 @@ TEST_F(HitCollectorTest, heap_property_with_sorting)
sortData.push_back('e');
sortData.push_back('f');
{
- HitCollector hc(3, true);
+ SearchResult sr;
+ sr.setWantedHitCount(3);
+ HitCollector hc(sr.getWantedHitCount(), true);
// add hits ('a' is sorted/ranked better than 'b')
for (uint32_t i = 0; i < 6; ++i) {
addHit(hc, i, i + 10, &sortData[i], 1);
}
- SearchResult sr;
hc.fillSearchResult(sr);
ASSERT_TRUE(sr.getHitCount() == 3);
assertHit(10, 0, 0, sr);
@@ -224,12 +229,13 @@ TEST_F(HitCollectorTest, heap_property_with_sorting)
assertHit(12, 2, 2, sr);
}
{
- HitCollector hc(3, true);
+ SearchResult sr;
+ sr.setWantedHitCount(3);
+ HitCollector hc(sr.getWantedHitCount(), true);
// add hits ('a' is sorted/ranked better than 'b')
for (uint32_t i = 0; i < 6; ++i) {
addHit(hc, i, i + 10, &sortData[5 - i], 1);
}
- SearchResult sr;
hc.fillSearchResult(sr);
ASSERT_TRUE(sr.getHitCount() == 3);
assertHit(13, 3, 0, sr);
@@ -237,12 +243,13 @@ TEST_F(HitCollectorTest, heap_property_with_sorting)
assertHit(15, 5, 2, sr);
}
{
- HitCollector hc(3, true);
+ SearchResult sr;
+ sr.setWantedHitCount(3);
+ HitCollector hc(sr.getWantedHitCount(), true);
// add hits (same sort blob)
for (uint32_t i = 0; i < 6; ++i) {
addHit(hc, i, 10, &sortData[0], 1);
}
- SearchResult sr;
hc.fillSearchResult(sr);
ASSERT_TRUE(sr.getHitCount() == 3);
assertHit(10, 0, 0, sr);
diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp
index e1ee72b0152..305e76477ca 100644
--- a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp
@@ -18,12 +18,12 @@ using FefUtils = search::fef::Utils;
namespace streaming {
HitCollector::Hit::Hit(const vsm::StorageDocument * doc, uint32_t docId, const MatchData & matchData,
- double score, const void * sortData, size_t sortDataLen) :
- _docid(docId),
- _score(score),
- _document(doc),
- _matchData(),
- _sortBlob(sortData, sortDataLen)
+ double score, const void * sortData, size_t sortDataLen)
+ : _docid(docId),
+ _score(score),
+ _document(doc),
+ _matchData(),
+ _sortBlob(sortData, sortDataLen)
{
_matchData.reserve(matchData.getNumTermFields());
for (search::fef::TermFieldHandle handle = 0; handle < matchData.getNumTermFields(); ++handle) {
@@ -35,12 +35,15 @@ HitCollector::Hit::~Hit() = default;
HitCollector::HitCollector(size_t wantedHits, bool use_sort_blob) :
_hits(),
- _use_sort_blob(use_sort_blob),
- _sortedByDocId(true)
+ _heap(),
+ _use_sort_blob(use_sort_blob)
{
- _hits.reserve(wantedHits);
+ _hits.reserve(16);
+ _heap.reserve(wantedHits);
}
+HitCollector::~HitCollector() = default;
+
const vsm::Document &
HitCollector::getDocSum(const search::DocumentIdT & docId) const
{
@@ -65,85 +68,87 @@ HitCollector::addHit(const vsm::StorageDocument * doc, uint32_t docId, const Mat
return addHit(Hit(doc, docId, data, score, sortData, sortDataLen));
}
-void
-HitCollector::sortByDocId()
-{
- if (!_sortedByDocId) {
- std::sort(_hits.begin(), _hits.end()); // sort on docId
- _sortedByDocId = true;
- }
-}
-
bool
-HitCollector::addHitToHeap(const Hit & hit) const
+HitCollector::addHitToHeap(uint32_t index) const
{
+ if (_heap.capacity() == 0) return false;
// return true if the given hit is better than the current worst one.
- return (_use_sort_blob)
- ? (hit.cmpSort(_hits[0]) < 0)
- : (hit.cmpRank(_hits[0]) < 0);
+ const Hit & hit = _hits[index];
+ return _use_sort_blob
+ ? (hit.cmpSort(_hits[_heap[0]]) < 0)
+ : (hit.cmpRank(_hits[_heap[0]]) < 0);
}
void
HitCollector::make_heap() {
if (_use_sort_blob) {
- std::make_heap(_hits.begin(), _hits.end(), Hit::SortComparator());
+ std::make_heap(_heap.begin(), _heap.end(), SortComparator(_hits));
} else {
- std::make_heap(_hits.begin(), _hits.end(), Hit::RankComparator());
+ std::make_heap(_heap.begin(), _heap.end(), RankComparator(_hits));
}
}
void
HitCollector::pop_heap() {
if (_use_sort_blob) {
- std::pop_heap(_hits.begin(), _hits.end(), Hit::SortComparator());
+ std::pop_heap(_heap.begin(), _heap.end(), SortComparator(_hits));
} else {
- std::pop_heap(_hits.begin(), _hits.end(), Hit::RankComparator());
+ std::pop_heap(_heap.begin(), _heap.end(), RankComparator(_hits));
}
}
void
HitCollector::push_heap() {
if (_use_sort_blob) {
- std::push_heap(_hits.begin(), _hits.end(), Hit::SortComparator());
+ std::push_heap(_heap.begin(), _heap.end(), SortComparator(_hits));
} else {
- std::push_heap(_hits.begin(), _hits.end(), Hit::RankComparator());
+ std::push_heap(_heap.begin(), _heap.end(), RankComparator(_hits));
}
}
bool
HitCollector::addHit(Hit && hit)
{
- bool amongTheBest(false);
- size_t avail = (_hits.capacity() - _hits.size());
+ size_t avail = (_heap.capacity() - _heap.size());
assert(_use_sort_blob != hit.getSortBlob().empty() );
+ assert(_hits.size() <= hit.getDocId());
+ _hits.emplace_back(std::move(hit));
+ uint32_t index = _hits.size() - 1;
if (avail > 1) {
// No heap yet.
- _hits.emplace_back(std::move(hit));
- amongTheBest = true;
- } else if (_hits.capacity() == 0) {
- // this happens when wantedHitCount = 0
- // in this case we shall not put anything on the heap (which is empty)
- } else if ( avail == 0 && addHitToHeap(hit)) { // already a heap
+ _heap.emplace_back(index);
+ } else if ((avail == 0) && addHitToHeap(index)) { // already a heap
pop_heap();
- _hits.back() = std::move(hit);
- amongTheBest = true;
+ uint32_t toDrop = _heap.back();
+ _heap.back() = index;
push_heap();
+ // Signal that it is not among the best and should hence never be accessed.
+ // Drop early to catch logic flaws.
+ _hits[toDrop].dropDocument();
} else if (avail == 1) { // make a heap of the hit vector
- _hits.emplace_back(std::move(hit));
- amongTheBest = true;
+ _heap.emplace_back(index);
make_heap();
- _sortedByDocId = false; // the hit vector is no longer sorted by docId
+ } else {
+ // The document might be invalid if it did not make the cut,
+ // so clear the reference here to catch logic flaws early.
+ _hits.back().dropDocument();
+ return false;
}
- return amongTheBest;
+ return true;
+}
+
+std::vector<uint32_t>
+HitCollector::bestLids() const {
+ std::vector<uint32_t> hitsOnHeap = _heap;
+ std::sort(hitsOnHeap.begin(), hitsOnHeap.end());
+ return hitsOnHeap;
}
void
-HitCollector::fillSearchResult(vdslib::SearchResult & searchResult, FeatureValues&& match_features)
+HitCollector::fillSearchResult(vdslib::SearchResult & searchResult, FeatureValues&& match_features) const
{
- sortByDocId();
- size_t count = std::min(_hits.size(), searchResult.getWantedHitCount());
- for (size_t i(0); i < count; i++) {
- const Hit & hit = _hits[i];
+ for (uint32_t lid : bestLids()) {
+ const Hit & hit = _hits[lid];
vespalib::string documentId(hit.getDocument().docDoc().getId().toString());
search::DocumentIdT docId = hit.getDocId();
SearchResult::RankType rank = hit.getRankScore();
@@ -160,7 +165,7 @@ HitCollector::fillSearchResult(vdslib::SearchResult & searchResult, FeatureValue
}
void
-HitCollector::fillSearchResult(vdslib::SearchResult & searchResult)
+HitCollector::fillSearchResult(vdslib::SearchResult & searchResult) const
{
fillSearchResult(searchResult, FeatureValues());
}
@@ -168,15 +173,15 @@ HitCollector::fillSearchResult(vdslib::SearchResult & searchResult)
FeatureSet::SP
HitCollector::getFeatureSet(IRankProgram &rankProgram,
const FeatureResolver &resolver,
- const search::StringStringMap &feature_rename_map)
+ const search::StringStringMap &feature_rename_map) const
{
- if (resolver.num_features() == 0 || _hits.empty()) {
+ if (resolver.num_features() == 0 || _heap.empty()) {
return std::make_shared<FeatureSet>();
}
- sortByDocId();
auto names = FefUtils::extract_feature_names(resolver, feature_rename_map);
- FeatureSet::SP retval = std::make_shared<FeatureSet>(names, _hits.size());
- for (const Hit & hit : _hits) {
+ FeatureSet::SP retval = std::make_shared<FeatureSet>(names, _heap.size());
+ for (uint32_t lid : bestLids()) {
+ const Hit & hit = _hits[lid];
uint32_t docId = hit.getDocId();
rankProgram.run(docId, hit.getMatchData());
auto * f = retval->getFeaturesByIndex(retval->addDocId(docId));
@@ -188,17 +193,17 @@ HitCollector::getFeatureSet(IRankProgram &rankProgram,
FeatureValues
HitCollector::get_match_features(IRankProgram& rank_program,
const FeatureResolver& resolver,
- const search::StringStringMap& feature_rename_map)
+ const search::StringStringMap& feature_rename_map) const
{
FeatureValues match_features;
- if (resolver.num_features() == 0 || _hits.empty()) {
+ if (resolver.num_features() == 0 || _heap.empty()) {
return match_features;
}
- sortByDocId();
match_features.names = FefUtils::extract_feature_names(resolver, feature_rename_map);
- match_features.values.resize(resolver.num_features() * _hits.size());
+ match_features.values.resize(resolver.num_features() * _heap.size());
auto f = match_features.values.data();
- for (const Hit & hit : _hits) {
+ for (uint32_t lid : bestLids()) {
+ const Hit & hit = _hits[lid];
auto docid = hit.getDocId();
rank_program.run(docid, hit.getMatchData());
FefUtils::extract_feature_values(resolver, docid, f);
diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.h b/streamingvisitors/src/vespa/searchvisitor/hitcollector.h
index 50a233bfcef..43ffbbe30c6 100644
--- a/streamingvisitors/src/vespa/searchvisitor/hitcollector.h
+++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.h
@@ -51,18 +51,8 @@ private:
int diff = _sortBlob.compare(b._sortBlob.c_str(), b._sortBlob.size());
return (diff == 0) ? cmpDocId(b) : diff;
}
- class RankComparator {
- public:
- bool operator() (const Hit & lhs, const Hit & rhs) const noexcept {
- return lhs.cmpRank(rhs) < 0;
- }
- };
- class SortComparator {
- public:
- bool operator() (const Hit & lhs, const Hit & rhs) const noexcept {
- return lhs.cmpSort(rhs) < 0;
- }
- };
+
+ void dropDocument() noexcept { _document = nullptr; }
private:
uint32_t _docid;
@@ -72,16 +62,35 @@ private:
vespalib::string _sortBlob;
};
using HitVector = std::vector<Hit>;
+ using Lids = std::vector<uint32_t>;
HitVector _hits;
+ Lids _heap;
bool _use_sort_blob;
- bool _sortedByDocId; // flag for whether the hit vector is sorted on docId
- void sortByDocId();
- bool addHitToHeap(const Hit & hit) const;
+ Lids bestLids() const;
+ bool addHitToHeap(uint32_t index) const;
bool addHit(Hit && hit);
void make_heap();
void pop_heap();
void push_heap();
+ class RankComparator {
+ public:
+ explicit RankComparator(const HitVector & hits) noexcept : _hits(hits) {}
+ bool operator() (uint32_t lhs, uint32_t rhs) const noexcept {
+ return _hits[lhs].cmpRank(_hits[rhs]) < 0;
+ }
+ private:
+ const HitVector & _hits;
+ };
+ class SortComparator {
+ public:
+ explicit SortComparator(const HitVector & hits) noexcept : _hits(hits) {}
+ bool operator() (uint32_t lhs, uint32_t rhs) const noexcept {
+ return _hits[lhs].cmpSort(_hits[rhs]) < 0;
+ }
+ private:
+ const HitVector & _hits;
+ };
public:
using UP = std::unique_ptr<HitCollector>;
@@ -92,8 +101,9 @@ public:
};
HitCollector(size_t wantedHits, bool use_sort_blob);
+ ~HitCollector() override;
- virtual const vsm::Document & getDocSum(const search::DocumentIdT & docId) const override;
+ const vsm::Document & getDocSum(const search::DocumentIdT & docId) const override;
/**
* Adds a hit to this hit collector.
@@ -124,14 +134,12 @@ public:
/**
* Fills the given search result with the m best hits from the hit heap.
- * Invoking this method will destroy the heap property of the hit heap.
**/
- void fillSearchResult(vdslib::SearchResult & searchResult, vespalib::FeatureValues&& match_features);
- void fillSearchResult(vdslib::SearchResult & searchResult);
+ void fillSearchResult(vdslib::SearchResult & searchResult, vespalib::FeatureValues&& match_features) const;
+ void fillSearchResult(vdslib::SearchResult & searchResult) const;
/**
* Extract features from the hits stored in the hit heap.
- * Invoking this method will destroy the heap property of the hit heap.
* Note that this method will calculate any additional features.
*
* @return features for all hits on the heap.
@@ -140,11 +148,11 @@ public:
**/
vespalib::FeatureSet::SP getFeatureSet(IRankProgram &rankProgram,
const FeatureResolver &resolver,
- const search::StringStringMap &feature_rename_map);
+ const search::StringStringMap &feature_rename_map) const;
vespalib::FeatureValues get_match_features(IRankProgram& rank_program,
const FeatureResolver& resolver,
- const search::StringStringMap& feature_rename_map);
+ const search::StringStringMap& feature_rename_map) const;
};
} // namespace streaming
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
index dfd48736e89..bb16655193f 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h
@@ -499,7 +499,7 @@ class SearchVisitorFactory : public storage::VisitorFactory {
std::shared_ptr<storage::VisitorEnvironment> makeVisitorEnvironment(storage::StorageComponent&) override;
storage::Visitor* makeVisitor(storage::StorageComponent&, storage::VisitorEnvironment&env,
- const vdslib::Parameters& params) override;
+ const vdslib::Parameters& params) override;
public:
explicit SearchVisitorFactory(const config::ConfigUri & configUri, FNET_Transport* transport, const vespalib::string& file_distributor_connection_spec);
~SearchVisitorFactory() override;