summaryrefslogtreecommitdiffstats
path: root/searchlib/src
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2023-09-28 16:06:07 +0200
committerGitHub <noreply@github.com>2023-09-28 16:06:07 +0200
commit168f8847334518c8efe73478805a580b5f5e23a6 (patch)
treecf736b7198d211dc3a00fc2496de5a7243778fad /searchlib/src
parentbaec2cce6cd62f6bd42f09c2ebe4a9dcdd921cb3 (diff)
parent94ab70c2213512dda2fe4b1824dd238ad33530bb (diff)
Merge pull request #28687 from vespa-engine/toregge/avoid-unneeded-counting-of-hits
Avoid counting hits in range multiple times.
Diffstat (limited to 'searchlib/src')
-rw-r--r--searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp20
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h65
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp110
4 files changed, 153 insertions, 45 deletions
diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
index 40d4b20aaf2..ac1042dda6c 100644
--- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
+++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
@@ -4,6 +4,7 @@
#include <vespa/searchlib/attribute/attributefactory.h>
#include <vespa/searchlib/attribute/attributeiterators.h>
#include <vespa/searchlib/attribute/flagattribute.h>
+#include <vespa/searchlib/attribute/postinglistsearchcontext.h>
#include <vespa/searchlib/attribute/searchcontextelementiterator.h>
#include <vespa/searchlib/attribute/singleboolattribute.h>
#include <vespa/searchlib/attribute/stringbase.h>
@@ -1424,6 +1425,25 @@ SearchContextTest::testPrefixSearch(const vespalib::string& name, const Config&
}
}
}
+
+ // Long range of prefixes with unique strings that causes
+ // PostingListFoldedSearchContextT<DataT>::countHits() to populate
+ // partial vector of posting indexes, with scan resumed by
+ // fillArray or fillBitVector.
+ auto& vec = dynamic_cast<StringAttribute &>(*attr.get());
+ uint32_t old_size = attr->getNumDocs();
+ constexpr uint32_t longrange_values = search::attribute::PostingListFoldedSearchContextT<int32_t>::MAX_POSTING_INDEXES_SIZE + 100;
+ attr->addDocs(longrange_values);
+ DocSet exp_longrange;
+ for (uint32_t i = 0; i < longrange_values; ++i) {
+ vespalib::asciistream ss;
+ ss << "lpref" << i;
+ vespalib::string sss(ss.str());
+ exp_longrange.put(old_size + i);
+ vec.update(old_size + i, vespalib::string(ss.str()).c_str());
+ }
+ attr->commit();
+ performSearch(*attr, "lpref", exp_longrange, TermType::PREFIXTERM);
}
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp
index 12c887eb407..2b1d4fa3286 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp
@@ -27,7 +27,8 @@ PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_
_FSTC(0.0),
_PLSTC(0.0),
_hasWeight(hasWeight),
- _useBitVector(useBitVector)
+ _useBitVector(useBitVector),
+ _counted_hits()
{
}
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
index 1baf75d1bdf..9a4dbdd9c61 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
@@ -13,6 +13,7 @@
#include <vespa/vespalib/util/regexp.h>
#include <vespa/vespalib/fuzzy/fuzzy_matcher.h>
#include <regex>
+#include <optional>
namespace search::attribute {
@@ -30,28 +31,30 @@ protected:
using Dictionary = EnumPostingTree;
using DictionaryConstIterator = Dictionary::ConstIterator;
using FrozenDictionary = Dictionary::FrozenView;
+ using EntryRef = vespalib::datastore::EntryRef;
using EnumIndex = IEnumStore::Index;
static constexpr long MIN_UNIQUE_VALUES_BEFORE_APPROXIMATION = 100;
static constexpr long MIN_UNIQUE_VALUES_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION = 20;
static constexpr long MIN_APPROXHITS_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION = 10;
- const IEnumStoreDictionary & _dictionary;
- const ISearchContext &_baseSearchCtx;
- const BitVector *_bv; // bitvector if _useBitVector has been set
- const FrozenDictionary _frozenDictionary;
- DictionaryConstIterator _lowerDictItr;
- DictionaryConstIterator _upperDictItr;
- uint64_t _numValues; // attr.getStatus().getNumValues();
- uint32_t _uniqueValues;
- uint32_t _docIdLimit;
- uint32_t _dictSize;
- vespalib::datastore::EntryRef _pidx;
- vespalib::datastore::EntryRef _frozenRoot; // Posting list in tree form
- float _FSTC; // Filtering Search Time Constant
- float _PLSTC; // Posting List Search Time Constant
- bool _hasWeight;
- bool _useBitVector;
+ const IEnumStoreDictionary& _dictionary;
+ const ISearchContext& _baseSearchCtx;
+ const BitVector* _bv; // bitvector if _useBitVector has been set
+ const FrozenDictionary _frozenDictionary;
+ DictionaryConstIterator _lowerDictItr;
+ DictionaryConstIterator _upperDictItr;
+ uint64_t _numValues; // attr.getStatus().getNumValues();
+ uint32_t _uniqueValues;
+ uint32_t _docIdLimit;
+ uint32_t _dictSize;
+ EntryRef _pidx;
+ EntryRef _frozenRoot; // Posting list in tree form
+ float _FSTC; // Filtering Search Time Constant
+ float _PLSTC; // Posting List Search Time Constant
+ bool _hasWeight;
+ bool _useBitVector;
+ mutable std::optional<size_t> _counted_hits; // Snapshot of size of posting lists in range
PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_dictionary, uint32_t docIdLimit,
uint64_t numValues, bool hasWeight, bool useBitVector, const ISearchContext &baseSearchCtx);
@@ -99,6 +102,9 @@ protected:
(calculateApproxNumHits() * MIN_APPROXHITS_TO_NUMDOCS_RATIO_BEFORE_APPROXIMATION > _docIdLimit) ||
(_uniqueValues > MIN_UNIQUE_VALUES_BEFORE_APPROXIMATION*10)));
}
+ virtual size_t countHits() const = 0;
+ virtual void fillArray() = 0;
+ virtual void fillBitVector() = 0;
};
@@ -126,9 +132,9 @@ protected:
~PostingListSearchContextT() override;
void lookupSingle();
- size_t countHits() const;
- void fillArray();
- void fillBitVector();
+ size_t countHits() const override;
+ void fillArray() override;
+ void fillBitVector() override;
void fetchPostings(const queryeval::ExecuteInfo & strict) override;
// this will be called instead of the fetchPostings function in some cases
@@ -147,22 +153,37 @@ protected:
template <class DataT>
class PostingListFoldedSearchContextT : public PostingListSearchContextT<DataT>
{
+public:
+ static constexpr uint32_t MAX_POSTING_INDEXES_SIZE = 10000;
+
protected:
using Parent = PostingListSearchContextT<DataT>;
using Dictionary = typename Parent::Dictionary;
+ using DictionaryConstIterator = Dictionary::ConstIterator;
+ using EntryRef = vespalib::datastore::EntryRef;
using PostingList = typename Parent::PostingList;
+ using Parent::_counted_hits;
+ using Parent::_docIdLimit;
using Parent::_lowerDictItr;
- using Parent::_uniqueValues;
+ using Parent::_merger;
using Parent::_postingList;
- using Parent::_docIdLimit;
- using Parent::countHits;
+ using Parent::_uniqueValues;
+ using Parent::_upperDictItr;
using Parent::singleHits;
+ using Parent::use_dictionary_entry;
+
+ mutable DictionaryConstIterator _resume_scan_itr;
+ mutable std::vector<EntryRef> _posting_indexes;
PostingListFoldedSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues,
bool hasWeight, const PostingList &postingList,
bool useBitVector, const ISearchContext &baseSearchCtx);
+ ~PostingListFoldedSearchContextT() override;
bool fallback_to_approx_num_hits() const override;
+ size_t countHits() const override;
+ void fillArray() override;
+ void fillBitVector() override;
};
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
index 91054b02d1e..c479bbfcd06 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
@@ -58,51 +58,42 @@ PostingListSearchContextT<DataT>::lookupSingle()
}
}
-
template <typename DataT>
size_t
PostingListSearchContextT<DataT>::countHits() const
{
+ if (_counted_hits.has_value()) {
+ return _counted_hits.value();
+ }
size_t sum(0);
- for (auto it(_lowerDictItr); it != _upperDictItr;) {
- if (use_dictionary_entry(it)) {
- sum += _postingList.frozenSize(it.getData().load_acquire());
- ++it;
- }
+ for (auto it(_lowerDictItr); it != _upperDictItr; ++it) {
+ sum += _postingList.frozenSize(it.getData().load_acquire());
}
+ _counted_hits = sum;
return sum;
}
-
template <typename DataT>
void
PostingListSearchContextT<DataT>::fillArray()
{
- for (auto it(_lowerDictItr); it != _upperDictItr;) {
- if (use_dictionary_entry(it)) {
- _merger.addToArray(PostingListTraverser<PostingList>(_postingList,
- it.getData().load_acquire()));
- ++it;
- }
+ for (auto it(_lowerDictItr); it != _upperDictItr; ++it) {
+ _merger.addToArray(PostingListTraverser<PostingList>(_postingList,
+ it.getData().load_acquire()));
}
_merger.merge();
}
-
template <typename DataT>
void
PostingListSearchContextT<DataT>::fillBitVector()
{
- for (auto it(_lowerDictItr); it != _upperDictItr;) {
- if (use_dictionary_entry(it)) {
- _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList,
- it.getData().load_acquire()));
- ++it;
- }
+ for (auto it(_lowerDictItr); it != _upperDictItr; ++it) {
+ _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList,
+ it.getData().load_acquire()));
}
}
-
template <typename DataT>
void
PostingListSearchContextT<DataT>::fetchPostings(const queryeval::ExecuteInfo & execInfo)
@@ -273,15 +264,90 @@ PostingListFoldedSearchContextT<DataT>::
PostingListFoldedSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues,
bool hasWeight, const PostingList &postingList,
bool useBitVector, const ISearchContext &searchContext)
- : Parent(dictionary, docIdLimit, numValues, hasWeight, postingList, useBitVector, searchContext)
+ : Parent(dictionary, docIdLimit, numValues, hasWeight, postingList, useBitVector, searchContext),
+ _resume_scan_itr(),
+ _posting_indexes()
{
}
template <typename DataT>
+PostingListFoldedSearchContextT<DataT>::~PostingListFoldedSearchContextT() = default;
+
+template <typename DataT>
bool
PostingListFoldedSearchContextT<DataT>::fallback_to_approx_num_hits() const
{
return false;
}
+template <typename DataT>
+size_t
+PostingListFoldedSearchContextT<DataT>::countHits() const
+{
+ if (_counted_hits.has_value()) {
+ return _counted_hits.value();
+ }
+ size_t sum(0);
+ bool overflow = false;
+ for (auto it(_lowerDictItr); it != _upperDictItr;) {
+ if (use_dictionary_entry(it)) {
+ auto pidx = it.getData().load_acquire();
+ if (pidx.valid()) {
+ sum += _postingList.frozenSize(pidx);
+ if (!overflow) {
+ if (_posting_indexes.size() < MAX_POSTING_INDEXES_SIZE) {
+ _posting_indexes.emplace_back(pidx);
+ } else {
+ overflow = true;
+ _resume_scan_itr = it;
+ }
+ }
+ }
+ ++it;
+ }
+ }
+ if (!overflow) {
+ _resume_scan_itr = _upperDictItr;
+ }
+ _counted_hits = sum;
+ return sum;
+}
+
+template <typename DataT>
+void
+PostingListFoldedSearchContextT<DataT>::fillArray()
+{
+ for (auto pidx : _posting_indexes) {
+ _merger.addToArray(PostingListTraverser<PostingList>(_postingList, pidx));
+ }
+ for (auto it(_resume_scan_itr); it != _upperDictItr;) {
+ if (use_dictionary_entry(it)) {
+ auto pidx = it.getData().load_acquire();
+ if (pidx.valid()) {
+ _merger.addToArray(PostingListTraverser<PostingList>(_postingList, pidx));
+ }
+ ++it;
+ }
+ }
+ _merger.merge();
+}
+
+template <typename DataT>
+void
+PostingListFoldedSearchContextT<DataT>::fillBitVector()
+{
+ for (auto pidx : _posting_indexes) {
+ _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, pidx));
+ }
+ for (auto it(_resume_scan_itr); it != _upperDictItr;) {
+ if (use_dictionary_entry(it)) {
+ auto pidx = it.getData().load_acquire();
+ if (pidx.valid()) {
+ _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, pidx));
+ }
+ ++it;
+ }
+ }
+}
+
}