aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@online.no>2023-09-27 16:33:09 +0200
committerTor Egge <Tor.Egge@online.no>2023-09-27 16:33:09 +0200
commit94ab70c2213512dda2fe4b1824dd238ad33530bb (patch)
treecb56890f878dc5a98501d1a1e29e27e830219104 /searchlib/src
parent03574a1f04b8f8b54401135ee077a86008123573 (diff)
Store a limited number of posting list indexes in countHits() to
reduce amount of dictionary entry filtering in fillArray() and fillBitVector() for regexp search and fuzzy search.
Diffstat (limited to 'searchlib/src')
-rw-r--r--searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp20
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp3
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h9
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp48
4 files changed, 70 insertions, 10 deletions
diff --git a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
index 40d4b20aaf2..ac1042dda6c 100644
--- a/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
+++ b/searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
@@ -4,6 +4,7 @@
#include <vespa/searchlib/attribute/attributefactory.h>
#include <vespa/searchlib/attribute/attributeiterators.h>
#include <vespa/searchlib/attribute/flagattribute.h>
+#include <vespa/searchlib/attribute/postinglistsearchcontext.h>
#include <vespa/searchlib/attribute/searchcontextelementiterator.h>
#include <vespa/searchlib/attribute/singleboolattribute.h>
#include <vespa/searchlib/attribute/stringbase.h>
@@ -1424,6 +1425,25 @@ SearchContextTest::testPrefixSearch(const vespalib::string& name, const Config&
}
}
}
+
+ // Long range of prefixes with unique strings that causes
+ // PostingListFoldedSearchContextT<DataT>::countHits() to populate
+ // partial vector of posting indexes, with scan resumed by
+ // fillArray or fillBitVector.
+ auto& vec = dynamic_cast<StringAttribute &>(*attr.get());
+ uint32_t old_size = attr->getNumDocs();
+ constexpr uint32_t longrange_values = search::attribute::PostingListFoldedSearchContextT<int32_t>::MAX_POSTING_INDEXES_SIZE + 100;
+ attr->addDocs(longrange_values);
+ DocSet exp_longrange;
+ for (uint32_t i = 0; i < longrange_values; ++i) {
+ vespalib::asciistream ss;
+ ss << "lpref" << i;
+ vespalib::string sss(ss.str());
+ exp_longrange.put(old_size + i);
+ vec.update(old_size + i, vespalib::string(ss.str()).c_str());
+ }
+ attr->commit();
+ performSearch(*attr, "lpref", exp_longrange, TermType::PREFIXTERM);
}
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp
index 12c887eb407..2b1d4fa3286 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.cpp
@@ -27,7 +27,8 @@ PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_
_FSTC(0.0),
_PLSTC(0.0),
_hasWeight(hasWeight),
- _useBitVector(useBitVector)
+ _useBitVector(useBitVector),
+ _counted_hits()
{
}
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
index 7e2002f95fd..9a4dbdd9c61 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
@@ -153,9 +153,14 @@ protected:
template <class DataT>
class PostingListFoldedSearchContextT : public PostingListSearchContextT<DataT>
{
+public:
+ static constexpr uint32_t MAX_POSTING_INDEXES_SIZE = 10000;
+
protected:
using Parent = PostingListSearchContextT<DataT>;
using Dictionary = typename Parent::Dictionary;
+ using DictionaryConstIterator = Dictionary::ConstIterator;
+ using EntryRef = vespalib::datastore::EntryRef;
using PostingList = typename Parent::PostingList;
using Parent::_counted_hits;
using Parent::_docIdLimit;
@@ -167,9 +172,13 @@ protected:
using Parent::singleHits;
using Parent::use_dictionary_entry;
+ mutable DictionaryConstIterator _resume_scan_itr;
+ mutable std::vector<EntryRef> _posting_indexes;
+
PostingListFoldedSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues,
bool hasWeight, const PostingList &postingList,
bool useBitVector, const ISearchContext &baseSearchCtx);
+ ~PostingListFoldedSearchContextT() override;
bool fallback_to_approx_num_hits() const override;
size_t countHits() const override;
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
index 714da9997bb..c479bbfcd06 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.hpp
@@ -264,11 +264,16 @@ PostingListFoldedSearchContextT<DataT>::
PostingListFoldedSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues,
bool hasWeight, const PostingList &postingList,
bool useBitVector, const ISearchContext &searchContext)
- : Parent(dictionary, docIdLimit, numValues, hasWeight, postingList, useBitVector, searchContext)
+ : Parent(dictionary, docIdLimit, numValues, hasWeight, postingList, useBitVector, searchContext),
+ _resume_scan_itr(),
+ _posting_indexes()
{
}
template <typename DataT>
+PostingListFoldedSearchContextT<DataT>::~PostingListFoldedSearchContextT() = default;
+
+template <typename DataT>
bool
PostingListFoldedSearchContextT<DataT>::fallback_to_approx_num_hits() const
{
@@ -283,12 +288,27 @@ PostingListFoldedSearchContextT<DataT>::countHits() const
return _counted_hits.value();
}
size_t sum(0);
- for (auto it(_lowerDictItr); it != this->_upperDictItr;) {
+ bool overflow = false;
+ for (auto it(_lowerDictItr); it != _upperDictItr;) {
if (use_dictionary_entry(it)) {
- sum += _postingList.frozenSize(it.getData().load_acquire());
+ auto pidx = it.getData().load_acquire();
+ if (pidx.valid()) {
+ sum += _postingList.frozenSize(pidx);
+ if (!overflow) {
+ if (_posting_indexes.size() < MAX_POSTING_INDEXES_SIZE) {
+ _posting_indexes.emplace_back(pidx);
+ } else {
+ overflow = true;
+ _resume_scan_itr = it;
+ }
+ }
+ }
++it;
}
}
+ if (!overflow) {
+ _resume_scan_itr = _upperDictItr;
+ }
_counted_hits = sum;
return sum;
}
@@ -297,10 +317,15 @@ template <typename DataT>
void
PostingListFoldedSearchContextT<DataT>::fillArray()
{
- for (auto it(_lowerDictItr); it != _upperDictItr;) {
+ for (auto pidx : _posting_indexes) {
+ _merger.addToArray(PostingListTraverser<PostingList>(_postingList, pidx));
+ }
+ for (auto it(_resume_scan_itr); it != _upperDictItr;) {
if (use_dictionary_entry(it)) {
- _merger.addToArray(PostingListTraverser<PostingList>(_postingList,
- it.getData().load_acquire()));
+ auto pidx = it.getData().load_acquire();
+ if (pidx.valid()) {
+ _merger.addToArray(PostingListTraverser<PostingList>(_postingList, pidx));
+ }
++it;
}
}
@@ -311,10 +336,15 @@ template <typename DataT>
void
PostingListFoldedSearchContextT<DataT>::fillBitVector()
{
- for (auto it(_lowerDictItr); it != _upperDictItr;) {
+ for (auto pidx : _posting_indexes) {
+ _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, pidx));
+ }
+ for (auto it(_resume_scan_itr); it != _upperDictItr;) {
if (use_dictionary_entry(it)) {
- _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList,
- it.getData().load_acquire()));
+ auto pidx = it.getData().load_acquire();
+ if (pidx.valid()) {
+ _merger.addToBitVector(PostingListTraverser<PostingList>(_postingList, pidx));
+ }
++it;
}
}