summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorGeir Storli <geirst@yahooinc.com>2024-01-19 15:40:57 +0000
committerGeir Storli <geirst@yahooinc.com>2024-01-19 15:40:57 +0000
commit206c895fbd7a5d8af615a5d2bbd2ee710e01dec3 (patch)
tree2af2600d5cf873970f97c0922e354bc8b87410f7 /searchlib
parent1f65f322a8d0f67e83129888b873a241c243a851 (diff)
Support hash filter iterator in DirectMultiTermBlueprint.
This is used in some non-strict cases, and should improve IN operator performance: https://github.com/vespa-engine/system-test/tree/master/tests/performance/in_operator.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp96
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp8
-rw-r--r--searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp49
-rw-r--r--searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h1
-rw-r--r--searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp12
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/dot_product_search.h1
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp101
-rw-r--r--searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h12
9 files changed, 244 insertions, 38 deletions
diff --git a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp
index a94ca9d890e..87b771af8e6 100644
--- a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp
+++ b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp
@@ -26,6 +26,7 @@ using LookupKey = IDirectPostingStore::LookupKey;
struct IntegerKey : public LookupKey {
int64_t _value;
IntegerKey(int64_t value_in) : _value(value_in) {}
+ IntegerKey(const vespalib::string&) : _value() { abort(); }
vespalib::stringref asString() const override { abort(); }
bool asInteger(int64_t& value) const override { value = _value; return true; }
};
@@ -33,6 +34,7 @@ struct IntegerKey : public LookupKey {
struct StringKey : public LookupKey {
vespalib::string _value;
StringKey(int64_t value_in) : _value(std::to_string(value_in)) {}
+ StringKey(const vespalib::string& value_in) : _value(value_in) {}
vespalib::stringref asString() const override { return _value; }
bool asInteger(int64_t&) const override { abort(); }
};
@@ -78,6 +80,10 @@ populate_attribute(AttributeType& attr, const std::vector<DataType>& values)
for (auto docid : range(300, 128)) {
attr.update(docid, values[3]);
}
+ if (values.size() > 4) {
+ attr.update(40, values[4]);
+ attr.update(41, values[5]);
+ }
attr.commit(true);
}
@@ -93,7 +99,7 @@ make_attribute(CollectionType col_type, BasicType type, bool field_is_filter)
auto attr = test::AttributeBuilder(field_name, cfg).docs(num_docs).get();
if (type == BasicType::STRING) {
populate_attribute<StringAttribute, vespalib::string>(dynamic_cast<StringAttribute&>(*attr),
- {"1", "3", "100", "300"});
+ {"1", "3", "100", "300", "foo", "Foo"});
} else {
populate_attribute<IntegerAttribute, int64_t>(dynamic_cast<IntegerAttribute&>(*attr),
{1, 3, 100, 300});
@@ -223,15 +229,16 @@ public:
tfmd.tagAsNotNeeded();
}
}
- template <typename BlueprintType>
- void add_term_helper(BlueprintType& b, int64_t term_value) {
+ template <typename BlueprintType, typename TermType>
+ void add_term_helper(BlueprintType& b, TermType term_value) {
if (integer_type) {
b.addTerm(IntegerKey(term_value), 1, estimate);
} else {
b.addTerm(StringKey(term_value), 1, estimate);
}
}
- void add_term(int64_t term_value) {
+ template <typename TermType>
+ void add_term(TermType term_value) {
if (single_type) {
if (in_operator) {
add_term_helper(dynamic_cast<SingleInBlueprintType&>(*blueprint), term_value);
@@ -246,8 +253,18 @@ public:
}
}
}
- std::unique_ptr<SearchIterator> create_leaf_search() const {
- return blueprint->createLeafSearch(tfmda, true);
+ void add_terms(const std::vector<int64_t>& term_values) {
+ for (auto value : term_values) {
+ add_term(value);
+ }
+ }
+ void add_terms(const std::vector<vespalib::string>& term_values) {
+ for (auto value : term_values) {
+ add_term(value);
+ }
+ }
+ std::unique_ptr<SearchIterator> create_leaf_search(bool strict = true) const {
+ return blueprint->createLeafSearch(tfmda, strict);
}
vespalib::string resolve_iterator_with_unpack() const {
if (in_operator) {
@@ -262,7 +279,7 @@ expect_hits(const Docids& exp_docids, SearchIterator& itr)
{
SimpleResult exp(exp_docids);
SimpleResult act;
- act.search(itr);
+ act.search(itr, doc_id_limit);
EXPECT_EQ(exp, act);
}
@@ -294,8 +311,7 @@ INSTANTIATE_TEST_SUITE_P(DefaultInstantiation,
TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_none_filter_field) {
setup(false, true);
- add_term(1);
- add_term(3);
+ add_terms({1, 3});
auto itr = create_leaf_search();
EXPECT_THAT(itr->asString(), StartsWith(resolve_iterator_with_unpack()));
expect_hits({10, 30, 31}, *itr);
@@ -307,8 +323,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_used_instead_of_btree_iterators_
if (!in_operator) {
return;
}
- add_term(1);
- add_term(100);
+ add_terms({1, 100});
auto itr = create_leaf_search();
expect_or_iterator(*itr, 2);
expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT");
@@ -322,8 +337,7 @@ TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_instead_of_bitvectors_
if (in_operator) {
return;
}
- add_term(1);
- add_term(100);
+ add_terms({1, 100});
auto itr = create_leaf_search();
EXPECT_THAT(itr->asString(), StartsWith(iterator_unpack_docid_and_weights));
expect_hits(concat({10}, range(100, 128)), *itr);
@@ -332,10 +346,7 @@ TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_instead_of_bitvectors_
TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_filter_field)
{
setup(true, true);
- add_term(1);
- add_term(3);
- add_term(100);
- add_term(300);
+ add_terms({1, 3, 100, 300});
auto itr = create_leaf_search();
expect_or_iterator(*itr, 3);
expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT");
@@ -347,8 +358,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_fil
TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field)
{
setup(true, true);
- add_term(100);
- add_term(300);
+ add_terms({100, 300});
auto itr = create_leaf_search();
expect_or_iterator(*itr, 2);
expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT");
@@ -359,8 +369,7 @@ TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field)
TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_filter_field_when_ranking_not_needed)
{
setup(true, false);
- add_term(1);
- add_term(3);
+ add_terms({1, 3});
auto itr = create_leaf_search();
EXPECT_THAT(itr->asString(), StartsWith(iterator_unpack_none));
expect_hits({10, 30, 31}, *itr);
@@ -369,10 +378,7 @@ TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_filter_field_when_
TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_filter_field_when_ranking_not_needed)
{
setup(true, false);
- add_term(1);
- add_term(3);
- add_term(100);
- add_term(300);
+ add_terms({1, 3, 100, 300});
auto itr = create_leaf_search();
expect_or_iterator(*itr, 3);
expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT");
@@ -384,8 +390,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_fil
TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field_when_ranking_not_needed)
{
setup(true, false);
- add_term(100);
- add_term(300);
+ add_terms({100, 300});
auto itr = create_leaf_search();
expect_or_iterator(*itr, 2);
expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT");
@@ -393,4 +398,41 @@ TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field_when_
expect_hits(concat(range(100, 128), range(300, 128)), *itr);
}
+TEST_P(DirectMultiTermBlueprintTest, hash_filter_used_for_non_strict_iterator_with_10_or_more_terms)
+{
+ setup(true, true);
+ if (!single_type) {
+ return;
+ }
+ add_terms({1, 3, 3, 3, 3, 3, 3, 3, 3, 3});
+ auto itr = create_leaf_search(false);
+ EXPECT_THAT(itr->asString(), StartsWith("search::attribute::MultiTermHashFilter"));
+ expect_hits({10, 30, 31}, *itr);
+}
+
+TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_non_strict_iterator_with_9_or_less_terms)
+{
+ setup(true, true);
+ if (!single_type) {
+ return;
+ }
+ add_terms({1, 3, 3, 3, 3, 3, 3, 3, 3});
+ auto itr = create_leaf_search(false);
+ EXPECT_THAT(itr->asString(), StartsWith(iterator_unpack_docid));
+ expect_hits({10, 30, 31}, *itr);
+}
+
+TEST_P(DirectMultiTermBlueprintTest, hash_filter_with_string_folding_used_for_non_strict_iterator)
+{
+ setup(true, true);
+ if (!single_type || integer_type) {
+ return;
+ }
+ // "foo" matches documents with "foo" (40) and "Foo" (41).
+ add_terms({"foo", "3", "3", "3", "3", "3", "3", "3", "3", "3"});
+ auto itr = create_leaf_search(false);
+ EXPECT_THAT(itr->asString(), StartsWith("search::attribute::MultiTermHashFilter"));
+ expect_hits({30, 31, 40, 41}, *itr);
+}
+
GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp
index 01148c11c9c..d0353ab8947 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp
@@ -6,12 +6,12 @@
#include <vespa/searchlib/common/bitvector.h>
#include <vespa/searchlib/fef/matchdatalayout.h>
#include <vespa/searchlib/query/query_term_ucs4.h>
+#include <vespa/searchlib/queryeval/filter_wrapper.h>
+#include <vespa/searchlib/queryeval/orsearch.h>
#include <vespa/searchlib/queryeval/weighted_set_term_search.h>
#include <vespa/vespalib/objects/visit.h>
-#include <vespa/vespalib/util/stringfmt.h>
#include <vespa/vespalib/stllike/hash_map.hpp>
-#include <vespa/searchlib/queryeval/filter_wrapper.h>
-#include <vespa/searchlib/queryeval/orsearch.h>
+#include <vespa/vespalib/util/stringfmt.h>
namespace search {
@@ -38,6 +38,7 @@ class StringEnumWrapper : public AttrWrapper
{
public:
using TokenT = uint32_t;
+ static constexpr bool unpack_weights = true;
explicit StringEnumWrapper(const IAttributeVector & attr)
: AttrWrapper(attr) {}
auto mapToken(const ISearchContext &context) const {
@@ -52,6 +53,7 @@ class IntegerWrapper : public AttrWrapper
{
public:
using TokenT = uint64_t;
+ static constexpr bool unpack_weights = true;
explicit IntegerWrapper(const IAttributeVector & attr) : AttrWrapper(attr) {}
std::vector<int64_t> mapToken(const ISearchContext &context) const {
std::vector<int64_t> result;
diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h
index 42651854599..485427391ad 100644
--- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h
+++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h
@@ -35,6 +35,8 @@ private:
using IteratorType = typename PostingStoreType::IteratorType;
using IteratorWeights = std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>>;
+ bool use_hash_filter(bool strict) const;
+
IteratorWeights create_iterators(std::vector<IteratorType>& btree_iterators,
std::vector<std::unique_ptr<queryeval::SearchIterator>>& bitvectors,
bool use_bitvector_when_available,
diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp
index 6fc8ac63026..0a3b24142a5 100644
--- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp
@@ -8,6 +8,7 @@
#include <vespa/searchlib/queryeval/emptysearch.h>
#include <vespa/searchlib/queryeval/filter_wrapper.h>
#include <vespa/searchlib/queryeval/orsearch.h>
+#include <cmath>
#include <memory>
#include <type_traits>
@@ -38,6 +39,43 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::DirectMultiTermBlueprint
template <typename PostingStoreType, typename SearchType>
DirectMultiTermBlueprint<PostingStoreType, SearchType>::~DirectMultiTermBlueprint() = default;
+
+template <typename PostingStoreType, typename SearchType>
+bool
+DirectMultiTermBlueprint<PostingStoreType, SearchType>::use_hash_filter(bool strict) const
+{
+ if (strict || _iattr.hasMultiValue()) {
+ return false;
+ }
+ // The following very simplified formula was created after analysing performance of the IN operator
+ // on a 10M document corpus using a machine with an Intel Xeon 2.5 GHz CPU with 48 cores and 256 Gb of memory:
+ // https://github.com/vespa-engine/system-test/tree/master/tests/performance/in_operator
+ //
+ // The following 25 test cases were used to calculate the cost of using btree iterators (strict):
+ // op_hits_ratios = [5, 10, 50, 100, 200] * tokens_in_op = [1, 5, 10, 100, 1000]
+ // For each case we calculate the latency difference against the case with tokens_in_op=1 and the same op_hits_ratio.
+ // This indicates the extra time used to produce the same number of hits when having multiple tokens in the operator.
+ // The latency diff is divided with the number of hits produced and convert to nanoseconds:
+ // 10M * (op_hits_ratio / 1000) * 1000 * 1000
+ // Based on the numbers we can approximate the cost per document (in nanoseconds) as:
+ // 8.0 (ns) * log2(tokens_in_op).
+ // NOTE: This is very simplified. Ideally we should also take into consideration the hit estimate of this blueprint,
+ // as the cost per document will be lower when producing few hits.
+ //
+ // In addition, the following 28 test cases were used to calculate the cost of using the hash filter (non-strict).
+ // filter_hits_ratios = [1, 5, 10, 50, 100, 150, 200] x op_hits_ratios = [200] x tokens_in_op = [5, 10, 100, 1000]
+ // The code was altered to always using the hash filter for non-strict iterators.
+ // For each case we calculate the latency difference against a case from above with tokens_in_op=1 that produce a similar number of hits.
+ // This indicates the extra time used to produce the same number of hits when using the hash filter.
+ // The latency diff is divided with the number of hits the test filter produces and convert to nanoseconds:
+ // 10M * (filter_hits_ratio / 1000) * 1000 * 1000
+ // Based on the numbers we calculate the average cost per document (in nanoseconds) as 26.0 ns.
+
+ float hash_filter_cost_per_doc_ns = 26.0;
+ float btree_iterator_cost_per_doc_ns = 8.0 * std::log2(_terms.size());
+ return hash_filter_cost_per_doc_ns < btree_iterator_cost_per_doc_ns;
+}
+
template <typename PostingStoreType, typename SearchType>
typename DirectMultiTermBlueprint<PostingStoreType, SearchType>::IteratorWeights
DirectMultiTermBlueprint<PostingStoreType, SearchType>::create_iterators(std::vector<IteratorType>& btree_iterators,
@@ -96,14 +134,21 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::create_search_helper(con
if (_terms.empty()) {
return std::make_unique<queryeval::EmptySearch>();
}
+ auto& tfmd = *tfmda[0];
+ bool field_is_filter = getState().fields()[0].isFilter();
+ if constexpr (SearchType::supports_hash_filter) {
+ if (use_hash_filter(strict)) {
+ return SearchType::create_hash_filter(tfmd, (filter_search || field_is_filter),
+ _weights, _terms,
+ _iattr, _attr, _dictionary_snapshot);
+ }
+ }
std::vector<IteratorType> btree_iterators;
std::vector<queryeval::SearchIterator::UP> bitvectors;
const size_t num_children = _terms.size();
btree_iterators.reserve(num_children);
- auto& tfmd = *tfmda[0];
bool use_bit_vector_when_available = filter_search || !_attr.has_always_btree_iterator();
auto weights = create_iterators(btree_iterators, bitvectors, use_bit_vector_when_available, tfmd, strict);
- bool field_is_filter = getState().fields()[0].isFilter();
if constexpr (!SearchType::require_btree_iterators) {
auto multi_term = !btree_iterators.empty() ?
SearchType::create(tfmd, (filter_search || field_is_filter), std::move(weights), std::move(btree_iterators))
diff --git a/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h b/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h
index 9c3ea258fdc..43500366f21 100644
--- a/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h
+++ b/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h
@@ -23,6 +23,7 @@ class MultiTermHashFilter final : public queryeval::SearchIterator
public:
using Key = typename WrapperType::TokenT;
using TokenMap = vespalib::hash_map<Key, int32_t, vespalib::hash<Key>, std::equal_to<Key>, vespalib::hashtable_base::and_modulator>;
+ static constexpr bool unpack_weights = WrapperType::unpack_weights;
private:
fef::TermFieldMatchData& _tfmd;
diff --git a/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp b/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp
index 96d5b3ac1f3..e67bda147d7 100644
--- a/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp
@@ -42,10 +42,14 @@ template <typename WrapperType>
void
MultiTermHashFilter<WrapperType>::doUnpack(uint32_t docId)
{
- _tfmd.reset(docId);
- fef::TermFieldMatchDataPosition pos;
- pos.setElementWeight(_weight);
- _tfmd.appendPosition(pos);
+ if constexpr (unpack_weights) {
+ _tfmd.reset(docId);
+ fef::TermFieldMatchDataPosition pos;
+ pos.setElementWeight(_weight);
+ _tfmd.appendPosition(pos);
+ } else {
+ _tfmd.resetOnlyDocId(docId);
+ }
}
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h b/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h
index e49fcbcb5bc..c74c2d4e9a7 100644
--- a/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h
+++ b/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h
@@ -27,6 +27,7 @@ protected:
public:
static constexpr bool filter_search = false;
static constexpr bool require_btree_iterators = true;
+ static constexpr bool supports_hash_filter = false;
// TODO: use MultiSearch::Children to pass ownership
static SearchIterator::UP create(const std::vector<SearchIterator*> &children,
diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp
index 0c57b21aba6..0be014474d0 100644
--- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp
+++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp
@@ -1,10 +1,13 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "weighted_set_term_search.h"
+#include <vespa/searchcommon/attribute/i_search_context.h>
+#include <vespa/searchcommon/attribute/iattributevector.h>
+#include <vespa/searchlib/attribute/i_direct_posting_store.h>
+#include <vespa/searchlib/attribute/multi_term_hash_filter.hpp>
#include <vespa/searchlib/common/bitvector.h>
-#include <vespa/searchlib/attribute/multi_term_or_filter_search.h>
#include <vespa/vespalib/objects/visit.h>
-#include <vespa/searchcommon/attribute/i_search_context.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
#include "iterator_pack.h"
#include "blueprint.h"
@@ -237,4 +240,98 @@ WeightedSetTermSearch::create(fef::TermFieldMatchData &tmd,
return create_helper_resolve_pack<DocidWithWeightIterator, DocidWithWeightIteratorPack>(tmd, is_filter_search, std::move(weights), std::move(iterators));
}
+namespace {
+
+class HashFilterWrapper {
+protected:
+ const attribute::IAttributeVector& _attr;
+public:
+ HashFilterWrapper(const attribute::IAttributeVector& attr) : _attr(attr) {}
+};
+
+template <bool unpack_weights_t>
+class StringHashFilterWrapper : public HashFilterWrapper {
+public:
+ using TokenT = attribute::IAttributeVector::EnumHandle;
+ static constexpr bool unpack_weights = unpack_weights_t;
+ StringHashFilterWrapper(const attribute::IAttributeVector& attr)
+ : HashFilterWrapper(attr)
+ {}
+ auto mapToken(const IDirectPostingStore::LookupResult& term, const IDirectPostingStore& store, vespalib::datastore::EntryRef dict_snapshot) const {
+ std::vector<TokenT> result;
+ store.collect_folded(term.enum_idx, dict_snapshot, [&](vespalib::datastore::EntryRef ref) { result.emplace_back(ref.ref()); });
+ return result;
+ }
+ TokenT getToken(uint32_t docid) const {
+ return _attr.getEnum(docid);
+ }
+};
+
+template <bool unpack_weights_t>
+class IntegerHashFilterWrapper : public HashFilterWrapper {
+public:
+ using TokenT = attribute::IAttributeVector::largeint_t;
+ static constexpr bool unpack_weights = unpack_weights_t;
+ IntegerHashFilterWrapper(const attribute::IAttributeVector& attr)
+ : HashFilterWrapper(attr)
+ {}
+ auto mapToken(const IDirectPostingStore::LookupResult& term,
+ const IDirectPostingStore& store,
+ vespalib::datastore::EntryRef) const {
+ std::vector<TokenT> result;
+ result.emplace_back(store.get_integer_value(term.enum_idx));
+ return result;
+ }
+ TokenT getToken(uint32_t docid) const {
+ return _attr.getInt(docid);
+ }
+};
+
+template <typename WrapperType>
+SearchIterator::UP
+create_hash_filter_helper(fef::TermFieldMatchData& tfmd,
+ const std::vector<int32_t>& weights,
+ const std::vector<IDirectPostingStore::LookupResult>& terms,
+ const attribute::IAttributeVector& attr,
+ const IDirectPostingStore& posting_store,
+ vespalib::datastore::EntryRef dict_snapshot)
+{
+ using FilterType = attribute::MultiTermHashFilter<WrapperType>;
+ typename FilterType::TokenMap tokens;
+ WrapperType wrapper(attr);
+ for (size_t i = 0; i < terms.size(); ++i) {
+ for (auto token : wrapper.mapToken(terms[i], posting_store, dict_snapshot)) {
+ tokens[token] = weights[i];
+ }
+ }
+ return std::make_unique<FilterType>(tfmd, wrapper, std::move(tokens));
+}
+
+}
+
+SearchIterator::UP
+WeightedSetTermSearch::create_hash_filter(search::fef::TermFieldMatchData& tmd,
+ bool is_filter_search,
+ const std::vector<int32_t>& weights,
+ const std::vector<IDirectPostingStore::LookupResult>& terms,
+ const attribute::IAttributeVector& attr,
+ const IDirectPostingStore& posting_store,
+ vespalib::datastore::EntryRef dict_snapshot)
+{
+ if (attr.isStringType()) {
+ if (is_filter_search) {
+ return create_hash_filter_helper<StringHashFilterWrapper<false>>(tmd, weights, terms, attr, posting_store, dict_snapshot);
+ } else {
+ return create_hash_filter_helper<StringHashFilterWrapper<true>>(tmd, weights, terms, attr, posting_store, dict_snapshot);
+ }
+ } else {
+ assert(attr.isIntegerType());
+ if (is_filter_search) {
+ return create_hash_filter_helper<IntegerHashFilterWrapper<false>>(tmd, weights, terms, attr, posting_store, dict_snapshot);
+ } else {
+ return create_hash_filter_helper<IntegerHashFilterWrapper<true>>(tmd, weights, terms, attr, posting_store, dict_snapshot);
+ }
+ }
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h
index 7e47928fb90..d078fd5babc 100644
--- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h
+++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h
@@ -11,6 +11,8 @@
#include <variant>
#include <vector>
+namespace search::attribute { class IAttributeVector; }
+
namespace search::fef { class TermFieldMatchData; }
namespace search::queryeval {
@@ -30,6 +32,8 @@ public:
static constexpr bool filter_search = false;
// Whether this iterator requires btree iterators for all tokens/terms used by the operator.
static constexpr bool require_btree_iterators = false;
+ // Whether this supports creating a hash filter iterator;
+ static constexpr bool supports_hash_filter = true;
// TODO: pass ownership with unique_ptr
static SearchIterator::UP create(const std::vector<SearchIterator *> &children,
@@ -48,6 +52,14 @@ public:
std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights,
std::vector<DocidWithWeightIterator> &&iterators);
+ static SearchIterator::UP create_hash_filter(search::fef::TermFieldMatchData& tmd,
+ bool is_filter_search,
+ const std::vector<int32_t>& weights,
+ const std::vector<IDirectPostingStore::LookupResult>& terms,
+ const attribute::IAttributeVector& attr,
+ const IDirectPostingStore& posting_store,
+ vespalib::datastore::EntryRef dictionary_snapshot);
+
// used during docsum fetching to identify matching elements
// initRange must be called before use.
// doSeek/doUnpack must not be called.