diff options
Diffstat (limited to 'searchlib')
23 files changed, 363 insertions, 81 deletions
diff --git a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp index a94ca9d890e..87b771af8e6 100644 --- a/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp +++ b/searchlib/src/tests/attribute/direct_multi_term_blueprint/direct_multi_term_blueprint_test.cpp @@ -26,6 +26,7 @@ using LookupKey = IDirectPostingStore::LookupKey; struct IntegerKey : public LookupKey { int64_t _value; IntegerKey(int64_t value_in) : _value(value_in) {} + IntegerKey(const vespalib::string&) : _value() { abort(); } vespalib::stringref asString() const override { abort(); } bool asInteger(int64_t& value) const override { value = _value; return true; } }; @@ -33,6 +34,7 @@ struct IntegerKey : public LookupKey { struct StringKey : public LookupKey { vespalib::string _value; StringKey(int64_t value_in) : _value(std::to_string(value_in)) {} + StringKey(const vespalib::string& value_in) : _value(value_in) {} vespalib::stringref asString() const override { return _value; } bool asInteger(int64_t&) const override { abort(); } }; @@ -78,6 +80,10 @@ populate_attribute(AttributeType& attr, const std::vector<DataType>& values) for (auto docid : range(300, 128)) { attr.update(docid, values[3]); } + if (values.size() > 4) { + attr.update(40, values[4]); + attr.update(41, values[5]); + } attr.commit(true); } @@ -93,7 +99,7 @@ make_attribute(CollectionType col_type, BasicType type, bool field_is_filter) auto attr = test::AttributeBuilder(field_name, cfg).docs(num_docs).get(); if (type == BasicType::STRING) { populate_attribute<StringAttribute, vespalib::string>(dynamic_cast<StringAttribute&>(*attr), - {"1", "3", "100", "300"}); + {"1", "3", "100", "300", "foo", "Foo"}); } else { populate_attribute<IntegerAttribute, int64_t>(dynamic_cast<IntegerAttribute&>(*attr), {1, 3, 100, 300}); @@ -223,15 +229,16 @@ public: tfmd.tagAsNotNeeded(); } } - template <typename BlueprintType> - void add_term_helper(BlueprintType& b, int64_t term_value) { + template <typename BlueprintType, typename TermType> + void add_term_helper(BlueprintType& b, TermType term_value) { if (integer_type) { b.addTerm(IntegerKey(term_value), 1, estimate); } else { b.addTerm(StringKey(term_value), 1, estimate); } } - void add_term(int64_t term_value) { + template <typename TermType> + void add_term(TermType term_value) { if (single_type) { if (in_operator) { add_term_helper(dynamic_cast<SingleInBlueprintType&>(*blueprint), term_value); @@ -246,8 +253,18 @@ public: } } } - std::unique_ptr<SearchIterator> create_leaf_search() const { - return blueprint->createLeafSearch(tfmda, true); + void add_terms(const std::vector<int64_t>& term_values) { + for (auto value : term_values) { + add_term(value); + } + } + void add_terms(const std::vector<vespalib::string>& term_values) { + for (auto value : term_values) { + add_term(value); + } + } + std::unique_ptr<SearchIterator> create_leaf_search(bool strict = true) const { + return blueprint->createLeafSearch(tfmda, strict); } vespalib::string resolve_iterator_with_unpack() const { if (in_operator) { @@ -262,7 +279,7 @@ expect_hits(const Docids& exp_docids, SearchIterator& itr) { SimpleResult exp(exp_docids); SimpleResult act; - act.search(itr); + act.search(itr, doc_id_limit); EXPECT_EQ(exp, act); } @@ -294,8 +311,7 @@ INSTANTIATE_TEST_SUITE_P(DefaultInstantiation, TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_none_filter_field) { setup(false, true); - add_term(1); - add_term(3); + add_terms({1, 3}); auto itr = create_leaf_search(); EXPECT_THAT(itr->asString(), StartsWith(resolve_iterator_with_unpack())); expect_hits({10, 30, 31}, *itr); @@ -307,8 +323,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_used_instead_of_btree_iterators_ if (!in_operator) { return; } - add_term(1); - add_term(100); + add_terms({1, 100}); auto itr = create_leaf_search(); expect_or_iterator(*itr, 2); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); @@ -322,8 +337,7 @@ TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_instead_of_bitvectors_ if (in_operator) { return; } - add_term(1); - add_term(100); + add_terms({1, 100}); auto itr = create_leaf_search(); EXPECT_THAT(itr->asString(), StartsWith(iterator_unpack_docid_and_weights)); expect_hits(concat({10}, range(100, 128)), *itr); @@ -332,10 +346,7 @@ TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_instead_of_bitvectors_ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_filter_field) { setup(true, true); - add_term(1); - add_term(3); - add_term(100); - add_term(300); + add_terms({1, 3, 100, 300}); auto itr = create_leaf_search(); expect_or_iterator(*itr, 3); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); @@ -347,8 +358,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_fil TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field) { setup(true, true); - add_term(100); - add_term(300); + add_terms({100, 300}); auto itr = create_leaf_search(); expect_or_iterator(*itr, 2); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); @@ -359,8 +369,7 @@ TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field) TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_filter_field_when_ranking_not_needed) { setup(true, false); - add_term(1); - add_term(3); + add_terms({1, 3}); auto itr = create_leaf_search(); EXPECT_THAT(itr->asString(), StartsWith(iterator_unpack_none)); expect_hits({10, 30, 31}, *itr); @@ -369,10 +378,7 @@ TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_filter_field_when_ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_filter_field_when_ranking_not_needed) { setup(true, false); - add_term(1); - add_term(3); - add_term(100); - add_term(300); + add_terms({1, 3, 100, 300}); auto itr = create_leaf_search(); expect_or_iterator(*itr, 3); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); @@ -384,8 +390,7 @@ TEST_P(DirectMultiTermBlueprintTest, bitvectors_and_btree_iterators_used_for_fil TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field_when_ranking_not_needed) { setup(true, false); - add_term(100); - add_term(300); + add_terms({100, 300}); auto itr = create_leaf_search(); expect_or_iterator(*itr, 2); expect_or_child(*itr, 0, "search::BitVectorIteratorStrictT"); @@ -393,4 +398,41 @@ TEST_P(DirectMultiTermBlueprintTest, only_bitvectors_used_for_filter_field_when_ expect_hits(concat(range(100, 128), range(300, 128)), *itr); } +TEST_P(DirectMultiTermBlueprintTest, hash_filter_used_for_non_strict_iterator_with_10_or_more_terms) +{ + setup(true, true); + if (!single_type) { + return; + } + add_terms({1, 3, 3, 3, 3, 3, 3, 3, 3, 3}); + auto itr = create_leaf_search(false); + EXPECT_THAT(itr->asString(), StartsWith("search::attribute::MultiTermHashFilter")); + expect_hits({10, 30, 31}, *itr); +} + +TEST_P(DirectMultiTermBlueprintTest, btree_iterators_used_for_non_strict_iterator_with_9_or_less_terms) +{ + setup(true, true); + if (!single_type) { + return; + } + add_terms({1, 3, 3, 3, 3, 3, 3, 3, 3}); + auto itr = create_leaf_search(false); + EXPECT_THAT(itr->asString(), StartsWith(iterator_unpack_docid)); + expect_hits({10, 30, 31}, *itr); +} + +TEST_P(DirectMultiTermBlueprintTest, hash_filter_with_string_folding_used_for_non_strict_iterator) +{ + setup(true, true); + if (!single_type || integer_type) { + return; + } + // "foo" matches documents with "foo" (40) and "Foo" (41). + add_terms({"foo", "3", "3", "3", "3", "3", "3", "3", "3", "3"}); + auto itr = create_leaf_search(false); + EXPECT_THAT(itr->asString(), StartsWith("search::attribute::MultiTermHashFilter")); + expect_hits({30, 31, 40, 41}, *itr); +} + GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/query/streaming_query_test.cpp b/searchlib/src/tests/query/streaming_query_test.cpp index fe6149e6fba..97b3d88c25e 100644 --- a/searchlib/src/tests/query/streaming_query_test.cpp +++ b/searchlib/src/tests/query/streaming_query_test.cpp @@ -23,9 +23,9 @@ using TermType = QueryTerm::Type; using search::fef::SimpleTermData; using search::fef::MatchData; -void assertHit(const Hit & h, size_t expWordpos, size_t expContext, int32_t weight) { +void assertHit(const Hit & h, size_t expWordpos, size_t exp_field_id, int32_t weight) { EXPECT_EQ(h.wordpos(), expWordpos); - EXPECT_EQ(h.context(), expContext); + EXPECT_EQ(h.field_id(), exp_field_id); EXPECT_EQ(h.weight(), weight); } @@ -479,11 +479,11 @@ TEST(StreamingQueryTest, test_phrase_evaluate) p->evaluateHits(hits); ASSERT_EQ(3u, hits.size()); EXPECT_EQ(hits[0].wordpos(), 2u); - EXPECT_EQ(hits[0].context(), 0u); + EXPECT_EQ(hits[0].field_id(), 0u); EXPECT_EQ(hits[1].wordpos(), 6u); - EXPECT_EQ(hits[1].context(), 1u); + EXPECT_EQ(hits[1].field_id(), 1u); EXPECT_EQ(hits[2].wordpos(), 2u); - EXPECT_EQ(hits[2].context(), 3u); + EXPECT_EQ(hits[2].field_id(), 3u); ASSERT_EQ(4u, p->getFieldInfoSize()); EXPECT_EQ(p->getFieldInfo(0).getHitOffset(), 0u); EXPECT_EQ(p->getFieldInfo(0).getHitCount(), 1u); @@ -847,22 +847,22 @@ TEST(StreamingQueryTest, test_same_element_evaluate) sameElem->evaluateHits(hits); EXPECT_EQ(4u, hits.size()); EXPECT_EQ(0u, hits[0].wordpos()); - EXPECT_EQ(2u, hits[0].context()); + EXPECT_EQ(2u, hits[0].field_id()); EXPECT_EQ(0u, hits[0].elemId()); EXPECT_EQ(130, hits[0].weight()); EXPECT_EQ(0u, hits[1].wordpos()); - EXPECT_EQ(2u, hits[1].context()); + EXPECT_EQ(2u, hits[1].field_id()); EXPECT_EQ(2u, hits[1].elemId()); EXPECT_EQ(140, hits[1].weight()); EXPECT_EQ(0u, hits[2].wordpos()); - EXPECT_EQ(2u, hits[2].context()); + EXPECT_EQ(2u, hits[2].field_id()); EXPECT_EQ(4u, hits[2].elemId()); EXPECT_EQ(150, hits[2].weight()); EXPECT_EQ(0u, hits[3].wordpos()); - EXPECT_EQ(2u, hits[3].context()); + EXPECT_EQ(2u, hits[3].field_id()); EXPECT_EQ(5u, hits[3].elemId()); EXPECT_EQ(160, hits[3].weight()); EXPECT_TRUE(sameElem->evaluate()); diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp index 01148c11c9c..d0353ab8947 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp @@ -6,12 +6,12 @@ #include <vespa/searchlib/common/bitvector.h> #include <vespa/searchlib/fef/matchdatalayout.h> #include <vespa/searchlib/query/query_term_ucs4.h> +#include <vespa/searchlib/queryeval/filter_wrapper.h> +#include <vespa/searchlib/queryeval/orsearch.h> #include <vespa/searchlib/queryeval/weighted_set_term_search.h> #include <vespa/vespalib/objects/visit.h> -#include <vespa/vespalib/util/stringfmt.h> #include <vespa/vespalib/stllike/hash_map.hpp> -#include <vespa/searchlib/queryeval/filter_wrapper.h> -#include <vespa/searchlib/queryeval/orsearch.h> +#include <vespa/vespalib/util/stringfmt.h> namespace search { @@ -38,6 +38,7 @@ class StringEnumWrapper : public AttrWrapper { public: using TokenT = uint32_t; + static constexpr bool unpack_weights = true; explicit StringEnumWrapper(const IAttributeVector & attr) : AttrWrapper(attr) {} auto mapToken(const ISearchContext &context) const { @@ -52,6 +53,7 @@ class IntegerWrapper : public AttrWrapper { public: using TokenT = uint64_t; + static constexpr bool unpack_weights = true; explicit IntegerWrapper(const IAttributeVector & attr) : AttrWrapper(attr) {} std::vector<int64_t> mapToken(const ISearchContext &context) const { std::vector<int64_t> result; diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h index 42651854599..485427391ad 100644 --- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h +++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.h @@ -35,6 +35,8 @@ private: using IteratorType = typename PostingStoreType::IteratorType; using IteratorWeights = std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>>; + bool use_hash_filter(bool strict) const; + IteratorWeights create_iterators(std::vector<IteratorType>& btree_iterators, std::vector<std::unique_ptr<queryeval::SearchIterator>>& bitvectors, bool use_bitvector_when_available, diff --git a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp index 6fc8ac63026..0a3b24142a5 100644 --- a/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp +++ b/searchlib/src/vespa/searchlib/attribute/direct_multi_term_blueprint.hpp @@ -8,6 +8,7 @@ #include <vespa/searchlib/queryeval/emptysearch.h> #include <vespa/searchlib/queryeval/filter_wrapper.h> #include <vespa/searchlib/queryeval/orsearch.h> +#include <cmath> #include <memory> #include <type_traits> @@ -38,6 +39,43 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::DirectMultiTermBlueprint template <typename PostingStoreType, typename SearchType> DirectMultiTermBlueprint<PostingStoreType, SearchType>::~DirectMultiTermBlueprint() = default; + +template <typename PostingStoreType, typename SearchType> +bool +DirectMultiTermBlueprint<PostingStoreType, SearchType>::use_hash_filter(bool strict) const +{ + if (strict || _iattr.hasMultiValue()) { + return false; + } + // The following very simplified formula was created after analysing performance of the IN operator + // on a 10M document corpus using a machine with an Intel Xeon 2.5 GHz CPU with 48 cores and 256 Gb of memory: + // https://github.com/vespa-engine/system-test/tree/master/tests/performance/in_operator + // + // The following 25 test cases were used to calculate the cost of using btree iterators (strict): + // op_hits_ratios = [5, 10, 50, 100, 200] * tokens_in_op = [1, 5, 10, 100, 1000] + // For each case we calculate the latency difference against the case with tokens_in_op=1 and the same op_hits_ratio. + // This indicates the extra time used to produce the same number of hits when having multiple tokens in the operator. + // The latency diff is divided with the number of hits produced and convert to nanoseconds: + // 10M * (op_hits_ratio / 1000) * 1000 * 1000 + // Based on the numbers we can approximate the cost per document (in nanoseconds) as: + // 8.0 (ns) * log2(tokens_in_op). + // NOTE: This is very simplified. Ideally we should also take into consideration the hit estimate of this blueprint, + // as the cost per document will be lower when producing few hits. + // + // In addition, the following 28 test cases were used to calculate the cost of using the hash filter (non-strict). + // filter_hits_ratios = [1, 5, 10, 50, 100, 150, 200] x op_hits_ratios = [200] x tokens_in_op = [5, 10, 100, 1000] + // The code was altered to always using the hash filter for non-strict iterators. + // For each case we calculate the latency difference against a case from above with tokens_in_op=1 that produce a similar number of hits. + // This indicates the extra time used to produce the same number of hits when using the hash filter. + // The latency diff is divided with the number of hits the test filter produces and convert to nanoseconds: + // 10M * (filter_hits_ratio / 1000) * 1000 * 1000 + // Based on the numbers we calculate the average cost per document (in nanoseconds) as 26.0 ns. + + float hash_filter_cost_per_doc_ns = 26.0; + float btree_iterator_cost_per_doc_ns = 8.0 * std::log2(_terms.size()); + return hash_filter_cost_per_doc_ns < btree_iterator_cost_per_doc_ns; +} + template <typename PostingStoreType, typename SearchType> typename DirectMultiTermBlueprint<PostingStoreType, SearchType>::IteratorWeights DirectMultiTermBlueprint<PostingStoreType, SearchType>::create_iterators(std::vector<IteratorType>& btree_iterators, @@ -96,14 +134,21 @@ DirectMultiTermBlueprint<PostingStoreType, SearchType>::create_search_helper(con if (_terms.empty()) { return std::make_unique<queryeval::EmptySearch>(); } + auto& tfmd = *tfmda[0]; + bool field_is_filter = getState().fields()[0].isFilter(); + if constexpr (SearchType::supports_hash_filter) { + if (use_hash_filter(strict)) { + return SearchType::create_hash_filter(tfmd, (filter_search || field_is_filter), + _weights, _terms, + _iattr, _attr, _dictionary_snapshot); + } + } std::vector<IteratorType> btree_iterators; std::vector<queryeval::SearchIterator::UP> bitvectors; const size_t num_children = _terms.size(); btree_iterators.reserve(num_children); - auto& tfmd = *tfmda[0]; bool use_bit_vector_when_available = filter_search || !_attr.has_always_btree_iterator(); auto weights = create_iterators(btree_iterators, bitvectors, use_bit_vector_when_available, tfmd, strict); - bool field_is_filter = getState().fields()[0].isFilter(); if constexpr (!SearchType::require_btree_iterators) { auto multi_term = !btree_iterators.empty() ? SearchType::create(tfmd, (filter_search || field_is_filter), std::move(weights), std::move(btree_iterators)) diff --git a/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h b/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h index 9c3ea258fdc..43500366f21 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h +++ b/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.h @@ -23,6 +23,7 @@ class MultiTermHashFilter final : public queryeval::SearchIterator public: using Key = typename WrapperType::TokenT; using TokenMap = vespalib::hash_map<Key, int32_t, vespalib::hash<Key>, std::equal_to<Key>, vespalib::hashtable_base::and_modulator>; + static constexpr bool unpack_weights = WrapperType::unpack_weights; private: fef::TermFieldMatchData& _tfmd; diff --git a/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp b/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp index 96d5b3ac1f3..e67bda147d7 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multi_term_hash_filter.hpp @@ -42,10 +42,14 @@ template <typename WrapperType> void MultiTermHashFilter<WrapperType>::doUnpack(uint32_t docId) { - _tfmd.reset(docId); - fef::TermFieldMatchDataPosition pos; - pos.setElementWeight(_weight); - _tfmd.appendPosition(pos); + if constexpr (unpack_weights) { + _tfmd.reset(docId); + fef::TermFieldMatchDataPosition pos; + pos.setElementWeight(_weight); + _tfmd.appendPosition(pos); + } else { + _tfmd.resetOnlyDocId(docId); + } } } diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt index 05a75f4662e..76119a6d58f 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt @@ -2,6 +2,7 @@ vespa_add_library(searchlib_query_streaming OBJECT SOURCES dot_product_term.cpp + fuzzy_term.cpp in_term.cpp multi_term.cpp nearest_neighbor_query_node.cpp diff --git a/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp index 1871bda564d..b3bfbb0e86b 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/dot_product_term.cpp @@ -24,7 +24,7 @@ DotProductTerm::build_scores(Scores& scores) const for (const auto& term : _terms) { auto& hl = term->evaluateHits(hl_store); for (auto& hit : hl) { - scores[hit.context()] += ((int64_t)term->weight().percent()) * hit.weight(); + scores[hit.field_id()] += ((int64_t)term->weight().percent()) * hit.weight(); } } } diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp new file mode 100644 index 00000000000..f33fe44369a --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp @@ -0,0 +1,43 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "fuzzy_term.h" + +namespace search::streaming { + +namespace { + +constexpr bool normalizing_implies_cased(Normalizing norm) noexcept { + return (norm == Normalizing::NONE); +} + +} + +FuzzyTerm::FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term, + const string& index, Type type, Normalizing normalizing, + uint8_t max_edits, uint32_t prefix_size) + : QueryTerm(std::move(result_base), term, index, type, normalizing), + _dfa_matcher(), + _fallback_matcher() +{ + setFuzzyMaxEditDistance(max_edits); + setFuzzyPrefixLength(prefix_size); + + std::string_view term_view(term.data(), term.size()); + const bool cased = normalizing_implies_cased(normalizing); + if (attribute::DfaFuzzyMatcher::supports_max_edits(max_edits)) { + _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_size, cased); + } else { + _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_size, cased); + } +} + +FuzzyTerm::~FuzzyTerm() = default; + +bool FuzzyTerm::is_match(std::string_view term) const { + if (_dfa_matcher) { + return _dfa_matcher->is_match(term); + } else { + return _fallback_matcher->isMatch(term); + } +} + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h new file mode 100644 index 00000000000..c6c88b18969 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h @@ -0,0 +1,34 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "queryterm.h" +#include <vespa/searchlib/attribute/dfa_fuzzy_matcher.h> +#include <vespa/vespalib/fuzzy/fuzzy_matcher.h> +#include <memory> +#include <string_view> + +namespace search::streaming { + +/** + * Query term that matches candidate field terms that are within a query-specified + * maximum number of edits (add, delete or substitute a character), with case + * sensitivity controlled by the provided Normalizing mode. + * + * Optionally, terms may be prefixed-locked, which enforces field terms to have a + * particular prefix and where edits are only counted for the remaining term suffix. + */ +class FuzzyTerm : public QueryTerm { + std::unique_ptr<attribute::DfaFuzzyMatcher> _dfa_matcher; + std::unique_ptr<vespalib::FuzzyMatcher> _fallback_matcher; +public: + FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term, + const string& index, Type type, Normalizing normalizing, + uint8_t max_edits, uint32_t prefix_size); + ~FuzzyTerm() override; + + [[nodiscard]] FuzzyTerm* as_fuzzy_term() noexcept override { return this; } + + [[nodiscard]] bool is_match(std::string_view term) const; +}; + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/hit.h b/searchlib/src/vespa/searchlib/query/streaming/hit.h index a798d293491..1e467a895ac 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/hit.h +++ b/searchlib/src/vespa/searchlib/query/streaming/hit.h @@ -9,19 +9,17 @@ namespace search::streaming { class Hit { public: - Hit(uint32_t pos_, uint32_t context_, uint32_t elemId_, int32_t weight_) noexcept - : _position(pos_ | (context_<<24)), + Hit(uint32_t pos_, uint32_t field_id_, uint32_t elemId_, int32_t weight_) noexcept + : _position(pos_ | (field_id_<<24)), _elemId(elemId_), _weight(weight_) { } int32_t weight() const { return _weight; } uint32_t pos() const { return _position; } uint32_t wordpos() const { return _position & 0xffffff; } - uint32_t context() const { return _position >> 24; } + uint32_t field_id() const noexcept { return _position >> 24; } uint32_t elemId() const { return _elemId; } - bool operator < (const Hit & b) const { return cmp(b) < 0; } private: - int cmp(const Hit & b) const { return _position - b._position; } uint32_t _position; uint32_t _elemId; int32_t _weight; diff --git a/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp index 3e75f4a5114..c164db69ba1 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/in_term.cpp @@ -29,9 +29,9 @@ InTerm::unpack_match_data(uint32_t docid, const ITermData& td, MatchData& match_ for (const auto& term : _terms) { auto& hl = term->evaluateHits(hl_store); for (auto& hit : hl) { - if (!prev_field_id.has_value() || prev_field_id.value() != hit.context()) { - prev_field_id = hit.context(); - matching_field_ids.insert(hit.context()); + if (!prev_field_id.has_value() || prev_field_id.value() != hit.field_id()) { + prev_field_id = hit.field_id(); + matching_field_ids.insert(hit.field_id()); } } } diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp index dd34b9b7e73..f5a09892551 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.cpp @@ -51,11 +51,4 @@ MultiTerm::evaluate() const return false; } -const HitList& -MultiTerm::evaluateHits(HitList& hl) const -{ - hl.clear(); - return hl; -} - } diff --git a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h index 3bb69e29693..6f795c31356 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/multi_term.h +++ b/searchlib/src/vespa/searchlib/query/streaming/multi_term.h @@ -32,7 +32,6 @@ public: MultiTerm* as_multi_term() noexcept override { return this; } void reset() override; bool evaluate() const override; - const HitList& evaluateHits(HitList& hl) const override; virtual void unpack_match_data(uint32_t docid, const fef::ITermData& td, fef::MatchData& match_data) = 0; const std::vector<std::unique_ptr<QueryTerm>>& get_terms() const noexcept { return _terms; } }; diff --git a/searchlib/src/vespa/searchlib/query/streaming/query.cpp b/searchlib/src/vespa/searchlib/query/streaming/query.cpp index ca742aabe26..618922eced9 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/query.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/query.cpp @@ -208,7 +208,7 @@ SameElementQueryNode::evaluateHits(HitList & hl) const currMatchCount++; if ((currMatchCount+1) == numFields) { Hit h = nextHL[indexVector[currMatchCount]]; - hl.emplace_back(0, h.context(), h.elemId(), h.weight()); + hl.emplace_back(0, h.field_id(), h.elemId(), h.weight()); currMatchCount = 0; indexVector[0]++; } @@ -260,26 +260,26 @@ PhraseQueryNode::evaluateHits(HitList & hl) const const auto & currHit = curr->evaluateHits(tmpHL)[currIndex]; size_t firstPosition = currHit.pos(); uint32_t currElemId = currHit.elemId(); - uint32_t currContext = currHit.context(); + uint32_t curr_field_id = currHit.field_id(); const HitList & nextHL = next->evaluateHits(tmpHL); int diff(0); size_t nextIndexMax = nextHL.size(); while ((nextIndex < nextIndexMax) && - ((nextHL[nextIndex].context() < currContext) || - ((nextHL[nextIndex].context() == currContext) && (nextHL[nextIndex].elemId() <= currElemId))) && + ((nextHL[nextIndex].field_id() < curr_field_id) || + ((nextHL[nextIndex].field_id() == curr_field_id) && (nextHL[nextIndex].elemId() <= currElemId))) && ((diff = nextHL[nextIndex].pos()-firstPosition) < 1)) { nextIndex++; } - if ((diff == 1) && (nextHL[nextIndex].context() == currContext) && (nextHL[nextIndex].elemId() == currElemId)) { + if ((diff == 1) && (nextHL[nextIndex].field_id() == curr_field_id) && (nextHL[nextIndex].elemId() == currElemId)) { currPhraseLen++; if ((currPhraseLen+1) == fullPhraseLen) { Hit h = nextHL[indexVector[currPhraseLen]]; hl.push_back(h); - const QueryTerm::FieldInfo & fi = next->getFieldInfo(h.context()); - updateFieldInfo(h.context(), hl.size() - 1, fi.getFieldLength()); + const QueryTerm::FieldInfo & fi = next->getFieldInfo(h.field_id()); + updateFieldInfo(h.field_id(), hl.size() - 1, fi.getFieldLength()); currPhraseLen = 0; indexVector[0]++; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 2ee515f062a..e71529a8aca 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -1,7 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "query.h" +#include "fuzzy_term.h" #include "nearest_neighbor_query_node.h" +#include "query.h" #include "regexp_term.h" #include <vespa/searchlib/parsequery/stackdumpiterator.h> #include <vespa/searchlib/query/streaming/dot_product_term.h> @@ -147,17 +148,16 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor } else { Normalizing normalize_mode = factory.normalizing_mode(ssIndex); std::unique_ptr<QueryTerm> qt; - if (sTerm != TermType::REGEXP) { - qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode); - } else { + if (sTerm == TermType::REGEXP) { qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode); + } else if (sTerm == TermType::FUZZYTERM) { + qt = std::make_unique<FuzzyTerm>(factory.create(), ssTerm, ssIndex, TermType::FUZZYTERM, normalize_mode, + queryRep.getFuzzyMaxEditDistance(), queryRep.getFuzzyPrefixLength()); + } else [[likely]] { + qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode); } qt->setWeight(queryRep.GetWeight()); qt->setUniqueId(queryRep.getUniqueId()); - if (qt->isFuzzy()) { - qt->setFuzzyMaxEditDistance(queryRep.getFuzzyMaxEditDistance()); - qt->setFuzzyPrefixLength(queryRep.getFuzzyPrefixLength()); - } if (allowRewrite && possibleFloat(*qt, ssTerm) && factory.allow_float_terms_rewrite(ssIndex)) { auto phrase = std::make_unique<PhraseQueryNode>(); auto dotPos = ssTerm.find('.'); diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index 3e05d381ee2..fb002ec1867 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -185,4 +185,10 @@ QueryTerm::as_regexp_term() noexcept return nullptr; } +FuzzyTerm* +QueryTerm::as_fuzzy_term() noexcept +{ + return nullptr; +} + } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index cd2bdd7eaec..b4dfa98ebe5 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -11,6 +11,7 @@ namespace search::streaming { +class FuzzyTerm; class NearestNeighborQueryNode; class MultiTerm; class RegexpTerm; @@ -64,7 +65,7 @@ public: QueryTerm & operator = (QueryTerm &&) = delete; ~QueryTerm() override; bool evaluate() const override; - const HitList & evaluateHits(HitList & hl) const override; + const HitList & evaluateHits(HitList & hl) const final override; void reset() override; void getLeaves(QueryTermList & tl) override; void getLeaves(ConstQueryTermList & tl) const override; @@ -95,6 +96,7 @@ public: virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept; virtual MultiTerm* as_multi_term() noexcept; virtual RegexpTerm* as_regexp_term() noexcept; + virtual FuzzyTerm* as_fuzzy_term() noexcept; protected: using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>; string _index; diff --git a/searchlib/src/vespa/searchlib/query/streaming/weighted_set_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/weighted_set_term.cpp index 90d0be5d43c..d2d706eef3d 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/weighted_set_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/weighted_set_term.cpp @@ -25,7 +25,7 @@ WeightedSetTerm::unpack_match_data(uint32_t docid, const ITermData& td, MatchDat for (const auto& term : _terms) { auto& hl = term->evaluateHits(hl_store); for (auto& hit : hl) { - scores[hit.context()].emplace_back(term->weight().percent()); + scores[hit.field_id()].emplace_back(term->weight().percent()); } } auto num_fields = td.numFields(); diff --git a/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h b/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h index e49fcbcb5bc..c74c2d4e9a7 100644 --- a/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/dot_product_search.h @@ -27,6 +27,7 @@ protected: public: static constexpr bool filter_search = false; static constexpr bool require_btree_iterators = true; + static constexpr bool supports_hash_filter = false; // TODO: use MultiSearch::Children to pass ownership static SearchIterator::UP create(const std::vector<SearchIterator*> &children, diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp index 0c57b21aba6..0be014474d0 100644 --- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.cpp @@ -1,10 +1,13 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "weighted_set_term_search.h" +#include <vespa/searchcommon/attribute/i_search_context.h> +#include <vespa/searchcommon/attribute/iattributevector.h> +#include <vespa/searchlib/attribute/i_direct_posting_store.h> +#include <vespa/searchlib/attribute/multi_term_hash_filter.hpp> #include <vespa/searchlib/common/bitvector.h> -#include <vespa/searchlib/attribute/multi_term_or_filter_search.h> #include <vespa/vespalib/objects/visit.h> -#include <vespa/searchcommon/attribute/i_search_context.h> +#include <vespa/vespalib/stllike/hash_map.hpp> #include "iterator_pack.h" #include "blueprint.h" @@ -237,4 +240,98 @@ WeightedSetTermSearch::create(fef::TermFieldMatchData &tmd, return create_helper_resolve_pack<DocidWithWeightIterator, DocidWithWeightIteratorPack>(tmd, is_filter_search, std::move(weights), std::move(iterators)); } +namespace { + +class HashFilterWrapper { +protected: + const attribute::IAttributeVector& _attr; +public: + HashFilterWrapper(const attribute::IAttributeVector& attr) : _attr(attr) {} +}; + +template <bool unpack_weights_t> +class StringHashFilterWrapper : public HashFilterWrapper { +public: + using TokenT = attribute::IAttributeVector::EnumHandle; + static constexpr bool unpack_weights = unpack_weights_t; + StringHashFilterWrapper(const attribute::IAttributeVector& attr) + : HashFilterWrapper(attr) + {} + auto mapToken(const IDirectPostingStore::LookupResult& term, const IDirectPostingStore& store, vespalib::datastore::EntryRef dict_snapshot) const { + std::vector<TokenT> result; + store.collect_folded(term.enum_idx, dict_snapshot, [&](vespalib::datastore::EntryRef ref) { result.emplace_back(ref.ref()); }); + return result; + } + TokenT getToken(uint32_t docid) const { + return _attr.getEnum(docid); + } +}; + +template <bool unpack_weights_t> +class IntegerHashFilterWrapper : public HashFilterWrapper { +public: + using TokenT = attribute::IAttributeVector::largeint_t; + static constexpr bool unpack_weights = unpack_weights_t; + IntegerHashFilterWrapper(const attribute::IAttributeVector& attr) + : HashFilterWrapper(attr) + {} + auto mapToken(const IDirectPostingStore::LookupResult& term, + const IDirectPostingStore& store, + vespalib::datastore::EntryRef) const { + std::vector<TokenT> result; + result.emplace_back(store.get_integer_value(term.enum_idx)); + return result; + } + TokenT getToken(uint32_t docid) const { + return _attr.getInt(docid); + } +}; + +template <typename WrapperType> +SearchIterator::UP +create_hash_filter_helper(fef::TermFieldMatchData& tfmd, + const std::vector<int32_t>& weights, + const std::vector<IDirectPostingStore::LookupResult>& terms, + const attribute::IAttributeVector& attr, + const IDirectPostingStore& posting_store, + vespalib::datastore::EntryRef dict_snapshot) +{ + using FilterType = attribute::MultiTermHashFilter<WrapperType>; + typename FilterType::TokenMap tokens; + WrapperType wrapper(attr); + for (size_t i = 0; i < terms.size(); ++i) { + for (auto token : wrapper.mapToken(terms[i], posting_store, dict_snapshot)) { + tokens[token] = weights[i]; + } + } + return std::make_unique<FilterType>(tfmd, wrapper, std::move(tokens)); +} + +} + +SearchIterator::UP +WeightedSetTermSearch::create_hash_filter(search::fef::TermFieldMatchData& tmd, + bool is_filter_search, + const std::vector<int32_t>& weights, + const std::vector<IDirectPostingStore::LookupResult>& terms, + const attribute::IAttributeVector& attr, + const IDirectPostingStore& posting_store, + vespalib::datastore::EntryRef dict_snapshot) +{ + if (attr.isStringType()) { + if (is_filter_search) { + return create_hash_filter_helper<StringHashFilterWrapper<false>>(tmd, weights, terms, attr, posting_store, dict_snapshot); + } else { + return create_hash_filter_helper<StringHashFilterWrapper<true>>(tmd, weights, terms, attr, posting_store, dict_snapshot); + } + } else { + assert(attr.isIntegerType()); + if (is_filter_search) { + return create_hash_filter_helper<IntegerHashFilterWrapper<false>>(tmd, weights, terms, attr, posting_store, dict_snapshot); + } else { + return create_hash_filter_helper<IntegerHashFilterWrapper<true>>(tmd, weights, terms, attr, posting_store, dict_snapshot); + } + } +} + } diff --git a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h index 7e47928fb90..d078fd5babc 100644 --- a/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/weighted_set_term_search.h @@ -11,6 +11,8 @@ #include <variant> #include <vector> +namespace search::attribute { class IAttributeVector; } + namespace search::fef { class TermFieldMatchData; } namespace search::queryeval { @@ -30,6 +32,8 @@ public: static constexpr bool filter_search = false; // Whether this iterator requires btree iterators for all tokens/terms used by the operator. static constexpr bool require_btree_iterators = false; + // Whether this supports creating a hash filter iterator; + static constexpr bool supports_hash_filter = true; // TODO: pass ownership with unique_ptr static SearchIterator::UP create(const std::vector<SearchIterator *> &children, @@ -48,6 +52,14 @@ public: std::variant<std::reference_wrapper<const std::vector<int32_t>>, std::vector<int32_t>> weights, std::vector<DocidWithWeightIterator> &&iterators); + static SearchIterator::UP create_hash_filter(search::fef::TermFieldMatchData& tmd, + bool is_filter_search, + const std::vector<int32_t>& weights, + const std::vector<IDirectPostingStore::LookupResult>& terms, + const attribute::IAttributeVector& attr, + const IDirectPostingStore& posting_store, + vespalib::datastore::EntryRef dictionary_snapshot); + // used during docsum fetching to identify matching elements // initRange must be called before use. // doSeek/doUnpack must not be called. |