diff options
Diffstat (limited to 'searchlib')
23 files changed, 171 insertions, 93 deletions
diff --git a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp index 8ba8c62c5ff..55810c02550 100644 --- a/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp +++ b/searchlib/src/tests/attribute/dfa_fuzzy_matcher/dfa_fuzzy_matcher_test.cpp @@ -120,11 +120,12 @@ struct MatchStats { template <bool collect_matches> void -brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) +brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, + bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - FuzzyMatcher matcher(target, 2, prefix_size, cased); + FuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match); auto itr = view.begin(); size_t matches = 0; size_t seeks = 0; @@ -144,11 +145,12 @@ brute_force_fuzzy_match_in_dictionary(std::string_view target, const StringEnumS template <bool collect_matches> void -dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) +dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, + bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit); + DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit); Utf8Reader reader(vespalib::stringref(target.data(), target.size())); std::string target_copy; Utf8Writer<std::string> writer(target_copy); @@ -187,11 +189,12 @@ dfa_fuzzy_match_in_dictionary(std::string_view target, const StringEnumStore& st template <bool collect_matches> void -dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, bool cased, MatchStats& stats, StringVector& matched_words) +dfa_fuzzy_match_in_dictionary_no_skip(std::string_view target, const StringEnumStore& store, uint32_t prefix_size, + bool cased, bool prefix_match, MatchStats& stats, StringVector& matched_words) { auto view = store.get_dictionary().get_posting_dictionary().getFrozenView(); vespalib::Timer timer; - DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, LevenshteinDfa::DfaType::Explicit); + DfaFuzzyMatcher matcher(target, 2, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Explicit); auto itr = view.begin(); size_t matches = 0; size_t seeks = 0; @@ -247,20 +250,23 @@ struct DfaFuzzyMatcherTest : public ::testing::TestWithParam<TestParam> { updater.commit(); store.freeze_dictionary(); } - void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) { + void expect_prefix_matches(std::string_view target, uint32_t prefix_size, bool prefix_match, const StringVector& exp_matches) { MatchStats stats; StringVector brute_force_matches; StringVector dfa_matches; StringVector dfa_no_skip_matches; bool cased = GetParam()._cased; SCOPED_TRACE(target); - brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, brute_force_matches); - dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, stats, dfa_matches); - dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, stats, dfa_no_skip_matches); + brute_force_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, brute_force_matches); + dfa_fuzzy_match_in_dictionary<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_matches); + dfa_fuzzy_match_in_dictionary_no_skip<true>(target, store, prefix_size, cased, prefix_match, stats, dfa_no_skip_matches); EXPECT_EQ(exp_matches, brute_force_matches); EXPECT_EQ(exp_matches, dfa_matches); EXPECT_EQ(exp_matches, dfa_no_skip_matches); } + void expect_prefix_matches(std::string_view target, uint32_t prefix_size, const StringVector& exp_matches) { + expect_prefix_matches(target, prefix_size, false, exp_matches); + } void expect_matches(std::string_view target, const StringVector& exp_matches) { expect_prefix_matches(target, 0, exp_matches); } @@ -284,11 +290,12 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary) expect_matches("forcecast", {"forecast"}); } -TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) +TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_lock_length) { bool cased = GetParam()._cased; StringVector words = { "board", "boat", "bob", "door", "food", "foot", "football", "foothill", - "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")}; + "for", "forbid", "force", "ford", "forearm", "forecast", "forest", "H", "HA", "h", "ha", + char_from_u8(u8"Ørn"), char_from_u8(u8"øre"), char_from_u8(u8"Ås"), char_from_u8(u8"ås")}; populate_dictionary(words); expect_prefix_matches("a", 1, {}); expect_prefix_matches("b", 1, {"bob"}); @@ -321,6 +328,25 @@ TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_size) } } +TEST_P(DfaFuzzyMatcherTest, fuzzy_match_in_dictionary_with_prefix_matching) { + // We include the empty string to make "everything matches"-checks easier since + // the dictionary implicitly returns such an empty string sentinel already. + StringVector words = {"", "board", "boat", "bob", "door", "food", "foot", "football", "foothill", + "for", "forbid", "force", "ford", "forearm", "forecast", "forest"}; + populate_dictionary(words); + expect_prefix_matches("bo", 0, true, words); // matches everything + expect_prefix_matches("bo", 2, true, {"board", "boat", "bob"}); + expect_prefix_matches("boar", 0, true, {"board", "boat", "bob", "door", "for", "forbid", + "force", "ford", "forearm", "forecast", "forest"}); + expect_prefix_matches("board", 0, true, {"board", "boat", "ford"}); + expect_prefix_matches("footb", 0, true, {"food", "foot", "football", "foothill", "forbid"}); + expect_prefix_matches("footba", 0, true, {"foot", "football", "foothill"}); + expect_prefix_matches("footbal", 0, true, {"football", "foothill"}); + expect_prefix_matches("football", 0, true, {"football", "foothill"}); + expect_prefix_matches("footballs", 0, true, {"football"}); + expect_prefix_matches("z", 1, true, {}); +} + void benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDictionary& dict, size_t words_to_match, bool cased, bool dfa_algorithm) { @@ -329,9 +355,9 @@ benchmark_fuzzy_match_in_dictionary(const StringEnumStore& store, const RawDicti for (size_t i = 0; i < std::min(words_to_match, dict.size()); ++i) { const auto& entry = dict[i]; if (dfa_algorithm) { - dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy); + dfa_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy); } else { - brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, stats, dummy); + brute_force_fuzzy_match_in_dictionary<false>(entry.first, store, 0, cased, false, stats, dummy); } } std::cout << (dfa_algorithm ? "DFA:" : "Brute force:") << " samples=" << stats.samples << ", avg_matches=" << stats.avg_matches() << ", avg_seeks=" << stats.avg_seeks() << ", avg_elapsed_ms=" << stats.avg_elapsed_ms() << std::endl; diff --git a/searchlib/src/tests/query/customtypevisitor_test.cpp b/searchlib/src/tests/query/customtypevisitor_test.cpp index 815ff3bf1fb..fb89f2ef061 100644 --- a/searchlib/src/tests/query/customtypevisitor_test.cpp +++ b/searchlib/src/tests/query/customtypevisitor_test.cpp @@ -37,7 +37,7 @@ struct MyRangeTerm : InitTerm<RangeTerm> {}; struct MyStringTerm : InitTerm<StringTerm> {}; struct MySubstrTerm : InitTerm<SubstringTerm> {}; struct MySuffixTerm : InitTerm<SuffixTerm> {}; -struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0) {} }; +struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(): FuzzyTerm("term", "view", 0, Weight(0), 2, 0, false) {} }; struct MyWeakAnd : WeakAnd { MyWeakAnd() : WeakAnd(1234, "view") {} }; struct MyWeightedSetTerm : WeightedSetTerm { MyWeightedSetTerm() : WeightedSetTerm(0, "view", 0, Weight(42)) {} }; struct MyDotProduct : DotProduct { MyDotProduct() : DotProduct(0, "view", 0, Weight(42)) {} }; diff --git a/searchlib/src/tests/query/query_visitor_test.cpp b/searchlib/src/tests/query/query_visitor_test.cpp index b0c9c4e570e..bfce382b684 100644 --- a/searchlib/src/tests/query/query_visitor_test.cpp +++ b/searchlib/src/tests/query/query_visitor_test.cpp @@ -88,7 +88,7 @@ TEST("requireThatAllNodesCanBeVisited") { checkVisit<NearestNeighborTerm>(new SimpleNearestNeighborTerm("query_tensor", "doc_tensor", 0, Weight(0), 123, true, 321, 100100.25)); checkVisit<TrueQueryNode>(new SimpleTrue()); checkVisit<FalseQueryNode>(new SimpleFalse()); - checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0)); + checkVisit<FuzzyTerm>(new SimpleFuzzyTerm("t", "field", 0, Weight(0), 2, 0, false)); checkVisit<InTerm>(new SimpleInTerm(std::make_unique<StringTermVector>(0), MultiTerm::Type::STRING, "field", 0, Weight(0))); } diff --git a/searchlib/src/tests/query/querybuilder_test.cpp b/searchlib/src/tests/query/querybuilder_test.cpp index be795c8c94b..c70fe5b510b 100644 --- a/searchlib/src/tests/query/querybuilder_test.cpp +++ b/searchlib/src/tests/query/querybuilder_test.cpp @@ -117,7 +117,7 @@ Node::UP createQueryTree() { builder.add_true_node(); builder.add_false_node(); } - builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1); + builder.addFuzzyTerm(str[5], view[5], id[5], weight[5], 3, 1, false); } Node::UP node = builder.build(); ASSERT_TRUE(node.get()); @@ -326,8 +326,8 @@ void checkQueryTreeTypes(Node *node) { auto* fuzzy_term = as_node<FuzzyTerm>(and_node->getChildren()[12]); EXPECT_TRUE(checkTerm(fuzzy_term, str[5], view[5], id[5], weight[5])); - EXPECT_EQUAL(3u, fuzzy_term->getMaxEditDistance()); - EXPECT_EQUAL(1u, fuzzy_term->getPrefixLength()); + EXPECT_EQUAL(3u, fuzzy_term->max_edit_distance()); + EXPECT_EQUAL(1u, fuzzy_term->prefix_lock_length()); } struct AbstractTypes { @@ -452,8 +452,9 @@ struct MyTrue : TrueQueryNode {}; struct MyFalse : FalseQueryNode {}; struct MyFuzzyTerm : FuzzyTerm { MyFuzzyTerm(const Type &t, const string &f, int32_t i, Weight w, - uint32_t m, uint32_t p) - : FuzzyTerm(t, f, i, w, m, p) { + uint32_t m, uint32_t p, bool prefix_match) + : FuzzyTerm(t, f, i, w, m, p, prefix_match) + { } }; struct MyInTerm : InTerm { @@ -645,23 +646,27 @@ TEST("require that All Range Syntaxes Work") { EXPECT_TRUE(range2 == range_term->getTerm()); } -TEST("require that fuzzy node can be created") { - QueryBuilder<SimpleQueryNodeTypes> builder; - builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1); - Node::UP node = builder.build(); +TEST("fuzzy node can be created") { + for (bool prefix_match : {false, true}) { + QueryBuilder<SimpleQueryNodeTypes> builder; + builder.addFuzzyTerm("term", "view", 0, Weight(0), 3, 1, prefix_match); + Node::UP node = builder.build(); - string stackDump = StackDumpCreator::create(*node); - { - SimpleQueryStackDumpIterator iterator(stackDump); - Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator); - FuzzyTerm *fuzzy_node = as_node<FuzzyTerm>(new_node.get()); - EXPECT_EQUAL(3u, fuzzy_node->getMaxEditDistance()); - EXPECT_EQUAL(1u, fuzzy_node->getPrefixLength()); - } - { - search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump); - EXPECT_EQUAL(3u, queryTermSimple->getFuzzyMaxEditDistance()); - EXPECT_EQUAL(1u, queryTermSimple->getFuzzyPrefixLength()); + string stackDump = StackDumpCreator::create(*node); + { + SimpleQueryStackDumpIterator iterator(stackDump); + Node::UP new_node = QueryTreeCreator<SimpleQueryNodeTypes>::create(iterator); + auto *fuzzy_node = as_node<FuzzyTerm>(new_node.get()); + EXPECT_EQUAL(3u, fuzzy_node->max_edit_distance()); + EXPECT_EQUAL(1u, fuzzy_node->prefix_lock_length()); + EXPECT_EQUAL(prefix_match, fuzzy_node->prefix_match()); + } + { + search::QueryTermSimple::UP queryTermSimple = search::QueryTermDecoder::decodeTerm(stackDump); + EXPECT_EQUAL(3u, queryTermSimple->fuzzy_max_edit_distance()); + EXPECT_EQUAL(1u, queryTermSimple->fuzzy_prefix_lock_length()); + EXPECT_EQUAL(prefix_match, queryTermSimple->fuzzy_prefix_match()); + } } } diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp index 5f3ab9cd3d8..3ae2bdd3598 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.cpp @@ -44,8 +44,12 @@ extract_suffix(std::string_view target, uint32_t prefix_size) } -DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, LevenshteinDfa::DfaType dfa_type) - : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), dfa_type)), +DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, + bool cased, bool prefix_match, LevenshteinDfa::DfaType dfa_type) + : _dfa(vespalib::fuzzy::LevenshteinDfa::build(extract_suffix(target, prefix_size), max_edits, + (cased ? LevenshteinDfa::Casing::Cased : LevenshteinDfa::Casing::Uncased), + dfa_type, // TODO reorder args + (prefix_match ? LevenshteinDfa::Matching::Prefix : LevenshteinDfa::Matching::FullString))), _successor(), _prefix(extract_prefix(target, prefix_size, cased)), _prefix_size(prefix_size), @@ -54,8 +58,8 @@ DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uin _successor = _prefix; } -DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased) - : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, LevenshteinDfa::DfaType::Table) +DfaFuzzyMatcher::DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match) + : DfaFuzzyMatcher(target, max_edits, prefix_size, cased, prefix_match, LevenshteinDfa::DfaType::Table) { } diff --git a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h index 653af602c0d..ee57e9e6583 100644 --- a/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h +++ b/searchlib/src/vespa/searchlib/attribute/dfa_fuzzy_matcher.h @@ -26,8 +26,10 @@ private: const char* skip_prefix(const char* word) const; public: - DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); - DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased); // Defaults to table-based DFA + DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match, + vespalib::fuzzy::LevenshteinDfa::DfaType dfa_type); + // Defaults to table-based DFA: + DfaFuzzyMatcher(std::string_view target, uint8_t max_edits, uint32_t prefix_size, bool cased, bool prefix_match); ~DfaFuzzyMatcher(); [[nodiscard]] static constexpr bool supports_max_edits(uint8_t edits) noexcept { diff --git a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp index f1a643dc376..c6cb6e4a637 100644 --- a/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp +++ b/searchlib/src/vespa/searchlib/attribute/string_search_helper.cpp @@ -49,17 +49,19 @@ StringSearchHelper::StringSearchHelper(QueryTermUCS4 & term, bool cased, vespali ? vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::None) : vespalib::Regex::from_pattern(term.getTerm(), vespalib::Regex::Options::IgnoreCase); } else if (isFuzzy()) { - auto max_edit_dist = term.getFuzzyMaxEditDistance(); + const auto max_edit_dist = term.fuzzy_max_edit_distance(); _fuzzyMatcher = std::make_unique<vespalib::FuzzyMatcher>(term.getTerm(), max_edit_dist, - term.getFuzzyPrefixLength(), - isCased()); + term.fuzzy_prefix_lock_length(), + isCased(), + term.fuzzy_prefix_match()); if ((fuzzy_matching_algorithm != FMA::BruteForce) && (max_edit_dist > 0 && max_edit_dist <= 2)) { _dfa_fuzzy_matcher = std::make_unique<DfaFuzzyMatcher>(term.getTerm(), max_edit_dist, - term.getFuzzyPrefixLength(), + term.fuzzy_prefix_lock_length(), isCased(), + term.fuzzy_prefix_match(), to_dfa_type(fuzzy_matching_algorithm)); } } else if (isCased()) { diff --git a/searchlib/src/vespa/searchlib/parsequery/parse.h b/searchlib/src/vespa/searchlib/parsequery/parse.h index 5e3b1dffe3a..a05c68bea46 100644 --- a/searchlib/src/vespa/searchlib/parsequery/parse.h +++ b/searchlib/src/vespa/searchlib/parsequery/parse.h @@ -87,6 +87,8 @@ public: IFLAG_NORANK = 0x00000001, // this term should not be ranked (not exposed to rank framework) IFLAG_SPECIALTOKEN = 0x00000002, IFLAG_NOPOSITIONDATA = 0x00000004, // we should not use position data when ranking this term + IFLAG_FILTER = 0x00000008, // see GetCreator(flags) below + IFLAG_PREFIX_MATCH = 0x00000010 }; /** Extra information on each item (creator id) coded in bit 3 of flags */ diff --git a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h index d15e1ca0512..bf6e3abf1ea 100644 --- a/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h +++ b/searchlib/src/vespa/searchlib/parsequery/stackdumpiterator.h @@ -113,9 +113,18 @@ public: uint32_t getUniqueId() const noexcept { return _currUniqueId; } // Get the flags of the current item. - bool hasNoRankFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_NORANK) != 0; } - bool hasSpecialTokenFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_SPECIALTOKEN) != 0; } - bool hasNoPositionDataFlag() const noexcept { return (_currFlags & ParseItem::IFLAG_NOPOSITIONDATA) != 0; } + [[nodiscard]] bool hasNoRankFlag() const noexcept { + return (_currFlags & ParseItem::IFLAG_NORANK) != 0; + } + [[nodiscard]] bool hasSpecialTokenFlag() const noexcept { + return (_currFlags & ParseItem::IFLAG_SPECIALTOKEN) != 0; + } + [[nodiscard]] bool hasNoPositionDataFlag() const noexcept { + return (_currFlags & ParseItem::IFLAG_NOPOSITIONDATA) != 0; + } + [[nodiscard]] bool has_prefix_match_semantics() const noexcept { + return (_currFlags & ParseItem::IFLAG_PREFIX_MATCH) != 0; + } uint32_t getArity() const noexcept { return _currArity; } @@ -127,9 +136,9 @@ public: bool getAllowApproximate() const noexcept { return (_extraIntArg2 != 0); } uint32_t getExploreAdditionalHits() const noexcept { return _extraIntArg3; } - // fuzzy match arguments - uint32_t getFuzzyMaxEditDistance() const noexcept { return _extraIntArg1; } - uint32_t getFuzzyPrefixLength() const noexcept { return _extraIntArg2; } + // fuzzy match arguments (see also: has_prefix_match_semantics() for fuzzy prefix matching) + [[nodiscard]] uint32_t fuzzy_max_edit_distance() const noexcept { return _extraIntArg1; } + [[nodiscard]] uint32_t fuzzy_prefix_lock_length() const noexcept { return _extraIntArg2; } std::unique_ptr<query::PredicateQueryTerm> getPredicateQueryTerm(); std::unique_ptr<query::TermVector> get_terms(); diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp index 060cd5015b3..09fc443cf0e 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp @@ -248,10 +248,11 @@ QueryTermSimple::QueryTermSimple(const string & term_, Type type) _type(type), _diversityCutoffStrict(false), _valid(true), + _fuzzy_prefix_match(false), _term(term_), _diversityAttribute(), - _fuzzyMaxEditDistance(2), - _fuzzyPrefixLength(0) + _fuzzy_max_edit_distance(2), + _fuzzy_prefix_lock_length(0) { if (isFullRange(_term)) { stringref rest(_term.c_str() + 1, _term.size() - 2); diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.h b/searchlib/src/vespa/searchlib/query/query_term_simple.h index a740afb0340..b0ee7aa54bc 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.h +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.h @@ -44,8 +44,9 @@ public: size_t getDiversityCutoffGroups() const noexcept { return _diversityCutoffGroups; } bool getDiversityCutoffStrict() const noexcept { return _diversityCutoffStrict; } vespalib::stringref getDiversityAttribute() const noexcept { return _diversityAttribute; } - size_t getFuzzyMaxEditDistance() const noexcept { return _fuzzyMaxEditDistance; } - size_t getFuzzyPrefixLength() const noexcept { return _fuzzyPrefixLength; } + [[nodiscard]] size_t fuzzy_max_edit_distance() const noexcept { return _fuzzy_max_edit_distance; } + [[nodiscard]] size_t fuzzy_prefix_lock_length() const noexcept { return _fuzzy_prefix_lock_length; } + [[nodiscard]] bool fuzzy_prefix_match() const noexcept { return _fuzzy_prefix_match; } bool getAsIntegerTerm(int64_t & lower, int64_t & upper) const noexcept; bool getAsFloatTerm(double & lower, double & upper) const noexcept; bool getAsFloatTerm(float & lower, float & upper) const noexcept; @@ -77,14 +78,17 @@ private: Type _type; bool _diversityCutoffStrict; bool _valid; +protected: + bool _fuzzy_prefix_match; // set in QueryTerm +private: string _term; stringref _diversityAttribute; template <typename T, typename D> bool getAsNumericTerm(T & lower, T & upper, D d) const noexcept; protected: - uint32_t _fuzzyMaxEditDistance; // set in QueryTerm - uint32_t _fuzzyPrefixLength; + uint32_t _fuzzy_max_edit_distance; // set in QueryTerm + uint32_t _fuzzy_prefix_lock_length; }; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp index f33fe44369a..8a4ec184de8 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp @@ -13,20 +13,21 @@ constexpr bool normalizing_implies_cased(Normalizing norm) noexcept { FuzzyTerm::FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term, const string& index, Type type, Normalizing normalizing, - uint8_t max_edits, uint32_t prefix_size) + uint8_t max_edits, uint32_t prefix_lock_length, bool prefix_match) : QueryTerm(std::move(result_base), term, index, type, normalizing), _dfa_matcher(), _fallback_matcher() { - setFuzzyMaxEditDistance(max_edits); - setFuzzyPrefixLength(prefix_size); + set_fuzzy_max_edit_distance(max_edits); + set_fuzzy_prefix_lock_length(prefix_lock_length); + set_fuzzy_prefix_match(prefix_match); std::string_view term_view(term.data(), term.size()); const bool cased = normalizing_implies_cased(normalizing); if (attribute::DfaFuzzyMatcher::supports_max_edits(max_edits)) { - _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_size, cased); + _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_lock_length, cased, prefix_match); } else { - _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_size, cased); + _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_lock_length, cased, prefix_match); } } diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h index c6c88b18969..552a839cb3b 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h +++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h @@ -23,7 +23,7 @@ class FuzzyTerm : public QueryTerm { public: FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term, const string& index, Type type, Normalizing normalizing, - uint8_t max_edits, uint32_t prefix_size); + uint8_t max_edits, uint32_t prefix_lock_length, bool prefix_match); ~FuzzyTerm() override; [[nodiscard]] FuzzyTerm* as_fuzzy_term() noexcept override { return this; } diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 94a479fd2d3..68e276d0fde 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -130,7 +130,8 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode); } else if (sTerm == TermType::FUZZYTERM) { qt = std::make_unique<FuzzyTerm>(factory.create(), ssTerm, ssIndex, TermType::FUZZYTERM, normalize_mode, - queryRep.getFuzzyMaxEditDistance(), queryRep.getFuzzyPrefixLength()); + queryRep.fuzzy_max_edit_distance(), queryRep.fuzzy_prefix_lock_length(), + queryRep.has_prefix_match_semantics()); } else [[likely]] { qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode); } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index 05b12804d52..0e0ab98295b 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -100,8 +100,9 @@ public: void visitMembers(vespalib::ObjectVisitor &visitor) const override; void setIndex(const string & index_) override { _index = index_; } const string & getIndex() const override { return _index; } - void setFuzzyMaxEditDistance(uint32_t fuzzyMaxEditDistance) { _fuzzyMaxEditDistance = fuzzyMaxEditDistance; } - void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; } + void set_fuzzy_max_edit_distance(uint32_t fuzzy_max_edit_distance) noexcept { _fuzzy_max_edit_distance = fuzzy_max_edit_distance; } + void set_fuzzy_prefix_lock_length(uint32_t fuzzy_prefix_length) noexcept { _fuzzy_prefix_lock_length = fuzzy_prefix_length; } + void set_fuzzy_prefix_match(bool prefix_match) noexcept { _fuzzy_prefix_match = prefix_match; } virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept; virtual MultiTerm* as_multi_term() noexcept; virtual const MultiTerm* as_multi_term() const noexcept; diff --git a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h index 41990af6908..a498076bb09 100644 --- a/searchlib/src/vespa/searchlib/query/tree/querybuilder.h +++ b/searchlib/src/vespa/searchlib/query/tree/querybuilder.h @@ -226,8 +226,8 @@ create_nearest_neighbor_term(vespalib::stringref query_tensor_name, vespalib::st template <class NodeTypes> typename NodeTypes::FuzzyTerm * createFuzzyTerm(vespalib::stringref term, vespalib::stringref view, int32_t id, Weight weight, - uint32_t maxEditDistance, uint32_t prefixLength) { - return new typename NodeTypes::FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength); + uint32_t max_edit_distance, uint32_t prefix_lock_length, bool prefix_match) { + return new typename NodeTypes::FuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match); } template <class NodeTypes> @@ -343,9 +343,10 @@ public: return addTerm(createRegExpTerm<NodeTypes>(term, view, id, weight)); } typename NodeTypes::FuzzyTerm &addFuzzyTerm(stringref term, stringref view, int32_t id, Weight weight, - uint32_t maxEditDistance, uint32_t prefixLength) { + uint32_t max_edit_distance, uint32_t prefix_lock_length, + bool prefix_match) { adjustWeight(weight); - return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, maxEditDistance, prefixLength)); + return addTerm(createFuzzyTerm<NodeTypes>(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match)); } typename NodeTypes::NearestNeighborTerm &add_nearest_neighbor_term(stringref query_tensor_name, stringref field_name, int32_t id, Weight weight, uint32_t target_num_hits, diff --git a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h index f578bb57e21..116e677d439 100644 --- a/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h +++ b/searchlib/src/vespa/searchlib/query/tree/queryreplicator.h @@ -201,7 +201,8 @@ private: replicate(node, _builder.addFuzzyTerm( node.getTerm(), node.getView(), node.getId(), node.getWeight(), - node.getMaxEditDistance(), node.getPrefixLength())); + node.max_edit_distance(), node.prefix_lock_length(), + node.prefix_match())); } std::unique_ptr<TermVector> replicate_subterms(const InTerm& original) { diff --git a/searchlib/src/vespa/searchlib/query/tree/simplequery.h b/searchlib/src/vespa/searchlib/query/tree/simplequery.h index 535402bf14d..34bc39ae504 100644 --- a/searchlib/src/vespa/searchlib/query/tree/simplequery.h +++ b/searchlib/src/vespa/searchlib/query/tree/simplequery.h @@ -163,8 +163,10 @@ struct SimpleNearestNeighborTerm : NearestNeighborTerm { struct SimpleFuzzyTerm : FuzzyTerm { SimpleFuzzyTerm(const Type &term, vespalib::stringref view, int32_t id, Weight weight, - uint32_t maxEditDistance, uint32_t prefixLength) - : FuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength) { + uint32_t max_edit_distance, uint32_t prefix_lock_length, + bool prefix_match) + : FuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match) + { } ~SimpleFuzzyTerm() override; }; diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp index 6707712ee69..b71a86a214a 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpcreator.cpp @@ -225,6 +225,9 @@ class QueryNodeConverter : public QueryVisitor { if (!node.usePositionData()) { flags |= ParseItem::IFLAG_NOPOSITIONDATA; } + if (node.prefix_match()) { + flags |= ParseItem::IFLAG_PREFIX_MATCH; + } if (flags != 0) { features |= ParseItem::IF_FLAGS; } @@ -289,8 +292,8 @@ class QueryNodeConverter : public QueryVisitor { void visit(FuzzyTerm &node) override { createTerm(node, ParseItem::ITEM_FUZZY); - appendCompressedPositiveNumber(node.getMaxEditDistance()); - appendCompressedPositiveNumber(node.getPrefixLength()); + appendCompressedPositiveNumber(node.max_edit_distance()); + appendCompressedPositiveNumber(node.prefix_lock_length()); } void visit(NearestNeighborTerm &node) override { diff --git a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h index 7c743a72567..6f7d7ddd9ad 100644 --- a/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h +++ b/searchlib/src/vespa/searchlib/query/tree/stackdumpquerycreator.h @@ -42,6 +42,9 @@ public: if (queryStack.hasNoPositionDataFlag()) { t->setPositionData(false); } + if (queryStack.has_prefix_match_semantics()) [[unlikely]] { + t->set_prefix_match(true); + } } } if (builder.hasError()) { @@ -185,9 +188,10 @@ private: } else if (type == ParseItem::ITEM_REGEXP) { t = &builder.addRegExpTerm(term, view, id, weight); } else if (type == ParseItem::ITEM_FUZZY) { - uint32_t maxEditDistance = queryStack.getFuzzyMaxEditDistance(); - uint32_t prefixLength = queryStack.getFuzzyPrefixLength(); - t = &builder.addFuzzyTerm(term, view, id, weight, maxEditDistance, prefixLength); + uint32_t max_edit_distance = queryStack.fuzzy_max_edit_distance(); + uint32_t prefix_lock_length = queryStack.fuzzy_prefix_lock_length(); + bool prefix_match = queryStack.has_prefix_match_semantics(); + t = &builder.addFuzzyTerm(term, view, id, weight, max_edit_distance, prefix_lock_length, prefix_match); } else if (type == ParseItem::ITEM_STRING_IN) { t = &builder.add_in_term(queryStack.get_terms(), MultiTerm::Type::STRING, view, id, weight); } else if (type == ParseItem::ITEM_NUMERIC_IN) { diff --git a/searchlib/src/vespa/searchlib/query/tree/term.cpp b/searchlib/src/vespa/searchlib/query/tree/term.cpp index 45ba259a799..35f6ea5c8f5 100644 --- a/searchlib/src/vespa/searchlib/query/tree/term.cpp +++ b/searchlib/src/vespa/searchlib/query/tree/term.cpp @@ -12,12 +12,14 @@ Term::Term(vespalib::stringref view, int32_t id, Weight weight) : _id(id), _weight(weight), _ranked(true), - _position_data(true) + _position_data(true), + _prefix_match(false) { } void Term::setStateFrom(const Term& other) { setRanked(other.isRanked()); setPositionData(other.usePositionData()); + set_prefix_match(other.prefix_match()); // too late to copy this state: assert(_view == other.getView()); assert(_id == other.getId()); diff --git a/searchlib/src/vespa/searchlib/query/tree/term.h b/searchlib/src/vespa/searchlib/query/tree/term.h index 2f57c3cb06d..678d018d111 100644 --- a/searchlib/src/vespa/searchlib/query/tree/term.h +++ b/searchlib/src/vespa/searchlib/query/tree/term.h @@ -18,6 +18,7 @@ class Term Weight _weight; bool _ranked; bool _position_data; + bool _prefix_match; public: virtual ~Term() = 0; @@ -25,14 +26,17 @@ public: void setView(const vespalib::string & view) { _view = view; } void setRanked(bool ranked) noexcept { _ranked = ranked; } void setPositionData(bool position_data) noexcept { _position_data = position_data; } + // Used for fuzzy prefix matching. Not to be confused with distinct Prefix query term type + void set_prefix_match(bool prefix_match) noexcept { _prefix_match = prefix_match; } void setStateFrom(const Term& other); const vespalib::string & getView() const noexcept { return _view; } Weight getWeight() const noexcept { return _weight; } int32_t getId() const noexcept { return _id; } - bool isRanked() const noexcept { return _ranked; } - bool usePositionData() const noexcept { return _position_data; } + [[nodiscard]] bool isRanked() const noexcept { return _ranked; } + [[nodiscard]] bool usePositionData() const noexcept { return _position_data; } + [[nodiscard]] bool prefix_match() const noexcept { return _prefix_match; } static bool isPossibleRangeTerm(vespalib::stringref term) noexcept { return (term[0] == '[' || term[0] == '<' || term[0] == '>'); diff --git a/searchlib/src/vespa/searchlib/query/tree/termnodes.h b/searchlib/src/vespa/searchlib/query/tree/termnodes.h index 97e659dcc77..42dc0717368 100644 --- a/searchlib/src/vespa/searchlib/query/tree/termnodes.h +++ b/searchlib/src/vespa/searchlib/query/tree/termnodes.h @@ -119,19 +119,22 @@ public: //----------------------------------------------------------------------------- class FuzzyTerm : public QueryNodeMixin<FuzzyTerm, StringBase> { -private: - uint32_t _maxEditDistance; - uint32_t _prefixLength; + uint32_t _max_edit_distance; + uint32_t _prefix_lock_length; + // Prefix match mode is stored in parent Term public: FuzzyTerm(const Type &term, vespalib::stringref view, - int32_t id, Weight weight, uint32_t maxEditDistance, uint32_t prefixLength) - : QueryNodeMixinType(term, view, id, weight), - _maxEditDistance(maxEditDistance), - _prefixLength(prefixLength) - {} + int32_t id, Weight weight, uint32_t max_edit_distance, + uint32_t prefix_lock_length, bool prefix_match) + : QueryNodeMixinType(term, view, id, weight), + _max_edit_distance(max_edit_distance), + _prefix_lock_length(prefix_lock_length) + { + set_prefix_match(prefix_match); + } - uint32_t getMaxEditDistance() const { return _maxEditDistance; } - uint32_t getPrefixLength() const { return _prefixLength; } + [[nodiscard]] uint32_t max_edit_distance() const { return _max_edit_distance; } + [[nodiscard]] uint32_t prefix_lock_length() const { return _prefix_lock_length; } virtual ~FuzzyTerm() = 0; }; |