diff options
author | Tor Brede Vekterli <vekterli@vespa.ai> | 2024-01-19 12:46:15 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-19 12:46:15 +0100 |
commit | 7d737bcdc09d4aefd3cad56383871e6530a84561 (patch) | |
tree | f59b6e62d89906caaf37d8318cd83b0c7893dfca | |
parent | 461c4e60cdcb2e7a657ee05923477b584fd6792d (diff) | |
parent | dc973997098c239d71a57b1c692cb79b868ea8b8 (diff) |
Merge pull request #29969 from vespa-engine/vekterli/support-fuzzy-matching-in-streaming-search
Support fuzzy term matching in streaming search
12 files changed, 309 insertions, 41 deletions
diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt index 05a75f4662e..76119a6d58f 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt @@ -2,6 +2,7 @@ vespa_add_library(searchlib_query_streaming OBJECT SOURCES dot_product_term.cpp + fuzzy_term.cpp in_term.cpp multi_term.cpp nearest_neighbor_query_node.cpp diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp new file mode 100644 index 00000000000..f33fe44369a --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp @@ -0,0 +1,43 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "fuzzy_term.h" + +namespace search::streaming { + +namespace { + +constexpr bool normalizing_implies_cased(Normalizing norm) noexcept { + return (norm == Normalizing::NONE); +} + +} + +FuzzyTerm::FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term, + const string& index, Type type, Normalizing normalizing, + uint8_t max_edits, uint32_t prefix_size) + : QueryTerm(std::move(result_base), term, index, type, normalizing), + _dfa_matcher(), + _fallback_matcher() +{ + setFuzzyMaxEditDistance(max_edits); + setFuzzyPrefixLength(prefix_size); + + std::string_view term_view(term.data(), term.size()); + const bool cased = normalizing_implies_cased(normalizing); + if (attribute::DfaFuzzyMatcher::supports_max_edits(max_edits)) { + _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_size, cased); + } else { + _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_size, cased); + } +} + +FuzzyTerm::~FuzzyTerm() = default; + +bool FuzzyTerm::is_match(std::string_view term) const { + if (_dfa_matcher) { + return _dfa_matcher->is_match(term); + } else { + return _fallback_matcher->isMatch(term); + } +} + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h new file mode 100644 index 00000000000..c6c88b18969 --- /dev/null +++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h @@ -0,0 +1,34 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "queryterm.h" +#include <vespa/searchlib/attribute/dfa_fuzzy_matcher.h> +#include <vespa/vespalib/fuzzy/fuzzy_matcher.h> +#include <memory> +#include <string_view> + +namespace search::streaming { + +/** + * Query term that matches candidate field terms that are within a query-specified + * maximum number of edits (add, delete or substitute a character), with case + * sensitivity controlled by the provided Normalizing mode. + * + * Optionally, terms may be prefixed-locked, which enforces field terms to have a + * particular prefix and where edits are only counted for the remaining term suffix. + */ +class FuzzyTerm : public QueryTerm { + std::unique_ptr<attribute::DfaFuzzyMatcher> _dfa_matcher; + std::unique_ptr<vespalib::FuzzyMatcher> _fallback_matcher; +public: + FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term, + const string& index, Type type, Normalizing normalizing, + uint8_t max_edits, uint32_t prefix_size); + ~FuzzyTerm() override; + + [[nodiscard]] FuzzyTerm* as_fuzzy_term() noexcept override { return this; } + + [[nodiscard]] bool is_match(std::string_view term) const; +}; + +} diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp index 2ee515f062a..e71529a8aca 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp @@ -1,7 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include "query.h" +#include "fuzzy_term.h" #include "nearest_neighbor_query_node.h" +#include "query.h" #include "regexp_term.h" #include <vespa/searchlib/parsequery/stackdumpiterator.h> #include <vespa/searchlib/query/streaming/dot_product_term.h> @@ -147,17 +148,16 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor } else { Normalizing normalize_mode = factory.normalizing_mode(ssIndex); std::unique_ptr<QueryTerm> qt; - if (sTerm != TermType::REGEXP) { - qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode); - } else { + if (sTerm == TermType::REGEXP) { qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode); + } else if (sTerm == TermType::FUZZYTERM) { + qt = std::make_unique<FuzzyTerm>(factory.create(), ssTerm, ssIndex, TermType::FUZZYTERM, normalize_mode, + queryRep.getFuzzyMaxEditDistance(), queryRep.getFuzzyPrefixLength()); + } else [[likely]] { + qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode); } qt->setWeight(queryRep.GetWeight()); qt->setUniqueId(queryRep.getUniqueId()); - if (qt->isFuzzy()) { - qt->setFuzzyMaxEditDistance(queryRep.getFuzzyMaxEditDistance()); - qt->setFuzzyPrefixLength(queryRep.getFuzzyPrefixLength()); - } if (allowRewrite && possibleFloat(*qt, ssTerm) && factory.allow_float_terms_rewrite(ssIndex)) { auto phrase = std::make_unique<PhraseQueryNode>(); auto dotPos = ssTerm.find('.'); diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp index 3e05d381ee2..fb002ec1867 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp @@ -185,4 +185,10 @@ QueryTerm::as_regexp_term() noexcept return nullptr; } +FuzzyTerm* +QueryTerm::as_fuzzy_term() noexcept +{ + return nullptr; +} + } diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h index e7fe488ca89..b4dfa98ebe5 100644 --- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h +++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h @@ -11,6 +11,7 @@ namespace search::streaming { +class FuzzyTerm; class NearestNeighborQueryNode; class MultiTerm; class RegexpTerm; @@ -95,6 +96,7 @@ public: virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept; virtual MultiTerm* as_multi_term() noexcept; virtual RegexpTerm* as_regexp_term() noexcept; + virtual FuzzyTerm* as_fuzzy_term() noexcept; protected: using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>; string _index; diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 24877866c1b..ee2c5e2b5c7 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -3,6 +3,7 @@ #include <vespa/vespalib/testkit/testapp.h> #include <vespa/document/fieldvalue/fieldvalues.h> +#include <vespa/searchlib/query/streaming/fuzzy_term.h> #include <vespa/searchlib/query/streaming/regexp_term.h> #include <vespa/searchlib/query/streaming/queryterm.h> #include <vespa/vsm/searcher/boolfieldsearcher.h> @@ -18,10 +19,14 @@ #include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h> #include <vespa/vsm/searcher/tokenizereader.h> #include <vespa/vsm/vsm/snippetmodifier.h> +#include <concepts> +#include <charconv> +#include <stdexcept> using namespace document; using search::streaming::HitList; using search::streaming::QueryNodeResultFactory; +using search::streaming::FuzzyTerm; using search::streaming::RegexpTerm; using search::streaming::QueryTerm; using search::streaming::Normalizing; @@ -58,6 +63,46 @@ public: } }; +namespace { + +template <std::integral T> +std::string_view maybe_consume_into(std::string_view str, T& val_out) { + auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.size(), val_out); + if (ec != std::errc()) { + return str; + } + return str.substr(ptr - str.data()); +} + +// Parse optional max edits and prefix lock length from term string. +// Syntax: +// "term" -> {2, 0, "term"} (default max edits & prefix length) +// "{1}term" -> {1, 0, "term"} +// "{1,3}term" -> {1, 3, "term"} +// +// Note: this is not a "proper" parser (it accepts empty numeric values); only for testing! +std::tuple<uint8_t, uint32_t, std::string_view> parse_fuzzy_params(std::string_view term) { + if (term.empty() || term[0] != '{') { + return {2, 0, term}; + } + uint8_t max_edits = 2; + uint32_t prefix_length = 0; + term = maybe_consume_into(term.substr(1), max_edits); + if (term.empty() || (term[0] != ',' && term[0] != '}')) { + throw std::invalid_argument("malformed fuzzy params at (or after) max_edits"); + } + if (term[0] == '}') { + return {max_edits, prefix_length, term.substr(1)}; + } + term = maybe_consume_into(term.substr(1), prefix_length); + if (term.empty() || term[0] != '}') { + throw std::invalid_argument("malformed fuzzy params at (or after) prefix_length"); + } + return {max_edits, prefix_length, term.substr(1)}; +} + +} + class Query { private: @@ -66,10 +111,14 @@ private: ParsedQueryTerm pqt = parseQueryTerm(term); ParsedTerm pt = parseTerm(pqt.second); std::string effective_index = pqt.first.empty() ? "index" : pqt.first; - if (pt.second != TermType::REGEXP) { - qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing)); + if (pt.second == TermType::REGEXP) { + qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, TermType::REGEXP, normalizing)); + } else if (pt.second == TermType::FUZZYTERM) { + auto [max_edits, prefix_length, actual_term] = parse_fuzzy_params(pt.first); + qtv.push_back(std::make_unique<FuzzyTerm>(eqnr.create(), vespalib::stringref(actual_term.data(), actual_term.size()), + effective_index, TermType::FUZZYTERM, normalizing, max_edits, prefix_length)); } else { - qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing)); + qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing)); } } for (const auto & i : qtv) { @@ -100,6 +149,8 @@ public: return std::make_pair(term.substr(1, term.size() - 1), TermType::SUFFIXTERM); } else if (term[0] == '#') { // magic regex enabler return std::make_pair(term.substr(1), TermType::REGEXP); + } else if (term[0] == '%') { // equally magic fuzzy enabler + return std::make_pair(term.substr(1), TermType::FUZZYTERM); } else if (term[term.size() - 1] == '*') { return std::make_pair(term.substr(0, term.size() - 1), TermType::PREFIXTERM); } else { @@ -477,31 +528,54 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) return true; } - TEST("verify correct term parsing") { - ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index"); - ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term"); - ASSERT_TRUE(Query::parseQueryTerm("term").first.empty()); - ASSERT_TRUE(Query::parseQueryTerm("term").second == "term"); - ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr"); - ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM); - ASSERT_TRUE(Query::parseTerm("*suffix").first == "suffix"); - ASSERT_TRUE(Query::parseTerm("*suffix").second == TermType::SUFFIXTERM); - ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix"); - ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM); - ASSERT_TRUE(Query::parseTerm("#regex").first == "regex"); - ASSERT_TRUE(Query::parseTerm("#regex").second == TermType::REGEXP); - ASSERT_TRUE(Query::parseTerm("term").first == "term"); - ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD); - } - - TEST("suffix matching") { - EXPECT_EQUAL(assertMatchTermSuffix("a", "vespa"), true); - EXPECT_EQUAL(assertMatchTermSuffix("spa", "vespa"), true); - EXPECT_EQUAL(assertMatchTermSuffix("vespa", "vespa"), true); - EXPECT_EQUAL(assertMatchTermSuffix("vvespa", "vespa"), false); - EXPECT_EQUAL(assertMatchTermSuffix("fspa", "vespa"), false); - EXPECT_EQUAL(assertMatchTermSuffix("v", "vespa"), false); - } +TEST("parsing of test-only fuzzy term params can extract numeric values") { + uint8_t max_edits = 0; + uint32_t prefix_length = 1234; + std::string_view out; + + std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("myterm"); + EXPECT_EQUAL(max_edits, 2u); + EXPECT_EQUAL(prefix_length, 0u); + EXPECT_EQUAL(out, "myterm"); + + std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{3}myterm"); + EXPECT_EQUAL(max_edits, 3u); + EXPECT_EQUAL(prefix_length, 0u); + EXPECT_EQUAL(out, "myterm"); + + std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{2,70}myterm"); + EXPECT_EQUAL(max_edits, 2u); + EXPECT_EQUAL(prefix_length, 70u); + EXPECT_EQUAL(out, "myterm"); +} + +TEST("verify correct term parsing") { + ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index"); + ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term"); + ASSERT_TRUE(Query::parseQueryTerm("term").first.empty()); + ASSERT_TRUE(Query::parseQueryTerm("term").second == "term"); + ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr"); + ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM); + ASSERT_TRUE(Query::parseTerm("*suffix").first == "suffix"); + ASSERT_TRUE(Query::parseTerm("*suffix").second == TermType::SUFFIXTERM); + ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix"); + ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM); + ASSERT_TRUE(Query::parseTerm("#regex").first == "regex"); + ASSERT_TRUE(Query::parseTerm("#regex").second == TermType::REGEXP); + ASSERT_TRUE(Query::parseTerm("%fuzzy").first == "fuzzy"); + ASSERT_TRUE(Query::parseTerm("%fuzzy").second == TermType::FUZZYTERM); + ASSERT_TRUE(Query::parseTerm("term").first == "term"); + ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD); +} + +TEST("suffix matching") { + EXPECT_EQUAL(assertMatchTermSuffix("a", "vespa"), true); + EXPECT_EQUAL(assertMatchTermSuffix("spa", "vespa"), true); + EXPECT_EQUAL(assertMatchTermSuffix("vespa", "vespa"), true); + EXPECT_EQUAL(assertMatchTermSuffix("vvespa", "vespa"), false); + EXPECT_EQUAL(assertMatchTermSuffix("fspa", "vespa"), false); + EXPECT_EQUAL(assertMatchTermSuffix("v", "vespa"), false); +} TEST("Test basic strchrfield searchers") { { @@ -654,6 +728,96 @@ TEST("utf8 flexible searcher handles regexes with explicit anchoring") { TEST_DO(assertString(fs, "#^foo$", "oo", Hits())); } +TEST("utf8 flexible searcher handles fuzzy search in uncased mode") { + UTF8FlexibleStringFieldSearcher fs(0); + // Term syntax (only applies to these tests): + // %{k}term => fuzzy match "term" with max edits k + // %{k,p}term => fuzzy match "term" with max edits k, prefix lock length p + + // DFA is used for k in {1, 2} + TEST_DO(assertString(fs, "%{1}abc", "abc", Hits().add(0))); + TEST_DO(assertString(fs, "%{1}ABC", "abc", Hits().add(0))); + TEST_DO(assertString(fs, "%{1}abc", "ABC", Hits().add(0))); + TEST_DO(assertString(fs, "%{1}Abc", "abd", Hits().add(0))); + TEST_DO(assertString(fs, "%{1}abc", "ABCD", Hits().add(0))); + TEST_DO(assertString(fs, "%{1}abc", "abcde", Hits())); + TEST_DO(assertString(fs, "%{2}abc", "abcde", Hits().add(0))); + TEST_DO(assertString(fs, "%{2}abc", "xabcde", Hits())); + // Fallback to non-DFA matcher when k not in {1, 2} + TEST_DO(assertString(fs, "%{3}abc", "abc", Hits().add(0))); + TEST_DO(assertString(fs, "%{3}abc", "XYZ", Hits().add(0))); + TEST_DO(assertString(fs, "%{3}abc", "XYZ!", Hits())); +} + +TEST("utf8 flexible searcher handles fuzzy search in cased mode") { + UTF8FlexibleStringFieldSearcher fs(0); + fs.normalize_mode(Normalizing::NONE); + TEST_DO(assertString(fs, "%{1}abc", "abc", Hits().add(0))); + TEST_DO(assertString(fs, "%{1}abc", "Abc", Hits().add(0))); + TEST_DO(assertString(fs, "%{1}ABC", "abc", Hits())); + TEST_DO(assertString(fs, "%{2}Abc", "abc", Hits().add(0))); + TEST_DO(assertString(fs, "%{2}abc", "AbC", Hits().add(0))); + TEST_DO(assertString(fs, "%{3}abc", "ABC", Hits().add(0))); + TEST_DO(assertString(fs, "%{3}abc", "ABCD", Hits())); +} + +TEST("utf8 flexible searcher handles fuzzy search with prefix locking") { + UTF8FlexibleStringFieldSearcher fs(0); + // DFA + TEST_DO(assertString(fs, "%{1,4}zoid", "zoi", Hits())); + TEST_DO(assertString(fs, "%{1,4}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{1,4}zoid", "ZOID", Hits().add(0))); + TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoid", Hits())); + TEST_DO(assertString(fs, "%{1,4}zoidberg", "ZoidBerg", Hits().add(0))); + TEST_DO(assertString(fs, "%{1,4}zoidberg", "ZoidBergg", Hits().add(0))); + TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoidborg", Hits().add(0))); + TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoidblergh", Hits())); + TEST_DO(assertString(fs, "%{2,4}zoidberg", "zoidblergh", Hits().add(0))); + // Fallback + TEST_DO(assertString(fs, "%{3,4}zoidberg", "zoidblergh", Hits().add(0))); + TEST_DO(assertString(fs, "%{3,4}zoidberg", "zoidbooorg", Hits().add(0))); + TEST_DO(assertString(fs, "%{3,4}zoidberg", "zoidzooorg", Hits())); + + fs.normalize_mode(Normalizing::NONE); + // DFA + TEST_DO(assertString(fs, "%{1,4}zoid", "ZOID", Hits())); + TEST_DO(assertString(fs, "%{1,4}ZOID", "zoid", Hits())); + TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoidBerg", Hits().add(0))); // 1 edit + TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoidBblerg", Hits())); // 2 edits, 1 max + TEST_DO(assertString(fs, "%{2,4}zoidberg", "zoidBblerg", Hits().add(0))); // 2 edits, 2 max + // Fallback + TEST_DO(assertString(fs, "%{3,4}zoidberg", "zoidBERG", Hits())); // 4 edits, 3 max + TEST_DO(assertString(fs, "%{4,4}zoidberg", "zoidBERG", Hits().add(0))); // 4 edits, 4 max +} + +TEST("utf8 flexible searcher fuzzy match with max_edits=0 implies exact match") { + UTF8FlexibleStringFieldSearcher fs(0); + TEST_DO(assertString(fs, "%{0}zoid", "zoi", Hits())); + TEST_DO(assertString(fs, "%{0,4}zoid", "zoi", Hits())); + TEST_DO(assertString(fs, "%{0}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{0}zoid", "ZOID", Hits().add(0))); + TEST_DO(assertString(fs, "%{0,4}zoid", "ZOID", Hits().add(0))); + fs.normalize_mode(Normalizing::NONE); + TEST_DO(assertString(fs, "%{0}zoid", "ZOID", Hits())); + TEST_DO(assertString(fs, "%{0,4}zoid", "ZOID", Hits())); + TEST_DO(assertString(fs, "%{0}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{0,4}zoid", "zoid", Hits().add(0))); +} + +TEST("utf8 flexible searcher caps oversized fuzzy prefix length to term length") { + UTF8FlexibleStringFieldSearcher fs(0); + // DFA + TEST_DO(assertString(fs, "%{1,5}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{1,9001}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{1,9001}zoid", "boid", Hits())); + // Fallback + TEST_DO(assertString(fs, "%{0,5}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{5,5}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{0,9001}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{5,9001}zoid", "zoid", Hits().add(0))); + TEST_DO(assertString(fs, "%{5,9001}zoid", "boid", Hits())); +} + TEST("bool search") { BoolFieldSearcher fs(0); TEST_DO(assertBool(fs, "true", true, true)); diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index c5bca6f3899..bb3aa6fdd10 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -46,7 +46,7 @@ public: explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {} FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept; ~FieldSearcher() override; - virtual std::unique_ptr<FieldSearcher> duplicate() const = 0; + [[nodiscard]] virtual std::unique_ptr<FieldSearcher> duplicate() const = 0; bool search(const StorageDocument & doc); virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env); diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp index c0a0249125f..98e88e45b3a 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp @@ -34,7 +34,7 @@ bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef) } } else { for (auto qt : _qtl) { - if (fieldRef.size() >= qt->termLen() || qt->isRegex()) { + if (fieldRef.size() >= qt->termLen() || qt->isRegex() || qt->isFuzzy()) { _words += matchTerm(fieldRef, *qt); } else { _words += countWords(fieldRef); @@ -49,8 +49,8 @@ size_t StrChrFieldSearcher::shortestTerm() const size_t mintsz(_qtl.front()->termLen()); for (auto it=_qtl.begin()+1, mt=_qtl.end(); it != mt; it++) { const QueryTerm & qt = **it; - if (qt.isRegex()) { - return 0; // Must avoid "too short query term" optimization when using regex + if (qt.isRegex() || qt.isFuzzy()) { + return 0; // Must avoid "too short query term" optimization when using regex or fuzzy } mintsz = std::min(mintsz, qt.termLen()); } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp index c6deb6eacd1..d648d2e252e 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8flexiblestringfieldsearcher.h" +#include <vespa/searchlib/query/streaming/fuzzy_term.h> #include <vespa/searchlib/query/streaming/regexp_term.h> #include <cassert> @@ -40,6 +41,19 @@ UTF8FlexibleStringFieldSearcher::match_regexp(const FieldRef & f, search::stream } size_t +UTF8FlexibleStringFieldSearcher::match_fuzzy(const FieldRef & f, search::streaming::QueryTerm & qt) +{ + auto* fuzzy_term = qt.as_fuzzy_term(); + assert(fuzzy_term != nullptr); + // TODO delegate to matchTermExact if max edits == 0? + // - needs to avoid folding to have consistent normalization semantics + if (fuzzy_term->is_match({f.data(), f.size()})) { + addHit(qt, 0); + } + return countWords(f); +} + +size_t UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) { if (qt.isPrefix()) { @@ -57,6 +71,9 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) } else if (qt.isRegex()) { LOG(debug, "Use regexp match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); return match_regexp(f, qt); + } else if (qt.isFuzzy()) { + LOG(debug, "Use fuzzy match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return match_fuzzy(f, qt); } else { if (substring()) { LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h index cd1715ad158..a5f6ad46246 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h @@ -25,6 +25,7 @@ private: size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; size_t match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt); + size_t match_fuzzy(const FieldRef & f, search::streaming::QueryTerm & qt); public: std::unique_ptr<FieldSearcher> duplicate() const override; diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index 9c8bb2f185a..63d2007cecf 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -133,7 +133,7 @@ FieldSearchSpec::reconfig(const QueryTerm & term) (term.isSuffix() && _arg1 != "suffix") || (term.isExactstring() && _arg1 != "exact") || (term.isPrefix() && _arg1 == "suffix") || - term.isRegex()) + (term.isRegex() || term.isFuzzy())) { _searcher = std::make_unique<UTF8FlexibleStringFieldSearcher>(id()); propagate_settings_to_searcher(); |