aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@vespa.ai>2024-01-18 13:04:53 +0000
committerTor Brede Vekterli <vekterli@vespa.ai>2024-01-18 15:08:07 +0000
commitdc973997098c239d71a57b1c692cb79b868ea8b8 (patch)
treea5613e6d687d6968924a845545d0f39cf238824c
parentab54f9c7cbd1bc3c1434717b875e1dfeb7b27dc4 (diff)
Support fuzzy term matching in streaming search
Uses a DFA-based matcher for max edits in {1, 2} and falls back to the legacy non-DFA matcher for all other values (including 0). Currently only supports fuzzy matching across the full field string, i.e. there's no implicit tokenization or whitespace removal. This matches the semantics we currently have for fuzzy search over attributes in a non-streaming case
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp43
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h34
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/querynode.cpp16
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp6
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.h2
-rw-r--r--streamingvisitors/src/tests/searcher/searcher_test.cpp220
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h2
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp6
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp17
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h1
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp2
12 files changed, 309 insertions, 41 deletions
diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
index 05a75f4662e..76119a6d58f 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
@@ -2,6 +2,7 @@
vespa_add_library(searchlib_query_streaming OBJECT
SOURCES
dot_product_term.cpp
+ fuzzy_term.cpp
in_term.cpp
multi_term.cpp
nearest_neighbor_query_node.cpp
diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
new file mode 100644
index 00000000000..f33fe44369a
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
@@ -0,0 +1,43 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "fuzzy_term.h"
+
+namespace search::streaming {
+
+namespace {
+
+constexpr bool normalizing_implies_cased(Normalizing norm) noexcept {
+ return (norm == Normalizing::NONE);
+}
+
+}
+
+FuzzyTerm::FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
+ const string& index, Type type, Normalizing normalizing,
+ uint8_t max_edits, uint32_t prefix_size)
+ : QueryTerm(std::move(result_base), term, index, type, normalizing),
+ _dfa_matcher(),
+ _fallback_matcher()
+{
+ setFuzzyMaxEditDistance(max_edits);
+ setFuzzyPrefixLength(prefix_size);
+
+ std::string_view term_view(term.data(), term.size());
+ const bool cased = normalizing_implies_cased(normalizing);
+ if (attribute::DfaFuzzyMatcher::supports_max_edits(max_edits)) {
+ _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(term_view, max_edits, prefix_size, cased);
+ } else {
+ _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(term_view, max_edits, prefix_size, cased);
+ }
+}
+
+FuzzyTerm::~FuzzyTerm() = default;
+
+bool FuzzyTerm::is_match(std::string_view term) const {
+ if (_dfa_matcher) {
+ return _dfa_matcher->is_match(term);
+ } else {
+ return _fallback_matcher->isMatch(term);
+ }
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
new file mode 100644
index 00000000000..c6c88b18969
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
@@ -0,0 +1,34 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "queryterm.h"
+#include <vespa/searchlib/attribute/dfa_fuzzy_matcher.h>
+#include <vespa/vespalib/fuzzy/fuzzy_matcher.h>
+#include <memory>
+#include <string_view>
+
+namespace search::streaming {
+
+/**
+ * Query term that matches candidate field terms that are within a query-specified
+ * maximum number of edits (add, delete or substitute a character), with case
+ * sensitivity controlled by the provided Normalizing mode.
+ *
+ * Optionally, terms may be prefixed-locked, which enforces field terms to have a
+ * particular prefix and where edits are only counted for the remaining term suffix.
+ */
+class FuzzyTerm : public QueryTerm {
+ std::unique_ptr<attribute::DfaFuzzyMatcher> _dfa_matcher;
+ std::unique_ptr<vespalib::FuzzyMatcher> _fallback_matcher;
+public:
+ FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
+ const string& index, Type type, Normalizing normalizing,
+ uint8_t max_edits, uint32_t prefix_size);
+ ~FuzzyTerm() override;
+
+ [[nodiscard]] FuzzyTerm* as_fuzzy_term() noexcept override { return this; }
+
+ [[nodiscard]] bool is_match(std::string_view term) const;
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 2ee515f062a..e71529a8aca 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -1,7 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include "query.h"
+#include "fuzzy_term.h"
#include "nearest_neighbor_query_node.h"
+#include "query.h"
#include "regexp_term.h"
#include <vespa/searchlib/parsequery/stackdumpiterator.h>
#include <vespa/searchlib/query/streaming/dot_product_term.h>
@@ -147,17 +148,16 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
} else {
Normalizing normalize_mode = factory.normalizing_mode(ssIndex);
std::unique_ptr<QueryTerm> qt;
- if (sTerm != TermType::REGEXP) {
- qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
- } else {
+ if (sTerm == TermType::REGEXP) {
qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode);
+ } else if (sTerm == TermType::FUZZYTERM) {
+ qt = std::make_unique<FuzzyTerm>(factory.create(), ssTerm, ssIndex, TermType::FUZZYTERM, normalize_mode,
+ queryRep.getFuzzyMaxEditDistance(), queryRep.getFuzzyPrefixLength());
+ } else [[likely]] {
+ qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
}
qt->setWeight(queryRep.GetWeight());
qt->setUniqueId(queryRep.getUniqueId());
- if (qt->isFuzzy()) {
- qt->setFuzzyMaxEditDistance(queryRep.getFuzzyMaxEditDistance());
- qt->setFuzzyPrefixLength(queryRep.getFuzzyPrefixLength());
- }
if (allowRewrite && possibleFloat(*qt, ssTerm) && factory.allow_float_terms_rewrite(ssIndex)) {
auto phrase = std::make_unique<PhraseQueryNode>();
auto dotPos = ssTerm.find('.');
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index 3e05d381ee2..fb002ec1867 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -185,4 +185,10 @@ QueryTerm::as_regexp_term() noexcept
return nullptr;
}
+FuzzyTerm*
+QueryTerm::as_fuzzy_term() noexcept
+{
+ return nullptr;
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index cd2bdd7eaec..ed9af969aa2 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -11,6 +11,7 @@
namespace search::streaming {
+class FuzzyTerm;
class NearestNeighborQueryNode;
class MultiTerm;
class RegexpTerm;
@@ -95,6 +96,7 @@ public:
virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept;
virtual MultiTerm* as_multi_term() noexcept;
virtual RegexpTerm* as_regexp_term() noexcept;
+ virtual FuzzyTerm* as_fuzzy_term() noexcept;
protected:
using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>;
string _index;
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 24877866c1b..ee2c5e2b5c7 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -3,6 +3,7 @@
#include <vespa/vespalib/testkit/testapp.h>
#include <vespa/document/fieldvalue/fieldvalues.h>
+#include <vespa/searchlib/query/streaming/fuzzy_term.h>
#include <vespa/searchlib/query/streaming/regexp_term.h>
#include <vespa/searchlib/query/streaming/queryterm.h>
#include <vespa/vsm/searcher/boolfieldsearcher.h>
@@ -18,10 +19,14 @@
#include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h>
#include <vespa/vsm/searcher/tokenizereader.h>
#include <vespa/vsm/vsm/snippetmodifier.h>
+#include <concepts>
+#include <charconv>
+#include <stdexcept>
using namespace document;
using search::streaming::HitList;
using search::streaming::QueryNodeResultFactory;
+using search::streaming::FuzzyTerm;
using search::streaming::RegexpTerm;
using search::streaming::QueryTerm;
using search::streaming::Normalizing;
@@ -58,6 +63,46 @@ public:
}
};
+namespace {
+
+template <std::integral T>
+std::string_view maybe_consume_into(std::string_view str, T& val_out) {
+ auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.size(), val_out);
+ if (ec != std::errc()) {
+ return str;
+ }
+ return str.substr(ptr - str.data());
+}
+
+// Parse optional max edits and prefix lock length from term string.
+// Syntax:
+// "term" -> {2, 0, "term"} (default max edits & prefix length)
+// "{1}term" -> {1, 0, "term"}
+// "{1,3}term" -> {1, 3, "term"}
+//
+// Note: this is not a "proper" parser (it accepts empty numeric values); only for testing!
+std::tuple<uint8_t, uint32_t, std::string_view> parse_fuzzy_params(std::string_view term) {
+ if (term.empty() || term[0] != '{') {
+ return {2, 0, term};
+ }
+ uint8_t max_edits = 2;
+ uint32_t prefix_length = 0;
+ term = maybe_consume_into(term.substr(1), max_edits);
+ if (term.empty() || (term[0] != ',' && term[0] != '}')) {
+ throw std::invalid_argument("malformed fuzzy params at (or after) max_edits");
+ }
+ if (term[0] == '}') {
+ return {max_edits, prefix_length, term.substr(1)};
+ }
+ term = maybe_consume_into(term.substr(1), prefix_length);
+ if (term.empty() || term[0] != '}') {
+ throw std::invalid_argument("malformed fuzzy params at (or after) prefix_length");
+ }
+ return {max_edits, prefix_length, term.substr(1)};
+}
+
+}
+
class Query
{
private:
@@ -66,10 +111,14 @@ private:
ParsedQueryTerm pqt = parseQueryTerm(term);
ParsedTerm pt = parseTerm(pqt.second);
std::string effective_index = pqt.first.empty() ? "index" : pqt.first;
- if (pt.second != TermType::REGEXP) {
- qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
+ if (pt.second == TermType::REGEXP) {
+ qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, TermType::REGEXP, normalizing));
+ } else if (pt.second == TermType::FUZZYTERM) {
+ auto [max_edits, prefix_length, actual_term] = parse_fuzzy_params(pt.first);
+ qtv.push_back(std::make_unique<FuzzyTerm>(eqnr.create(), vespalib::stringref(actual_term.data(), actual_term.size()),
+ effective_index, TermType::FUZZYTERM, normalizing, max_edits, prefix_length));
} else {
- qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
+ qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
}
}
for (const auto & i : qtv) {
@@ -100,6 +149,8 @@ public:
return std::make_pair(term.substr(1, term.size() - 1), TermType::SUFFIXTERM);
} else if (term[0] == '#') { // magic regex enabler
return std::make_pair(term.substr(1), TermType::REGEXP);
+ } else if (term[0] == '%') { // equally magic fuzzy enabler
+ return std::make_pair(term.substr(1), TermType::FUZZYTERM);
} else if (term[term.size() - 1] == '*') {
return std::make_pair(term.substr(0, term.size() - 1), TermType::PREFIXTERM);
} else {
@@ -477,31 +528,54 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
return true;
}
- TEST("verify correct term parsing") {
- ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index");
- ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term");
- ASSERT_TRUE(Query::parseQueryTerm("term").first.empty());
- ASSERT_TRUE(Query::parseQueryTerm("term").second == "term");
- ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr");
- ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM);
- ASSERT_TRUE(Query::parseTerm("*suffix").first == "suffix");
- ASSERT_TRUE(Query::parseTerm("*suffix").second == TermType::SUFFIXTERM);
- ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix");
- ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM);
- ASSERT_TRUE(Query::parseTerm("#regex").first == "regex");
- ASSERT_TRUE(Query::parseTerm("#regex").second == TermType::REGEXP);
- ASSERT_TRUE(Query::parseTerm("term").first == "term");
- ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD);
- }
-
- TEST("suffix matching") {
- EXPECT_EQUAL(assertMatchTermSuffix("a", "vespa"), true);
- EXPECT_EQUAL(assertMatchTermSuffix("spa", "vespa"), true);
- EXPECT_EQUAL(assertMatchTermSuffix("vespa", "vespa"), true);
- EXPECT_EQUAL(assertMatchTermSuffix("vvespa", "vespa"), false);
- EXPECT_EQUAL(assertMatchTermSuffix("fspa", "vespa"), false);
- EXPECT_EQUAL(assertMatchTermSuffix("v", "vespa"), false);
- }
+TEST("parsing of test-only fuzzy term params can extract numeric values") {
+ uint8_t max_edits = 0;
+ uint32_t prefix_length = 1234;
+ std::string_view out;
+
+ std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("myterm");
+ EXPECT_EQUAL(max_edits, 2u);
+ EXPECT_EQUAL(prefix_length, 0u);
+ EXPECT_EQUAL(out, "myterm");
+
+ std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{3}myterm");
+ EXPECT_EQUAL(max_edits, 3u);
+ EXPECT_EQUAL(prefix_length, 0u);
+ EXPECT_EQUAL(out, "myterm");
+
+ std::tie(max_edits, prefix_length, out) = parse_fuzzy_params("{2,70}myterm");
+ EXPECT_EQUAL(max_edits, 2u);
+ EXPECT_EQUAL(prefix_length, 70u);
+ EXPECT_EQUAL(out, "myterm");
+}
+
+TEST("verify correct term parsing") {
+ ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index");
+ ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term");
+ ASSERT_TRUE(Query::parseQueryTerm("term").first.empty());
+ ASSERT_TRUE(Query::parseQueryTerm("term").second == "term");
+ ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr");
+ ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM);
+ ASSERT_TRUE(Query::parseTerm("*suffix").first == "suffix");
+ ASSERT_TRUE(Query::parseTerm("*suffix").second == TermType::SUFFIXTERM);
+ ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix");
+ ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM);
+ ASSERT_TRUE(Query::parseTerm("#regex").first == "regex");
+ ASSERT_TRUE(Query::parseTerm("#regex").second == TermType::REGEXP);
+ ASSERT_TRUE(Query::parseTerm("%fuzzy").first == "fuzzy");
+ ASSERT_TRUE(Query::parseTerm("%fuzzy").second == TermType::FUZZYTERM);
+ ASSERT_TRUE(Query::parseTerm("term").first == "term");
+ ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD);
+}
+
+TEST("suffix matching") {
+ EXPECT_EQUAL(assertMatchTermSuffix("a", "vespa"), true);
+ EXPECT_EQUAL(assertMatchTermSuffix("spa", "vespa"), true);
+ EXPECT_EQUAL(assertMatchTermSuffix("vespa", "vespa"), true);
+ EXPECT_EQUAL(assertMatchTermSuffix("vvespa", "vespa"), false);
+ EXPECT_EQUAL(assertMatchTermSuffix("fspa", "vespa"), false);
+ EXPECT_EQUAL(assertMatchTermSuffix("v", "vespa"), false);
+}
TEST("Test basic strchrfield searchers") {
{
@@ -654,6 +728,96 @@ TEST("utf8 flexible searcher handles regexes with explicit anchoring") {
TEST_DO(assertString(fs, "#^foo$", "oo", Hits()));
}
+TEST("utf8 flexible searcher handles fuzzy search in uncased mode") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ // Term syntax (only applies to these tests):
+ // %{k}term => fuzzy match "term" with max edits k
+ // %{k,p}term => fuzzy match "term" with max edits k, prefix lock length p
+
+ // DFA is used for k in {1, 2}
+ TEST_DO(assertString(fs, "%{1}abc", "abc", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1}ABC", "abc", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1}abc", "ABC", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1}Abc", "abd", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1}abc", "ABCD", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1}abc", "abcde", Hits()));
+ TEST_DO(assertString(fs, "%{2}abc", "abcde", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{2}abc", "xabcde", Hits()));
+ // Fallback to non-DFA matcher when k not in {1, 2}
+ TEST_DO(assertString(fs, "%{3}abc", "abc", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{3}abc", "XYZ", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{3}abc", "XYZ!", Hits()));
+}
+
+TEST("utf8 flexible searcher handles fuzzy search in cased mode") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ fs.normalize_mode(Normalizing::NONE);
+ TEST_DO(assertString(fs, "%{1}abc", "abc", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1}abc", "Abc", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1}ABC", "abc", Hits()));
+ TEST_DO(assertString(fs, "%{2}Abc", "abc", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{2}abc", "AbC", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{3}abc", "ABC", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{3}abc", "ABCD", Hits()));
+}
+
+TEST("utf8 flexible searcher handles fuzzy search with prefix locking") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ // DFA
+ TEST_DO(assertString(fs, "%{1,4}zoid", "zoi", Hits()));
+ TEST_DO(assertString(fs, "%{1,4}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1,4}zoid", "ZOID", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoid", Hits()));
+ TEST_DO(assertString(fs, "%{1,4}zoidberg", "ZoidBerg", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1,4}zoidberg", "ZoidBergg", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoidborg", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoidblergh", Hits()));
+ TEST_DO(assertString(fs, "%{2,4}zoidberg", "zoidblergh", Hits().add(0)));
+ // Fallback
+ TEST_DO(assertString(fs, "%{3,4}zoidberg", "zoidblergh", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{3,4}zoidberg", "zoidbooorg", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{3,4}zoidberg", "zoidzooorg", Hits()));
+
+ fs.normalize_mode(Normalizing::NONE);
+ // DFA
+ TEST_DO(assertString(fs, "%{1,4}zoid", "ZOID", Hits()));
+ TEST_DO(assertString(fs, "%{1,4}ZOID", "zoid", Hits()));
+ TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoidBerg", Hits().add(0))); // 1 edit
+ TEST_DO(assertString(fs, "%{1,4}zoidberg", "zoidBblerg", Hits())); // 2 edits, 1 max
+ TEST_DO(assertString(fs, "%{2,4}zoidberg", "zoidBblerg", Hits().add(0))); // 2 edits, 2 max
+ // Fallback
+ TEST_DO(assertString(fs, "%{3,4}zoidberg", "zoidBERG", Hits())); // 4 edits, 3 max
+ TEST_DO(assertString(fs, "%{4,4}zoidberg", "zoidBERG", Hits().add(0))); // 4 edits, 4 max
+}
+
+TEST("utf8 flexible searcher fuzzy match with max_edits=0 implies exact match") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ TEST_DO(assertString(fs, "%{0}zoid", "zoi", Hits()));
+ TEST_DO(assertString(fs, "%{0,4}zoid", "zoi", Hits()));
+ TEST_DO(assertString(fs, "%{0}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{0}zoid", "ZOID", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{0,4}zoid", "ZOID", Hits().add(0)));
+ fs.normalize_mode(Normalizing::NONE);
+ TEST_DO(assertString(fs, "%{0}zoid", "ZOID", Hits()));
+ TEST_DO(assertString(fs, "%{0,4}zoid", "ZOID", Hits()));
+ TEST_DO(assertString(fs, "%{0}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{0,4}zoid", "zoid", Hits().add(0)));
+}
+
+TEST("utf8 flexible searcher caps oversized fuzzy prefix length to term length") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ // DFA
+ TEST_DO(assertString(fs, "%{1,5}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1,9001}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{1,9001}zoid", "boid", Hits()));
+ // Fallback
+ TEST_DO(assertString(fs, "%{0,5}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{5,5}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{0,9001}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{5,9001}zoid", "zoid", Hits().add(0)));
+ TEST_DO(assertString(fs, "%{5,9001}zoid", "boid", Hits()));
+}
+
TEST("bool search") {
BoolFieldSearcher fs(0);
TEST_DO(assertBool(fs, "true", true, true));
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index c5bca6f3899..bb3aa6fdd10 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -46,7 +46,7 @@ public:
explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {}
FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept;
~FieldSearcher() override;
- virtual std::unique_ptr<FieldSearcher> duplicate() const = 0;
+ [[nodiscard]] virtual std::unique_ptr<FieldSearcher> duplicate() const = 0;
bool search(const StorageDocument & doc);
virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
index c0a0249125f..98e88e45b3a 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
@@ -34,7 +34,7 @@ bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
}
} else {
for (auto qt : _qtl) {
- if (fieldRef.size() >= qt->termLen() || qt->isRegex()) {
+ if (fieldRef.size() >= qt->termLen() || qt->isRegex() || qt->isFuzzy()) {
_words += matchTerm(fieldRef, *qt);
} else {
_words += countWords(fieldRef);
@@ -49,8 +49,8 @@ size_t StrChrFieldSearcher::shortestTerm() const
size_t mintsz(_qtl.front()->termLen());
for (auto it=_qtl.begin()+1, mt=_qtl.end(); it != mt; it++) {
const QueryTerm & qt = **it;
- if (qt.isRegex()) {
- return 0; // Must avoid "too short query term" optimization when using regex
+ if (qt.isRegex() || qt.isFuzzy()) {
+ return 0; // Must avoid "too short query term" optimization when using regex or fuzzy
}
mintsz = std::min(mintsz, qt.termLen());
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
index c6deb6eacd1..d648d2e252e 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
@@ -1,5 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8flexiblestringfieldsearcher.h"
+#include <vespa/searchlib/query/streaming/fuzzy_term.h>
#include <vespa/searchlib/query/streaming/regexp_term.h>
#include <cassert>
@@ -40,6 +41,19 @@ UTF8FlexibleStringFieldSearcher::match_regexp(const FieldRef & f, search::stream
}
size_t
+UTF8FlexibleStringFieldSearcher::match_fuzzy(const FieldRef & f, search::streaming::QueryTerm & qt)
+{
+ auto* fuzzy_term = qt.as_fuzzy_term();
+ assert(fuzzy_term != nullptr);
+ // TODO delegate to matchTermExact if max edits == 0?
+ // - needs to avoid folding to have consistent normalization semantics
+ if (fuzzy_term->is_match({f.data(), f.size()})) {
+ addHit(qt, 0);
+ }
+ return countWords(f);
+}
+
+size_t
UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
{
if (qt.isPrefix()) {
@@ -57,6 +71,9 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
} else if (qt.isRegex()) {
LOG(debug, "Use regexp match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
return match_regexp(f, qt);
+ } else if (qt.isFuzzy()) {
+ LOG(debug, "Use fuzzy match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
+ return match_fuzzy(f, qt);
} else {
if (substring()) {
LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
index cd1715ad158..a5f6ad46246 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
@@ -25,6 +25,7 @@ private:
size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
size_t match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt);
+ size_t match_fuzzy(const FieldRef & f, search::streaming::QueryTerm & qt);
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index 9c8bb2f185a..63d2007cecf 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -133,7 +133,7 @@ FieldSearchSpec::reconfig(const QueryTerm & term)
(term.isSuffix() && _arg1 != "suffix") ||
(term.isExactstring() && _arg1 != "exact") ||
(term.isPrefix() && _arg1 == "suffix") ||
- term.isRegex())
+ (term.isRegex() || term.isFuzzy()))
{
_searcher = std::make_unique<UTF8FlexibleStringFieldSearcher>(id());
propagate_settings_to_searcher();