summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
author Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-19 12:46:15 +0100
committer GitHub <noreply@github.com> 2024-01-19 12:46:15 +0100
commit 7d737bcdc09d4aefd3cad56383871e6530a84561 (patch)
tree f59b6e62d89906caaf37d8318cd83b0c7893dfca /searchlib
parent 461c4e60cdcb2e7a657ee05923477b584fd6792d (diff)
parent dc973997098c239d71a57b1c692cb79b868ea8b8 (diff)
Merge pull request #29969 from vespa-engine/vekterli/support-fuzzy-matching-in-streaming-search
Support fuzzy term matching in streaming search
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt 1
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp 43
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h 34
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/querynode.cpp 16
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp 6
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.h 2
6 files changed, 94 insertions, 8 deletions
diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
index 05a75f4662e..76119a6d58f 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
@@ -2,6 +2,7 @@
vespa_add_library(searchlib_query_streaming OBJECT
SOURCES
dot_product_term.cpp
+ fuzzy_term.cpp
in_term.cpp
multi_term.cpp
nearest_neighbor_query_node.cpp
diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
new file mode 100644
index 00000000000..f33fe44369a
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.cpp
@@ -0,0 +1,43 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "fuzzy_term.h"
+
+namespace search::streaming {
+
+namespace {
+
+// Case-sensitive ("cased") matching is implied only by Normalizing::NONE.
+constexpr bool normalizing_implies_cased(Normalizing mode) noexcept {
+    return mode == Normalizing::NONE;
+}
+}
+
+// Builds the term and exactly one matcher: DfaFuzzyMatcher if it supports max_edits, else vespalib::FuzzyMatcher.
+FuzzyTerm::FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
+                     const string& index, Type type, Normalizing normalizing,
+                     uint8_t max_edits, uint32_t prefix_size)
+    : QueryTerm(std::move(result_base), term, index, type, normalizing),
+      _dfa_matcher(),
+      _fallback_matcher()
+{
+    setFuzzyMaxEditDistance(max_edits); // fuzzy parameters are also recorded on the term itself
+    setFuzzyPrefixLength(prefix_size);
+    std::string_view needle(term.data(), term.size());
+    const bool case_sensitive = normalizing_implies_cased(normalizing);
+    if (!attribute::DfaFuzzyMatcher::supports_max_edits(max_edits)) {
+        _fallback_matcher = std::make_unique<vespalib::FuzzyMatcher>(needle, max_edits, prefix_size, case_sensitive);
+    } else {
+        _dfa_matcher = std::make_unique<attribute::DfaFuzzyMatcher>(needle, max_edits, prefix_size, case_sensitive);
+    }
+}
+
+FuzzyTerm::~FuzzyTerm() = default;
+
+// Dispatches to whichever of the two matchers the constructor created.
+bool FuzzyTerm::is_match(std::string_view term) const {
+    if (!_dfa_matcher) {
+        return _fallback_matcher->isMatch(term);
+    }
+    return _dfa_matcher->is_match(term);
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
new file mode 100644
index 00000000000..c6c88b18969
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/fuzzy_term.h
@@ -0,0 +1,34 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "queryterm.h"
+#include <vespa/searchlib/attribute/dfa_fuzzy_matcher.h>
+#include <vespa/vespalib/fuzzy/fuzzy_matcher.h>
+#include <memory>
+#include <string_view>
+
+namespace search::streaming {
+
+/**
+ * Query term that matches candidate field terms that are within a query-specified
+ * maximum number of edits (add, delete or substitute a character), with case
+ * sensitivity controlled by the provided Normalizing mode (cased only for Normalizing::NONE).
+ *
+ * Optionally, terms may be prefix-locked, which enforces field terms to have a
+ * particular prefix and where edits are only counted for the remaining term suffix.
+ */
+class FuzzyTerm : public QueryTerm {
+ std::unique_ptr<attribute::DfaFuzzyMatcher> _dfa_matcher; // set when DfaFuzzyMatcher supports max_edits
+ std::unique_ptr<vespalib::FuzzyMatcher> _fallback_matcher; // set otherwise; the constructor creates exactly one
+public:
+ FuzzyTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
+ const string& index, Type type, Normalizing normalizing,
+ uint8_t max_edits, uint32_t prefix_size);
+ ~FuzzyTerm() override;
+ // Identifies this node as a fuzzy term; the QueryTerm base implementation returns nullptr.
+ [[nodiscard]] FuzzyTerm* as_fuzzy_term() noexcept override { return this; }
+ // Returns the verdict of whichever matcher the constructor created for the candidate `term`.
+ [[nodiscard]] bool is_match(std::string_view term) const;
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 2ee515f062a..e71529a8aca 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -1,7 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include "query.h"
+#include "fuzzy_term.h"
#include "nearest_neighbor_query_node.h"
+#include "query.h"
#include "regexp_term.h"
#include <vespa/searchlib/parsequery/stackdumpiterator.h>
#include <vespa/searchlib/query/streaming/dot_product_term.h>
@@ -147,17 +148,16 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
} else {
Normalizing normalize_mode = factory.normalizing_mode(ssIndex);
std::unique_ptr<QueryTerm> qt;
- if (sTerm != TermType::REGEXP) {
- qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
- } else {
+ if (sTerm == TermType::REGEXP) {
qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode);
+ } else if (sTerm == TermType::FUZZYTERM) {
+ qt = std::make_unique<FuzzyTerm>(factory.create(), ssTerm, ssIndex, TermType::FUZZYTERM, normalize_mode,
+ queryRep.getFuzzyMaxEditDistance(), queryRep.getFuzzyPrefixLength());
+ } else [[likely]] {
+ qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
}
qt->setWeight(queryRep.GetWeight());
qt->setUniqueId(queryRep.getUniqueId());
- if (qt->isFuzzy()) {
- qt->setFuzzyMaxEditDistance(queryRep.getFuzzyMaxEditDistance());
- qt->setFuzzyPrefixLength(queryRep.getFuzzyPrefixLength());
- }
if (allowRewrite && possibleFloat(*qt, ssTerm) && factory.allow_float_terms_rewrite(ssIndex)) {
auto phrase = std::make_unique<PhraseQueryNode>();
auto dotPos = ssTerm.find('.');
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index 3e05d381ee2..fb002ec1867 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -185,4 +185,10 @@ QueryTerm::as_regexp_term() noexcept
return nullptr;
}
+FuzzyTerm*
+QueryTerm::as_fuzzy_term() noexcept
+{
+ return nullptr; // a plain QueryTerm is not a fuzzy term; FuzzyTerm overrides this to return itself
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index e7fe488ca89..b4dfa98ebe5 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -11,6 +11,7 @@
namespace search::streaming {
+class FuzzyTerm;
class NearestNeighborQueryNode;
class MultiTerm;
class RegexpTerm;
@@ -95,6 +96,7 @@ public:
virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept;
virtual MultiTerm* as_multi_term() noexcept;
virtual RegexpTerm* as_regexp_term() noexcept;
+ virtual FuzzyTerm* as_fuzzy_term() noexcept;
protected:
using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>;
string _index;