summary | refs | log | tree | commit | diff | stats
path: root/searchlib
diff options
context:
space:
mode:
author: Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-12 12:07:45 +0000
committer: Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-15 16:23:53 +0000
commit: ae88431f3770388afd22c6856b2ad17c994783ee (patch)
tree: 934f74c09ac9293269d6e38cac3e9b9359da49b5 /searchlib
parent: 242fee291a7aefab01f8d22e2059d57201d66c10 (diff)
Add regular expression support to streaming search
Introduces an explicit regex query term node (which wraps an RE2 regex instance internally) and extends the existing UTF-8 flexible string searcher to use this query node. Regex matching is optionally case (in)sensitive depending on the normalization mode used. Note on `searcher/searcher_test.cpp`: this adds a magic sentinel `#` char prefix to query term parsing in the test to let a query term be interpreted as a regex rather than exact/prefix/suffix/substring match.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt | 1
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/querynode.cpp | 8
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp | 6
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/queryterm.h | 2
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/regexp_term.cpp | 27
-rw-r--r-- searchlib/src/vespa/searchlib/query/streaming/regexp_term.h | 25
6 files changed, 68 insertions, 1 deletions
diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
index 6b9be2e3269..05a75f4662e 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
@@ -11,5 +11,6 @@ vespa_add_library(searchlib_query_streaming OBJECT
queryterm.cpp
wand_term.cpp
weighted_set_term.cpp
+ regexp_term.cpp
DEPENDS
)
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 1ce80660d46..2ee515f062a 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -2,6 +2,7 @@
#include "query.h"
#include "nearest_neighbor_query_node.h"
+#include "regexp_term.h"
#include <vespa/searchlib/parsequery/stackdumpiterator.h>
#include <vespa/searchlib/query/streaming/dot_product_term.h>
#include <vespa/searchlib/query/streaming/in_term.h>
@@ -145,7 +146,12 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
qn = std::make_unique<TrueNode>();
} else {
Normalizing normalize_mode = factory.normalizing_mode(ssIndex);
- auto qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
+ std::unique_ptr<QueryTerm> qt;
+ if (sTerm != TermType::REGEXP) {
+ qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
+ } else {
+ qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode);
+ }
qt->setWeight(queryRep.GetWeight());
qt->setUniqueId(queryRep.getUniqueId());
if (qt->isFuzzy()) {
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index 3950a179d67..3e05d381ee2 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -179,4 +179,10 @@ QueryTerm::as_multi_term() noexcept
return nullptr;
}
+RegexpTerm*
+QueryTerm::as_regexp_term() noexcept
+{
+ return nullptr;
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 743998a630e..cd2bdd7eaec 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -13,6 +13,7 @@ namespace search::streaming {
class NearestNeighborQueryNode;
class MultiTerm;
+class RegexpTerm;
/**
This is a leaf in the Query tree. All terms are leafs.
@@ -93,6 +94,7 @@ public:
void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; }
virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept;
virtual MultiTerm* as_multi_term() noexcept;
+ virtual RegexpTerm* as_regexp_term() noexcept;
protected:
using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>;
string _index;
diff --git a/searchlib/src/vespa/searchlib/query/streaming/regexp_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/regexp_term.cpp
new file mode 100644
index 00000000000..4508caa7072
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/regexp_term.cpp
@@ -0,0 +1,27 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "regexp_term.h"
+
+namespace search::streaming {
+
+using vespalib::Regex;
+
+namespace {
+
+constexpr Regex::Options normalize_mode_to_regex_opts(Normalizing norm) noexcept {
+ return ((norm == Normalizing::NONE)
+ ? Regex::Options::None
+ : Regex::Options::IgnoreCase);
+}
+
+}
+
+RegexpTerm::RegexpTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
+ const string& index, Type type, Normalizing normalizing)
+ : QueryTerm(std::move(result_base), term, index, type, normalizing),
+ _regexp(Regex::from_pattern({term.data(), term.size()}, normalize_mode_to_regex_opts(normalizing)))
+{
+}
+
+RegexpTerm::~RegexpTerm() = default;
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/regexp_term.h b/searchlib/src/vespa/searchlib/query/streaming/regexp_term.h
new file mode 100644
index 00000000000..96d14eeb0bd
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/regexp_term.h
@@ -0,0 +1,25 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "queryterm.h"
+#include <vespa/vespalib/regex/regex.h>
+
+namespace search::streaming {
+
+/**
+ * Query term that matches fields using a regular expression, with case sensitivity
+ * controlled by the provided Normalizing mode.
+ */
+class RegexpTerm : public QueryTerm {
+ vespalib::Regex _regexp;
+public:
+ RegexpTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
+ const string& index, Type type, Normalizing normalizing);
+ ~RegexpTerm() override;
+
+ RegexpTerm* as_regexp_term() noexcept override { return this; }
+
+ [[nodiscard]] const vespalib::Regex& regexp() const noexcept { return _regexp; }
+};
+
+}