aboutsummaryrefslogtreecommitdiffstats
path: root/streamingvisitors/src/vespa/vsm/searcher
diff options
context:
space:
mode:
author: Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-12 12:07:45 +0000
committer: Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-15 16:23:53 +0000
commit: ae88431f3770388afd22c6856b2ad17c994783ee (patch)
tree: 934f74c09ac9293269d6e38cac3e9b9359da49b5 /streamingvisitors/src/vespa/vsm/searcher
parent: 242fee291a7aefab01f8d22e2059d57201d66c10 (diff)
Add regular expression support to streaming search
Introduces an explicit regex query term node (which wraps an RE2 regex instance internally) and extends the existing UTF-8 flexible string searcher to use this query node. Regex matching is case-sensitive or case-insensitive depending on the normalization mode used.

Note on `searcher/searcher_test.cpp`: the test's query term parsing now treats a leading `#` character as a magic sentinel, which causes the query term to be interpreted as a regex rather than as an exact/prefix/suffix/substring match.
Diffstat (limited to 'streamingvisitors/src/vespa/vsm/searcher')
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp | 10
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp | 16
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h | 6
3 files changed, 26 insertions, 6 deletions
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
index c1fa6090021..c0a0249125f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
@@ -17,7 +17,7 @@ void StrChrFieldSearcher::prepare(search::streaming::QueryTermList& qtl,
void StrChrFieldSearcher::onValue(const document::FieldValue & fv)
{
- const document::LiteralFieldValueB & sfv = static_cast<const document::LiteralFieldValueB &>(fv);
+ const auto & sfv = static_cast<const document::LiteralFieldValueB &>(fv);
vespalib::stringref val = sfv.getValueRef();
FieldRef fr(val.data(), std::min(maxFieldLength(), val.size()));
matchDoc(fr);
@@ -25,7 +25,6 @@ void StrChrFieldSearcher::onValue(const document::FieldValue & fv)
bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
{
- bool retval(true);
if (_qtl.size() > 1) {
size_t mintsz = shortestTerm();
if (fieldRef.size() >= mintsz) {
@@ -35,14 +34,14 @@ bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
}
} else {
for (auto qt : _qtl) {
- if (fieldRef.size() >= qt->termLen()) {
+ if (fieldRef.size() >= qt->termLen() || qt->isRegex()) {
_words += matchTerm(fieldRef, *qt);
} else {
_words += countWords(fieldRef);
}
}
}
- return retval;
+ return true;
}
size_t StrChrFieldSearcher::shortestTerm() const
@@ -50,6 +49,9 @@ size_t StrChrFieldSearcher::shortestTerm() const
size_t mintsz(_qtl.front()->termLen());
for (auto it=_qtl.begin()+1, mt=_qtl.end(); it != mt; it++) {
const QueryTerm & qt = **it;
+ if (qt.isRegex()) {
+ return 0; // Must avoid "too short query term" optimization when using regex
+ }
mintsz = std::min(mintsz, qt.termLen());
}
return mintsz;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
index 78f491198ad..c6deb6eacd1 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
@@ -1,5 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8flexiblestringfieldsearcher.h"
+#include <vespa/searchlib/query/streaming/regexp_term.h>
+#include <cassert>
#include <vespa/log/log.h>
LOG_SETUP(".vsm.searcher.utf8flexiblestringfieldsearcher");
@@ -27,6 +29,17 @@ UTF8FlexibleStringFieldSearcher::matchTerms(const FieldRef & f, const size_t min
}
size_t
+UTF8FlexibleStringFieldSearcher::match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt)
+{
+ auto* regexp_term = qt.as_regexp_term();
+ assert(regexp_term != nullptr);
+ if (regexp_term->regexp().partial_match({f.data(), f.size()})) {
+ addHit(qt, 0);
+ }
+ return countWords(f);
+}
+
+size_t
UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
{
if (qt.isPrefix()) {
@@ -41,6 +54,9 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
} else if (qt.isExactstring()) {
LOG(debug, "Use exact match for exact term '%s:%s'", qt.index().c_str(), qt.getTerm());
return matchTermExact(f, qt);
+ } else if (qt.isRegex()) {
+ LOG(debug, "Use regexp match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
+ return match_regexp(f, qt);
} else {
if (substring()) {
LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
index bb1b55dffe4..cd1715ad158 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
@@ -14,16 +14,18 @@ class UTF8FlexibleStringFieldSearcher : public UTF8StringFieldSearcherBase
private:
/**
* Tries to match the given query term against the content of the given field reference.
- * Search strategy is choosen based on the query term type.
+ * Search strategy is chosen based on the query term type.
**/
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
/**
* Tries to match each query term in the underlying query against the content of the given field reference.
- * Search strategy is choosen based on the query term type.
+ * Search strategy is chosen based on the query term type.
**/
size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
+ size_t match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt);
+
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId);