summary | refs | log | tree | commit | diff | stats
path: root/streamingvisitors
diff options
context:
space:
mode:
author: Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-12 12:07:45 +0000
committer: Tor Brede Vekterli <vekterli@vespa.ai> 2024-01-15 16:23:53 +0000
commit: ae88431f3770388afd22c6856b2ad17c994783ee (patch)
tree: 934f74c09ac9293269d6e38cac3e9b9359da49b5 /streamingvisitors
parent: 242fee291a7aefab01f8d22e2059d57201d66c10 (diff)
Add regular expression support to streaming search
Introduces an explicit regex query term node (which wraps an RE2 regex instance internally) and extends the existing UTF-8 flexible string searcher to use this query node. Regex matching is optionally case (in)sensitive depending on the normalization mode used. Note on `searcher/searcher_test.cpp`: this adds a magic sentinel `#` char prefix to query term parsing in the test to let a query term be interpreted as a regex rather than exact/prefix/suffix/substring match.
Diffstat (limited to 'streamingvisitors')
-rw-r--r-- streamingvisitors/src/tests/searcher/searcher_test.cpp 49
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp 10
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp 16
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h 6
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp 3
5 files changed, 75 insertions, 9 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 7f89071868a..791ed3ba787 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -3,6 +3,7 @@
#include <vespa/vespalib/testkit/testapp.h>
#include <vespa/document/fieldvalue/fieldvalues.h>
+#include <vespa/searchlib/query/streaming/regexp_term.h>
#include <vespa/searchlib/query/streaming/queryterm.h>
#include <vespa/vsm/searcher/boolfieldsearcher.h>
#include <vespa/vsm/searcher/fieldsearcher.h>
@@ -21,6 +22,7 @@
using namespace document;
using search::streaming::HitList;
using search::streaming::QueryNodeResultFactory;
+using search::streaming::RegexpTerm;
using search::streaming::QueryTerm;
using search::streaming::Normalizing;
using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
@@ -63,7 +65,12 @@ private:
for (const auto & term : terms) {
ParsedQueryTerm pqt = parseQueryTerm(term);
ParsedTerm pt = parseTerm(pqt.second);
- qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second, normalizing));
+ std::string effective_index = pqt.first.empty() ? "index" : pqt.first;
+ if (pt.second != TermType::REGEXP) {
+ qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
+ } else {
+ qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
+ }
}
for (const auto & i : qtv) {
qtl.push_back(i.get());
@@ -91,6 +98,8 @@ public:
return std::make_pair(term.substr(1, term.size() - 2), TermType::SUBSTRINGTERM);
} else if (term[0] == '*') {
return std::make_pair(term.substr(1, term.size() - 1), TermType::SUFFIXTERM);
+ } else if (term[0] == '#') { // magic regex enabler
+ return std::make_pair(term.substr(1), TermType::REGEXP);
} else if (term[term.size() - 1] == '*') {
return std::make_pair(term.substr(0, term.size() - 1), TermType::PREFIXTERM);
} else {
@@ -479,6 +488,8 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
ASSERT_TRUE(Query::parseTerm("*suffix").second == TermType::SUFFIXTERM);
ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix");
ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM);
+ ASSERT_TRUE(Query::parseTerm("#regex").first == "regex");
+ ASSERT_TRUE(Query::parseTerm("#regex").second == TermType::REGEXP);
ASSERT_TRUE(Query::parseTerm("term").first == "term");
ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD);
}
@@ -582,7 +593,7 @@ TEST("utf8 exact match") {
TEST_DO(assertString(fs, "hütte", "hütter", Hits()));
}
-TEST("utf8 flexible searcher"){
+TEST("utf8 flexible searcher (except regex)"){
UTF8FlexibleStringFieldSearcher fs(0);
// regular
assertString(fs, "vespa", "vespa", Hits().add(0));
@@ -611,6 +622,38 @@ TEST("utf8 flexible searcher"){
EXPECT_TRUE(testStringFieldInfo(fs));
}
+TEST("utf8 flexible searcher handles regex and by default has case-insensitive partial match semantics") { // runs with the searcher's default normalize mode
+ UTF8FlexibleStringFieldSearcher fs(0);
+ // Note: the # term prefix is a magic term-as-regex symbol used only for tests in this file
+ TEST_DO(assertString(fs, "#abc", "ABC", Hits().add(0))); // lower-case pattern still hits an upper-case field
+ TEST_DO(assertString(fs, "#bc", "ABC", Hits().add(0))); // pattern covers only the tail of the field
+ TEST_DO(assertString(fs, "#ab", "ABC", Hits().add(0))); // pattern covers only the head of the field
+ TEST_DO(assertString(fs, "#[a-z]", "ABC", Hits().add(0))); // lower-case character class still hits upper-case letters
+ TEST_DO(assertString(fs, "#(zoid)(berg)", "why not zoidberg?", Hits().add(0))); // grouped pattern matching in the middle of the field
+ TEST_DO(assertString(fs, "#[a-z]", "123", Hits())); // field has no letters, so no hit is expected
+}
+
+TEST("utf8 flexible searcher handles case-sensitive regex matching") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ fs.normalize_mode(Normalizing::NONE); // the remaining assertions expect case-sensitive matching under this mode
+ TEST_DO(assertString(fs, "#abc", "ABC", Hits())); // case differs, so no hit
+ TEST_DO(assertString(fs, "#abc", "abc", Hits().add(0))); // same case, so a hit
+ TEST_DO(assertString(fs, "#[A-Z]", "A", Hits().add(0))); // single upper-case letter satisfies the class
+ TEST_DO(assertString(fs, "#[A-Z]", "ABC", Hits().add(0))); // several possible match positions still yield exactly one hit
+ TEST_DO(assertString(fs, "#[A-Z]", "abc", Hits())); // upper-case class must not hit lower-case letters
+}
+
+TEST("utf8 flexible searcher handles regexes with explicit anchoring") { // ^ and $ written in the pattern restrict the otherwise partial match
+ UTF8FlexibleStringFieldSearcher fs(0);
+ TEST_DO(assertString(fs, "#^foo", "food", Hits().add(0))); // ^ : field starts with "foo"
+ TEST_DO(assertString(fs, "#^foo", "afoo", Hits())); // ^ : "foo" is present but not at the start
+ TEST_DO(assertString(fs, "#foo$", "afoo", Hits().add(0))); // $ : field ends with "foo"
+ TEST_DO(assertString(fs, "#foo$", "food", Hits())); // $ : "foo" is present but not at the end
+ TEST_DO(assertString(fs, "#^foo$", "foo", Hits().add(0))); // both anchors: the whole field must be "foo"
+ TEST_DO(assertString(fs, "#^foo$", "food", Hits())); // trailing extra character breaks the full match
+ TEST_DO(assertString(fs, "#^foo$", "oo", Hits())); // field shorter than the pattern's literal text
+}
+
TEST("bool search") {
BoolFieldSearcher fs(0);
TEST_DO(assertBool(fs, "true", true, true));
@@ -635,6 +678,8 @@ TEST("integer search")
TEST_DO(assertInt(fs, "<11", 10, true));
TEST_DO(assertInt(fs, "<11", 11, false));
TEST_DO(assertInt(fs, "-10", -10, true));
+ TEST_DO(assertInt(fs, "10", -10, false));
+ TEST_DO(assertInt(fs, "-10", 10, false));
TEST_DO(assertInt(fs, "-9", -10, false));
TEST_DO(assertInt(fs, "a", 10, false));
TEST_DO(assertInt(fs, "[-5;5]", -5, true));
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
index c1fa6090021..c0a0249125f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
@@ -17,7 +17,7 @@ void StrChrFieldSearcher::prepare(search::streaming::QueryTermList& qtl,
void StrChrFieldSearcher::onValue(const document::FieldValue & fv)
{
- const document::LiteralFieldValueB & sfv = static_cast<const document::LiteralFieldValueB &>(fv);
+ const auto & sfv = static_cast<const document::LiteralFieldValueB &>(fv);
vespalib::stringref val = sfv.getValueRef();
FieldRef fr(val.data(), std::min(maxFieldLength(), val.size()));
matchDoc(fr);
@@ -25,7 +25,6 @@ void StrChrFieldSearcher::onValue(const document::FieldValue & fv)
bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
{
- bool retval(true);
if (_qtl.size() > 1) {
size_t mintsz = shortestTerm();
if (fieldRef.size() >= mintsz) {
@@ -35,14 +34,14 @@ bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
}
} else {
for (auto qt : _qtl) {
- if (fieldRef.size() >= qt->termLen()) {
+ if (fieldRef.size() >= qt->termLen() || qt->isRegex()) {
_words += matchTerm(fieldRef, *qt);
} else {
_words += countWords(fieldRef);
}
}
}
- return retval;
+ return true;
}
size_t StrChrFieldSearcher::shortestTerm() const
@@ -50,6 +49,9 @@ size_t StrChrFieldSearcher::shortestTerm() const
size_t mintsz(_qtl.front()->termLen());
for (auto it=_qtl.begin()+1, mt=_qtl.end(); it != mt; it++) {
const QueryTerm & qt = **it;
+ if (qt.isRegex()) {
+ return 0; // Must avoid "too short query term" optimization when using regex
+ }
mintsz = std::min(mintsz, qt.termLen());
}
return mintsz;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
index 78f491198ad..c6deb6eacd1 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
@@ -1,5 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8flexiblestringfieldsearcher.h"
+#include <vespa/searchlib/query/streaming/regexp_term.h>
+#include <cassert>
#include <vespa/log/log.h>
LOG_SETUP(".vsm.searcher.utf8flexiblestringfieldsearcher");
@@ -27,6 +29,17 @@ UTF8FlexibleStringFieldSearcher::matchTerms(const FieldRef & f, const size_t min
}
size_t
+UTF8FlexibleStringFieldSearcher::match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt) // regex branch of matchTerm(): records a hit on qt if its regex matches f, returns countWords(f)
+{
+ auto* regexp_term = qt.as_regexp_term();
+ assert(regexp_term != nullptr); // qt must be a regexp term here; matchTerm() only dispatches to this function when qt.isRegex()
+ if (regexp_term->regexp().partial_match({f.data(), f.size()})) { // partial match over the field's bytes; the pattern needs its own ^/$ to require a full match
+ addHit(qt, 0); // at most one hit per call, always with 0 as the second argument (presumably the position - confirm against addHit)
+ }
+ return countWords(f); // returned whether or not the regex matched
+}
+
+size_t
UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
{
if (qt.isPrefix()) {
@@ -41,6 +54,9 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
} else if (qt.isExactstring()) {
LOG(debug, "Use exact match for exact term '%s:%s'", qt.index().c_str(), qt.getTerm());
return matchTermExact(f, qt);
+ } else if (qt.isRegex()) {
+ LOG(debug, "Use regexp match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
+ return match_regexp(f, qt);
} else {
if (substring()) {
LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
index bb1b55dffe4..cd1715ad158 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
@@ -14,16 +14,18 @@ class UTF8FlexibleStringFieldSearcher : public UTF8StringFieldSearcherBase
private:
/**
* Tries to match the given query term against the content of the given field reference.
- * Search strategy is choosen based on the query term type.
+ * Search strategy is chosen based on the query term type.
**/
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
/**
* Tries to match each query term in the underlying query against the content of the given field reference.
- * Search strategy is choosen based on the query term type.
+ * Search strategy is chosen based on the query term type.
**/
size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
+ size_t match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt);
+
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId);
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index 715c19a0bb7..468d8e0145a 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -134,7 +134,8 @@ FieldSearchSpec::reconfig(const QueryTerm & term)
if ((term.isSubstring() && _arg1 != "substring") ||
(term.isSuffix() && _arg1 != "suffix") ||
(term.isExactstring() && _arg1 != "exact") ||
- (term.isPrefix() && _arg1 == "suffix"))
+ (term.isPrefix() && _arg1 == "suffix") ||
+ term.isRegex())
{
_searcher = std::make_unique<UTF8FlexibleStringFieldSearcher>(id());
// preserve the basic match property of the searcher