summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt1
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/querynode.cpp8
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp6
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/queryterm.h2
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/regexp_term.cpp27
-rw-r--r--searchlib/src/vespa/searchlib/query/streaming/regexp_term.h25
-rw-r--r--streamingvisitors/src/tests/searcher/searcher_test.cpp49
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp10
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp16
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h6
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp3
11 files changed, 143 insertions, 10 deletions
diff --git a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
index 6b9be2e3269..05a75f4662e 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/query/streaming/CMakeLists.txt
@@ -11,5 +11,6 @@ vespa_add_library(searchlib_query_streaming OBJECT
queryterm.cpp
wand_term.cpp
weighted_set_term.cpp
+ regexp_term.cpp
DEPENDS
)
diff --git a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
index 1ce80660d46..2ee515f062a 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/querynode.cpp
@@ -2,6 +2,7 @@
#include "query.h"
#include "nearest_neighbor_query_node.h"
+#include "regexp_term.h"
#include <vespa/searchlib/parsequery/stackdumpiterator.h>
#include <vespa/searchlib/query/streaming/dot_product_term.h>
#include <vespa/searchlib/query/streaming/in_term.h>
@@ -145,7 +146,12 @@ QueryNode::Build(const QueryNode * parent, const QueryNodeResultFactory & factor
qn = std::make_unique<TrueNode>();
} else {
Normalizing normalize_mode = factory.normalizing_mode(ssIndex);
- auto qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
+ std::unique_ptr<QueryTerm> qt;
+ if (sTerm != TermType::REGEXP) {
+ qt = std::make_unique<QueryTerm>(factory.create(), ssTerm, ssIndex, sTerm, normalize_mode);
+ } else {
+ qt = std::make_unique<RegexpTerm>(factory.create(), ssTerm, ssIndex, TermType::REGEXP, normalize_mode);
+ }
qt->setWeight(queryRep.GetWeight());
qt->setUniqueId(queryRep.getUniqueId());
if (qt->isFuzzy()) {
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
index 3950a179d67..3e05d381ee2 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.cpp
@@ -179,4 +179,10 @@ QueryTerm::as_multi_term() noexcept
return nullptr;
}
+RegexpTerm*
+QueryTerm::as_regexp_term() noexcept
+{
+ return nullptr; // base-class default; RegexpTerm overrides this to return itself
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
index 743998a630e..cd2bdd7eaec 100644
--- a/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
+++ b/searchlib/src/vespa/searchlib/query/streaming/queryterm.h
@@ -13,6 +13,7 @@ namespace search::streaming {
class NearestNeighborQueryNode;
class MultiTerm;
+class RegexpTerm;
/**
This is a leaf in the Query tree. All terms are leafs.
@@ -93,6 +94,7 @@ public:
void setFuzzyPrefixLength(uint32_t fuzzyPrefixLength) { _fuzzyPrefixLength = fuzzyPrefixLength; }
virtual NearestNeighborQueryNode* as_nearest_neighbor_query_node() noexcept;
virtual MultiTerm* as_multi_term() noexcept;
+ virtual RegexpTerm* as_regexp_term() noexcept;
protected:
using QueryNodeResultBaseContainer = std::unique_ptr<QueryNodeResultBase>;
string _index;
diff --git a/searchlib/src/vespa/searchlib/query/streaming/regexp_term.cpp b/searchlib/src/vespa/searchlib/query/streaming/regexp_term.cpp
new file mode 100644
index 00000000000..4508caa7072
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/regexp_term.cpp
@@ -0,0 +1,27 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "regexp_term.h"
+
+namespace search::streaming {
+
+using vespalib::Regex;
+
+namespace {
+
+constexpr Regex::Options normalize_mode_to_regex_opts(Normalizing norm) noexcept {
+    // Only Normalizing::NONE yields case-sensitive matching; every other mode ignores case.
+    const bool case_sensitive = (norm == Normalizing::NONE);
+    return case_sensitive ? Regex::Options::None : Regex::Options::IgnoreCase;
+}
+
+}
+
+RegexpTerm::RegexpTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
+ const string& index, Type type, Normalizing normalizing)
+ : QueryTerm(std::move(result_base), term, index, type, normalizing), // all arguments are forwarded unchanged to the base term
+ _regexp(Regex::from_pattern({term.data(), term.size()}, normalize_mode_to_regex_opts(normalizing))) // regex built from the term text; options derived from the normalizing mode
+{
+}
+
+RegexpTerm::~RegexpTerm() = default;
+
+}
diff --git a/searchlib/src/vespa/searchlib/query/streaming/regexp_term.h b/searchlib/src/vespa/searchlib/query/streaming/regexp_term.h
new file mode 100644
index 00000000000..96d14eeb0bd
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/query/streaming/regexp_term.h
@@ -0,0 +1,25 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "queryterm.h"
+#include <vespa/vespalib/regex/regex.h>
+
+namespace search::streaming {
+
+/**
+ * Query term that carries a vespalib::Regex and matches fields using that regular expression,
+ * with case sensitivity controlled by the Normalizing mode provided to the constructor.
+ */
+class RegexpTerm : public QueryTerm {
+ vespalib::Regex _regexp; // exposed read-only through regexp()
+public:
+ RegexpTerm(std::unique_ptr<QueryNodeResultBase> result_base, stringref term,
+ const string& index, Type type, Normalizing normalizing);
+ ~RegexpTerm() override;
+
+ RegexpTerm* as_regexp_term() noexcept override { return this; } // lets callers holding a QueryTerm reach the regex without a cast
+
+ [[nodiscard]] const vespalib::Regex& regexp() const noexcept { return _regexp; } // reference to the member; valid while this term is alive
+};
+
+}
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 7f89071868a..791ed3ba787 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -3,6 +3,7 @@
#include <vespa/vespalib/testkit/testapp.h>
#include <vespa/document/fieldvalue/fieldvalues.h>
+#include <vespa/searchlib/query/streaming/regexp_term.h>
#include <vespa/searchlib/query/streaming/queryterm.h>
#include <vespa/vsm/searcher/boolfieldsearcher.h>
#include <vespa/vsm/searcher/fieldsearcher.h>
@@ -21,6 +22,7 @@
using namespace document;
using search::streaming::HitList;
using search::streaming::QueryNodeResultFactory;
+using search::streaming::RegexpTerm;
using search::streaming::QueryTerm;
using search::streaming::Normalizing;
using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
@@ -63,7 +65,12 @@ private:
for (const auto & term : terms) {
ParsedQueryTerm pqt = parseQueryTerm(term);
ParsedTerm pt = parseTerm(pqt.second);
- qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second, normalizing));
+ std::string effective_index = pqt.first.empty() ? "index" : pqt.first;
+ if (pt.second != TermType::REGEXP) {
+ qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
+ } else {
+ qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing));
+ }
}
for (const auto & i : qtv) {
qtl.push_back(i.get());
@@ -91,6 +98,8 @@ public:
return std::make_pair(term.substr(1, term.size() - 2), TermType::SUBSTRINGTERM);
} else if (term[0] == '*') {
return std::make_pair(term.substr(1, term.size() - 1), TermType::SUFFIXTERM);
+ } else if (term[0] == '#') { // magic regex enabler
+ return std::make_pair(term.substr(1), TermType::REGEXP);
} else if (term[term.size() - 1] == '*') {
return std::make_pair(term.substr(0, term.size() - 1), TermType::PREFIXTERM);
} else {
@@ -479,6 +488,8 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
ASSERT_TRUE(Query::parseTerm("*suffix").second == TermType::SUFFIXTERM);
ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix");
ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM);
+ ASSERT_TRUE(Query::parseTerm("#regex").first == "regex");
+ ASSERT_TRUE(Query::parseTerm("#regex").second == TermType::REGEXP);
ASSERT_TRUE(Query::parseTerm("term").first == "term");
ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD);
}
@@ -582,7 +593,7 @@ TEST("utf8 exact match") {
TEST_DO(assertString(fs, "hütte", "hütter", Hits()));
}
-TEST("utf8 flexible searcher"){
+TEST("utf8 flexible searcher (except regex)"){
UTF8FlexibleStringFieldSearcher fs(0);
// regular
assertString(fs, "vespa", "vespa", Hits().add(0));
@@ -611,6 +622,38 @@ TEST("utf8 flexible searcher"){
EXPECT_TRUE(testStringFieldInfo(fs));
}
+TEST("utf8 flexible searcher handles regex and by default has case-insensitive partial match semantics") {
+ UTF8FlexibleStringFieldSearcher fs(0); // normalize mode is left at the searcher's default
+ // Note: the # term prefix is a magic term-as-regex symbol used only for tests in this file
+ TEST_DO(assertString(fs, "#abc", "ABC", Hits().add(0))); // pattern covers the whole field, differing only in case
+ TEST_DO(assertString(fs, "#bc", "ABC", Hits().add(0))); // unanchored pattern found at the end of the field
+ TEST_DO(assertString(fs, "#ab", "ABC", Hits().add(0))); // unanchored pattern found at the start of the field
+ TEST_DO(assertString(fs, "#[a-z]", "ABC", Hits().add(0))); // lowercase character class still hits uppercase input
+ TEST_DO(assertString(fs, "#(zoid)(berg)", "why not zoidberg?", Hits().add(0))); // capture groups, match in the middle of the field
+ TEST_DO(assertString(fs, "#[a-z]", "123", Hits())); // no letters in the field => no hit
+}
+
+TEST("utf8 flexible searcher handles case-sensitive regex matching") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ fs.normalize_mode(Normalizing::NONE); // NONE is the mode under which this test expects case-sensitive matching
+ TEST_DO(assertString(fs, "#abc", "ABC", Hits())); // case differs => no hit
+ TEST_DO(assertString(fs, "#abc", "abc", Hits().add(0))); // same case => hit
+ TEST_DO(assertString(fs, "#[A-Z]", "A", Hits().add(0))); // single uppercase character satisfies the class
+ TEST_DO(assertString(fs, "#[A-Z]", "ABC", Hits().add(0))); // several candidate characters still yield one expected hit
+ TEST_DO(assertString(fs, "#[A-Z]", "abc", Hits())); // uppercase class must not hit lowercase input
+}
+
+TEST("utf8 flexible searcher handles regexes with explicit anchoring") {
+ UTF8FlexibleStringFieldSearcher fs(0);
+ TEST_DO(assertString(fs, "#^foo", "food", Hits().add(0))); // ^ : field starts with the pattern
+ TEST_DO(assertString(fs, "#^foo", "afoo", Hits())); // ^ : pattern present but not at the start
+ TEST_DO(assertString(fs, "#foo$", "afoo", Hits().add(0))); // $ : field ends with the pattern
+ TEST_DO(assertString(fs, "#foo$", "food", Hits())); // $ : pattern present but not at the end
+ TEST_DO(assertString(fs, "#^foo$", "foo", Hits().add(0))); // both anchors: the whole field must equal the pattern
+ TEST_DO(assertString(fs, "#^foo$", "food", Hits())); // both anchors: trailing extra character => no hit
+ TEST_DO(assertString(fs, "#^foo$", "oo", Hits())); // both anchors: field shorter than the pattern => no hit
+}
+
TEST("bool search") {
BoolFieldSearcher fs(0);
TEST_DO(assertBool(fs, "true", true, true));
@@ -635,6 +678,8 @@ TEST("integer search")
TEST_DO(assertInt(fs, "<11", 10, true));
TEST_DO(assertInt(fs, "<11", 11, false));
TEST_DO(assertInt(fs, "-10", -10, true));
+ TEST_DO(assertInt(fs, "10", -10, false));
+ TEST_DO(assertInt(fs, "-10", 10, false));
TEST_DO(assertInt(fs, "-9", -10, false));
TEST_DO(assertInt(fs, "a", 10, false));
TEST_DO(assertInt(fs, "[-5;5]", -5, true));
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
index c1fa6090021..c0a0249125f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
@@ -17,7 +17,7 @@ void StrChrFieldSearcher::prepare(search::streaming::QueryTermList& qtl,
void StrChrFieldSearcher::onValue(const document::FieldValue & fv)
{
- const document::LiteralFieldValueB & sfv = static_cast<const document::LiteralFieldValueB &>(fv);
+ const auto & sfv = static_cast<const document::LiteralFieldValueB &>(fv);
vespalib::stringref val = sfv.getValueRef();
FieldRef fr(val.data(), std::min(maxFieldLength(), val.size()));
matchDoc(fr);
@@ -25,7 +25,6 @@ void StrChrFieldSearcher::onValue(const document::FieldValue & fv)
bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
{
- bool retval(true);
if (_qtl.size() > 1) {
size_t mintsz = shortestTerm();
if (fieldRef.size() >= mintsz) {
@@ -35,14 +34,14 @@ bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
}
} else {
for (auto qt : _qtl) {
- if (fieldRef.size() >= qt->termLen()) {
+ if (fieldRef.size() >= qt->termLen() || qt->isRegex()) {
_words += matchTerm(fieldRef, *qt);
} else {
_words += countWords(fieldRef);
}
}
}
- return retval;
+ return true;
}
size_t StrChrFieldSearcher::shortestTerm() const
@@ -50,6 +49,9 @@ size_t StrChrFieldSearcher::shortestTerm() const
size_t mintsz(_qtl.front()->termLen());
for (auto it=_qtl.begin()+1, mt=_qtl.end(); it != mt; it++) {
const QueryTerm & qt = **it;
+ if (qt.isRegex() || _qtl.front()->isRegex()) { // the loop starts at begin()+1, so front() must be tested here too
+ return 0; // Must avoid "too short query term" optimization when ANY term (including the first) is a regex
+ }
mintsz = std::min(mintsz, qt.termLen());
}
return mintsz;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
index 78f491198ad..c6deb6eacd1 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
@@ -1,5 +1,7 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8flexiblestringfieldsearcher.h"
+#include <vespa/searchlib/query/streaming/regexp_term.h>
+#include <cassert>
#include <vespa/log/log.h>
LOG_SETUP(".vsm.searcher.utf8flexiblestringfieldsearcher");
@@ -27,6 +29,17 @@ UTF8FlexibleStringFieldSearcher::matchTerms(const FieldRef & f, const size_t min
}
size_t
+UTF8FlexibleStringFieldSearcher::match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt)
+{
+    // matchTerm() routes only regex terms here; such a term is required to be a RegexpTerm.
+    auto* const term_as_regexp = qt.as_regexp_term();
+    assert(term_as_regexp != nullptr);
+    const auto& pattern = term_as_regexp->regexp();
+    if (pattern.partial_match({f.data(), f.size()})) { addHit(qt, 0); } // addHit always gets 0, wherever the match occurred
+    return countWords(f);
+}
+
+size_t
UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
{
if (qt.isPrefix()) {
@@ -41,6 +54,9 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
} else if (qt.isExactstring()) {
LOG(debug, "Use exact match for exact term '%s:%s'", qt.index().c_str(), qt.getTerm());
return matchTermExact(f, qt);
+ } else if (qt.isRegex()) {
+ LOG(debug, "Use regexp match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
+ return match_regexp(f, qt);
} else {
if (substring()) {
LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
index bb1b55dffe4..cd1715ad158 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
@@ -14,16 +14,18 @@ class UTF8FlexibleStringFieldSearcher : public UTF8StringFieldSearcherBase
private:
/**
* Tries to match the given query term against the content of the given field reference.
- * Search strategy is choosen based on the query term type.
+ * Search strategy is chosen based on the query term type.
**/
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
/**
* Tries to match each query term in the underlying query against the content of the given field reference.
- * Search strategy is choosen based on the query term type.
+ * Search strategy is chosen based on the query term type.
**/
size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
+ size_t match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt);
+
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId);
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index 715c19a0bb7..468d8e0145a 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -134,7 +134,8 @@ FieldSearchSpec::reconfig(const QueryTerm & term)
if ((term.isSubstring() && _arg1 != "substring") ||
(term.isSuffix() && _arg1 != "suffix") ||
(term.isExactstring() && _arg1 != "exact") ||
- (term.isPrefix() && _arg1 == "suffix"))
+ (term.isPrefix() && _arg1 == "suffix") ||
+ term.isRegex())
{
_searcher = std::make_unique<UTF8FlexibleStringFieldSearcher>(id());
// preserve the basic match property of the searcher