diff options
Diffstat (limited to 'streamingvisitors')
5 files changed, 75 insertions, 9 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 7f89071868a..791ed3ba787 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -3,6 +3,7 @@ #include <vespa/vespalib/testkit/testapp.h> #include <vespa/document/fieldvalue/fieldvalues.h> +#include <vespa/searchlib/query/streaming/regexp_term.h> #include <vespa/searchlib/query/streaming/queryterm.h> #include <vespa/vsm/searcher/boolfieldsearcher.h> #include <vespa/vsm/searcher/fieldsearcher.h> @@ -21,6 +22,7 @@ using namespace document; using search::streaming::HitList; using search::streaming::QueryNodeResultFactory; +using search::streaming::RegexpTerm; using search::streaming::QueryTerm; using search::streaming::Normalizing; using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; @@ -63,7 +65,12 @@ private: for (const auto & term : terms) { ParsedQueryTerm pqt = parseQueryTerm(term); ParsedTerm pt = parseTerm(pqt.second); - qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second, normalizing)); + std::string effective_index = pqt.first.empty() ? "index" : pqt.first; + if (pt.second != TermType::REGEXP) { + qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing)); + } else { + qtv.push_back(std::make_unique<RegexpTerm>(eqnr.create(), pt.first, effective_index, pt.second, normalizing)); + } } for (const auto & i : qtv) { qtl.push_back(i.get()); @@ -91,6 +98,8 @@ public: return std::make_pair(term.substr(1, term.size() - 2), TermType::SUBSTRINGTERM); } else if (term[0] == '*') { return std::make_pair(term.substr(1, term.size() - 1), TermType::SUFFIXTERM); + } else if (term[0] == '#') { // magic regex enabler + return std::make_pair(term.substr(1), TermType::REGEXP); } else if (term[term.size() - 1] == '*') { return std::make_pair(term.substr(0, term.size() - 1), TermType::PREFIXTERM); } else { @@ -479,6 +488,8 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) ASSERT_TRUE(Query::parseTerm("*suffix").second == TermType::SUFFIXTERM); ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix"); ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM); + ASSERT_TRUE(Query::parseTerm("#regex").first == "regex"); + ASSERT_TRUE(Query::parseTerm("#regex").second == TermType::REGEXP); ASSERT_TRUE(Query::parseTerm("term").first == "term"); ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD); } @@ -582,7 +593,7 @@ TEST("utf8 exact match") { TEST_DO(assertString(fs, "hütte", "hütter", Hits())); } -TEST("utf8 flexible searcher"){ +TEST("utf8 flexible searcher (except regex)"){ UTF8FlexibleStringFieldSearcher fs(0); // regular assertString(fs, "vespa", "vespa", Hits().add(0)); @@ -611,6 +622,38 @@ TEST("utf8 flexible searcher"){ EXPECT_TRUE(testStringFieldInfo(fs)); } +TEST("utf8 flexible searcher handles regex and by default has case-insensitive partial match semantics") { + UTF8FlexibleStringFieldSearcher fs(0); + // Note: the # term prefix is a magic term-as-regex symbol used only for tests in this file + TEST_DO(assertString(fs, "#abc", "ABC", Hits().add(0))); + TEST_DO(assertString(fs, "#bc", "ABC", Hits().add(0))); + TEST_DO(assertString(fs, "#ab", "ABC", Hits().add(0))); + TEST_DO(assertString(fs, "#[a-z]", "ABC", Hits().add(0))); + TEST_DO(assertString(fs, "#(zoid)(berg)", "why not zoidberg?", Hits().add(0))); + TEST_DO(assertString(fs, "#[a-z]", "123", Hits())); +} + +TEST("utf8 flexible searcher handles case-sensitive regex matching") { + UTF8FlexibleStringFieldSearcher fs(0); + fs.normalize_mode(Normalizing::NONE); + TEST_DO(assertString(fs, "#abc", "ABC", Hits())); + TEST_DO(assertString(fs, "#abc", "abc", Hits().add(0))); + TEST_DO(assertString(fs, "#[A-Z]", "A", Hits().add(0))); + TEST_DO(assertString(fs, "#[A-Z]", "ABC", Hits().add(0))); + TEST_DO(assertString(fs, "#[A-Z]", "abc", Hits())); +} + +TEST("utf8 flexible searcher handles regexes with explicit anchoring") { + UTF8FlexibleStringFieldSearcher fs(0); + TEST_DO(assertString(fs, "#^foo", "food", Hits().add(0))); + TEST_DO(assertString(fs, "#^foo", "afoo", Hits())); + TEST_DO(assertString(fs, "#foo$", "afoo", Hits().add(0))); + TEST_DO(assertString(fs, "#foo$", "food", Hits())); + TEST_DO(assertString(fs, "#^foo$", "foo", Hits().add(0))); + TEST_DO(assertString(fs, "#^foo$", "food", Hits())); + TEST_DO(assertString(fs, "#^foo$", "oo", Hits())); +} + TEST("bool search") { BoolFieldSearcher fs(0); TEST_DO(assertBool(fs, "true", true, true)); @@ -635,6 +678,8 @@ TEST("integer search") TEST_DO(assertInt(fs, "<11", 10, true)); TEST_DO(assertInt(fs, "<11", 11, false)); TEST_DO(assertInt(fs, "-10", -10, true)); + TEST_DO(assertInt(fs, "10", -10, false)); + TEST_DO(assertInt(fs, "-10", 10, false)); TEST_DO(assertInt(fs, "-9", -10, false)); TEST_DO(assertInt(fs, "a", 10, false)); TEST_DO(assertInt(fs, "[-5;5]", -5, true)); diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp index c1fa6090021..c0a0249125f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp @@ -17,7 +17,7 @@ void StrChrFieldSearcher::prepare(search::streaming::QueryTermList& qtl, void StrChrFieldSearcher::onValue(const document::FieldValue & fv) { - const document::LiteralFieldValueB & sfv = static_cast<const document::LiteralFieldValueB &>(fv); + const auto & sfv = static_cast<const document::LiteralFieldValueB &>(fv); vespalib::stringref val = sfv.getValueRef(); FieldRef fr(val.data(), std::min(maxFieldLength(), val.size())); matchDoc(fr); @@ -25,7 +25,6 @@ void StrChrFieldSearcher::onValue(const document::FieldValue & fv) bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef) { - bool retval(true); if (_qtl.size() > 1) { size_t mintsz = shortestTerm(); if (fieldRef.size() >= mintsz) { @@ -35,14 +34,14 @@ bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef) } } else { for (auto qt : _qtl) { - if (fieldRef.size() >= qt->termLen()) { + if (fieldRef.size() >= qt->termLen() || qt->isRegex()) { _words += matchTerm(fieldRef, *qt); } else { _words += countWords(fieldRef); } } } - return retval; + return true; } size_t StrChrFieldSearcher::shortestTerm() const @@ -50,6 +49,9 @@ size_t StrChrFieldSearcher::shortestTerm() const size_t mintsz(_qtl.front()->termLen()); for (auto it=_qtl.begin()+1, mt=_qtl.end(); it != mt; it++) { const QueryTerm & qt = **it; + if (qt.isRegex()) { + return 0; // Must avoid "too short query term" optimization when using regex + } mintsz = std::min(mintsz, qt.termLen()); } return mintsz; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp index 78f491198ad..c6deb6eacd1 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp @@ -1,5 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8flexiblestringfieldsearcher.h" +#include <vespa/searchlib/query/streaming/regexp_term.h> +#include <cassert> #include <vespa/log/log.h> LOG_SETUP(".vsm.searcher.utf8flexiblestringfieldsearcher"); @@ -27,6 +29,17 @@ UTF8FlexibleStringFieldSearcher::matchTerms(const FieldRef & f, const size_t min } size_t +UTF8FlexibleStringFieldSearcher::match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt) +{ + auto* regexp_term = qt.as_regexp_term(); + assert(regexp_term != nullptr); + if (regexp_term->regexp().partial_match({f.data(), f.size()})) { + addHit(qt, 0); + } + return countWords(f); +} + +size_t UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) { if (qt.isPrefix()) { @@ -41,6 +54,9 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) } else if (qt.isExactstring()) { LOG(debug, "Use exact match for exact term '%s:%s'", qt.index().c_str(), qt.getTerm()); return matchTermExact(f, qt); + } else if (qt.isRegex()) { + LOG(debug, "Use regexp match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return match_regexp(f, qt); } else { if (substring()) { LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h index bb1b55dffe4..cd1715ad158 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h @@ -14,16 +14,18 @@ class UTF8FlexibleStringFieldSearcher : public UTF8StringFieldSearcherBase private: /** * Tries to match the given query term against the content of the given field reference. - * Search strategy is choosen based on the query term type. + * Search strategy is chosen based on the query term type. **/ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; /** * Tries to match each query term in the underlying query against the content of the given field reference. - * Search strategy is choosen based on the query term type. + * Search strategy is chosen based on the query term type. **/ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; + size_t match_regexp(const FieldRef & f, search::streaming::QueryTerm & qt); + public: std::unique_ptr<FieldSearcher> duplicate() const override; explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId); diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index 715c19a0bb7..468d8e0145a 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -134,7 +134,8 @@ FieldSearchSpec::reconfig(const QueryTerm & term) if ((term.isSubstring() && _arg1 != "substring") || (term.isSuffix() && _arg1 != "suffix") || (term.isExactstring() && _arg1 != "exact") || - (term.isPrefix() && _arg1 == "suffix")) + (term.isPrefix() && _arg1 == "suffix") || + term.isRegex()) { _searcher = std::make_unique<UTF8FlexibleStringFieldSearcher>(id()); // preserve the basic match property of the searcher |