summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2020-02-27 11:42:04 +0000
committerTor Brede Vekterli <vekterli@verizonmedia.com>2020-03-04 10:42:45 +0100
commit24843614ecb8bbbd148ff00f1775443725652e05 (patch)
tree3997a975b43420cacab8d52d81c1b03c1acf9be1
parent82d960e4f947fba587639c7f70e51d3f700c01b8 (diff)
Use Google RE2 as underlying regex engine
This introduces guaranteed upper bounds for memory usage and CPU time during regex evaluation. Most importantly, it removes the danger of catastrophic backtracking that is currently present in GCC's std::regex implementation.
With this commit, RE2 will be used instead of std::regex for:
* Document selection regex/glob operators
* Attribute regex search
* Evaluation of mTLS authorization rules
-rw-r--r--CMakeLists.txt6
-rw-r--r--cmake/FindRE2.cmake19
-rw-r--r--document/src/tests/documentselectparsertest.cpp15
-rw-r--r--document/src/vespa/document/select/operator.cpp52
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h6
-rw-r--r--searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/stringbase.cpp5
-rw-r--r--searchlib/src/vespa/searchlib/attribute/stringbase.h12
-rw-r--r--vespalib/CMakeLists.txt1
-rw-r--r--vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp3
-rw-r--r--vespalib/src/tests/regex/regex.cpp157
-rw-r--r--vespalib/src/vespa/vespalib/CMakeLists.txt3
-rw-r--r--vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp40
-rw-r--r--vespalib/src/vespa/vespalib/net/tls/peer_policies.h6
-rw-r--r--vespalib/src/vespa/vespalib/regex/CMakeLists.txt10
-rw-r--r--vespalib/src/vespa/vespalib/regex/regex.cpp88
-rw-r--r--vespalib/src/vespa/vespalib/regex/regex.h69
-rw-r--r--vespalib/src/vespa/vespalib/util/regexp.cpp6
-rw-r--r--vespalib/src/vespa/vespalib/util/regexp.h2
21 files changed, 398 insertions, 110 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8aa853e8c39..906a00ad843 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,11 @@
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
include(functions.cmake)
-list(APPEND CMAKE_MODULE_PATH "$ENV{HOME}/share/cmake/Modules" "/opt/vespa-deps/share/cmake/Modules")
+list(APPEND CMAKE_MODULE_PATH
+ "$ENV{HOME}/share/cmake/Modules"
+ "/opt/vespa-deps/share/cmake/Modules"
+ "${CMAKE_CURRENT_SOURCE_DIR}/cmake"
+)
include(default_build_settings.cmake)
vespa_detect_build_platform()
message("-- Vespa build platform is ${VESPA_OS_DISTRO} ${VESPA_OS_DISTRO_VERSION}")
diff --git a/cmake/FindRE2.cmake b/cmake/FindRE2.cmake
new file mode 100644
index 00000000000..af1ff799bd7
--- /dev/null
+++ b/cmake/FindRE2.cmake
@@ -0,0 +1,19 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+# There is no bundled FindRE2, so we supply our own minimal version to find
+# the system RE2 library and header files.
+
+find_path(RE2_INCLUDE_DIR
+ NAMES re2/re2.h
+)
+
+find_library(RE2_LIBRARIES
+ NAMES re2
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(RE2
+ FOUND_VAR RE2_FOUND
+ REQUIRED_VARS RE2_LIBRARIES RE2_INCLUDE_DIR
+)
+
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp
index 6d446f6f1d7..110153954af 100644
--- a/document/src/tests/documentselectparsertest.cpp
+++ b/document/src/tests/documentselectparsertest.cpp
@@ -576,6 +576,21 @@ TEST_F(DocumentSelectParserTest, regex_matching_does_not_bind_anchors_to_newline
PARSE("\"a\\nb\\nc\" = \"b\"", *_doc[0], False);
}
+// With a recursive backtracking regex implementation like that found in (at the time of
+// writing) GCC's std::regex implementation, certain expressions on a sufficiently large
+// input will cause a stack overflow and send the whole thing spiraling into a flaming
+// vortex of doom. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86164 for context.
+//
+// Since crashing the process based on user input is considered bad karma for all the
+// obvious reasons, test that the underlying regex engine is not susceptible to such
+// crashes.
+TEST_F(DocumentSelectParserTest, regex_matching_is_not_susceptible_to_catastrophic_backtracking) {
+ std::string long_string(1024*50, 'A'); // -> hstringval field
+ auto doc = createDoc("testdoctype1", "id:foo:testdoctype1::bar", 24, 0.0, long_string, "bar", 0);
+ // This _will_ crash std::regex on GCC 8.3. Don't try this at home. Unless you want to.
+ PARSE(R"(testdoctype1.hstringval =~ ".*")", *doc, True);
+}
+
TEST_F(DocumentSelectParserTest, operators_1)
{
createDocs();
diff --git a/document/src/vespa/document/select/operator.cpp b/document/src/vespa/document/select/operator.cpp
index ef2ee26bdbd..f5cc681c906 100644
--- a/document/src/vespa/document/select/operator.cpp
+++ b/document/src/vespa/document/select/operator.cpp
@@ -1,9 +1,9 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "operator.h"
-#include <regex>
#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/vespalib/stllike/hash_map.hpp>
+#include <vespa/vespalib/regex/regex.h>
#include <cassert>
#include <ostream>
@@ -96,23 +96,25 @@ RegexOperator::trace(const Value& a, const Value& b, std::ostream& out) const
ResultList
RegexOperator::compareImpl(const Value& a, const Value& b) const
{
- const StringValue* left(dynamic_cast<const StringValue*>(&a));
- const StringValue* right(dynamic_cast<const StringValue*>(&b));
- if (left == 0 || right == 0) return ResultList(Result::Invalid);
+ const auto* left(dynamic_cast<const StringValue*>(&a));
+ const auto* right(dynamic_cast<const StringValue*>(&b));
+ if (left == nullptr || right == nullptr) {
+ return ResultList(Result::Invalid);
+ }
return match(left->getValue(), right->getValue());
}
ResultList
RegexOperator::traceImpl(const Value& a, const Value& b, std::ostream& out) const
{
- const StringValue* left(dynamic_cast<const StringValue*>(&a));
- const StringValue* right(dynamic_cast<const StringValue*>(&b));
- if (left == 0) {
+ const auto* left(dynamic_cast<const StringValue*>(&a));
+ const auto* right(dynamic_cast<const StringValue*>(&b));
+ if (left == nullptr) {
out << "Operator(" << getName() << ") - Left value not a string. "
<< "Returning invalid.\n";
return ResultList(Result::Invalid);
}
- if (right == 0) {
+ if (right == nullptr) {
out << "Operator(" << getName() << ") - Right value not a string. "
<< "Returning invalid.\n";
return ResultList(Result::Invalid);
@@ -126,14 +128,12 @@ RegexOperator::traceImpl(const Value& a, const Value& b, std::ostream& out) cons
ResultList
RegexOperator::match(const vespalib::string& val, vespalib::stringref expr) const
{
- // Should we catch this in parsing?
- if (expr.size() == 0) return ResultList(Result::True);
- try {
- std::regex expression(expr.data(), expr.size());
- return ResultList(Result::get(std::regex_search(val.c_str(), val.c_str() + val.size(), expression)));
- } catch (std::regex_error &) {
- return ResultList(Result::False);
+ if (expr.empty()) {
+ return ResultList(Result::True); // Should we catch this in parsing?
}
+ return ResultList(Result::get(
+ vespalib::Regex::partial_match(std::string_view(val.data(), val.size()),
+ std::string_view(expr.data(), expr.size()))));
}
const RegexOperator RegexOperator::REGEX("=~");
@@ -158,13 +158,15 @@ GlobOperator::trace(const Value& a, const Value& b, std::ostream& out) const
ResultList
GlobOperator::compareImpl(const Value& a, const Value& b) const
{
- const StringValue* right(dynamic_cast<const StringValue*>(&b));
- // Fall back to operator== if it isn't string matching
- if (right == 0) {
+ const auto* right(dynamic_cast<const StringValue*>(&b));
+ // Fall back to operator== if it isn't string matching
+ if (right == nullptr) {
return FunctionOperator::EQ.compare(a, b);
}
- const StringValue* left(dynamic_cast<const StringValue*>(&a));
- if (left == 0) return ResultList(Result::Invalid);
+ const auto* left(dynamic_cast<const StringValue*>(&a));
+ if (left == nullptr) {
+ return ResultList(Result::Invalid);
+ }
vespalib::string regex(convertToRegex(right->getValue()));
return match(left->getValue(), regex);
}
@@ -172,15 +174,15 @@ GlobOperator::compareImpl(const Value& a, const Value& b) const
ResultList
GlobOperator::traceImpl(const Value& a, const Value& b, std::ostream& ost) const
{
- const StringValue* right(dynamic_cast<const StringValue*>(&b));
- // Fall back to operator== if it isn't string matching
- if (right == 0) {
+ const auto* right(dynamic_cast<const StringValue*>(&b));
+ // Fall back to operator== if it isn't string matching
+ if (right == nullptr) {
ost << "Operator(" << getName() << ") - Right val not a string, "
<< "falling back to == behavior.\n";
return FunctionOperator::EQ.trace(a, b, ost);
}
- const StringValue* left(dynamic_cast<const StringValue*>(&a));
- if (left == 0) {
+ const auto* left(dynamic_cast<const StringValue*>(&a));
+ if (left == nullptr) {
ost << "Operator(" << getName() << ") - Left value is not a string, "
<< "returning invalid.\n";
return ResultList(Result::Invalid);
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
index 9af05059bef..14d33914d05 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
@@ -519,12 +519,12 @@ public:
void visit(StringTerm & n) override { visitTerm(n, true); }
void visit(SubstringTerm & n) override {
- query::SimpleRegExpTerm re(vespalib::Regexp::make_from_substring(n.getTerm()),
+ query::SimpleRegExpTerm re(vespalib::RegexpUtil::make_from_substring(n.getTerm()),
n.getView(), n.getId(), n.getWeight());
visitTerm(re);
}
void visit(SuffixTerm & n) override {
- query::SimpleRegExpTerm re(vespalib::Regexp::make_from_suffix(n.getTerm()),
+ query::SimpleRegExpTerm re(vespalib::RegexpUtil::make_from_suffix(n.getTerm()),
n.getView(), n.getId(), n.getWeight());
visitTerm(re);
}
diff --git a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
index 6cefc03dd70..eafa5bf0e1f 100644
--- a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
@@ -123,7 +123,7 @@ StringTemplSearchContext(QueryTermSimpleUP qTerm, const AttrType & toBeSearched)
auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm(), true);
lookupRange(comp, comp);
} else if (this->isRegex()) {
- vespalib::string prefix(vespalib::Regexp::get_prefix(this->queryTerm()->getTerm()));
+ vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = enumStore.make_folded_comparator(prefix.c_str(), true);
lookupRange(comp, comp);
} else {
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
index 43d8b7ce9d2..e94de44e45b 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
@@ -183,14 +183,14 @@ private:
using PostingList = typename AggregationTraits::PostingList;
using Parent = PostingSearchContext<BaseSC, PostingListFoldedSearchContextT<DataT>, AttrT>;
using FoldedComparatorType = typename Parent::EnumStore::FoldedComparatorType;
- using Regexp = vespalib::Regexp;
+ using RegexpUtil = vespalib::RegexpUtil;
using QueryTermSimpleUP = typename Parent::QueryTermSimpleUP;
using Parent::_toBeSearched;
using Parent::_enumStore;
using Parent::isRegex;
using Parent::getRegex;
bool useThis(const PostingListSearchContext::DictionaryConstIterator & it) const override {
- return isRegex() ? (getRegex() ? std::regex_search(_enumStore.get_value(it.getKey()), *getRegex()) : false ) : true;
+ return isRegex() ? (getRegex() ? getRegex()->partial_match(_enumStore.get_value(it.getKey())) : false ) : true;
}
public:
StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const AttrT &toBeSearched);
@@ -288,7 +288,7 @@ StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const Att
auto comp = _enumStore.make_folded_comparator(this->queryTerm()->getTerm(), true);
this->lookupRange(comp, comp);
} else if (this->isRegex()) {
- vespalib::string prefix(Regexp::get_prefix(this->queryTerm()->getTerm()));
+ vespalib::string prefix(RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = _enumStore.make_folded_comparator(prefix.c_str(), true);
this->lookupRange(comp, comp);
} else {
diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
index 214da6bf230..406cbbbe447 100644
--- a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
@@ -59,7 +59,7 @@ SingleValueStringAttributeT<B>::StringTemplSearchContext::StringTemplSearchConte
auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm(), true);
lookupRange(comp, comp);
} else if (this->isRegex()) {
- vespalib::string prefix(vespalib::Regexp::get_prefix(this->queryTerm()->getTerm()));
+ vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = enumStore.make_folded_comparator(prefix.c_str(), true);
lookupRange(comp, comp);
} else {
diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
index d7523c86e29..32b5b3ca373 100644
--- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
@@ -231,10 +231,7 @@ StringAttribute::StringSearchContext::StringSearchContext(QueryTermSimple::UP qT
_regex()
{
if (isRegex()) {
- try {
- _regex = std::regex(_queryTerm->getTerm(), std::regex::icase);
- } catch (std::regex_error &) {
- }
+ _regex = vespalib::Regex::from_pattern(_queryTerm->getTerm(), vespalib::Regex::Options::IgnoreCase);
}
}
diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h
index cf0a92253de..3518544cbdc 100644
--- a/searchlib/src/vespa/searchlib/attribute/stringbase.h
+++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h
@@ -9,10 +9,10 @@
#include <vespa/searchlib/attribute/i_enum_store.h>
#include <vespa/searchlib/attribute/loadedenumvalue.h>
#include <vespa/searchlib/util/foldedstringcompare.h>
+#include <vespa/vespalib/regex/regex.h>
#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
#include <optional>
-#include <regex>
namespace search {
@@ -103,7 +103,7 @@ protected:
const QueryTermUCS4 * queryTerm() const override;
bool isMatch(const char *src) const {
if (__builtin_expect(isRegex(), false)) {
- return _regex ? std::regex_search(src, *_regex) : false;
+ return _regex ? _regex->partial_match(std::string_view(src)) : false;
}
vespalib::Utf8ReaderForZTS u8reader(src);
uint32_t j = 0;
@@ -162,7 +162,7 @@ protected:
bool isRegex() const { return _isRegex; }
QueryTermSimpleUP _queryTerm;
std::vector<ucs4_t> _termUCS4;
- const std::optional<std::regex>& getRegex() const { return _regex; }
+ const std::optional<vespalib::Regex>& getRegex() const { return _regex; }
private:
WeightedConstChar * getBuffer() const {
if (_buffer == nullptr) {
@@ -170,9 +170,9 @@ protected:
}
return _buffer;
}
- unsigned _bufferLen;
- mutable WeightedConstChar * _buffer;
- std::optional<std::regex> _regex;
+ unsigned _bufferLen;
+ mutable WeightedConstChar * _buffer;
+ std::optional<vespalib::Regex> _regex;
};
private:
SearchContext::UP getSearch(QueryTermSimpleUP term, const attribute::SearchContextParams & params) const override;
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index 979184acbae..3530fb816df 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -153,6 +153,7 @@ vespa_define_module(
src/vespa/vespalib/net/tls/impl
src/vespa/vespalib/objects
src/vespa/vespalib/portal
+ src/vespa/vespalib/regex
src/vespa/vespalib/stllike
src/vespa/vespalib/test
src/vespa/vespalib/testkit
diff --git a/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp b/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp
index ad45c217701..a9e823bf3ab 100644
--- a/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp
+++ b/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp
@@ -1,11 +1,8 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vespalib/io/fileutil.h>
#include <vespa/vespalib/net/tls/transport_security_options.h>
-#include <vespa/vespalib/net/tls/transport_security_options_reading.h>
#include <vespa/vespalib/net/tls/policy_checking_certificate_verifier.h>
#include <vespa/vespalib/test/peer_policy_utils.h>
#include <vespa/vespalib/testkit/test_kit.h>
-#include <vespa/vespalib/util/exceptions.h>
using namespace vespalib;
using namespace vespalib::net::tls;
diff --git a/vespalib/src/tests/regex/regex.cpp b/vespalib/src/tests/regex/regex.cpp
index d1b94daa7ba..7dc5a7f4aa9 100644
--- a/vespalib/src/tests/regex/regex.cpp
+++ b/vespalib/src/tests/regex/regex.cpp
@@ -1,70 +1,147 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/vespalib/testkit/test_kit.h>
-
+#include <vespa/vespalib/regex/regex.h>
#include <vespa/vespalib/util/regexp.h>
-#include <vespa/vespalib/util/exception.h>
-#include <regex>
+#include <string>
using namespace vespalib;
TEST("require that prefix detection works") {
- EXPECT_EQUAL("", Regexp::get_prefix(""));
- EXPECT_EQUAL("", Regexp::get_prefix("foo"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo"));
- EXPECT_EQUAL("", Regexp::get_prefix("^foo|bar"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo$"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo[a-z]"));
- EXPECT_EQUAL("fo", Regexp::get_prefix("^foo{0,1}"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo."));
- EXPECT_EQUAL("fo", Regexp::get_prefix("^foo*"));
- EXPECT_EQUAL("fo", Regexp::get_prefix("^foo?"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo+"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix(""));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("foo"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo|bar"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo$"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo[a-z]"));
+ EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo{0,1}"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo."));
+ EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo*"));
+ EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo?"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo+"));
}
TEST("require that prefix detection sometimes underestimates the prefix size") {
- EXPECT_EQUAL("", Regexp::get_prefix("^^foo"));
- EXPECT_EQUAL("", Regexp::get_prefix("^foo(bar|baz)"));
- EXPECT_EQUAL("fo", Regexp::get_prefix("^foo{1,2}"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo\\."));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo(bar)"));
- EXPECT_EQUAL("", Regexp::get_prefix("(^foo)"));
- EXPECT_EQUAL("", Regexp::get_prefix("^(foo)"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo[a]"));
- EXPECT_EQUAL("", Regexp::get_prefix("^foo|^foobar"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^^foo"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo(bar|baz)"));
+ EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo{1,2}"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo\\."));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo(bar)"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("(^foo)"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^(foo)"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo[a]"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo|^foobar"));
}
-const vespalib::string special("^|()[]{}.*?+\\$");
+const std::string special("^|()[]{}.*?+\\$");
struct ExprFixture {
- std::vector<vespalib::string> expressions;
+ std::vector<std::string> expressions;
ExprFixture() {
expressions.push_back(special);
for (char c: special) {
- expressions.push_back(vespalib::string(&c, 1));
+ expressions.emplace_back(std::string(&c, 1));
}
- expressions.push_back("abc");
- expressions.push_back("[:digit:]");
+ expressions.emplace_back("abc");
+ expressions.emplace_back("[:digit:]");
}
};
TEST_F("require that regexp can be made from suffix string", ExprFixture()) {
- for (vespalib::string str: f1.expressions) {
- std::regex re(std::string(Regexp::make_from_suffix(str)));
- EXPECT_TRUE(std::regex_search(std::string(str), re));
- EXPECT_FALSE(std::regex_search(std::string(str + "foo"), re));
- EXPECT_TRUE(std::regex_search(std::string("foo" + str), re));
- EXPECT_FALSE(std::regex_search(std::string("foo" + str + "bar"), re));
+ for (const auto& str: f1.expressions) {
+ auto re = Regex::from_pattern(std::string(RegexpUtil::make_from_suffix(str)));
+ ASSERT_TRUE(re.parsed_ok());
+
+ EXPECT_TRUE(re.partial_match(str));
+ EXPECT_FALSE(re.partial_match(str + "foo"));
+ EXPECT_TRUE(re.partial_match("foo" + str));
+ EXPECT_FALSE(re.partial_match("foo" + str + "bar"));
}
}
TEST_F("require that regexp can be made from substring string", ExprFixture()) {
- for (vespalib::string str: f1.expressions) {
- std::regex re(std::string(Regexp::make_from_substring(str)));
- EXPECT_TRUE(std::regex_search(std::string(str), re));
- EXPECT_TRUE(std::regex_search(std::string(str + "foo"), re));
- EXPECT_TRUE(std::regex_search(std::string("foo" + str), re));
- EXPECT_TRUE(std::regex_search(std::string("foo" + str + "bar"), re));
+ for (const auto& str: f1.expressions) {
+ auto re = Regex::from_pattern(std::string(RegexpUtil::make_from_substring(str)));
+ ASSERT_TRUE(re.parsed_ok());
+
+ EXPECT_TRUE(re.partial_match(str));
+ EXPECT_TRUE(re.partial_match(str + "foo"));
+ EXPECT_TRUE(re.partial_match("foo" + str));
+ EXPECT_TRUE(re.partial_match("foo" + str + "bar"));
}
}
+TEST("full_match requires expression to match entire input string") {
+ std::string pattern = "[Aa][Bb][Cc]";
+ auto re = Regex::from_pattern(pattern);
+ ASSERT_TRUE(re.parsed_ok());
+
+ EXPECT_TRUE(re.full_match("abc"));
+ EXPECT_TRUE(re.full_match("ABC"));
+ EXPECT_FALSE(re.full_match("abcd"));
+ EXPECT_FALSE(re.full_match("aabc"));
+ EXPECT_FALSE(re.full_match("aabcc"));
+
+ EXPECT_TRUE(Regex::full_match("abc", pattern));
+ EXPECT_TRUE(Regex::full_match("ABC", pattern));
+ EXPECT_FALSE(Regex::full_match("abcd", pattern));
+ EXPECT_FALSE(Regex::full_match("aabc", pattern));
+ EXPECT_FALSE(Regex::full_match("aabcc", pattern));
+}
+
+TEST("partial_match requires expression to match substring of input string") {
+ std::string pattern = "[Aa][Bb][Cc]";
+ auto re = Regex::from_pattern(pattern);
+ ASSERT_TRUE(re.parsed_ok());
+
+ EXPECT_TRUE(re.partial_match("abc"));
+ EXPECT_TRUE(re.partial_match("ABC"));
+ EXPECT_TRUE(re.partial_match("abcd"));
+ EXPECT_TRUE(re.partial_match("aabc"));
+ EXPECT_TRUE(re.partial_match("aabcc"));
+ EXPECT_FALSE(re.partial_match("abd"));
+
+ EXPECT_TRUE(Regex::partial_match("abc", pattern));
+ EXPECT_TRUE(Regex::partial_match("ABC", pattern));
+ EXPECT_TRUE(Regex::partial_match("abcd", pattern));
+ EXPECT_TRUE(Regex::partial_match("aabc", pattern));
+ EXPECT_TRUE(Regex::partial_match("aabcc", pattern));
+ EXPECT_FALSE(Regex::partial_match("abd", pattern));
+}
+
+TEST("partial_match can be explicitly anchored") {
+ EXPECT_TRUE(Regex::partial_match("abcc", "^abc"));
+ EXPECT_FALSE(Regex::partial_match("aabc", "^abc"));
+ EXPECT_TRUE(Regex::partial_match("aabc", "abc$"));
+ EXPECT_FALSE(Regex::partial_match("abcc", "abc$"));
+ EXPECT_TRUE(Regex::partial_match("abc", "^abc$"));
+ EXPECT_FALSE(Regex::partial_match("aabc", "^abc$"));
+ EXPECT_FALSE(Regex::partial_match("abcc", "^abc$"));
+}
+
+TEST("Regex instance returns parsed_ok() == false upon parse failure") {
+ auto re = Regex::from_pattern("[a-z"); // Unterminated set
+ EXPECT_FALSE(re.parsed_ok());
+}
+
+TEST("Regex that has failed parsing immediately returns false for matches") {
+ auto re = Regex::from_pattern("[a-z");
+ EXPECT_FALSE(re.parsed_ok());
+ EXPECT_FALSE(re.partial_match("a"));
+ EXPECT_FALSE(re.full_match("b"));
+}
+
+TEST("can create case-insensitive regex matcher") {
+ auto re = Regex::from_pattern("hello", Regex::Options::IgnoreCase);
+ ASSERT_TRUE(re.parsed_ok());
+ EXPECT_TRUE(re.partial_match("HelLo world"));
+ EXPECT_TRUE(re.full_match("HELLO"));
+}
+
+TEST("regex is case sensitive by default") {
+ auto re = Regex::from_pattern("hello");
+ ASSERT_TRUE(re.parsed_ok());
+ EXPECT_FALSE(re.partial_match("HelLo world"));
+ EXPECT_FALSE(re.full_match("HELLO"));
+}
+
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt
index 95f6a407914..4a753a66394 100644
--- a/vespalib/src/vespa/vespalib/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/CMakeLists.txt
@@ -16,6 +16,7 @@ vespa_add_library(vespalib
$<TARGET_OBJECTS:vespalib_vespalib_net_tls_impl>
$<TARGET_OBJECTS:vespalib_vespalib_objects>
$<TARGET_OBJECTS:vespalib_vespalib_portal>
+ $<TARGET_OBJECTS:vespalib_vespalib_regex>
$<TARGET_OBJECTS:vespalib_vespalib_stllike>
$<TARGET_OBJECTS:vespalib_vespalib_test>
$<TARGET_OBJECTS:vespalib_vespalib_testkit>
@@ -30,3 +31,5 @@ vespa_add_library(vespalib
)
vespa_add_target_package_dependency(vespalib OpenSSL)
+vespa_add_target_package_dependency(vespalib RE2)
+
diff --git a/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp b/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp
index 8d2fb04d853..27a11b3f0f1 100644
--- a/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp
+++ b/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp
@@ -1,28 +1,34 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "peer_policies.h"
+#include <vespa/vespalib/regex/regex.h>
#include <iostream>
-#include <regex>
namespace vespalib::net::tls {
namespace {
-// Note: this is for basix regexp only, _not_ extended regexp
-bool is_basic_regex_special_char(char c) noexcept {
+bool is_regex_special_char(char c) noexcept {
switch (c) {
- case '^':
- case '$':
- case '.':
- case '[':
- case '\\':
- return true;
- default:
- return false;
+ case '^':
+ case '$':
+ case '|':
+ case '{':
+ case '}':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '\\':
+ case '+':
+ case '.':
+ return true;
+ default:
+ return false;
}
}
-std::string glob_to_basic_regex(vespalib::stringref glob) {
+std::string dot_separated_glob_to_regex(vespalib::stringref glob) {
std::string ret = "^";
ret.reserve(glob.size() + 2);
for (auto c : glob) {
@@ -34,7 +40,7 @@ std::string glob_to_basic_regex(vespalib::stringref glob) {
// Same applies for single chars; they should only match _within_ a dot boundary.
ret += "[^.]";
} else {
- if (is_basic_regex_special_char(c)) {
+ if (is_regex_special_char(c)) {
ret += '\\';
}
ret += c;
@@ -45,16 +51,16 @@ std::string glob_to_basic_regex(vespalib::stringref glob) {
}
class RegexHostMatchPattern : public HostGlobPattern {
- std::regex _pattern_as_regex;
+ Regex _pattern_as_regex;
public:
explicit RegexHostMatchPattern(vespalib::stringref glob_pattern)
- : _pattern_as_regex(glob_to_basic_regex(glob_pattern), std::regex_constants::basic)
+ : _pattern_as_regex(Regex::from_pattern(dot_separated_glob_to_regex(glob_pattern)))
{
}
~RegexHostMatchPattern() override = default;
- bool matches(vespalib::stringref str) const override {
- return std::regex_match(str.begin(), str.end(), _pattern_as_regex);
+ [[nodiscard]] bool matches(vespalib::stringref str) const override {
+ return _pattern_as_regex.full_match(std::string_view(str.data(), str.size()));
}
};
diff --git a/vespalib/src/vespa/vespalib/net/tls/peer_policies.h b/vespalib/src/vespa/vespalib/net/tls/peer_policies.h
index c558708de8f..9d34b62415f 100644
--- a/vespalib/src/vespa/vespalib/net/tls/peer_policies.h
+++ b/vespalib/src/vespa/vespalib/net/tls/peer_policies.h
@@ -10,7 +10,7 @@ namespace vespalib::net::tls {
struct HostGlobPattern {
virtual ~HostGlobPattern() = default;
- virtual bool matches(vespalib::stringref str) const = 0;
+ [[nodiscard]] virtual bool matches(vespalib::stringref str) const = 0;
static std::shared_ptr<const HostGlobPattern> create_from_glob(vespalib::stringref pattern);
};
@@ -36,7 +36,7 @@ public:
&& (_original_pattern == rhs._original_pattern));
}
- bool matches(vespalib::stringref str) const {
+ [[nodiscard]] bool matches(vespalib::stringref str) const {
return (_match_pattern && _match_pattern->matches(str));
}
@@ -89,7 +89,7 @@ public:
bool operator==(const AuthorizedPeers& rhs) const {
return (_peer_policies == rhs._peer_policies);
}
- bool allows_all_authenticated() const noexcept {
+ [[nodiscard]] bool allows_all_authenticated() const noexcept {
return _allow_all_if_empty;
}
const std::vector<PeerPolicy>& peer_policies() const noexcept { return _peer_policies; }
diff --git a/vespalib/src/vespa/vespalib/regex/CMakeLists.txt b/vespalib/src/vespa/vespalib/regex/CMakeLists.txt
new file mode 100644
index 00000000000..1034dbf6086
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/regex/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(vespalib_vespalib_regex OBJECT
+ SOURCES
+ regex.cpp
+ DEPENDS
+)
+
+find_package(RE2 REQUIRED)
+# TODO can this be PRIVATE since we don't expose it transitively?
+target_include_directories(vespalib_vespalib_regex PUBLIC ${RE2_INCLUDE_DIR})
diff --git a/vespalib/src/vespa/vespalib/regex/regex.cpp b/vespalib/src/vespa/vespalib/regex/regex.cpp
new file mode 100644
index 00000000000..81677f1b9dd
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/regex/regex.cpp
@@ -0,0 +1,88 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "regex.h"
+#include <re2/re2.h>
+#include <cassert>
+#include <cstdint>
+
+namespace vespalib {
+
+using re2::StringPiece;
+
+// All RE2 instances use a Quiet option to prevent the library from
+// complaining to stderr if pattern compilation fails.
+
+Regex::Regex(std::shared_ptr<const Impl> impl)
+ : _impl(std::move(impl))
+{}
+
+Regex::Regex(const Regex&) = default;
+Regex& Regex::operator=(const Regex&) = default;
+Regex::Regex(Regex&&) noexcept = default;
+Regex& Regex::operator=(Regex&&) noexcept = default;
+
+Regex::~Regex() = default;
+
+class Regex::Impl {
+ RE2 _regex;
+public:
+ Impl(std::string_view pattern, const re2::RE2::Options& opts)
+ : _regex(StringPiece(pattern.data(), pattern.size()), opts)
+ {}
+
+ bool parsed_ok() const noexcept {
+ return _regex.ok();
+ }
+
+ bool partial_match(std::string_view input) const noexcept {
+ assert(input.size() <= INT32_MAX);
+ if (!_regex.ok()) {
+ return false;
+ }
+ return RE2::PartialMatch(StringPiece(input.data(), input.size()), _regex);
+ }
+
+ bool full_match(std::string_view input) const noexcept {
+ assert(input.size() <= INT32_MAX);
+ if (!_regex.ok()) {
+ return false;
+ }
+ return RE2::FullMatch(StringPiece(input.data(), input.size()), _regex);
+ }
+};
+
+Regex Regex::from_pattern(std::string_view pattern, uint32_t opt_mask) {
+ assert(pattern.size() <= INT32_MAX); // StringPiece limitation
+ RE2::Options opts;
+ opts.set_log_errors(false);
+ if ((opt_mask & Options::IgnoreCase) != 0) {
+ opts.set_case_sensitive(false);
+ }
+ return Regex(std::make_shared<Impl>(pattern, opts));
+}
+
+bool Regex::parsed_ok() const noexcept {
+ return _impl->parsed_ok();
+}
+
+bool Regex::partial_match(std::string_view input) const noexcept {
+ return _impl->partial_match(input);
+}
+
+bool Regex::full_match(std::string_view input) const noexcept {
+ return _impl->full_match(input);
+}
+
+bool Regex::partial_match(std::string_view input, std::string_view pattern) noexcept {
+ assert(pattern.size() <= INT32_MAX);
+ Impl impl(pattern, RE2::Quiet);
+ return impl.partial_match(input);
+}
+
+bool Regex::full_match(std::string_view input, std::string_view pattern) noexcept {
+ assert(pattern.size() <= INT32_MAX);
+ Impl impl(pattern, RE2::Quiet);
+ return impl.full_match(input);
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/regex/regex.h b/vespalib/src/vespa/vespalib/regex/regex.h
new file mode 100644
index 00000000000..4382d057252
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/regex/regex.h
@@ -0,0 +1,69 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace vespalib {
+
+/**
+ * A simple Regex library wrapper which provides both just-in-time
+ * pattern evaluation (the static matchers) and pattern precompilation
+ * with reuse (from_pattern() plus the member matchers).
+ *
+ * Robustness and input safety:
+ * The underlying regex engine implementation must ensure that pattern
+ * parsing and input processing is safe to be run on _untrusted_ inputs.
+ * This means the underlying implementation shall provide upper bounds
+ * on both memory and CPU time and may never crash or corrupt the process.
+ *
+ * We currently use Google RE2 under the hood to achieve this.
+ *
+ * Note: due to underlying RE2 limitations, pattern and input string
+ * lengths may not be longer than INT32_MAX. The implementation checks
+ * this with assertions only.
+ *
+ * Thread safety:
+ * A Regex object is safe to be used from multiple threads.
+ *
+ * Exception safety:
+ * Exceptions shall never be thrown from the regex code itself, neither
+ * at parse time nor at match time (ancillary exceptions _could_ be thrown
+ * from memory allocation failures etc, but we assume that the caller
+ * is running vespamalloc which terminates the process instead, making
+ * the whole thing effectively noexcept).
+ *
+ * If the provided regular expression pattern is malformed, parsing
+ * fails silently; all match functions will return false immediately.
+ * Use parsed_ok() to detect this case.
+ *
+ * NOTE(review): the member matchers dereference the internal handle
+ * without a null check, so a moved-from Regex should presumably only be
+ * assigned to or destroyed - confirm.
+ */
+class Regex {
+ class Impl; // Defined in regex.cpp.
+ std::shared_ptr<const Impl> _impl; // shared_ptr to allow for cheap copying.
+
+ // Only from_pattern() creates instances; there is no public constructor from a pattern.
+ explicit Regex(std::shared_ptr<const Impl> impl);
+public:
+ // Bit flags accepted by from_pattern(); combine with bitwise OR.
+ // TODO consider using type-safe parameter instead.
+ enum Options {
+ None = 0,
+ IgnoreCase = 1 // Match without regard to letter case.
+ };
+
+ ~Regex();
+ Regex(const Regex&);
+ Regex& operator=(const Regex&);
+ Regex(Regex&&) noexcept;
+ Regex& operator=(Regex&&) noexcept;
+
+ // True iff the pattern given to from_pattern() was compiled successfully.
+ [[nodiscard]] bool parsed_ok() const noexcept;
+
+ // Match `input` against the precompiled pattern, either anywhere in the
+ // input (partial) or against the input as a whole (full). Both return
+ // false if the pattern failed to compile.
+ [[nodiscard]] bool partial_match(std::string_view input) const noexcept;
+ [[nodiscard]] bool full_match(std::string_view input) const noexcept;
+
+ // Precompiles `pattern` for repeated use. `opt_flags` is a bitwise OR of
+ // Options values. Never fails outright; check parsed_ok() on the result.
+ static Regex from_pattern(std::string_view pattern, uint32_t opt_flags = Options::None);
+
+ // Utility matchers for non-precompiled expressions. The pattern is
+ // compiled anew on every call.
+ [[nodiscard]] static bool partial_match(std::string_view input, std::string_view pattern) noexcept;
+ [[nodiscard]] static bool full_match(std::string_view input, std::string_view pattern) noexcept;
+};
+
+}
+
diff --git a/vespalib/src/vespa/vespalib/util/regexp.cpp b/vespalib/src/vespa/vespalib/util/regexp.cpp
index b3cad06382e..0d0c7b69b12 100644
--- a/vespalib/src/vespa/vespalib/util/regexp.cpp
+++ b/vespalib/src/vespa/vespalib/util/regexp.cpp
@@ -41,7 +41,7 @@ vespalib::string escape(vespalib::stringref str) {
} // namespace vespalib::<unnamed>
vespalib::string
-Regexp::get_prefix(vespalib::stringref re)
+RegexpUtil::get_prefix(vespalib::stringref re)
{
vespalib::string prefix;
if ((re.size() > 0) && (re.data()[0] == '^') && !has_option(re)) {
@@ -58,13 +58,13 @@ Regexp::get_prefix(vespalib::stringref re)
}
+// Builds a regular expression string from a literal suffix: the result of
+// escape(suffix) followed by a '$' end anchor.
+// NOTE(review): presumably escape() neutralises regex metacharacters - confirm.
vespalib::string
-Regexp::make_from_suffix(vespalib::stringref suffix)
+RegexpUtil::make_from_suffix(vespalib::stringref suffix)
{
return escape(suffix) + "$";
}
+// Builds a regular expression string from a literal substring: returns
+// escape(substring) with no anchors added.
+// NOTE(review): presumably escape() neutralises regex metacharacters - confirm.
vespalib::string
-Regexp::make_from_substring(vespalib::stringref substring)
+RegexpUtil::make_from_substring(vespalib::stringref substring)
{
return escape(substring);
}
diff --git a/vespalib/src/vespa/vespalib/util/regexp.h b/vespalib/src/vespa/vespalib/util/regexp.h
index 9897b488aff..74a69fee361 100644
--- a/vespalib/src/vespa/vespalib/util/regexp.h
+++ b/vespalib/src/vespa/vespalib/util/regexp.h
@@ -8,7 +8,7 @@ namespace vespalib {
/**
* Utility class inspecting and generating regular expression strings.
**/
-class Regexp
+class RegexpUtil
{
public:
/**