aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt6
-rw-r--r--cmake/FindRE2.cmake19
-rw-r--r--document/src/tests/documentselectparsertest.cpp15
-rw-r--r--document/src/vespa/document/select/operator.cpp52
-rw-r--r--searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp4
-rw-r--r--searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h6
-rw-r--r--searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp2
-rw-r--r--searchlib/src/vespa/searchlib/attribute/stringbase.cpp5
-rw-r--r--searchlib/src/vespa/searchlib/attribute/stringbase.h12
-rw-r--r--vespalib/CMakeLists.txt1
-rw-r--r--vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp3
-rw-r--r--vespalib/src/tests/regex/regex.cpp157
-rw-r--r--vespalib/src/vespa/vespalib/CMakeLists.txt3
-rw-r--r--vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp40
-rw-r--r--vespalib/src/vespa/vespalib/net/tls/peer_policies.h6
-rw-r--r--vespalib/src/vespa/vespalib/regex/CMakeLists.txt10
-rw-r--r--vespalib/src/vespa/vespalib/regex/regex.cpp88
-rw-r--r--vespalib/src/vespa/vespalib/regex/regex.h69
-rw-r--r--vespalib/src/vespa/vespalib/util/regexp.cpp6
-rw-r--r--vespalib/src/vespa/vespalib/util/regexp.h2
21 files changed, 398 insertions, 110 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8aa853e8c39..906a00ad843 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,11 @@
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
include(functions.cmake)
-list(APPEND CMAKE_MODULE_PATH "$ENV{HOME}/share/cmake/Modules" "/opt/vespa-deps/share/cmake/Modules")
+list(APPEND CMAKE_MODULE_PATH
+ "$ENV{HOME}/share/cmake/Modules"
+ "/opt/vespa-deps/share/cmake/Modules"
+ "${CMAKE_CURRENT_SOURCE_DIR}/cmake"
+)
include(default_build_settings.cmake)
vespa_detect_build_platform()
message("-- Vespa build platform is ${VESPA_OS_DISTRO} ${VESPA_OS_DISTRO_VERSION}")
diff --git a/cmake/FindRE2.cmake b/cmake/FindRE2.cmake
new file mode 100644
index 00000000000..af1ff799bd7
--- /dev/null
+++ b/cmake/FindRE2.cmake
@@ -0,0 +1,19 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+# There is no bundled FindRE2, so we supply our own minimal version to find
+# the system RE2 library and header files.
+
+find_path(RE2_INCLUDE_DIR
+ NAMES re2/re2.h
+)
+
+find_library(RE2_LIBRARIES
+ NAMES re2
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(RE2
+ FOUND_VAR RE2_FOUND
+ REQUIRED_VARS RE2_LIBRARIES RE2_INCLUDE_DIR
+)
+
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp
index 6d446f6f1d7..110153954af 100644
--- a/document/src/tests/documentselectparsertest.cpp
+++ b/document/src/tests/documentselectparsertest.cpp
@@ -576,6 +576,21 @@ TEST_F(DocumentSelectParserTest, regex_matching_does_not_bind_anchors_to_newline
PARSE("\"a\\nb\\nc\" = \"b\"", *_doc[0], False);
}
+// With a recursive backtracking regex implementation like that found in (at the time of
+// writing) GCC's std::regex implementation, certain expressions on a sufficiently large
+// input will cause a stack overflow and send the whole thing spiraling into a flaming
+// vortex of doom. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86164 for context.
+//
+// Since crashing the process based on user input is considered bad karma for all the
+// obvious reasons, test that the underlying regex engine is not susceptible to such
+// crashes.
+TEST_F(DocumentSelectParserTest, regex_matching_is_not_susceptible_to_catastrophic_backtracking) {
+ std::string long_string(1024*50, 'A'); // -> hstringval field
+ auto doc = createDoc("testdoctype1", "id:foo:testdoctype1::bar", 24, 0.0, long_string, "bar", 0);
+ // This _will_ crash std::regex on GCC 8.3. Don't try this at home. Unless you want to.
+ PARSE(R"(testdoctype1.hstringval =~ ".*")", *doc, True);
+}
+
TEST_F(DocumentSelectParserTest, operators_1)
{
createDocs();
diff --git a/document/src/vespa/document/select/operator.cpp b/document/src/vespa/document/select/operator.cpp
index ef2ee26bdbd..f5cc681c906 100644
--- a/document/src/vespa/document/select/operator.cpp
+++ b/document/src/vespa/document/select/operator.cpp
@@ -1,9 +1,9 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "operator.h"
-#include <regex>
#include <vespa/vespalib/stllike/asciistream.h>
#include <vespa/vespalib/stllike/hash_map.hpp>
+#include <vespa/vespalib/regex/regex.h>
#include <cassert>
#include <ostream>
@@ -96,23 +96,25 @@ RegexOperator::trace(const Value& a, const Value& b, std::ostream& out) const
ResultList
RegexOperator::compareImpl(const Value& a, const Value& b) const
{
- const StringValue* left(dynamic_cast<const StringValue*>(&a));
- const StringValue* right(dynamic_cast<const StringValue*>(&b));
- if (left == 0 || right == 0) return ResultList(Result::Invalid);
+ const auto* left(dynamic_cast<const StringValue*>(&a));
+ const auto* right(dynamic_cast<const StringValue*>(&b));
+ if (left == nullptr || right == nullptr) {
+ return ResultList(Result::Invalid);
+ }
return match(left->getValue(), right->getValue());
}
ResultList
RegexOperator::traceImpl(const Value& a, const Value& b, std::ostream& out) const
{
- const StringValue* left(dynamic_cast<const StringValue*>(&a));
- const StringValue* right(dynamic_cast<const StringValue*>(&b));
- if (left == 0) {
+ const auto* left(dynamic_cast<const StringValue*>(&a));
+ const auto* right(dynamic_cast<const StringValue*>(&b));
+ if (left == nullptr) {
out << "Operator(" << getName() << ") - Left value not a string. "
<< "Returning invalid.\n";
return ResultList(Result::Invalid);
}
- if (right == 0) {
+ if (right == nullptr) {
out << "Operator(" << getName() << ") - Right value not a string. "
<< "Returning invalid.\n";
return ResultList(Result::Invalid);
@@ -126,14 +128,12 @@ RegexOperator::traceImpl(const Value& a, const Value& b, std::ostream& out) cons
ResultList
RegexOperator::match(const vespalib::string& val, vespalib::stringref expr) const
{
- // Should we catch this in parsing?
- if (expr.size() == 0) return ResultList(Result::True);
- try {
- std::regex expression(expr.data(), expr.size());
- return ResultList(Result::get(std::regex_search(val.c_str(), val.c_str() + val.size(), expression)));
- } catch (std::regex_error &) {
- return ResultList(Result::False);
+ if (expr.empty()) {
+ return ResultList(Result::True); // Should we catch this in parsing?
}
+ return ResultList(Result::get(
+ vespalib::Regex::partial_match(std::string_view(val.data(), val.size()),
+ std::string_view(expr.data(), expr.size()))));
}
const RegexOperator RegexOperator::REGEX("=~");
@@ -158,13 +158,15 @@ GlobOperator::trace(const Value& a, const Value& b, std::ostream& out) const
ResultList
GlobOperator::compareImpl(const Value& a, const Value& b) const
{
- const StringValue* right(dynamic_cast<const StringValue*>(&b));
- // Fall back to operator== if it isn't string matching
- if (right == 0) {
+ const auto* right(dynamic_cast<const StringValue*>(&b));
+ // Fall back to operator== if it isn't string matching
+ if (right == nullptr) {
return FunctionOperator::EQ.compare(a, b);
}
- const StringValue* left(dynamic_cast<const StringValue*>(&a));
- if (left == 0) return ResultList(Result::Invalid);
+ const auto* left(dynamic_cast<const StringValue*>(&a));
+ if (left == nullptr) {
+ return ResultList(Result::Invalid);
+ }
vespalib::string regex(convertToRegex(right->getValue()));
return match(left->getValue(), regex);
}
@@ -172,15 +174,15 @@ GlobOperator::compareImpl(const Value& a, const Value& b) const
ResultList
GlobOperator::traceImpl(const Value& a, const Value& b, std::ostream& ost) const
{
- const StringValue* right(dynamic_cast<const StringValue*>(&b));
- // Fall back to operator== if it isn't string matching
- if (right == 0) {
+ const auto* right(dynamic_cast<const StringValue*>(&b));
+ // Fall back to operator== if it isn't string matching
+ if (right == nullptr) {
ost << "Operator(" << getName() << ") - Right val not a string, "
<< "falling back to == behavior.\n";
return FunctionOperator::EQ.trace(a, b, ost);
}
- const StringValue* left(dynamic_cast<const StringValue*>(&a));
- if (left == 0) {
+ const auto* left(dynamic_cast<const StringValue*>(&a));
+ if (left == nullptr) {
ost << "Operator(" << getName() << ") - Left value is not a string, "
<< "returning invalid.\n";
return ResultList(Result::Invalid);
diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
index 9af05059bef..14d33914d05 100644
--- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp
@@ -519,12 +519,12 @@ public:
void visit(StringTerm & n) override { visitTerm(n, true); }
void visit(SubstringTerm & n) override {
- query::SimpleRegExpTerm re(vespalib::Regexp::make_from_substring(n.getTerm()),
+ query::SimpleRegExpTerm re(vespalib::RegexpUtil::make_from_substring(n.getTerm()),
n.getView(), n.getId(), n.getWeight());
visitTerm(re);
}
void visit(SuffixTerm & n) override {
- query::SimpleRegExpTerm re(vespalib::Regexp::make_from_suffix(n.getTerm()),
+ query::SimpleRegExpTerm re(vespalib::RegexpUtil::make_from_suffix(n.getTerm()),
n.getView(), n.getId(), n.getWeight());
visitTerm(re);
}
diff --git a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
index 6cefc03dd70..eafa5bf0e1f 100644
--- a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp
@@ -123,7 +123,7 @@ StringTemplSearchContext(QueryTermSimpleUP qTerm, const AttrType & toBeSearched)
auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm(), true);
lookupRange(comp, comp);
} else if (this->isRegex()) {
- vespalib::string prefix(vespalib::Regexp::get_prefix(this->queryTerm()->getTerm()));
+ vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = enumStore.make_folded_comparator(prefix.c_str(), true);
lookupRange(comp, comp);
} else {
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
index 43d8b7ce9d2..e94de44e45b 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
@@ -183,14 +183,14 @@ private:
using PostingList = typename AggregationTraits::PostingList;
using Parent = PostingSearchContext<BaseSC, PostingListFoldedSearchContextT<DataT>, AttrT>;
using FoldedComparatorType = typename Parent::EnumStore::FoldedComparatorType;
- using Regexp = vespalib::Regexp;
+ using RegexpUtil = vespalib::RegexpUtil;
using QueryTermSimpleUP = typename Parent::QueryTermSimpleUP;
using Parent::_toBeSearched;
using Parent::_enumStore;
using Parent::isRegex;
using Parent::getRegex;
bool useThis(const PostingListSearchContext::DictionaryConstIterator & it) const override {
- return isRegex() ? (getRegex() ? std::regex_search(_enumStore.get_value(it.getKey()), *getRegex()) : false ) : true;
+ return isRegex() ? (getRegex() ? getRegex()->partial_match(_enumStore.get_value(it.getKey())) : false ) : true;
}
public:
StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const AttrT &toBeSearched);
@@ -288,7 +288,7 @@ StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const Att
auto comp = _enumStore.make_folded_comparator(this->queryTerm()->getTerm(), true);
this->lookupRange(comp, comp);
} else if (this->isRegex()) {
- vespalib::string prefix(Regexp::get_prefix(this->queryTerm()->getTerm()));
+ vespalib::string prefix(RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = _enumStore.make_folded_comparator(prefix.c_str(), true);
this->lookupRange(comp, comp);
} else {
diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
index 214da6bf230..406cbbbe447 100644
--- a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
+++ b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp
@@ -59,7 +59,7 @@ SingleValueStringAttributeT<B>::StringTemplSearchContext::StringTemplSearchConte
auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm(), true);
lookupRange(comp, comp);
} else if (this->isRegex()) {
- vespalib::string prefix(vespalib::Regexp::get_prefix(this->queryTerm()->getTerm()));
+ vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm()));
auto comp = enumStore.make_folded_comparator(prefix.c_str(), true);
lookupRange(comp, comp);
} else {
diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
index d7523c86e29..32b5b3ca373 100644
--- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
+++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp
@@ -231,10 +231,7 @@ StringAttribute::StringSearchContext::StringSearchContext(QueryTermSimple::UP qT
_regex()
{
if (isRegex()) {
- try {
- _regex = std::regex(_queryTerm->getTerm(), std::regex::icase);
- } catch (std::regex_error &) {
- }
+ _regex = vespalib::Regex::from_pattern(_queryTerm->getTerm(), vespalib::Regex::Options::IgnoreCase);
}
}
diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h
index cf0a92253de..3518544cbdc 100644
--- a/searchlib/src/vespa/searchlib/attribute/stringbase.h
+++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h
@@ -9,10 +9,10 @@
#include <vespa/searchlib/attribute/i_enum_store.h>
#include <vespa/searchlib/attribute/loadedenumvalue.h>
#include <vespa/searchlib/util/foldedstringcompare.h>
+#include <vespa/vespalib/regex/regex.h>
#include <vespa/vespalib/text/lowercase.h>
#include <vespa/vespalib/text/utf8.h>
#include <optional>
-#include <regex>
namespace search {
@@ -103,7 +103,7 @@ protected:
const QueryTermUCS4 * queryTerm() const override;
bool isMatch(const char *src) const {
if (__builtin_expect(isRegex(), false)) {
- return _regex ? std::regex_search(src, *_regex) : false;
+ return _regex ? _regex->partial_match(std::string_view(src)) : false;
}
vespalib::Utf8ReaderForZTS u8reader(src);
uint32_t j = 0;
@@ -162,7 +162,7 @@ protected:
bool isRegex() const { return _isRegex; }
QueryTermSimpleUP _queryTerm;
std::vector<ucs4_t> _termUCS4;
- const std::optional<std::regex>& getRegex() const { return _regex; }
+ const std::optional<vespalib::Regex>& getRegex() const { return _regex; }
private:
WeightedConstChar * getBuffer() const {
if (_buffer == nullptr) {
@@ -170,9 +170,9 @@ protected:
}
return _buffer;
}
- unsigned _bufferLen;
- mutable WeightedConstChar * _buffer;
- std::optional<std::regex> _regex;
+ unsigned _bufferLen;
+ mutable WeightedConstChar * _buffer;
+ std::optional<vespalib::Regex> _regex;
};
private:
SearchContext::UP getSearch(QueryTermSimpleUP term, const attribute::SearchContextParams & params) const override;
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index 979184acbae..3530fb816df 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -153,6 +153,7 @@ vespa_define_module(
src/vespa/vespalib/net/tls/impl
src/vespa/vespalib/objects
src/vespa/vespalib/portal
+ src/vespa/vespalib/regex
src/vespa/vespalib/stllike
src/vespa/vespalib/test
src/vespa/vespalib/testkit
diff --git a/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp b/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp
index ad45c217701..a9e823bf3ab 100644
--- a/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp
+++ b/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp
@@ -1,11 +1,8 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vespalib/io/fileutil.h>
#include <vespa/vespalib/net/tls/transport_security_options.h>
-#include <vespa/vespalib/net/tls/transport_security_options_reading.h>
#include <vespa/vespalib/net/tls/policy_checking_certificate_verifier.h>
#include <vespa/vespalib/test/peer_policy_utils.h>
#include <vespa/vespalib/testkit/test_kit.h>
-#include <vespa/vespalib/util/exceptions.h>
using namespace vespalib;
using namespace vespalib::net::tls;
diff --git a/vespalib/src/tests/regex/regex.cpp b/vespalib/src/tests/regex/regex.cpp
index d1b94daa7ba..7dc5a7f4aa9 100644
--- a/vespalib/src/tests/regex/regex.cpp
+++ b/vespalib/src/tests/regex/regex.cpp
@@ -1,70 +1,147 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/vespalib/testkit/test_kit.h>
-
+#include <vespa/vespalib/regex/regex.h>
#include <vespa/vespalib/util/regexp.h>
-#include <vespa/vespalib/util/exception.h>
-#include <regex>
+#include <string>
using namespace vespalib;
TEST("require that prefix detection works") {
- EXPECT_EQUAL("", Regexp::get_prefix(""));
- EXPECT_EQUAL("", Regexp::get_prefix("foo"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo"));
- EXPECT_EQUAL("", Regexp::get_prefix("^foo|bar"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo$"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo[a-z]"));
- EXPECT_EQUAL("fo", Regexp::get_prefix("^foo{0,1}"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo."));
- EXPECT_EQUAL("fo", Regexp::get_prefix("^foo*"));
- EXPECT_EQUAL("fo", Regexp::get_prefix("^foo?"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo+"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix(""));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("foo"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo|bar"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo$"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo[a-z]"));
+ EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo{0,1}"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo."));
+ EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo*"));
+ EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo?"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo+"));
}
TEST("require that prefix detection sometimes underestimates the prefix size") {
- EXPECT_EQUAL("", Regexp::get_prefix("^^foo"));
- EXPECT_EQUAL("", Regexp::get_prefix("^foo(bar|baz)"));
- EXPECT_EQUAL("fo", Regexp::get_prefix("^foo{1,2}"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo\\."));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo(bar)"));
- EXPECT_EQUAL("", Regexp::get_prefix("(^foo)"));
- EXPECT_EQUAL("", Regexp::get_prefix("^(foo)"));
- EXPECT_EQUAL("foo", Regexp::get_prefix("^foo[a]"));
- EXPECT_EQUAL("", Regexp::get_prefix("^foo|^foobar"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^^foo"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo(bar|baz)"));
+ EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo{1,2}"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo\\."));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo(bar)"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("(^foo)"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^(foo)"));
+ EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo[a]"));
+ EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo|^foobar"));
}
-const vespalib::string special("^|()[]{}.*?+\\$");
+const std::string special("^|()[]{}.*?+\\$");
struct ExprFixture {
- std::vector<vespalib::string> expressions;
+ std::vector<std::string> expressions;
ExprFixture() {
expressions.push_back(special);
for (char c: special) {
- expressions.push_back(vespalib::string(&c, 1));
+ expressions.emplace_back(std::string(&c, 1));
}
- expressions.push_back("abc");
- expressions.push_back("[:digit:]");
+ expressions.emplace_back("abc");
+ expressions.emplace_back("[:digit:]");
}
};
TEST_F("require that regexp can be made from suffix string", ExprFixture()) {
- for (vespalib::string str: f1.expressions) {
- std::regex re(std::string(Regexp::make_from_suffix(str)));
- EXPECT_TRUE(std::regex_search(std::string(str), re));
- EXPECT_FALSE(std::regex_search(std::string(str + "foo"), re));
- EXPECT_TRUE(std::regex_search(std::string("foo" + str), re));
- EXPECT_FALSE(std::regex_search(std::string("foo" + str + "bar"), re));
+ for (const auto& str: f1.expressions) {
+ auto re = Regex::from_pattern(std::string(RegexpUtil::make_from_suffix(str)));
+ ASSERT_TRUE(re.parsed_ok());
+
+ EXPECT_TRUE(re.partial_match(str));
+ EXPECT_FALSE(re.partial_match(str + "foo"));
+ EXPECT_TRUE(re.partial_match("foo" + str));
+ EXPECT_FALSE(re.partial_match("foo" + str + "bar"));
}
}
TEST_F("require that regexp can be made from substring string", ExprFixture()) {
- for (vespalib::string str: f1.expressions) {
- std::regex re(std::string(Regexp::make_from_substring(str)));
- EXPECT_TRUE(std::regex_search(std::string(str), re));
- EXPECT_TRUE(std::regex_search(std::string(str + "foo"), re));
- EXPECT_TRUE(std::regex_search(std::string("foo" + str), re));
- EXPECT_TRUE(std::regex_search(std::string("foo" + str + "bar"), re));
+ for (const auto& str: f1.expressions) {
+ auto re = Regex::from_pattern(std::string(RegexpUtil::make_from_substring(str)));
+ ASSERT_TRUE(re.parsed_ok());
+
+ EXPECT_TRUE(re.partial_match(str));
+ EXPECT_TRUE(re.partial_match(str + "foo"));
+ EXPECT_TRUE(re.partial_match("foo" + str));
+ EXPECT_TRUE(re.partial_match("foo" + str + "bar"));
}
}
+TEST("full_match requires expression to match entire input string") {
+ std::string pattern = "[Aa][Bb][Cc]";
+ auto re = Regex::from_pattern(pattern);
+ ASSERT_TRUE(re.parsed_ok());
+
+ EXPECT_TRUE(re.full_match("abc"));
+ EXPECT_TRUE(re.full_match("ABC"));
+ EXPECT_FALSE(re.full_match("abcd"));
+ EXPECT_FALSE(re.full_match("aabc"));
+ EXPECT_FALSE(re.full_match("aabcc"));
+
+ EXPECT_TRUE(Regex::full_match("abc", pattern));
+ EXPECT_TRUE(Regex::full_match("ABC", pattern));
+ EXPECT_FALSE(Regex::full_match("abcd", pattern));
+ EXPECT_FALSE(Regex::full_match("aabc", pattern));
+ EXPECT_FALSE(Regex::full_match("aabcc", pattern));
+}
+
+TEST("partial_match requires expression to match substring of input string") {
+ std::string pattern = "[Aa][Bb][Cc]";
+ auto re = Regex::from_pattern(pattern);
+ ASSERT_TRUE(re.parsed_ok());
+
+ EXPECT_TRUE(re.partial_match("abc"));
+ EXPECT_TRUE(re.partial_match("ABC"));
+ EXPECT_TRUE(re.partial_match("abcd"));
+ EXPECT_TRUE(re.partial_match("aabc"));
+ EXPECT_TRUE(re.partial_match("aabcc"));
+ EXPECT_FALSE(re.partial_match("abd"));
+
+ EXPECT_TRUE(Regex::partial_match("abc", pattern));
+ EXPECT_TRUE(Regex::partial_match("ABC", pattern));
+ EXPECT_TRUE(Regex::partial_match("abcd", pattern));
+ EXPECT_TRUE(Regex::partial_match("aabc", pattern));
+ EXPECT_TRUE(Regex::partial_match("aabcc", pattern));
+ EXPECT_FALSE(Regex::partial_match("abd", pattern));
+}
+
+TEST("partial_match can be explicitly anchored") {
+ EXPECT_TRUE(Regex::partial_match("abcc", "^abc"));
+ EXPECT_FALSE(Regex::partial_match("aabc", "^abc"));
+ EXPECT_TRUE(Regex::partial_match("aabc", "abc$"));
+ EXPECT_FALSE(Regex::partial_match("abcc", "abc$"));
+ EXPECT_TRUE(Regex::partial_match("abc", "^abc$"));
+ EXPECT_FALSE(Regex::partial_match("aabc", "^abc$"));
+ EXPECT_FALSE(Regex::partial_match("abcc", "^abc$"));
+}
+
+TEST("Regex instance returns parsed_ok() == false upon parse failure") {
+ auto re = Regex::from_pattern("[a-z"); // Unterminated set
+ EXPECT_FALSE(re.parsed_ok());
+}
+
+TEST("Regex that has failed parsing immediately returns false for matches") {
+ auto re = Regex::from_pattern("[a-z");
+ EXPECT_FALSE(re.parsed_ok());
+ EXPECT_FALSE(re.partial_match("a"));
+ EXPECT_FALSE(re.full_match("b"));
+}
+
+TEST("can create case-insensitive regex matcher") {
+ auto re = Regex::from_pattern("hello", Regex::Options::IgnoreCase);
+ ASSERT_TRUE(re.parsed_ok());
+ EXPECT_TRUE(re.partial_match("HelLo world"));
+ EXPECT_TRUE(re.full_match("HELLO"));
+}
+
+TEST("regex is case sensitive by default") {
+ auto re = Regex::from_pattern("hello");
+ ASSERT_TRUE(re.parsed_ok());
+ EXPECT_FALSE(re.partial_match("HelLo world"));
+ EXPECT_FALSE(re.full_match("HELLO"));
+}
+
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt
index 95f6a407914..4a753a66394 100644
--- a/vespalib/src/vespa/vespalib/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/CMakeLists.txt
@@ -16,6 +16,7 @@ vespa_add_library(vespalib
$<TARGET_OBJECTS:vespalib_vespalib_net_tls_impl>
$<TARGET_OBJECTS:vespalib_vespalib_objects>
$<TARGET_OBJECTS:vespalib_vespalib_portal>
+ $<TARGET_OBJECTS:vespalib_vespalib_regex>
$<TARGET_OBJECTS:vespalib_vespalib_stllike>
$<TARGET_OBJECTS:vespalib_vespalib_test>
$<TARGET_OBJECTS:vespalib_vespalib_testkit>
@@ -30,3 +31,5 @@ vespa_add_library(vespalib
)
vespa_add_target_package_dependency(vespalib OpenSSL)
+vespa_add_target_package_dependency(vespalib RE2)
+
diff --git a/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp b/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp
index 8d2fb04d853..27a11b3f0f1 100644
--- a/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp
+++ b/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp
@@ -1,28 +1,34 @@
// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "peer_policies.h"
+#include <vespa/vespalib/regex/regex.h>
#include <iostream>
-#include <regex>
namespace vespalib::net::tls {
namespace {
-// Note: this is for basix regexp only, _not_ extended regexp
-bool is_basic_regex_special_char(char c) noexcept {
+bool is_regex_special_char(char c) noexcept {
switch (c) {
- case '^':
- case '$':
- case '.':
- case '[':
- case '\\':
- return true;
- default:
- return false;
+ case '^':
+ case '$':
+ case '|':
+ case '{':
+ case '}':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '\\':
+ case '+':
+ case '.':
+ return true;
+ default:
+ return false;
}
}
-std::string glob_to_basic_regex(vespalib::stringref glob) {
+std::string dot_separated_glob_to_regex(vespalib::stringref glob) {
std::string ret = "^";
ret.reserve(glob.size() + 2);
for (auto c : glob) {
@@ -34,7 +40,7 @@ std::string glob_to_basic_regex(vespalib::stringref glob) {
// Same applies for single chars; they should only match _within_ a dot boundary.
ret += "[^.]";
} else {
- if (is_basic_regex_special_char(c)) {
+ if (is_regex_special_char(c)) {
ret += '\\';
}
ret += c;
@@ -45,16 +51,16 @@ std::string glob_to_basic_regex(vespalib::stringref glob) {
}
class RegexHostMatchPattern : public HostGlobPattern {
- std::regex _pattern_as_regex;
+ Regex _pattern_as_regex;
public:
explicit RegexHostMatchPattern(vespalib::stringref glob_pattern)
- : _pattern_as_regex(glob_to_basic_regex(glob_pattern), std::regex_constants::basic)
+ : _pattern_as_regex(Regex::from_pattern(dot_separated_glob_to_regex(glob_pattern)))
{
}
~RegexHostMatchPattern() override = default;
- bool matches(vespalib::stringref str) const override {
- return std::regex_match(str.begin(), str.end(), _pattern_as_regex);
+ [[nodiscard]] bool matches(vespalib::stringref str) const override {
+ return _pattern_as_regex.full_match(std::string_view(str.data(), str.size()));
}
};
diff --git a/vespalib/src/vespa/vespalib/net/tls/peer_policies.h b/vespalib/src/vespa/vespalib/net/tls/peer_policies.h
index c558708de8f..9d34b62415f 100644
--- a/vespalib/src/vespa/vespalib/net/tls/peer_policies.h
+++ b/vespalib/src/vespa/vespalib/net/tls/peer_policies.h
@@ -10,7 +10,7 @@ namespace vespalib::net::tls {
struct HostGlobPattern {
virtual ~HostGlobPattern() = default;
- virtual bool matches(vespalib::stringref str) const = 0;
+ [[nodiscard]] virtual bool matches(vespalib::stringref str) const = 0;
static std::shared_ptr<const HostGlobPattern> create_from_glob(vespalib::stringref pattern);
};
@@ -36,7 +36,7 @@ public:
&& (_original_pattern == rhs._original_pattern));
}
- bool matches(vespalib::stringref str) const {
+ [[nodiscard]] bool matches(vespalib::stringref str) const {
return (_match_pattern && _match_pattern->matches(str));
}
@@ -89,7 +89,7 @@ public:
bool operator==(const AuthorizedPeers& rhs) const {
return (_peer_policies == rhs._peer_policies);
}
- bool allows_all_authenticated() const noexcept {
+ [[nodiscard]] bool allows_all_authenticated() const noexcept {
return _allow_all_if_empty;
}
const std::vector<PeerPolicy>& peer_policies() const noexcept { return _peer_policies; }
diff --git a/vespalib/src/vespa/vespalib/regex/CMakeLists.txt b/vespalib/src/vespa/vespalib/regex/CMakeLists.txt
new file mode 100644
index 00000000000..1034dbf6086
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/regex/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(vespalib_vespalib_regex OBJECT
+ SOURCES
+ regex.cpp
+ DEPENDS
+)
+
+find_package(RE2 REQUIRED)
+# TODO can this be PRIVATE since we don't expose it transitively?
+target_include_directories(vespalib_vespalib_regex PUBLIC ${RE2_INCLUDE_DIR})
diff --git a/vespalib/src/vespa/vespalib/regex/regex.cpp b/vespalib/src/vespa/vespalib/regex/regex.cpp
new file mode 100644
index 00000000000..81677f1b9dd
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/regex/regex.cpp
@@ -0,0 +1,88 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "regex.h"
+#include <re2/re2.h>
+#include <cassert>
+#include <cstdint>
+
+namespace vespalib {
+
+using re2::StringPiece;
+
+// All RE2 instances use a Quiet option to prevent the library from
+// complaining to stderr if pattern compilation fails.
+
+Regex::Regex(std::shared_ptr<const Impl> impl)
+ : _impl(std::move(impl))
+{}
+
+Regex::Regex(const Regex&) = default;
+Regex& Regex::operator=(const Regex&) = default;
+Regex::Regex(Regex&&) noexcept = default;
+Regex& Regex::operator=(Regex&&) noexcept = default;
+
+Regex::~Regex() = default;
+
+class Regex::Impl {
+ RE2 _regex;
+public:
+ Impl(std::string_view pattern, const re2::RE2::Options& opts)
+ : _regex(StringPiece(pattern.data(), pattern.size()), opts)
+ {}
+
+ bool parsed_ok() const noexcept {
+ return _regex.ok();
+ }
+
+ bool partial_match(std::string_view input) const noexcept {
+ assert(input.size() <= INT32_MAX);
+ if (!_regex.ok()) {
+ return false;
+ }
+ return RE2::PartialMatch(StringPiece(input.data(), input.size()), _regex);
+ }
+
+ bool full_match(std::string_view input) const noexcept {
+ assert(input.size() <= INT32_MAX);
+ if (!_regex.ok()) {
+ return false;
+ }
+ return RE2::FullMatch(StringPiece(input.data(), input.size()), _regex);
+ }
+};
+
+Regex Regex::from_pattern(std::string_view pattern, uint32_t opt_mask) {
+ assert(pattern.size() <= INT32_MAX); // StringPiece limitation
+ RE2::Options opts;
+ opts.set_log_errors(false);
+ if ((opt_mask & Options::IgnoreCase) != 0) {
+ opts.set_case_sensitive(false);
+ }
+ return Regex(std::make_shared<Impl>(pattern, opts));
+}
+
+bool Regex::parsed_ok() const noexcept {
+ return _impl->parsed_ok();
+}
+
+bool Regex::partial_match(std::string_view input) const noexcept {
+ return _impl->partial_match(input);
+}
+
+bool Regex::full_match(std::string_view input) const noexcept {
+ return _impl->full_match(input);
+}
+
+bool Regex::partial_match(std::string_view input, std::string_view pattern) noexcept {
+ assert(pattern.size() <= INT32_MAX);
+ Impl impl(pattern, RE2::Quiet);
+ return impl.partial_match(input);
+}
+
+bool Regex::full_match(std::string_view input, std::string_view pattern) noexcept {
+ assert(pattern.size() <= INT32_MAX);
+ Impl impl(pattern, RE2::Quiet);
+ return impl.full_match(input);
+}
+
+}
diff --git a/vespalib/src/vespa/vespalib/regex/regex.h b/vespalib/src/vespa/vespalib/regex/regex.h
new file mode 100644
index 00000000000..4382d057252
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/regex/regex.h
@@ -0,0 +1,69 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+namespace vespalib {
+
+/**
+ * A simple Regex library wrapper which provides for both just-in-time
+ * pattern evaluation as well as pattern precompilation and reuse.
+ *
+ * Robustness and input safety:
+ * The underlying regex engine implementation must ensure that pattern
+ * parsing and input processing is safe to be run on _untrusted_ inputs.
+ * This means the underlying implementation shall provide upper bounds
+ * on both memory and CPU time and may never crash or corrupt the process.
+ *
+ * We currently use Google RE2 under the hood to achieve this.
+ *
+ * Note: due to underlying RE2 limitations, neither patterns nor inputs
+ * may be longer than INT32_MAX bytes; this is enforced by assertion.
+ *
+ * Thread safety:
+ * A Regex object is safe to be used from multiple threads.
+ *
+ * Exception safety:
+ * Exceptions shall never be thrown from the regex code itself, neither
+ * at parse time nor at match time (ancillary exceptions _could_ be thrown
+ * from memory allocation failures etc, but we assume that the caller
+ * is running vespamalloc which terminates the process instead, making
+ * the whole thing effectively noexcept).
+ *
+ * If the provided regular expression pattern is malformed, parsing
+ * fails silently; all match functions will return false immediately.
+ */
+class Regex {
+ // Defined in regex.cpp; keeps the RE2 dependency out of this header.
+ class Impl;
+ std::shared_ptr<const Impl> _impl; // shared_ptr to allow for cheap copying.
+
+ // Only reachable through from_pattern().
+ explicit Regex(std::shared_ptr<const Impl> impl);
+public:
+ // Bit flags accepted by from_pattern(); combine with bitwise OR.
+ // TODO consider using type-safe parameter instead.
+ enum Options {
+ None = 0, // no flags set
+ IgnoreCase = 1 // from_pattern() turns off case sensitivity in RE2
+ };
+
+ // NOTE(review): the member matchers dereference _impl unconditionally, so
+ // a moved-from Regex presumably must not be queried -- confirm against the
+ // move constructor definition.
+ ~Regex();
+ Regex(const Regex&);
+ Regex& operator=(const Regex&);
+ Regex(Regex&&) noexcept;
+ Regex& operator=(Regex&&) noexcept;
+
+ // True iff the pattern passed to from_pattern() compiled successfully.
+ [[nodiscard]] bool parsed_ok() const noexcept;
+
+ // Match `input` against the precompiled pattern. Both return false
+ // whenever parsed_ok() is false.
+ [[nodiscard]] bool partial_match(std::string_view input) const noexcept;
+ [[nodiscard]] bool full_match(std::string_view input) const noexcept;
+
+ // Precompiles `pattern` for repeated use. `opt_flags` is a bitwise OR of
+ // Options values. A malformed pattern still returns a Regex; check
+ // parsed_ok() to detect the failure.
+ static Regex from_pattern(std::string_view pattern, uint32_t opt_flags = Options::None);
+
+ // Utility matchers for non-precompiled expressions.
+ // Each call compiles `pattern` anew; a malformed pattern yields false.
+ [[nodiscard]] static bool partial_match(std::string_view input, std::string_view pattern) noexcept;
+ [[nodiscard]] static bool full_match(std::string_view input, std::string_view pattern) noexcept;
+};
+
+}
+
diff --git a/vespalib/src/vespa/vespalib/util/regexp.cpp b/vespalib/src/vespa/vespalib/util/regexp.cpp
index b3cad06382e..0d0c7b69b12 100644
--- a/vespalib/src/vespa/vespalib/util/regexp.cpp
+++ b/vespalib/src/vespa/vespalib/util/regexp.cpp
@@ -41,7 +41,7 @@ vespalib::string escape(vespalib::stringref str) {
} // namespace vespalib::<unnamed>
vespalib::string
-Regexp::get_prefix(vespalib::stringref re)
+RegexpUtil::get_prefix(vespalib::stringref re)
{
vespalib::string prefix;
if ((re.size() > 0) && (re.data()[0] == '^') && !has_option(re)) {
@@ -58,13 +58,13 @@ Regexp::get_prefix(vespalib::stringref re)
}
vespalib::string
-Regexp::make_from_suffix(vespalib::stringref suffix)
+RegexpUtil::make_from_suffix(vespalib::stringref suffix)
{
return escape(suffix) + "$";
}
vespalib::string
-Regexp::make_from_substring(vespalib::stringref substring)
+RegexpUtil::make_from_substring(vespalib::stringref substring)
{
return escape(substring);
}
diff --git a/vespalib/src/vespa/vespalib/util/regexp.h b/vespalib/src/vespa/vespalib/util/regexp.h
index 9897b488aff..74a69fee361 100644
--- a/vespalib/src/vespa/vespalib/util/regexp.h
+++ b/vespalib/src/vespa/vespalib/util/regexp.h
@@ -8,7 +8,7 @@ namespace vespalib {
/**
* Utility class inspecting and generating regular expression strings.
**/
-class Regexp
+class RegexpUtil
{
public:
/**