diff options
21 files changed, 398 insertions, 110 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 8aa853e8c39..906a00ad843 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,11 @@ cmake_minimum_required(VERSION 3.5 FATAL_ERROR) include(functions.cmake) -list(APPEND CMAKE_MODULE_PATH "$ENV{HOME}/share/cmake/Modules" "/opt/vespa-deps/share/cmake/Modules") +list(APPEND CMAKE_MODULE_PATH + "$ENV{HOME}/share/cmake/Modules" + "/opt/vespa-deps/share/cmake/Modules" + "${CMAKE_CURRENT_SOURCE_DIR}/cmake" +) include(default_build_settings.cmake) vespa_detect_build_platform() message("-- Vespa build platform is ${VESPA_OS_DISTRO} ${VESPA_OS_DISTRO_VERSION}") diff --git a/cmake/FindRE2.cmake b/cmake/FindRE2.cmake new file mode 100644 index 00000000000..af1ff799bd7 --- /dev/null +++ b/cmake/FindRE2.cmake @@ -0,0 +1,19 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +# There is no bundled FindRE2, so we supply our own minimal version to find +# the system RE2 library and header files. + +find_path(RE2_INCLUDE_DIR + NAMES re2/re2.h +) + +find_library(RE2_LIBRARIES + NAMES re2 +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(RE2 + FOUND_VAR RE2_FOUND + REQUIRED_VARS RE2_LIBRARIES RE2_INCLUDE_DIR +) + diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp index 6d446f6f1d7..110153954af 100644 --- a/document/src/tests/documentselectparsertest.cpp +++ b/document/src/tests/documentselectparsertest.cpp @@ -576,6 +576,21 @@ TEST_F(DocumentSelectParserTest, regex_matching_does_not_bind_anchors_to_newline PARSE("\"a\\nb\\nc\" = \"b\"", *_doc[0], False); } +// With a recursive backtracking regex implementation like that found in (at the time of +// writing) GCC's std::regex implementation, certain expressions on a sufficiently large +// input will cause a stack overflow and send the whole thing spiraling into a flaming +// vortex of doom. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86164 for context. +// +// Since crashing the process based on user input is considered bad karma for all the +// obvious reasons, test that the underlying regex engine is not susceptible to such +// crashes. +TEST_F(DocumentSelectParserTest, regex_matching_is_not_susceptible_to_catastrophic_backtracking) { + std::string long_string(1024*50, 'A'); // -> hstringval field + auto doc = createDoc("testdoctype1", "id:foo:testdoctype1::bar", 24, 0.0, long_string, "bar", 0); + // This _will_ crash std::regex on GCC 8.3. Don't try this at home. Unless you want to. + PARSE(R"(testdoctype1.hstringval =~ ".*")", *doc, True); +} + TEST_F(DocumentSelectParserTest, operators_1) { createDocs(); diff --git a/document/src/vespa/document/select/operator.cpp b/document/src/vespa/document/select/operator.cpp index ef2ee26bdbd..f5cc681c906 100644 --- a/document/src/vespa/document/select/operator.cpp +++ b/document/src/vespa/document/select/operator.cpp @@ -1,9 +1,9 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "operator.h" -#include <regex> #include <vespa/vespalib/stllike/asciistream.h> #include <vespa/vespalib/stllike/hash_map.hpp> +#include <vespa/vespalib/regex/regex.h> #include <cassert> #include <ostream> @@ -96,23 +96,25 @@ RegexOperator::trace(const Value& a, const Value& b, std::ostream& out) const ResultList RegexOperator::compareImpl(const Value& a, const Value& b) const { - const StringValue* left(dynamic_cast<const StringValue*>(&a)); - const StringValue* right(dynamic_cast<const StringValue*>(&b)); - if (left == 0 || right == 0) return ResultList(Result::Invalid); + const auto* left(dynamic_cast<const StringValue*>(&a)); + const auto* right(dynamic_cast<const StringValue*>(&b)); + if (left == nullptr || right == nullptr) { + return ResultList(Result::Invalid); + } return match(left->getValue(), right->getValue()); } ResultList RegexOperator::traceImpl(const Value& a, const Value& b, std::ostream& out) const { - const StringValue* left(dynamic_cast<const StringValue*>(&a)); - const StringValue* right(dynamic_cast<const StringValue*>(&b)); - if (left == 0) { + const auto* left(dynamic_cast<const StringValue*>(&a)); + const auto* right(dynamic_cast<const StringValue*>(&b)); + if (left == nullptr) { out << "Operator(" << getName() << ") - Left value not a string. " << "Returning invalid.\n"; return ResultList(Result::Invalid); } - if (right == 0) { + if (right == nullptr) { out << "Operator(" << getName() << ") - Right value not a string. " << "Returning invalid.\n"; return ResultList(Result::Invalid); @@ -126,14 +128,12 @@ RegexOperator::traceImpl(const Value& a, const Value& b, std::ostream& out) cons ResultList RegexOperator::match(const vespalib::string& val, vespalib::stringref expr) const { - // Should we catch this in parsing? - if (expr.size() == 0) return ResultList(Result::True); - try { - std::regex expression(expr.data(), expr.size()); - return ResultList(Result::get(std::regex_search(val.c_str(), val.c_str() + val.size(), expression))); - } catch (std::regex_error &) { - return ResultList(Result::False); + if (expr.empty()) { + return ResultList(Result::True); // Should we catch this in parsing? } + return ResultList(Result::get( + vespalib::Regex::partial_match(std::string_view(val.data(), val.size()), + std::string_view(expr.data(), expr.size())))); } const RegexOperator RegexOperator::REGEX("=~"); @@ -158,13 +158,15 @@ GlobOperator::trace(const Value& a, const Value& b, std::ostream& out) const ResultList GlobOperator::compareImpl(const Value& a, const Value& b) const { - const StringValue* right(dynamic_cast<const StringValue*>(&b)); - // Fall back to operator== if it isn't string matching - if (right == 0) { + const auto* right(dynamic_cast<const StringValue*>(&b)); + // Fall back to operator== if it isn't string matching + if (right == nullptr) { return FunctionOperator::EQ.compare(a, b); } - const StringValue* left(dynamic_cast<const StringValue*>(&a)); - if (left == 0) return ResultList(Result::Invalid); + const auto* left(dynamic_cast<const StringValue*>(&a)); + if (left == nullptr) { + return ResultList(Result::Invalid); + } vespalib::string regex(convertToRegex(right->getValue())); return match(left->getValue(), regex); } @@ -172,15 +174,15 @@ GlobOperator::compareImpl(const Value& a, const Value& b) const ResultList GlobOperator::traceImpl(const Value& a, const Value& b, std::ostream& ost) const { - const StringValue* right(dynamic_cast<const StringValue*>(&b)); - // Fall back to operator== if it isn't string matching - if (right == 0) { + const auto* right(dynamic_cast<const StringValue*>(&b)); + // Fall back to operator== if it isn't string matching + if (right == nullptr) { ost << "Operator(" << getName() << ") - Right val not a string, " << "falling back to == behavior.\n"; return FunctionOperator::EQ.trace(a, b, ost); } - const StringValue* left(dynamic_cast<const StringValue*>(&a)); - if (left == 0) { + const auto* left(dynamic_cast<const StringValue*>(&a)); + if (left == nullptr) { ost << "Operator(" << getName() << ") - Left value is not a string, " << "returning invalid.\n"; return ResultList(Result::Invalid); diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp index 9af05059bef..14d33914d05 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp @@ -519,12 +519,12 @@ public: void visit(StringTerm & n) override { visitTerm(n, true); } void visit(SubstringTerm & n) override { - query::SimpleRegExpTerm re(vespalib::Regexp::make_from_substring(n.getTerm()), + query::SimpleRegExpTerm re(vespalib::RegexpUtil::make_from_substring(n.getTerm()), n.getView(), n.getId(), n.getWeight()); visitTerm(re); } void visit(SuffixTerm & n) override { - query::SimpleRegExpTerm re(vespalib::Regexp::make_from_suffix(n.getTerm()), + query::SimpleRegExpTerm re(vespalib::RegexpUtil::make_from_suffix(n.getTerm()), n.getView(), n.getId(), n.getWeight()); visitTerm(re); } diff --git a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp index 6cefc03dd70..eafa5bf0e1f 100644 --- a/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multistringattribute.hpp @@ -123,7 +123,7 @@ StringTemplSearchContext(QueryTermSimpleUP qTerm, const AttrType & toBeSearched) auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm(), true); lookupRange(comp, comp); } else if (this->isRegex()) { - vespalib::string prefix(vespalib::Regexp::get_prefix(this->queryTerm()->getTerm())); + vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = enumStore.make_folded_comparator(prefix.c_str(), true); lookupRange(comp, comp); } else { diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 43d8b7ce9d2..e94de44e45b 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -183,14 +183,14 @@ private: using PostingList = typename AggregationTraits::PostingList; using Parent = PostingSearchContext<BaseSC, PostingListFoldedSearchContextT<DataT>, AttrT>; using FoldedComparatorType = typename Parent::EnumStore::FoldedComparatorType; - using Regexp = vespalib::Regexp; + using RegexpUtil = vespalib::RegexpUtil; using QueryTermSimpleUP = typename Parent::QueryTermSimpleUP; using Parent::_toBeSearched; using Parent::_enumStore; using Parent::isRegex; using Parent::getRegex; bool useThis(const PostingListSearchContext::DictionaryConstIterator & it) const override { - return isRegex() ? (getRegex() ? std::regex_search(_enumStore.get_value(it.getKey()), *getRegex()) : false ) : true; + return isRegex() ? (getRegex() ? getRegex()->partial_match(_enumStore.get_value(it.getKey())) : false ) : true; } public: StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const AttrT &toBeSearched); @@ -288,7 +288,7 @@ StringPostingSearchContext(QueryTermSimpleUP qTerm, bool useBitVector, const Att auto comp = _enumStore.make_folded_comparator(this->queryTerm()->getTerm(), true); this->lookupRange(comp, comp); } else if (this->isRegex()) { - vespalib::string prefix(Regexp::get_prefix(this->queryTerm()->getTerm())); + vespalib::string prefix(RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = _enumStore.make_folded_comparator(prefix.c_str(), true); this->lookupRange(comp, comp); } else { diff --git a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp index 214da6bf230..406cbbbe447 100644 --- a/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/singlestringattribute.hpp @@ -59,7 +59,7 @@ SingleValueStringAttributeT<B>::StringTemplSearchContext::StringTemplSearchConte auto comp = enumStore.make_folded_comparator(queryTerm()->getTerm(), true); lookupRange(comp, comp); } else if (this->isRegex()) { - vespalib::string prefix(vespalib::Regexp::get_prefix(this->queryTerm()->getTerm())); + vespalib::string prefix(vespalib::RegexpUtil::get_prefix(this->queryTerm()->getTerm())); auto comp = enumStore.make_folded_comparator(prefix.c_str(), true); lookupRange(comp, comp); } else { diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp index d7523c86e29..32b5b3ca373 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.cpp +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.cpp @@ -231,10 +231,7 @@ StringAttribute::StringSearchContext::StringSearchContext(QueryTermSimple::UP qT _regex() { if (isRegex()) { - try { - _regex = std::regex(_queryTerm->getTerm(), std::regex::icase); - } catch (std::regex_error &) { - } + _regex = vespalib::Regex::from_pattern(_queryTerm->getTerm(), vespalib::Regex::Options::IgnoreCase); } } diff --git a/searchlib/src/vespa/searchlib/attribute/stringbase.h b/searchlib/src/vespa/searchlib/attribute/stringbase.h index cf0a92253de..3518544cbdc 100644 --- a/searchlib/src/vespa/searchlib/attribute/stringbase.h +++ b/searchlib/src/vespa/searchlib/attribute/stringbase.h @@ -9,10 +9,10 @@ #include <vespa/searchlib/attribute/i_enum_store.h> #include <vespa/searchlib/attribute/loadedenumvalue.h> #include <vespa/searchlib/util/foldedstringcompare.h> +#include <vespa/vespalib/regex/regex.h> #include <vespa/vespalib/text/lowercase.h> #include <vespa/vespalib/text/utf8.h> #include <optional> -#include <regex> namespace search { @@ -103,7 +103,7 @@ protected: const QueryTermUCS4 * queryTerm() const override; bool isMatch(const char *src) const { if (__builtin_expect(isRegex(), false)) { - return _regex ? std::regex_search(src, *_regex) : false; + return _regex ? _regex->partial_match(std::string_view(src)) : false; } vespalib::Utf8ReaderForZTS u8reader(src); uint32_t j = 0; @@ -162,7 +162,7 @@ protected: bool isRegex() const { return _isRegex; } QueryTermSimpleUP _queryTerm; std::vector<ucs4_t> _termUCS4; - const std::optional<std::regex>& getRegex() const { return _regex; } + const std::optional<vespalib::Regex>& getRegex() const { return _regex; } private: WeightedConstChar * getBuffer() const { if (_buffer == nullptr) { @@ -170,9 +170,9 @@ protected: } return _buffer; } - unsigned _bufferLen; - mutable WeightedConstChar * _buffer; - std::optional<std::regex> _regex; + unsigned _bufferLen; + mutable WeightedConstChar * _buffer; + std::optional<vespalib::Regex> _regex; }; private: SearchContext::UP getSearch(QueryTermSimpleUP term, const attribute::SearchContextParams & params) const override; diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index 979184acbae..3530fb816df 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -153,6 +153,7 @@ vespa_define_module( src/vespa/vespalib/net/tls/impl src/vespa/vespalib/objects src/vespa/vespalib/portal + src/vespa/vespalib/regex src/vespa/vespalib/stllike src/vespa/vespalib/test src/vespa/vespalib/testkit diff --git a/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp b/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp index ad45c217701..a9e823bf3ab 100644 --- a/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp +++ b/vespalib/src/tests/net/tls/policy_checking_certificate_verifier/policy_checking_certificate_verifier_test.cpp @@ -1,11 +1,8 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/vespalib/io/fileutil.h> #include <vespa/vespalib/net/tls/transport_security_options.h> -#include <vespa/vespalib/net/tls/transport_security_options_reading.h> #include <vespa/vespalib/net/tls/policy_checking_certificate_verifier.h> #include <vespa/vespalib/test/peer_policy_utils.h> #include <vespa/vespalib/testkit/test_kit.h> -#include <vespa/vespalib/util/exceptions.h> using namespace vespalib; using namespace vespalib::net::tls; diff --git a/vespalib/src/tests/regex/regex.cpp b/vespalib/src/tests/regex/regex.cpp index d1b94daa7ba..7dc5a7f4aa9 100644 --- a/vespalib/src/tests/regex/regex.cpp +++ b/vespalib/src/tests/regex/regex.cpp @@ -1,70 +1,147 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include <vespa/vespalib/testkit/test_kit.h> - +#include <vespa/vespalib/regex/regex.h> #include <vespa/vespalib/util/regexp.h> -#include <vespa/vespalib/util/exception.h> -#include <regex> +#include <string> using namespace vespalib; TEST("require that prefix detection works") { - EXPECT_EQUAL("", Regexp::get_prefix("")); - EXPECT_EQUAL("", Regexp::get_prefix("foo")); - EXPECT_EQUAL("foo", Regexp::get_prefix("^foo")); - EXPECT_EQUAL("", Regexp::get_prefix("^foo|bar")); - EXPECT_EQUAL("foo", Regexp::get_prefix("^foo$")); - EXPECT_EQUAL("foo", Regexp::get_prefix("^foo[a-z]")); - EXPECT_EQUAL("fo", Regexp::get_prefix("^foo{0,1}")); - EXPECT_EQUAL("foo", Regexp::get_prefix("^foo.")); - EXPECT_EQUAL("fo", Regexp::get_prefix("^foo*")); - EXPECT_EQUAL("fo", Regexp::get_prefix("^foo?")); - EXPECT_EQUAL("foo", Regexp::get_prefix("^foo+")); + EXPECT_EQUAL("", RegexpUtil::get_prefix("")); + EXPECT_EQUAL("", RegexpUtil::get_prefix("foo")); + EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo")); + EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo|bar")); + EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo$")); + EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo[a-z]")); + EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo{0,1}")); + EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo.")); + EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo*")); + EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo?")); + EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo+")); } TEST("require that prefix detection sometimes underestimates the prefix size") { - EXPECT_EQUAL("", Regexp::get_prefix("^^foo")); - EXPECT_EQUAL("", Regexp::get_prefix("^foo(bar|baz)")); - EXPECT_EQUAL("fo", Regexp::get_prefix("^foo{1,2}")); - EXPECT_EQUAL("foo", Regexp::get_prefix("^foo\\.")); - EXPECT_EQUAL("foo", Regexp::get_prefix("^foo(bar)")); - EXPECT_EQUAL("", Regexp::get_prefix("(^foo)")); - EXPECT_EQUAL("", Regexp::get_prefix("^(foo)")); - EXPECT_EQUAL("foo", Regexp::get_prefix("^foo[a]")); - EXPECT_EQUAL("", Regexp::get_prefix("^foo|^foobar")); + EXPECT_EQUAL("", RegexpUtil::get_prefix("^^foo")); + EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo(bar|baz)")); + EXPECT_EQUAL("fo", RegexpUtil::get_prefix("^foo{1,2}")); + EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo\\.")); + EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo(bar)")); + EXPECT_EQUAL("", RegexpUtil::get_prefix("(^foo)")); + EXPECT_EQUAL("", RegexpUtil::get_prefix("^(foo)")); + EXPECT_EQUAL("foo", RegexpUtil::get_prefix("^foo[a]")); + EXPECT_EQUAL("", RegexpUtil::get_prefix("^foo|^foobar")); } -const vespalib::string special("^|()[]{}.*?+\\$"); +const std::string special("^|()[]{}.*?+\\$"); struct ExprFixture { - std::vector<vespalib::string> expressions; + std::vector<std::string> expressions; ExprFixture() { expressions.push_back(special); for (char c: special) { - expressions.push_back(vespalib::string(&c, 1)); + expressions.emplace_back(std::string(&c, 1)); } - expressions.push_back("abc"); - expressions.push_back("[:digit:]"); + expressions.emplace_back("abc"); + expressions.emplace_back("[:digit:]"); } }; TEST_F("require that regexp can be made from suffix string", ExprFixture()) { - for (vespalib::string str: f1.expressions) { - std::regex re(std::string(Regexp::make_from_suffix(str))); - EXPECT_TRUE(std::regex_search(std::string(str), re)); - EXPECT_FALSE(std::regex_search(std::string(str + "foo"), re)); - EXPECT_TRUE(std::regex_search(std::string("foo" + str), re)); - EXPECT_FALSE(std::regex_search(std::string("foo" + str + "bar"), re)); + for (const auto& str: f1.expressions) { + auto re = Regex::from_pattern(std::string(RegexpUtil::make_from_suffix(str))); + ASSERT_TRUE(re.parsed_ok()); + + EXPECT_TRUE(re.partial_match(str)); + EXPECT_FALSE(re.partial_match(str + "foo")); + EXPECT_TRUE(re.partial_match("foo" + str)); + EXPECT_FALSE(re.partial_match("foo" + str + "bar")); } } TEST_F("require that regexp can be made from substring string", ExprFixture()) { - for (vespalib::string str: f1.expressions) { - std::regex re(std::string(Regexp::make_from_substring(str))); - EXPECT_TRUE(std::regex_search(std::string(str), re)); - EXPECT_TRUE(std::regex_search(std::string(str + "foo"), re)); - EXPECT_TRUE(std::regex_search(std::string("foo" + str), re)); - EXPECT_TRUE(std::regex_search(std::string("foo" + str + "bar"), re)); + for (const auto& str: f1.expressions) { + auto re = Regex::from_pattern(std::string(RegexpUtil::make_from_substring(str))); + ASSERT_TRUE(re.parsed_ok()); + + EXPECT_TRUE(re.partial_match(str)); + EXPECT_TRUE(re.partial_match(str + "foo")); + EXPECT_TRUE(re.partial_match("foo" + str)); + EXPECT_TRUE(re.partial_match("foo" + str + "bar")); } } +TEST("full_match requires expression to match entire input string") { + std::string pattern = "[Aa][Bb][Cc]"; + auto re = Regex::from_pattern(pattern); + ASSERT_TRUE(re.parsed_ok()); + + EXPECT_TRUE(re.full_match("abc")); + EXPECT_TRUE(re.full_match("ABC")); + EXPECT_FALSE(re.full_match("abcd")); + EXPECT_FALSE(re.full_match("aabc")); + EXPECT_FALSE(re.full_match("aabcc")); + + EXPECT_TRUE(Regex::full_match("abc", pattern)); + EXPECT_TRUE(Regex::full_match("ABC", pattern)); + EXPECT_FALSE(Regex::full_match("abcd", pattern)); + EXPECT_FALSE(Regex::full_match("aabc", pattern)); + EXPECT_FALSE(Regex::full_match("aabcc", pattern)); +} + +TEST("partial_match requires expression to match substring of input string") { + std::string pattern = "[Aa][Bb][Cc]"; + auto re = Regex::from_pattern(pattern); + ASSERT_TRUE(re.parsed_ok()); + + EXPECT_TRUE(re.partial_match("abc")); + EXPECT_TRUE(re.partial_match("ABC")); + EXPECT_TRUE(re.partial_match("abcd")); + EXPECT_TRUE(re.partial_match("aabc")); + EXPECT_TRUE(re.partial_match("aabcc")); + EXPECT_FALSE(re.partial_match("abd")); + + EXPECT_TRUE(Regex::partial_match("abc", pattern)); + EXPECT_TRUE(Regex::partial_match("ABC", pattern)); + EXPECT_TRUE(Regex::partial_match("abcd", pattern)); + EXPECT_TRUE(Regex::partial_match("aabc", pattern)); + EXPECT_TRUE(Regex::partial_match("aabcc", pattern)); + EXPECT_FALSE(Regex::partial_match("abd", pattern)); +} + +TEST("partial_match can be explicitly anchored") { + EXPECT_TRUE(Regex::partial_match("abcc", "^abc")); + EXPECT_FALSE(Regex::partial_match("aabc", "^abc")); + EXPECT_TRUE(Regex::partial_match("aabc", "abc$")); + EXPECT_FALSE(Regex::partial_match("abcc", "abc$")); + EXPECT_TRUE(Regex::partial_match("abc", "^abc$")); + EXPECT_FALSE(Regex::partial_match("aabc", "^abc$")); + EXPECT_FALSE(Regex::partial_match("abcc", "^abc$")); +} + +TEST("Regex instance returns parsed_ok() == false upon parse failure") { + auto re = Regex::from_pattern("[a-z"); // Unterminated set + EXPECT_FALSE(re.parsed_ok()); +} + +TEST("Regex that has failed parsing immediately returns false for matches") { + auto re = Regex::from_pattern("[a-z"); + EXPECT_FALSE(re.parsed_ok()); + EXPECT_FALSE(re.partial_match("a")); + EXPECT_FALSE(re.full_match("b")); +} + +TEST("can create case-insensitive regex matcher") { + auto re = Regex::from_pattern("hello", Regex::Options::IgnoreCase); + ASSERT_TRUE(re.parsed_ok()); + EXPECT_TRUE(re.partial_match("HelLo world")); + EXPECT_TRUE(re.full_match("HELLO")); +} + +TEST("regex is case sensitive by default") { + auto re = Regex::from_pattern("hello"); + ASSERT_TRUE(re.parsed_ok()); + EXPECT_FALSE(re.partial_match("HelLo world")); + EXPECT_FALSE(re.full_match("HELLO")); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/vespa/vespalib/CMakeLists.txt b/vespalib/src/vespa/vespalib/CMakeLists.txt index 95f6a407914..4a753a66394 100644 --- a/vespalib/src/vespa/vespalib/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/CMakeLists.txt @@ -16,6 +16,7 @@ vespa_add_library(vespalib $<TARGET_OBJECTS:vespalib_vespalib_net_tls_impl> $<TARGET_OBJECTS:vespalib_vespalib_objects> $<TARGET_OBJECTS:vespalib_vespalib_portal> + $<TARGET_OBJECTS:vespalib_vespalib_regex> $<TARGET_OBJECTS:vespalib_vespalib_stllike> $<TARGET_OBJECTS:vespalib_vespalib_test> $<TARGET_OBJECTS:vespalib_vespalib_testkit> @@ -30,3 +31,5 @@ vespa_add_library(vespalib ) vespa_add_target_package_dependency(vespalib OpenSSL) +vespa_add_target_package_dependency(vespalib RE2) + diff --git a/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp b/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp index 8d2fb04d853..27a11b3f0f1 100644 --- a/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp +++ b/vespalib/src/vespa/vespalib/net/tls/peer_policies.cpp @@ -1,28 +1,34 @@ // Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "peer_policies.h" +#include <vespa/vespalib/regex/regex.h> #include <iostream> -#include <regex> namespace vespalib::net::tls { namespace { -// Note: this is for basix regexp only, _not_ extended regexp -bool is_basic_regex_special_char(char c) noexcept { +bool is_regex_special_char(char c) noexcept { switch (c) { - case '^': - case '$': - case '.': - case '[': - case '\\': - return true; - default: - return false; + case '^': + case '$': + case '|': + case '{': + case '}': + case '(': + case ')': + case '[': + case ']': + case '\\': + case '+': + case '.': + return true; + default: + return false; } } -std::string glob_to_basic_regex(vespalib::stringref glob) { +std::string dot_separated_glob_to_regex(vespalib::stringref glob) { std::string ret = "^"; ret.reserve(glob.size() + 2); for (auto c : glob) { @@ -34,7 +40,7 @@ std::string glob_to_basic_regex(vespalib::stringref glob) { // Same applies for single chars; they should only match _within_ a dot boundary. ret += "[^.]"; } else { - if (is_basic_regex_special_char(c)) { + if (is_regex_special_char(c)) { ret += '\\'; } ret += c; @@ -45,16 +51,16 @@ std::string glob_to_basic_regex(vespalib::stringref glob) { } class RegexHostMatchPattern : public HostGlobPattern { - std::regex _pattern_as_regex; + Regex _pattern_as_regex; public: explicit RegexHostMatchPattern(vespalib::stringref glob_pattern) - : _pattern_as_regex(glob_to_basic_regex(glob_pattern), std::regex_constants::basic) + : _pattern_as_regex(Regex::from_pattern(dot_separated_glob_to_regex(glob_pattern))) { } ~RegexHostMatchPattern() override = default; - bool matches(vespalib::stringref str) const override { - return std::regex_match(str.begin(), str.end(), _pattern_as_regex); + [[nodiscard]] bool matches(vespalib::stringref str) const override { + return _pattern_as_regex.full_match(std::string_view(str.data(), str.size())); } }; diff --git a/vespalib/src/vespa/vespalib/net/tls/peer_policies.h b/vespalib/src/vespa/vespalib/net/tls/peer_policies.h index c558708de8f..9d34b62415f 100644 --- a/vespalib/src/vespa/vespalib/net/tls/peer_policies.h +++ b/vespalib/src/vespa/vespalib/net/tls/peer_policies.h @@ -10,7 +10,7 @@ namespace vespalib::net::tls { struct HostGlobPattern { virtual ~HostGlobPattern() = default; - virtual bool matches(vespalib::stringref str) const = 0; + [[nodiscard]] virtual bool matches(vespalib::stringref str) const = 0; static std::shared_ptr<const HostGlobPattern> create_from_glob(vespalib::stringref pattern); }; @@ -36,7 +36,7 @@ public: && (_original_pattern == rhs._original_pattern)); } - bool matches(vespalib::stringref str) const { + [[nodiscard]] bool matches(vespalib::stringref str) const { return (_match_pattern && _match_pattern->matches(str)); } @@ -89,7 +89,7 @@ public: bool operator==(const AuthorizedPeers& rhs) const { return (_peer_policies == rhs._peer_policies); } - bool allows_all_authenticated() const noexcept { + [[nodiscard]] bool allows_all_authenticated() const noexcept { return _allow_all_if_empty; } const std::vector<PeerPolicy>& peer_policies() const noexcept { return _peer_policies; } diff --git a/vespalib/src/vespa/vespalib/regex/CMakeLists.txt b/vespalib/src/vespa/vespalib/regex/CMakeLists.txt new file mode 100644 index 00000000000..1034dbf6086 --- /dev/null +++ b/vespalib/src/vespa/vespalib/regex/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(vespalib_vespalib_regex OBJECT + SOURCES + regex.cpp + DEPENDS +) + +find_package(RE2 REQUIRED) +# TODO can this be PRIVATE since we don't expose it transitively? +target_include_directories(vespalib_vespalib_regex PUBLIC ${RE2_INCLUDE_DIR}) diff --git a/vespalib/src/vespa/vespalib/regex/regex.cpp b/vespalib/src/vespa/vespalib/regex/regex.cpp new file mode 100644 index 00000000000..81677f1b9dd --- /dev/null +++ b/vespalib/src/vespa/vespalib/regex/regex.cpp @@ -0,0 +1,88 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "regex.h" +#include <re2/re2.h> +#include <cassert> +#include <cstdint> + +namespace vespalib { + +using re2::StringPiece; + +// All RE2 instances use a Quiet option to prevent the library from +// complaining to stderr if pattern compilation fails. + +Regex::Regex(std::shared_ptr<const Impl> impl) + : _impl(std::move(impl)) +{} + +Regex::Regex(const Regex&) = default; +Regex& Regex::operator=(const Regex&) = default; +Regex::Regex(Regex&&) noexcept = default; +Regex& Regex::operator=(Regex&&) noexcept = default; + +Regex::~Regex() = default; + +class Regex::Impl { + RE2 _regex; +public: + Impl(std::string_view pattern, const re2::RE2::Options& opts) + : _regex(StringPiece(pattern.data(), pattern.size()), opts) + {} + + bool parsed_ok() const noexcept { + return _regex.ok(); + } + + bool partial_match(std::string_view input) const noexcept { + assert(input.size() <= INT32_MAX); + if (!_regex.ok()) { + return false; + } + return RE2::PartialMatch(StringPiece(input.data(), input.size()), _regex); + } + + bool full_match(std::string_view input) const noexcept { + assert(input.size() <= INT32_MAX); + if (!_regex.ok()) { + return false; + } + return RE2::FullMatch(StringPiece(input.data(), input.size()), _regex); + } +}; + +Regex Regex::from_pattern(std::string_view pattern, uint32_t opt_mask) { + assert(pattern.size() <= INT32_MAX); // StringPiece limitation + RE2::Options opts; + opts.set_log_errors(false); + if ((opt_mask & Options::IgnoreCase) != 0) { + opts.set_case_sensitive(false); + } + return Regex(std::make_shared<Impl>(pattern, opts)); +} + +bool Regex::parsed_ok() const noexcept { + return _impl->parsed_ok(); +} + +bool Regex::partial_match(std::string_view input) const noexcept { + return _impl->partial_match(input); +} + +bool Regex::full_match(std::string_view input) const noexcept { + return _impl->full_match(input); +} + +bool Regex::partial_match(std::string_view input, std::string_view pattern) noexcept { + assert(pattern.size() <= INT32_MAX); + Impl impl(pattern, RE2::Quiet); + return impl.partial_match(input); +} + +bool Regex::full_match(std::string_view input, std::string_view pattern) noexcept { + assert(pattern.size() <= INT32_MAX); + Impl impl(pattern, RE2::Quiet); + return impl.full_match(input); +} + +} diff --git a/vespalib/src/vespa/vespalib/regex/regex.h b/vespalib/src/vespa/vespalib/regex/regex.h new file mode 100644 index 00000000000..4382d057252 --- /dev/null +++ b/vespalib/src/vespa/vespalib/regex/regex.h @@ -0,0 +1,69 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <memory> +#include <string> +#include <string_view> + +namespace vespalib { + +/** + * A simple Regex library wrapper which provides for both just-in-time + * pattern evaluation as well as pattern precompilation and reuse. + * + * Robustness and input safety: + * The underlying regex engine implementation must ensure that pattern + * parsing and input processing is safe to be run on _untrusted_ inputs. + * This means the underlying implementation shall provide upper bounds + * on both memory and CPU time and may never crash or corrupt the process. + * + * We currently use Google RE2 under the hood to achieve this. + * + * Note: due to underlying RE2 limitations, string lengths may + * not be longer than INT_MAX. + * + * Thread safety: + * A Regex object is safe to be used from multiple threads. + * + * Exception safety: + * Exceptions shall never be thrown from the regex code itself, neither + * at parse time nor at match time (ancillary exceptions _could_ be thrown + * from memory allocation failures etc, but we assume that the caller + * is running vespamalloc which terminates the process instead, making + * the whole thing effectively noexcept). + * + * If the provided regular expression pattern is malformed, parsing + * fails silently; all match functions will return false immediately. + */ +class Regex { + class Impl; + std::shared_ptr<const Impl> _impl; // shared_ptr to allow for cheap copying. + + explicit Regex(std::shared_ptr<const Impl> impl); +public: + // TODO consider using type-safe parameter instead. + enum Options { + None = 0, + IgnoreCase = 1 + }; + + ~Regex(); + Regex(const Regex&); + Regex& operator=(const Regex&); + Regex(Regex&&) noexcept; + Regex& operator=(Regex&&) noexcept; + + [[nodiscard]] bool parsed_ok() const noexcept; + + [[nodiscard]] bool partial_match(std::string_view input) const noexcept; + [[nodiscard]] bool full_match(std::string_view input) const noexcept; + + static Regex from_pattern(std::string_view pattern, uint32_t opt_flags = Options::None); + + // Utility matchers for non-precompiled expressions. + [[nodiscard]] static bool partial_match(std::string_view input, std::string_view pattern) noexcept; + [[nodiscard]] static bool full_match(std::string_view input, std::string_view pattern) noexcept; +}; + +} + diff --git a/vespalib/src/vespa/vespalib/util/regexp.cpp b/vespalib/src/vespa/vespalib/util/regexp.cpp index b3cad06382e..0d0c7b69b12 100644 --- a/vespalib/src/vespa/vespalib/util/regexp.cpp +++ b/vespalib/src/vespa/vespalib/util/regexp.cpp @@ -41,7 +41,7 @@ vespalib::string escape(vespalib::stringref str) { } // namespace vespalib::<unnamed> vespalib::string -Regexp::get_prefix(vespalib::stringref re) +RegexpUtil::get_prefix(vespalib::stringref re) { vespalib::string prefix; if ((re.size() > 0) && (re.data()[0] == '^') && !has_option(re)) { @@ -58,13 +58,13 @@ Regexp::get_prefix(vespalib::stringref re) } vespalib::string -Regexp::make_from_suffix(vespalib::stringref suffix) +RegexpUtil::make_from_suffix(vespalib::stringref suffix) { return escape(suffix) + "$"; } vespalib::string -Regexp::make_from_substring(vespalib::stringref substring) +RegexpUtil::make_from_substring(vespalib::stringref substring) { return escape(substring); } diff --git a/vespalib/src/vespa/vespalib/util/regexp.h b/vespalib/src/vespa/vespalib/util/regexp.h index 9897b488aff..74a69fee361 100644 --- a/vespalib/src/vespa/vespalib/util/regexp.h +++ b/vespalib/src/vespa/vespalib/util/regexp.h @@ -8,7 +8,7 @@ namespace vespalib { /** * Utility class inspecting and generating regular expression strings. **/ -class Regexp +class RegexpUtil { public: /** |