diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2020-04-01 11:21:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-04-01 11:21:16 +0200 |
commit | be3c2e7f0f1f3edc07973dccb05b13bf30c75de1 (patch) | |
tree | 710db4b6b95bf5bda839070b952d380591126f7e /document | |
parent | 444693a52467663ba4195e4f90991a388defe75c (diff) | |
parent | 4153969ca13541e7434963d02add12d574c9a461 (diff) |
Merge pull request #12784 from vespa-engine/vekterli/add-explicit-limits-to-selection-parser
Add explicit limits to backend document selection parsing
Diffstat (limited to 'document')
-rw-r--r-- | document/src/tests/documentselectparsertest.cpp | 79 | ||||
-rw-r--r-- | document/src/vespa/document/select/CMakeLists.txt | 1 | ||||
-rw-r--r-- | document/src/vespa/document/select/branch.cpp | 6 | ||||
-rw-r--r-- | document/src/vespa/document/select/branch.h | 9 | ||||
-rw-r--r-- | document/src/vespa/document/select/compare.cpp | 2 | ||||
-rw-r--r-- | document/src/vespa/document/select/grammar/lexer.ll | 7 | ||||
-rw-r--r-- | document/src/vespa/document/select/node.h | 29 | ||||
-rw-r--r-- | document/src/vespa/document/select/parser.cpp | 14 | ||||
-rw-r--r-- | document/src/vespa/document/select/parser_limits.cpp | 13 | ||||
-rw-r--r-- | document/src/vespa/document/select/parser_limits.h | 19 | ||||
-rw-r--r-- | document/src/vespa/document/select/valuenode.h | 26 | ||||
-rw-r--r-- | document/src/vespa/document/select/valuenodes.cpp | 39 | ||||
-rw-r--r-- | document/src/vespa/document/select/valuenodes.h | 10 |
13 files changed, 214 insertions, 40 deletions
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp index 110153954af..59f4ce69b33 100644 --- a/document/src/tests/documentselectparsertest.cpp +++ b/document/src/tests/documentselectparsertest.cpp @@ -18,6 +18,7 @@ #include <vespa/document/select/compare.h> #include <vespa/document/select/operator.h> #include <vespa/document/select/parse_utils.h> +#include <vespa/document/select/parser_limits.h> #include <vespa/vespalib/util/exceptions.h> #include <limits> #include <gtest/gtest.h> @@ -1247,17 +1248,17 @@ TEST_F(DocumentSelectParserTest, testThatSimpleFieldValuesHaveCorrectFieldName) TEST_F(DocumentSelectParserTest, testThatComplexFieldValuesHaveCorrectFieldNames) { - EXPECT_EQ( - vespalib::string("headerval"), - parseFieldValue("testdoctype1.headerval{test}")->getRealFieldName()); + EXPECT_EQ(vespalib::string("headerval"), + parseFieldValue("testdoctype1.headerval{test}")->getRealFieldName()); - EXPECT_EQ( - vespalib::string("headerval"), - parseFieldValue("testdoctype1.headerval[42]")->getRealFieldName()); + EXPECT_EQ(vespalib::string("headerval"), + parseFieldValue("testdoctype1.headerval[42]")->getRealFieldName()); - EXPECT_EQ( - vespalib::string("headerval"), - parseFieldValue("testdoctype1.headerval.meow.meow{test}")->getRealFieldName()); + EXPECT_EQ(vespalib::string("headerval"), + parseFieldValue("testdoctype1.headerval.meow.meow{test}")->getRealFieldName()); + + EXPECT_EQ(vespalib::string("headerval"), + parseFieldValue("testdoctype1.headerval .meow.meow{test}")->getRealFieldName()); } namespace { @@ -1603,4 +1604,64 @@ TEST_F(DocumentSelectParserTest, redundant_glob_wildcards_are_collapsed_into_min EXPECT_EQ(GlobOperator::convertToRegex("*?*?*?*"), "..*..*."); // Don't try this at home, kids! } +TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_field_exprs) { + createDocs(); + std::string expr = "testdoctype1"; + for (size_t i = 0; i < 50000; ++i) { + expr += ".foo"; + } + expr += ".hash() != 0"; + verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested (max 1024 levels)"); +} + +TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_arithmetic_exprs) { + createDocs(); + std::string expr = "1"; + for (size_t i = 0; i < 50000; ++i) { + expr += "+1"; + } + expr += " != 0"; + verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested (max 1024 levels)"); +} + +TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_binary_logical_exprs) { + createDocs(); + // Also throw in some comparisons to ensure they carry over the max depth. + std::string expr = "1 == 2"; + std::string cmp_subexpr = "3 != 4"; + for (size_t i = 0; i < 10000; ++i) { + expr += (i % 2 == 0 ? " and " : " or ") + cmp_subexpr; + } + verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested (max 1024 levels)"); +} + +TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_unary_logical_exprs) { + createDocs(); + std::string expr; + for (size_t i = 0; i < 10000; ++i) { + expr += "not "; + } + expr += "true"; + verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested (max 1024 levels)"); +} + +TEST_F(DocumentSelectParserTest, selection_has_upper_limit_on_input_size) { + createDocs(); + std::string expr = ("testdoctype1.a_biii" + + std::string(select::ParserLimits::MaxSelectionByteSize, 'i') + + "iiig_identifier"); + verifyFailedParse(expr, "ParsingFailedException: expression is too large to be " + "parsed (max 1048576 bytes, got 1048610)"); +} + +TEST_F(DocumentSelectParserTest, lexing_does_not_have_superlinear_time_complexity) { + createDocs(); + std::string expr = ("testdoctype1.hstringval == 'a_biii" + + std::string(select::ParserLimits::MaxSelectionByteSize - 100, 'i') + + "iiig string'"); + // If the lexer is not compiled with the appropriate options, this will take a long time. + // A really, really long time. + PARSE(expr, *_doc[0], False); +} + } // document diff --git a/document/src/vespa/document/select/CMakeLists.txt b/document/src/vespa/document/select/CMakeLists.txt index 81e5d86675c..f210e8abdd7 100644 --- a/document/src/vespa/document/select/CMakeLists.txt +++ b/document/src/vespa/document/select/CMakeLists.txt @@ -36,6 +36,7 @@ vespa_add_library(document_select OBJECT parser.cpp parse_utils.cpp parsing_failed_exception.cpp + parser_limits.cpp ${BISON_DocSelParser_OUTPUTS} ${FLEX_DocSelLexer_OUTPUTS} AFTER diff --git a/document/src/vespa/document/select/branch.cpp b/document/src/vespa/document/select/branch.cpp index b3d5f97ccab..9104e2c5544 100644 --- a/document/src/vespa/document/select/branch.cpp +++ b/document/src/vespa/document/select/branch.cpp @@ -8,7 +8,7 @@ namespace document::select { And::And(std::unique_ptr<Node> left, std::unique_ptr<Node> right, const char* name) - : Branch(name ? name : "and"), + : Branch(name ? name : "and", std::max(left->max_depth(), right->max_depth()) + 1), _left(std::move(left)), _right(std::move(right)) { @@ -54,7 +54,7 @@ And::trace(const Context& context, std::ostream& out) const } Or::Or(std::unique_ptr<Node> left, std::unique_ptr<Node> right, const char* name) - : Branch(name ? name : "or"), + : Branch(name ? name : "or", std::max(left->max_depth(), right->max_depth()) + 1), _left(std::move(left)), _right(std::move(right)) { @@ -100,7 +100,7 @@ Or::trace(const Context& context, std::ostream& out) const } Not::Not(std::unique_ptr<Node> child, const char* name) - : Branch(name ? name : "not"), + : Branch(name ? name : "not", child->max_depth() + 1), _child(std::move(child)) { assert(_child.get()); diff --git a/document/src/vespa/document/select/branch.h b/document/src/vespa/document/select/branch.h index 8637b41de89..77ed74030b5 100644 --- a/document/src/vespa/document/select/branch.h +++ b/document/src/vespa/document/select/branch.h @@ -19,7 +19,8 @@ namespace document::select { class Branch : public Node { public: - Branch(vespalib::stringref name) : Node(name) {} + explicit Branch(vespalib::stringref name) : Node(name) {} + Branch(vespalib::stringref name, uint32_t max_depth) : Node(name, max_depth) {} bool isLeafNode() const override { return false; } }; @@ -30,7 +31,7 @@ class And : public Branch std::unique_ptr<Node> _right; public: And(std::unique_ptr<Node> left, std::unique_ptr<Node> right, - const char* name = 0); + const char* name = nullptr); ResultList contains(const Context& context) const override { return (_left->contains(context) && _right->contains(context)); @@ -53,7 +54,7 @@ class Or : public Branch std::unique_ptr<Node> _right; public: Or(std::unique_ptr<Node> left, std::unique_ptr<Node> right, - const char* name = 0); + const char* name = nullptr); ResultList contains(const Context& context) const override { return (_left->contains(context) || _right->contains(context)); @@ -74,7 +75,7 @@ class Not : public Branch { std::unique_ptr<Node> _child; public: - Not(std::unique_ptr<Node> child, const char* name = 0); + Not(std::unique_ptr<Node> child, const char* name = nullptr); ResultList contains(const Context& context) const override { return !_child->contains(context); } ResultList trace(const Context&, std::ostream& trace) const override; diff --git a/document/src/vespa/document/select/compare.cpp b/document/src/vespa/document/select/compare.cpp index 7db40929a64..caef1bdd250 100644 --- a/document/src/vespa/document/select/compare.cpp +++ b/document/src/vespa/document/select/compare.cpp @@ -15,7 +15,7 @@ Compare::Compare(std::unique_ptr<ValueNode> left, const Operator& op, std::unique_ptr<ValueNode> right, const BucketIdFactory& bucketIdFactory) - : Node("Compare"), + : Node("Compare", std::max(left->max_depth(), right->max_depth()) + 1), _left(std::move(left)), _right(std::move(right)), _operator(op), diff --git a/document/src/vespa/document/select/grammar/lexer.ll b/document/src/vespa/document/select/grammar/lexer.ll index bd011c8ebf6..1222aac02a2 100644 --- a/document/src/vespa/document/select/grammar/lexer.ll +++ b/document/src/vespa/document/select/grammar/lexer.ll @@ -7,6 +7,13 @@ %option noyywrap nounput %option yyclass="document::select::DocSelScanner" + /* Flex lexer must be compiled with batch mode (as opposed to interactive mode) + * or parsing of large tokens appears to trigger superlinear time complexity. + * Also use full, non-compressed lookup tables for maximum performance. + */ +%option batch +%option full + /* Used to track source locations, see https://github.com/bingmann/flex-bison-cpp-example/blob/master/src/scanner.ll */ %{ #define YY_USER_ACTION yyloc->columns(yyleng); diff --git a/document/src/vespa/document/select/node.h b/document/src/vespa/document/select/node.h index 48a64ae63f5..9a3b687d81c 100644 --- a/document/src/vespa/document/select/node.h +++ b/document/src/vespa/document/select/node.h @@ -12,6 +12,7 @@ #include "resultlist.h" #include "context.h" +#include "parser_limits.h" namespace document::select { @@ -21,19 +22,33 @@ class Node : public Printable { protected: vespalib::string _name; + uint32_t _max_depth; bool _parentheses; // Set to true if parentheses was used around this part // Set such that we can recreate original query in print. public: typedef std::unique_ptr<Node> UP; typedef std::shared_ptr<Node> SP; - Node(vespalib::stringref name) : _name(name), _parentheses(false) {} - ~Node() override {} + Node(vespalib::stringref name, uint32_t max_depth) + : _name(name), _max_depth(max_depth), _parentheses(false) + { + throw_parse_error_if_max_depth_exceeded(); + } - void setParentheses() { _parentheses = true; } + explicit Node(vespalib::stringref name) + : _name(name), _max_depth(1), _parentheses(false) + {} + ~Node() override = default; - void clearParentheses() { _parentheses = false; } + // Depth is explicitly tracked to limit recursion to a sane maximum when building and + // processing ASTs, as the Bison framework does not have anything useful for us there. + // The AST is built from the leaves up towards the root, so we can cheaply track depth + // of subtrees in O(1) time per node by computing a node's own depth based on immediate + // children at node construction time. + [[nodiscard]] uint32_t max_depth() const noexcept { return _max_depth; } + void setParentheses() { _parentheses = true; } + void clearParentheses() { _parentheses = false; } bool hadParentheses() const { return _parentheses; } virtual ResultList contains(const Context&) const = 0; @@ -43,6 +58,12 @@ public: virtual Node::UP clone() const = 0; protected: + void throw_parse_error_if_max_depth_exceeded() const { + if (_max_depth > ParserLimits::MaxRecursionDepth) { + throw_max_depth_exceeded_exception(); + } + } + Node::UP wrapParens(Node* node) const { Node::UP ret(node); if (_parentheses) { diff --git a/document/src/vespa/document/select/parser.cpp b/document/src/vespa/document/select/parser.cpp index 9f015409011..fadb46e5aa3 100644 --- a/document/src/vespa/document/select/parser.cpp +++ b/document/src/vespa/document/select/parser.cpp @@ -1,5 +1,6 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "parser.h" +#include "parser_limits.h" #include "scanner.h" #include <vespa/document/base/exceptions.h> #include <vespa/document/util/stringutil.h> @@ -8,7 +9,20 @@ namespace document::select { +namespace { + +void verify_expression_not_too_large(const std::string& expr) { + if (expr.size() > ParserLimits::MaxSelectionByteSize) { + throw ParsingFailedException(vespalib::make_string( + "expression is too large to be parsed (max %zu bytes, got %zu)", + ParserLimits::MaxSelectionByteSize, expr.size())); + } +} + +} + std::unique_ptr<Node> Parser::parse(const std::string& str) const { + verify_expression_not_too_large(str); try { std::istringstream ss(str); DocSelScanner scanner(&ss); diff --git a/document/src/vespa/document/select/parser_limits.cpp b/document/src/vespa/document/select/parser_limits.cpp new file mode 100644 index 00000000000..13e494b376f --- /dev/null +++ b/document/src/vespa/document/select/parser_limits.cpp @@ -0,0 +1,13 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "parser_limits.h" +#include "parsing_failed_exception.h" +#include <vespa/vespalib/util/stringfmt.h> + +namespace document::select { + +void throw_max_depth_exceeded_exception() { + throw ParsingFailedException(vespalib::make_string( + "expression is too deeply nested (max %u levels)", ParserLimits::MaxRecursionDepth)); +} + +} diff --git a/document/src/vespa/document/select/parser_limits.h b/document/src/vespa/document/select/parser_limits.h new file mode 100644 index 00000000000..24c0a165611 --- /dev/null +++ b/document/src/vespa/document/select/parser_limits.h @@ -0,0 +1,19 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <cstdint> +#include <cstddef> + +namespace document::select { + +// Any resource constraints set for parsing document selection expressions +struct ParserLimits { + // Max depth allowed for nodes in the AST tree. + constexpr static uint32_t MaxRecursionDepth = 1024; + // Max size of entire input document selection string, in bytes. + constexpr static size_t MaxSelectionByteSize = 1024*1024; +}; + +void __attribute__((noinline)) throw_max_depth_exceeded_exception(); + +} diff --git a/document/src/vespa/document/select/valuenode.h b/document/src/vespa/document/select/valuenode.h index 04ed8178b40..8dd535a736a 100644 --- a/document/src/vespa/document/select/valuenode.h +++ b/document/src/vespa/document/select/valuenode.h @@ -5,12 +5,13 @@ * * @brief Node representing a value in the tree * - * @author H�kon Humberset + * @author Håkon Humberset */ #pragma once #include "value.h" +#include "parser_limits.h" namespace document::select { @@ -22,8 +23,19 @@ class ValueNode : public Printable public: using UP = std::unique_ptr<ValueNode>; - ValueNode() : _parentheses(false) {} - virtual ~ValueNode() {} + explicit ValueNode(uint32_t max_depth) + : _max_depth(max_depth), _parentheses(false) + { + throw_parse_error_if_max_depth_exceeded(); + } + ValueNode() : _max_depth(1), _parentheses(false) {} + ~ValueNode() override = default; + + // See comments for same function in node.h for a description on how and why + // we track this. Since Node and ValueNode live in completely separate type + // hierarchies, this particular bit of code duplication is unfortunate but + // incurs the least cognitive overhead. + [[nodiscard]] uint32_t max_depth() const noexcept { return _max_depth; } void setParentheses() { _parentheses = true; } void clearParentheses() { _parentheses = false; } @@ -34,9 +46,17 @@ public: virtual ValueNode::UP clone() const = 0; virtual std::unique_ptr<Value> traceValue(const Context &context, std::ostream &out) const; private: + uint32_t _max_depth; bool _parentheses; // Set to true if parentheses was used around this part // Set such that we can recreate original query in print. + protected: + void throw_parse_error_if_max_depth_exceeded() const { + if (_max_depth > ParserLimits::MaxRecursionDepth) { + throw_max_depth_exceeded_exception(); + } + } + ValueNode::UP wrapParens(ValueNode* node) const { ValueNode::UP ret(node); if (_parentheses) { diff --git a/document/src/vespa/document/select/valuenodes.cpp b/document/src/vespa/document/select/valuenodes.cpp index 95cf2f4e7e5..026623cf83c 100644 --- a/document/src/vespa/document/select/valuenodes.cpp +++ b/document/src/vespa/document/select/valuenodes.cpp @@ -21,10 +21,6 @@ LOG_SETUP(".document.select.valuenode"); namespace document::select { namespace { - static const std::regex FIELD_NAME_REGEX("^([_A-Za-z][_A-Za-z0-9]*).*"); -} - -namespace { bool documentTypeEqualsName(const DocumentType& type, vespalib::stringref name) { if (type.getName() == name) return true; @@ -40,7 +36,7 @@ namespace { InvalidValueNode::InvalidValueNode(vespalib::stringref name) : _name(name) -{ } +{} void @@ -194,15 +190,33 @@ FieldValueNode::FieldValueNode(const vespalib::string& doctype, FieldValueNode::~FieldValueNode() = default; -vespalib::string -FieldValueNode::extractFieldName(const std::string & fieldExpression) { - std::smatch match; +namespace { - if (std::regex_match(fieldExpression, match, FIELD_NAME_REGEX) && match[1].matched) { - return vespalib::string(match[1].first, match[1].second); +size_t first_ident_length_or_npos(const vespalib::string& expr) { + for (size_t i = 0; i < expr.size(); ++i) { + switch (expr[i]) { + case '.': + case '{': + case '[': + case ' ': + case '\n': + case '\t': + return i; + default: + continue; + } } + return vespalib::string::npos; +} - throw ParsingFailedException("Fatal: could not extract field name from field expression '" + fieldExpression + "'"); +} + +// TODO remove this pile of fun in favor of actually parsed AST nodes...! +vespalib::string +FieldValueNode::extractFieldName(const vespalib::string & fieldExpression) { + // When we get here the actual contents of the field expression shall already + // have been structurally and syntactically verified by the parser. + return fieldExpression.substr(0, first_ident_length_or_npos(fieldExpression)); } namespace { @@ -844,7 +858,8 @@ FunctionValueNode::print(std::ostream& out, bool verbose, ArithmeticValueNode::ArithmeticValueNode( std::unique_ptr<ValueNode> left, vespalib::stringref op, std::unique_ptr<ValueNode> right) - : _operator(), + : ValueNode(std::max(left->max_depth(), right->max_depth()) + 1), + _operator(), _left(std::move(left)), _right(std::move(right)) { diff --git a/document/src/vespa/document/select/valuenodes.h b/document/src/vespa/document/select/valuenodes.h index 8009542c364..a7d5fa15f37 100644 --- a/document/src/vespa/document/select/valuenodes.h +++ b/document/src/vespa/document/select/valuenodes.h @@ -160,7 +160,7 @@ public: FieldValueNode & operator = (const FieldValueNode &) = delete; FieldValueNode(FieldValueNode &&) = default; FieldValueNode & operator = (FieldValueNode &&) = default; - ~FieldValueNode(); + ~FieldValueNode() override; const vespalib::string& getDocType() const { return _doctype; } const vespalib::string& getRealFieldName() const { return _fieldName; } @@ -175,7 +175,7 @@ public: return wrapParens(new FieldValueNode(_doctype, _fieldExpression)); } - static vespalib::string extractFieldName(const std::string & fieldExpression); + static vespalib::string extractFieldName(const vespalib::string & fieldExpression); private: @@ -192,13 +192,15 @@ class FieldExprNode final : public ValueNode { public: explicit FieldExprNode(const vespalib::string& doctype) : _left_expr(), _right_expr(doctype) {} FieldExprNode(std::unique_ptr<FieldExprNode> left_expr, vespalib::stringref right_expr) - : _left_expr(std::move(left_expr)), _right_expr(right_expr) + : ValueNode(left_expr->max_depth() + 1), + _left_expr(std::move(left_expr)), + _right_expr(right_expr) {} FieldExprNode(const FieldExprNode &) = delete; FieldExprNode & operator = (const FieldExprNode &) = delete; FieldExprNode(FieldExprNode &&) = default; FieldExprNode & operator = (FieldExprNode &&) = default; - ~FieldExprNode(); + ~FieldExprNode() override; std::unique_ptr<FieldValueNode> convert_to_field_value() const; std::unique_ptr<FunctionValueNode> convert_to_function_call() const; |