summaryrefslogtreecommitdiffstats
path: root/document
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2020-03-31 14:30:48 +0000
committerTor Brede Vekterli <vekterli@verizonmedia.com>2020-03-31 14:37:10 +0000
commitbc693476136615bb44b9c4705cd959e53fee1afd (patch)
tree83c01a251046644c5b7ac900909c085ac9aa2e86 /document
parent451173e78f50c4db14f0def7a12eb9881720b94a (diff)
Add explicit limits to backend document selection parsing
Adds the following (very generous) limits: - Max AST depth of 1024 - Max input selection string size of 1 MiB Have to track AST depth manually, as there is no exposed way of doing this natively via Bison. Also removed a regex that had the potential of catastrophic backtracking in case of massive inputs. It wasn't removed during the previous purge due to being used with capture groups, which are not supported by our current vespalib regex wrapper.
Diffstat (limited to 'document')
-rw-r--r--document/src/tests/documentselectparsertest.cpp68
-rw-r--r--document/src/vespa/document/select/CMakeLists.txt1
-rw-r--r--document/src/vespa/document/select/branch.cpp6
-rw-r--r--document/src/vespa/document/select/branch.h9
-rw-r--r--document/src/vespa/document/select/compare.cpp2
-rw-r--r--document/src/vespa/document/select/node.h29
-rw-r--r--document/src/vespa/document/select/parser.cpp12
-rw-r--r--document/src/vespa/document/select/parser_limits.cpp11
-rw-r--r--document/src/vespa/document/select/parser_limits.h19
-rw-r--r--document/src/vespa/document/select/valuenode.h26
-rw-r--r--document/src/vespa/document/select/valuenodes.cpp39
-rw-r--r--document/src/vespa/document/select/valuenodes.h10
12 files changed, 192 insertions, 40 deletions
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp
index 110153954af..c8cbd1be9c2 100644
--- a/document/src/tests/documentselectparsertest.cpp
+++ b/document/src/tests/documentselectparsertest.cpp
@@ -18,6 +18,7 @@
#include <vespa/document/select/compare.h>
#include <vespa/document/select/operator.h>
#include <vespa/document/select/parse_utils.h>
+#include <vespa/document/select/parser_limits.h>
#include <vespa/vespalib/util/exceptions.h>
#include <limits>
#include <gtest/gtest.h>
@@ -1247,17 +1248,17 @@ TEST_F(DocumentSelectParserTest, testThatSimpleFieldValuesHaveCorrectFieldName)
TEST_F(DocumentSelectParserTest, testThatComplexFieldValuesHaveCorrectFieldNames)
{
- EXPECT_EQ(
- vespalib::string("headerval"),
- parseFieldValue("testdoctype1.headerval{test}")->getRealFieldName());
+ EXPECT_EQ(vespalib::string("headerval"),
+ parseFieldValue("testdoctype1.headerval{test}")->getRealFieldName());
- EXPECT_EQ(
- vespalib::string("headerval"),
- parseFieldValue("testdoctype1.headerval[42]")->getRealFieldName());
+ EXPECT_EQ(vespalib::string("headerval"),
+ parseFieldValue("testdoctype1.headerval[42]")->getRealFieldName());
- EXPECT_EQ(
- vespalib::string("headerval"),
- parseFieldValue("testdoctype1.headerval.meow.meow{test}")->getRealFieldName());
+ EXPECT_EQ(vespalib::string("headerval"),
+ parseFieldValue("testdoctype1.headerval.meow.meow{test}")->getRealFieldName());
+
+ EXPECT_EQ(vespalib::string("headerval"),
+ parseFieldValue("testdoctype1.headerval .meow.meow{test}")->getRealFieldName());
}
namespace {
@@ -1603,4 +1604,53 @@ TEST_F(DocumentSelectParserTest, redundant_glob_wildcards_are_collapsed_into_min
EXPECT_EQ(GlobOperator::convertToRegex("*?*?*?*"), "..*..*."); // Don't try this at home, kids!
}
+TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_field_exprs) {
+ createDocs();
+ std::string expr = "testdoctype1";
+ for (size_t i = 0; i < 50000; ++i) {
+ expr += ".foo";
+ }
+ expr += ".hash() != 0";
+ verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested");
+}
+
+TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_arithmetic_exprs) {
+ createDocs();
+ std::string expr = "1";
+ for (size_t i = 0; i < 50000; ++i) {
+ expr += "+1";
+ }
+ expr += " != 0";
+ verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested");
+}
+
+TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_binary_logical_exprs) {
+ createDocs();
+ // Also throw in some comparisons to ensure they carry over the max depth.
+ std::string expr = "1 == 2";
+ std::string cmp_subexpr = "3 != 4";
+ for (size_t i = 0; i < 10000; ++i) {
+ expr += (i % 2 == 0 ? " and " : " or ") + cmp_subexpr;
+ }
+ verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested");
+}
+
+TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_unary_logical_exprs) {
+ createDocs();
+ std::string expr;
+ for (size_t i = 0; i < 10000; ++i) {
+ expr += "not ";
+ }
+ expr += "true";
+ verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested");
+}
+
+TEST_F(DocumentSelectParserTest, selection_has_upper_limit_on_input_size) {
+ createDocs();
+ std::string expr = ("testdoctype1.a_biii"
+ + std::string(select::ParserLimits::MaxSelectionByteSize, 'i')
+ + "iiig_identifier");
+ verifyFailedParse(expr, "ParsingFailedException: expression is too large to be parsed");
+}
+
} // document
diff --git a/document/src/vespa/document/select/CMakeLists.txt b/document/src/vespa/document/select/CMakeLists.txt
index 81e5d86675c..f210e8abdd7 100644
--- a/document/src/vespa/document/select/CMakeLists.txt
+++ b/document/src/vespa/document/select/CMakeLists.txt
@@ -36,6 +36,7 @@ vespa_add_library(document_select OBJECT
parser.cpp
parse_utils.cpp
parsing_failed_exception.cpp
+ parser_limits.cpp
${BISON_DocSelParser_OUTPUTS}
${FLEX_DocSelLexer_OUTPUTS}
AFTER
diff --git a/document/src/vespa/document/select/branch.cpp b/document/src/vespa/document/select/branch.cpp
index b3d5f97ccab..9104e2c5544 100644
--- a/document/src/vespa/document/select/branch.cpp
+++ b/document/src/vespa/document/select/branch.cpp
@@ -8,7 +8,7 @@
namespace document::select {
And::And(std::unique_ptr<Node> left, std::unique_ptr<Node> right, const char* name)
- : Branch(name ? name : "and"),
+ : Branch(name ? name : "and", std::max(left->max_depth(), right->max_depth()) + 1),
_left(std::move(left)),
_right(std::move(right))
{
@@ -54,7 +54,7 @@ And::trace(const Context& context, std::ostream& out) const
}
Or::Or(std::unique_ptr<Node> left, std::unique_ptr<Node> right, const char* name)
- : Branch(name ? name : "or"),
+ : Branch(name ? name : "or", std::max(left->max_depth(), right->max_depth()) + 1),
_left(std::move(left)),
_right(std::move(right))
{
@@ -100,7 +100,7 @@ Or::trace(const Context& context, std::ostream& out) const
}
Not::Not(std::unique_ptr<Node> child, const char* name)
- : Branch(name ? name : "not"),
+ : Branch(name ? name : "not", child->max_depth() + 1),
_child(std::move(child))
{
assert(_child.get());
diff --git a/document/src/vespa/document/select/branch.h b/document/src/vespa/document/select/branch.h
index 8637b41de89..77ed74030b5 100644
--- a/document/src/vespa/document/select/branch.h
+++ b/document/src/vespa/document/select/branch.h
@@ -19,7 +19,8 @@ namespace document::select {
class Branch : public Node
{
public:
- Branch(vespalib::stringref name) : Node(name) {}
+ explicit Branch(vespalib::stringref name) : Node(name) {}
+ Branch(vespalib::stringref name, uint32_t max_depth) : Node(name, max_depth) {}
bool isLeafNode() const override { return false; }
};
@@ -30,7 +31,7 @@ class And : public Branch
std::unique_ptr<Node> _right;
public:
And(std::unique_ptr<Node> left, std::unique_ptr<Node> right,
- const char* name = 0);
+ const char* name = nullptr);
ResultList contains(const Context& context) const override {
return (_left->contains(context) && _right->contains(context));
@@ -53,7 +54,7 @@ class Or : public Branch
std::unique_ptr<Node> _right;
public:
Or(std::unique_ptr<Node> left, std::unique_ptr<Node> right,
- const char* name = 0);
+ const char* name = nullptr);
ResultList contains(const Context& context) const override {
return (_left->contains(context) || _right->contains(context));
@@ -74,7 +75,7 @@ class Not : public Branch
{
std::unique_ptr<Node> _child;
public:
- Not(std::unique_ptr<Node> child, const char* name = 0);
+ Not(std::unique_ptr<Node> child, const char* name = nullptr);
ResultList contains(const Context& context) const override { return !_child->contains(context); }
ResultList trace(const Context&, std::ostream& trace) const override;
diff --git a/document/src/vespa/document/select/compare.cpp b/document/src/vespa/document/select/compare.cpp
index 7db40929a64..caef1bdd250 100644
--- a/document/src/vespa/document/select/compare.cpp
+++ b/document/src/vespa/document/select/compare.cpp
@@ -15,7 +15,7 @@ Compare::Compare(std::unique_ptr<ValueNode> left,
const Operator& op,
std::unique_ptr<ValueNode> right,
const BucketIdFactory& bucketIdFactory)
- : Node("Compare"),
+ : Node("Compare", std::max(left->max_depth(), right->max_depth()) + 1),
_left(std::move(left)),
_right(std::move(right)),
_operator(op),
diff --git a/document/src/vespa/document/select/node.h b/document/src/vespa/document/select/node.h
index 48a64ae63f5..9a3b687d81c 100644
--- a/document/src/vespa/document/select/node.h
+++ b/document/src/vespa/document/select/node.h
@@ -12,6 +12,7 @@
#include "resultlist.h"
#include "context.h"
+#include "parser_limits.h"
namespace document::select {
@@ -21,19 +22,33 @@ class Node : public Printable
{
protected:
vespalib::string _name;
+ uint32_t _max_depth;
bool _parentheses; // Set to true if parentheses was used around this part
// Set such that we can recreate original query in print.
public:
typedef std::unique_ptr<Node> UP;
typedef std::shared_ptr<Node> SP;
- Node(vespalib::stringref name) : _name(name), _parentheses(false) {}
- ~Node() override {}
+ Node(vespalib::stringref name, uint32_t max_depth)
+ : _name(name), _max_depth(max_depth), _parentheses(false)
+ {
+ throw_parse_error_if_max_depth_exceeded();
+ }
- void setParentheses() { _parentheses = true; }
+ explicit Node(vespalib::stringref name)
+ : _name(name), _max_depth(1), _parentheses(false)
+ {}
+ ~Node() override = default;
- void clearParentheses() { _parentheses = false; }
+ // Depth is explicitly tracked to limit recursion to a sane maximum when building and
+ // processing ASTs, as the Bison framework does not have anything useful for us there.
+ // The AST is built from the leaves up towards the root, so we can cheaply track depth
+ // of subtrees in O(1) time per node by computing a node's own depth based on immediate
+ // children at node construction time.
+ [[nodiscard]] uint32_t max_depth() const noexcept { return _max_depth; }
+ void setParentheses() { _parentheses = true; }
+ void clearParentheses() { _parentheses = false; }
bool hadParentheses() const { return _parentheses; }
virtual ResultList contains(const Context&) const = 0;
@@ -43,6 +58,12 @@ public:
virtual Node::UP clone() const = 0;
protected:
+ void throw_parse_error_if_max_depth_exceeded() const {
+ if (_max_depth > ParserLimits::MaxRecursionDepth) {
+ throw_max_depth_exceeded_exception();
+ }
+ }
+
Node::UP wrapParens(Node* node) const {
Node::UP ret(node);
if (_parentheses) {
diff --git a/document/src/vespa/document/select/parser.cpp b/document/src/vespa/document/select/parser.cpp
index 9f015409011..df8659d22f4 100644
--- a/document/src/vespa/document/select/parser.cpp
+++ b/document/src/vespa/document/select/parser.cpp
@@ -1,5 +1,6 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "parser.h"
+#include "parser_limits.h"
#include "scanner.h"
#include <vespa/document/base/exceptions.h>
#include <vespa/document/util/stringutil.h>
@@ -8,7 +9,18 @@
namespace document::select {
+namespace {
+
+void verify_expression_not_too_large(const std::string& expr) {
+ if (expr.size() > ParserLimits::MaxSelectionByteSize) {
+ throw ParsingFailedException("expression is too large to be parsed");
+ }
+}
+
+}
+
std::unique_ptr<Node> Parser::parse(const std::string& str) const {
+ verify_expression_not_too_large(str);
try {
std::istringstream ss(str);
DocSelScanner scanner(&ss);
diff --git a/document/src/vespa/document/select/parser_limits.cpp b/document/src/vespa/document/select/parser_limits.cpp
new file mode 100644
index 00000000000..7f5715517c6
--- /dev/null
+++ b/document/src/vespa/document/select/parser_limits.cpp
@@ -0,0 +1,11 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "parser_limits.h"
+#include "parsing_failed_exception.h"
+
+namespace document::select {
+
+void throw_max_depth_exceeded_exception() {
+ throw ParsingFailedException("expression is too deeply nested");
+}
+
+}
diff --git a/document/src/vespa/document/select/parser_limits.h b/document/src/vespa/document/select/parser_limits.h
new file mode 100644
index 00000000000..24c0a165611
--- /dev/null
+++ b/document/src/vespa/document/select/parser_limits.h
@@ -0,0 +1,19 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+
+namespace document::select {
+
+// Any resource constraints set for parsing document selection expressions
+struct ParserLimits {
+ // Max depth allowed for nodes in the AST tree.
+ constexpr static uint32_t MaxRecursionDepth = 1024;
+ // Max size of entire input document selection string, in bytes.
+ constexpr static size_t MaxSelectionByteSize = 1024*1024;
+};
+
+void __attribute__((noinline)) throw_max_depth_exceeded_exception();
+
+}
diff --git a/document/src/vespa/document/select/valuenode.h b/document/src/vespa/document/select/valuenode.h
index 04ed8178b40..8dd535a736a 100644
--- a/document/src/vespa/document/select/valuenode.h
+++ b/document/src/vespa/document/select/valuenode.h
@@ -5,12 +5,13 @@
*
* @brief Node representing a value in the tree
*
- * @author H�kon Humberset
+ * @author Håkon Humberset
*/
#pragma once
#include "value.h"
+#include "parser_limits.h"
namespace document::select {
@@ -22,8 +23,19 @@ class ValueNode : public Printable
public:
using UP = std::unique_ptr<ValueNode>;
- ValueNode() : _parentheses(false) {}
- virtual ~ValueNode() {}
+ explicit ValueNode(uint32_t max_depth)
+ : _max_depth(max_depth), _parentheses(false)
+ {
+ throw_parse_error_if_max_depth_exceeded();
+ }
+ ValueNode() : _max_depth(1), _parentheses(false) {}
+ ~ValueNode() override = default;
+
+ // See comments for same function in node.h for a description on how and why
+ // we track this. Since Node and ValueNode live in completely separate type
+ // hierarchies, this particular bit of code duplication is unfortunate but
+ // incurs the least cognitive overhead.
+ [[nodiscard]] uint32_t max_depth() const noexcept { return _max_depth; }
void setParentheses() { _parentheses = true; }
void clearParentheses() { _parentheses = false; }
@@ -34,9 +46,17 @@ public:
virtual ValueNode::UP clone() const = 0;
virtual std::unique_ptr<Value> traceValue(const Context &context, std::ostream &out) const;
private:
+ uint32_t _max_depth;
bool _parentheses; // Set to true if parentheses was used around this part
// Set such that we can recreate original query in print.
+
protected:
+ void throw_parse_error_if_max_depth_exceeded() const {
+ if (_max_depth > ParserLimits::MaxRecursionDepth) {
+ throw_max_depth_exceeded_exception();
+ }
+ }
+
ValueNode::UP wrapParens(ValueNode* node) const {
ValueNode::UP ret(node);
if (_parentheses) {
diff --git a/document/src/vespa/document/select/valuenodes.cpp b/document/src/vespa/document/select/valuenodes.cpp
index 95cf2f4e7e5..026623cf83c 100644
--- a/document/src/vespa/document/select/valuenodes.cpp
+++ b/document/src/vespa/document/select/valuenodes.cpp
@@ -21,10 +21,6 @@ LOG_SETUP(".document.select.valuenode");
namespace document::select {
namespace {
- static const std::regex FIELD_NAME_REGEX("^([_A-Za-z][_A-Za-z0-9]*).*");
-}
-
-namespace {
bool documentTypeEqualsName(const DocumentType& type, vespalib::stringref name)
{
if (type.getName() == name) return true;
@@ -40,7 +36,7 @@ namespace {
InvalidValueNode::InvalidValueNode(vespalib::stringref name)
: _name(name)
-{ }
+{}
void
@@ -194,15 +190,33 @@ FieldValueNode::FieldValueNode(const vespalib::string& doctype,
FieldValueNode::~FieldValueNode() = default;
-vespalib::string
-FieldValueNode::extractFieldName(const std::string & fieldExpression) {
- std::smatch match;
+namespace {
- if (std::regex_match(fieldExpression, match, FIELD_NAME_REGEX) && match[1].matched) {
- return vespalib::string(match[1].first, match[1].second);
+size_t first_ident_length_or_npos(const vespalib::string& expr) {
+ for (size_t i = 0; i < expr.size(); ++i) {
+ switch (expr[i]) {
+ case '.':
+ case '{':
+ case '[':
+ case ' ':
+ case '\n':
+ case '\t':
+ return i;
+ default:
+ continue;
+ }
}
+ return vespalib::string::npos;
+}
- throw ParsingFailedException("Fatal: could not extract field name from field expression '" + fieldExpression + "'");
+}
+
+// TODO remove this pile of fun in favor of actually parsed AST nodes...!
+vespalib::string
+FieldValueNode::extractFieldName(const vespalib::string & fieldExpression) {
+ // When we get here the actual contents of the field expression shall already
+ // have been structurally and syntactically verified by the parser.
+ return fieldExpression.substr(0, first_ident_length_or_npos(fieldExpression));
}
namespace {
@@ -844,7 +858,8 @@ FunctionValueNode::print(std::ostream& out, bool verbose,
ArithmeticValueNode::ArithmeticValueNode(
std::unique_ptr<ValueNode> left, vespalib::stringref op,
std::unique_ptr<ValueNode> right)
- : _operator(),
+ : ValueNode(std::max(left->max_depth(), right->max_depth()) + 1),
+ _operator(),
_left(std::move(left)),
_right(std::move(right))
{
diff --git a/document/src/vespa/document/select/valuenodes.h b/document/src/vespa/document/select/valuenodes.h
index 8009542c364..a7d5fa15f37 100644
--- a/document/src/vespa/document/select/valuenodes.h
+++ b/document/src/vespa/document/select/valuenodes.h
@@ -160,7 +160,7 @@ public:
FieldValueNode & operator = (const FieldValueNode &) = delete;
FieldValueNode(FieldValueNode &&) = default;
FieldValueNode & operator = (FieldValueNode &&) = default;
- ~FieldValueNode();
+ ~FieldValueNode() override;
const vespalib::string& getDocType() const { return _doctype; }
const vespalib::string& getRealFieldName() const { return _fieldName; }
@@ -175,7 +175,7 @@ public:
return wrapParens(new FieldValueNode(_doctype, _fieldExpression));
}
- static vespalib::string extractFieldName(const std::string & fieldExpression);
+ static vespalib::string extractFieldName(const vespalib::string & fieldExpression);
private:
@@ -192,13 +192,15 @@ class FieldExprNode final : public ValueNode {
public:
explicit FieldExprNode(const vespalib::string& doctype) : _left_expr(), _right_expr(doctype) {}
FieldExprNode(std::unique_ptr<FieldExprNode> left_expr, vespalib::stringref right_expr)
- : _left_expr(std::move(left_expr)), _right_expr(right_expr)
+ : ValueNode(left_expr->max_depth() + 1),
+ _left_expr(std::move(left_expr)),
+ _right_expr(right_expr)
{}
FieldExprNode(const FieldExprNode &) = delete;
FieldExprNode & operator = (const FieldExprNode &) = delete;
FieldExprNode(FieldExprNode &&) = default;
FieldExprNode & operator = (FieldExprNode &&) = default;
- ~FieldExprNode();
+ ~FieldExprNode() override;
std::unique_ptr<FieldValueNode> convert_to_field_value() const;
std::unique_ptr<FunctionValueNode> convert_to_function_call() const;