summaryrefslogtreecommitdiffstats
path: root/document
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2020-04-01 11:21:16 +0200
committerGitHub <noreply@github.com>2020-04-01 11:21:16 +0200
commitbe3c2e7f0f1f3edc07973dccb05b13bf30c75de1 (patch)
tree710db4b6b95bf5bda839070b952d380591126f7e /document
parent444693a52467663ba4195e4f90991a388defe75c (diff)
parent4153969ca13541e7434963d02add12d574c9a461 (diff)
Merge pull request #12784 from vespa-engine/vekterli/add-explicit-limits-to-selection-parser
Add explicit limits to backend document selection parsing
Diffstat (limited to 'document')
-rw-r--r--document/src/tests/documentselectparsertest.cpp79
-rw-r--r--document/src/vespa/document/select/CMakeLists.txt1
-rw-r--r--document/src/vespa/document/select/branch.cpp6
-rw-r--r--document/src/vespa/document/select/branch.h9
-rw-r--r--document/src/vespa/document/select/compare.cpp2
-rw-r--r--document/src/vespa/document/select/grammar/lexer.ll7
-rw-r--r--document/src/vespa/document/select/node.h29
-rw-r--r--document/src/vespa/document/select/parser.cpp14
-rw-r--r--document/src/vespa/document/select/parser_limits.cpp13
-rw-r--r--document/src/vespa/document/select/parser_limits.h19
-rw-r--r--document/src/vespa/document/select/valuenode.h26
-rw-r--r--document/src/vespa/document/select/valuenodes.cpp39
-rw-r--r--document/src/vespa/document/select/valuenodes.h10
13 files changed, 214 insertions, 40 deletions
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp
index 110153954af..59f4ce69b33 100644
--- a/document/src/tests/documentselectparsertest.cpp
+++ b/document/src/tests/documentselectparsertest.cpp
@@ -18,6 +18,7 @@
#include <vespa/document/select/compare.h>
#include <vespa/document/select/operator.h>
#include <vespa/document/select/parse_utils.h>
+#include <vespa/document/select/parser_limits.h>
#include <vespa/vespalib/util/exceptions.h>
#include <limits>
#include <gtest/gtest.h>
@@ -1247,17 +1248,17 @@ TEST_F(DocumentSelectParserTest, testThatSimpleFieldValuesHaveCorrectFieldName)
TEST_F(DocumentSelectParserTest, testThatComplexFieldValuesHaveCorrectFieldNames)
{
- EXPECT_EQ(
- vespalib::string("headerval"),
- parseFieldValue("testdoctype1.headerval{test}")->getRealFieldName());
+ EXPECT_EQ(vespalib::string("headerval"),
+ parseFieldValue("testdoctype1.headerval{test}")->getRealFieldName());
- EXPECT_EQ(
- vespalib::string("headerval"),
- parseFieldValue("testdoctype1.headerval[42]")->getRealFieldName());
+ EXPECT_EQ(vespalib::string("headerval"),
+ parseFieldValue("testdoctype1.headerval[42]")->getRealFieldName());
- EXPECT_EQ(
- vespalib::string("headerval"),
- parseFieldValue("testdoctype1.headerval.meow.meow{test}")->getRealFieldName());
+ EXPECT_EQ(vespalib::string("headerval"),
+ parseFieldValue("testdoctype1.headerval.meow.meow{test}")->getRealFieldName());
+
+ EXPECT_EQ(vespalib::string("headerval"),
+ parseFieldValue("testdoctype1.headerval .meow.meow{test}")->getRealFieldName());
}
namespace {
@@ -1603,4 +1604,64 @@ TEST_F(DocumentSelectParserTest, redundant_glob_wildcards_are_collapsed_into_min
EXPECT_EQ(GlobOperator::convertToRegex("*?*?*?*"), "..*..*."); // Don't try this at home, kids!
}
+TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_field_exprs) {
+ createDocs();
+ std::string expr = "testdoctype1";
+ for (size_t i = 0; i < 50000; ++i) {
+ expr += ".foo";
+ }
+ expr += ".hash() != 0";
+ verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested (max 1024 levels)");
+}
+
+TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_arithmetic_exprs) {
+ createDocs();
+ std::string expr = "1";
+ for (size_t i = 0; i < 50000; ++i) {
+ expr += "+1";
+ }
+ expr += " != 0";
+ verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested (max 1024 levels)");
+}
+
+TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_binary_logical_exprs) {
+ createDocs();
+ // Also throw in some comparisons to ensure they carry over the max depth.
+ std::string expr = "1 == 2";
+ std::string cmp_subexpr = "3 != 4";
+ for (size_t i = 0; i < 10000; ++i) {
+ expr += (i % 2 == 0 ? " and " : " or ") + cmp_subexpr;
+ }
+ verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested (max 1024 levels)");
+}
+
+TEST_F(DocumentSelectParserTest, recursion_depth_is_bounded_for_unary_logical_exprs) {
+ createDocs();
+ std::string expr;
+ for (size_t i = 0; i < 10000; ++i) {
+ expr += "not ";
+ }
+ expr += "true";
+ verifyFailedParse(expr, "ParsingFailedException: expression is too deeply nested (max 1024 levels)");
+}
+
+TEST_F(DocumentSelectParserTest, selection_has_upper_limit_on_input_size) {
+ createDocs();
+ std::string expr = ("testdoctype1.a_biii"
+ + std::string(select::ParserLimits::MaxSelectionByteSize, 'i')
+ + "iiig_identifier");
+ verifyFailedParse(expr, "ParsingFailedException: expression is too large to be "
+ "parsed (max 1048576 bytes, got 1048610)");
+}
+
+TEST_F(DocumentSelectParserTest, lexing_does_not_have_superlinear_time_complexity) {
+ createDocs();
+ std::string expr = ("testdoctype1.hstringval == 'a_biii"
+ + std::string(select::ParserLimits::MaxSelectionByteSize - 100, 'i')
+ + "iiig string'");
+ // If the lexer is not compiled with the appropriate options, this will take a long time.
+ // A really, really long time.
+ PARSE(expr, *_doc[0], False);
+}
+
} // document
diff --git a/document/src/vespa/document/select/CMakeLists.txt b/document/src/vespa/document/select/CMakeLists.txt
index 81e5d86675c..f210e8abdd7 100644
--- a/document/src/vespa/document/select/CMakeLists.txt
+++ b/document/src/vespa/document/select/CMakeLists.txt
@@ -36,6 +36,7 @@ vespa_add_library(document_select OBJECT
parser.cpp
parse_utils.cpp
parsing_failed_exception.cpp
+ parser_limits.cpp
${BISON_DocSelParser_OUTPUTS}
${FLEX_DocSelLexer_OUTPUTS}
AFTER
diff --git a/document/src/vespa/document/select/branch.cpp b/document/src/vespa/document/select/branch.cpp
index b3d5f97ccab..9104e2c5544 100644
--- a/document/src/vespa/document/select/branch.cpp
+++ b/document/src/vespa/document/select/branch.cpp
@@ -8,7 +8,7 @@
namespace document::select {
And::And(std::unique_ptr<Node> left, std::unique_ptr<Node> right, const char* name)
- : Branch(name ? name : "and"),
+ : Branch(name ? name : "and", std::max(left->max_depth(), right->max_depth()) + 1),
_left(std::move(left)),
_right(std::move(right))
{
@@ -54,7 +54,7 @@ And::trace(const Context& context, std::ostream& out) const
}
Or::Or(std::unique_ptr<Node> left, std::unique_ptr<Node> right, const char* name)
- : Branch(name ? name : "or"),
+ : Branch(name ? name : "or", std::max(left->max_depth(), right->max_depth()) + 1),
_left(std::move(left)),
_right(std::move(right))
{
@@ -100,7 +100,7 @@ Or::trace(const Context& context, std::ostream& out) const
}
Not::Not(std::unique_ptr<Node> child, const char* name)
- : Branch(name ? name : "not"),
+ : Branch(name ? name : "not", child->max_depth() + 1),
_child(std::move(child))
{
assert(_child.get());
diff --git a/document/src/vespa/document/select/branch.h b/document/src/vespa/document/select/branch.h
index 8637b41de89..77ed74030b5 100644
--- a/document/src/vespa/document/select/branch.h
+++ b/document/src/vespa/document/select/branch.h
@@ -19,7 +19,8 @@ namespace document::select {
class Branch : public Node
{
public:
- Branch(vespalib::stringref name) : Node(name) {}
+ explicit Branch(vespalib::stringref name) : Node(name) {}
+ Branch(vespalib::stringref name, uint32_t max_depth) : Node(name, max_depth) {}
bool isLeafNode() const override { return false; }
};
@@ -30,7 +31,7 @@ class And : public Branch
std::unique_ptr<Node> _right;
public:
And(std::unique_ptr<Node> left, std::unique_ptr<Node> right,
- const char* name = 0);
+ const char* name = nullptr);
ResultList contains(const Context& context) const override {
return (_left->contains(context) && _right->contains(context));
@@ -53,7 +54,7 @@ class Or : public Branch
std::unique_ptr<Node> _right;
public:
Or(std::unique_ptr<Node> left, std::unique_ptr<Node> right,
- const char* name = 0);
+ const char* name = nullptr);
ResultList contains(const Context& context) const override {
return (_left->contains(context) || _right->contains(context));
@@ -74,7 +75,7 @@ class Not : public Branch
{
std::unique_ptr<Node> _child;
public:
- Not(std::unique_ptr<Node> child, const char* name = 0);
+ Not(std::unique_ptr<Node> child, const char* name = nullptr);
ResultList contains(const Context& context) const override { return !_child->contains(context); }
ResultList trace(const Context&, std::ostream& trace) const override;
diff --git a/document/src/vespa/document/select/compare.cpp b/document/src/vespa/document/select/compare.cpp
index 7db40929a64..caef1bdd250 100644
--- a/document/src/vespa/document/select/compare.cpp
+++ b/document/src/vespa/document/select/compare.cpp
@@ -15,7 +15,7 @@ Compare::Compare(std::unique_ptr<ValueNode> left,
const Operator& op,
std::unique_ptr<ValueNode> right,
const BucketIdFactory& bucketIdFactory)
- : Node("Compare"),
+ : Node("Compare", std::max(left->max_depth(), right->max_depth()) + 1),
_left(std::move(left)),
_right(std::move(right)),
_operator(op),
diff --git a/document/src/vespa/document/select/grammar/lexer.ll b/document/src/vespa/document/select/grammar/lexer.ll
index bd011c8ebf6..1222aac02a2 100644
--- a/document/src/vespa/document/select/grammar/lexer.ll
+++ b/document/src/vespa/document/select/grammar/lexer.ll
@@ -7,6 +7,13 @@
%option noyywrap nounput
%option yyclass="document::select::DocSelScanner"
+ /* Flex lexer must be compiled with batch mode (as opposed to interactive mode)
+ * or parsing of large tokens appears to trigger superlinear time complexity.
+ * Also use full, non-compressed lookup tables for maximum performance.
+ */
+%option batch
+%option full
+
/* Used to track source locations, see https://github.com/bingmann/flex-bison-cpp-example/blob/master/src/scanner.ll */
%{
#define YY_USER_ACTION yyloc->columns(yyleng);
diff --git a/document/src/vespa/document/select/node.h b/document/src/vespa/document/select/node.h
index 48a64ae63f5..9a3b687d81c 100644
--- a/document/src/vespa/document/select/node.h
+++ b/document/src/vespa/document/select/node.h
@@ -12,6 +12,7 @@
#include "resultlist.h"
#include "context.h"
+#include "parser_limits.h"
namespace document::select {
@@ -21,19 +22,33 @@ class Node : public Printable
{
protected:
vespalib::string _name;
+ uint32_t _max_depth;
bool _parentheses; // Set to true if parentheses was used around this part
// Set such that we can recreate original query in print.
public:
typedef std::unique_ptr<Node> UP;
typedef std::shared_ptr<Node> SP;
- Node(vespalib::stringref name) : _name(name), _parentheses(false) {}
- ~Node() override {}
+ Node(vespalib::stringref name, uint32_t max_depth)
+ : _name(name), _max_depth(max_depth), _parentheses(false)
+ {
+ throw_parse_error_if_max_depth_exceeded();
+ }
- void setParentheses() { _parentheses = true; }
+ explicit Node(vespalib::stringref name)
+ : _name(name), _max_depth(1), _parentheses(false)
+ {}
+ ~Node() override = default;
- void clearParentheses() { _parentheses = false; }
+ // Depth is explicitly tracked to limit recursion to a sane maximum when building and
+ // processing ASTs, as the Bison framework does not have anything useful for us there.
+ // The AST is built from the leaves up towards the root, so we can cheaply track depth
+ // of subtrees in O(1) time per node by computing a node's own depth based on immediate
+ // children at node construction time.
+ [[nodiscard]] uint32_t max_depth() const noexcept { return _max_depth; }
+ void setParentheses() { _parentheses = true; }
+ void clearParentheses() { _parentheses = false; }
bool hadParentheses() const { return _parentheses; }
virtual ResultList contains(const Context&) const = 0;
@@ -43,6 +58,12 @@ public:
virtual Node::UP clone() const = 0;
protected:
+ void throw_parse_error_if_max_depth_exceeded() const {
+ if (_max_depth > ParserLimits::MaxRecursionDepth) {
+ throw_max_depth_exceeded_exception();
+ }
+ }
+
Node::UP wrapParens(Node* node) const {
Node::UP ret(node);
if (_parentheses) {
diff --git a/document/src/vespa/document/select/parser.cpp b/document/src/vespa/document/select/parser.cpp
index 9f015409011..fadb46e5aa3 100644
--- a/document/src/vespa/document/select/parser.cpp
+++ b/document/src/vespa/document/select/parser.cpp
@@ -1,5 +1,6 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "parser.h"
+#include "parser_limits.h"
#include "scanner.h"
#include <vespa/document/base/exceptions.h>
#include <vespa/document/util/stringutil.h>
@@ -8,7 +9,20 @@
namespace document::select {
+namespace {
+
+void verify_expression_not_too_large(const std::string& expr) {
+ if (expr.size() > ParserLimits::MaxSelectionByteSize) {
+ throw ParsingFailedException(vespalib::make_string(
+ "expression is too large to be parsed (max %zu bytes, got %zu)",
+ ParserLimits::MaxSelectionByteSize, expr.size()));
+ }
+}
+
+}
+
std::unique_ptr<Node> Parser::parse(const std::string& str) const {
+ verify_expression_not_too_large(str);
try {
std::istringstream ss(str);
DocSelScanner scanner(&ss);
diff --git a/document/src/vespa/document/select/parser_limits.cpp b/document/src/vespa/document/select/parser_limits.cpp
new file mode 100644
index 00000000000..13e494b376f
--- /dev/null
+++ b/document/src/vespa/document/select/parser_limits.cpp
@@ -0,0 +1,13 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "parser_limits.h"
+#include "parsing_failed_exception.h"
+#include <vespa/vespalib/util/stringfmt.h>
+
+namespace document::select {
+
+void throw_max_depth_exceeded_exception() {
+ throw ParsingFailedException(vespalib::make_string(
+ "expression is too deeply nested (max %u levels)", ParserLimits::MaxRecursionDepth));
+}
+
+}
diff --git a/document/src/vespa/document/select/parser_limits.h b/document/src/vespa/document/select/parser_limits.h
new file mode 100644
index 00000000000..24c0a165611
--- /dev/null
+++ b/document/src/vespa/document/select/parser_limits.h
@@ -0,0 +1,19 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <cstdint>
+#include <cstddef>
+
+namespace document::select {
+
+// Any resource constraints set for parsing document selection expressions
+struct ParserLimits {
+ // Max depth allowed for nodes in the AST tree.
+ constexpr static uint32_t MaxRecursionDepth = 1024;
+ // Max size of entire input document selection string, in bytes.
+ constexpr static size_t MaxSelectionByteSize = 1024*1024;
+};
+
+void __attribute__((noinline)) throw_max_depth_exceeded_exception();
+
+}
diff --git a/document/src/vespa/document/select/valuenode.h b/document/src/vespa/document/select/valuenode.h
index 04ed8178b40..8dd535a736a 100644
--- a/document/src/vespa/document/select/valuenode.h
+++ b/document/src/vespa/document/select/valuenode.h
@@ -5,12 +5,13 @@
*
* @brief Node representing a value in the tree
*
- * @author H�kon Humberset
+ * @author Håkon Humberset
*/
#pragma once
#include "value.h"
+#include "parser_limits.h"
namespace document::select {
@@ -22,8 +23,19 @@ class ValueNode : public Printable
public:
using UP = std::unique_ptr<ValueNode>;
- ValueNode() : _parentheses(false) {}
- virtual ~ValueNode() {}
+ explicit ValueNode(uint32_t max_depth)
+ : _max_depth(max_depth), _parentheses(false)
+ {
+ throw_parse_error_if_max_depth_exceeded();
+ }
+ ValueNode() : _max_depth(1), _parentheses(false) {}
+ ~ValueNode() override = default;
+
+ // See comments for same function in node.h for a description on how and why
+ // we track this. Since Node and ValueNode live in completely separate type
+ // hierarchies, this particular bit of code duplication is unfortunate but
+ // incurs the least cognitive overhead.
+ [[nodiscard]] uint32_t max_depth() const noexcept { return _max_depth; }
void setParentheses() { _parentheses = true; }
void clearParentheses() { _parentheses = false; }
@@ -34,9 +46,17 @@ public:
virtual ValueNode::UP clone() const = 0;
virtual std::unique_ptr<Value> traceValue(const Context &context, std::ostream &out) const;
private:
+ uint32_t _max_depth;
bool _parentheses; // Set to true if parentheses was used around this part
// Set such that we can recreate original query in print.
+
protected:
+ void throw_parse_error_if_max_depth_exceeded() const {
+ if (_max_depth > ParserLimits::MaxRecursionDepth) {
+ throw_max_depth_exceeded_exception();
+ }
+ }
+
ValueNode::UP wrapParens(ValueNode* node) const {
ValueNode::UP ret(node);
if (_parentheses) {
diff --git a/document/src/vespa/document/select/valuenodes.cpp b/document/src/vespa/document/select/valuenodes.cpp
index 95cf2f4e7e5..026623cf83c 100644
--- a/document/src/vespa/document/select/valuenodes.cpp
+++ b/document/src/vespa/document/select/valuenodes.cpp
@@ -21,10 +21,6 @@ LOG_SETUP(".document.select.valuenode");
namespace document::select {
namespace {
- static const std::regex FIELD_NAME_REGEX("^([_A-Za-z][_A-Za-z0-9]*).*");
-}
-
-namespace {
bool documentTypeEqualsName(const DocumentType& type, vespalib::stringref name)
{
if (type.getName() == name) return true;
@@ -40,7 +36,7 @@ namespace {
InvalidValueNode::InvalidValueNode(vespalib::stringref name)
: _name(name)
-{ }
+{}
void
@@ -194,15 +190,33 @@ FieldValueNode::FieldValueNode(const vespalib::string& doctype,
FieldValueNode::~FieldValueNode() = default;
-vespalib::string
-FieldValueNode::extractFieldName(const std::string & fieldExpression) {
- std::smatch match;
+namespace {
- if (std::regex_match(fieldExpression, match, FIELD_NAME_REGEX) && match[1].matched) {
- return vespalib::string(match[1].first, match[1].second);
+size_t first_ident_length_or_npos(const vespalib::string& expr) {
+ for (size_t i = 0; i < expr.size(); ++i) {
+ switch (expr[i]) {
+ case '.':
+ case '{':
+ case '[':
+ case ' ':
+ case '\n':
+ case '\t':
+ return i;
+ default:
+ continue;
+ }
}
+ return vespalib::string::npos;
+}
- throw ParsingFailedException("Fatal: could not extract field name from field expression '" + fieldExpression + "'");
+}
+
+// TODO remove this pile of fun in favor of actually parsed AST nodes...!
+vespalib::string
+FieldValueNode::extractFieldName(const vespalib::string & fieldExpression) {
+ // When we get here the actual contents of the field expression shall already
+ // have been structurally and syntactically verified by the parser.
+ return fieldExpression.substr(0, first_ident_length_or_npos(fieldExpression));
}
namespace {
@@ -844,7 +858,8 @@ FunctionValueNode::print(std::ostream& out, bool verbose,
ArithmeticValueNode::ArithmeticValueNode(
std::unique_ptr<ValueNode> left, vespalib::stringref op,
std::unique_ptr<ValueNode> right)
- : _operator(),
+ : ValueNode(std::max(left->max_depth(), right->max_depth()) + 1),
+ _operator(),
_left(std::move(left)),
_right(std::move(right))
{
diff --git a/document/src/vespa/document/select/valuenodes.h b/document/src/vespa/document/select/valuenodes.h
index 8009542c364..a7d5fa15f37 100644
--- a/document/src/vespa/document/select/valuenodes.h
+++ b/document/src/vespa/document/select/valuenodes.h
@@ -160,7 +160,7 @@ public:
FieldValueNode & operator = (const FieldValueNode &) = delete;
FieldValueNode(FieldValueNode &&) = default;
FieldValueNode & operator = (FieldValueNode &&) = default;
- ~FieldValueNode();
+ ~FieldValueNode() override;
const vespalib::string& getDocType() const { return _doctype; }
const vespalib::string& getRealFieldName() const { return _fieldName; }
@@ -175,7 +175,7 @@ public:
return wrapParens(new FieldValueNode(_doctype, _fieldExpression));
}
- static vespalib::string extractFieldName(const std::string & fieldExpression);
+ static vespalib::string extractFieldName(const vespalib::string & fieldExpression);
private:
@@ -192,13 +192,15 @@ class FieldExprNode final : public ValueNode {
public:
explicit FieldExprNode(const vespalib::string& doctype) : _left_expr(), _right_expr(doctype) {}
FieldExprNode(std::unique_ptr<FieldExprNode> left_expr, vespalib::stringref right_expr)
- : _left_expr(std::move(left_expr)), _right_expr(right_expr)
+ : ValueNode(left_expr->max_depth() + 1),
+ _left_expr(std::move(left_expr)),
+ _right_expr(right_expr)
{}
FieldExprNode(const FieldExprNode &) = delete;
FieldExprNode & operator = (const FieldExprNode &) = delete;
FieldExprNode(FieldExprNode &&) = default;
FieldExprNode & operator = (FieldExprNode &&) = default;
- ~FieldExprNode();
+ ~FieldExprNode() override;
std::unique_ptr<FieldValueNode> convert_to_field_value() const;
std::unique_ptr<FunctionValueNode> convert_to_function_call() const;