summaryrefslogtreecommitdiffstats
path: root/document
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2020-02-06 13:28:26 +0000
committerTor Brede Vekterli <vekterli@verizonmedia.com>2020-02-06 13:33:24 +0000
commit7c610e122b374bbde0b847f6eab01494c8253b27 (patch)
tree0cfb9544bd59a5957956eba6ad95a292ad2f826c /document
parent013df9d9d5e84a018b47ff666e3459902aba40fc (diff)
Simplify regexes generated from document selection glob patterns
Attempts to simplify the resulting regex as much as possible to help minimize the number of possible catastrophic backtracking cases that can be triggered by wildcard regexes. The following simplifications are currently performed: * '' -> /^$/ (empty string match) * '*' -> // (any string match) * '*foo*' -> /foo/ (substring match) * '*foo' -> /foo$/ (suffix match) * 'foo*' -> /^foo/ (prefix match) * collapsing runs of consecutive `*` wildcards into a single wildcard. `***` is identical to `**` which is identical to `*` etc, as all these match 0-n characters each. This also works with simplification, i.e. `***foo***` -> /foo/ and `***` -> // This relates to issue #12068
Diffstat (limited to 'document')
-rw-r--r--document/src/tests/documentselectparsertest.cpp90
-rw-r--r--document/src/vespa/document/select/operator.cpp75
-rw-r--r--document/src/vespa/document/select/operator.h22
3 files changed, 129 insertions, 58 deletions
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp
index 3441a336553..ccec0fd1be2 100644
--- a/document/src/tests/documentselectparsertest.cpp
+++ b/document/src/tests/documentselectparsertest.cpp
@@ -16,6 +16,7 @@
#include <vespa/document/select/invalidconstant.h>
#include <vespa/document/select/doctype.h>
#include <vespa/document/select/compare.h>
+#include <vespa/document/select/operator.h>
#include <vespa/document/select/parse_utils.h>
#include <vespa/vespalib/util/exceptions.h>
#include <limits>
@@ -56,16 +57,6 @@ protected:
void SetUp() override;
void createDocs();
- void testOperators0();
- void testOperators1();
- void testOperators2();
- void testOperators3();
- void testOperators4();
- void testOperators5();
- void testOperators6();
- void testOperators7();
- void testOperators8();
- void testOperators9();
void testDocumentUpdates0();
void testDocumentUpdates1();
void testDocumentUpdates2();
@@ -502,21 +493,7 @@ DocumentSelectParserTest::doParse(vespalib::stringref expr,
EXPECT_EQ(select::ResultList(select::Result::result), \
doParse(expr, (doc).getId())) << (std::string("Doc id: ") + expr);
-TEST_F(DocumentSelectParserTest, testOperators)
-{
- testOperators0();
- testOperators1();
- testOperators2();
- testOperators3();
- testOperators4();
- testOperators5();
- testOperators6();
- testOperators7();
- testOperators8();
- testOperators9();
-}
-
-void DocumentSelectParserTest::testOperators0()
+TEST_F(DocumentSelectParserTest, operators_0)
{
createDocs();
@@ -555,8 +532,17 @@ void DocumentSelectParserTest::testOperators0()
PARSE("\"foo\" == 'foo'", *_doc[0], True);
PARSE("\"bar\" = \"a\"", *_doc[0], False);
PARSE("\"bar\" = \"*a*\"", *_doc[0], True);
+ PARSE("\"bar\" = \"*x*\"", *_doc[0], False);
+ PARSE("\"bar\" = \"ba*\"", *_doc[0], True);
+ PARSE("\"bar\" = \"a*\"", *_doc[0], False)
+ PARSE("\"bar\" = \"*ar\"", *_doc[0], True);
+ PARSE("\"bar\" = \"*a\"", *_doc[0], False);
PARSE("\"bar\" = \"\"", *_doc[0], False);
PARSE("\"\" = \"\"", *_doc[0], True);
+ PARSE("\"\" = \"*\"", *_doc[0], True);
+ PARSE("\"\" = \"****\"", *_doc[0], True);
+ PARSE("\"a\" = \"*?*\"", *_doc[0], True);
+ PARSE("\"a\" = \"*??*\"", *_doc[0], False);
PARSE("\"bar\" =~ \"^a$\"", *_doc[0], False);
PARSE("\"bar\" =~ \"a\"", *_doc[0], True);
PARSE("\"bar\" =~ \"\"", *_doc[0], True);
@@ -565,7 +551,16 @@ void DocumentSelectParserTest::testOperators0()
PARSE("30 = 30", *_doc[0], True);
}
-void DocumentSelectParserTest::testOperators1()
+TEST_F(DocumentSelectParserTest, regex_matching_does_not_bind_anchors_to_newlines) {
+ createDocs();
+
+ PARSE("\"a\\nb\\nc\" =~ \"^b$\"", *_doc[0], False);
+ PARSE("\"a\\r\\nb\\r\\nc\" =~ \"^b$\"", *_doc[0], False);
+ // Same applies to implicit regex created from glob expression
+ PARSE("\"a\\nb\\nc\" = \"b\"", *_doc[0], False);
+}
+
+TEST_F(DocumentSelectParserTest, operators_1)
{
createDocs();
@@ -612,7 +607,7 @@ void DocumentSelectParserTest::testOperators1()
PARSE("testdoctype1.headerval = 10", *_doc[4], True);
}
-void DocumentSelectParserTest::testOperators2()
+TEST_F(DocumentSelectParserTest, operators_2)
{
createDocs();
@@ -637,7 +632,7 @@ void DocumentSelectParserTest::testOperators2()
PARSEI("id.group == \"xyzzy\"", *_doc[10], True);
}
-void DocumentSelectParserTest::testOperators3()
+TEST_F(DocumentSelectParserTest, operators3)
{
createDocs();
{
@@ -670,7 +665,7 @@ void DocumentSelectParserTest::testOperators3()
PARSEI("id == \"id:footype:testdoctype1:n=123456789:badger\"", *_doc[5], False);
}
-void DocumentSelectParserTest::testOperators4()
+TEST_F(DocumentSelectParserTest, operators_4)
{
createDocs();
@@ -697,7 +692,7 @@ void DocumentSelectParserTest::testOperators4()
PARSE("false or testdoctype1.content = 1", *_doc[0], Invalid);
}
-void DocumentSelectParserTest::testOperators5()
+TEST_F(DocumentSelectParserTest, operators_5)
{
createDocs();
@@ -726,7 +721,7 @@ void DocumentSelectParserTest::testOperators5()
PARSEI("-6 % 10 = -6", *_doc[0], True);
}
-void DocumentSelectParserTest::testOperators6()
+TEST_F(DocumentSelectParserTest, operators_6)
{
createDocs();
@@ -770,7 +765,7 @@ void DocumentSelectParserTest::testOperators6()
PARSE("testdoctype1.headerlongval<0", *_doc[7], True);
}
-void DocumentSelectParserTest::testOperators7()
+TEST_F(DocumentSelectParserTest, operators_7)
{
createDocs();
@@ -807,7 +802,7 @@ void DocumentSelectParserTest::testOperators7()
PARSE("testdoctype1.structarray[$x].key == 15 AND testdoctype1.structarray[$y].value == \"structval2\"", *_doc[1], True);
}
-void DocumentSelectParserTest::testOperators8()
+TEST_F(DocumentSelectParserTest, operators_8)
{
createDocs();
@@ -840,7 +835,7 @@ void DocumentSelectParserTest::testOperators8()
PARSE("testdoctype1.structarrmap{$x}[$y].key == 15 AND testdoctype1.structarrmap{$y}[$x].value == \"structval2\"", *_doc[1], False);
}
-void DocumentSelectParserTest::testOperators9()
+TEST_F(DocumentSelectParserTest, operators_9)
{
createDocs();
@@ -1051,6 +1046,11 @@ void DocumentSelectParserTest::testDocumentUpdates0()
PARSEI("\"foo\" == \"foo\"", *_update[0], True);
PARSEI("\"bar\" = \"a\"", *_update[0], False);
PARSEI("\"bar\" = \"*a*\"", *_update[0], True);
+ PARSEI("\"bar\" = \"**\"", *_update[0], True);
+ PARSEI("\"bar\" = \"***\"", *_update[0], True);
+ PARSEI("\"bar\" = \"****\"", *_update[0], True);
+ PARSEI("\"bar\" = \"???\"", *_update[0], True);
+ PARSEI("\"bar\" = \"????\"", *_update[0], False);
PARSEI("\"bar\" = \"\"", *_update[0], False);
PARSEI("\"\" = \"\"", *_update[0], True);
PARSEI("\"bar\" =~ \"^a$\"", *_update[0], False);
@@ -1550,4 +1550,26 @@ TEST_F(DocumentSelectParserTest, imported_field_references_only_support_for_simp
PARSE("with_imported.my_imported_field{foo}", doc, Invalid);
}
+TEST_F(DocumentSelectParserTest, prefix_and_suffix_wildcard_globs_are_rewritten_to_optimized_form) {
+ using select::GlobOperator;
+ EXPECT_EQ(GlobOperator::convertToRegex("*foo"), "foo$");
+ EXPECT_EQ(GlobOperator::convertToRegex("foo*"), "^foo");
+ EXPECT_EQ(GlobOperator::convertToRegex("*foo*"), "foo");
+ EXPECT_EQ(GlobOperator::convertToRegex("*"), ""); // Matches any string.
+ EXPECT_EQ(GlobOperator::convertToRegex("**"), ""); // Still matches any string.
+}
+
+TEST_F(DocumentSelectParserTest, redundant_glob_wildcards_are_collapsed_into_minimal_form) {
+ using select::GlobOperator;
+ EXPECT_EQ(GlobOperator::convertToRegex("***"), ""); // Even still matches any string.
+ EXPECT_EQ(GlobOperator::convertToRegex("**foo**"), "foo");
+ EXPECT_EQ(GlobOperator::convertToRegex("foo***"), "^foo");
+ EXPECT_EQ(GlobOperator::convertToRegex("***foo"), "foo$");
+ EXPECT_EQ(GlobOperator::convertToRegex("foo**bar"), "^foo.*bar$");
+ EXPECT_EQ(GlobOperator::convertToRegex("**foo*bar**"), "foo.*bar");
+ EXPECT_EQ(GlobOperator::convertToRegex("**foo***bar**"), "foo.*bar");
+ EXPECT_EQ(GlobOperator::convertToRegex("*?*"), ".");
+ EXPECT_EQ(GlobOperator::convertToRegex("*?*?*?*"), "..*..*."); // Don't try this at home, kids!
+}
+
} // document
diff --git a/document/src/vespa/document/select/operator.cpp b/document/src/vespa/document/select/operator.cpp
index eaa795549bf..ef2ee26bdbd 100644
--- a/document/src/vespa/document/select/operator.cpp
+++ b/document/src/vespa/document/select/operator.cpp
@@ -191,35 +191,64 @@ GlobOperator::traceImpl(const Value& a, const Value& b, std::ostream& ost) const
return match(left->getValue(), regex);
}
+namespace {
+
+// Returns the number of consecutive wildcard ('*') characters found from
+// _and including_ the character at `i`, i.e. the wildcard run length.
+size_t wildcard_run_length(size_t i, vespalib::stringref str) {
+ size_t n = 0;
+ for (; (i < str.size()) && (str[i] == '*'); ++n, ++i) {}
+ return n;
+}
+
+}
+
vespalib::string
-GlobOperator::convertToRegex(vespalib::stringref globpattern) const
+GlobOperator::convertToRegex(vespalib::stringref globpattern)
{
+ if (globpattern.empty()) {
+ return "^$"; // Empty glob can only match the empty string.
+ }
vespalib::asciistream ost;
- ost << '^';
- for(uint32_t i=0, n=globpattern.size(); i<n; ++i) {
+ size_t i = 0;
+ if (globpattern[0] != '*') {
+ ost << '^';
+ } else {
+ i += wildcard_run_length(0, globpattern); // Skip entire prefix wildcard run.
+ }
+ const size_t n = globpattern.size();
+ for (; i < n; ++i) {
switch(globpattern[i]) {
- case '*': ost << ".*";
- break;
- case '?': ost << ".";
- break;
- case '^':
- case '$':
- case '|':
- case '{':
- case '}':
- case '(':
- case ')':
- case '[':
- case ']':
- case '\\':
- case '+':
- case '.': ost << '\\' << globpattern[i];
- break;
- // Are there other regex special chars we need to escape?
- default: ost << globpattern[i];
+ case '*':
+ i += wildcard_run_length(i, globpattern) - 1; // -1 since we always inc by 1 anyway.
+ if (i != (n - 1)) { // Don't emit trailing wildcard.
+ ost << ".*";
+ }
+ break;
+ case '?':
+ ost << '.';
+ break;
+ case '^':
+ case '$':
+ case '|':
+ case '{':
+ case '}':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '\\':
+ case '+':
+ case '.':
+ ost << '\\' << globpattern[i];
+ break;
+ // Are there other regex special chars we need to escape?
+ default: ost << globpattern[i];
}
}
- ost << '$';
+ if (globpattern[n - 1] != '*') {
+ ost << '$';
+ }
return ost.str();
}
diff --git a/document/src/vespa/document/select/operator.h b/document/src/vespa/document/select/operator.h
index 7383b6cf545..ad73d284c63 100644
--- a/document/src/vespa/document/select/operator.h
+++ b/document/src/vespa/document/select/operator.h
@@ -88,7 +88,27 @@ public:
// Delegates to Value::globCompare
ResultList compare(const Value& a, const Value& b) const override;
ResultList trace(const Value&, const Value&, std::ostream& trace) const override;
- vespalib::string convertToRegex(vespalib::stringref globpattern) const;
+ /**
+ * Converts a standard glob expression into a regular expression string,
+ * supporting the following glob semantics:
+ * '*' matches 0-n arbitrary characters
+ * '?' matches exactly 1 arbitrary character
+ * This code simplifies the resulting regex as much as possible to help
+ * minimize the number of possible catastrophic backtracking cases that
+ * can be triggered by wildcard regexes.
+ *
+ * The following simplifications are currently performed:
+ * - '' -> /^$/ (empty string match)
+ * '*' -> // (any string match)
+ * - '*foo*' -> /foo/ (substring match)
+ * - '*foo' -> /foo$/ (suffix match)
+ * - 'foo*' -> /^foo/ (prefix match)
+ * - collapsing runs of consecutive '*' wildcards into a single
+ * wildcard. *** is identical to ** which is identical to * etc,
+ * as all these match 0-n characters each. This also works with
+ * simplification, i.e. '***foo***' -> /foo/ and '***' -> //
+ */
+ static vespalib::string convertToRegex(vespalib::stringref globpattern);
static bool containsVariables(vespalib::stringref expression);
static const GlobOperator GLOB;