diff options
author | Tor Brede Vekterli <vekterli@verizonmedia.com> | 2020-02-07 14:48:36 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-07 14:48:36 +0100 |
commit | 9d2c5579408e08c1040000a88a4d80ccdf863f20 (patch) | |
tree | 2213c301d1690886bd1d67cb198d108de6429ebf /document | |
parent | bde5d17c9af003f2375c9368a97159f9dc660813 (diff) | |
parent | a4b3b17d971f7c528f1820f441c4c7e1031b0ceb (diff) |
Merge pull request #12096 from vespa-engine/vekterli/simplify-regexes-generated-from-glob-patterns
Simplify regexes generated from document selection glob patterns
Diffstat (limited to 'document')
-rw-r--r-- | document/src/tests/documentselectparsertest.cpp | 90 | ||||
-rw-r--r-- | document/src/vespa/document/select/operator.cpp | 75 | ||||
-rw-r--r-- | document/src/vespa/document/select/operator.h | 22 |
3 files changed, 129 insertions, 58 deletions
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp index 3441a336553..c1e5fbecd14 100644 --- a/document/src/tests/documentselectparsertest.cpp +++ b/document/src/tests/documentselectparsertest.cpp @@ -16,6 +16,7 @@ #include <vespa/document/select/invalidconstant.h> #include <vespa/document/select/doctype.h> #include <vespa/document/select/compare.h> +#include <vespa/document/select/operator.h> #include <vespa/document/select/parse_utils.h> #include <vespa/vespalib/util/exceptions.h> #include <limits> @@ -56,16 +57,6 @@ protected: void SetUp() override; void createDocs(); - void testOperators0(); - void testOperators1(); - void testOperators2(); - void testOperators3(); - void testOperators4(); - void testOperators5(); - void testOperators6(); - void testOperators7(); - void testOperators8(); - void testOperators9(); void testDocumentUpdates0(); void testDocumentUpdates1(); void testDocumentUpdates2(); @@ -502,21 +493,7 @@ DocumentSelectParserTest::doParse(vespalib::stringref expr, EXPECT_EQ(select::ResultList(select::Result::result), \ doParse(expr, (doc).getId())) << (std::string("Doc id: ") + expr); -TEST_F(DocumentSelectParserTest, testOperators) -{ - testOperators0(); - testOperators1(); - testOperators2(); - testOperators3(); - testOperators4(); - testOperators5(); - testOperators6(); - testOperators7(); - testOperators8(); - testOperators9(); -} - -void DocumentSelectParserTest::testOperators0() +TEST_F(DocumentSelectParserTest, operators_0) { createDocs(); @@ -555,8 +532,17 @@ void DocumentSelectParserTest::testOperators0() PARSE("\"foo\" == 'foo'", *_doc[0], True); PARSE("\"bar\" = \"a\"", *_doc[0], False); PARSE("\"bar\" = \"*a*\"", *_doc[0], True); + PARSE("\"bar\" = \"*x*\"", *_doc[0], False); + PARSE("\"bar\" = \"ba*\"", *_doc[0], True); + PARSE("\"bar\" = \"a*\"", *_doc[0], False) + PARSE("\"bar\" = \"*ar\"", *_doc[0], True); + PARSE("\"bar\" = \"*a\"", *_doc[0], False); PARSE("\"bar\" = \"\"", *_doc[0], False); PARSE("\"\" = \"\"", *_doc[0], True); + PARSE("\"\" = \"*\"", *_doc[0], True); + PARSE("\"\" = \"****\"", *_doc[0], True); + PARSE("\"a\" = \"*?*\"", *_doc[0], True); + PARSE("\"a\" = \"*??*\"", *_doc[0], False); PARSE("\"bar\" =~ \"^a$\"", *_doc[0], False); PARSE("\"bar\" =~ \"a\"", *_doc[0], True); PARSE("\"bar\" =~ \"\"", *_doc[0], True); @@ -565,7 +551,16 @@ void DocumentSelectParserTest::testOperators0() PARSE("30 = 30", *_doc[0], True); } -void DocumentSelectParserTest::testOperators1() +TEST_F(DocumentSelectParserTest, regex_matching_does_not_bind_anchors_to_newlines) { + createDocs(); + + PARSE("\"a\\nb\\nc\" =~ \"^b$\"", *_doc[0], False); + PARSE("\"a\\r\\nb\\r\\nc\" =~ \"^b$\"", *_doc[0], False); + // Same applies to implicit regex created from glob expression + PARSE("\"a\\nb\\nc\" = \"b\"", *_doc[0], False); +} + +TEST_F(DocumentSelectParserTest, operators_1) { createDocs(); @@ -612,7 +607,7 @@ void DocumentSelectParserTest::testOperators1() PARSE("testdoctype1.headerval = 10", *_doc[4], True); } -void DocumentSelectParserTest::testOperators2() +TEST_F(DocumentSelectParserTest, operators_2) { createDocs(); @@ -637,7 +632,7 @@ void DocumentSelectParserTest::testOperators2() PARSEI("id.group == \"xyzzy\"", *_doc[10], True); } -void DocumentSelectParserTest::testOperators3() +TEST_F(DocumentSelectParserTest, operators_3) { createDocs(); { @@ -670,7 +665,7 @@ void DocumentSelectParserTest::testOperators3() PARSEI("id == \"id:footype:testdoctype1:n=123456789:badger\"", *_doc[5], False); } -void DocumentSelectParserTest::testOperators4() +TEST_F(DocumentSelectParserTest, operators_4) { createDocs(); @@ -697,7 +692,7 @@ void DocumentSelectParserTest::testOperators4() PARSE("false or testdoctype1.content = 1", *_doc[0], Invalid); } -void DocumentSelectParserTest::testOperators5() +TEST_F(DocumentSelectParserTest, operators_5) { createDocs(); @@ -726,7 +721,7 @@ void DocumentSelectParserTest::testOperators5() PARSEI("-6 % 10 = -6", *_doc[0], True); } -void DocumentSelectParserTest::testOperators6() +TEST_F(DocumentSelectParserTest, operators_6) { createDocs(); @@ -770,7 +765,7 @@ void DocumentSelectParserTest::testOperators6() PARSE("testdoctype1.headerlongval<0", *_doc[7], True); } -void DocumentSelectParserTest::testOperators7() +TEST_F(DocumentSelectParserTest, operators_7) { createDocs(); @@ -807,7 +802,7 @@ void DocumentSelectParserTest::testOperators7() PARSE("testdoctype1.structarray[$x].key == 15 AND testdoctype1.structarray[$y].value == \"structval2\"", *_doc[1], True); } -void DocumentSelectParserTest::testOperators8() +TEST_F(DocumentSelectParserTest, operators_8) { createDocs(); @@ -840,7 +835,7 @@ void DocumentSelectParserTest::testOperators8() PARSE("testdoctype1.structarrmap{$x}[$y].key == 15 AND testdoctype1.structarrmap{$y}[$x].value == \"structval2\"", *_doc[1], False); } -void DocumentSelectParserTest::testOperators9() +TEST_F(DocumentSelectParserTest, operators_9) { createDocs(); @@ -1051,6 +1046,11 @@ void DocumentSelectParserTest::testDocumentUpdates0() PARSEI("\"foo\" == \"foo\"", *_update[0], True); PARSEI("\"bar\" = \"a\"", *_update[0], False); PARSEI("\"bar\" = \"*a*\"", *_update[0], True); + PARSEI("\"bar\" = \"**\"", *_update[0], True); + PARSEI("\"bar\" = \"***\"", *_update[0], True); + PARSEI("\"bar\" = \"****\"", *_update[0], True); + PARSEI("\"bar\" = \"???\"", *_update[0], True); + PARSEI("\"bar\" = \"????\"", *_update[0], False); PARSEI("\"bar\" = \"\"", *_update[0], False); PARSEI("\"\" = \"\"", *_update[0], True); PARSEI("\"bar\" =~ \"^a$\"", *_update[0], False); @@ -1550,4 +1550,26 @@ TEST_F(DocumentSelectParserTest, imported_field_references_only_support_for_simp PARSE("with_imported.my_imported_field{foo}", doc, Invalid); } +TEST_F(DocumentSelectParserTest, prefix_and_suffix_wildcard_globs_are_rewritten_to_optimized_form) { + using select::GlobOperator; + EXPECT_EQ(GlobOperator::convertToRegex("*foo"), "foo$"); + EXPECT_EQ(GlobOperator::convertToRegex("foo*"), "^foo"); + EXPECT_EQ(GlobOperator::convertToRegex("*foo*"), "foo"); + EXPECT_EQ(GlobOperator::convertToRegex("*"), ""); // Matches any string. + EXPECT_EQ(GlobOperator::convertToRegex("**"), ""); // Still matches any string. +} + +TEST_F(DocumentSelectParserTest, redundant_glob_wildcards_are_collapsed_into_minimal_form) { + using select::GlobOperator; + EXPECT_EQ(GlobOperator::convertToRegex("***"), ""); // Even still matches any string. + EXPECT_EQ(GlobOperator::convertToRegex("**foo**"), "foo"); + EXPECT_EQ(GlobOperator::convertToRegex("foo***"), "^foo"); + EXPECT_EQ(GlobOperator::convertToRegex("***foo"), "foo$"); + EXPECT_EQ(GlobOperator::convertToRegex("foo**bar"), "^foo.*bar$"); + EXPECT_EQ(GlobOperator::convertToRegex("**foo*bar**"), "foo.*bar"); + EXPECT_EQ(GlobOperator::convertToRegex("**foo***bar**"), "foo.*bar"); + EXPECT_EQ(GlobOperator::convertToRegex("*?*"), "."); + EXPECT_EQ(GlobOperator::convertToRegex("*?*?*?*"), "..*..*."); // Don't try this at home, kids! +} + } // document diff --git a/document/src/vespa/document/select/operator.cpp b/document/src/vespa/document/select/operator.cpp index eaa795549bf..ef2ee26bdbd 100644 --- a/document/src/vespa/document/select/operator.cpp +++ b/document/src/vespa/document/select/operator.cpp @@ -191,35 +191,64 @@ GlobOperator::traceImpl(const Value& a, const Value& b, std::ostream& ost) const return match(left->getValue(), regex); } +namespace { + +// Returns the number of consecutive wildcard ('*') characters found from +// _and including_ the character at `i`, i.e. the wildcard run length. +size_t wildcard_run_length(size_t i, vespalib::stringref str) { + size_t n = 0; + for (; (i < str.size()) && (str[i] == '*'); ++n, ++i) {} + return n; +} + +} + vespalib::string -GlobOperator::convertToRegex(vespalib::stringref globpattern) const +GlobOperator::convertToRegex(vespalib::stringref globpattern) { + if (globpattern.empty()) { + return "^$"; // Empty glob can only match the empty string. + } vespalib::asciistream ost; - ost << '^'; - for(uint32_t i=0, n=globpattern.size(); i<n; ++i) { + size_t i = 0; + if (globpattern[0] != '*') { + ost << '^'; + } else { + i += wildcard_run_length(0, globpattern); // Skip entire prefix wildcard run. + } + const size_t n = globpattern.size(); + for (; i < n; ++i) { switch(globpattern[i]) { - case '*': ost << ".*"; - break; - case '?': ost << "."; - break; - case '^': - case '$': - case '|': - case '{': - case '}': - case '(': - case ')': - case '[': - case ']': - case '\\': - case '+': - case '.': ost << '\\' << globpattern[i]; - break; - // Are there other regex special chars we need to escape? - default: ost << globpattern[i]; + case '*': + i += wildcard_run_length(i, globpattern) - 1; // -1 since we always inc by 1 anyway. + if (i != (n - 1)) { // Don't emit trailing wildcard. + ost << ".*"; + } + break; + case '?': + ost << '.'; + break; + case '^': + case '$': + case '|': + case '{': + case '}': + case '(': + case ')': + case '[': + case ']': + case '\\': + case '+': + case '.': + ost << '\\' << globpattern[i]; + break; + // Are there other regex special chars we need to escape? + default: ost << globpattern[i]; } } - ost << '$'; + if (globpattern[n - 1] != '*') { + ost << '$'; + } return ost.str(); } diff --git a/document/src/vespa/document/select/operator.h b/document/src/vespa/document/select/operator.h index 7383b6cf545..ad73d284c63 100644 --- a/document/src/vespa/document/select/operator.h +++ b/document/src/vespa/document/select/operator.h @@ -88,7 +88,27 @@ public: // Delegates to Value::globCompare ResultList compare(const Value& a, const Value& b) const override; ResultList trace(const Value&, const Value&, std::ostream& trace) const override; - vespalib::string convertToRegex(vespalib::stringref globpattern) const; + /** + * Converts a standard glob expression into a regular expression string, + * supporting the following glob semantics: + * '*' matches 0-n arbitrary characters + * '?' matches exactly 1 arbitrary character + * This code simplifies the resulting regex as much as possible to help + * minimize the number of possible catastrophic backtracking cases that + * can be triggered by wildcard regexes. + * + * The following simplifications are currently performed: + * - '' -> /^$/ (empty string match) + * '*' -> // (any string match) + * - '*foo*' -> /foo/ (substring match) + * - '*foo' -> /foo$/ (suffix match) + * - 'foo*' -> /^foo/ (prefix match) + * - collapsing runs of consecutive '*' wildcards into a single + * wildcard. *** is identical to ** which is identical to * etc, + * as all these match 0-n characters each. This also works with + * simplification, i.e. '***foo***' -> /foo/ and '***' -> // + */ + static vespalib::string convertToRegex(vespalib::stringref globpattern); static bool containsVariables(vespalib::stringref expression); static const GlobOperator GLOB; |