summaryrefslogtreecommitdiffstats
path: root/document
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@verizonmedia.com>2020-02-07 14:48:36 +0100
committerGitHub <noreply@github.com>2020-02-07 14:48:36 +0100
commit9d2c5579408e08c1040000a88a4d80ccdf863f20 (patch)
tree2213c301d1690886bd1d67cb198d108de6429ebf /document
parentbde5d17c9af003f2375c9368a97159f9dc660813 (diff)
parenta4b3b17d971f7c528f1820f441c4c7e1031b0ceb (diff)
Merge pull request #12096 from vespa-engine/vekterli/simplify-regexes-generated-from-glob-patterns
Simplify regexes generated from document selection glob patterns
Diffstat (limited to 'document')
-rw-r--r--document/src/tests/documentselectparsertest.cpp90
-rw-r--r--document/src/vespa/document/select/operator.cpp75
-rw-r--r--document/src/vespa/document/select/operator.h22
3 files changed, 129 insertions, 58 deletions
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp
index 3441a336553..c1e5fbecd14 100644
--- a/document/src/tests/documentselectparsertest.cpp
+++ b/document/src/tests/documentselectparsertest.cpp
@@ -16,6 +16,7 @@
#include <vespa/document/select/invalidconstant.h>
#include <vespa/document/select/doctype.h>
#include <vespa/document/select/compare.h>
+#include <vespa/document/select/operator.h>
#include <vespa/document/select/parse_utils.h>
#include <vespa/vespalib/util/exceptions.h>
#include <limits>
@@ -56,16 +57,6 @@ protected:
void SetUp() override;
void createDocs();
- void testOperators0();
- void testOperators1();
- void testOperators2();
- void testOperators3();
- void testOperators4();
- void testOperators5();
- void testOperators6();
- void testOperators7();
- void testOperators8();
- void testOperators9();
void testDocumentUpdates0();
void testDocumentUpdates1();
void testDocumentUpdates2();
@@ -502,21 +493,7 @@ DocumentSelectParserTest::doParse(vespalib::stringref expr,
EXPECT_EQ(select::ResultList(select::Result::result), \
doParse(expr, (doc).getId())) << (std::string("Doc id: ") + expr);
-TEST_F(DocumentSelectParserTest, testOperators)
-{
- testOperators0();
- testOperators1();
- testOperators2();
- testOperators3();
- testOperators4();
- testOperators5();
- testOperators6();
- testOperators7();
- testOperators8();
- testOperators9();
-}
-
-void DocumentSelectParserTest::testOperators0()
+TEST_F(DocumentSelectParserTest, operators_0)
{
createDocs();
@@ -555,8 +532,17 @@ void DocumentSelectParserTest::testOperators0()
PARSE("\"foo\" == 'foo'", *_doc[0], True);
PARSE("\"bar\" = \"a\"", *_doc[0], False);
PARSE("\"bar\" = \"*a*\"", *_doc[0], True);
+ PARSE("\"bar\" = \"*x*\"", *_doc[0], False);
+ PARSE("\"bar\" = \"ba*\"", *_doc[0], True);
+ PARSE("\"bar\" = \"a*\"", *_doc[0], False)
+ PARSE("\"bar\" = \"*ar\"", *_doc[0], True);
+ PARSE("\"bar\" = \"*a\"", *_doc[0], False);
PARSE("\"bar\" = \"\"", *_doc[0], False);
PARSE("\"\" = \"\"", *_doc[0], True);
+ PARSE("\"\" = \"*\"", *_doc[0], True);
+ PARSE("\"\" = \"****\"", *_doc[0], True);
+ PARSE("\"a\" = \"*?*\"", *_doc[0], True);
+ PARSE("\"a\" = \"*??*\"", *_doc[0], False);
PARSE("\"bar\" =~ \"^a$\"", *_doc[0], False);
PARSE("\"bar\" =~ \"a\"", *_doc[0], True);
PARSE("\"bar\" =~ \"\"", *_doc[0], True);
@@ -565,7 +551,16 @@ void DocumentSelectParserTest::testOperators0()
PARSE("30 = 30", *_doc[0], True);
}
-void DocumentSelectParserTest::testOperators1()
+TEST_F(DocumentSelectParserTest, regex_matching_does_not_bind_anchors_to_newlines) {
+ createDocs();
+
+ PARSE("\"a\\nb\\nc\" =~ \"^b$\"", *_doc[0], False);
+ PARSE("\"a\\r\\nb\\r\\nc\" =~ \"^b$\"", *_doc[0], False);
+ // Same applies to implicit regex created from glob expression
+ PARSE("\"a\\nb\\nc\" = \"b\"", *_doc[0], False);
+}
+
+TEST_F(DocumentSelectParserTest, operators_1)
{
createDocs();
@@ -612,7 +607,7 @@ void DocumentSelectParserTest::testOperators1()
PARSE("testdoctype1.headerval = 10", *_doc[4], True);
}
-void DocumentSelectParserTest::testOperators2()
+TEST_F(DocumentSelectParserTest, operators_2)
{
createDocs();
@@ -637,7 +632,7 @@ void DocumentSelectParserTest::testOperators2()
PARSEI("id.group == \"xyzzy\"", *_doc[10], True);
}
-void DocumentSelectParserTest::testOperators3()
+TEST_F(DocumentSelectParserTest, operators_3)
{
createDocs();
{
@@ -670,7 +665,7 @@ void DocumentSelectParserTest::testOperators3()
PARSEI("id == \"id:footype:testdoctype1:n=123456789:badger\"", *_doc[5], False);
}
-void DocumentSelectParserTest::testOperators4()
+TEST_F(DocumentSelectParserTest, operators_4)
{
createDocs();
@@ -697,7 +692,7 @@ void DocumentSelectParserTest::testOperators4()
PARSE("false or testdoctype1.content = 1", *_doc[0], Invalid);
}
-void DocumentSelectParserTest::testOperators5()
+TEST_F(DocumentSelectParserTest, operators_5)
{
createDocs();
@@ -726,7 +721,7 @@ void DocumentSelectParserTest::testOperators5()
PARSEI("-6 % 10 = -6", *_doc[0], True);
}
-void DocumentSelectParserTest::testOperators6()
+TEST_F(DocumentSelectParserTest, operators_6)
{
createDocs();
@@ -770,7 +765,7 @@ void DocumentSelectParserTest::testOperators6()
PARSE("testdoctype1.headerlongval<0", *_doc[7], True);
}
-void DocumentSelectParserTest::testOperators7()
+TEST_F(DocumentSelectParserTest, operators_7)
{
createDocs();
@@ -807,7 +802,7 @@ void DocumentSelectParserTest::testOperators7()
PARSE("testdoctype1.structarray[$x].key == 15 AND testdoctype1.structarray[$y].value == \"structval2\"", *_doc[1], True);
}
-void DocumentSelectParserTest::testOperators8()
+TEST_F(DocumentSelectParserTest, operators_8)
{
createDocs();
@@ -840,7 +835,7 @@ void DocumentSelectParserTest::testOperators8()
PARSE("testdoctype1.structarrmap{$x}[$y].key == 15 AND testdoctype1.structarrmap{$y}[$x].value == \"structval2\"", *_doc[1], False);
}
-void DocumentSelectParserTest::testOperators9()
+TEST_F(DocumentSelectParserTest, operators_9)
{
createDocs();
@@ -1051,6 +1046,11 @@ void DocumentSelectParserTest::testDocumentUpdates0()
PARSEI("\"foo\" == \"foo\"", *_update[0], True);
PARSEI("\"bar\" = \"a\"", *_update[0], False);
PARSEI("\"bar\" = \"*a*\"", *_update[0], True);
+ PARSEI("\"bar\" = \"**\"", *_update[0], True);
+ PARSEI("\"bar\" = \"***\"", *_update[0], True);
+ PARSEI("\"bar\" = \"****\"", *_update[0], True);
+ PARSEI("\"bar\" = \"???\"", *_update[0], True);
+ PARSEI("\"bar\" = \"????\"", *_update[0], False);
PARSEI("\"bar\" = \"\"", *_update[0], False);
PARSEI("\"\" = \"\"", *_update[0], True);
PARSEI("\"bar\" =~ \"^a$\"", *_update[0], False);
@@ -1550,4 +1550,26 @@ TEST_F(DocumentSelectParserTest, imported_field_references_only_support_for_simp
PARSE("with_imported.my_imported_field{foo}", doc, Invalid);
}
+TEST_F(DocumentSelectParserTest, prefix_and_suffix_wildcard_globs_are_rewritten_to_optimized_form) {
+ using select::GlobOperator;
+ EXPECT_EQ(GlobOperator::convertToRegex("*foo"), "foo$");
+ EXPECT_EQ(GlobOperator::convertToRegex("foo*"), "^foo");
+ EXPECT_EQ(GlobOperator::convertToRegex("*foo*"), "foo");
+ EXPECT_EQ(GlobOperator::convertToRegex("*"), ""); // Matches any string.
+ EXPECT_EQ(GlobOperator::convertToRegex("**"), ""); // Still matches any string.
+}
+
+TEST_F(DocumentSelectParserTest, redundant_glob_wildcards_are_collapsed_into_minimal_form) {
+ using select::GlobOperator;
+ EXPECT_EQ(GlobOperator::convertToRegex("***"), ""); // Even still matches any string.
+ EXPECT_EQ(GlobOperator::convertToRegex("**foo**"), "foo");
+ EXPECT_EQ(GlobOperator::convertToRegex("foo***"), "^foo");
+ EXPECT_EQ(GlobOperator::convertToRegex("***foo"), "foo$");
+ EXPECT_EQ(GlobOperator::convertToRegex("foo**bar"), "^foo.*bar$");
+ EXPECT_EQ(GlobOperator::convertToRegex("**foo*bar**"), "foo.*bar");
+ EXPECT_EQ(GlobOperator::convertToRegex("**foo***bar**"), "foo.*bar");
+ EXPECT_EQ(GlobOperator::convertToRegex("*?*"), ".");
+ EXPECT_EQ(GlobOperator::convertToRegex("*?*?*?*"), "..*..*."); // Don't try this at home, kids!
+}
+
} // document
diff --git a/document/src/vespa/document/select/operator.cpp b/document/src/vespa/document/select/operator.cpp
index eaa795549bf..ef2ee26bdbd 100644
--- a/document/src/vespa/document/select/operator.cpp
+++ b/document/src/vespa/document/select/operator.cpp
@@ -191,35 +191,64 @@ GlobOperator::traceImpl(const Value& a, const Value& b, std::ostream& ost) const
return match(left->getValue(), regex);
}
+namespace {
+
+// Returns the number of consecutive wildcard ('*') characters found from
+// _and including_ the character at `i`, i.e. the wildcard run length.
+size_t wildcard_run_length(size_t i, vespalib::stringref str) {
+ size_t n = 0;
+ for (; (i < str.size()) && (str[i] == '*'); ++n, ++i) {}
+ return n;
+}
+
+}
+
vespalib::string
-GlobOperator::convertToRegex(vespalib::stringref globpattern) const
+GlobOperator::convertToRegex(vespalib::stringref globpattern)
{
+ if (globpattern.empty()) {
+ return "^$"; // Empty glob can only match the empty string.
+ }
vespalib::asciistream ost;
- ost << '^';
- for(uint32_t i=0, n=globpattern.size(); i<n; ++i) {
+ size_t i = 0;
+ if (globpattern[0] != '*') {
+ ost << '^';
+ } else {
+ i += wildcard_run_length(0, globpattern); // Skip entire prefix wildcard run.
+ }
+ const size_t n = globpattern.size();
+ for (; i < n; ++i) {
switch(globpattern[i]) {
- case '*': ost << ".*";
- break;
- case '?': ost << ".";
- break;
- case '^':
- case '$':
- case '|':
- case '{':
- case '}':
- case '(':
- case ')':
- case '[':
- case ']':
- case '\\':
- case '+':
- case '.': ost << '\\' << globpattern[i];
- break;
- // Are there other regex special chars we need to escape?
- default: ost << globpattern[i];
+ case '*':
+ i += wildcard_run_length(i, globpattern) - 1; // -1 since we always inc by 1 anyway.
+ if (i != (n - 1)) { // Don't emit trailing wildcard.
+ ost << ".*";
+ }
+ break;
+ case '?':
+ ost << '.';
+ break;
+ case '^':
+ case '$':
+ case '|':
+ case '{':
+ case '}':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case '\\':
+ case '+':
+ case '.':
+ ost << '\\' << globpattern[i];
+ break;
+ // Are there other regex special chars we need to escape?
+ default: ost << globpattern[i];
}
}
- ost << '$';
+ if (globpattern[n - 1] != '*') {
+ ost << '$';
+ }
return ost.str();
}
diff --git a/document/src/vespa/document/select/operator.h b/document/src/vespa/document/select/operator.h
index 7383b6cf545..ad73d284c63 100644
--- a/document/src/vespa/document/select/operator.h
+++ b/document/src/vespa/document/select/operator.h
@@ -88,7 +88,27 @@ public:
// Delegates to Value::globCompare
ResultList compare(const Value& a, const Value& b) const override;
ResultList trace(const Value&, const Value&, std::ostream& trace) const override;
- vespalib::string convertToRegex(vespalib::stringref globpattern) const;
+ /**
+ * Converts a standard glob expression into a regular expression string,
+ * supporting the following glob semantics:
+ * '*' matches 0-n arbitrary characters
+ * '?' matches exactly 1 arbitrary character
+ * This code simplifies the resulting regex as much as possible to help
+ * minimize the number of possible catastrophic backtracking cases that
+ * can be triggered by wildcard regexes.
+ *
+ * The following simplifications are currently performed:
+ * - '' -> /^$/ (empty string match)
+ * '*' -> // (any string match)
+ * - '*foo*' -> /foo/ (substring match)
+ * - '*foo' -> /foo$/ (suffix match)
+ * - 'foo*' -> /^foo/ (prefix match)
+ * - collapsing runs of consecutive '*' wildcards into a single
+ * wildcard. *** is identical to ** which is identical to * etc,
+ * as all these match 0-n characters each. This also works with
+ * simplification, i.e. '***foo***' -> /foo/ and '***' -> //
+ */
+ static vespalib::string convertToRegex(vespalib::stringref globpattern);
static bool containsVariables(vespalib::stringref expression);
static const GlobOperator GLOB;