summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2023-03-20 10:27:54 +0000
committerTor Brede Vekterli <vekterli@yahooinc.com>2023-03-20 10:27:54 +0000
commit2488c506b929dd64c7d9463557af9c1603653fae (patch)
tree61cd68acb84553fcf3a41c26f511d1f21e0e36fa
parentb9cb269f09e3c46e0b4bb263428dd4dba5817e0f (diff)
Add utility wrapper around RE2 possible regex prefix match range
For a strictly start-anchored regex, this provides a lower/upper bound pair that constrains the possible prefix range that may contain a string matching the regex.
-rw-r--r--vespalib/src/tests/regex/regex.cpp26
-rw-r--r--vespalib/src/vespa/vespalib/regex/regex.cpp14
-rw-r--r--vespalib/src/vespa/vespalib/regex/regex.h15
3 files changed, 52 insertions, 3 deletions
diff --git a/vespalib/src/tests/regex/regex.cpp b/vespalib/src/tests/regex/regex.cpp
index 471ba84a68f..28fb6bfffc3 100644
--- a/vespalib/src/tests/regex/regex.cpp
+++ b/vespalib/src/tests/regex/regex.cpp
@@ -150,4 +150,30 @@ TEST("Test that default constructed regex is invalid.") {
ASSERT_FALSE(dummy.valid());
}
+TEST("Can extract min/max prefix range from anchored regex") {
+ auto min_max = Regex::from_pattern("^.*").possible_anchored_match_prefix_range();
+ EXPECT_EQUAL(min_max.first, "");
+ EXPECT_EQUAL(min_max.second, "\xf4\x8f\xbf\xc0"); // Highest possible Unicode char (U+10FFFF) as UTF-8, plus 1
+
+ min_max = Regex::from_pattern("^hello").possible_anchored_match_prefix_range();
+ EXPECT_EQUAL(min_max.first, "hello");
+ EXPECT_EQUAL(min_max.second, "hello");
+
+ min_max = Regex::from_pattern("^hello|^world").possible_anchored_match_prefix_range();
+ EXPECT_EQUAL(min_max.first, "hello");
+ EXPECT_EQUAL(min_max.second, "world");
+
+ min_max = Regex::from_pattern("(^hello|^world|^zoidberg)").possible_anchored_match_prefix_range();
+ EXPECT_EQUAL(min_max.first, "hello");
+ EXPECT_EQUAL(min_max.second, "zoidberg");
+
+ min_max = Regex::from_pattern("^hello (foo|bar|zoo)").possible_anchored_match_prefix_range();
+ EXPECT_EQUAL(min_max.first, "hello bar");
+ EXPECT_EQUAL(min_max.second, "hello zoo");
+
+ min_max = Regex::from_pattern("^(hello|world)+").possible_anchored_match_prefix_range();
+ EXPECT_EQUAL(min_max.first, "hello");
+ EXPECT_EQUAL(min_max.second, "worldwp");
+}
+
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/vespalib/src/vespa/vespalib/regex/regex.cpp b/vespalib/src/vespa/vespalib/regex/regex.cpp
index a904ed4d4ee..c73df1182d6 100644
--- a/vespalib/src/vespa/vespalib/regex/regex.cpp
+++ b/vespalib/src/vespa/vespalib/regex/regex.cpp
@@ -49,6 +49,16 @@ public:
}
return RE2::FullMatch(StringPiece(input.data(), input.size()), _regex);
}
+
+ std::pair<std::string, std::string> possible_anchored_match_prefix_range() const {
+ constexpr int max_len = 128; // TODO determine a "reasonable" value. RE2 docs are not clear on this.
+ std::string min_prefix, max_prefix;
+
+ if (!_regex.PossibleMatchRange(&min_prefix, &max_prefix, max_len)) {
+ return {};
+ }
+ return {std::move(min_prefix), std::move(max_prefix)};
+ }
};
Regex Regex::from_pattern(std::string_view pattern, uint32_t opt_mask) {
@@ -76,6 +86,10 @@ bool Regex::full_match(std::string_view input) const noexcept {
return _impl->full_match(input);
}
+std::pair<std::string, std::string> Regex::possible_anchored_match_prefix_range() const {
+ return _impl->possible_anchored_match_prefix_range();
+}
+
bool Regex::partial_match(std::string_view input, std::string_view pattern) noexcept {
assert(pattern.size() <= INT32_MAX);
Impl impl(pattern, RE2::Quiet);
diff --git a/vespalib/src/vespa/vespalib/regex/regex.h b/vespalib/src/vespa/vespalib/regex/regex.h
index 6a4d6bc47fc..50eb76b14d4 100644
--- a/vespalib/src/vespa/vespalib/regex/regex.h
+++ b/vespalib/src/vespa/vespalib/regex/regex.h
@@ -4,6 +4,7 @@
#include <memory>
#include <string>
#include <string_view>
+#include <utility>
namespace vespalib {
@@ -37,7 +38,7 @@ namespace vespalib {
*/
class Regex {
class Impl;
- std::unique_ptr<const Impl> _impl; // shared_ptr to allow for cheap copying.
+ std::unique_ptr<const Impl> _impl;
explicit Regex(std::unique_ptr<const Impl> impl);
public:
@@ -55,14 +56,22 @@ public:
Regex(Regex&&) noexcept;
Regex& operator=(Regex&&) noexcept;
- bool valid() const { return bool(_impl); }
+ [[nodiscard]] bool valid() const noexcept { return bool(_impl); }
[[nodiscard]] bool parsed_ok() const noexcept;
[[nodiscard]] bool partial_match(std::string_view input) const noexcept;
[[nodiscard]] bool full_match(std::string_view input) const noexcept;
- static Regex from_pattern(std::string_view pattern, uint32_t opt_flags = Options::None);
+ // Returns a pair of <lower bound, upper bound> prefix strings that constrain the possible
+ // match-able range of inputs for this regex. If there is no shared prefix, or if extracting
+ // the range fails, the strings will be empty.
+ // Important: this is _only_ semantically valid if the regex is strictly start-anchored, i.e.
+ // all possible matching paths start with '^'.
+ // This method does _not_ validate that the regex is strictly start-anchored.
+ [[nodiscard]] std::pair<std::string, std::string> possible_anchored_match_prefix_range() const;
+
+ [[nodiscard]] static Regex from_pattern(std::string_view pattern, uint32_t opt_flags = Options::None);
// Utility matchers for non-precompiled expressions.
[[nodiscard]] static bool partial_match(std::string_view input, std::string_view pattern) noexcept;