diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-03-20 10:27:54 +0000 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-03-20 10:27:54 +0000 |
commit | 2488c506b929dd64c7d9463557af9c1603653fae (patch) | |
tree | 61cd68acb84553fcf3a41c26f511d1f21e0e36fa /vespalib | |
parent | b9cb269f09e3c46e0b4bb263428dd4dba5817e0f (diff) |
Add utility wrapper around RE2 possible regex prefix match range
For a strictly start-anchored regex, this provides a lower/upper
bound pair that constrains the possible prefix range that may contain
a string matching the regex.
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/src/tests/regex/regex.cpp | 26 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/regex/regex.cpp | 14 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/regex/regex.h | 15 |
3 files changed, 52 insertions, 3 deletions
diff --git a/vespalib/src/tests/regex/regex.cpp b/vespalib/src/tests/regex/regex.cpp index 471ba84a68f..28fb6bfffc3 100644 --- a/vespalib/src/tests/regex/regex.cpp +++ b/vespalib/src/tests/regex/regex.cpp @@ -150,4 +150,30 @@ TEST("Test that default constructed regex is invalid.") { ASSERT_FALSE(dummy.valid()); } +TEST("Can extract min/max prefix range from anchored regex") { + auto min_max = Regex::from_pattern("^.*").possible_anchored_match_prefix_range(); + EXPECT_EQUAL(min_max.first, ""); + EXPECT_EQUAL(min_max.second, "\xf4\x8f\xbf\xc0"); // Highest possible Unicode char (U+10FFFF) as UTF-8, plus 1 + + min_max = Regex::from_pattern("^hello").possible_anchored_match_prefix_range(); + EXPECT_EQUAL(min_max.first, "hello"); + EXPECT_EQUAL(min_max.second, "hello"); + + min_max = Regex::from_pattern("^hello|^world").possible_anchored_match_prefix_range(); + EXPECT_EQUAL(min_max.first, "hello"); + EXPECT_EQUAL(min_max.second, "world"); + + min_max = Regex::from_pattern("(^hello|^world|^zoidberg)").possible_anchored_match_prefix_range(); + EXPECT_EQUAL(min_max.first, "hello"); + EXPECT_EQUAL(min_max.second, "zoidberg"); + + min_max = Regex::from_pattern("^hello (foo|bar|zoo)").possible_anchored_match_prefix_range(); + EXPECT_EQUAL(min_max.first, "hello bar"); + EXPECT_EQUAL(min_max.second, "hello zoo"); + + min_max = Regex::from_pattern("^(hello|world)+").possible_anchored_match_prefix_range(); + EXPECT_EQUAL(min_max.first, "hello"); + EXPECT_EQUAL(min_max.second, "worldwp"); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/vespalib/src/vespa/vespalib/regex/regex.cpp b/vespalib/src/vespa/vespalib/regex/regex.cpp index a904ed4d4ee..c73df1182d6 100644 --- a/vespalib/src/vespa/vespalib/regex/regex.cpp +++ b/vespalib/src/vespa/vespalib/regex/regex.cpp @@ -49,6 +49,16 @@ public: } return RE2::FullMatch(StringPiece(input.data(), input.size()), _regex); } + + std::pair<std::string, std::string> possible_anchored_match_prefix_range() const { + constexpr int max_len = 128; // TODO determine a "reasonable" value. RE2 docs are not clear on this. + std::string min_prefix, max_prefix; + + if (!_regex.PossibleMatchRange(&min_prefix, &max_prefix, max_len)) { + return {}; + } + return {std::move(min_prefix), std::move(max_prefix)}; + } }; Regex Regex::from_pattern(std::string_view pattern, uint32_t opt_mask) { @@ -76,6 +86,10 @@ bool Regex::full_match(std::string_view input) const noexcept { return _impl->full_match(input); } +std::pair<std::string, std::string> Regex::possible_anchored_match_prefix_range() const { + return _impl->possible_anchored_match_prefix_range(); +} + bool Regex::partial_match(std::string_view input, std::string_view pattern) noexcept { assert(pattern.size() <= INT32_MAX); Impl impl(pattern, RE2::Quiet); diff --git a/vespalib/src/vespa/vespalib/regex/regex.h b/vespalib/src/vespa/vespalib/regex/regex.h index 6a4d6bc47fc..50eb76b14d4 100644 --- a/vespalib/src/vespa/vespalib/regex/regex.h +++ b/vespalib/src/vespa/vespalib/regex/regex.h @@ -4,6 +4,7 @@ #include <memory> #include <string> #include <string_view> +#include <utility> namespace vespalib { @@ -37,7 +38,7 @@ namespace vespalib { */ class Regex { class Impl; - std::unique_ptr<const Impl> _impl; // shared_ptr to allow for cheap copying. + std::unique_ptr<const Impl> _impl; explicit Regex(std::unique_ptr<const Impl> impl); public: @@ -55,14 +56,22 @@ public: Regex(Regex&&) noexcept; Regex& operator=(Regex&&) noexcept; - bool valid() const { return bool(_impl); } + [[nodiscard]] bool valid() const noexcept { return bool(_impl); } [[nodiscard]] bool parsed_ok() const noexcept; [[nodiscard]] bool partial_match(std::string_view input) const noexcept; [[nodiscard]] bool full_match(std::string_view input) const noexcept; - static Regex from_pattern(std::string_view pattern, uint32_t opt_flags = Options::None); + // Returns a pair of <lower bound, upper bound> prefix strings that constrain the possible + // match-able range of inputs for this regex. If there is no shared prefix, or if extracting + // the range fails, the strings will be empty. + // Important: this is _only_ semantically valid if the regex is strictly start-anchored, i.e. + // all possible matching paths start with '^'. + // This method does _not_ validate that the regex is strictly start-anchored. + [[nodiscard]] std::pair<std::string, std::string> possible_anchored_match_prefix_range() const; + + [[nodiscard]] static Regex from_pattern(std::string_view pattern, uint32_t opt_flags = Options::None); // Utility matchers for non-precompiled expressions. [[nodiscard]] static bool partial_match(std::string_view input, std::string_view pattern) noexcept; |