aboutsummaryrefslogtreecommitdiffstats
path: root/vespalib/src/vespa/vespalib/regex/regex.h
blob: 999ab9b120941746e8aa1cc81d7d91b4d25bfb9c (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include <memory>
#include <string>
#include <string_view>
#include <utility>

namespace vespalib {

/**
 * A simple Regex library wrapper which provides for both just-in-time
 * pattern evaluation as well as pattern precompilation and reuse.
 *
 * Robustness and input safety:
 * The underlying regex engine implementation must ensure that pattern
 * parsing and input processing is safe to be run on _untrusted_ inputs.
 * This means the underlying implementation shall provide upper bounds
 * on both memory and CPU time and may never crash or corrupt the process.
 *
 * We currently use Google RE2 under the hood to achieve this.
 *
 * Note: due to underlying RE2 limitations, string lengths may
 * not be longer than INT_MAX.
 *
 * Thread safety:
 * A Regex object is safe to be used from multiple threads.
 *
 * Exception safety:
 * Exceptions shall never be thrown from the regex code itself, neither
 * at parse time nor at match time (ancillary exceptions _could_ be thrown
 * from memory allocation failures etc, but we assume that the caller
 * is running vespamalloc which terminates the process instead, making
 * the whole thing effectively noexcept).
 *
 * If the provided regular expression pattern is malformed, parsing
 * fails silently; all match functions will return false immediately.
 */
class Regex {
    class Impl;
    std::unique_ptr<const Impl> _impl;

    explicit Regex(std::unique_ptr<const Impl> impl);
public:
    // TODO consider using type-safe parameter instead.
    enum Options {
        None              = 0,
        IgnoreCase        = 1,
        DotMatchesNewline = 2
    };

    //Default constructed object is invalid
    Regex();

    ~Regex();
    Regex(Regex&&) noexcept;
    Regex& operator=(Regex&&) noexcept;

    [[nodiscard]] bool valid() const noexcept { return bool(_impl); }

    [[nodiscard]] bool parsed_ok() const noexcept;

    [[nodiscard]] bool partial_match(std::string_view input) const noexcept;
    [[nodiscard]] bool full_match(std::string_view input) const noexcept;

    // Returns a pair of <lower bound, upper bound> prefix strings that constrain the possible
    // match-able range of inputs for this regex. If there is no shared prefix, or if extracting
    // the range fails, the strings will be empty.
    // Important: this is _only_ semantically valid if the regex is strictly start-anchored, i.e.
    // all possible matching paths start with '^'.
    // This method does _not_ validate that the regex is strictly start-anchored.
    [[nodiscard]] std::pair<std::string, std::string> possible_anchored_match_prefix_range() const;

    [[nodiscard]] static Regex from_pattern(std::string_view pattern, uint32_t opt_flags = Options::None);

    // Utility matchers for non-precompiled expressions.
    [[nodiscard]] static bool partial_match(std::string_view input, std::string_view pattern) noexcept;
    [[nodiscard]] static bool full_match(std::string_view input, std::string_view pattern) noexcept;
};

}