1 | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Determine whether this library should match PCRE exactly |
6 | // for a particular Regexp. (If so, the testing framework can |
7 | // check that it does.) |
8 | // |
9 | // This library matches PCRE except in these cases: |
10 | // * the regexp contains a repetition of an empty string, |
11 | // like (a*)* or (a*)+. In this case, PCRE will treat |
12 | // the repetition sequence as ending with an empty string, |
13 | // while this library does not. |
14 | // * Perl and PCRE differ on whether \v matches \n. |
15 | // For historical reasons, this library implements the Perl behavior. |
16 | // * Perl and PCRE allow $ in one-line mode to match either the very |
17 | // end of the text or just before a \n at the end of the text. |
18 | // This library requires it to match only the end of the text. |
19 | // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to |
20 | // match the end of the text if the last character is a \n. |
21 | // This library does allow it. |
22 | // |
23 | // Regexp::MimicsPCRE checks for any of these conditions. |
24 | |
25 | #include "util/logging.h" |
26 | #include "re2/regexp.h" |
27 | #include "re2/walker-inl.h" |
28 | |
29 | namespace re2 { |
30 | |
31 | // Returns whether re might match an empty string. |
32 | static bool CanBeEmptyString(Regexp *re); |
33 | |
34 | // Walker class to compute whether library handles a regexp |
35 | // exactly as PCRE would. See comment at top for conditions. |
36 | |
37 | class PCREWalker : public Regexp::Walker<bool> { |
38 | public: |
39 | PCREWalker() {} |
40 | |
41 | virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, |
42 | bool* child_args, int nchild_args); |
43 | |
44 | virtual bool ShortVisit(Regexp* re, bool a) { |
45 | // Should never be called: we use Walk(), not WalkExponential(). |
46 | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
47 | LOG(DFATAL) << "PCREWalker::ShortVisit called" ; |
48 | #endif |
49 | return a; |
50 | } |
51 | |
52 | private: |
53 | PCREWalker(const PCREWalker&) = delete; |
54 | PCREWalker& operator=(const PCREWalker&) = delete; |
55 | }; |
56 | |
57 | // Called after visiting each of re's children and accumulating |
58 | // the return values in child_args. So child_args contains whether |
59 | // this library mimics PCRE for those subexpressions. |
60 | bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, |
61 | bool* child_args, int nchild_args) { |
62 | // If children failed, so do we. |
63 | for (int i = 0; i < nchild_args; i++) |
64 | if (!child_args[i]) |
65 | return false; |
66 | |
67 | // Otherwise look for other reasons to fail. |
68 | switch (re->op()) { |
69 | // Look for repeated empty string. |
70 | case kRegexpStar: |
71 | case kRegexpPlus: |
72 | case kRegexpQuest: |
73 | if (CanBeEmptyString(re->sub()[0])) |
74 | return false; |
75 | break; |
76 | case kRegexpRepeat: |
77 | if (re->max() == -1 && CanBeEmptyString(re->sub()[0])) |
78 | return false; |
79 | break; |
80 | |
81 | // Look for \v |
82 | case kRegexpLiteral: |
83 | if (re->rune() == '\v') |
84 | return false; |
85 | break; |
86 | |
87 | // Look for $ in single-line mode. |
88 | case kRegexpEndText: |
89 | case kRegexpEmptyMatch: |
90 | if (re->parse_flags() & Regexp::WasDollar) |
91 | return false; |
92 | break; |
93 | |
94 | // Look for ^ in multi-line mode. |
95 | case kRegexpBeginLine: |
96 | // No condition: in single-line mode ^ becomes kRegexpBeginText. |
97 | return false; |
98 | |
99 | default: |
100 | break; |
101 | } |
102 | |
103 | // Not proven guilty. |
104 | return true; |
105 | } |
106 | |
107 | // Returns whether this regexp's behavior will mimic PCRE's exactly. |
108 | bool Regexp::MimicsPCRE() { |
109 | PCREWalker w; |
110 | return w.Walk(this, true); |
111 | } |
112 | |
113 | |
114 | // Walker class to compute whether a Regexp can match an empty string. |
115 | // It is okay to overestimate. For example, \b\B cannot match an empty |
116 | // string, because \b and \B are mutually exclusive, but this isn't |
117 | // that smart and will say it can. Spurious empty strings |
118 | // will reduce the number of regexps we sanity check against PCRE, |
119 | // but they won't break anything. |
120 | |
121 | class EmptyStringWalker : public Regexp::Walker<bool> { |
122 | public: |
123 | EmptyStringWalker() {} |
124 | |
125 | virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, |
126 | bool* child_args, int nchild_args); |
127 | |
128 | virtual bool ShortVisit(Regexp* re, bool a) { |
129 | // Should never be called: we use Walk(), not WalkExponential(). |
130 | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
131 | LOG(DFATAL) << "EmptyStringWalker::ShortVisit called" ; |
132 | #endif |
133 | return a; |
134 | } |
135 | |
136 | private: |
137 | EmptyStringWalker(const EmptyStringWalker&) = delete; |
138 | EmptyStringWalker& operator=(const EmptyStringWalker&) = delete; |
139 | }; |
140 | |
141 | // Called after visiting re's children. child_args contains the return |
142 | // value from each of the children's PostVisits (i.e., whether each child |
143 | // can match an empty string). Returns whether this clause can match an |
144 | // empty string. |
145 | bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, |
146 | bool* child_args, int nchild_args) { |
147 | switch (re->op()) { |
148 | case kRegexpNoMatch: // never empty |
149 | case kRegexpLiteral: |
150 | case kRegexpAnyChar: |
151 | case kRegexpAnyByte: |
152 | case kRegexpCharClass: |
153 | case kRegexpLiteralString: |
154 | return false; |
155 | |
156 | case kRegexpEmptyMatch: // always empty |
157 | case kRegexpBeginLine: // always empty, when they match |
158 | case kRegexpEndLine: |
159 | case kRegexpNoWordBoundary: |
160 | case kRegexpWordBoundary: |
161 | case kRegexpBeginText: |
162 | case kRegexpEndText: |
163 | case kRegexpStar: // can always be empty |
164 | case kRegexpQuest: |
165 | case kRegexpHaveMatch: |
166 | return true; |
167 | |
168 | case kRegexpConcat: // can be empty if all children can |
169 | for (int i = 0; i < nchild_args; i++) |
170 | if (!child_args[i]) |
171 | return false; |
172 | return true; |
173 | |
174 | case kRegexpAlternate: // can be empty if any child can |
175 | for (int i = 0; i < nchild_args; i++) |
176 | if (child_args[i]) |
177 | return true; |
178 | return false; |
179 | |
180 | case kRegexpPlus: // can be empty if the child can |
181 | case kRegexpCapture: |
182 | return child_args[0]; |
183 | |
184 | case kRegexpRepeat: // can be empty if child can or is x{0} |
185 | return child_args[0] || re->min() == 0; |
186 | } |
187 | return false; |
188 | } |
189 | |
190 | // Returns whether re can match an empty string. |
191 | static bool CanBeEmptyString(Regexp* re) { |
192 | EmptyStringWalker w; |
193 | return w.Walk(re, true); |
194 | } |
195 | |
196 | } // namespace re2 |
197 | |