1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Format a regular expression structure as a string. |
6 | // Tested by parse_test.cc |
7 | |
8 | #include <string.h> |
9 | #include <string> |
10 | |
11 | #include "absl/strings/str_format.h" |
12 | #include "util/logging.h" |
13 | #include "util/utf.h" |
14 | #include "re2/regexp.h" |
15 | #include "re2/walker-inl.h" |
16 | |
17 | namespace re2 { |
18 | |
19 | enum { |
20 | PrecAtom, |
21 | PrecUnary, |
22 | PrecConcat, |
23 | PrecAlternate, |
24 | PrecEmpty, |
25 | PrecParen, |
26 | PrecToplevel, |
27 | }; |
28 | |
29 | // Helper function. See description below. |
30 | static void AppendCCRange(std::string* t, Rune lo, Rune hi); |
31 | |
32 | // Walker to generate string in s_. |
33 | // The arg pointers are actually integers giving the |
34 | // context precedence. |
35 | // The child_args are always NULL. |
36 | class ToStringWalker : public Regexp::Walker<int> { |
37 | public: |
38 | explicit ToStringWalker(std::string* t) : t_(t) {} |
39 | |
40 | virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); |
41 | virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, |
42 | int* child_args, int nchild_args); |
43 | virtual int ShortVisit(Regexp* re, int parent_arg) { |
44 | return 0; |
45 | } |
46 | |
47 | private: |
48 | std::string* t_; // The string the walker appends to. |
49 | |
50 | ToStringWalker(const ToStringWalker&) = delete; |
51 | ToStringWalker& operator=(const ToStringWalker&) = delete; |
52 | }; |
53 | |
54 | std::string Regexp::ToString() { |
55 | std::string t; |
56 | ToStringWalker w(&t); |
57 | w.WalkExponential(this, PrecToplevel, 100000); |
58 | if (w.stopped_early()) |
59 | t += " [truncated]" ; |
60 | return t; |
61 | } |
62 | |
63 | #define ToString DontCallToString // Avoid accidental recursion. |
64 | |
65 | // Visits re before children are processed. |
66 | // Appends ( if needed and passes new precedence to children. |
67 | int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { |
68 | int prec = parent_arg; |
69 | int nprec = PrecAtom; |
70 | |
71 | switch (re->op()) { |
72 | case kRegexpNoMatch: |
73 | case kRegexpEmptyMatch: |
74 | case kRegexpLiteral: |
75 | case kRegexpAnyChar: |
76 | case kRegexpAnyByte: |
77 | case kRegexpBeginLine: |
78 | case kRegexpEndLine: |
79 | case kRegexpBeginText: |
80 | case kRegexpEndText: |
81 | case kRegexpWordBoundary: |
82 | case kRegexpNoWordBoundary: |
83 | case kRegexpCharClass: |
84 | case kRegexpHaveMatch: |
85 | nprec = PrecAtom; |
86 | break; |
87 | |
88 | case kRegexpConcat: |
89 | case kRegexpLiteralString: |
90 | if (prec < PrecConcat) |
91 | t_->append("(?:" ); |
92 | nprec = PrecConcat; |
93 | break; |
94 | |
95 | case kRegexpAlternate: |
96 | if (prec < PrecAlternate) |
97 | t_->append("(?:" ); |
98 | nprec = PrecAlternate; |
99 | break; |
100 | |
101 | case kRegexpCapture: |
102 | t_->append("(" ); |
103 | if (re->cap() == 0) |
104 | LOG(DFATAL) << "kRegexpCapture cap() == 0" ; |
105 | if (re->name()) { |
106 | t_->append("?P<" ); |
107 | t_->append(*re->name()); |
108 | t_->append(">" ); |
109 | } |
110 | nprec = PrecParen; |
111 | break; |
112 | |
113 | case kRegexpStar: |
114 | case kRegexpPlus: |
115 | case kRegexpQuest: |
116 | case kRegexpRepeat: |
117 | if (prec < PrecUnary) |
118 | t_->append("(?:" ); |
119 | // The subprecedence here is PrecAtom instead of PrecUnary |
120 | // because PCRE treats two unary ops in a row as a parse error. |
121 | nprec = PrecAtom; |
122 | break; |
123 | } |
124 | |
125 | return nprec; |
126 | } |
127 | |
128 | static void AppendLiteral(std::string *t, Rune r, bool foldcase) { |
129 | if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\" , r)) { |
130 | t->append(1, '\\'); |
131 | t->append(1, static_cast<char>(r)); |
132 | } else if (foldcase && 'a' <= r && r <= 'z') { |
133 | r -= 'a' - 'A'; |
134 | t->append(1, '['); |
135 | t->append(1, static_cast<char>(r)); |
136 | t->append(1, static_cast<char>(r) + 'a' - 'A'); |
137 | t->append(1, ']'); |
138 | } else { |
139 | AppendCCRange(t, r, r); |
140 | } |
141 | } |
142 | |
143 | // Visits re after children are processed. |
144 | // For childless regexps, all the work is done here. |
145 | // For regexps with children, append any unary suffixes or ). |
146 | int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, |
147 | int* child_args, int nchild_args) { |
148 | int prec = parent_arg; |
149 | switch (re->op()) { |
150 | case kRegexpNoMatch: |
151 | // There's no simple symbol for "no match", but |
152 | // [^0-Runemax] excludes everything. |
153 | t_->append("[^\\x00-\\x{10ffff}]" ); |
154 | break; |
155 | |
156 | case kRegexpEmptyMatch: |
157 | // Append (?:) to make empty string visible, |
158 | // unless this is already being parenthesized. |
159 | if (prec < PrecEmpty) |
160 | t_->append("(?:)" ); |
161 | break; |
162 | |
163 | case kRegexpLiteral: |
164 | AppendLiteral(t_, re->rune(), |
165 | (re->parse_flags() & Regexp::FoldCase) != 0); |
166 | break; |
167 | |
168 | case kRegexpLiteralString: |
169 | for (int i = 0; i < re->nrunes(); i++) |
170 | AppendLiteral(t_, re->runes()[i], |
171 | (re->parse_flags() & Regexp::FoldCase) != 0); |
172 | if (prec < PrecConcat) |
173 | t_->append(")" ); |
174 | break; |
175 | |
176 | case kRegexpConcat: |
177 | if (prec < PrecConcat) |
178 | t_->append(")" ); |
179 | break; |
180 | |
181 | case kRegexpAlternate: |
182 | // Clumsy but workable: the children all appended | |
183 | // at the end of their strings, so just remove the last one. |
184 | if ((*t_)[t_->size()-1] == '|') |
185 | t_->erase(t_->size()-1); |
186 | else |
187 | LOG(DFATAL) << "Bad final char: " << t_; |
188 | if (prec < PrecAlternate) |
189 | t_->append(")" ); |
190 | break; |
191 | |
192 | case kRegexpStar: |
193 | t_->append("*" ); |
194 | if (re->parse_flags() & Regexp::NonGreedy) |
195 | t_->append("?" ); |
196 | if (prec < PrecUnary) |
197 | t_->append(")" ); |
198 | break; |
199 | |
200 | case kRegexpPlus: |
201 | t_->append("+" ); |
202 | if (re->parse_flags() & Regexp::NonGreedy) |
203 | t_->append("?" ); |
204 | if (prec < PrecUnary) |
205 | t_->append(")" ); |
206 | break; |
207 | |
208 | case kRegexpQuest: |
209 | t_->append("?" ); |
210 | if (re->parse_flags() & Regexp::NonGreedy) |
211 | t_->append("?" ); |
212 | if (prec < PrecUnary) |
213 | t_->append(")" ); |
214 | break; |
215 | |
216 | case kRegexpRepeat: |
217 | if (re->max() == -1) |
218 | t_->append(absl::StrFormat("{%d,}" , re->min())); |
219 | else if (re->min() == re->max()) |
220 | t_->append(absl::StrFormat("{%d}" , re->min())); |
221 | else |
222 | t_->append(absl::StrFormat("{%d,%d}" , re->min(), re->max())); |
223 | if (re->parse_flags() & Regexp::NonGreedy) |
224 | t_->append("?" ); |
225 | if (prec < PrecUnary) |
226 | t_->append(")" ); |
227 | break; |
228 | |
229 | case kRegexpAnyChar: |
230 | t_->append("." ); |
231 | break; |
232 | |
233 | case kRegexpAnyByte: |
234 | t_->append("\\C" ); |
235 | break; |
236 | |
237 | case kRegexpBeginLine: |
238 | t_->append("^" ); |
239 | break; |
240 | |
241 | case kRegexpEndLine: |
242 | t_->append("$" ); |
243 | break; |
244 | |
245 | case kRegexpBeginText: |
246 | t_->append("(?-m:^)" ); |
247 | break; |
248 | |
249 | case kRegexpEndText: |
250 | if (re->parse_flags() & Regexp::WasDollar) |
251 | t_->append("(?-m:$)" ); |
252 | else |
253 | t_->append("\\z" ); |
254 | break; |
255 | |
256 | case kRegexpWordBoundary: |
257 | t_->append("\\b" ); |
258 | break; |
259 | |
260 | case kRegexpNoWordBoundary: |
261 | t_->append("\\B" ); |
262 | break; |
263 | |
264 | case kRegexpCharClass: { |
265 | if (re->cc()->size() == 0) { |
266 | t_->append("[^\\x00-\\x{10ffff}]" ); |
267 | break; |
268 | } |
269 | t_->append("[" ); |
270 | // Heuristic: show class as negated if it contains the |
271 | // non-character 0xFFFE and yet somehow isn't full. |
272 | CharClass* cc = re->cc(); |
273 | if (cc->Contains(0xFFFE) && !cc->full()) { |
274 | cc = cc->Negate(); |
275 | t_->append("^" ); |
276 | } |
277 | for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) |
278 | AppendCCRange(t_, i->lo, i->hi); |
279 | if (cc != re->cc()) |
280 | cc->Delete(); |
281 | t_->append("]" ); |
282 | break; |
283 | } |
284 | |
285 | case kRegexpCapture: |
286 | t_->append(")" ); |
287 | break; |
288 | |
289 | case kRegexpHaveMatch: |
290 | // There's no syntax accepted by the parser to generate |
291 | // this node (it is generated by RE2::Set) so make something |
292 | // up that is readable but won't compile. |
293 | t_->append(absl::StrFormat("(?HaveMatch:%d)" , re->match_id())); |
294 | break; |
295 | } |
296 | |
297 | // If the parent is an alternation, append the | for it. |
298 | if (prec == PrecAlternate) |
299 | t_->append("|" ); |
300 | |
301 | return 0; |
302 | } |
303 | |
304 | // Appends a rune for use in a character class to the string t. |
305 | static void AppendCCChar(std::string* t, Rune r) { |
306 | if (0x20 <= r && r <= 0x7E) { |
307 | if (strchr("[]^-\\" , r)) |
308 | t->append("\\" ); |
309 | t->append(1, static_cast<char>(r)); |
310 | return; |
311 | } |
312 | switch (r) { |
313 | default: |
314 | break; |
315 | |
316 | case '\r': |
317 | t->append("\\r" ); |
318 | return; |
319 | |
320 | case '\t': |
321 | t->append("\\t" ); |
322 | return; |
323 | |
324 | case '\n': |
325 | t->append("\\n" ); |
326 | return; |
327 | |
328 | case '\f': |
329 | t->append("\\f" ); |
330 | return; |
331 | } |
332 | |
333 | if (r < 0x100) { |
334 | *t += absl::StrFormat("\\x%02x" , static_cast<int>(r)); |
335 | return; |
336 | } |
337 | *t += absl::StrFormat("\\x{%x}" , static_cast<int>(r)); |
338 | } |
339 | |
340 | static void AppendCCRange(std::string* t, Rune lo, Rune hi) { |
341 | if (lo > hi) |
342 | return; |
343 | AppendCCChar(t, lo); |
344 | if (lo < hi) { |
345 | t->append("-" ); |
346 | AppendCCChar(t, hi); |
347 | } |
348 | } |
349 | |
350 | } // namespace re2 |
351 | |