1// Copyright 2006 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Format a regular expression structure as a string.
6// Tested by parse_test.cc
7
8#include <string.h>
9#include <string>
10
11#include "absl/strings/str_format.h"
12#include "util/logging.h"
13#include "util/utf.h"
14#include "re2/regexp.h"
15#include "re2/walker-inl.h"
16
17namespace re2 {
18
19enum {
20 PrecAtom,
21 PrecUnary,
22 PrecConcat,
23 PrecAlternate,
24 PrecEmpty,
25 PrecParen,
26 PrecToplevel,
27};
28
29// Helper function. See description below.
30static void AppendCCRange(std::string* t, Rune lo, Rune hi);
31
32// Walker to generate string in s_.
33// The arg pointers are actually integers giving the
34// context precedence.
35// The child_args are always NULL.
36class ToStringWalker : public Regexp::Walker<int> {
37 public:
38 explicit ToStringWalker(std::string* t) : t_(t) {}
39
40 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
41 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
42 int* child_args, int nchild_args);
43 virtual int ShortVisit(Regexp* re, int parent_arg) {
44 return 0;
45 }
46
47 private:
48 std::string* t_; // The string the walker appends to.
49
50 ToStringWalker(const ToStringWalker&) = delete;
51 ToStringWalker& operator=(const ToStringWalker&) = delete;
52};
53
54std::string Regexp::ToString() {
55 std::string t;
56 ToStringWalker w(&t);
57 w.WalkExponential(this, PrecToplevel, 100000);
58 if (w.stopped_early())
59 t += " [truncated]";
60 return t;
61}
62
63#define ToString DontCallToString // Avoid accidental recursion.
64
65// Visits re before children are processed.
66// Appends ( if needed and passes new precedence to children.
67int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
68 int prec = parent_arg;
69 int nprec = PrecAtom;
70
71 switch (re->op()) {
72 case kRegexpNoMatch:
73 case kRegexpEmptyMatch:
74 case kRegexpLiteral:
75 case kRegexpAnyChar:
76 case kRegexpAnyByte:
77 case kRegexpBeginLine:
78 case kRegexpEndLine:
79 case kRegexpBeginText:
80 case kRegexpEndText:
81 case kRegexpWordBoundary:
82 case kRegexpNoWordBoundary:
83 case kRegexpCharClass:
84 case kRegexpHaveMatch:
85 nprec = PrecAtom;
86 break;
87
88 case kRegexpConcat:
89 case kRegexpLiteralString:
90 if (prec < PrecConcat)
91 t_->append("(?:");
92 nprec = PrecConcat;
93 break;
94
95 case kRegexpAlternate:
96 if (prec < PrecAlternate)
97 t_->append("(?:");
98 nprec = PrecAlternate;
99 break;
100
101 case kRegexpCapture:
102 t_->append("(");
103 if (re->cap() == 0)
104 LOG(DFATAL) << "kRegexpCapture cap() == 0";
105 if (re->name()) {
106 t_->append("?P<");
107 t_->append(*re->name());
108 t_->append(">");
109 }
110 nprec = PrecParen;
111 break;
112
113 case kRegexpStar:
114 case kRegexpPlus:
115 case kRegexpQuest:
116 case kRegexpRepeat:
117 if (prec < PrecUnary)
118 t_->append("(?:");
119 // The subprecedence here is PrecAtom instead of PrecUnary
120 // because PCRE treats two unary ops in a row as a parse error.
121 nprec = PrecAtom;
122 break;
123 }
124
125 return nprec;
126}
127
128static void AppendLiteral(std::string *t, Rune r, bool foldcase) {
129 if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
130 t->append(1, '\\');
131 t->append(1, static_cast<char>(r));
132 } else if (foldcase && 'a' <= r && r <= 'z') {
133 r -= 'a' - 'A';
134 t->append(1, '[');
135 t->append(1, static_cast<char>(r));
136 t->append(1, static_cast<char>(r) + 'a' - 'A');
137 t->append(1, ']');
138 } else {
139 AppendCCRange(t, r, r);
140 }
141}
142
143// Visits re after children are processed.
144// For childless regexps, all the work is done here.
145// For regexps with children, append any unary suffixes or ).
146int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
147 int* child_args, int nchild_args) {
148 int prec = parent_arg;
149 switch (re->op()) {
150 case kRegexpNoMatch:
151 // There's no simple symbol for "no match", but
152 // [^0-Runemax] excludes everything.
153 t_->append("[^\\x00-\\x{10ffff}]");
154 break;
155
156 case kRegexpEmptyMatch:
157 // Append (?:) to make empty string visible,
158 // unless this is already being parenthesized.
159 if (prec < PrecEmpty)
160 t_->append("(?:)");
161 break;
162
163 case kRegexpLiteral:
164 AppendLiteral(t_, re->rune(),
165 (re->parse_flags() & Regexp::FoldCase) != 0);
166 break;
167
168 case kRegexpLiteralString:
169 for (int i = 0; i < re->nrunes(); i++)
170 AppendLiteral(t_, re->runes()[i],
171 (re->parse_flags() & Regexp::FoldCase) != 0);
172 if (prec < PrecConcat)
173 t_->append(")");
174 break;
175
176 case kRegexpConcat:
177 if (prec < PrecConcat)
178 t_->append(")");
179 break;
180
181 case kRegexpAlternate:
182 // Clumsy but workable: the children all appended |
183 // at the end of their strings, so just remove the last one.
184 if ((*t_)[t_->size()-1] == '|')
185 t_->erase(t_->size()-1);
186 else
187 LOG(DFATAL) << "Bad final char: " << t_;
188 if (prec < PrecAlternate)
189 t_->append(")");
190 break;
191
192 case kRegexpStar:
193 t_->append("*");
194 if (re->parse_flags() & Regexp::NonGreedy)
195 t_->append("?");
196 if (prec < PrecUnary)
197 t_->append(")");
198 break;
199
200 case kRegexpPlus:
201 t_->append("+");
202 if (re->parse_flags() & Regexp::NonGreedy)
203 t_->append("?");
204 if (prec < PrecUnary)
205 t_->append(")");
206 break;
207
208 case kRegexpQuest:
209 t_->append("?");
210 if (re->parse_flags() & Regexp::NonGreedy)
211 t_->append("?");
212 if (prec < PrecUnary)
213 t_->append(")");
214 break;
215
216 case kRegexpRepeat:
217 if (re->max() == -1)
218 t_->append(absl::StrFormat("{%d,}", re->min()));
219 else if (re->min() == re->max())
220 t_->append(absl::StrFormat("{%d}", re->min()));
221 else
222 t_->append(absl::StrFormat("{%d,%d}", re->min(), re->max()));
223 if (re->parse_flags() & Regexp::NonGreedy)
224 t_->append("?");
225 if (prec < PrecUnary)
226 t_->append(")");
227 break;
228
229 case kRegexpAnyChar:
230 t_->append(".");
231 break;
232
233 case kRegexpAnyByte:
234 t_->append("\\C");
235 break;
236
237 case kRegexpBeginLine:
238 t_->append("^");
239 break;
240
241 case kRegexpEndLine:
242 t_->append("$");
243 break;
244
245 case kRegexpBeginText:
246 t_->append("(?-m:^)");
247 break;
248
249 case kRegexpEndText:
250 if (re->parse_flags() & Regexp::WasDollar)
251 t_->append("(?-m:$)");
252 else
253 t_->append("\\z");
254 break;
255
256 case kRegexpWordBoundary:
257 t_->append("\\b");
258 break;
259
260 case kRegexpNoWordBoundary:
261 t_->append("\\B");
262 break;
263
264 case kRegexpCharClass: {
265 if (re->cc()->size() == 0) {
266 t_->append("[^\\x00-\\x{10ffff}]");
267 break;
268 }
269 t_->append("[");
270 // Heuristic: show class as negated if it contains the
271 // non-character 0xFFFE and yet somehow isn't full.
272 CharClass* cc = re->cc();
273 if (cc->Contains(0xFFFE) && !cc->full()) {
274 cc = cc->Negate();
275 t_->append("^");
276 }
277 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
278 AppendCCRange(t_, i->lo, i->hi);
279 if (cc != re->cc())
280 cc->Delete();
281 t_->append("]");
282 break;
283 }
284
285 case kRegexpCapture:
286 t_->append(")");
287 break;
288
289 case kRegexpHaveMatch:
290 // There's no syntax accepted by the parser to generate
291 // this node (it is generated by RE2::Set) so make something
292 // up that is readable but won't compile.
293 t_->append(absl::StrFormat("(?HaveMatch:%d)", re->match_id()));
294 break;
295 }
296
297 // If the parent is an alternation, append the | for it.
298 if (prec == PrecAlternate)
299 t_->append("|");
300
301 return 0;
302}
303
304// Appends a rune for use in a character class to the string t.
305static void AppendCCChar(std::string* t, Rune r) {
306 if (0x20 <= r && r <= 0x7E) {
307 if (strchr("[]^-\\", r))
308 t->append("\\");
309 t->append(1, static_cast<char>(r));
310 return;
311 }
312 switch (r) {
313 default:
314 break;
315
316 case '\r':
317 t->append("\\r");
318 return;
319
320 case '\t':
321 t->append("\\t");
322 return;
323
324 case '\n':
325 t->append("\\n");
326 return;
327
328 case '\f':
329 t->append("\\f");
330 return;
331 }
332
333 if (r < 0x100) {
334 *t += absl::StrFormat("\\x%02x", static_cast<int>(r));
335 return;
336 }
337 *t += absl::StrFormat("\\x{%x}", static_cast<int>(r));
338}
339
340static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
341 if (lo > hi)
342 return;
343 AppendCCChar(t, lo);
344 if (lo < hi) {
345 t->append("-");
346 AppendCCChar(t, hi);
347 }
348}
349
350} // namespace re2
351