1// Copyright 2006 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Rewrite POSIX and other features in re
6// to use simple extended regular expression features.
7// Also sort and simplify character classes.
8
9#include <string>
10
11#include "util/logging.h"
12#include "util/utf.h"
13#include "re2/pod_array.h"
14#include "re2/regexp.h"
15#include "re2/walker-inl.h"
16
17namespace re2 {
18
19// Parses the regexp src and then simplifies it and sets *dst to the
20// string representation of the simplified form. Returns true on success.
21// Returns false and sets *error (if error != NULL) on error.
22bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags,
23 std::string* dst, RegexpStatus* status) {
24 Regexp* re = Parse(src, flags, status);
25 if (re == NULL)
26 return false;
27 Regexp* sre = re->Simplify();
28 re->Decref();
29 if (sre == NULL) {
30 if (status) {
31 status->set_code(kRegexpInternalError);
32 status->set_error_arg(src);
33 }
34 return false;
35 }
36 *dst = sre->ToString();
37 sre->Decref();
38 return true;
39}
40
41// Assuming the simple_ flags on the children are accurate,
42// is this Regexp* simple?
43bool Regexp::ComputeSimple() {
44 Regexp** subs;
45 switch (op_) {
46 case kRegexpNoMatch:
47 case kRegexpEmptyMatch:
48 case kRegexpLiteral:
49 case kRegexpLiteralString:
50 case kRegexpBeginLine:
51 case kRegexpEndLine:
52 case kRegexpBeginText:
53 case kRegexpWordBoundary:
54 case kRegexpNoWordBoundary:
55 case kRegexpEndText:
56 case kRegexpAnyChar:
57 case kRegexpAnyByte:
58 case kRegexpHaveMatch:
59 return true;
60 case kRegexpConcat:
61 case kRegexpAlternate:
62 // These are simple as long as the subpieces are simple.
63 subs = sub();
64 for (int i = 0; i < nsub_; i++)
65 if (!subs[i]->simple())
66 return false;
67 return true;
68 case kRegexpCharClass:
69 // Simple as long as the char class is not empty, not full.
70 if (ccb_ != NULL)
71 return !ccb_->empty() && !ccb_->full();
72 return !cc_->empty() && !cc_->full();
73 case kRegexpCapture:
74 subs = sub();
75 return subs[0]->simple();
76 case kRegexpStar:
77 case kRegexpPlus:
78 case kRegexpQuest:
79 subs = sub();
80 if (!subs[0]->simple())
81 return false;
82 switch (subs[0]->op_) {
83 case kRegexpStar:
84 case kRegexpPlus:
85 case kRegexpQuest:
86 case kRegexpEmptyMatch:
87 case kRegexpNoMatch:
88 return false;
89 default:
90 break;
91 }
92 return true;
93 case kRegexpRepeat:
94 return false;
95 }
96 LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
97 return false;
98}
99
100// Walker subclass used by Simplify.
101// Coalesces runs of star/plus/quest/repeat of the same literal along with any
102// occurrences of that literal into repeats of that literal. It also works for
103// char classes, any char and any byte.
104// PostVisit creates the coalesced result, which should then be simplified.
105class CoalesceWalker : public Regexp::Walker<Regexp*> {
106 public:
107 CoalesceWalker() {}
108 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
109 Regexp** child_args, int nchild_args);
110 virtual Regexp* Copy(Regexp* re);
111 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
112
113 private:
114 // These functions are declared inside CoalesceWalker so that
115 // they can edit the private fields of the Regexps they construct.
116
117 // Returns true if r1 and r2 can be coalesced. In particular, ensures that
118 // the parse flags are consistent. (They will not be checked again later.)
119 static bool CanCoalesce(Regexp* r1, Regexp* r2);
120
121 // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
122 // will be empty match and the coalesced op. In other cases, where part of a
123 // literal string was removed to be coalesced, the array elements afterwards
124 // will be the coalesced op and the remainder of the literal string.
125 static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
126
127 CoalesceWalker(const CoalesceWalker&) = delete;
128 CoalesceWalker& operator=(const CoalesceWalker&) = delete;
129};
130
131// Walker subclass used by Simplify.
132// The simplify walk is purely post-recursive: given the simplified children,
133// PostVisit creates the simplified result.
134// The child_args are simplified Regexp*s.
135class SimplifyWalker : public Regexp::Walker<Regexp*> {
136 public:
137 SimplifyWalker() {}
138 virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
139 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
140 Regexp** child_args, int nchild_args);
141 virtual Regexp* Copy(Regexp* re);
142 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
143
144 private:
145 // These functions are declared inside SimplifyWalker so that
146 // they can edit the private fields of the Regexps they construct.
147
148 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
149 // Caller must Decref return value when done with it.
150 static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
151
152 // Simplifies the expression re{min,max} in terms of *, +, and ?.
153 // Returns a new regexp. Does not edit re. Does not consume reference to re.
154 // Caller must Decref return value when done with it.
155 static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
156 Regexp::ParseFlags parse_flags);
157
158 // Simplifies a character class by expanding any named classes
159 // into rune ranges. Does not edit re. Does not consume ref to re.
160 // Caller must Decref return value when done with it.
161 static Regexp* SimplifyCharClass(Regexp* re);
162
163 SimplifyWalker(const SimplifyWalker&) = delete;
164 SimplifyWalker& operator=(const SimplifyWalker&) = delete;
165};
166
167// Simplifies a regular expression, returning a new regexp.
168// The new regexp uses traditional Unix egrep features only,
169// plus the Perl (?:) non-capturing parentheses.
170// Otherwise, no POSIX or Perl additions. The new regexp
171// captures exactly the same subexpressions (with the same indices)
172// as the original.
173// Does not edit current object.
174// Caller must Decref() return value when done with it.
175
176Regexp* Regexp::Simplify() {
177 CoalesceWalker cw;
178 Regexp* cre = cw.Walk(this, NULL);
179 if (cre == NULL)
180 return NULL;
181 if (cw.stopped_early()) {
182 cre->Decref();
183 return NULL;
184 }
185 SimplifyWalker sw;
186 Regexp* sre = sw.Walk(cre, NULL);
187 cre->Decref();
188 if (sre == NULL)
189 return NULL;
190 if (sw.stopped_early()) {
191 sre->Decref();
192 return NULL;
193 }
194 return sre;
195}
196
197#define Simplify DontCallSimplify // Avoid accidental recursion
198
199// Utility function for PostVisit implementations that compares re->sub() with
200// child_args to determine whether any child_args changed. In the common case,
201// where nothing changed, calls Decref() for all child_args and returns false,
202// so PostVisit must return re->Incref(). Otherwise, returns true.
203static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
204 for (int i = 0; i < re->nsub(); i++) {
205 Regexp* sub = re->sub()[i];
206 Regexp* newsub = child_args[i];
207 if (newsub != sub)
208 return true;
209 }
210 for (int i = 0; i < re->nsub(); i++) {
211 Regexp* newsub = child_args[i];
212 newsub->Decref();
213 }
214 return false;
215}
216
217Regexp* CoalesceWalker::Copy(Regexp* re) {
218 return re->Incref();
219}
220
221Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
222 // Should never be called: we use Walk(), not WalkExponential().
223#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
224 LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
225#endif
226 return re->Incref();
227}
228
229Regexp* CoalesceWalker::PostVisit(Regexp* re,
230 Regexp* parent_arg,
231 Regexp* pre_arg,
232 Regexp** child_args,
233 int nchild_args) {
234 if (re->nsub() == 0)
235 return re->Incref();
236
237 if (re->op() != kRegexpConcat) {
238 if (!ChildArgsChanged(re, child_args))
239 return re->Incref();
240
241 // Something changed. Build a new op.
242 Regexp* nre = new Regexp(re->op(), re->parse_flags());
243 nre->AllocSub(re->nsub());
244 Regexp** nre_subs = nre->sub();
245 for (int i = 0; i < re->nsub(); i++)
246 nre_subs[i] = child_args[i];
247 // Repeats and Captures have additional data that must be copied.
248 if (re->op() == kRegexpRepeat) {
249 nre->min_ = re->min();
250 nre->max_ = re->max();
251 } else if (re->op() == kRegexpCapture) {
252 nre->cap_ = re->cap();
253 }
254 return nre;
255 }
256
257 bool can_coalesce = false;
258 for (int i = 0; i < re->nsub(); i++) {
259 if (i+1 < re->nsub() &&
260 CanCoalesce(child_args[i], child_args[i+1])) {
261 can_coalesce = true;
262 break;
263 }
264 }
265 if (!can_coalesce) {
266 if (!ChildArgsChanged(re, child_args))
267 return re->Incref();
268
269 // Something changed. Build a new op.
270 Regexp* nre = new Regexp(re->op(), re->parse_flags());
271 nre->AllocSub(re->nsub());
272 Regexp** nre_subs = nre->sub();
273 for (int i = 0; i < re->nsub(); i++)
274 nre_subs[i] = child_args[i];
275 return nre;
276 }
277
278 for (int i = 0; i < re->nsub(); i++) {
279 if (i+1 < re->nsub() &&
280 CanCoalesce(child_args[i], child_args[i+1]))
281 DoCoalesce(&child_args[i], &child_args[i+1]);
282 }
283 // Determine how many empty matches were left by DoCoalesce.
284 int n = 0;
285 for (int i = n; i < re->nsub(); i++) {
286 if (child_args[i]->op() == kRegexpEmptyMatch)
287 n++;
288 }
289 // Build a new op.
290 Regexp* nre = new Regexp(re->op(), re->parse_flags());
291 nre->AllocSub(re->nsub() - n);
292 Regexp** nre_subs = nre->sub();
293 for (int i = 0, j = 0; i < re->nsub(); i++) {
294 if (child_args[i]->op() == kRegexpEmptyMatch) {
295 child_args[i]->Decref();
296 continue;
297 }
298 nre_subs[j] = child_args[i];
299 j++;
300 }
301 return nre;
302}
303
304bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
305 // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
306 // any byte.
307 if ((r1->op() == kRegexpStar ||
308 r1->op() == kRegexpPlus ||
309 r1->op() == kRegexpQuest ||
310 r1->op() == kRegexpRepeat) &&
311 (r1->sub()[0]->op() == kRegexpLiteral ||
312 r1->sub()[0]->op() == kRegexpCharClass ||
313 r1->sub()[0]->op() == kRegexpAnyChar ||
314 r1->sub()[0]->op() == kRegexpAnyByte)) {
315 // r2 must be a star/plus/quest/repeat of the same literal, char class,
316 // any char or any byte.
317 if ((r2->op() == kRegexpStar ||
318 r2->op() == kRegexpPlus ||
319 r2->op() == kRegexpQuest ||
320 r2->op() == kRegexpRepeat) &&
321 Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
322 // The parse flags must be consistent.
323 ((r1->parse_flags() & Regexp::NonGreedy) ==
324 (r2->parse_flags() & Regexp::NonGreedy))) {
325 return true;
326 }
327 // ... OR an occurrence of that literal, char class, any char or any byte
328 if (Regexp::Equal(r1->sub()[0], r2)) {
329 return true;
330 }
331 // ... OR a literal string that begins with that literal.
332 if (r1->sub()[0]->op() == kRegexpLiteral &&
333 r2->op() == kRegexpLiteralString &&
334 r2->runes()[0] == r1->sub()[0]->rune() &&
335 // The parse flags must be consistent.
336 ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
337 (r2->parse_flags() & Regexp::FoldCase))) {
338 return true;
339 }
340 }
341 return false;
342}
343
344void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
345 Regexp* r1 = *r1ptr;
346 Regexp* r2 = *r2ptr;
347
348 Regexp* nre = Regexp::Repeat(
349 r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
350
351 switch (r1->op()) {
352 case kRegexpStar:
353 nre->min_ = 0;
354 nre->max_ = -1;
355 break;
356
357 case kRegexpPlus:
358 nre->min_ = 1;
359 nre->max_ = -1;
360 break;
361
362 case kRegexpQuest:
363 nre->min_ = 0;
364 nre->max_ = 1;
365 break;
366
367 case kRegexpRepeat:
368 nre->min_ = r1->min();
369 nre->max_ = r1->max();
370 break;
371
372 default:
373 LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
374 nre->Decref();
375 return;
376 }
377
378 switch (r2->op()) {
379 case kRegexpStar:
380 nre->max_ = -1;
381 goto LeaveEmpty;
382
383 case kRegexpPlus:
384 nre->min_++;
385 nre->max_ = -1;
386 goto LeaveEmpty;
387
388 case kRegexpQuest:
389 if (nre->max() != -1)
390 nre->max_++;
391 goto LeaveEmpty;
392
393 case kRegexpRepeat:
394 nre->min_ += r2->min();
395 if (r2->max() == -1)
396 nre->max_ = -1;
397 else if (nre->max() != -1)
398 nre->max_ += r2->max();
399 goto LeaveEmpty;
400
401 case kRegexpLiteral:
402 case kRegexpCharClass:
403 case kRegexpAnyChar:
404 case kRegexpAnyByte:
405 nre->min_++;
406 if (nre->max() != -1)
407 nre->max_++;
408 goto LeaveEmpty;
409
410 LeaveEmpty:
411 *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
412 *r2ptr = nre;
413 break;
414
415 case kRegexpLiteralString: {
416 Rune r = r1->sub()[0]->rune();
417 // Determine how much of the literal string is removed.
418 // We know that we have at least one rune. :)
419 int n = 1;
420 while (n < r2->nrunes() && r2->runes()[n] == r)
421 n++;
422 nre->min_ += n;
423 if (nre->max() != -1)
424 nre->max_ += n;
425 if (n == r2->nrunes())
426 goto LeaveEmpty;
427 *r1ptr = nre;
428 *r2ptr = Regexp::LiteralString(
429 &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
430 break;
431 }
432
433 default:
434 LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
435 nre->Decref();
436 return;
437 }
438
439 r1->Decref();
440 r2->Decref();
441}
442
443Regexp* SimplifyWalker::Copy(Regexp* re) {
444 return re->Incref();
445}
446
447Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
448 // Should never be called: we use Walk(), not WalkExponential().
449#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
450 LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
451#endif
452 return re->Incref();
453}
454
455Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
456 if (re->simple()) {
457 *stop = true;
458 return re->Incref();
459 }
460 return NULL;
461}
462
463Regexp* SimplifyWalker::PostVisit(Regexp* re,
464 Regexp* parent_arg,
465 Regexp* pre_arg,
466 Regexp** child_args,
467 int nchild_args) {
468 switch (re->op()) {
469 case kRegexpNoMatch:
470 case kRegexpEmptyMatch:
471 case kRegexpLiteral:
472 case kRegexpLiteralString:
473 case kRegexpBeginLine:
474 case kRegexpEndLine:
475 case kRegexpBeginText:
476 case kRegexpWordBoundary:
477 case kRegexpNoWordBoundary:
478 case kRegexpEndText:
479 case kRegexpAnyChar:
480 case kRegexpAnyByte:
481 case kRegexpHaveMatch:
482 // All these are always simple.
483 re->simple_ = true;
484 return re->Incref();
485
486 case kRegexpConcat:
487 case kRegexpAlternate: {
488 // These are simple as long as the subpieces are simple.
489 if (!ChildArgsChanged(re, child_args)) {
490 re->simple_ = true;
491 return re->Incref();
492 }
493 Regexp* nre = new Regexp(re->op(), re->parse_flags());
494 nre->AllocSub(re->nsub());
495 Regexp** nre_subs = nre->sub();
496 for (int i = 0; i < re->nsub(); i++)
497 nre_subs[i] = child_args[i];
498 nre->simple_ = true;
499 return nre;
500 }
501
502 case kRegexpCapture: {
503 Regexp* newsub = child_args[0];
504 if (newsub == re->sub()[0]) {
505 newsub->Decref();
506 re->simple_ = true;
507 return re->Incref();
508 }
509 Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
510 nre->AllocSub(1);
511 nre->sub()[0] = newsub;
512 nre->cap_ = re->cap();
513 nre->simple_ = true;
514 return nre;
515 }
516
517 case kRegexpStar:
518 case kRegexpPlus:
519 case kRegexpQuest: {
520 Regexp* newsub = child_args[0];
521 // Special case: repeat the empty string as much as
522 // you want, but it's still the empty string.
523 if (newsub->op() == kRegexpEmptyMatch)
524 return newsub;
525
526 // These are simple as long as the subpiece is simple.
527 if (newsub == re->sub()[0]) {
528 newsub->Decref();
529 re->simple_ = true;
530 return re->Incref();
531 }
532
533 // These are also idempotent if flags are constant.
534 if (re->op() == newsub->op() &&
535 re->parse_flags() == newsub->parse_flags())
536 return newsub;
537
538 Regexp* nre = new Regexp(re->op(), re->parse_flags());
539 nre->AllocSub(1);
540 nre->sub()[0] = newsub;
541 nre->simple_ = true;
542 return nre;
543 }
544
545 case kRegexpRepeat: {
546 Regexp* newsub = child_args[0];
547 // Special case: repeat the empty string as much as
548 // you want, but it's still the empty string.
549 if (newsub->op() == kRegexpEmptyMatch)
550 return newsub;
551
552 Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
553 re->parse_flags());
554 newsub->Decref();
555 nre->simple_ = true;
556 return nre;
557 }
558
559 case kRegexpCharClass: {
560 Regexp* nre = SimplifyCharClass(re);
561 nre->simple_ = true;
562 return nre;
563 }
564 }
565
566 LOG(ERROR) << "Simplify case not handled: " << re->op();
567 return re->Incref();
568}
569
570// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
571// Returns a new Regexp, handing the ref to the caller.
572Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
573 Regexp::ParseFlags parse_flags) {
574 Regexp* re = new Regexp(kRegexpConcat, parse_flags);
575 re->AllocSub(2);
576 Regexp** subs = re->sub();
577 subs[0] = re1;
578 subs[1] = re2;
579 return re;
580}
581
582// Simplifies the expression re{min,max} in terms of *, +, and ?.
583// Returns a new regexp. Does not edit re. Does not consume reference to re.
584// Caller must Decref return value when done with it.
585// The result will *not* necessarily have the right capturing parens
586// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
587// but in the Regexp* representation, both (x) are marked as $1.
588Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
589 Regexp::ParseFlags f) {
590 // x{n,} means at least n matches of x.
591 if (max == -1) {
592 // Special case: x{0,} is x*
593 if (min == 0)
594 return Regexp::Star(re->Incref(), f);
595
596 // Special case: x{1,} is x+
597 if (min == 1)
598 return Regexp::Plus(re->Incref(), f);
599
600 // General case: x{4,} is xxxx+
601 PODArray<Regexp*> nre_subs(min);
602 for (int i = 0; i < min-1; i++)
603 nre_subs[i] = re->Incref();
604 nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
605 return Regexp::Concat(nre_subs.data(), min, f);
606 }
607
608 // Special case: (x){0} matches only empty string.
609 if (min == 0 && max == 0)
610 return new Regexp(kRegexpEmptyMatch, f);
611
612 // Special case: x{1} is just x.
613 if (min == 1 && max == 1)
614 return re->Incref();
615
616 // General case: x{n,m} means n copies of x and m copies of x?.
617 // The machine will do less work if we nest the final m copies,
618 // so that x{2,5} = xx(x(x(x)?)?)?
619
620 // Build leading prefix: xx. Capturing only on the last one.
621 Regexp* nre = NULL;
622 if (min > 0) {
623 PODArray<Regexp*> nre_subs(min);
624 for (int i = 0; i < min; i++)
625 nre_subs[i] = re->Incref();
626 nre = Regexp::Concat(nre_subs.data(), min, f);
627 }
628
629 // Build and attach suffix: (x(x(x)?)?)?
630 if (max > min) {
631 Regexp* suf = Regexp::Quest(re->Incref(), f);
632 for (int i = min+1; i < max; i++)
633 suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
634 if (nre == NULL)
635 nre = suf;
636 else
637 nre = Concat2(nre, suf, f);
638 }
639
640 if (nre == NULL) {
641 // Some degenerate case, like min > max, or min < max < 0.
642 // This shouldn't happen, because the parser rejects such regexps.
643 LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
644 return new Regexp(kRegexpNoMatch, f);
645 }
646
647 return nre;
648}
649
650// Simplifies a character class.
651// Caller must Decref return value when done with it.
652Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
653 CharClass* cc = re->cc();
654
655 // Special cases
656 if (cc->empty())
657 return new Regexp(kRegexpNoMatch, re->parse_flags());
658 if (cc->full())
659 return new Regexp(kRegexpAnyChar, re->parse_flags());
660
661 return re->Incref();
662}
663
664} // namespace re2
665