1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | // Use of this source code is governed by a BSD-style |
3 | // license that can be found in the LICENSE file. |
4 | |
5 | // Rewrite POSIX and other features in re |
6 | // to use simple extended regular expression features. |
7 | // Also sort and simplify character classes. |
8 | |
9 | #include <string> |
10 | |
11 | #include "util/logging.h" |
12 | #include "util/utf.h" |
13 | #include "re2/pod_array.h" |
14 | #include "re2/regexp.h" |
15 | #include "re2/walker-inl.h" |
16 | |
17 | namespace re2 { |
18 | |
19 | // Parses the regexp src and then simplifies it and sets *dst to the |
20 | // string representation of the simplified form. Returns true on success. |
21 | // Returns false and sets *error (if error != NULL) on error. |
22 | bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags, |
23 | std::string* dst, RegexpStatus* status) { |
24 | Regexp* re = Parse(src, flags, status); |
25 | if (re == NULL) |
26 | return false; |
27 | Regexp* sre = re->Simplify(); |
28 | re->Decref(); |
29 | if (sre == NULL) { |
30 | if (status) { |
31 | status->set_code(kRegexpInternalError); |
32 | status->set_error_arg(src); |
33 | } |
34 | return false; |
35 | } |
36 | *dst = sre->ToString(); |
37 | sre->Decref(); |
38 | return true; |
39 | } |
40 | |
41 | // Assuming the simple_ flags on the children are accurate, |
42 | // is this Regexp* simple? |
43 | bool Regexp::ComputeSimple() { |
44 | Regexp** subs; |
45 | switch (op_) { |
46 | case kRegexpNoMatch: |
47 | case kRegexpEmptyMatch: |
48 | case kRegexpLiteral: |
49 | case kRegexpLiteralString: |
50 | case kRegexpBeginLine: |
51 | case kRegexpEndLine: |
52 | case kRegexpBeginText: |
53 | case kRegexpWordBoundary: |
54 | case kRegexpNoWordBoundary: |
55 | case kRegexpEndText: |
56 | case kRegexpAnyChar: |
57 | case kRegexpAnyByte: |
58 | case kRegexpHaveMatch: |
59 | return true; |
60 | case kRegexpConcat: |
61 | case kRegexpAlternate: |
62 | // These are simple as long as the subpieces are simple. |
63 | subs = sub(); |
64 | for (int i = 0; i < nsub_; i++) |
65 | if (!subs[i]->simple()) |
66 | return false; |
67 | return true; |
68 | case kRegexpCharClass: |
69 | // Simple as long as the char class is not empty, not full. |
70 | if (ccb_ != NULL) |
71 | return !ccb_->empty() && !ccb_->full(); |
72 | return !cc_->empty() && !cc_->full(); |
73 | case kRegexpCapture: |
74 | subs = sub(); |
75 | return subs[0]->simple(); |
76 | case kRegexpStar: |
77 | case kRegexpPlus: |
78 | case kRegexpQuest: |
79 | subs = sub(); |
80 | if (!subs[0]->simple()) |
81 | return false; |
82 | switch (subs[0]->op_) { |
83 | case kRegexpStar: |
84 | case kRegexpPlus: |
85 | case kRegexpQuest: |
86 | case kRegexpEmptyMatch: |
87 | case kRegexpNoMatch: |
88 | return false; |
89 | default: |
90 | break; |
91 | } |
92 | return true; |
93 | case kRegexpRepeat: |
94 | return false; |
95 | } |
96 | LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; |
97 | return false; |
98 | } |
99 | |
100 | // Walker subclass used by Simplify. |
101 | // Coalesces runs of star/plus/quest/repeat of the same literal along with any |
102 | // occurrences of that literal into repeats of that literal. It also works for |
103 | // char classes, any char and any byte. |
104 | // PostVisit creates the coalesced result, which should then be simplified. |
105 | class CoalesceWalker : public Regexp::Walker<Regexp*> { |
106 | public: |
107 | CoalesceWalker() {} |
108 | virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, |
109 | Regexp** child_args, int nchild_args); |
110 | virtual Regexp* Copy(Regexp* re); |
111 | virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); |
112 | |
113 | private: |
114 | // These functions are declared inside CoalesceWalker so that |
115 | // they can edit the private fields of the Regexps they construct. |
116 | |
117 | // Returns true if r1 and r2 can be coalesced. In particular, ensures that |
118 | // the parse flags are consistent. (They will not be checked again later.) |
119 | static bool CanCoalesce(Regexp* r1, Regexp* r2); |
120 | |
121 | // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards |
122 | // will be empty match and the coalesced op. In other cases, where part of a |
123 | // literal string was removed to be coalesced, the array elements afterwards |
124 | // will be the coalesced op and the remainder of the literal string. |
125 | static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr); |
126 | |
127 | CoalesceWalker(const CoalesceWalker&) = delete; |
128 | CoalesceWalker& operator=(const CoalesceWalker&) = delete; |
129 | }; |
130 | |
131 | // Walker subclass used by Simplify. |
132 | // The simplify walk is purely post-recursive: given the simplified children, |
133 | // PostVisit creates the simplified result. |
134 | // The child_args are simplified Regexp*s. |
135 | class SimplifyWalker : public Regexp::Walker<Regexp*> { |
136 | public: |
137 | SimplifyWalker() {} |
138 | virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); |
139 | virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, |
140 | Regexp** child_args, int nchild_args); |
141 | virtual Regexp* Copy(Regexp* re); |
142 | virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); |
143 | |
144 | private: |
145 | // These functions are declared inside SimplifyWalker so that |
146 | // they can edit the private fields of the Regexps they construct. |
147 | |
148 | // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
149 | // Caller must Decref return value when done with it. |
150 | static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); |
151 | |
152 | // Simplifies the expression re{min,max} in terms of *, +, and ?. |
153 | // Returns a new regexp. Does not edit re. Does not consume reference to re. |
154 | // Caller must Decref return value when done with it. |
155 | static Regexp* SimplifyRepeat(Regexp* re, int min, int max, |
156 | Regexp::ParseFlags parse_flags); |
157 | |
158 | // Simplifies a character class by expanding any named classes |
159 | // into rune ranges. Does not edit re. Does not consume ref to re. |
160 | // Caller must Decref return value when done with it. |
161 | static Regexp* SimplifyCharClass(Regexp* re); |
162 | |
163 | SimplifyWalker(const SimplifyWalker&) = delete; |
164 | SimplifyWalker& operator=(const SimplifyWalker&) = delete; |
165 | }; |
166 | |
167 | // Simplifies a regular expression, returning a new regexp. |
168 | // The new regexp uses traditional Unix egrep features only, |
169 | // plus the Perl (?:) non-capturing parentheses. |
170 | // Otherwise, no POSIX or Perl additions. The new regexp |
171 | // captures exactly the same subexpressions (with the same indices) |
172 | // as the original. |
173 | // Does not edit current object. |
174 | // Caller must Decref() return value when done with it. |
175 | |
176 | Regexp* Regexp::Simplify() { |
177 | CoalesceWalker cw; |
178 | Regexp* cre = cw.Walk(this, NULL); |
179 | if (cre == NULL) |
180 | return NULL; |
181 | if (cw.stopped_early()) { |
182 | cre->Decref(); |
183 | return NULL; |
184 | } |
185 | SimplifyWalker sw; |
186 | Regexp* sre = sw.Walk(cre, NULL); |
187 | cre->Decref(); |
188 | if (sre == NULL) |
189 | return NULL; |
190 | if (sw.stopped_early()) { |
191 | sre->Decref(); |
192 | return NULL; |
193 | } |
194 | return sre; |
195 | } |
196 | |
197 | #define Simplify DontCallSimplify // Avoid accidental recursion |
198 | |
199 | // Utility function for PostVisit implementations that compares re->sub() with |
200 | // child_args to determine whether any child_args changed. In the common case, |
201 | // where nothing changed, calls Decref() for all child_args and returns false, |
202 | // so PostVisit must return re->Incref(). Otherwise, returns true. |
203 | static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { |
204 | for (int i = 0; i < re->nsub(); i++) { |
205 | Regexp* sub = re->sub()[i]; |
206 | Regexp* newsub = child_args[i]; |
207 | if (newsub != sub) |
208 | return true; |
209 | } |
210 | for (int i = 0; i < re->nsub(); i++) { |
211 | Regexp* newsub = child_args[i]; |
212 | newsub->Decref(); |
213 | } |
214 | return false; |
215 | } |
216 | |
217 | Regexp* CoalesceWalker::Copy(Regexp* re) { |
218 | return re->Incref(); |
219 | } |
220 | |
221 | Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { |
222 | // Should never be called: we use Walk(), not WalkExponential(). |
223 | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
224 | LOG(DFATAL) << "CoalesceWalker::ShortVisit called" ; |
225 | #endif |
226 | return re->Incref(); |
227 | } |
228 | |
229 | Regexp* CoalesceWalker::PostVisit(Regexp* re, |
230 | Regexp* parent_arg, |
231 | Regexp* pre_arg, |
232 | Regexp** child_args, |
233 | int nchild_args) { |
234 | if (re->nsub() == 0) |
235 | return re->Incref(); |
236 | |
237 | if (re->op() != kRegexpConcat) { |
238 | if (!ChildArgsChanged(re, child_args)) |
239 | return re->Incref(); |
240 | |
241 | // Something changed. Build a new op. |
242 | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
243 | nre->AllocSub(re->nsub()); |
244 | Regexp** nre_subs = nre->sub(); |
245 | for (int i = 0; i < re->nsub(); i++) |
246 | nre_subs[i] = child_args[i]; |
247 | // Repeats and Captures have additional data that must be copied. |
248 | if (re->op() == kRegexpRepeat) { |
249 | nre->min_ = re->min(); |
250 | nre->max_ = re->max(); |
251 | } else if (re->op() == kRegexpCapture) { |
252 | nre->cap_ = re->cap(); |
253 | } |
254 | return nre; |
255 | } |
256 | |
257 | bool can_coalesce = false; |
258 | for (int i = 0; i < re->nsub(); i++) { |
259 | if (i+1 < re->nsub() && |
260 | CanCoalesce(child_args[i], child_args[i+1])) { |
261 | can_coalesce = true; |
262 | break; |
263 | } |
264 | } |
265 | if (!can_coalesce) { |
266 | if (!ChildArgsChanged(re, child_args)) |
267 | return re->Incref(); |
268 | |
269 | // Something changed. Build a new op. |
270 | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
271 | nre->AllocSub(re->nsub()); |
272 | Regexp** nre_subs = nre->sub(); |
273 | for (int i = 0; i < re->nsub(); i++) |
274 | nre_subs[i] = child_args[i]; |
275 | return nre; |
276 | } |
277 | |
278 | for (int i = 0; i < re->nsub(); i++) { |
279 | if (i+1 < re->nsub() && |
280 | CanCoalesce(child_args[i], child_args[i+1])) |
281 | DoCoalesce(&child_args[i], &child_args[i+1]); |
282 | } |
283 | // Determine how many empty matches were left by DoCoalesce. |
284 | int n = 0; |
285 | for (int i = n; i < re->nsub(); i++) { |
286 | if (child_args[i]->op() == kRegexpEmptyMatch) |
287 | n++; |
288 | } |
289 | // Build a new op. |
290 | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
291 | nre->AllocSub(re->nsub() - n); |
292 | Regexp** nre_subs = nre->sub(); |
293 | for (int i = 0, j = 0; i < re->nsub(); i++) { |
294 | if (child_args[i]->op() == kRegexpEmptyMatch) { |
295 | child_args[i]->Decref(); |
296 | continue; |
297 | } |
298 | nre_subs[j] = child_args[i]; |
299 | j++; |
300 | } |
301 | return nre; |
302 | } |
303 | |
304 | bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { |
305 | // r1 must be a star/plus/quest/repeat of a literal, char class, any char or |
306 | // any byte. |
307 | if ((r1->op() == kRegexpStar || |
308 | r1->op() == kRegexpPlus || |
309 | r1->op() == kRegexpQuest || |
310 | r1->op() == kRegexpRepeat) && |
311 | (r1->sub()[0]->op() == kRegexpLiteral || |
312 | r1->sub()[0]->op() == kRegexpCharClass || |
313 | r1->sub()[0]->op() == kRegexpAnyChar || |
314 | r1->sub()[0]->op() == kRegexpAnyByte)) { |
315 | // r2 must be a star/plus/quest/repeat of the same literal, char class, |
316 | // any char or any byte. |
317 | if ((r2->op() == kRegexpStar || |
318 | r2->op() == kRegexpPlus || |
319 | r2->op() == kRegexpQuest || |
320 | r2->op() == kRegexpRepeat) && |
321 | Regexp::Equal(r1->sub()[0], r2->sub()[0]) && |
322 | // The parse flags must be consistent. |
323 | ((r1->parse_flags() & Regexp::NonGreedy) == |
324 | (r2->parse_flags() & Regexp::NonGreedy))) { |
325 | return true; |
326 | } |
327 | // ... OR an occurrence of that literal, char class, any char or any byte |
328 | if (Regexp::Equal(r1->sub()[0], r2)) { |
329 | return true; |
330 | } |
331 | // ... OR a literal string that begins with that literal. |
332 | if (r1->sub()[0]->op() == kRegexpLiteral && |
333 | r2->op() == kRegexpLiteralString && |
334 | r2->runes()[0] == r1->sub()[0]->rune() && |
335 | // The parse flags must be consistent. |
336 | ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == |
337 | (r2->parse_flags() & Regexp::FoldCase))) { |
338 | return true; |
339 | } |
340 | } |
341 | return false; |
342 | } |
343 | |
344 | void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { |
345 | Regexp* r1 = *r1ptr; |
346 | Regexp* r2 = *r2ptr; |
347 | |
348 | Regexp* nre = Regexp::Repeat( |
349 | r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0); |
350 | |
351 | switch (r1->op()) { |
352 | case kRegexpStar: |
353 | nre->min_ = 0; |
354 | nre->max_ = -1; |
355 | break; |
356 | |
357 | case kRegexpPlus: |
358 | nre->min_ = 1; |
359 | nre->max_ = -1; |
360 | break; |
361 | |
362 | case kRegexpQuest: |
363 | nre->min_ = 0; |
364 | nre->max_ = 1; |
365 | break; |
366 | |
367 | case kRegexpRepeat: |
368 | nre->min_ = r1->min(); |
369 | nre->max_ = r1->max(); |
370 | break; |
371 | |
372 | default: |
373 | LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); |
374 | nre->Decref(); |
375 | return; |
376 | } |
377 | |
378 | switch (r2->op()) { |
379 | case kRegexpStar: |
380 | nre->max_ = -1; |
381 | goto LeaveEmpty; |
382 | |
383 | case kRegexpPlus: |
384 | nre->min_++; |
385 | nre->max_ = -1; |
386 | goto LeaveEmpty; |
387 | |
388 | case kRegexpQuest: |
389 | if (nre->max() != -1) |
390 | nre->max_++; |
391 | goto LeaveEmpty; |
392 | |
393 | case kRegexpRepeat: |
394 | nre->min_ += r2->min(); |
395 | if (r2->max() == -1) |
396 | nre->max_ = -1; |
397 | else if (nre->max() != -1) |
398 | nre->max_ += r2->max(); |
399 | goto LeaveEmpty; |
400 | |
401 | case kRegexpLiteral: |
402 | case kRegexpCharClass: |
403 | case kRegexpAnyChar: |
404 | case kRegexpAnyByte: |
405 | nre->min_++; |
406 | if (nre->max() != -1) |
407 | nre->max_++; |
408 | goto LeaveEmpty; |
409 | |
410 | LeaveEmpty: |
411 | *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags); |
412 | *r2ptr = nre; |
413 | break; |
414 | |
415 | case kRegexpLiteralString: { |
416 | Rune r = r1->sub()[0]->rune(); |
417 | // Determine how much of the literal string is removed. |
418 | // We know that we have at least one rune. :) |
419 | int n = 1; |
420 | while (n < r2->nrunes() && r2->runes()[n] == r) |
421 | n++; |
422 | nre->min_ += n; |
423 | if (nre->max() != -1) |
424 | nre->max_ += n; |
425 | if (n == r2->nrunes()) |
426 | goto LeaveEmpty; |
427 | *r1ptr = nre; |
428 | *r2ptr = Regexp::LiteralString( |
429 | &r2->runes()[n], r2->nrunes() - n, r2->parse_flags()); |
430 | break; |
431 | } |
432 | |
433 | default: |
434 | LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); |
435 | nre->Decref(); |
436 | return; |
437 | } |
438 | |
439 | r1->Decref(); |
440 | r2->Decref(); |
441 | } |
442 | |
443 | Regexp* SimplifyWalker::Copy(Regexp* re) { |
444 | return re->Incref(); |
445 | } |
446 | |
447 | Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { |
448 | // Should never be called: we use Walk(), not WalkExponential(). |
449 | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
450 | LOG(DFATAL) << "SimplifyWalker::ShortVisit called" ; |
451 | #endif |
452 | return re->Incref(); |
453 | } |
454 | |
455 | Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { |
456 | if (re->simple()) { |
457 | *stop = true; |
458 | return re->Incref(); |
459 | } |
460 | return NULL; |
461 | } |
462 | |
463 | Regexp* SimplifyWalker::PostVisit(Regexp* re, |
464 | Regexp* parent_arg, |
465 | Regexp* pre_arg, |
466 | Regexp** child_args, |
467 | int nchild_args) { |
468 | switch (re->op()) { |
469 | case kRegexpNoMatch: |
470 | case kRegexpEmptyMatch: |
471 | case kRegexpLiteral: |
472 | case kRegexpLiteralString: |
473 | case kRegexpBeginLine: |
474 | case kRegexpEndLine: |
475 | case kRegexpBeginText: |
476 | case kRegexpWordBoundary: |
477 | case kRegexpNoWordBoundary: |
478 | case kRegexpEndText: |
479 | case kRegexpAnyChar: |
480 | case kRegexpAnyByte: |
481 | case kRegexpHaveMatch: |
482 | // All these are always simple. |
483 | re->simple_ = true; |
484 | return re->Incref(); |
485 | |
486 | case kRegexpConcat: |
487 | case kRegexpAlternate: { |
488 | // These are simple as long as the subpieces are simple. |
489 | if (!ChildArgsChanged(re, child_args)) { |
490 | re->simple_ = true; |
491 | return re->Incref(); |
492 | } |
493 | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
494 | nre->AllocSub(re->nsub()); |
495 | Regexp** nre_subs = nre->sub(); |
496 | for (int i = 0; i < re->nsub(); i++) |
497 | nre_subs[i] = child_args[i]; |
498 | nre->simple_ = true; |
499 | return nre; |
500 | } |
501 | |
502 | case kRegexpCapture: { |
503 | Regexp* newsub = child_args[0]; |
504 | if (newsub == re->sub()[0]) { |
505 | newsub->Decref(); |
506 | re->simple_ = true; |
507 | return re->Incref(); |
508 | } |
509 | Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); |
510 | nre->AllocSub(1); |
511 | nre->sub()[0] = newsub; |
512 | nre->cap_ = re->cap(); |
513 | nre->simple_ = true; |
514 | return nre; |
515 | } |
516 | |
517 | case kRegexpStar: |
518 | case kRegexpPlus: |
519 | case kRegexpQuest: { |
520 | Regexp* newsub = child_args[0]; |
521 | // Special case: repeat the empty string as much as |
522 | // you want, but it's still the empty string. |
523 | if (newsub->op() == kRegexpEmptyMatch) |
524 | return newsub; |
525 | |
526 | // These are simple as long as the subpiece is simple. |
527 | if (newsub == re->sub()[0]) { |
528 | newsub->Decref(); |
529 | re->simple_ = true; |
530 | return re->Incref(); |
531 | } |
532 | |
533 | // These are also idempotent if flags are constant. |
534 | if (re->op() == newsub->op() && |
535 | re->parse_flags() == newsub->parse_flags()) |
536 | return newsub; |
537 | |
538 | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
539 | nre->AllocSub(1); |
540 | nre->sub()[0] = newsub; |
541 | nre->simple_ = true; |
542 | return nre; |
543 | } |
544 | |
545 | case kRegexpRepeat: { |
546 | Regexp* newsub = child_args[0]; |
547 | // Special case: repeat the empty string as much as |
548 | // you want, but it's still the empty string. |
549 | if (newsub->op() == kRegexpEmptyMatch) |
550 | return newsub; |
551 | |
552 | Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, |
553 | re->parse_flags()); |
554 | newsub->Decref(); |
555 | nre->simple_ = true; |
556 | return nre; |
557 | } |
558 | |
559 | case kRegexpCharClass: { |
560 | Regexp* nre = SimplifyCharClass(re); |
561 | nre->simple_ = true; |
562 | return nre; |
563 | } |
564 | } |
565 | |
566 | LOG(ERROR) << "Simplify case not handled: " << re->op(); |
567 | return re->Incref(); |
568 | } |
569 | |
570 | // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
571 | // Returns a new Regexp, handing the ref to the caller. |
572 | Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, |
573 | Regexp::ParseFlags parse_flags) { |
574 | Regexp* re = new Regexp(kRegexpConcat, parse_flags); |
575 | re->AllocSub(2); |
576 | Regexp** subs = re->sub(); |
577 | subs[0] = re1; |
578 | subs[1] = re2; |
579 | return re; |
580 | } |
581 | |
582 | // Simplifies the expression re{min,max} in terms of *, +, and ?. |
583 | // Returns a new regexp. Does not edit re. Does not consume reference to re. |
584 | // Caller must Decref return value when done with it. |
585 | // The result will *not* necessarily have the right capturing parens |
586 | // if you call ToString() and re-parse it: (x){2} becomes (x)(x), |
587 | // but in the Regexp* representation, both (x) are marked as $1. |
588 | Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, |
589 | Regexp::ParseFlags f) { |
590 | // x{n,} means at least n matches of x. |
591 | if (max == -1) { |
592 | // Special case: x{0,} is x* |
593 | if (min == 0) |
594 | return Regexp::Star(re->Incref(), f); |
595 | |
596 | // Special case: x{1,} is x+ |
597 | if (min == 1) |
598 | return Regexp::Plus(re->Incref(), f); |
599 | |
600 | // General case: x{4,} is xxxx+ |
601 | PODArray<Regexp*> nre_subs(min); |
602 | for (int i = 0; i < min-1; i++) |
603 | nre_subs[i] = re->Incref(); |
604 | nre_subs[min-1] = Regexp::Plus(re->Incref(), f); |
605 | return Regexp::Concat(nre_subs.data(), min, f); |
606 | } |
607 | |
608 | // Special case: (x){0} matches only empty string. |
609 | if (min == 0 && max == 0) |
610 | return new Regexp(kRegexpEmptyMatch, f); |
611 | |
612 | // Special case: x{1} is just x. |
613 | if (min == 1 && max == 1) |
614 | return re->Incref(); |
615 | |
616 | // General case: x{n,m} means n copies of x and m copies of x?. |
617 | // The machine will do less work if we nest the final m copies, |
618 | // so that x{2,5} = xx(x(x(x)?)?)? |
619 | |
620 | // Build leading prefix: xx. Capturing only on the last one. |
621 | Regexp* nre = NULL; |
622 | if (min > 0) { |
623 | PODArray<Regexp*> nre_subs(min); |
624 | for (int i = 0; i < min; i++) |
625 | nre_subs[i] = re->Incref(); |
626 | nre = Regexp::Concat(nre_subs.data(), min, f); |
627 | } |
628 | |
629 | // Build and attach suffix: (x(x(x)?)?)? |
630 | if (max > min) { |
631 | Regexp* suf = Regexp::Quest(re->Incref(), f); |
632 | for (int i = min+1; i < max; i++) |
633 | suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); |
634 | if (nre == NULL) |
635 | nre = suf; |
636 | else |
637 | nre = Concat2(nre, suf, f); |
638 | } |
639 | |
640 | if (nre == NULL) { |
641 | // Some degenerate case, like min > max, or min < max < 0. |
642 | // This shouldn't happen, because the parser rejects such regexps. |
643 | LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; |
644 | return new Regexp(kRegexpNoMatch, f); |
645 | } |
646 | |
647 | return nre; |
648 | } |
649 | |
650 | // Simplifies a character class. |
651 | // Caller must Decref return value when done with it. |
652 | Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { |
653 | CharClass* cc = re->cc(); |
654 | |
655 | // Special cases |
656 | if (cc->empty()) |
657 | return new Regexp(kRegexpNoMatch, re->parse_flags()); |
658 | if (cc->full()) |
659 | return new Regexp(kRegexpAnyChar, re->parse_flags()); |
660 | |
661 | return re->Incref(); |
662 | } |
663 | |
664 | } // namespace re2 |
665 | |