1// Copyright 2006 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression parser.
6
7// The parser is a simple precedence-based parser with a
8// manual stack. The parsing work is done by the methods
9// of the ParseState class. The Regexp::Parse function is
10// essentially just a lexer that calls the ParseState method
11// for each token.
12
13// The parser recognizes POSIX extended regular expressions
14// excluding backreferences, collating elements, and collating
15// classes. It also allows the empty string as a regular expression
16// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17// See regexp.h for rationale.
18
19#include <ctype.h>
20#include <stddef.h>
21#include <stdint.h>
22#include <string.h>
23#include <algorithm>
24#include <map>
25#include <string>
26#include <vector>
27
28#include "absl/base/macros.h"
29#include "util/logging.h"
30#include "util/utf.h"
31#include "re2/pod_array.h"
32#include "re2/regexp.h"
33#include "re2/unicode_casefold.h"
34#include "re2/unicode_groups.h"
35#include "re2/walker-inl.h"
36
37#if defined(RE2_USE_ICU)
38#include "unicode/uniset.h"
39#include "unicode/unistr.h"
40#include "unicode/utypes.h"
41#endif
42
43namespace re2 {
44
45// Controls the maximum repeat count permitted by the parser.
46static int maximum_repeat_count = 1000;
47
48void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) {
49 maximum_repeat_count = i;
50}
51
52// Regular expression parse state.
53// The list of parsed regexps so far is maintained as a vector of
54// Regexp pointers called the stack. Left parenthesis and vertical
55// bar markers are also placed on the stack, as Regexps with
56// non-standard opcodes.
57// Scanning a left parenthesis causes the parser to push a left parenthesis
58// marker on the stack.
59// Scanning a vertical bar causes the parser to pop the stack until it finds a
60// vertical bar or left parenthesis marker (not popping the marker),
61// concatenate all the popped results, and push them back on
62// the stack (DoConcatenation).
63// Scanning a right parenthesis causes the parser to act as though it
64// has seen a vertical bar, which then leaves the top of the stack in the
65// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
66// The parser pops all this off the stack and creates an alternation of the
67// regexps (DoAlternation).
68
69class Regexp::ParseState {
70 public:
71 ParseState(ParseFlags flags, absl::string_view whole_regexp,
72 RegexpStatus* status);
73 ~ParseState();
74
75 ParseFlags flags() { return flags_; }
76 int rune_max() { return rune_max_; }
77
78 // Parse methods. All public methods return a bool saying
79 // whether parsing should continue. If a method returns
80 // false, it has set fields in *status_, and the parser
81 // should return NULL.
82
83 // Pushes the given regular expression onto the stack.
84 // Could check for too much memory used here.
85 bool PushRegexp(Regexp* re);
86
87 // Pushes the literal rune r onto the stack.
88 bool PushLiteral(Rune r);
89
90 // Pushes a regexp with the given op (and no args) onto the stack.
91 bool PushSimpleOp(RegexpOp op);
92
93 // Pushes a ^ onto the stack.
94 bool PushCaret();
95
96 // Pushes a \b (word == true) or \B (word == false) onto the stack.
97 bool PushWordBoundary(bool word);
98
99 // Pushes a $ onto the stack.
100 bool PushDollar();
101
102 // Pushes a . onto the stack
103 bool PushDot();
104
105 // Pushes a repeat operator regexp onto the stack.
106 // A valid argument for the operator must already be on the stack.
107 // s is the name of the operator, for use in error messages.
108 bool PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy);
109
110 // Pushes a repetition regexp onto the stack.
111 // A valid argument for the operator must already be on the stack.
112 bool PushRepetition(int min, int max, absl::string_view s, bool nongreedy);
113
114 // Checks whether a particular regexp op is a marker.
115 bool IsMarker(RegexpOp op);
116
117 // Processes a left parenthesis in the input.
118 // Pushes a marker onto the stack.
119 bool DoLeftParen(absl::string_view name);
120 bool DoLeftParenNoCapture();
121
122 // Processes a vertical bar in the input.
123 bool DoVerticalBar();
124
125 // Processes a right parenthesis in the input.
126 bool DoRightParen();
127
128 // Processes the end of input, returning the final regexp.
129 Regexp* DoFinish();
130
131 // Finishes the regexp if necessary, preparing it for use
132 // in a more complicated expression.
133 // If it is a CharClassBuilder, converts into a CharClass.
134 Regexp* FinishRegexp(Regexp*);
135
136 // These routines don't manipulate the parse stack
137 // directly, but they do need to look at flags_.
138 // ParseCharClass also manipulates the internals of Regexp
139 // while creating *out_re.
140
141 // Parse a character class into *out_re.
142 // Removes parsed text from s.
143 bool ParseCharClass(absl::string_view* s, Regexp** out_re,
144 RegexpStatus* status);
145
146 // Parse a character class character into *rp.
147 // Removes parsed text from s.
148 bool ParseCCCharacter(absl::string_view* s, Rune* rp,
149 absl::string_view whole_class,
150 RegexpStatus* status);
151
152 // Parse a character class range into rr.
153 // Removes parsed text from s.
154 bool ParseCCRange(absl::string_view* s, RuneRange* rr,
155 absl::string_view whole_class,
156 RegexpStatus* status);
157
158 // Parse a Perl flag set or non-capturing group from s.
159 bool ParsePerlFlags(absl::string_view* s);
160
161 // Finishes the current concatenation,
162 // collapsing it into a single regexp on the stack.
163 void DoConcatenation();
164
165 // Finishes the current alternation,
166 // collapsing it to a single regexp on the stack.
167 void DoAlternation();
168
169 // Generalized DoAlternation/DoConcatenation.
170 void DoCollapse(RegexpOp op);
171
172 // Maybe concatenate Literals into LiteralString.
173 bool MaybeConcatString(int r, ParseFlags flags);
174
175private:
176 ParseFlags flags_;
177 absl::string_view whole_regexp_;
178 RegexpStatus* status_;
179 Regexp* stacktop_;
180 int ncap_; // number of capturing parens seen
181 int rune_max_; // maximum char value for this encoding
182
183 ParseState(const ParseState&) = delete;
184 ParseState& operator=(const ParseState&) = delete;
185};
186
187// Pseudo-operators - only on parse stack.
188const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
189const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
190
191Regexp::ParseState::ParseState(ParseFlags flags,
192 absl::string_view whole_regexp,
193 RegexpStatus* status)
194 : flags_(flags), whole_regexp_(whole_regexp),
195 status_(status), stacktop_(NULL), ncap_(0) {
196 if (flags_ & Latin1)
197 rune_max_ = 0xFF;
198 else
199 rune_max_ = Runemax;
200}
201
202// Cleans up by freeing all the regexps on the stack.
203Regexp::ParseState::~ParseState() {
204 Regexp* next;
205 for (Regexp* re = stacktop_; re != NULL; re = next) {
206 next = re->down_;
207 re->down_ = NULL;
208 if (re->op() == kLeftParen)
209 delete re->name_;
210 re->Decref();
211 }
212}
213
214// Finishes the regexp if necessary, preparing it for use in
215// a more complex expression.
216// If it is a CharClassBuilder, converts into a CharClass.
217Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
218 if (re == NULL)
219 return NULL;
220 re->down_ = NULL;
221
222 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
223 CharClassBuilder* ccb = re->ccb_;
224 re->ccb_ = NULL;
225 re->cc_ = ccb->GetCharClass();
226 delete ccb;
227 }
228
229 return re;
230}
231
232// Pushes the given regular expression onto the stack.
233// Could check for too much memory used here.
234bool Regexp::ParseState::PushRegexp(Regexp* re) {
235 MaybeConcatString(-1, NoParseFlags);
236
237 // Special case: a character class of one character is just
238 // a literal. This is a common idiom for escaping
239 // single characters (e.g., [.] instead of \.), and some
240 // analysis does better with fewer character classes.
241 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
242 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
243 re->ccb_->RemoveAbove(rune_max_);
244 if (re->ccb_->size() == 1) {
245 Rune r = re->ccb_->begin()->lo;
246 re->Decref();
247 re = new Regexp(kRegexpLiteral, flags_);
248 re->rune_ = r;
249 } else if (re->ccb_->size() == 2) {
250 Rune r = re->ccb_->begin()->lo;
251 if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
252 re->Decref();
253 re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
254 re->rune_ = r + 'a' - 'A';
255 }
256 }
257 }
258
259 if (!IsMarker(re->op()))
260 re->simple_ = re->ComputeSimple();
261 re->down_ = stacktop_;
262 stacktop_ = re;
263 return true;
264}
265
266// Searches the case folding tables and returns the CaseFold* that contains r.
267// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
268// If there isn't one, returns NULL.
269const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) {
270 const CaseFold* ef = f + n;
271
272 // Binary search for entry containing r.
273 while (n > 0) {
274 int m = n/2;
275 if (f[m].lo <= r && r <= f[m].hi)
276 return &f[m];
277 if (r < f[m].lo) {
278 n = m;
279 } else {
280 f += m+1;
281 n -= m+1;
282 }
283 }
284
285 // There is no entry that contains r, but f points
286 // where it would have been. Unless f points at
287 // the end of the array, it points at the next entry
288 // after r.
289 if (f < ef)
290 return f;
291
292 // No entry contains r; no entry contains runes > r.
293 return NULL;
294}
295
296// Returns the result of applying the fold f to the rune r.
297Rune ApplyFold(const CaseFold* f, Rune r) {
298 switch (f->delta) {
299 default:
300 return r + f->delta;
301
302 case EvenOddSkip: // even <-> odd but only applies to every other
303 if ((r - f->lo) % 2)
304 return r;
305 ABSL_FALLTHROUGH_INTENDED;
306 case EvenOdd: // even <-> odd
307 if (r%2 == 0)
308 return r + 1;
309 return r - 1;
310
311 case OddEvenSkip: // odd <-> even but only applies to every other
312 if ((r - f->lo) % 2)
313 return r;
314 ABSL_FALLTHROUGH_INTENDED;
315 case OddEven: // odd <-> even
316 if (r%2 == 1)
317 return r + 1;
318 return r - 1;
319 }
320}
321
322// Returns the next Rune in r's folding cycle (see unicode_casefold.h).
323// Examples:
324// CycleFoldRune('A') = 'a'
325// CycleFoldRune('a') = 'A'
326//
327// CycleFoldRune('K') = 'k'
328// CycleFoldRune('k') = 0x212A (Kelvin)
329// CycleFoldRune(0x212A) = 'K'
330//
331// CycleFoldRune('?') = '?'
332Rune CycleFoldRune(Rune r) {
333 const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
334 if (f == NULL || r < f->lo)
335 return r;
336 return ApplyFold(f, r);
337}
338
339// Add lo-hi to the class, along with their fold-equivalent characters.
340// If lo-hi is already in the class, assume that the fold-equivalent
341// chars are there too, so there's no work to do.
342static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
343 // AddFoldedRange calls itself recursively for each rune in the fold cycle.
344 // Most folding cycles are small: there aren't any bigger than four in the
345 // current Unicode tables. make_unicode_casefold.py checks that
346 // the cycles are not too long, and we double-check here using depth.
347 if (depth > 10) {
348 LOG(DFATAL) << "AddFoldedRange recurses too much.";
349 return;
350 }
351
352 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done
353 return;
354
355 while (lo <= hi) {
356 const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
357 if (f == NULL) // lo has no fold, nor does anything above lo
358 break;
359 if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
360 lo = f->lo;
361 continue;
362 }
363
364 // Add in the result of folding the range lo - f->hi
365 // and that range's fold, recursively.
366 Rune lo1 = lo;
367 Rune hi1 = std::min<Rune>(hi, f->hi);
368 switch (f->delta) {
369 default:
370 lo1 += f->delta;
371 hi1 += f->delta;
372 break;
373 case EvenOdd:
374 if (lo1%2 == 1)
375 lo1--;
376 if (hi1%2 == 0)
377 hi1++;
378 break;
379 case OddEven:
380 if (lo1%2 == 0)
381 lo1--;
382 if (hi1%2 == 1)
383 hi1++;
384 break;
385 }
386 AddFoldedRange(cc, lo1, hi1, depth+1);
387
388 // Pick up where this fold left off.
389 lo = f->hi + 1;
390 }
391}
392
393// Pushes the literal rune r onto the stack.
394bool Regexp::ParseState::PushLiteral(Rune r) {
395 // Do case folding if needed.
396 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
397 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
398 re->ccb_ = new CharClassBuilder;
399 Rune r1 = r;
400 do {
401 if (!(flags_ & NeverNL) || r != '\n') {
402 re->ccb_->AddRange(r, r);
403 }
404 r = CycleFoldRune(r);
405 } while (r != r1);
406 return PushRegexp(re);
407 }
408
409 // Exclude newline if applicable.
410 if ((flags_ & NeverNL) && r == '\n')
411 return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
412
413 // No fancy stuff worked. Ordinary literal.
414 if (MaybeConcatString(r, flags_))
415 return true;
416
417 Regexp* re = new Regexp(kRegexpLiteral, flags_);
418 re->rune_ = r;
419 return PushRegexp(re);
420}
421
422// Pushes a ^ onto the stack.
423bool Regexp::ParseState::PushCaret() {
424 if (flags_ & OneLine) {
425 return PushSimpleOp(kRegexpBeginText);
426 }
427 return PushSimpleOp(kRegexpBeginLine);
428}
429
430// Pushes a \b or \B onto the stack.
431bool Regexp::ParseState::PushWordBoundary(bool word) {
432 if (word)
433 return PushSimpleOp(kRegexpWordBoundary);
434 return PushSimpleOp(kRegexpNoWordBoundary);
435}
436
437// Pushes a $ onto the stack.
438bool Regexp::ParseState::PushDollar() {
439 if (flags_ & OneLine) {
440 // Clumsy marker so that MimicsPCRE() can tell whether
441 // this kRegexpEndText was a $ and not a \z.
442 Regexp::ParseFlags oflags = flags_;
443 flags_ = flags_ | WasDollar;
444 bool ret = PushSimpleOp(kRegexpEndText);
445 flags_ = oflags;
446 return ret;
447 }
448 return PushSimpleOp(kRegexpEndLine);
449}
450
451// Pushes a . onto the stack.
452bool Regexp::ParseState::PushDot() {
453 if ((flags_ & DotNL) && !(flags_ & NeverNL))
454 return PushSimpleOp(kRegexpAnyChar);
455 // Rewrite . into [^\n]
456 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
457 re->ccb_ = new CharClassBuilder;
458 re->ccb_->AddRange(0, '\n' - 1);
459 re->ccb_->AddRange('\n' + 1, rune_max_);
460 return PushRegexp(re);
461}
462
463// Pushes a regexp with the given op (and no args) onto the stack.
464bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
465 Regexp* re = new Regexp(op, flags_);
466 return PushRegexp(re);
467}
468
469// Pushes a repeat operator regexp onto the stack.
470// A valid argument for the operator must already be on the stack.
471// The char c is the name of the operator, for use in error messages.
472bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s,
473 bool nongreedy) {
474 if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
475 status_->set_code(kRegexpRepeatArgument);
476 status_->set_error_arg(s);
477 return false;
478 }
479 Regexp::ParseFlags fl = flags_;
480 if (nongreedy)
481 fl = fl ^ NonGreedy;
482
483 // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but
484 // they're mostly for use during simplification, not during parsing.
485 if (op == stacktop_->op() && fl == stacktop_->parse_flags())
486 return true;
487
488 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
489 // op is a repeat, we just have to check that stacktop_->op() is too,
490 // then adjust stacktop_.
491 if ((stacktop_->op() == kRegexpStar ||
492 stacktop_->op() == kRegexpPlus ||
493 stacktop_->op() == kRegexpQuest) &&
494 fl == stacktop_->parse_flags()) {
495 stacktop_->op_ = kRegexpStar;
496 return true;
497 }
498
499 Regexp* re = new Regexp(op, fl);
500 re->AllocSub(1);
501 re->down_ = stacktop_->down_;
502 re->sub()[0] = FinishRegexp(stacktop_);
503 re->simple_ = re->ComputeSimple();
504 stacktop_ = re;
505 return true;
506}
507
508// RepetitionWalker reports whether the repetition regexp is valid.
509// Valid means that the combination of the top-level repetition
510// and any inner repetitions does not exceed n copies of the
511// innermost thing.
512// This rewalks the regexp tree and is called for every repetition,
513// so we have to worry about inducing quadratic behavior in the parser.
514// We avoid this by only using RepetitionWalker when min or max >= 2.
515// In that case the depth of any >= 2 nesting can only get to 9 without
516// triggering a parse error, so each subtree can only be rewalked 9 times.
517class RepetitionWalker : public Regexp::Walker<int> {
518 public:
519 RepetitionWalker() {}
520 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
521 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
522 int* child_args, int nchild_args);
523 virtual int ShortVisit(Regexp* re, int parent_arg);
524
525 private:
526 RepetitionWalker(const RepetitionWalker&) = delete;
527 RepetitionWalker& operator=(const RepetitionWalker&) = delete;
528};
529
530int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
531 int arg = parent_arg;
532 if (re->op() == kRegexpRepeat) {
533 int m = re->max();
534 if (m < 0) {
535 m = re->min();
536 }
537 if (m > 0) {
538 arg /= m;
539 }
540 }
541 return arg;
542}
543
544int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
545 int* child_args, int nchild_args) {
546 int arg = pre_arg;
547 for (int i = 0; i < nchild_args; i++) {
548 if (child_args[i] < arg) {
549 arg = child_args[i];
550 }
551 }
552 return arg;
553}
554
555int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
556 // Should never be called: we use Walk(), not WalkExponential().
557#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
558 LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
559#endif
560 return 0;
561}
562
563// Pushes a repetition regexp onto the stack.
564// A valid argument for the operator must already be on the stack.
565bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s,
566 bool nongreedy) {
567 if ((max != -1 && max < min) ||
568 min > maximum_repeat_count ||
569 max > maximum_repeat_count) {
570 status_->set_code(kRegexpRepeatSize);
571 status_->set_error_arg(s);
572 return false;
573 }
574 if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
575 status_->set_code(kRegexpRepeatArgument);
576 status_->set_error_arg(s);
577 return false;
578 }
579 Regexp::ParseFlags fl = flags_;
580 if (nongreedy)
581 fl = fl ^ NonGreedy;
582 Regexp* re = new Regexp(kRegexpRepeat, fl);
583 re->min_ = min;
584 re->max_ = max;
585 re->AllocSub(1);
586 re->down_ = stacktop_->down_;
587 re->sub()[0] = FinishRegexp(stacktop_);
588 re->simple_ = re->ComputeSimple();
589 stacktop_ = re;
590 if (min >= 2 || max >= 2) {
591 RepetitionWalker w;
592 if (w.Walk(stacktop_, maximum_repeat_count) == 0) {
593 status_->set_code(kRegexpRepeatSize);
594 status_->set_error_arg(s);
595 return false;
596 }
597 }
598 return true;
599}
600
601// Checks whether a particular regexp op is a marker.
602bool Regexp::ParseState::IsMarker(RegexpOp op) {
603 return op >= kLeftParen;
604}
605
606// Processes a left parenthesis in the input.
607// Pushes a marker onto the stack.
608bool Regexp::ParseState::DoLeftParen(absl::string_view name) {
609 Regexp* re = new Regexp(kLeftParen, flags_);
610 re->cap_ = ++ncap_;
611 if (name.data() != NULL)
612 re->name_ = new std::string(name);
613 return PushRegexp(re);
614}
615
616// Pushes a non-capturing marker onto the stack.
617bool Regexp::ParseState::DoLeftParenNoCapture() {
618 Regexp* re = new Regexp(kLeftParen, flags_);
619 re->cap_ = -1;
620 return PushRegexp(re);
621}
622
623// Processes a vertical bar in the input.
624bool Regexp::ParseState::DoVerticalBar() {
625 MaybeConcatString(-1, NoParseFlags);
626 DoConcatenation();
627
628 // Below the vertical bar is a list to alternate.
629 // Above the vertical bar is a list to concatenate.
630 // We just did the concatenation, so either swap
631 // the result below the vertical bar or push a new
632 // vertical bar on the stack.
633 Regexp* r1;
634 Regexp* r2;
635 if ((r1 = stacktop_) != NULL &&
636 (r2 = r1->down_) != NULL &&
637 r2->op() == kVerticalBar) {
638 Regexp* r3;
639 if ((r3 = r2->down_) != NULL &&
640 (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
641 // AnyChar is above or below the vertical bar. Let it subsume
642 // the other when the other is Literal, CharClass or AnyChar.
643 if (r3->op() == kRegexpAnyChar &&
644 (r1->op() == kRegexpLiteral ||
645 r1->op() == kRegexpCharClass ||
646 r1->op() == kRegexpAnyChar)) {
647 // Discard r1.
648 stacktop_ = r2;
649 r1->Decref();
650 return true;
651 }
652 if (r1->op() == kRegexpAnyChar &&
653 (r3->op() == kRegexpLiteral ||
654 r3->op() == kRegexpCharClass ||
655 r3->op() == kRegexpAnyChar)) {
656 // Rearrange the stack and discard r3.
657 r1->down_ = r3->down_;
658 r2->down_ = r1;
659 stacktop_ = r2;
660 r3->Decref();
661 return true;
662 }
663 }
664 // Swap r1 below vertical bar (r2).
665 r1->down_ = r2->down_;
666 r2->down_ = r1;
667 stacktop_ = r2;
668 return true;
669 }
670 return PushSimpleOp(kVerticalBar);
671}
672
673// Processes a right parenthesis in the input.
674bool Regexp::ParseState::DoRightParen() {
675 // Finish the current concatenation and alternation.
676 DoAlternation();
677
678 // The stack should be: LeftParen regexp
679 // Remove the LeftParen, leaving the regexp,
680 // parenthesized.
681 Regexp* r1;
682 Regexp* r2;
683 if ((r1 = stacktop_) == NULL ||
684 (r2 = r1->down_) == NULL ||
685 r2->op() != kLeftParen) {
686 status_->set_code(kRegexpUnexpectedParen);
687 status_->set_error_arg(whole_regexp_);
688 return false;
689 }
690
691 // Pop off r1, r2. Will Decref or reuse below.
692 stacktop_ = r2->down_;
693
694 // Restore flags from when paren opened.
695 Regexp* re = r2;
696 flags_ = re->parse_flags();
697
698 // Rewrite LeftParen as capture if needed.
699 if (re->cap_ > 0) {
700 re->op_ = kRegexpCapture;
701 // re->cap_ is already set
702 re->AllocSub(1);
703 re->sub()[0] = FinishRegexp(r1);
704 re->simple_ = re->ComputeSimple();
705 } else {
706 re->Decref();
707 re = r1;
708 }
709 return PushRegexp(re);
710}
711
712// Processes the end of input, returning the final regexp.
713Regexp* Regexp::ParseState::DoFinish() {
714 DoAlternation();
715 Regexp* re = stacktop_;
716 if (re != NULL && re->down_ != NULL) {
717 status_->set_code(kRegexpMissingParen);
718 status_->set_error_arg(whole_regexp_);
719 return NULL;
720 }
721 stacktop_ = NULL;
722 return FinishRegexp(re);
723}
724
725// Returns the leading regexp that re starts with.
726// The returned Regexp* points into a piece of re,
727// so it must not be used after the caller calls re->Decref().
728Regexp* Regexp::LeadingRegexp(Regexp* re) {
729 if (re->op() == kRegexpEmptyMatch)
730 return NULL;
731 if (re->op() == kRegexpConcat && re->nsub() >= 2) {
732 Regexp** sub = re->sub();
733 if (sub[0]->op() == kRegexpEmptyMatch)
734 return NULL;
735 return sub[0];
736 }
737 return re;
738}
739
740// Removes LeadingRegexp(re) from re and returns what's left.
741// Consumes the reference to re and may edit it in place.
742// If caller wants to hold on to LeadingRegexp(re),
743// must have already Incref'ed it.
744Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
745 if (re->op() == kRegexpEmptyMatch)
746 return re;
747 if (re->op() == kRegexpConcat && re->nsub() >= 2) {
748 Regexp** sub = re->sub();
749 if (sub[0]->op() == kRegexpEmptyMatch)
750 return re;
751 sub[0]->Decref();
752 sub[0] = NULL;
753 if (re->nsub() == 2) {
754 // Collapse concatenation to single regexp.
755 Regexp* nre = sub[1];
756 sub[1] = NULL;
757 re->Decref();
758 return nre;
759 }
760 // 3 or more -> 2 or more.
761 re->nsub_--;
762 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
763 return re;
764 }
765 Regexp::ParseFlags pf = re->parse_flags();
766 re->Decref();
767 return new Regexp(kRegexpEmptyMatch, pf);
768}
769
770// Returns the leading string that re starts with.
771// The returned Rune* points into a piece of re,
772// so it must not be used after the caller calls re->Decref().
773Rune* Regexp::LeadingString(Regexp* re, int* nrune,
774 Regexp::ParseFlags* flags) {
775 while (re->op() == kRegexpConcat && re->nsub() > 0)
776 re = re->sub()[0];
777
778 *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
779
780 if (re->op() == kRegexpLiteral) {
781 *nrune = 1;
782 return &re->rune_;
783 }
784
785 if (re->op() == kRegexpLiteralString) {
786 *nrune = re->nrunes_;
787 return re->runes_;
788 }
789
790 *nrune = 0;
791 return NULL;
792}
793
794// Removes the first n leading runes from the beginning of re.
795// Edits re in place.
796void Regexp::RemoveLeadingString(Regexp* re, int n) {
797 // Chase down concats to find first string.
798 // For regexps generated by parser, nested concats are
799 // flattened except when doing so would overflow the 16-bit
800 // limit on the size of a concatenation, so we should never
801 // see more than two here.
802 Regexp* stk[4];
803 size_t d = 0;
804 while (re->op() == kRegexpConcat) {
805 if (d < ABSL_ARRAYSIZE(stk))
806 stk[d++] = re;
807 re = re->sub()[0];
808 }
809
810 // Remove leading string from re.
811 if (re->op() == kRegexpLiteral) {
812 re->rune_ = 0;
813 re->op_ = kRegexpEmptyMatch;
814 } else if (re->op() == kRegexpLiteralString) {
815 if (n >= re->nrunes_) {
816 delete[] re->runes_;
817 re->runes_ = NULL;
818 re->nrunes_ = 0;
819 re->op_ = kRegexpEmptyMatch;
820 } else if (n == re->nrunes_ - 1) {
821 Rune rune = re->runes_[re->nrunes_ - 1];
822 delete[] re->runes_;
823 re->runes_ = NULL;
824 re->nrunes_ = 0;
825 re->rune_ = rune;
826 re->op_ = kRegexpLiteral;
827 } else {
828 re->nrunes_ -= n;
829 memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
830 }
831 }
832
833 // If re is now empty, concatenations might simplify too.
834 while (d > 0) {
835 re = stk[--d];
836 Regexp** sub = re->sub();
837 if (sub[0]->op() == kRegexpEmptyMatch) {
838 sub[0]->Decref();
839 sub[0] = NULL;
840 // Delete first element of concat.
841 switch (re->nsub()) {
842 case 0:
843 case 1:
844 // Impossible.
845 LOG(DFATAL) << "Concat of " << re->nsub();
846 re->submany_ = NULL;
847 re->op_ = kRegexpEmptyMatch;
848 break;
849
850 case 2: {
851 // Replace re with sub[1].
852 Regexp* old = sub[1];
853 sub[1] = NULL;
854 re->Swap(old);
855 old->Decref();
856 break;
857 }
858
859 default:
860 // Slide down.
861 re->nsub_--;
862 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
863 break;
864 }
865 }
866 }
867}
868
869// In the context of factoring alternations, a Splice is: a factored prefix or
870// merged character class computed by one iteration of one round of factoring;
871// the span of subexpressions of the alternation to be "spliced" (i.e. removed
872// and replaced); and, for a factored prefix, the number of suffixes after any
873// factoring that might have subsequently been performed on them. For a merged
874// character class, there are no suffixes, of course, so the field is ignored.
875struct Splice {
876 Splice(Regexp* prefix, Regexp** sub, int nsub)
877 : prefix(prefix),
878 sub(sub),
879 nsub(nsub),
880 nsuffix(-1) {}
881
882 Regexp* prefix;
883 Regexp** sub;
884 int nsub;
885 int nsuffix;
886};
887
888// Named so because it is used to implement an explicit stack, a Frame is: the
889// span of subexpressions of the alternation to be factored; the current round
890// of factoring; any Splices computed; and, for a factored prefix, an iterator
891// to the next Splice to be factored (i.e. in another Frame) because suffixes.
892struct Frame {
893 Frame(Regexp** sub, int nsub)
894 : sub(sub),
895 nsub(nsub),
896 round(0) {}
897
898 Regexp** sub;
899 int nsub;
900 int round;
901 std::vector<Splice> splices;
902 int spliceidx;
903};
904
905// Bundled into a class for friend access to Regexp without needing to declare
906// (or define) Splice in regexp.h.
907class FactorAlternationImpl {
908 public:
909 static void Round1(Regexp** sub, int nsub,
910 Regexp::ParseFlags flags,
911 std::vector<Splice>* splices);
912 static void Round2(Regexp** sub, int nsub,
913 Regexp::ParseFlags flags,
914 std::vector<Splice>* splices);
915 static void Round3(Regexp** sub, int nsub,
916 Regexp::ParseFlags flags,
917 std::vector<Splice>* splices);
918};
919
920// Factors common prefixes from alternation.
921// For example,
922// ABC|ABD|AEF|BCX|BCY
923// simplifies to
924// A(B(C|D)|EF)|BC(X|Y)
925// and thence to
926// A(B[CD]|EF)|BC[XY]
927//
928// Rewrites sub to contain simplified list to alternate and returns
929// the new length of sub. Adjusts reference counts accordingly
930// (incoming sub[i] decremented, outgoing sub[i] incremented).
931int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
932 std::vector<Frame> stk;
933 stk.emplace_back(sub, nsub);
934
935 for (;;) {
936 auto& sub = stk.back().sub;
937 auto& nsub = stk.back().nsub;
938 auto& round = stk.back().round;
939 auto& splices = stk.back().splices;
940 auto& spliceidx = stk.back().spliceidx;
941
942 if (splices.empty()) {
943 // Advance to the next round of factoring. Note that this covers
944 // the initialised state: when splices is empty and round is 0.
945 round++;
946 } else if (spliceidx < static_cast<int>(splices.size())) {
947 // We have at least one more Splice to factor. Recurse logically.
948 stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub);
949 continue;
950 } else {
951 // We have no more Splices to factor. Apply them.
952 auto iter = splices.begin();
953 int out = 0;
954 for (int i = 0; i < nsub; ) {
955 // Copy until we reach where the next Splice begins.
956 while (sub + i < iter->sub)
957 sub[out++] = sub[i++];
958 switch (round) {
959 case 1:
960 case 2: {
961 // Assemble the Splice prefix and the suffixes.
962 Regexp* re[2];
963 re[0] = iter->prefix;
964 re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags);
965 sub[out++] = Regexp::Concat(re, 2, flags);
966 i += iter->nsub;
967 break;
968 }
969 case 3:
970 // Just use the Splice prefix.
971 sub[out++] = iter->prefix;
972 i += iter->nsub;
973 break;
974 default:
975 LOG(DFATAL) << "unknown round: " << round;
976 break;
977 }
978 // If we are done, copy until the end of sub.
979 if (++iter == splices.end()) {
980 while (i < nsub)
981 sub[out++] = sub[i++];
982 }
983 }
984 splices.clear();
985 nsub = out;
986 // Advance to the next round of factoring.
987 round++;
988 }
989
990 switch (round) {
991 case 1:
992 FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
993 break;
994 case 2:
995 FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
996 break;
997 case 3:
998 FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
999 break;
1000 case 4:
1001 if (stk.size() == 1) {
1002 // We are at the top of the stack. Just return.
1003 return nsub;
1004 } else {
1005 // Pop the stack and set the number of suffixes.
1006 // (Note that references will be invalidated!)
1007 int nsuffix = nsub;
1008 stk.pop_back();
1009 stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
1010 ++stk.back().spliceidx;
1011 continue;
1012 }
1013 default:
1014 LOG(DFATAL) << "unknown round: " << round;
1015 break;
1016 }
1017
1018 // Set spliceidx depending on whether we have Splices to factor.
1019 if (splices.empty() || round == 3) {
1020 spliceidx = static_cast<int>(splices.size());
1021 } else {
1022 spliceidx = 0;
1023 }
1024 }
1025}
1026
1027void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
1028 Regexp::ParseFlags flags,
1029 std::vector<Splice>* splices) {
1030 // Round 1: Factor out common literal prefixes.
1031 int start = 0;
1032 Rune* rune = NULL;
1033 int nrune = 0;
1034 Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
1035 for (int i = 0; i <= nsub; i++) {
1036 // Invariant: sub[start:i] consists of regexps that all
1037 // begin with rune[0:nrune].
1038 Rune* rune_i = NULL;
1039 int nrune_i = 0;
1040 Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
1041 if (i < nsub) {
1042 rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);
1043 if (runeflags_i == runeflags) {
1044 int same = 0;
1045 while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1046 same++;
1047 if (same > 0) {
1048 // Matches at least one rune in current range. Keep going around.
1049 nrune = same;
1050 continue;
1051 }
1052 }
1053 }
1054
1055 // Found end of a run with common leading literal string:
1056 // sub[start:i] all begin with rune[0:nrune],
1057 // but sub[i] does not even begin with rune[0].
1058 if (i == start) {
1059 // Nothing to do - first iteration.
1060 } else if (i == start+1) {
1061 // Just one: don't bother factoring.
1062 } else {
1063 Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags);
1064 for (int j = start; j < i; j++)
1065 Regexp::RemoveLeadingString(sub[j], nrune);
1066 splices->emplace_back(prefix, sub + start, i - start);
1067 }
1068
1069 // Prepare for next iteration (if there is one).
1070 if (i < nsub) {
1071 start = i;
1072 rune = rune_i;
1073 nrune = nrune_i;
1074 runeflags = runeflags_i;
1075 }
1076 }
1077}
1078
1079void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
1080 Regexp::ParseFlags flags,
1081 std::vector<Splice>* splices) {
1082 // Round 2: Factor out common simple prefixes,
1083 // just the first piece of each concatenation.
1084 // This will be good enough a lot of the time.
1085 //
1086 // Complex subexpressions (e.g. involving quantifiers)
1087 // are not safe to factor because that collapses their
1088 // distinct paths through the automaton, which affects
1089 // correctness in some cases.
1090 int start = 0;
1091 Regexp* first = NULL;
1092 for (int i = 0; i <= nsub; i++) {
1093 // Invariant: sub[start:i] consists of regexps that all
1094 // begin with first.
1095 Regexp* first_i = NULL;
1096 if (i < nsub) {
1097 first_i = Regexp::LeadingRegexp(sub[i]);
1098 if (first != NULL &&
1099 // first must be an empty-width op
1100 // OR a char class, any char or any byte
1101 // OR a fixed repeat of a literal, char class, any char or any byte.
1102 (first->op() == kRegexpBeginLine ||
1103 first->op() == kRegexpEndLine ||
1104 first->op() == kRegexpWordBoundary ||
1105 first->op() == kRegexpNoWordBoundary ||
1106 first->op() == kRegexpBeginText ||
1107 first->op() == kRegexpEndText ||
1108 first->op() == kRegexpCharClass ||
1109 first->op() == kRegexpAnyChar ||
1110 first->op() == kRegexpAnyByte ||
1111 (first->op() == kRegexpRepeat &&
1112 first->min() == first->max() &&
1113 (first->sub()[0]->op() == kRegexpLiteral ||
1114 first->sub()[0]->op() == kRegexpCharClass ||
1115 first->sub()[0]->op() == kRegexpAnyChar ||
1116 first->sub()[0]->op() == kRegexpAnyByte))) &&
1117 Regexp::Equal(first, first_i))
1118 continue;
1119 }
1120
1121 // Found end of a run with common leading regexp:
1122 // sub[start:i] all begin with first,
1123 // but sub[i] does not.
1124 if (i == start) {
1125 // Nothing to do - first iteration.
1126 } else if (i == start+1) {
1127 // Just one: don't bother factoring.
1128 } else {
1129 Regexp* prefix = first->Incref();
1130 for (int j = start; j < i; j++)
1131 sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);
1132 splices->emplace_back(prefix, sub + start, i - start);
1133 }
1134
1135 // Prepare for next iteration (if there is one).
1136 if (i < nsub) {
1137 start = i;
1138 first = first_i;
1139 }
1140 }
1141}
1142
1143void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
1144 Regexp::ParseFlags flags,
1145 std::vector<Splice>* splices) {
1146 // Round 3: Merge runs of literals and/or character classes.
1147 int start = 0;
1148 Regexp* first = NULL;
1149 for (int i = 0; i <= nsub; i++) {
1150 // Invariant: sub[start:i] consists of regexps that all
1151 // are either literals (i.e. runes) or character classes.
1152 Regexp* first_i = NULL;
1153 if (i < nsub) {
1154 first_i = sub[i];
1155 if (first != NULL &&
1156 (first->op() == kRegexpLiteral ||
1157 first->op() == kRegexpCharClass) &&
1158 (first_i->op() == kRegexpLiteral ||
1159 first_i->op() == kRegexpCharClass))
1160 continue;
1161 }
1162
1163 // Found end of a run of Literal/CharClass:
1164 // sub[start:i] all are either one or the other,
1165 // but sub[i] is not.
1166 if (i == start) {
1167 // Nothing to do - first iteration.
1168 } else if (i == start+1) {
1169 // Just one: don't bother factoring.
1170 } else {
1171 CharClassBuilder ccb;
1172 for (int j = start; j < i; j++) {
1173 Regexp* re = sub[j];
1174 if (re->op() == kRegexpCharClass) {
1175 CharClass* cc = re->cc();
1176 for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
1177 ccb.AddRange(it->lo, it->hi);
1178 } else if (re->op() == kRegexpLiteral) {
1179 ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1180 } else {
1181 LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
1182 << re->ToString();
1183 }
1184 re->Decref();
1185 }
1186 Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
1187 splices->emplace_back(re, sub + start, i - start);
1188 }
1189
1190 // Prepare for next iteration (if there is one).
1191 if (i < nsub) {
1192 start = i;
1193 first = first_i;
1194 }
1195 }
1196}
1197
1198// Collapse the regexps on top of the stack, down to the
1199// first marker, into a new op node (op == kRegexpAlternate
1200// or op == kRegexpConcat).
1201void Regexp::ParseState::DoCollapse(RegexpOp op) {
1202 // Scan backward to marker, counting children of composite.
1203 int n = 0;
1204 Regexp* next = NULL;
1205 Regexp* sub;
1206 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1207 next = sub->down_;
1208 if (sub->op_ == op)
1209 n += sub->nsub_;
1210 else
1211 n++;
1212 }
1213
1214 // If there's just one child, leave it alone.
1215 // (Concat of one thing is that one thing; alternate of one thing is same.)
1216 if (stacktop_ != NULL && stacktop_->down_ == next)
1217 return;
1218
1219 // Construct op (alternation or concatenation), flattening op of op.
1220 PODArray<Regexp*> subs(n);
1221 next = NULL;
1222 int i = n;
1223 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1224 next = sub->down_;
1225 if (sub->op_ == op) {
1226 Regexp** sub_subs = sub->sub();
1227 for (int k = sub->nsub_ - 1; k >= 0; k--)
1228 subs[--i] = sub_subs[k]->Incref();
1229 sub->Decref();
1230 } else {
1231 subs[--i] = FinishRegexp(sub);
1232 }
1233 }
1234
1235 Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true);
1236 re->simple_ = re->ComputeSimple();
1237 re->down_ = next;
1238 stacktop_ = re;
1239}
1240
1241// Finishes the current concatenation,
1242// collapsing it into a single regexp on the stack.
1243void Regexp::ParseState::DoConcatenation() {
1244 Regexp* r1 = stacktop_;
1245 if (r1 == NULL || IsMarker(r1->op())) {
1246 // empty concatenation is special case
1247 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);
1248 PushRegexp(re);
1249 }
1250 DoCollapse(kRegexpConcat);
1251}
1252
1253// Finishes the current alternation,
1254// collapsing it to a single regexp on the stack.
1255void Regexp::ParseState::DoAlternation() {
1256 DoVerticalBar();
1257 // Now stack top is kVerticalBar.
1258 Regexp* r1 = stacktop_;
1259 stacktop_ = r1->down_;
1260 r1->Decref();
1261 DoCollapse(kRegexpAlternate);
1262}
1263
1264// Incremental conversion of concatenated literals into strings.
1265// If top two elements on stack are both literal or string,
1266// collapse into single string.
1267// Don't walk down the stack -- the parser calls this frequently
1268// enough that below the bottom two is known to be collapsed.
1269// Only called when another regexp is about to be pushed
1270// on the stack, so that the topmost literal is not being considered.
1271// (Otherwise ab* would turn into (ab)*.)
1272// If r >= 0, consider pushing a literal r on the stack.
1273// Return whether that happened.
1274bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
1275 Regexp* re1;
1276 Regexp* re2;
1277 if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
1278 return false;
1279
1280 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
1281 return false;
1282 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
1283 return false;
1284 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
1285 return false;
1286
1287 if (re2->op_ == kRegexpLiteral) {
1288 // convert into string
1289 Rune rune = re2->rune_;
1290 re2->op_ = kRegexpLiteralString;
1291 re2->nrunes_ = 0;
1292 re2->runes_ = NULL;
1293 re2->AddRuneToString(rune);
1294 }
1295
1296 // push re1 into re2.
1297 if (re1->op_ == kRegexpLiteral) {
1298 re2->AddRuneToString(re1->rune_);
1299 } else {
1300 for (int i = 0; i < re1->nrunes_; i++)
1301 re2->AddRuneToString(re1->runes_[i]);
1302 re1->nrunes_ = 0;
1303 delete[] re1->runes_;
1304 re1->runes_ = NULL;
1305 }
1306
1307 // reuse re1 if possible
1308 if (r >= 0) {
1309 re1->op_ = kRegexpLiteral;
1310 re1->rune_ = r;
1311 re1->parse_flags_ = static_cast<uint16_t>(flags);
1312 return true;
1313 }
1314
1315 stacktop_ = re2;
1316 re1->Decref();
1317 return false;
1318}
1319
1320// Lexing routines.
1321
1322// Parses a decimal integer, storing it in *np.
1323// Sets *s to span the remainder of the string.
1324static bool ParseInteger(absl::string_view* s, int* np) {
1325 if (s->empty() || !isdigit((*s)[0] & 0xFF))
1326 return false;
1327 // Disallow leading zeros.
1328 if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF))
1329 return false;
1330 int n = 0;
1331 int c;
1332 while (!s->empty() && isdigit(c = (*s)[0] & 0xFF)) {
1333 // Avoid overflow.
1334 if (n >= 100000000)
1335 return false;
1336 n = n*10 + c - '0';
1337 s->remove_prefix(1); // digit
1338 }
1339 *np = n;
1340 return true;
1341}
1342
1343// Parses a repetition suffix like {1,2} or {2} or {2,}.
1344// Sets *s to span the remainder of the string on success.
1345// Sets *lo and *hi to the given range.
1346// In the case of {2,}, the high number is unbounded;
1347// sets *hi to -1 to signify this.
1348// {,2} is NOT a valid suffix.
1349// The Maybe in the name signifies that the regexp parse
1350// doesn't fail even if ParseRepetition does, so the string_view
1351// s must NOT be edited unless MaybeParseRepetition returns true.
1352static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) {
1353 absl::string_view s = *sp;
1354 if (s.empty() || s[0] != '{')
1355 return false;
1356 s.remove_prefix(1); // '{'
1357 if (!ParseInteger(&s, lo))
1358 return false;
1359 if (s.empty())
1360 return false;
1361 if (s[0] == ',') {
1362 s.remove_prefix(1); // ','
1363 if (s.empty())
1364 return false;
1365 if (s[0] == '}') {
1366 // {2,} means at least 2
1367 *hi = -1;
1368 } else {
1369 // {2,4} means 2, 3, or 4.
1370 if (!ParseInteger(&s, hi))
1371 return false;
1372 }
1373 } else {
1374 // {2} means exactly two
1375 *hi = *lo;
1376 }
1377 if (s.empty() || s[0] != '}')
1378 return false;
1379 s.remove_prefix(1); // '}'
1380 *sp = s;
1381 return true;
1382}
1383
1384// Removes the next Rune from the string_view and stores it in *r.
1385// Returns number of bytes removed from sp.
1386// Behaves as though there is a terminating NUL at the end of sp.
1387// Argument order is backwards from usual Google style
1388// but consistent with chartorune.
1389static int StringViewToRune(Rune* r, absl::string_view* sp,
1390 RegexpStatus* status) {
1391 // fullrune() takes int, not size_t. However, it just looks
1392 // at the leading byte and treats any length >= 4 the same.
1393 if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) {
1394 int n = chartorune(r, sp->data());
1395 // Some copies of chartorune have a bug that accepts
1396 // encodings of values in (10FFFF, 1FFFFF] as valid.
1397 // Those values break the character class algorithm,
1398 // which assumes Runemax is the largest rune.
1399 if (*r > Runemax) {
1400 n = 1;
1401 *r = Runeerror;
1402 }
1403 if (!(n == 1 && *r == Runeerror)) { // no decoding error
1404 sp->remove_prefix(n);
1405 return n;
1406 }
1407 }
1408
1409 if (status != NULL) {
1410 status->set_code(kRegexpBadUTF8);
1411 status->set_error_arg(absl::string_view());
1412 }
1413 return -1;
1414}
1415
1416// Returns whether name is valid UTF-8.
1417// If not, sets status to kRegexpBadUTF8.
1418static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) {
1419 absl::string_view t = s;
1420 Rune r;
1421 while (!t.empty()) {
1422 if (StringViewToRune(&r, &t, status) < 0)
1423 return false;
1424 }
1425 return true;
1426}
1427
1428// Is c a hex digit?
1429static int IsHex(int c) {
1430 return ('0' <= c && c <= '9') ||
1431 ('A' <= c && c <= 'F') ||
1432 ('a' <= c && c <= 'f');
1433}
1434
1435// Convert hex digit to value.
1436static int UnHex(int c) {
1437 if ('0' <= c && c <= '9')
1438 return c - '0';
1439 if ('A' <= c && c <= 'F')
1440 return c - 'A' + 10;
1441 if ('a' <= c && c <= 'f')
1442 return c - 'a' + 10;
1443 LOG(DFATAL) << "Bad hex digit " << c;
1444 return 0;
1445}
1446
1447// Parse an escape sequence (e.g., \n, \{).
1448// Sets *s to span the remainder of the string.
1449// Sets *rp to the named character.
1450static bool ParseEscape(absl::string_view* s, Rune* rp,
1451 RegexpStatus* status, int rune_max) {
1452 const char* begin = s->data();
1453 if (s->empty() || (*s)[0] != '\\') {
1454 // Should not happen - caller always checks.
1455 status->set_code(kRegexpInternalError);
1456 status->set_error_arg(absl::string_view());
1457 return false;
1458 }
1459 if (s->size() == 1) {
1460 status->set_code(kRegexpTrailingBackslash);
1461 status->set_error_arg(absl::string_view());
1462 return false;
1463 }
1464 Rune c, c1;
1465 s->remove_prefix(1); // backslash
1466 if (StringViewToRune(&c, s, status) < 0)
1467 return false;
1468 int code;
1469 switch (c) {
1470 default:
1471 if (c < Runeself && !isalpha(c) && !isdigit(c)) {
1472 // Escaped non-word characters are always themselves.
1473 // PCRE is not quite so rigorous: it accepts things like
1474 // \q, but we don't. We once rejected \_, but too many
1475 // programs and people insist on using it, so allow \_.
1476 *rp = c;
1477 return true;
1478 }
1479 goto BadEscape;
1480
1481 // Octal escapes.
1482 case '1':
1483 case '2':
1484 case '3':
1485 case '4':
1486 case '5':
1487 case '6':
1488 case '7':
1489 // Single non-zero octal digit is a backreference; not supported.
1490 if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7')
1491 goto BadEscape;
1492 ABSL_FALLTHROUGH_INTENDED;
1493 case '0':
1494 // consume up to three octal digits; already have one.
1495 code = c - '0';
1496 if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') {
1497 code = code * 8 + c - '0';
1498 s->remove_prefix(1); // digit
1499 if (!s->empty()) {
1500 c = (*s)[0];
1501 if ('0' <= c && c <= '7') {
1502 code = code * 8 + c - '0';
1503 s->remove_prefix(1); // digit
1504 }
1505 }
1506 }
1507 if (code > rune_max)
1508 goto BadEscape;
1509 *rp = code;
1510 return true;
1511
1512 // Hexadecimal escapes
1513 case 'x':
1514 if (s->empty())
1515 goto BadEscape;
1516 if (StringViewToRune(&c, s, status) < 0)
1517 return false;
1518 if (c == '{') {
1519 // Any number of digits in braces.
1520 // Update n as we consume the string, so that
1521 // the whole thing gets shown in the error message.
1522 // Perl accepts any text at all; it ignores all text
1523 // after the first non-hex digit. We require only hex digits,
1524 // and at least one.
1525 if (StringViewToRune(&c, s, status) < 0)
1526 return false;
1527 int nhex = 0;
1528 code = 0;
1529 while (IsHex(c)) {
1530 nhex++;
1531 code = code * 16 + UnHex(c);
1532 if (code > rune_max)
1533 goto BadEscape;
1534 if (s->empty())
1535 goto BadEscape;
1536 if (StringViewToRune(&c, s, status) < 0)
1537 return false;
1538 }
1539 if (c != '}' || nhex == 0)
1540 goto BadEscape;
1541 *rp = code;
1542 return true;
1543 }
1544 // Easy case: two hex digits.
1545 if (s->empty())
1546 goto BadEscape;
1547 if (StringViewToRune(&c1, s, status) < 0)
1548 return false;
1549 if (!IsHex(c) || !IsHex(c1))
1550 goto BadEscape;
1551 *rp = UnHex(c) * 16 + UnHex(c1);
1552 return true;
1553
1554 // C escapes.
1555 case 'n':
1556 *rp = '\n';
1557 return true;
1558 case 'r':
1559 *rp = '\r';
1560 return true;
1561 case 't':
1562 *rp = '\t';
1563 return true;
1564
1565 // Less common C escapes.
1566 case 'a':
1567 *rp = '\a';
1568 return true;
1569 case 'f':
1570 *rp = '\f';
1571 return true;
1572 case 'v':
1573 *rp = '\v';
1574 return true;
1575
1576 // This code is disabled to avoid misparsing
1577 // the Perl word-boundary \b as a backspace
1578 // when in POSIX regexp mode. Surprisingly,
1579 // in Perl, \b means word-boundary but [\b]
1580 // means backspace. We don't support that:
1581 // if you want a backspace embed a literal
1582 // backspace character or use \x08.
1583 //
1584 // case 'b':
1585 // *rp = '\b';
1586 // return true;
1587 }
1588
1589 LOG(DFATAL) << "Not reached in ParseEscape.";
1590
1591BadEscape:
1592 // Unrecognized escape sequence.
1593 status->set_code(kRegexpBadEscape);
1594 status->set_error_arg(
1595 absl::string_view(begin, static_cast<size_t>(s->data() - begin)));
1596 return false;
1597}
1598
1599// Add a range to the character class, but exclude newline if asked.
1600// Also handle case folding.
1601void CharClassBuilder::AddRangeFlags(
1602 Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
1603
1604 // Take out \n if the flags say so.
1605 bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1606 (parse_flags & Regexp::NeverNL);
1607 if (cutnl && lo <= '\n' && '\n' <= hi) {
1608 if (lo < '\n')
1609 AddRangeFlags(lo, '\n' - 1, parse_flags);
1610 if (hi > '\n')
1611 AddRangeFlags('\n' + 1, hi, parse_flags);
1612 return;
1613 }
1614
1615 // If folding case, add fold-equivalent characters too.
1616 if (parse_flags & Regexp::FoldCase)
1617 AddFoldedRange(this, lo, hi, 0);
1618 else
1619 AddRange(lo, hi);
1620}
1621
1622// Look for a group with the given name.
1623static const UGroup* LookupGroup(absl::string_view name,
1624 const UGroup* groups, int ngroups) {
1625 // Simple name lookup.
1626 for (int i = 0; i < ngroups; i++)
1627 if (absl::string_view(groups[i].name) == name)
1628 return &groups[i];
1629 return NULL;
1630}
1631
1632// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
1633static const UGroup* LookupPosixGroup(absl::string_view name) {
1634 return LookupGroup(name, posix_groups, num_posix_groups);
1635}
1636
1637static const UGroup* LookupPerlGroup(absl::string_view name) {
1638 return LookupGroup(name, perl_groups, num_perl_groups);
1639}
1640
1641#if !defined(RE2_USE_ICU)
1642// Fake UGroup containing all Runes
1643static URange16 any16[] = { { 0, 65535 } };
1644static URange32 any32[] = { { 65536, Runemax } };
1645static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
1646
1647// Look for a Unicode group with the given name (e.g., "Han")
1648static const UGroup* LookupUnicodeGroup(absl::string_view name) {
1649 // Special case: "Any" means any.
1650 if (name == absl::string_view("Any"))
1651 return &anygroup;
1652 return LookupGroup(name, unicode_groups, num_unicode_groups);
1653}
1654#endif
1655
1656// Add a UGroup or its negation to the character class.
1657static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign,
1658 Regexp::ParseFlags parse_flags) {
1659 if (sign == +1) {
1660 for (int i = 0; i < g->nr16; i++) {
1661 cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
1662 }
1663 for (int i = 0; i < g->nr32; i++) {
1664 cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
1665 }
1666 } else {
1667 if (parse_flags & Regexp::FoldCase) {
1668 // Normally adding a case-folded group means
1669 // adding all the extra fold-equivalent runes too.
1670 // But if we're adding the negation of the group,
1671 // we have to exclude all the runes that are fold-equivalent
1672 // to what's already missing. Too hard, so do in two steps.
1673 CharClassBuilder ccb1;
1674 AddUGroup(&ccb1, g, +1, parse_flags);
1675 // If the flags say to take out \n, put it in, so that negating will take it out.
1676 // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
1677 bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1678 (parse_flags & Regexp::NeverNL);
1679 if (cutnl) {
1680 ccb1.AddRange('\n', '\n');
1681 }
1682 ccb1.Negate();
1683 cc->AddCharClass(&ccb1);
1684 return;
1685 }
1686 int next = 0;
1687 for (int i = 0; i < g->nr16; i++) {
1688 if (next < g->r16[i].lo)
1689 cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
1690 next = g->r16[i].hi + 1;
1691 }
1692 for (int i = 0; i < g->nr32; i++) {
1693 if (next < g->r32[i].lo)
1694 cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
1695 next = g->r32[i].hi + 1;
1696 }
1697 if (next <= Runemax)
1698 cc->AddRangeFlags(next, Runemax, parse_flags);
1699 }
1700}
1701
1702// Maybe parse a Perl character class escape sequence.
1703// Only recognizes the Perl character classes (\d \s \w \D \S \W),
1704// not the Perl empty-string classes (\b \B \A \Z \z).
1705// On success, sets *s to span the remainder of the string
1706// and returns the corresponding UGroup.
1707// The string_view must *NOT* be edited unless the call succeeds.
1708const UGroup* MaybeParsePerlCCEscape(absl::string_view* s,
1709 Regexp::ParseFlags parse_flags) {
1710 if (!(parse_flags & Regexp::PerlClasses))
1711 return NULL;
1712 if (s->size() < 2 || (*s)[0] != '\\')
1713 return NULL;
1714 // Could use StringViewToRune, but there aren't
1715 // any non-ASCII Perl group names.
1716 absl::string_view name(s->data(), 2);
1717 const UGroup* g = LookupPerlGroup(name);
1718 if (g == NULL)
1719 return NULL;
1720 s->remove_prefix(name.size());
1721 return g;
1722}
1723
1724enum ParseStatus {
1725 kParseOk, // Did some parsing.
1726 kParseError, // Found an error.
1727 kParseNothing, // Decided not to parse.
1728};
1729
1730// Maybe parses a Unicode character group like \p{Han} or \P{Han}
1731// (the latter is a negated group).
1732ParseStatus ParseUnicodeGroup(absl::string_view* s,
1733 Regexp::ParseFlags parse_flags,
1734 CharClassBuilder* cc, RegexpStatus* status) {
1735 // Decide whether to parse.
1736 if (!(parse_flags & Regexp::UnicodeGroups))
1737 return kParseNothing;
1738 if (s->size() < 2 || (*s)[0] != '\\')
1739 return kParseNothing;
1740 Rune c = (*s)[1];
1741 if (c != 'p' && c != 'P')
1742 return kParseNothing;
1743
1744 // Committed to parse. Results:
1745 int sign = +1; // -1 = negated char class
1746 if (c == 'P')
1747 sign = -sign;
1748 absl::string_view seq = *s; // \p{Han} or \pL
1749 absl::string_view name; // Han or L
1750 s->remove_prefix(2); // '\\', 'p'
1751
1752 if (!StringViewToRune(&c, s, status))
1753 return kParseError;
1754 if (c != '{') {
1755 // Name is the bit of string we just skipped over for c.
1756 const char* p = seq.data() + 2;
1757 name = absl::string_view(p, static_cast<size_t>(s->data() - p));
1758 } else {
1759 // Name is in braces. Look for closing }
1760 size_t end = s->find('}', 0);
1761 if (end == absl::string_view::npos) {
1762 if (!IsValidUTF8(seq, status))
1763 return kParseError;
1764 status->set_code(kRegexpBadCharRange);
1765 status->set_error_arg(seq);
1766 return kParseError;
1767 }
1768 name = absl::string_view(s->data(), end); // without '}'
1769 s->remove_prefix(end + 1); // with '}'
1770 if (!IsValidUTF8(name, status))
1771 return kParseError;
1772 }
1773
1774 // Chop seq where s now begins.
1775 seq = absl::string_view(seq.data(), static_cast<size_t>(s->data() - seq.data()));
1776
1777 if (!name.empty() && name[0] == '^') {
1778 sign = -sign;
1779 name.remove_prefix(1); // '^'
1780 }
1781
1782#if !defined(RE2_USE_ICU)
1783 // Look up the group in the RE2 Unicode data.
1784 const UGroup* g = LookupUnicodeGroup(name);
1785 if (g == NULL) {
1786 status->set_code(kRegexpBadCharRange);
1787 status->set_error_arg(seq);
1788 return kParseError;
1789 }
1790
1791 AddUGroup(cc, g, sign, parse_flags);
1792#else
1793 // Look up the group in the ICU Unicode data. Because ICU provides full
1794 // Unicode properties support, this could be more than a lookup by name.
1795 ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
1796 std::string("\\p{") + std::string(name) + std::string("}"));
1797 UErrorCode uerr = U_ZERO_ERROR;
1798 ::icu::UnicodeSet uset(ustr, uerr);
1799 if (U_FAILURE(uerr)) {
1800 status->set_code(kRegexpBadCharRange);
1801 status->set_error_arg(seq);
1802 return kParseError;
1803 }
1804
1805 // Convert the UnicodeSet to a URange32 and UGroup that we can add.
1806 int nr = uset.getRangeCount();
1807 PODArray<URange32> r(nr);
1808 for (int i = 0; i < nr; i++) {
1809 r[i].lo = uset.getRangeStart(i);
1810 r[i].hi = uset.getRangeEnd(i);
1811 }
1812 UGroup g = {"", +1, 0, 0, r.data(), nr};
1813 AddUGroup(cc, &g, sign, parse_flags);
1814#endif
1815
1816 return kParseOk;
1817}
1818
1819// Parses a character class name like [:alnum:].
1820// Sets *s to span the remainder of the string.
1821// Adds the ranges corresponding to the class to ranges.
1822static ParseStatus ParseCCName(absl::string_view* s,
1823 Regexp::ParseFlags parse_flags,
1824 CharClassBuilder* cc, RegexpStatus* status) {
1825 // Check begins with [:
1826 const char* p = s->data();
1827 const char* ep = s->data() + s->size();
1828 if (ep - p < 2 || p[0] != '[' || p[1] != ':')
1829 return kParseNothing;
1830
1831 // Look for closing :].
1832 const char* q;
1833 for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)
1834 ;
1835
1836 // If no closing :], then ignore.
1837 if (q > ep-2)
1838 return kParseNothing;
1839
1840 // Got it. Check that it's valid.
1841 q += 2;
1842 absl::string_view name(p, static_cast<size_t>(q - p));
1843
1844 const UGroup* g = LookupPosixGroup(name);
1845 if (g == NULL) {
1846 status->set_code(kRegexpBadCharRange);
1847 status->set_error_arg(name);
1848 return kParseError;
1849 }
1850
1851 s->remove_prefix(name.size());
1852 AddUGroup(cc, g, g->sign, parse_flags);
1853 return kParseOk;
1854}
1855
1856// Parses a character inside a character class.
1857// There are fewer special characters here than in the rest of the regexp.
1858// Sets *s to span the remainder of the string.
1859// Sets *rp to the character.
1860bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp,
1861 absl::string_view whole_class,
1862 RegexpStatus* status) {
1863 if (s->empty()) {
1864 status->set_code(kRegexpMissingBracket);
1865 status->set_error_arg(whole_class);
1866 return false;
1867 }
1868
1869 // Allow regular escape sequences even though
1870 // many need not be escaped in this context.
1871 if ((*s)[0] == '\\')
1872 return ParseEscape(s, rp, status, rune_max_);
1873
1874 // Otherwise take the next rune.
1875 return StringViewToRune(rp, s, status) >= 0;
1876}
1877
1878// Parses a character class character, or, if the character
1879// is followed by a hyphen, parses a character class range.
1880// For single characters, rr->lo == rr->hi.
1881// Sets *s to span the remainder of the string.
1882// Sets *rp to the character.
1883bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr,
1884 absl::string_view whole_class,
1885 RegexpStatus* status) {
1886 absl::string_view os = *s;
1887 if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
1888 return false;
1889 // [a-] means (a|-), so check for final ].
1890 if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') {
1891 s->remove_prefix(1); // '-'
1892 if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
1893 return false;
1894 if (rr->hi < rr->lo) {
1895 status->set_code(kRegexpBadCharRange);
1896 status->set_error_arg(absl::string_view(
1897 os.data(), static_cast<size_t>(s->data() - os.data())));
1898 return false;
1899 }
1900 } else {
1901 rr->hi = rr->lo;
1902 }
1903 return true;
1904}
1905
1906// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1907// Sets *s to span the remainder of the string.
1908// Sets *out_re to the regexp for the class.
1909bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re,
1910 RegexpStatus* status) {
1911 absl::string_view whole_class = *s;
1912 if (s->empty() || (*s)[0] != '[') {
1913 // Caller checked this.
1914 status->set_code(kRegexpInternalError);
1915 status->set_error_arg(absl::string_view());
1916 return false;
1917 }
1918 bool negated = false;
1919 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
1920 re->ccb_ = new CharClassBuilder;
1921 s->remove_prefix(1); // '['
1922 if (!s->empty() && (*s)[0] == '^') {
1923 s->remove_prefix(1); // '^'
1924 negated = true;
1925 if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
1926 // If NL can't match implicitly, then pretend
1927 // negated classes include a leading \n.
1928 re->ccb_->AddRange('\n', '\n');
1929 }
1930 }
1931 bool first = true; // ] is okay as first char in class
1932 while (!s->empty() && ((*s)[0] != ']' || first)) {
1933 // - is only okay unescaped as first or last in class.
1934 // Except that Perl allows - anywhere.
1935 if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
1936 (s->size() == 1 || (*s)[1] != ']')) {
1937 absl::string_view t = *s;
1938 t.remove_prefix(1); // '-'
1939 Rune r;
1940 int n = StringViewToRune(&r, &t, status);
1941 if (n < 0) {
1942 re->Decref();
1943 return false;
1944 }
1945 status->set_code(kRegexpBadCharRange);
1946 status->set_error_arg(absl::string_view(s->data(), 1+n));
1947 re->Decref();
1948 return false;
1949 }
1950 first = false;
1951
1952 // Look for [:alnum:] etc.
1953 if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
1954 switch (ParseCCName(s, flags_, re->ccb_, status)) {
1955 case kParseOk:
1956 continue;
1957 case kParseError:
1958 re->Decref();
1959 return false;
1960 case kParseNothing:
1961 break;
1962 }
1963 }
1964
1965 // Look for Unicode character group like \p{Han}
1966 if (s->size() > 2 &&
1967 (*s)[0] == '\\' &&
1968 ((*s)[1] == 'p' || (*s)[1] == 'P')) {
1969 switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
1970 case kParseOk:
1971 continue;
1972 case kParseError:
1973 re->Decref();
1974 return false;
1975 case kParseNothing:
1976 break;
1977 }
1978 }
1979
1980 // Look for Perl character class symbols (extension).
1981 const UGroup* g = MaybeParsePerlCCEscape(s, flags_);
1982 if (g != NULL) {
1983 AddUGroup(re->ccb_, g, g->sign, flags_);
1984 continue;
1985 }
1986
1987 // Otherwise assume single character or simple range.
1988 RuneRange rr;
1989 if (!ParseCCRange(s, &rr, whole_class, status)) {
1990 re->Decref();
1991 return false;
1992 }
1993 // AddRangeFlags is usually called in response to a class like
1994 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
1995 // Regexp::ClassNL is set. In an explicit range or singleton
1996 // like we just parsed, we do not filter \n out, so set ClassNL
1997 // in the flags.
1998 re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
1999 }
2000 if (s->empty()) {
2001 status->set_code(kRegexpMissingBracket);
2002 status->set_error_arg(whole_class);
2003 re->Decref();
2004 return false;
2005 }
2006 s->remove_prefix(1); // ']'
2007
2008 if (negated)
2009 re->ccb_->Negate();
2010
2011 *out_re = re;
2012 return true;
2013}
2014
2015// Returns whether name is a valid capture name.
2016static bool IsValidCaptureName(absl::string_view name) {
2017 if (name.empty())
2018 return false;
2019
2020 // Historically, we effectively used [0-9A-Za-z_]+ to validate; that
2021 // followed Python 2 except for not restricting the first character.
2022 // As of Python 3, Unicode characters beyond ASCII are also allowed;
2023 // accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and
2024 // Pc categories, but again without restricting the first character.
2025 // Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3
2026 // performs it for identifiers, but seemingly not for capture names;
2027 // if they start doing that for capture names, we won't follow suit.
2028 static const CharClass* const cc = []() {
2029 CharClassBuilder ccb;
2030 for (absl::string_view group :
2031 {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"})
2032 AddUGroup(&ccb, LookupGroup(group, unicode_groups, num_unicode_groups),
2033 +1, Regexp::NoParseFlags);
2034 return ccb.GetCharClass();
2035 }();
2036
2037 absl::string_view t = name;
2038 Rune r;
2039 while (!t.empty()) {
2040 if (StringViewToRune(&r, &t, NULL) < 0)
2041 return false;
2042 if (cc->Contains(r))
2043 continue;
2044 return false;
2045 }
2046 return true;
2047}
2048
2049// Parses a Perl flag setting or non-capturing group or both,
2050// like (?i) or (?: or (?i:. Removes from s, updates parse state.
2051// The caller must check that s begins with "(?".
2052// Returns true on success. If the Perl flag is not
2053// well-formed or not supported, sets status_ and returns false.
2054bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
2055 absl::string_view t = *s;
2056
2057 // Caller is supposed to check this.
2058 if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
2059 LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
2060 status_->set_code(kRegexpInternalError);
2061 return false;
2062 }
2063
2064 t.remove_prefix(2); // "(?"
2065
2066 // Check for named captures, first introduced in Python's regexp library.
2067 // As usual, there are three slightly different syntaxes:
2068 //
2069 // (?P<name>expr) the original, introduced by Python
2070 // (?<name>expr) the .NET alteration, adopted by Perl 5.10
2071 // (?'name'expr) another .NET alteration, adopted by Perl 5.10
2072 //
2073 // Perl 5.10 gave in and implemented the Python version too,
2074 // but they claim that the last two are the preferred forms.
2075 // PCRE and languages based on it (specifically, PHP and Ruby)
2076 // support all three as well. EcmaScript 4 uses only the Python form.
2077 //
2078 // In both the open source world (via Code Search) and the
2079 // Google source tree, (?P<expr>name) is the dominant form,
2080 // so that's the one we implement. One is enough.
2081 if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
2082 // Pull out name.
2083 size_t end = t.find('>', 2);
2084 if (end == absl::string_view::npos) {
2085 if (!IsValidUTF8(*s, status_))
2086 return false;
2087 status_->set_code(kRegexpBadNamedCapture);
2088 status_->set_error_arg(*s);
2089 return false;
2090 }
2091
2092 // t is "P<name>...", t[end] == '>'
2093 absl::string_view capture(t.data()-2, end+3); // "(?P<name>"
2094 absl::string_view name(t.data()+2, end-2); // "name"
2095 if (!IsValidUTF8(name, status_))
2096 return false;
2097 if (!IsValidCaptureName(name)) {
2098 status_->set_code(kRegexpBadNamedCapture);
2099 status_->set_error_arg(capture);
2100 return false;
2101 }
2102
2103 if (!DoLeftParen(name)) {
2104 // DoLeftParen's failure set status_.
2105 return false;
2106 }
2107
2108 s->remove_prefix(
2109 static_cast<size_t>(capture.data() + capture.size() - s->data()));
2110 return true;
2111 }
2112
2113 bool negated = false;
2114 bool sawflags = false;
2115 int nflags = flags_;
2116 Rune c;
2117 for (bool done = false; !done; ) {
2118 if (t.empty())
2119 goto BadPerlOp;
2120 if (StringViewToRune(&c, &t, status_) < 0)
2121 return false;
2122 switch (c) {
2123 default:
2124 goto BadPerlOp;
2125
2126 // Parse flags.
2127 case 'i':
2128 sawflags = true;
2129 if (negated)
2130 nflags &= ~FoldCase;
2131 else
2132 nflags |= FoldCase;
2133 break;
2134
2135 case 'm': // opposite of our OneLine
2136 sawflags = true;
2137 if (negated)
2138 nflags |= OneLine;
2139 else
2140 nflags &= ~OneLine;
2141 break;
2142
2143 case 's':
2144 sawflags = true;
2145 if (negated)
2146 nflags &= ~DotNL;
2147 else
2148 nflags |= DotNL;
2149 break;
2150
2151 case 'U':
2152 sawflags = true;
2153 if (negated)
2154 nflags &= ~NonGreedy;
2155 else
2156 nflags |= NonGreedy;
2157 break;
2158
2159 // Negation
2160 case '-':
2161 if (negated)
2162 goto BadPerlOp;
2163 negated = true;
2164 sawflags = false;
2165 break;
2166
2167 // Open new group.
2168 case ':':
2169 if (!DoLeftParenNoCapture()) {
2170 // DoLeftParenNoCapture's failure set status_.
2171 return false;
2172 }
2173 done = true;
2174 break;
2175
2176 // Finish flags.
2177 case ')':
2178 done = true;
2179 break;
2180 }
2181 }
2182
2183 if (negated && !sawflags)
2184 goto BadPerlOp;
2185
2186 flags_ = static_cast<Regexp::ParseFlags>(nflags);
2187 *s = t;
2188 return true;
2189
2190BadPerlOp:
2191 status_->set_code(kRegexpBadPerlOp);
2192 status_->set_error_arg(
2193 absl::string_view(s->data(), static_cast<size_t>(t.data() - s->data())));
2194 return false;
2195}
2196
2197// Converts latin1 (assumed to be encoded as Latin1 bytes)
2198// into UTF8 encoding in string.
2199// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2200// deprecated and because it rejects code points 0x80-0x9F.
2201void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) {
2202 char buf[UTFmax];
2203
2204 utf->clear();
2205 for (size_t i = 0; i < latin1.size(); i++) {
2206 Rune r = latin1[i] & 0xFF;
2207 int n = runetochar(buf, &r);
2208 utf->append(buf, n);
2209 }
2210}
2211
2212// Parses the regular expression given by s,
2213// returning the corresponding Regexp tree.
2214// The caller must Decref the return value when done with it.
2215// Returns NULL on error.
2216Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags,
2217 RegexpStatus* status) {
2218 // Make status non-NULL (easier on everyone else).
2219 RegexpStatus xstatus;
2220 if (status == NULL)
2221 status = &xstatus;
2222
2223 ParseState ps(global_flags, s, status);
2224 absl::string_view t = s;
2225
2226 // Convert regexp to UTF-8 (easier on the rest of the parser).
2227 if (global_flags & Latin1) {
2228 std::string* tmp = new std::string;
2229 ConvertLatin1ToUTF8(t, tmp);
2230 status->set_tmp(tmp);
2231 t = *tmp;
2232 }
2233
2234 if (global_flags & Literal) {
2235 // Special parse loop for literal string.
2236 while (!t.empty()) {
2237 Rune r;
2238 if (StringViewToRune(&r, &t, status) < 0)
2239 return NULL;
2240 if (!ps.PushLiteral(r))
2241 return NULL;
2242 }
2243 return ps.DoFinish();
2244 }
2245
2246 absl::string_view lastunary = absl::string_view();
2247 while (!t.empty()) {
2248 absl::string_view isunary = absl::string_view();
2249 switch (t[0]) {
2250 default: {
2251 Rune r;
2252 if (StringViewToRune(&r, &t, status) < 0)
2253 return NULL;
2254 if (!ps.PushLiteral(r))
2255 return NULL;
2256 break;
2257 }
2258
2259 case '(':
2260 // "(?" introduces Perl escape.
2261 if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {
2262 // Flag changes and non-capturing groups.
2263 if (!ps.ParsePerlFlags(&t))
2264 return NULL;
2265 break;
2266 }
2267 if (ps.flags() & NeverCapture) {
2268 if (!ps.DoLeftParenNoCapture())
2269 return NULL;
2270 } else {
2271 if (!ps.DoLeftParen(absl::string_view()))
2272 return NULL;
2273 }
2274 t.remove_prefix(1); // '('
2275 break;
2276
2277 case '|':
2278 if (!ps.DoVerticalBar())
2279 return NULL;
2280 t.remove_prefix(1); // '|'
2281 break;
2282
2283 case ')':
2284 if (!ps.DoRightParen())
2285 return NULL;
2286 t.remove_prefix(1); // ')'
2287 break;
2288
2289 case '^': // Beginning of line.
2290 if (!ps.PushCaret())
2291 return NULL;
2292 t.remove_prefix(1); // '^'
2293 break;
2294
2295 case '$': // End of line.
2296 if (!ps.PushDollar())
2297 return NULL;
2298 t.remove_prefix(1); // '$'
2299 break;
2300
2301 case '.': // Any character (possibly except newline).
2302 if (!ps.PushDot())
2303 return NULL;
2304 t.remove_prefix(1); // '.'
2305 break;
2306
2307 case '[': { // Character class.
2308 Regexp* re;
2309 if (!ps.ParseCharClass(&t, &re, status))
2310 return NULL;
2311 if (!ps.PushRegexp(re))
2312 return NULL;
2313 break;
2314 }
2315
2316 case '*': { // Zero or more.
2317 RegexpOp op;
2318 op = kRegexpStar;
2319 goto Rep;
2320 case '+': // One or more.
2321 op = kRegexpPlus;
2322 goto Rep;
2323 case '?': // Zero or one.
2324 op = kRegexpQuest;
2325 goto Rep;
2326 Rep:
2327 absl::string_view opstr = t;
2328 bool nongreedy = false;
2329 t.remove_prefix(1); // '*' or '+' or '?'
2330 if (ps.flags() & PerlX) {
2331 if (!t.empty() && t[0] == '?') {
2332 nongreedy = true;
2333 t.remove_prefix(1); // '?'
2334 }
2335 if (!lastunary.empty()) {
2336 // In Perl it is not allowed to stack repetition operators:
2337 // a** is a syntax error, not a double-star.
2338 // (and a++ means something else entirely, which we don't support!)
2339 status->set_code(kRegexpRepeatOp);
2340 status->set_error_arg(absl::string_view(
2341 lastunary.data(),
2342 static_cast<size_t>(t.data() - lastunary.data())));
2343 return NULL;
2344 }
2345 }
2346 opstr = absl::string_view(opstr.data(),
2347 static_cast<size_t>(t.data() - opstr.data()));
2348 if (!ps.PushRepeatOp(op, opstr, nongreedy))
2349 return NULL;
2350 isunary = opstr;
2351 break;
2352 }
2353
2354 case '{': { // Counted repetition.
2355 int lo, hi;
2356 absl::string_view opstr = t;
2357 if (!MaybeParseRepetition(&t, &lo, &hi)) {
2358 // Treat like a literal.
2359 if (!ps.PushLiteral('{'))
2360 return NULL;
2361 t.remove_prefix(1); // '{'
2362 break;
2363 }
2364 bool nongreedy = false;
2365 if (ps.flags() & PerlX) {
2366 if (!t.empty() && t[0] == '?') {
2367 nongreedy = true;
2368 t.remove_prefix(1); // '?'
2369 }
2370 if (!lastunary.empty()) {
2371 // Not allowed to stack repetition operators.
2372 status->set_code(kRegexpRepeatOp);
2373 status->set_error_arg(absl::string_view(
2374 lastunary.data(),
2375 static_cast<size_t>(t.data() - lastunary.data())));
2376 return NULL;
2377 }
2378 }
2379 opstr = absl::string_view(opstr.data(),
2380 static_cast<size_t>(t.data() - opstr.data()));
2381 if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
2382 return NULL;
2383 isunary = opstr;
2384 break;
2385 }
2386
2387 case '\\': { // Escaped character or Perl sequence.
2388 // \b and \B: word boundary or not
2389 if ((ps.flags() & Regexp::PerlB) &&
2390 t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) {
2391 if (!ps.PushWordBoundary(t[1] == 'b'))
2392 return NULL;
2393 t.remove_prefix(2); // '\\', 'b'
2394 break;
2395 }
2396
2397 if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {
2398 if (t[1] == 'A') {
2399 if (!ps.PushSimpleOp(kRegexpBeginText))
2400 return NULL;
2401 t.remove_prefix(2); // '\\', 'A'
2402 break;
2403 }
2404 if (t[1] == 'z') {
2405 if (!ps.PushSimpleOp(kRegexpEndText))
2406 return NULL;
2407 t.remove_prefix(2); // '\\', 'z'
2408 break;
2409 }
2410 // Do not recognize \Z, because this library can't
2411 // implement the exact Perl/PCRE semantics.
2412 // (This library treats "(?-m)$" as \z, even though
2413 // in Perl and PCRE it is equivalent to \Z.)
2414
2415 if (t[1] == 'C') { // \C: any byte [sic]
2416 if (!ps.PushSimpleOp(kRegexpAnyByte))
2417 return NULL;
2418 t.remove_prefix(2); // '\\', 'C'
2419 break;
2420 }
2421
2422 if (t[1] == 'Q') { // \Q ... \E: the ... is always literals
2423 t.remove_prefix(2); // '\\', 'Q'
2424 while (!t.empty()) {
2425 if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
2426 t.remove_prefix(2); // '\\', 'E'
2427 break;
2428 }
2429 Rune r;
2430 if (StringViewToRune(&r, &t, status) < 0)
2431 return NULL;
2432 if (!ps.PushLiteral(r))
2433 return NULL;
2434 }
2435 break;
2436 }
2437 }
2438
2439 if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) {
2440 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2441 re->ccb_ = new CharClassBuilder;
2442 switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
2443 case kParseOk:
2444 if (!ps.PushRegexp(re))
2445 return NULL;
2446 goto Break2;
2447 case kParseError:
2448 re->Decref();
2449 return NULL;
2450 case kParseNothing:
2451 re->Decref();
2452 break;
2453 }
2454 }
2455
2456 const UGroup* g = MaybeParsePerlCCEscape(&t, ps.flags());
2457 if (g != NULL) {
2458 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2459 re->ccb_ = new CharClassBuilder;
2460 AddUGroup(re->ccb_, g, g->sign, ps.flags());
2461 if (!ps.PushRegexp(re))
2462 return NULL;
2463 break;
2464 }
2465
2466 Rune r;
2467 if (!ParseEscape(&t, &r, status, ps.rune_max()))
2468 return NULL;
2469 if (!ps.PushLiteral(r))
2470 return NULL;
2471 break;
2472 }
2473 }
2474 Break2:
2475 lastunary = isunary;
2476 }
2477 return ps.DoFinish();
2478}
2479
2480} // namespace re2
2481