parse.cc source code [tensorflow/external/com_googlesource_code_re2/re2/parse.cc]

1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Regular expression parser.
6
7	// The parser is a simple precedence-based parser with a
8	// manual stack. The parsing work is done by the methods
9	// of the ParseState class. The Regexp::Parse function is
10	// essentially just a lexer that calls the ParseState method
11	// for each token.
12
13	// The parser recognizes POSIX extended regular expressions
14	// excluding backreferences, collating elements, and collating
15	// classes. It also allows the empty string as a regular expression
16	// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17	// See regexp.h for rationale.
18
19	#include <ctype.h>
20	#include <stddef.h>
21	#include <stdint.h>
22	#include <string.h>
23	#include <algorithm>
24	#include <map>
25	#include <string>
26	#include <vector>
27
28	#include "absl/base/macros.h"
29	#include "util/logging.h"
30	#include "util/utf.h"
31	#include "re2/pod_array.h"
32	#include "re2/regexp.h"
33	#include "re2/unicode_casefold.h"
34	#include "re2/unicode_groups.h"
35	#include "re2/walker-inl.h"
36
37	#if defined(RE2_USE_ICU)
38	#include "unicode/uniset.h"
39	#include "unicode/unistr.h"
40	#include "unicode/utypes.h"
41	#endif
42
43	namespace re2 {
44
45	// Controls the maximum repeat count permitted by the parser.
46	static int maximum_repeat_count = `1000`;
47
48	void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) {
49	maximum_repeat_count = i;
50	}
51
52	// Regular expression parse state.
53	// The list of parsed regexps so far is maintained as a vector of
54	// Regexp pointers called the stack. Left parenthesis and vertical
55	// bar markers are also placed on the stack, as Regexps with
56	// non-standard opcodes.
57	// Scanning a left parenthesis causes the parser to push a left parenthesis
58	// marker on the stack.
59	// Scanning a vertical bar causes the parser to pop the stack until it finds a
60	// vertical bar or left parenthesis marker (not popping the marker),
61	// concatenate all the popped results, and push them back on
62	// the stack (DoConcatenation).
63	// Scanning a right parenthesis causes the parser to act as though it
64	// has seen a vertical bar, which then leaves the top of the stack in the
65	// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
66	// The parser pops all this off the stack and creates an alternation of the
67	// regexps (DoAlternation).
68
69	class Regexp::ParseState {
70	public:
71	ParseState(ParseFlags flags, absl::string_view whole_regexp,
72	RegexpStatus* status);
73	~ParseState();
74
75	ParseFlags flags() { return flags_; }
76	int rune_max() { return rune_max_; }
77
78	// Parse methods. All public methods return a bool saying
79	// whether parsing should continue. If a method returns
80	// false, it has set fields in status_, and the parser*
81	// should return NULL.
82
83	// Pushes the given regular expression onto the stack.
84	// Could check for too much memory used here.
85	bool PushRegexp(Regexp* re);
86
87	// Pushes the literal rune r onto the stack.
88	bool PushLiteral(Rune r);
89
90	// Pushes a regexp with the given op (and no args) onto the stack.
91	bool PushSimpleOp(RegexpOp op);
92
93	// Pushes a ^ onto the stack.
94	bool PushCaret();
95
96	// Pushes a \b (word == true) or \B (word == false) onto the stack.
97	bool PushWordBoundary(bool word);
98
99	// Pushes a $ onto the stack.
100	bool PushDollar();
101
102	// Pushes a . onto the stack
103	bool PushDot();
104
105	// Pushes a repeat operator regexp onto the stack.
106	// A valid argument for the operator must already be on the stack.
107	// s is the name of the operator, for use in error messages.
108	bool PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy);
109
110	// Pushes a repetition regexp onto the stack.
111	// A valid argument for the operator must already be on the stack.
112	bool PushRepetition(int min, int max, absl::string_view s, bool nongreedy);
113
114	// Checks whether a particular regexp op is a marker.
115	bool IsMarker(RegexpOp op);
116
117	// Processes a left parenthesis in the input.
118	// Pushes a marker onto the stack.
119	bool DoLeftParen(absl::string_view name);
120	bool DoLeftParenNoCapture();
121
122	// Processes a vertical bar in the input.
123	bool DoVerticalBar();
124
125	// Processes a right parenthesis in the input.
126	bool DoRightParen();
127
128	// Processes the end of input, returning the final regexp.
129	Regexp* DoFinish();
130
131	// Finishes the regexp if necessary, preparing it for use
132	// in a more complicated expression.
133	// If it is a CharClassBuilder, converts into a CharClass.
134	Regexp* FinishRegexp(Regexp*);
135
136	// These routines don't manipulate the parse stack
137	// directly, but they do need to look at flags_.
138	// ParseCharClass also manipulates the internals of Regexp
139	// while creating out_re.*
140
141	// Parse a character class into out_re.*
142	// Removes parsed text from s.
143	bool ParseCharClass(absl::string_view* s, Regexp** out_re,
144	RegexpStatus* status);
145
146	// Parse a character class character into rp.*
147	// Removes parsed text from s.
148	bool ParseCCCharacter(absl::string_view* s, Rune* rp,
149	absl::string_view whole_class,
150	RegexpStatus* status);
151
152	// Parse a character class range into rr.
153	// Removes parsed text from s.
154	bool ParseCCRange(absl::string_view* s, RuneRange* rr,
155	absl::string_view whole_class,
156	RegexpStatus* status);
157
158	// Parse a Perl flag set or non-capturing group from s.
159	bool ParsePerlFlags(absl::string_view* s);
160
161	// Finishes the current concatenation,
162	// collapsing it into a single regexp on the stack.
163	void DoConcatenation();
164
165	// Finishes the current alternation,
166	// collapsing it to a single regexp on the stack.
167	void DoAlternation();
168
169	// Generalized DoAlternation/DoConcatenation.
170	void DoCollapse(RegexpOp op);
171
172	// Maybe concatenate Literals into LiteralString.
173	bool MaybeConcatString(int r, ParseFlags flags);
174
175	private:
176	ParseFlags flags_;
177	absl::string_view whole_regexp_;
178	RegexpStatus* status_;
179	Regexp* stacktop_;
180	int ncap_; // number of capturing parens seen
181	int rune_max_; // maximum char value for this encoding
182
183	ParseState(const ParseState&) = delete;
184	ParseState& operator=(const ParseState&) = delete;
185	};
186
187	// Pseudo-operators - only on parse stack.
188	const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+`1`);
189	const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+`2`);
190
191	Regexp::ParseState::ParseState(ParseFlags flags,
192	absl::string_view whole_regexp,
193	RegexpStatus* status)
194	: flags_(flags), whole_regexp_(whole_regexp),
195	status_(status), stacktop_(NULL), ncap_(`0`) {
196	if (flags_ & Latin1)
197	rune_max_ = `0xFF`;
198	else
199	rune_max_ = Runemax;
200	}
201
202	// Cleans up by freeing all the regexps on the stack.
203	Regexp::ParseState::~ParseState() {
204	Regexp* next;
205	for (Regexp* re = stacktop_; re != NULL; re = next) {
206	next = re->down_;
207	re->down_ = NULL;
208	if (re->op() == kLeftParen)
209	delete re->name_;
210	re->Decref();
211	}
212	}
213
214	// Finishes the regexp if necessary, preparing it for use in
215	// a more complex expression.
216	// If it is a CharClassBuilder, converts into a CharClass.
217	Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
218	if (re == NULL)
219	return NULL;
220	re->down_ = NULL;
221
222	if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
223	CharClassBuilder* ccb = re->ccb_;
224	re->ccb_ = NULL;
225	re->cc_ = ccb->GetCharClass();
226	delete ccb;
227	}
228
229	return re;
230	}
231
232	// Pushes the given regular expression onto the stack.
233	// Could check for too much memory used here.
234	bool Regexp::ParseState::PushRegexp(Regexp* re) {
235	MaybeConcatString(-`1`, NoParseFlags);
236
237	// Special case: a character class of one character is just
238	// a literal. This is a common idiom for escaping
239	// single characters (e.g., [.] instead of \.), and some
240	// analysis does better with fewer character classes.
241	// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
242	if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
243	re->ccb_->RemoveAbove(rune_max_);
244	if (re->ccb_->size() == `1`) {
245	Rune r = re->ccb_->begin()->lo;
246	re->Decref();
247	re = new Regexp (kRegexpLiteral, flags_);
248	re->rune_ = r;
249	} else if (re->ccb_->size() == `2`) {
250	Rune r = re->ccb_->begin()->lo;
251	if (`'A'` <= r && r <= `'Z'` && re->ccb_->Contains(r + `'a'` - `'A'`)) {
252	re->Decref();
253	re = new Regexp (kRegexpLiteral, flags_ \| FoldCase);
254	re->rune_ = r + `'a'` - `'A'`;
255	}
256	}
257	}
258
259	if (!IsMarker(re->op()))
260	re->simple_ = re->ComputeSimple();
261	re->down_ = stacktop_;
262	stacktop_ = re;
263	return true;
264	}
265
266	// Searches the case folding tables and returns the CaseFold that contains r.*
267	// If there isn't one, returns the CaseFold with smallest f->lo bigger than r.*
268	// If there isn't one, returns NULL.
269	const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) {
270	const CaseFold* ef = f + n;
271
272	// Binary search for entry containing r.
273	while (n > `0`) {
274	int m = n/`2`;
275	if (f[m].lo <= r && r <= f[m].hi)
276	return &f[m];
277	if (r < f[m].lo) {
278	n = m;
279	} else {
280	f += m+`1`;
281	n -= m+`1`;
282	}
283	}
284
285	// There is no entry that contains r, but f points
286	// where it would have been. Unless f points at
287	// the end of the array, it points at the next entry
288	// after r.
289	if (f < ef)
290	return f;
291
292	// No entry contains r; no entry contains runes > r.
293	return NULL;
294	}
295
296	// Returns the result of applying the fold f to the rune r.
297	Rune ApplyFold(const CaseFold* f, Rune r) {
298	switch (f->delta) {
299	default:
300	return r + f->delta;
301
302	case EvenOddSkip: // even <-> odd but only applies to every other
303	if ((r - f->lo) % `2`)
304	return r;
305	ABSL_FALLTHROUGH_INTENDED;
306	case EvenOdd: // even <-> odd
307	if (r%`2` == `0`)
308	return r + `1`;
309	return r - `1`;
310
311	case OddEvenSkip: // odd <-> even but only applies to every other
312	if ((r - f->lo) % `2`)
313	return r;
314	ABSL_FALLTHROUGH_INTENDED;
315	case OddEven: // odd <-> even
316	if (r%`2` == `1`)
317	return r + `1`;
318	return r - `1`;
319	}
320	}
321
322	// Returns the next Rune in r's folding cycle (see unicode_casefold.h).
323	// Examples:
324	// CycleFoldRune('A') = 'a'
325	// CycleFoldRune('a') = 'A'
326	//
327	// CycleFoldRune('K') = 'k'
328	// CycleFoldRune('k') = 0x212A (Kelvin)
329	// CycleFoldRune(0x212A) = 'K'
330	//
331	// CycleFoldRune('?') = '?'
332	Rune CycleFoldRune(Rune r) {
333	const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
334	if (f == NULL \|\| r < f->lo)
335	return r;
336	return ApplyFold(f, r);
337	}
338
339	// Add lo-hi to the class, along with their fold-equivalent characters.
340	// If lo-hi is already in the class, assume that the fold-equivalent
341	// chars are there too, so there's no work to do.
342	static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
343	// AddFoldedRange calls itself recursively for each rune in the fold cycle.
344	// Most folding cycles are small: there aren't any bigger than four in the
345	// current Unicode tables. make_unicode_casefold.py checks that
346	// the cycles are not too long, and we double-check here using depth.
347	if (depth > `10`) {
348	LOG(DFATAL) << "AddFoldedRange recurses too much.";
349	return;
350	}
351
352	if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done
353	return;
354
355	while (lo <= hi) {
356	const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
357	if (f == NULL) // lo has no fold, nor does anything above lo
358	break;
359	if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
360	lo = f->lo;
361	continue;
362	}
363
364	// Add in the result of folding the range lo - f->hi
365	// and that range's fold, recursively.
366	Rune lo1 = lo;
367	Rune hi1 = std::min<Rune>(hi, f->hi);
368	switch (f->delta) {
369	default:
370	lo1 += f->delta;
371	hi1 += f->delta;
372	break;
373	case EvenOdd:
374	if (lo1%`2` == `1`)
375	lo1--;
376	if (hi1%`2` == `0`)
377	hi1++;
378	break;
379	case OddEven:
380	if (lo1%`2` == `0`)
381	lo1--;
382	if (hi1%`2` == `1`)
383	hi1++;
384	break;
385	}
386	AddFoldedRange(cc, lo1, hi1, depth+`1`);
387
388	// Pick up where this fold left off.
389	lo = f->hi + `1`;
390	}
391	}
392
393	// Pushes the literal rune r onto the stack.
394	bool Regexp::ParseState::PushLiteral(Rune r) {
395	// Do case folding if needed.
396	if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
397	Regexp* re = new Regexp (kRegexpCharClass, flags_ & ~FoldCase);
398	re->ccb_ = new CharClassBuilder;
399	Rune r1 = r;
400	do {
401	if (!(flags_ & NeverNL) \|\| r != `'\n'`) {
402	re->ccb_->AddRange(r, r);
403	}
404	r = CycleFoldRune(r);
405	} while (r != r1);
406	return PushRegexp(re);
407	}
408
409	// Exclude newline if applicable.
410	if ((flags_ & NeverNL) && r == `'\n'`)
411	return PushRegexp(new Regexp (kRegexpNoMatch, flags_));
412
413	// No fancy stuff worked. Ordinary literal.
414	if (MaybeConcatString(r, flags_))
415	return true;
416
417	Regexp* re = new Regexp (kRegexpLiteral, flags_);
418	re->rune_ = r;
419	return PushRegexp(re);
420	}
421
422	// Pushes a ^ onto the stack.
423	bool Regexp::ParseState::PushCaret() {
424	if (flags_ & OneLine) {
425	return PushSimpleOp(kRegexpBeginText);
426	}
427	return PushSimpleOp(kRegexpBeginLine);
428	}
429
430	// Pushes a \b or \B onto the stack.
431	bool Regexp::ParseState::PushWordBoundary(bool word) {
432	if (word)
433	return PushSimpleOp(kRegexpWordBoundary);
434	return PushSimpleOp(kRegexpNoWordBoundary);
435	}
436
437	// Pushes a $ onto the stack.
438	bool Regexp::ParseState::PushDollar() {
439	if (flags_ & OneLine) {
440	// Clumsy marker so that MimicsPCRE() can tell whether
441	// this kRegexpEndText was a $ and not a \z.
442	Regexp::ParseFlags oflags = flags_;
443	flags_ = flags_ \| WasDollar;
444	bool ret = PushSimpleOp(kRegexpEndText);
445	flags_ = oflags;
446	return ret;
447	}
448	return PushSimpleOp(kRegexpEndLine);
449	}
450
451	// Pushes a . onto the stack.
452	bool Regexp::ParseState::PushDot() {
453	if ((flags_ & DotNL) && !(flags_ & NeverNL))
454	return PushSimpleOp(kRegexpAnyChar);
455	// Rewrite . into [^\n]
456	Regexp* re = new Regexp (kRegexpCharClass, flags_ & ~FoldCase);
457	re->ccb_ = new CharClassBuilder;
458	re->ccb_->AddRange(`0`, `'\n'` - `1`);
459	re->ccb_->AddRange(`'\n'` + `1`, rune_max_);
460	return PushRegexp(re);
461	}
462
463	// Pushes a regexp with the given op (and no args) onto the stack.
464	bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
465	Regexp* re = new Regexp (op, flags_);
466	return PushRegexp(re);
467	}
468
469	// Pushes a repeat operator regexp onto the stack.
470	// A valid argument for the operator must already be on the stack.
471	// The char c is the name of the operator, for use in error messages.
472	bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s,
473	bool nongreedy) {
474	if (stacktop_ == NULL \|\| IsMarker(stacktop_->op())) {
475	status_->set_code(kRegexpRepeatArgument);
476	status_->set_error_arg(s);
477	return false;
478	}
479	Regexp::ParseFlags fl = flags_;
480	if (nongreedy)
481	fl = fl ^ NonGreedy;
482
483	// Squash , ++ and ??. Regexp::Star() et al. handle this too, but
484	// they're mostly for use during simplification, not during parsing.
485	if (op == stacktop_->op() && fl == stacktop_->parse_flags())
486	return true;
487
488	// Squash +, ?, +, +?, ?* and ?+. They all squash to , so because
489	// op is a repeat, we just have to check that stacktop_->op() is too,
490	// then adjust stacktop_.
491	if ((stacktop_->op() == kRegexpStar \|\|
492	stacktop_->op() == kRegexpPlus \|\|
493	stacktop_->op() == kRegexpQuest) &&
494	fl == stacktop_->parse_flags()) {
495	stacktop_->op_ = kRegexpStar;
496	return true;
497	}
498
499	Regexp* re = new Regexp (op, fl);
500	re->AllocSub(`1`);
501	re->down_ = stacktop_->down_;
502	re->sub()[`0`] = FinishRegexp(stacktop_);
503	re->simple_ = re->ComputeSimple();
504	stacktop_ = re;
505	return true;
506	}
507
508	// RepetitionWalker reports whether the repetition regexp is valid.
509	// Valid means that the combination of the top-level repetition
510	// and any inner repetitions does not exceed n copies of the
511	// innermost thing.
512	// This rewalks the regexp tree and is called for every repetition,
513	// so we have to worry about inducing quadratic behavior in the parser.
514	// We avoid this by only using RepetitionWalker when min or max >= 2.
515	// In that case the depth of any >= 2 nesting can only get to 9 without
516	// triggering a parse error, so each subtree can only be rewalked 9 times.
517	class RepetitionWalker : public Regexp::Walker<int> {
518	public:
519	RepetitionWalker() {}
520	virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
521	virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
522	int* child_args, int nchild_args);
523	virtual int ShortVisit(Regexp* re, int parent_arg);
524
525	private:
526	RepetitionWalker(const RepetitionWalker&) = delete;
527	RepetitionWalker& operator=(const RepetitionWalker&) = delete;
528	};
529
530	int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
531	int arg = parent_arg;
532	if (re->op() == kRegexpRepeat) {
533	int m = re->max();
534	if (m < `0`) {
535	m = re->min();
536	}
537	if (m > `0`) {
538	arg /= m;
539	}
540	}
541	return arg;
542	}
543
544	int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
545	int* child_args, int nchild_args) {
546	int arg = pre_arg;
547	for (int i = `0`; i < nchild_args; i++) {
548	if (child_args[i] < arg) {
549	arg = child_args[i];
550	}
551	}
552	return arg;
553	}
554
555	int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
556	// Should never be called: we use Walk(), not WalkExponential().
557	#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
558	LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
559	#endif
560	return `0`;
561	}
562
563	// Pushes a repetition regexp onto the stack.
564	// A valid argument for the operator must already be on the stack.
565	bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s,
566	bool nongreedy) {
567	if ((max != -`1` && max < min) \|\|
568	min > maximum_repeat_count \|\|
569	max > maximum_repeat_count) {
570	status_->set_code(kRegexpRepeatSize);
571	status_->set_error_arg(s);
572	return false;
573	}
574	if (stacktop_ == NULL \|\| IsMarker(stacktop_->op())) {
575	status_->set_code(kRegexpRepeatArgument);
576	status_->set_error_arg(s);
577	return false;
578	}
579	Regexp::ParseFlags fl = flags_;
580	if (nongreedy)
581	fl = fl ^ NonGreedy;
582	Regexp* re = new Regexp (kRegexpRepeat, fl);
583	re->min_ = min;
584	re->max_ = max;
585	re->AllocSub(`1`);
586	re->down_ = stacktop_->down_;
587	re->sub()[`0`] = FinishRegexp(stacktop_);
588	re->simple_ = re->ComputeSimple();
589	stacktop_ = re;
590	if (min >= `2` \|\| max >= `2`) {
591	RepetitionWalker w;
592	if (w.Walk(stacktop_, maximum_repeat_count) == `0`) {
593	status_->set_code(kRegexpRepeatSize);
594	status_->set_error_arg(s);
595	return false;
596	}
597	}
598	return true;
599	}
600
601	// Checks whether a particular regexp op is a marker.
602	bool Regexp::ParseState::IsMarker(RegexpOp op) {
603	return op >= kLeftParen;
604	}
605
606	// Processes a left parenthesis in the input.
607	// Pushes a marker onto the stack.
608	bool Regexp::ParseState::DoLeftParen(absl::string_view name) {
609	Regexp* re = new Regexp (kLeftParen, flags_);
610	re->cap_ = ++ncap_;
611	if (name.data() != NULL)
612	re->name_ = new std::string (name);
613	return PushRegexp(re);
614	}
615
616	// Pushes a non-capturing marker onto the stack.
617	bool Regexp::ParseState::DoLeftParenNoCapture() {
618	Regexp* re = new Regexp (kLeftParen, flags_);
619	re->cap_ = -`1`;
620	return PushRegexp(re);
621	}
622
623	// Processes a vertical bar in the input.
624	bool Regexp::ParseState::DoVerticalBar() {
625	MaybeConcatString(-`1`, NoParseFlags);
626	DoConcatenation();
627
628	// Below the vertical bar is a list to alternate.
629	// Above the vertical bar is a list to concatenate.
630	// We just did the concatenation, so either swap
631	// the result below the vertical bar or push a new
632	// vertical bar on the stack.
633	Regexp* r1;
634	Regexp* r2;
635	if ((r1 = stacktop_) != NULL &&
636	(r2 = r1->down_) != NULL &&
637	r2->op() == kVerticalBar) {
638	Regexp* r3;
639	if ((r3 = r2->down_) != NULL &&
640	(r1->op() == kRegexpAnyChar \|\| r3->op() == kRegexpAnyChar)) {
641	// AnyChar is above or below the vertical bar. Let it subsume
642	// the other when the other is Literal, CharClass or AnyChar.
643	if (r3->op() == kRegexpAnyChar &&
644	(r1->op() == kRegexpLiteral \|\|
645	r1->op() == kRegexpCharClass \|\|
646	r1->op() == kRegexpAnyChar)) {
647	// Discard r1.
648	stacktop_ = r2;
649	r1->Decref();
650	return true;
651	}
652	if (r1->op() == kRegexpAnyChar &&
653	(r3->op() == kRegexpLiteral \|\|
654	r3->op() == kRegexpCharClass \|\|
655	r3->op() == kRegexpAnyChar)) {
656	// Rearrange the stack and discard r3.
657	r1->down_ = r3->down_;
658	r2->down_ = r1;
659	stacktop_ = r2;
660	r3->Decref();
661	return true;
662	}
663	}
664	// Swap r1 below vertical bar (r2).
665	r1->down_ = r2->down_;
666	r2->down_ = r1;
667	stacktop_ = r2;
668	return true;
669	}
670	return PushSimpleOp(kVerticalBar);
671	}
672
673	// Processes a right parenthesis in the input.
674	bool Regexp::ParseState::DoRightParen() {
675	// Finish the current concatenation and alternation.
676	DoAlternation();
677
678	// The stack should be: LeftParen regexp
679	// Remove the LeftParen, leaving the regexp,
680	// parenthesized.
681	Regexp* r1;
682	Regexp* r2;
683	if ((r1 = stacktop_) == NULL \|\|
684	(r2 = r1->down_) == NULL \|\|
685	r2->op() != kLeftParen) {
686	status_->set_code(kRegexpUnexpectedParen);
687	status_->set_error_arg(whole_regexp_);
688	return false;
689	}
690
691	// Pop off r1, r2. Will Decref or reuse below.
692	stacktop_ = r2->down_;
693
694	// Restore flags from when paren opened.
695	Regexp* re = r2;
696	flags_ = re->parse_flags();
697
698	// Rewrite LeftParen as capture if needed.
699	if (re->cap_ > `0`) {
700	re->op_ = kRegexpCapture;
701	// re->cap_ is already set
702	re->AllocSub(`1`);
703	re->sub()[`0`] = FinishRegexp(r1);
704	re->simple_ = re->ComputeSimple();
705	} else {
706	re->Decref();
707	re = r1;
708	}
709	return PushRegexp(re);
710	}
711
712	// Processes the end of input, returning the final regexp.
713	Regexp* Regexp::ParseState::DoFinish() {
714	DoAlternation();
715	Regexp* re = stacktop_;
716	if (re != NULL && re->down_ != NULL) {
717	status_->set_code(kRegexpMissingParen);
718	status_->set_error_arg(whole_regexp_);
719	return NULL;
720	}
721	stacktop_ = NULL;
722	return FinishRegexp(re);
723	}
724
725	// Returns the leading regexp that re starts with.
726	// The returned Regexp points into a piece of re,*
727	// so it must not be used after the caller calls re->Decref().
728	Regexp* Regexp::LeadingRegexp(Regexp* re) {
729	if (re->op() == kRegexpEmptyMatch)
730	return NULL;
731	if (re->op() == kRegexpConcat && re->nsub() >= `2`) {
732	Regexp** sub = re->sub();
733	if (sub[`0`]->op() == kRegexpEmptyMatch)
734	return NULL;
735	return sub[`0`];
736	}
737	return re;
738	}
739
740	// Removes LeadingRegexp(re) from re and returns what's left.
741	// Consumes the reference to re and may edit it in place.
742	// If caller wants to hold on to LeadingRegexp(re),
743	// must have already Incref'ed it.
744	Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
745	if (re->op() == kRegexpEmptyMatch)
746	return re;
747	if (re->op() == kRegexpConcat && re->nsub() >= `2`) {
748	Regexp** sub = re->sub();
749	if (sub[`0`]->op() == kRegexpEmptyMatch)
750	return re;
751	sub[`0`]->Decref();
752	sub[`0`] = NULL;
753	if (re->nsub() == `2`) {
754	// Collapse concatenation to single regexp.
755	Regexp* nre = sub[`1`];
756	sub[`1`] = NULL;
757	re->Decref();
758	return nre;
759	}
760	// 3 or more -> 2 or more.
761	re->nsub_--;
762	memmove(sub, sub + `1`, re->nsub_ * sizeof sub[`0`]);
763	return re;
764	}
765	Regexp::ParseFlags pf = re->parse_flags();
766	re->Decref();
767	return new Regexp (kRegexpEmptyMatch, pf);
768	}
769
770	// Returns the leading string that re starts with.
771	// The returned Rune points into a piece of re,*
772	// so it must not be used after the caller calls re->Decref().
773	Rune* Regexp::LeadingString(Regexp* re, int* nrune,
774	Regexp::ParseFlags* flags) {
775	while (re->op() == kRegexpConcat && re->nsub() > `0`)
776	re = re->sub()[`0`];
777
778	flags = static_cast*<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);
779
780	if (re->op() == kRegexpLiteral) {
781	*nrune = `1`;
782	return &re->rune_;
783	}
784
785	if (re->op() == kRegexpLiteralString) {
786	*nrune = re->nrunes_;
787	return re->runes_;
788	}
789
790	*nrune = `0`;
791	return NULL;
792	}
793
794	// Removes the first n leading runes from the beginning of re.
795	// Edits re in place.
796	void Regexp::RemoveLeadingString(Regexp* re, int n) {
797	// Chase down concats to find first string.
798	// For regexps generated by parser, nested concats are
799	// flattened except when doing so would overflow the 16-bit
800	// limit on the size of a concatenation, so we should never
801	// see more than two here.
802	Regexp* stk[`4`];
803	size_t d = `0`;
804	while (re->op() == kRegexpConcat) {
805	if (d < ABSL_ARRAYSIZE(stk))
806	stk[d++] = re;
807	re = re->sub()[`0`];
808	}
809
810	// Remove leading string from re.
811	if (re->op() == kRegexpLiteral) {
812	re->rune_ = `0`;
813	re->op_ = kRegexpEmptyMatch;
814	} else if (re->op() == kRegexpLiteralString) {
815	if (n >= re->nrunes_) {
816	delete[] re->runes_;
817	re->runes_ = NULL;
818	re->nrunes_ = `0`;
819	re->op_ = kRegexpEmptyMatch;
820	} else if (n == re->nrunes_ - `1`) {
821	Rune rune = re->runes_[re->nrunes_ - `1`];
822	delete[] re->runes_;
823	re->runes_ = NULL;
824	re->nrunes_ = `0`;
825	re->rune_ = rune;
826	re->op_ = kRegexpLiteral;
827	} else {
828	re->nrunes_ -= n;
829	memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[`0`]);
830	}
831	}
832
833	// If re is now empty, concatenations might simplify too.
834	while (d > `0`) {
835	re = stk[--d];
836	Regexp** sub = re->sub();
837	if (sub[`0`]->op() == kRegexpEmptyMatch) {
838	sub[`0`]->Decref();
839	sub[`0`] = NULL;
840	// Delete first element of concat.
841	switch (re->nsub()) {
842	case `0`:
843	case `1`:
844	// Impossible.
845	LOG(DFATAL) << "Concat of " << re->nsub();
846	re->submany_ = NULL;
847	re->op_ = kRegexpEmptyMatch;
848	break;
849
850	case `2`: {
851	// Replace re with sub[1].
852	Regexp* old = sub[`1`];
853	sub[`1`] = NULL;
854	re->Swap(old);
855	old->Decref();
856	break;
857	}
858
859	default:
860	// Slide down.
861	re->nsub_--;
862	memmove(sub, sub + `1`, re->nsub_ * sizeof sub[`0`]);
863	break;
864	}
865	}
866	}
867	}
868
869	// In the context of factoring alternations, a Splice is: a factored prefix or
870	// merged character class computed by one iteration of one round of factoring;
871	// the span of subexpressions of the alternation to be "spliced" (i.e. removed
872	// and replaced); and, for a factored prefix, the number of suffixes after any
873	// factoring that might have subsequently been performed on them. For a merged
874	// character class, there are no suffixes, of course, so the field is ignored.
875	struct Splice {
876	Splice(Regexp* prefix, Regexp** sub, int nsub)
877	: prefix(prefix),
878	sub(sub),
879	nsub(nsub),
880	nsuffix(-`1`) {}
881
882	Regexp* prefix;
883	Regexp** sub;
884	int nsub;
885	int nsuffix;
886	};
887
888	// Named so because it is used to implement an explicit stack, a Frame is: the
889	// span of subexpressions of the alternation to be factored; the current round
890	// of factoring; any Splices computed; and, for a factored prefix, an iterator
891	// to the next Splice to be factored (i.e. in another Frame) because suffixes.
892	struct Frame {
893	Frame(Regexp** sub, int nsub)
894	: sub(sub),
895	nsub(nsub),
896	round(`0`) {}
897
898	Regexp** sub;
899	int nsub;
900	int round;
901	std::vector<Splice> splices;
902	int spliceidx;
903	};
904
905	// Bundled into a class for friend access to Regexp without needing to declare
906	// (or define) Splice in regexp.h.
907	class FactorAlternationImpl {
908	public:
909	static void Round1(Regexp** sub, int nsub,
910	Regexp::ParseFlags flags,
911	std::vector<Splice>* splices);
912	static void Round2(Regexp** sub, int nsub,
913	Regexp::ParseFlags flags,
914	std::vector<Splice>* splices);
915	static void Round3(Regexp** sub, int nsub,
916	Regexp::ParseFlags flags,
917	std::vector<Splice>* splices);
918	};
919
920	// Factors common prefixes from alternation.
921	// For example,
922	// ABC\|ABD\|AEF\|BCX\|BCY
923	// simplifies to
924	// A(B(C\|D)\|EF)\|BC(X\|Y)
925	// and thence to
926	// A(B[CD]\|EF)\|BC[XY]
927	//
928	// Rewrites sub to contain simplified list to alternate and returns
929	// the new length of sub. Adjusts reference counts accordingly
930	// (incoming sub[i] decremented, outgoing sub[i] incremented).
931	int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
932	std::vector<Frame> stk;
933	stk.emplace_back(sub, nsub);
934
935	for (;;) {
936	auto& sub = stk.back().sub;
937	auto& nsub = stk.back().nsub;
938	auto& round = stk.back().round;
939	auto& splices = stk.back().splices;
940	auto& spliceidx = stk.back().spliceidx;
941
942	if (splices.empty()) {
943	// Advance to the next round of factoring. Note that this covers
944	// the initialised state: when splices is empty and round is 0.
945	round++;
946	} else if (spliceidx < static_cast<int>(splices.size())) {
947	// We have at least one more Splice to factor. Recurse logically.
948	stk.emplace_back(splices [spliceidx].sub, splices [spliceidx].nsub);
949	continue;
950	} else {
951	// We have no more Splices to factor. Apply them.
952	auto iter = splices.begin();
953	int out = `0`;
954	for (int i = `0`; i < nsub; ) {
955	// Copy until we reach where the next Splice begins.
956	while (sub + i < iter ->sub)
957	sub[out++] = sub[i++];
958	switch (round) {
959	case `1`:
960	case `2`: {
961	// Assemble the Splice prefix and the suffixes.
962	Regexp* re[`2`];
963	re[`0`] = iter ->prefix;
964	re[`1`] = Regexp::AlternateNoFactor(iter ->sub, iter ->nsuffix, flags);
965	sub[out++] = Regexp::Concat(re, `2`, flags);
966	i += iter ->nsub;
967	break;
968	}
969	case `3`:
970	// Just use the Splice prefix.
971	sub[out++] = iter ->prefix;
972	i += iter ->nsub;
973	break;
974	default:
975	LOG(DFATAL) << "unknown round: " << round;
976	break;
977	}
978	// If we are done, copy until the end of sub.
979	if (++iter == splices.end()) {
980	while (i < nsub)
981	sub[out++] = sub[i++];
982	}
983	}
984	splices.clear();
985	nsub = out;
986	// Advance to the next round of factoring.
987	round++;
988	}
989
990	switch (round) {
991	case `1`:
992	FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
993	break;
994	case `2`:
995	FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
996	break;
997	case `3`:
998	FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
999	break;
1000	case `4`:
1001	if (stk.size() == `1`) {
1002	// We are at the top of the stack. Just return.
1003	return nsub;
1004	} else {
1005	// Pop the stack and set the number of suffixes.
1006	// (Note that references will be invalidated!)
1007	int nsuffix = nsub;
1008	stk.pop_back();
1009	stk.back().splices [stk.back().spliceidx].nsuffix = nsuffix;
1010	++stk.back().spliceidx;
1011	continue;
1012	}
1013	default:
1014	LOG(DFATAL) << "unknown round: " << round;
1015	break;
1016	}
1017
1018	// Set spliceidx depending on whether we have Splices to factor.
1019	if (splices.empty() \|\| round == `3`) {
1020	spliceidx = static_cast<int>(splices.size());
1021	} else {
1022	spliceidx = `0`;
1023	}
1024	}
1025	}
1026
1027	void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
1028	Regexp::ParseFlags flags,
1029	std::vector<Splice>* splices) {
1030	// Round 1: Factor out common literal prefixes.
1031	int start = `0`;
1032	Rune* rune = NULL;
1033	int nrune = `0`;
1034	Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
1035	for (int i = `0`; i <= nsub; i++) {
1036	// Invariant: sub[start:i] consists of regexps that all
1037	// begin with rune[0:nrune].
1038	Rune* rune_i = NULL;
1039	int nrune_i = `0`;
1040	Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
1041	if (i < nsub) {
1042	rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);
1043	if (runeflags_i == runeflags) {
1044	int same = `0`;
1045	while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1046	same++;
1047	if (same > `0`) {
1048	// Matches at least one rune in current range. Keep going around.
1049	nrune = same;
1050	continue;
1051	}
1052	}
1053	}
1054
1055	// Found end of a run with common leading literal string:
1056	// sub[start:i] all begin with rune[0:nrune],
1057	// but sub[i] does not even begin with rune[0].
1058	if (i == start) {
1059	// Nothing to do - first iteration.
1060	} else if (i == start+`1`) {
1061	// Just one: don't bother factoring.
1062	} else {
1063	Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags);
1064	for (int j = start; j < i; j++)
1065	Regexp::RemoveLeadingString(sub[j], nrune);
1066	splices->emplace_back(prefix, sub + start, i - start);
1067	}
1068
1069	// Prepare for next iteration (if there is one).
1070	if (i < nsub) {
1071	start = i;
1072	rune = rune_i;
1073	nrune = nrune_i;
1074	runeflags = runeflags_i;
1075	}
1076	}
1077	}
1078
1079	void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
1080	Regexp::ParseFlags flags,
1081	std::vector<Splice>* splices) {
1082	// Round 2: Factor out common simple prefixes,
1083	// just the first piece of each concatenation.
1084	// This will be good enough a lot of the time.
1085	//
1086	// Complex subexpressions (e.g. involving quantifiers)
1087	// are not safe to factor because that collapses their
1088	// distinct paths through the automaton, which affects
1089	// correctness in some cases.
1090	int start = `0`;
1091	Regexp* first = NULL;
1092	for (int i = `0`; i <= nsub; i++) {
1093	// Invariant: sub[start:i] consists of regexps that all
1094	// begin with first.
1095	Regexp* first_i = NULL;
1096	if (i < nsub) {
1097	first_i = Regexp::LeadingRegexp(sub[i]);
1098	if (first != NULL &&
1099	// first must be an empty-width op
1100	// OR a char class, any char or any byte
1101	// OR a fixed repeat of a literal, char class, any char or any byte.
1102	(first->op() == kRegexpBeginLine \|\|
1103	first->op() == kRegexpEndLine \|\|
1104	first->op() == kRegexpWordBoundary \|\|
1105	first->op() == kRegexpNoWordBoundary \|\|
1106	first->op() == kRegexpBeginText \|\|
1107	first->op() == kRegexpEndText \|\|
1108	first->op() == kRegexpCharClass \|\|
1109	first->op() == kRegexpAnyChar \|\|
1110	first->op() == kRegexpAnyByte \|\|
1111	(first->op() == kRegexpRepeat &&
1112	first->min() == first->max() &&
1113	(first->sub()[`0`]->op() == kRegexpLiteral \|\|
1114	first->sub()[`0`]->op() == kRegexpCharClass \|\|
1115	first->sub()[`0`]->op() == kRegexpAnyChar \|\|
1116	first->sub()[`0`]->op() == kRegexpAnyByte))) &&
1117	Regexp::Equal(first, first_i))
1118	continue;
1119	}
1120
1121	// Found end of a run with common leading regexp:
1122	// sub[start:i] all begin with first,
1123	// but sub[i] does not.
1124	if (i == start) {
1125	// Nothing to do - first iteration.
1126	} else if (i == start+`1`) {
1127	// Just one: don't bother factoring.
1128	} else {
1129	Regexp* prefix = first->Incref();
1130	for (int j = start; j < i; j++)
1131	sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);
1132	splices->emplace_back(prefix, sub + start, i - start);
1133	}
1134
1135	// Prepare for next iteration (if there is one).
1136	if (i < nsub) {
1137	start = i;
1138	first = first_i;
1139	}
1140	}
1141	}
1142
1143	void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
1144	Regexp::ParseFlags flags,
1145	std::vector<Splice>* splices) {
1146	// Round 3: Merge runs of literals and/or character classes.
1147	int start = `0`;
1148	Regexp* first = NULL;
1149	for (int i = `0`; i <= nsub; i++) {
1150	// Invariant: sub[start:i] consists of regexps that all
1151	// are either literals (i.e. runes) or character classes.
1152	Regexp* first_i = NULL;
1153	if (i < nsub) {
1154	first_i = sub[i];
1155	if (first != NULL &&
1156	(first->op() == kRegexpLiteral \|\|
1157	first->op() == kRegexpCharClass) &&
1158	(first_i->op() == kRegexpLiteral \|\|
1159	first_i->op() == kRegexpCharClass))
1160	continue;
1161	}
1162
1163	// Found end of a run of Literal/CharClass:
1164	// sub[start:i] all are either one or the other,
1165	// but sub[i] is not.
1166	if (i == start) {
1167	// Nothing to do - first iteration.
1168	} else if (i == start+`1`) {
1169	// Just one: don't bother factoring.
1170	} else {
1171	CharClassBuilder ccb;
1172	for (int j = start; j < i; j++) {
1173	Regexp* re = sub[j];
1174	if (re->op() == kRegexpCharClass) {
1175	CharClass* cc = re->cc();
1176	for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
1177	ccb.AddRange(it->lo, it->hi);
1178	} else if (re->op() == kRegexpLiteral) {
1179	ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1180	} else {
1181	LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
1182	<< re->ToString();
1183	}
1184	re->Decref();
1185	}
1186	Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
1187	splices->emplace_back(re, sub + start, i - start);
1188	}
1189
1190	// Prepare for next iteration (if there is one).
1191	if (i < nsub) {
1192	start = i;
1193	first = first_i;
1194	}
1195	}
1196	}
1197
1198	// Collapse the regexps on top of the stack, down to the
1199	// first marker, into a new op node (op == kRegexpAlternate
1200	// or op == kRegexpConcat).
1201	void Regexp::ParseState::DoCollapse(RegexpOp op) {
1202	// Scan backward to marker, counting children of composite.
1203	int n = `0`;
1204	Regexp* next = NULL;
1205	Regexp* sub;
1206	for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1207	next = sub->down_;
1208	if (sub->op_ == op)
1209	n += sub->nsub_;
1210	else
1211	n++;
1212	}
1213
1214	// If there's just one child, leave it alone.
1215	// (Concat of one thing is that one thing; alternate of one thing is same.)
1216	if (stacktop_ != NULL && stacktop_->down_ == next)
1217	return;
1218
1219	// Construct op (alternation or concatenation), flattening op of op.
1220	PODArray<Regexp*> subs(n);
1221	next = NULL;
1222	int i = n;
1223	for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1224	next = sub->down_;
1225	if (sub->op_ == op) {
1226	Regexp** sub_subs = sub->sub();
1227	for (int k = sub->nsub_ - `1`; k >= `0`; k--)
1228	subs [--i] = sub_subs[k]->Incref();
1229	sub->Decref();
1230	} else {
1231	subs [--i] = FinishRegexp(sub);
1232	}
1233	}
1234
1235	Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true);
1236	re->simple_ = re->ComputeSimple();
1237	re->down_ = next;
1238	stacktop_ = re;
1239	}
1240
1241	// Finishes the current concatenation,
1242	// collapsing it into a single regexp on the stack.
1243	void Regexp::ParseState::DoConcatenation() {
1244	Regexp* r1 = stacktop_;
1245	if (r1 == NULL \|\| IsMarker(r1->op())) {
1246	// empty concatenation is special case
1247	Regexp* re = new Regexp (kRegexpEmptyMatch, flags_);
1248	PushRegexp(re);
1249	}
1250	DoCollapse(kRegexpConcat);
1251	}
1252
1253	// Finishes the current alternation,
1254	// collapsing it to a single regexp on the stack.
1255	void Regexp::ParseState::DoAlternation() {
1256	DoVerticalBar();
1257	// Now stack top is kVerticalBar.
1258	Regexp* r1 = stacktop_;
1259	stacktop_ = r1->down_;
1260	r1->Decref();
1261	DoCollapse(kRegexpAlternate);
1262	}
1263
1264	// Incremental conversion of concatenated literals into strings.
1265	// If top two elements on stack are both literal or string,
1266	// collapse into single string.
1267	// Don't walk down the stack -- the parser calls this frequently
1268	// enough that below the bottom two is known to be collapsed.
1269	// Only called when another regexp is about to be pushed
1270	// on the stack, so that the topmost literal is not being considered.
1271	// (Otherwise ab would turn into (ab).)
1272	// If r >= 0, consider pushing a literal r on the stack.
1273	// Return whether that happened.
1274	bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
1275	Regexp* re1;
1276	Regexp* re2;
1277	if ((re1 = stacktop_) == NULL \|\| (re2 = re1->down_) == NULL)
1278	return false;
1279
1280	if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
1281	return false;
1282	if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
1283	return false;
1284	if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
1285	return false;
1286
1287	if (re2->op_ == kRegexpLiteral) {
1288	// convert into string
1289	Rune rune = re2->rune_;
1290	re2->op_ = kRegexpLiteralString;
1291	re2->nrunes_ = `0`;
1292	re2->runes_ = NULL;
1293	re2->AddRuneToString(rune);
1294	}
1295
1296	// push re1 into re2.
1297	if (re1->op_ == kRegexpLiteral) {
1298	re2->AddRuneToString(re1->rune_);
1299	} else {
1300	for (int i = `0`; i < re1->nrunes_; i++)
1301	re2->AddRuneToString(re1->runes_[i]);
1302	re1->nrunes_ = `0`;
1303	delete[] re1->runes_;
1304	re1->runes_ = NULL;
1305	}
1306
1307	// reuse re1 if possible
1308	if (r >= `0`) {
1309	re1->op_ = kRegexpLiteral;
1310	re1->rune_ = r;
1311	re1->parse_flags_ = static_cast<uint16_t>(flags);
1312	return true;
1313	}
1314
1315	stacktop_ = re2;
1316	re1->Decref();
1317	return false;
1318	}
1319
1320	// Lexing routines.
1321
1322	// Parses a decimal integer, storing it in np.*
1323	// Sets s to span the remainder of the string.*
1324	static bool ParseInteger(absl::string_view* s, int* np) {
1325	if (s->empty() \|\| !isdigit((*s)[`0`] & `0xFF`))
1326	return false;
1327	// Disallow leading zeros.
1328	if (s->size() >= `2` && (s)[`0`] == `'0'` && isdigit((s)[`1`] & `0xFF`))
1329	return false;
1330	int n = `0`;
1331	int c;
1332	while (!s->empty() && isdigit(c = (*s)[`0`] & `0xFF`)) {
1333	// Avoid overflow.
1334	if (n >= `100000000`)
1335	return false;
1336	n = n*`10` + c - `'0'`;
1337	s->remove_prefix(`1`); // digit
1338	}
1339	*np = n;
1340	return true;
1341	}
1342
1343	// Parses a repetition suffix like {1,2} or {2} or {2,}.
1344	// Sets s to span the remainder of the string on success.*
1345	// Sets lo and hi to the given range.
1346	// In the case of {2,}, the high number is unbounded;
1347	// sets hi to -1 to signify this.*
1348	// {,2} is NOT a valid suffix.
1349	// The Maybe in the name signifies that the regexp parse
1350	// doesn't fail even if ParseRepetition does, so the string_view
1351	// s must NOT be edited unless MaybeParseRepetition returns true.
1352	static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) {
1353	absl::string_view s = *sp;
1354	if (s.empty() \|\| s [`0`] != `'{'`)
1355	return false;
1356	s.remove_prefix(`1`); // '{'
1357	if (!ParseInteger(&s, lo))
1358	return false;
1359	if (s.empty())
1360	return false;
1361	if (s [`0`] == `','`) {
1362	s.remove_prefix(`1`); // ','
1363	if (s.empty())
1364	return false;
1365	if (s [`0`] == `'}'`) {
1366	// {2,} means at least 2
1367	*hi = -`1`;
1368	} else {
1369	// {2,4} means 2, 3, or 4.
1370	if (!ParseInteger(&s, hi))
1371	return false;
1372	}
1373	} else {
1374	// {2} means exactly two
1375	hi = lo;
1376	}
1377	if (s.empty() \|\| s [`0`] != `'}'`)
1378	return false;
1379	s.remove_prefix(`1`); // '}'
1380	*sp = s;
1381	return true;
1382	}
1383
1384	// Removes the next Rune from the string_view and stores it in r.*
1385	// Returns number of bytes removed from sp.
1386	// Behaves as though there is a terminating NUL at the end of sp.
1387	// Argument order is backwards from usual Google style
1388	// but consistent with chartorune.
1389	static int StringViewToRune(Rune* r, absl::string_view* sp,
1390	RegexpStatus* status) {
1391	// fullrune() takes int, not size_t. However, it just looks
1392	// at the leading byte and treats any length >= 4 the same.
1393	if (fullrune(sp->data(), static_cast<int>(std::min(size_t{`4`}, sp->size())))) {
1394	int n = chartorune(r, sp->data());
1395	// Some copies of chartorune have a bug that accepts
1396	// encodings of values in (10FFFF, 1FFFFF] as valid.
1397	// Those values break the character class algorithm,
1398	// which assumes Runemax is the largest rune.
1399	if (*r > Runemax) {
1400	n = `1`;
1401	*r = Runeerror;
1402	}
1403	if (!(n == `1` && r == Runeerror)) { // no decoding error*
1404	sp->remove_prefix(n);
1405	return n;
1406	}
1407	}
1408
1409	if (status != NULL) {
1410	status->set_code(kRegexpBadUTF8);
1411	status->set_error_arg(absl::string_view ());
1412	}
1413	return -`1`;
1414	}
1415
1416	// Returns whether name is valid UTF-8.
1417	// If not, sets status to kRegexpBadUTF8.
1418	static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) {
1419	absl::string_view t = s;
1420	Rune r;
1421	while (!t.empty()) {
1422	if (StringViewToRune(&r, &t, status) < `0`)
1423	return false;
1424	}
1425	return true;
1426	}
1427
1428	// Is c a hex digit?
1429	static int IsHex(int c) {
1430	return (`'0'` <= c && c <= `'9'`) \|\|
1431	(`'A'` <= c && c <= `'F'`) \|\|
1432	(`'a'` <= c && c <= `'f'`);
1433	}
1434
1435	// Convert hex digit to value.
1436	static int UnHex(int c) {
1437	if (`'0'` <= c && c <= `'9'`)
1438	return c - `'0'`;
1439	if (`'A'` <= c && c <= `'F'`)
1440	return c - `'A'` + `10`;
1441	if (`'a'` <= c && c <= `'f'`)
1442	return c - `'a'` + `10`;
1443	LOG(DFATAL) << "Bad hex digit " << c;
1444	return `0`;
1445	}
1446
1447	// Parse an escape sequence (e.g., \n, \{).
1448	// Sets s to span the remainder of the string.*
1449	// Sets rp to the named character.*
1450	static bool ParseEscape(absl::string_view* s, Rune* rp,
1451	RegexpStatus* status, int rune_max) {
1452	const char* begin = s->data();
1453	if (s->empty() \|\| (*s)[`0`] != `'\\'`) {
1454	// Should not happen - caller always checks.
1455	status->set_code(kRegexpInternalError);
1456	status->set_error_arg(absl::string_view ());
1457	return false;
1458	}
1459	if (s->size() == `1`) {
1460	status->set_code(kRegexpTrailingBackslash);
1461	status->set_error_arg(absl::string_view ());
1462	return false;
1463	}
1464	Rune c, c1;
1465	s->remove_prefix(`1`); // backslash
1466	if (StringViewToRune(&c, s, status) < `0`)
1467	return false;
1468	int code;
1469	switch (c) {
1470	default:
1471	if (c < Runeself && !isalpha(c) && !isdigit(c)) {
1472	// Escaped non-word characters are always themselves.
1473	// PCRE is not quite so rigorous: it accepts things like
1474	// \q, but we don't. We once rejected \_, but too many
1475	// programs and people insist on using it, so allow \_.
1476	*rp = c;
1477	return true;
1478	}
1479	goto BadEscape;
1480
1481	// Octal escapes.
1482	case `'1'`:
1483	case `'2'`:
1484	case `'3'`:
1485	case `'4'`:
1486	case `'5'`:
1487	case `'6'`:
1488	case `'7'`:
1489	// Single non-zero octal digit is a backreference; not supported.
1490	if (s->empty() \|\| (s)[`0`] < `'0'` \|\| (s)[`0`] > `'7'`)
1491	goto BadEscape;
1492	ABSL_FALLTHROUGH_INTENDED;
1493	case `'0'`:
1494	// consume up to three octal digits; already have one.
1495	code = c - `'0'`;
1496	if (!s->empty() && `'0'` <= (c = (*s)[`0`]) && c <= `'7'`) {
1497	code = code * `8` + c - `'0'`;
1498	s->remove_prefix(`1`); // digit
1499	if (!s->empty()) {
1500	c = (*s)[`0`];
1501	if (`'0'` <= c && c <= `'7'`) {
1502	code = code * `8` + c - `'0'`;
1503	s->remove_prefix(`1`); // digit
1504	}
1505	}
1506	}
1507	if (code > rune_max)
1508	goto BadEscape;
1509	*rp = code;
1510	return true;
1511
1512	// Hexadecimal escapes
1513	case `'x'`:
1514	if (s->empty())
1515	goto BadEscape;
1516	if (StringViewToRune(&c, s, status) < `0`)
1517	return false;
1518	if (c == `'{'`) {
1519	// Any number of digits in braces.
1520	// Update n as we consume the string, so that
1521	// the whole thing gets shown in the error message.
1522	// Perl accepts any text at all; it ignores all text
1523	// after the first non-hex digit. We require only hex digits,
1524	// and at least one.
1525	if (StringViewToRune(&c, s, status) < `0`)
1526	return false;
1527	int nhex = `0`;
1528	code = `0`;
1529	while (IsHex(c)) {
1530	nhex++;
1531	code = code * `16` + UnHex(c);
1532	if (code > rune_max)
1533	goto BadEscape;
1534	if (s->empty())
1535	goto BadEscape;
1536	if (StringViewToRune(&c, s, status) < `0`)
1537	return false;
1538	}
1539	if (c != `'}'` \|\| nhex == `0`)
1540	goto BadEscape;
1541	*rp = code;
1542	return true;
1543	}
1544	// Easy case: two hex digits.
1545	if (s->empty())
1546	goto BadEscape;
1547	if (StringViewToRune(&c1, s, status) < `0`)
1548	return false;
1549	if (!IsHex(c) \|\| !IsHex(c1))
1550	goto BadEscape;
1551	rp = UnHex(c) `16` + UnHex(c1);
1552	return true;
1553
1554	// C escapes.
1555	case `'n'`:
1556	*rp = `'\n'`;
1557	return true;
1558	case `'r'`:
1559	*rp = `'\r'`;
1560	return true;
1561	case `'t'`:
1562	*rp = `'\t'`;
1563	return true;
1564
1565	// Less common C escapes.
1566	case `'a'`:
1567	*rp = `'\a'`;
1568	return true;
1569	case `'f'`:
1570	*rp = `'\f'`;
1571	return true;
1572	case `'v'`:
1573	*rp = `'\v'`;
1574	return true;
1575
1576	// This code is disabled to avoid misparsing
1577	// the Perl word-boundary \b as a backspace
1578	// when in POSIX regexp mode. Surprisingly,
1579	// in Perl, \b means word-boundary but [\b]
1580	// means backspace. We don't support that:
1581	// if you want a backspace embed a literal
1582	// backspace character or use \x08.
1583	//
1584	// case 'b':
1585	// rp = '\b';*
1586	// return true;
1587	}
1588
1589	LOG(DFATAL) << "Not reached in ParseEscape.";
1590
1591	BadEscape:
1592	// Unrecognized escape sequence.
1593	status->set_code(kRegexpBadEscape);
1594	status->set_error_arg(
1595	absl::string_view (begin, static_cast<size_t>(s->data() - begin)));
1596	return false;
1597	}
1598
1599	// Add a range to the character class, but exclude newline if asked.
1600	// Also handle case folding.
1601	void CharClassBuilder::AddRangeFlags(
1602	Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
1603
1604	// Take out \n if the flags say so.
1605	bool cutnl = !(parse_flags & Regexp::ClassNL) \|\|
1606	(parse_flags & Regexp::NeverNL);
1607	if (cutnl && lo <= `'\n'` && `'\n'` <= hi) {
1608	if (lo < `'\n'`)
1609	AddRangeFlags(lo, `'\n'` - `1`, parse_flags);
1610	if (hi > `'\n'`)
1611	AddRangeFlags(`'\n'` + `1`, hi, parse_flags);
1612	return;
1613	}
1614
1615	// If folding case, add fold-equivalent characters too.
1616	if (parse_flags & Regexp::FoldCase)
1617	AddFoldedRange(this, lo, hi, `0`);
1618	else
1619	AddRange(lo, hi);
1620	}
1621
1622	// Look for a group with the given name.
1623	static const UGroup* LookupGroup(absl::string_view name,
1624	const UGroup* groups, int ngroups) {
1625	// Simple name lookup.
1626	for (int i = `0`; i < ngroups; i++)
1627	if (absl::string_view (groups[i].name) == name)
1628	return &groups[i];
1629	return NULL;
1630	}
1631
1632	// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
1633	static const UGroup* LookupPosixGroup(absl::string_view name) {
1634	return LookupGroup(name, posix_groups, num_posix_groups);
1635	}
1636
1637	static const UGroup* LookupPerlGroup(absl::string_view name) {
1638	return LookupGroup(name, perl_groups, num_perl_groups);
1639	}
1640
1641	#if !defined(RE2_USE_ICU)
1642	// Fake UGroup containing all Runes
1643	static URange16 any16[] = { { `0`, `65535` } };
1644	static URange32 any32[] = { { `65536`, Runemax } };
1645	static UGroup anygroup = { "Any", +`1`, any16, `1`, any32, `1` };
1646
1647	// Look for a Unicode group with the given name (e.g., "Han")
1648	static const UGroup* LookupUnicodeGroup(absl::string_view name) {
1649	// Special case: "Any" means any.
1650	if (name == absl::string_view ("Any"))
1651	return &anygroup;
1652	return LookupGroup(name, unicode_groups, num_unicode_groups);
1653	}
1654	#endif
1655
1656	// Add a UGroup or its negation to the character class.
1657	static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign,
1658	Regexp::ParseFlags parse_flags) {
1659	if (sign == +`1`) {
1660	for (int i = `0`; i < g->nr16; i++) {
1661	cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
1662	}
1663	for (int i = `0`; i < g->nr32; i++) {
1664	cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
1665	}
1666	} else {
1667	if (parse_flags & Regexp::FoldCase) {
1668	// Normally adding a case-folded group means
1669	// adding all the extra fold-equivalent runes too.
1670	// But if we're adding the negation of the group,
1671	// we have to exclude all the runes that are fold-equivalent
1672	// to what's already missing. Too hard, so do in two steps.
1673	CharClassBuilder ccb1;
1674	AddUGroup(&ccb1, g, +`1`, parse_flags);
1675	// If the flags say to take out \n, put it in, so that negating will take it out.
1676	// Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
1677	bool cutnl = !(parse_flags & Regexp::ClassNL) \|\|
1678	(parse_flags & Regexp::NeverNL);
1679	if (cutnl) {
1680	ccb1.AddRange(`'\n'`, `'\n'`);
1681	}
1682	ccb1.Negate();
1683	cc->AddCharClass(&ccb1);
1684	return;
1685	}
1686	int next = `0`;
1687	for (int i = `0`; i < g->nr16; i++) {
1688	if (next < g->r16[i].lo)
1689	cc->AddRangeFlags(next, g->r16[i].lo - `1`, parse_flags);
1690	next = g->r16[i].hi + `1`;
1691	}
1692	for (int i = `0`; i < g->nr32; i++) {
1693	if (next < g->r32[i].lo)
1694	cc->AddRangeFlags(next, g->r32[i].lo - `1`, parse_flags);
1695	next = g->r32[i].hi + `1`;
1696	}
1697	if (next <= Runemax)
1698	cc->AddRangeFlags(next, Runemax, parse_flags);
1699	}
1700	}
1701
1702	// Maybe parse a Perl character class escape sequence.
1703	// Only recognizes the Perl character classes (\d \s \w \D \S \W),
1704	// not the Perl empty-string classes (\b \B \A \Z \z).
1705	// On success, sets s to span the remainder of the string*
1706	// and returns the corresponding UGroup.
1707	// The string_view must NOT* be edited unless the call succeeds.*
1708	const UGroup* MaybeParsePerlCCEscape(absl::string_view* s,
1709	Regexp::ParseFlags parse_flags) {
1710	if (!(parse_flags & Regexp::PerlClasses))
1711	return NULL;
1712	if (s->size() < `2` \|\| (*s)[`0`] != `'\\'`)
1713	return NULL;
1714	// Could use StringViewToRune, but there aren't
1715	// any non-ASCII Perl group names.
1716	absl::string_view name(s->data(), `2`);
1717	const UGroup* g = LookupPerlGroup(name);
1718	if (g == NULL)
1719	return NULL;
1720	s->remove_prefix(name.size());
1721	return g;
1722	}
1723
1724	enum ParseStatus {
1725	kParseOk, // Did some parsing.
1726	kParseError, // Found an error.
1727	kParseNothing, // Decided not to parse.
1728	};
1729
1730	// Maybe parses a Unicode character group like \p{Han} or \P{Han}
1731	// (the latter is a negated group).
1732	ParseStatus ParseUnicodeGroup(absl::string_view* s,
1733	Regexp::ParseFlags parse_flags,
1734	CharClassBuilder* cc, RegexpStatus* status) {
1735	// Decide whether to parse.
1736	if (!(parse_flags & Regexp::UnicodeGroups))
1737	return kParseNothing;
1738	if (s->size() < `2` \|\| (*s)[`0`] != `'\\'`)
1739	return kParseNothing;
1740	Rune c = (*s)[`1`];
1741	if (c != `'p'` && c != `'P'`)
1742	return kParseNothing;
1743
1744	// Committed to parse. Results:
1745	int sign = +`1`; // -1 = negated char class
1746	if (c == `'P'`)
1747	sign = -sign;
1748	absl::string_view seq = s; // \p{Han} or \pL*
1749	absl::string_view name; // Han or L
1750	s->remove_prefix(`2`); // '\\', 'p'
1751
1752	if (!StringViewToRune(&c, s, status))
1753	return kParseError;
1754	if (c != `'{'`) {
1755	// Name is the bit of string we just skipped over for c.
1756	const char* p = seq.data() + `2`;
1757	name = absl::string_view (p, static_cast<size_t>(s->data() - p));
1758	} else {
1759	// Name is in braces. Look for closing }
1760	size_t end = s->find(`'}'`, `0`);
1761	if (end == absl::string_view::npos) {
1762	if (!IsValidUTF8(seq, status))
1763	return kParseError;
1764	status->set_code(kRegexpBadCharRange);
1765	status->set_error_arg(seq);
1766	return kParseError;
1767	}
1768	name = absl::string_view (s->data(), end); // without '}'
1769	s->remove_prefix(end + `1`); // with '}'
1770	if (!IsValidUTF8(name, status))
1771	return kParseError;
1772	}
1773
1774	// Chop seq where s now begins.
1775	seq = absl::string_view (seq.data(), static_cast<size_t>(s->data() - seq.data()));
1776
1777	if (!name.empty() && name [`0`] == `'^'`) {
1778	sign = -sign;
1779	name.remove_prefix(`1`); // '^'
1780	}
1781
1782	#if !defined(RE2_USE_ICU)
1783	// Look up the group in the RE2 Unicode data.
1784	const UGroup* g = LookupUnicodeGroup(name);
1785	if (g == NULL) {
1786	status->set_code(kRegexpBadCharRange);
1787	status->set_error_arg(seq);
1788	return kParseError;
1789	}
1790
1791	AddUGroup(cc, g, sign, parse_flags);
1792	#else
1793	// Look up the group in the ICU Unicode data. Because ICU provides full
1794	// Unicode properties support, this could be more than a lookup by name.
1795	::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
1796	std::string("\\p{") + std::string(name) + std::string("}"));
1797	UErrorCode uerr = U_ZERO_ERROR;
1798	::icu::UnicodeSet uset(ustr, uerr);
1799	if (U_FAILURE(uerr)) {
1800	status->set_code(kRegexpBadCharRange);
1801	status->set_error_arg(seq);
1802	return kParseError;
1803	}
1804
1805	// Convert the UnicodeSet to a URange32 and UGroup that we can add.
1806	int nr = uset.getRangeCount();
1807	PODArray<URange32> r(nr);
1808	for (int i = `0`; i < nr; i++) {
1809	r[i].lo = uset.getRangeStart(i);
1810	r[i].hi = uset.getRangeEnd(i);
1811	}
1812	UGroup g = {"", +`1`, `0`, `0`, r.data(), nr};
1813	AddUGroup(cc, &g, sign, parse_flags);
1814	#endif
1815
1816	return kParseOk;
1817	}
1818
1819	// Parses a character class name like [:alnum:].
1820	// Sets s to span the remainder of the string.*
1821	// Adds the ranges corresponding to the class to ranges.
1822	static ParseStatus ParseCCName(absl::string_view* s,
1823	Regexp::ParseFlags parse_flags,
1824	CharClassBuilder* cc, RegexpStatus* status) {
1825	// Check begins with [:
1826	const char* p = s->data();
1827	const char* ep = s->data() + s->size();
1828	if (ep - p < `2` \|\| p[`0`] != `'['` \|\| p[`1`] != `':'`)
1829	return kParseNothing;
1830
1831	// Look for closing :].
1832	const char* q;
1833	for (q = p+`2`; q <= ep-`2` && (q != `':'` \|\| (q+`1`) != `']'`); q++)
1834	;
1835
1836	// If no closing :], then ignore.
1837	if (q > ep-`2`)
1838	return kParseNothing;
1839
1840	// Got it. Check that it's valid.
1841	q += `2`;
1842	absl::string_view name(p, static_cast<size_t>(q - p));
1843
1844	const UGroup* g = LookupPosixGroup(name);
1845	if (g == NULL) {
1846	status->set_code(kRegexpBadCharRange);
1847	status->set_error_arg(name);
1848	return kParseError;
1849	}
1850
1851	s->remove_prefix(name.size());
1852	AddUGroup(cc, g, g->sign, parse_flags);
1853	return kParseOk;
1854	}
1855
1856	// Parses a character inside a character class.
1857	// There are fewer special characters here than in the rest of the regexp.
1858	// Sets s to span the remainder of the string.*
1859	// Sets rp to the character.*
1860	bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp,
1861	absl::string_view whole_class,
1862	RegexpStatus* status) {
1863	if (s->empty()) {
1864	status->set_code(kRegexpMissingBracket);
1865	status->set_error_arg(whole_class);
1866	return false;
1867	}
1868
1869	// Allow regular escape sequences even though
1870	// many need not be escaped in this context.
1871	if ((*s)[`0`] == `'\\'`)
1872	return ParseEscape(s, rp, status, rune_max_);
1873
1874	// Otherwise take the next rune.
1875	return StringViewToRune(rp, s, status) >= `0`;
1876	}
1877
1878	// Parses a character class character, or, if the character
1879	// is followed by a hyphen, parses a character class range.
1880	// For single characters, rr->lo == rr->hi.
1881	// Sets s to span the remainder of the string.*
1882	// Sets rp to the character.*
1883	bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr,
1884	absl::string_view whole_class,
1885	RegexpStatus* status) {
1886	absl::string_view os = *s;
1887	if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
1888	return false;
1889	// [a-] means (a\|-), so check for final ].
1890	if (s->size() >= `2` && (s)[`0`] == `'-'` && (s)[`1`] != `']'`) {
1891	s->remove_prefix(`1`); // '-'
1892	if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
1893	return false;
1894	if (rr->hi < rr->lo) {
1895	status->set_code(kRegexpBadCharRange);
1896	status->set_error_arg(absl::string_view (
1897	os.data(), static_cast<size_t>(s->data() - os.data())));
1898	return false;
1899	}
1900	} else {
1901	rr->hi = rr->lo;
1902	}
1903	return true;
1904	}
1905
1906	// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1907	// Sets s to span the remainder of the string.*
1908	// Sets out_re to the regexp for the class.*
1909	bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re,
1910	RegexpStatus* status) {
1911	absl::string_view whole_class = *s;
1912	if (s->empty() \|\| (*s)[`0`] != `'['`) {
1913	// Caller checked this.
1914	status->set_code(kRegexpInternalError);
1915	status->set_error_arg(absl::string_view ());
1916	return false;
1917	}
1918	bool negated = false;
1919	Regexp* re = new Regexp (kRegexpCharClass, flags_ & ~FoldCase);
1920	re->ccb_ = new CharClassBuilder;
1921	s->remove_prefix(`1`); // '['
1922	if (!s->empty() && (*s)[`0`] == `'^'`) {
1923	s->remove_prefix(`1`); // '^'
1924	negated = true;
1925	if (!(flags_ & ClassNL) \|\| (flags_ & NeverNL)) {
1926	// If NL can't match implicitly, then pretend
1927	// negated classes include a leading \n.
1928	re->ccb_->AddRange(`'\n'`, `'\n'`);
1929	}
1930	}
1931	bool first = true; // ] is okay as first char in class
1932	while (!s->empty() && ((*s)[`0`] != `']'` \|\| first)) {
1933	// - is only okay unescaped as first or last in class.
1934	// Except that Perl allows - anywhere.
1935	if ((*s)[`0`] == `'-'` && !first && !(flags_&PerlX) &&
1936	(s->size() == `1` \|\| (*s)[`1`] != `']'`)) {
1937	absl::string_view t = *s;
1938	t.remove_prefix(`1`); // '-'
1939	Rune r;
1940	int n = StringViewToRune(&r, &t, status);
1941	if (n < `0`) {
1942	re->Decref();
1943	return false;
1944	}
1945	status->set_code(kRegexpBadCharRange);
1946	status->set_error_arg(absl::string_view (s->data(), `1`+n));
1947	re->Decref();
1948	return false;
1949	}
1950	first = false;
1951
1952	// Look for [:alnum:] etc.
1953	if (s->size() > `2` && (s)[`0`] == `'['` && (s)[`1`] == `':'`) {
1954	switch (ParseCCName(s, flags_, re->ccb_, status)) {
1955	case kParseOk:
1956	continue;
1957	case kParseError:
1958	re->Decref();
1959	return false;
1960	case kParseNothing:
1961	break;
1962	}
1963	}
1964
1965	// Look for Unicode character group like \p{Han}
1966	if (s->size() > `2` &&
1967	(*s)[`0`] == `'\\'` &&
1968	((s)[`1`] == `'p'` \|\| (s)[`1`] == `'P'`)) {
1969	switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
1970	case kParseOk:
1971	continue;
1972	case kParseError:
1973	re->Decref();
1974	return false;
1975	case kParseNothing:
1976	break;
1977	}
1978	}
1979
1980	// Look for Perl character class symbols (extension).
1981	const UGroup* g = MaybeParsePerlCCEscape(s, flags_);
1982	if (g != NULL) {
1983	AddUGroup(re->ccb_, g, g->sign, flags_);
1984	continue;
1985	}
1986
1987	// Otherwise assume single character or simple range.
1988	RuneRange rr;
1989	if (!ParseCCRange(s, &rr, whole_class, status)) {
1990	re->Decref();
1991	return false;
1992	}
1993	// AddRangeFlags is usually called in response to a class like
1994	// \p{Foo} or [[:foo:]]; for those, it filters \n out unless
1995	// Regexp::ClassNL is set. In an explicit range or singleton
1996	// like we just parsed, we do not filter \n out, so set ClassNL
1997	// in the flags.
1998	re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ \| Regexp::ClassNL);
1999	}
2000	if (s->empty()) {
2001	status->set_code(kRegexpMissingBracket);
2002	status->set_error_arg(whole_class);
2003	re->Decref();
2004	return false;
2005	}
2006	s->remove_prefix(`1`); // ']'
2007
2008	if (negated)
2009	re->ccb_->Negate();
2010
2011	*out_re = re;
2012	return true;
2013	}
2014
2015	// Returns whether name is a valid capture name.
2016	static bool IsValidCaptureName(absl::string_view name) {
2017	if (name.empty())
2018	return false;
2019
2020	// Historically, we effectively used [0-9A-Za-z_]+ to validate; that
2021	// followed Python 2 except for not restricting the first character.
2022	// As of Python 3, Unicode characters beyond ASCII are also allowed;
2023	// accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and
2024	// Pc categories, but again without restricting the first character.
2025	// Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3
2026	// performs it for identifiers, but seemingly not for capture names;
2027	// if they start doing that for capture names, we won't follow suit.
2028	static const CharClass* const cc = []() {
2029	CharClassBuilder ccb;
2030	for (absl::string_view group :
2031	{"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"})
2032	AddUGroup(&ccb, LookupGroup(group, unicode_groups, num_unicode_groups),
2033	+`1`, Regexp::NoParseFlags);
2034	return ccb.GetCharClass();
2035	}();
2036
2037	absl::string_view t = name;
2038	Rune r;
2039	while (!t.empty()) {
2040	if (StringViewToRune(&r, &t, NULL) < `0`)
2041	return false;
2042	if (cc->Contains(r))
2043	continue;
2044	return false;
2045	}
2046	return true;
2047	}
2048
2049	// Parses a Perl flag setting or non-capturing group or both,
2050	// like (?i) or (?: or (?i:. Removes from s, updates parse state.
2051	// The caller must check that s begins with "(?".
2052	// Returns true on success. If the Perl flag is not
2053	// well-formed or not supported, sets status_ and returns false.
2054	bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
2055	absl::string_view t = *s;
2056
2057	// Caller is supposed to check this.
2058	if (!(flags_ & PerlX) \|\| t.size() < `2` \|\| t [`0`] != `'('` \|\| t [`1`] != `'?'`) {
2059	LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
2060	status_->set_code(kRegexpInternalError);
2061	return false;
2062	}
2063
2064	t.remove_prefix(`2`); // "(?"
2065
2066	// Check for named captures, first introduced in Python's regexp library.
2067	// As usual, there are three slightly different syntaxes:
2068	//
2069	// (?P<name>expr) the original, introduced by Python
2070	// (?<name>expr) the .NET alteration, adopted by Perl 5.10
2071	// (?'name'expr) another .NET alteration, adopted by Perl 5.10
2072	//
2073	// Perl 5.10 gave in and implemented the Python version too,
2074	// but they claim that the last two are the preferred forms.
2075	// PCRE and languages based on it (specifically, PHP and Ruby)
2076	// support all three as well. EcmaScript 4 uses only the Python form.
2077	//
2078	// In both the open source world (via Code Search) and the
2079	// Google source tree, (?P<expr>name) is the dominant form,
2080	// so that's the one we implement. One is enough.
2081	if (t.size() > `2` && t [`0`] == `'P'` && t [`1`] == `'<'`) {
2082	// Pull out name.
2083	size_t end = t.find(`'>'`, `2`);
2084	if (end == absl::string_view::npos) {
2085	if (!IsValidUTF8(*s, status_))
2086	return false;
2087	status_->set_code(kRegexpBadNamedCapture);
2088	status_->set_error_arg(*s);
2089	return false;
2090	}
2091
2092	// t is "P<name>...", t[end] == '>'
2093	absl::string_view capture(t.data()-`2`, end+`3`); // "(?P<name>"
2094	absl::string_view name(t.data()+`2`, end-`2`); // "name"
2095	if (!IsValidUTF8(name, status_))
2096	return false;
2097	if (!IsValidCaptureName(name)) {
2098	status_->set_code(kRegexpBadNamedCapture);
2099	status_->set_error_arg(capture);
2100	return false;
2101	}
2102
2103	if (!DoLeftParen(name)) {
2104	// DoLeftParen's failure set status_.
2105	return false;
2106	}
2107
2108	s->remove_prefix(
2109	static_cast<size_t>(capture.data() + capture.size() - s->data()));
2110	return true;
2111	}
2112
2113	bool negated = false;
2114	bool sawflags = false;
2115	int nflags = flags_;
2116	Rune c;
2117	for (bool done = false; !done; ) {
2118	if (t.empty())
2119	goto BadPerlOp;
2120	if (StringViewToRune(&c, &t, status_) < `0`)
2121	return false;
2122	switch (c) {
2123	default:
2124	goto BadPerlOp;
2125
2126	// Parse flags.
2127	case `'i'`:
2128	sawflags = true;
2129	if (negated)
2130	nflags &= ~FoldCase;
2131	else
2132	nflags \|= FoldCase;
2133	break;
2134
2135	case `'m'`: // opposite of our OneLine
2136	sawflags = true;
2137	if (negated)
2138	nflags \|= OneLine;
2139	else
2140	nflags &= ~OneLine;
2141	break;
2142
2143	case `'s'`:
2144	sawflags = true;
2145	if (negated)
2146	nflags &= ~DotNL;
2147	else
2148	nflags \|= DotNL;
2149	break;
2150
2151	case `'U'`:
2152	sawflags = true;
2153	if (negated)
2154	nflags &= ~NonGreedy;
2155	else
2156	nflags \|= NonGreedy;
2157	break;
2158
2159	// Negation
2160	case `'-'`:
2161	if (negated)
2162	goto BadPerlOp;
2163	negated = true;
2164	sawflags = false;
2165	break;
2166
2167	// Open new group.
2168	case `':'`:
2169	if (!DoLeftParenNoCapture()) {
2170	// DoLeftParenNoCapture's failure set status_.
2171	return false;
2172	}
2173	done = true;
2174	break;
2175
2176	// Finish flags.
2177	case `')'`:
2178	done = true;
2179	break;
2180	}
2181	}
2182
2183	if (negated && !sawflags)
2184	goto BadPerlOp;
2185
2186	flags_ = static_cast<Regexp::ParseFlags>(nflags);
2187	*s = t;
2188	return true;
2189
2190	BadPerlOp:
2191	status_->set_code(kRegexpBadPerlOp);
2192	status_->set_error_arg(
2193	absl::string_view (s->data(), static_cast<size_t>(t.data() - s->data())));
2194	return false;
2195	}
2196
2197	// Converts latin1 (assumed to be encoded as Latin1 bytes)
2198	// into UTF8 encoding in string.
2199	// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2200	// deprecated and because it rejects code points 0x80-0x9F.
2201	void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) {
2202	char buf[UTFmax];
2203
2204	utf->clear();
2205	for (size_t i = `0`; i < latin1.size(); i++) {
2206	Rune r = latin1 [i] & `0xFF`;
2207	int n = runetochar(buf, &r);
2208	utf->append(buf, n);
2209	}
2210	}
2211
2212	// Parses the regular expression given by s,
2213	// returning the corresponding Regexp tree.
2214	// The caller must Decref the return value when done with it.
2215	// Returns NULL on error.
2216	Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags,
2217	RegexpStatus* status) {
2218	// Make status non-NULL (easier on everyone else).
2219	RegexpStatus xstatus;
2220	if (status == NULL)
2221	status = &xstatus;
2222
2223	ParseState ps(global_flags, s, status);
2224	absl::string_view t = s;
2225
2226	// Convert regexp to UTF-8 (easier on the rest of the parser).
2227	if (global_flags & Latin1) {
2228	std::string* tmp = new std::string;
2229	ConvertLatin1ToUTF8(t, tmp);
2230	status->set_tmp(tmp);
2231	t = *tmp;
2232	}
2233
2234	if (global_flags & Literal) {
2235	// Special parse loop for literal string.
2236	while (!t.empty()) {
2237	Rune r;
2238	if (StringViewToRune(&r, &t, status) < `0`)
2239	return NULL;
2240	if (!ps.PushLiteral(r))
2241	return NULL;
2242	}
2243	return ps.DoFinish();
2244	}
2245
2246	absl::string_view lastunary = absl::string_view ();
2247	while (!t.empty()) {
2248	absl::string_view isunary = absl::string_view ();
2249	switch (t [`0`]) {
2250	default: {
2251	Rune r;
2252	if (StringViewToRune(&r, &t, status) < `0`)
2253	return NULL;
2254	if (!ps.PushLiteral(r))
2255	return NULL;
2256	break;
2257	}
2258
2259	case `'('`:
2260	// "(?" introduces Perl escape.
2261	if ((ps.flags() & PerlX) && (t.size() >= `2` && t [`1`] == `'?'`)) {
2262	// Flag changes and non-capturing groups.
2263	if (!ps.ParsePerlFlags(&t))
2264	return NULL;
2265	break;
2266	}
2267	if (ps.flags() & NeverCapture) {
2268	if (!ps.DoLeftParenNoCapture())
2269	return NULL;
2270	} else {
2271	if (!ps.DoLeftParen(absl::string_view ()))
2272	return NULL;
2273	}
2274	t.remove_prefix(`1`); // '('
2275	break;
2276
2277	case `'\|'`:
2278	if (!ps.DoVerticalBar())
2279	return NULL;
2280	t.remove_prefix(`1`); // '\|'
2281	break;
2282
2283	case `')'`:
2284	if (!ps.DoRightParen())
2285	return NULL;
2286	t.remove_prefix(`1`); // ')'
2287	break;
2288
2289	case `'^'`: // Beginning of line.
2290	if (!ps.PushCaret())
2291	return NULL;
2292	t.remove_prefix(`1`); // '^'
2293	break;
2294
2295	case `'$'`: // End of line.
2296	if (!ps.PushDollar())
2297	return NULL;
2298	t.remove_prefix(`1`); // '$'
2299	break;
2300
2301	case `'.'`: // Any character (possibly except newline).
2302	if (!ps.PushDot())
2303	return NULL;
2304	t.remove_prefix(`1`); // '.'
2305	break;
2306
2307	case `'['`: { // Character class.
2308	Regexp* re;
2309	if (!ps.ParseCharClass(&t, &re, status))
2310	return NULL;
2311	if (!ps.PushRegexp(re))
2312	return NULL;
2313	break;
2314	}
2315
2316	case `''`: { // Zero or more.*
2317	RegexpOp op;
2318	op = kRegexpStar;
2319	goto Rep;
2320	case `'+'`: // One or more.
2321	op = kRegexpPlus;
2322	goto Rep;
2323	case `'?'`: // Zero or one.
2324	op = kRegexpQuest;
2325	goto Rep;
2326	Rep:
2327	absl::string_view opstr = t;
2328	bool nongreedy = false;
2329	t.remove_prefix(`1`); // '' or '+' or '?'*
2330	if (ps.flags() & PerlX) {
2331	if (!t.empty() && t [`0`] == `'?'`) {
2332	nongreedy = true;
2333	t.remove_prefix(`1`); // '?'
2334	}
2335	if (!lastunary.empty()) {
2336	// In Perl it is not allowed to stack repetition operators:
2337	// a* is a syntax error, not a double-star.*
2338	// (and a++ means something else entirely, which we don't support!)
2339	status->set_code(kRegexpRepeatOp);
2340	status->set_error_arg(absl::string_view (
2341	lastunary.data(),
2342	static_cast<size_t>(t.data() - lastunary.data())));
2343	return NULL;
2344	}
2345	}
2346	opstr = absl::string_view (opstr.data(),
2347	static_cast<size_t>(t.data() - opstr.data()));
2348	if (!ps.PushRepeatOp(op, opstr, nongreedy))
2349	return NULL;
2350	isunary = opstr;
2351	break;
2352	}
2353
2354	case `'{'`: { // Counted repetition.
2355	int lo, hi;
2356	absl::string_view opstr = t;
2357	if (!MaybeParseRepetition(&t, &lo, &hi)) {
2358	// Treat like a literal.
2359	if (!ps.PushLiteral(`'{'`))
2360	return NULL;
2361	t.remove_prefix(`1`); // '{'
2362	break;
2363	}
2364	bool nongreedy = false;
2365	if (ps.flags() & PerlX) {
2366	if (!t.empty() && t [`0`] == `'?'`) {
2367	nongreedy = true;
2368	t.remove_prefix(`1`); // '?'
2369	}
2370	if (!lastunary.empty()) {
2371	// Not allowed to stack repetition operators.
2372	status->set_code(kRegexpRepeatOp);
2373	status->set_error_arg(absl::string_view (
2374	lastunary.data(),
2375	static_cast<size_t>(t.data() - lastunary.data())));
2376	return NULL;
2377	}
2378	}
2379	opstr = absl::string_view (opstr.data(),
2380	static_cast<size_t>(t.data() - opstr.data()));
2381	if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
2382	return NULL;
2383	isunary = opstr;
2384	break;
2385	}
2386
2387	case `'\\'`: { // Escaped character or Perl sequence.
2388	// \b and \B: word boundary or not
2389	if ((ps.flags() & Regexp::PerlB) &&
2390	t.size() >= `2` && (t [`1`] == `'b'` \|\| t [`1`] == `'B'`)) {
2391	if (!ps.PushWordBoundary(t [`1`] == `'b'`))
2392	return NULL;
2393	t.remove_prefix(`2`); // '\\', 'b'
2394	break;
2395	}
2396
2397	if ((ps.flags() & Regexp::PerlX) && t.size() >= `2`) {
2398	if (t [`1`] == `'A'`) {
2399	if (!ps.PushSimpleOp(kRegexpBeginText))
2400	return NULL;
2401	t.remove_prefix(`2`); // '\\', 'A'
2402	break;
2403	}
2404	if (t [`1`] == `'z'`) {
2405	if (!ps.PushSimpleOp(kRegexpEndText))
2406	return NULL;
2407	t.remove_prefix(`2`); // '\\', 'z'
2408	break;
2409	}
2410	// Do not recognize \Z, because this library can't
2411	// implement the exact Perl/PCRE semantics.
2412	// (This library treats "(?-m)$" as \z, even though
2413	// in Perl and PCRE it is equivalent to \Z.)
2414
2415	if (t [`1`] == `'C'`) { // \C: any byte [sic]
2416	if (!ps.PushSimpleOp(kRegexpAnyByte))
2417	return NULL;
2418	t.remove_prefix(`2`); // '\\', 'C'
2419	break;
2420	}
2421
2422	if (t [`1`] == `'Q'`) { // \Q ... \E: the ... is always literals
2423	t.remove_prefix(`2`); // '\\', 'Q'
2424	while (!t.empty()) {
2425	if (t.size() >= `2` && t [`0`] == `'\\'` && t [`1`] == `'E'`) {
2426	t.remove_prefix(`2`); // '\\', 'E'
2427	break;
2428	}
2429	Rune r;
2430	if (StringViewToRune(&r, &t, status) < `0`)
2431	return NULL;
2432	if (!ps.PushLiteral(r))
2433	return NULL;
2434	}
2435	break;
2436	}
2437	}
2438
2439	if (t.size() >= `2` && (t [`1`] == `'p'` \|\| t [`1`] == `'P'`)) {
2440	Regexp* re = new Regexp (kRegexpCharClass, ps.flags() & ~FoldCase);
2441	re->ccb_ = new CharClassBuilder;
2442	switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
2443	case kParseOk:
2444	if (!ps.PushRegexp(re))
2445	return NULL;
2446	goto Break2;
2447	case kParseError:
2448	re->Decref();
2449	return NULL;
2450	case kParseNothing:
2451	re->Decref();
2452	break;
2453	}
2454	}
2455
2456	const UGroup* g = MaybeParsePerlCCEscape(&t, ps.flags());
2457	if (g != NULL) {
2458	Regexp* re = new Regexp (kRegexpCharClass, ps.flags() & ~FoldCase);
2459	re->ccb_ = new CharClassBuilder;
2460	AddUGroup(re->ccb_, g, g->sign, ps.flags());
2461	if (!ps.PushRegexp(re))
2462	return NULL;
2463	break;
2464	}
2465
2466	Rune r;
2467	if (!ParseEscape(&t, &r, status, ps.rune_max()))
2468	return NULL;
2469	if (!ps.PushLiteral(r))
2470	return NULL;
2471	break;
2472	}
2473	}
2474	Break2:
2475	lastunary = isunary;
2476	}
2477	return ps.DoFinish();
2478	}
2479
2480	} // namespace re2
2481

Browse the source code of tensorflow/external/com_googlesource_code_re2/re2/parse.cc