simplify.cc source code [tensorflow/external/com_googlesource_code_re2/re2/simplify.cc]

1	// Copyright 2006 The RE2 Authors. All Rights Reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// Rewrite POSIX and other features in re
6	// to use simple extended regular expression features.
7	// Also sort and simplify character classes.
8
9	#include <string>
10
11	#include "util/logging.h"
12	#include "util/utf.h"
13	#include "re2/pod_array.h"
14	#include "re2/regexp.h"
15	#include "re2/walker-inl.h"
16
17	namespace re2 {
18
19	// Parses the regexp src and then simplifies it and sets dst to the*
20	// string representation of the simplified form. Returns true on success.
21	// Returns false and sets error (if error != NULL) on error.*
22	bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags,
23	std::string* dst, RegexpStatus* status) {
24	Regexp* re = Parse(src, flags, status);
25	if (re == NULL)
26	return false;
27	Regexp* sre = re->Simplify();
28	re->Decref();
29	if (sre == NULL) {
30	if (status) {
31	status->set_code(kRegexpInternalError);
32	status->set_error_arg(src);
33	}
34	return false;
35	}
36	*dst = sre->ToString();
37	sre->Decref();
38	return true;
39	}
40
41	// Assuming the simple_ flags on the children are accurate,
42	// is this Regexp simple?*
43	bool Regexp::ComputeSimple() {
44	Regexp** subs;
45	switch (op_) {
46	case kRegexpNoMatch:
47	case kRegexpEmptyMatch:
48	case kRegexpLiteral:
49	case kRegexpLiteralString:
50	case kRegexpBeginLine:
51	case kRegexpEndLine:
52	case kRegexpBeginText:
53	case kRegexpWordBoundary:
54	case kRegexpNoWordBoundary:
55	case kRegexpEndText:
56	case kRegexpAnyChar:
57	case kRegexpAnyByte:
58	case kRegexpHaveMatch:
59	return true;
60	case kRegexpConcat:
61	case kRegexpAlternate:
62	// These are simple as long as the subpieces are simple.
63	subs = sub();
64	for (int i = `0`; i < nsub_; i++)
65	if (!subs[i]->simple())
66	return false;
67	return true;
68	case kRegexpCharClass:
69	// Simple as long as the char class is not empty, not full.
70	if (ccb_ != NULL)
71	return !ccb_->empty() && !ccb_->full();
72	return !cc_->empty() && !cc_->full();
73	case kRegexpCapture:
74	subs = sub();
75	return subs[`0`]->simple();
76	case kRegexpStar:
77	case kRegexpPlus:
78	case kRegexpQuest:
79	subs = sub();
80	if (!subs[`0`]->simple())
81	return false;
82	switch (subs[`0`]->op_) {
83	case kRegexpStar:
84	case kRegexpPlus:
85	case kRegexpQuest:
86	case kRegexpEmptyMatch:
87	case kRegexpNoMatch:
88	return false;
89	default:
90	break;
91	}
92	return true;
93	case kRegexpRepeat:
94	return false;
95	}
96	LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
97	return false;
98	}
99
100	// Walker subclass used by Simplify.
101	// Coalesces runs of star/plus/quest/repeat of the same literal along with any
102	// occurrences of that literal into repeats of that literal. It also works for
103	// char classes, any char and any byte.
104	// PostVisit creates the coalesced result, which should then be simplified.
105	class CoalesceWalker : public Regexp::Walker<Regexp*> {
106	public:
107	CoalesceWalker() {}
108	virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
109	Regexp** child_args, int nchild_args);
110	virtual Regexp* Copy(Regexp* re);
111	virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
112
113	private:
114	// These functions are declared inside CoalesceWalker so that
115	// they can edit the private fields of the Regexps they construct.
116
117	// Returns true if r1 and r2 can be coalesced. In particular, ensures that
118	// the parse flags are consistent. (They will not be checked again later.)
119	static bool CanCoalesce(Regexp* r1, Regexp* r2);
120
121	// Coalesces r1ptr and r2ptr. In most cases, the array elements afterwards
122	// will be empty match and the coalesced op. In other cases, where part of a
123	// literal string was removed to be coalesced, the array elements afterwards
124	// will be the coalesced op and the remainder of the literal string.
125	static void DoCoalesce(Regexp r1ptr, Regexp r2ptr);
126
127	CoalesceWalker(const CoalesceWalker&) = delete;
128	CoalesceWalker& operator=(const CoalesceWalker&) = delete;
129	};
130
131	// Walker subclass used by Simplify.
132	// The simplify walk is purely post-recursive: given the simplified children,
133	// PostVisit creates the simplified result.
134	// The child_args are simplified Regexps.*
135	class SimplifyWalker : public Regexp::Walker<Regexp*> {
136	public:
137	SimplifyWalker() {}
138	virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
139	virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
140	Regexp** child_args, int nchild_args);
141	virtual Regexp* Copy(Regexp* re);
142	virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
143
144	private:
145	// These functions are declared inside SimplifyWalker so that
146	// they can edit the private fields of the Regexps they construct.
147
148	// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
149	// Caller must Decref return value when done with it.
150	static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
151
152	// Simplifies the expression re{min,max} in terms of , +, and ?.*
153	// Returns a new regexp. Does not edit re. Does not consume reference to re.
154	// Caller must Decref return value when done with it.
155	static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
156	Regexp::ParseFlags parse_flags);
157
158	// Simplifies a character class by expanding any named classes
159	// into rune ranges. Does not edit re. Does not consume ref to re.
160	// Caller must Decref return value when done with it.
161	static Regexp* SimplifyCharClass(Regexp* re);
162
163	SimplifyWalker(const SimplifyWalker&) = delete;
164	SimplifyWalker& operator=(const SimplifyWalker&) = delete;
165	};
166
167	// Simplifies a regular expression, returning a new regexp.
168	// The new regexp uses traditional Unix egrep features only,
169	// plus the Perl (?:) non-capturing parentheses.
170	// Otherwise, no POSIX or Perl additions. The new regexp
171	// captures exactly the same subexpressions (with the same indices)
172	// as the original.
173	// Does not edit current object.
174	// Caller must Decref() return value when done with it.
175
176	Regexp* Regexp::Simplify() {
177	CoalesceWalker cw;
178	Regexp* cre = cw.Walk(this, NULL);
179	if (cre == NULL)
180	return NULL;
181	if (cw.stopped_early()) {
182	cre->Decref();
183	return NULL;
184	}
185	SimplifyWalker sw;
186	Regexp* sre = sw.Walk(cre, NULL);
187	cre->Decref();
188	if (sre == NULL)
189	return NULL;
190	if (sw.stopped_early()) {
191	sre->Decref();
192	return NULL;
193	}
194	return sre;
195	}
196
197	#define Simplify DontCallSimplify // Avoid accidental recursion
198
199	// Utility function for PostVisit implementations that compares re->sub() with
200	// child_args to determine whether any child_args changed. In the common case,
201	// where nothing changed, calls Decref() for all child_args and returns false,
202	// so PostVisit must return re->Incref(). Otherwise, returns true.
203	static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
204	for (int i = `0`; i < re->nsub(); i++) {
205	Regexp* sub = re->sub()[i];
206	Regexp* newsub = child_args[i];
207	if (newsub != sub)
208	return true;
209	}
210	for (int i = `0`; i < re->nsub(); i++) {
211	Regexp* newsub = child_args[i];
212	newsub->Decref();
213	}
214	return false;
215	}
216
217	Regexp* CoalesceWalker::Copy(Regexp* re) {
218	return re->Incref();
219	}
220
221	Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
222	// Should never be called: we use Walk(), not WalkExponential().
223	#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
224	LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
225	#endif
226	return re->Incref();
227	}
228
229	Regexp* CoalesceWalker::PostVisit(Regexp* re,
230	Regexp* parent_arg,
231	Regexp* pre_arg,
232	Regexp** child_args,
233	int nchild_args) {
234	if (re->nsub() == `0`)
235	return re->Incref();
236
237	if (re->op() != kRegexpConcat) {
238	if (!ChildArgsChanged(re, child_args))
239	return re->Incref();
240
241	// Something changed. Build a new op.
242	Regexp* nre = new Regexp (re->op(), re->parse_flags());
243	nre->AllocSub(re->nsub());
244	Regexp** nre_subs = nre->sub();
245	for (int i = `0`; i < re->nsub(); i++)
246	nre_subs[i] = child_args[i];
247	// Repeats and Captures have additional data that must be copied.
248	if (re->op() == kRegexpRepeat) {
249	nre->min_ = re->min();
250	nre->max_ = re->max();
251	} else if (re->op() == kRegexpCapture) {
252	nre->cap_ = re->cap();
253	}
254	return nre;
255	}
256
257	bool can_coalesce = false;
258	for (int i = `0`; i < re->nsub(); i++) {
259	if (i+`1` < re->nsub() &&
260	CanCoalesce(child_args[i], child_args[i+`1`])) {
261	can_coalesce = true;
262	break;
263	}
264	}
265	if (!can_coalesce) {
266	if (!ChildArgsChanged(re, child_args))
267	return re->Incref();
268
269	// Something changed. Build a new op.
270	Regexp* nre = new Regexp (re->op(), re->parse_flags());
271	nre->AllocSub(re->nsub());
272	Regexp** nre_subs = nre->sub();
273	for (int i = `0`; i < re->nsub(); i++)
274	nre_subs[i] = child_args[i];
275	return nre;
276	}
277
278	for (int i = `0`; i < re->nsub(); i++) {
279	if (i+`1` < re->nsub() &&
280	CanCoalesce(child_args[i], child_args[i+`1`]))
281	DoCoalesce(&child_args[i], &child_args[i+`1`]);
282	}
283	// Determine how many empty matches were left by DoCoalesce.
284	int n = `0`;
285	for (int i = n; i < re->nsub(); i++) {
286	if (child_args[i]->op() == kRegexpEmptyMatch)
287	n++;
288	}
289	// Build a new op.
290	Regexp* nre = new Regexp (re->op(), re->parse_flags());
291	nre->AllocSub(re->nsub() - n);
292	Regexp** nre_subs = nre->sub();
293	for (int i = `0`, j = `0`; i < re->nsub(); i++) {
294	if (child_args[i]->op() == kRegexpEmptyMatch) {
295	child_args[i]->Decref();
296	continue;
297	}
298	nre_subs[j] = child_args[i];
299	j++;
300	}
301	return nre;
302	}
303
304	bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
305	// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
306	// any byte.
307	if ((r1->op() == kRegexpStar \|\|
308	r1->op() == kRegexpPlus \|\|
309	r1->op() == kRegexpQuest \|\|
310	r1->op() == kRegexpRepeat) &&
311	(r1->sub()[`0`]->op() == kRegexpLiteral \|\|
312	r1->sub()[`0`]->op() == kRegexpCharClass \|\|
313	r1->sub()[`0`]->op() == kRegexpAnyChar \|\|
314	r1->sub()[`0`]->op() == kRegexpAnyByte)) {
315	// r2 must be a star/plus/quest/repeat of the same literal, char class,
316	// any char or any byte.
317	if ((r2->op() == kRegexpStar \|\|
318	r2->op() == kRegexpPlus \|\|
319	r2->op() == kRegexpQuest \|\|
320	r2->op() == kRegexpRepeat) &&
321	Regexp::Equal(r1->sub()[`0`], r2->sub()[`0`]) &&
322	// The parse flags must be consistent.
323	((r1->parse_flags() & Regexp::NonGreedy) ==
324	(r2->parse_flags() & Regexp::NonGreedy))) {
325	return true;
326	}
327	// ... OR an occurrence of that literal, char class, any char or any byte
328	if (Regexp::Equal(r1->sub()[`0`], r2)) {
329	return true;
330	}
331	// ... OR a literal string that begins with that literal.
332	if (r1->sub()[`0`]->op() == kRegexpLiteral &&
333	r2->op() == kRegexpLiteralString &&
334	r2->runes()[`0`] == r1->sub()[`0`]->rune() &&
335	// The parse flags must be consistent.
336	((r1->sub()[`0`]->parse_flags() & Regexp::FoldCase) ==
337	(r2->parse_flags() & Regexp::FoldCase))) {
338	return true;
339	}
340	}
341	return false;
342	}
343
344	void CoalesceWalker::DoCoalesce(Regexp r1ptr, Regexp r2ptr) {
345	Regexp* r1 = *r1ptr;
346	Regexp* r2 = *r2ptr;
347
348	Regexp* nre = Regexp::Repeat(
349	r1->sub()[`0`]->Incref(), r1->parse_flags(), `0`, `0`);
350
351	switch (r1->op()) {
352	case kRegexpStar:
353	nre->min_ = `0`;
354	nre->max_ = -`1`;
355	break;
356
357	case kRegexpPlus:
358	nre->min_ = `1`;
359	nre->max_ = -`1`;
360	break;
361
362	case kRegexpQuest:
363	nre->min_ = `0`;
364	nre->max_ = `1`;
365	break;
366
367	case kRegexpRepeat:
368	nre->min_ = r1->min();
369	nre->max_ = r1->max();
370	break;
371
372	default:
373	LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
374	nre->Decref();
375	return;
376	}
377
378	switch (r2->op()) {
379	case kRegexpStar:
380	nre->max_ = -`1`;
381	goto LeaveEmpty;
382
383	case kRegexpPlus:
384	nre->min_++;
385	nre->max_ = -`1`;
386	goto LeaveEmpty;
387
388	case kRegexpQuest:
389	if (nre->max() != -`1`)
390	nre->max_++;
391	goto LeaveEmpty;
392
393	case kRegexpRepeat:
394	nre->min_ += r2->min();
395	if (r2->max() == -`1`)
396	nre->max_ = -`1`;
397	else if (nre->max() != -`1`)
398	nre->max_ += r2->max();
399	goto LeaveEmpty;
400
401	case kRegexpLiteral:
402	case kRegexpCharClass:
403	case kRegexpAnyChar:
404	case kRegexpAnyByte:
405	nre->min_++;
406	if (nre->max() != -`1`)
407	nre->max_++;
408	goto LeaveEmpty;
409
410	LeaveEmpty:
411	r1ptr = new* Regexp (kRegexpEmptyMatch, Regexp::NoParseFlags);
412	*r2ptr = nre;
413	break;
414
415	case kRegexpLiteralString: {
416	Rune r = r1->sub()[`0`]->rune();
417	// Determine how much of the literal string is removed.
418	// We know that we have at least one rune. :)
419	int n = `1`;
420	while (n < r2->nrunes() && r2->runes()[n] == r)
421	n++;
422	nre->min_ += n;
423	if (nre->max() != -`1`)
424	nre->max_ += n;
425	if (n == r2->nrunes())
426	goto LeaveEmpty;
427	*r1ptr = nre;
428	*r2ptr = Regexp::LiteralString(
429	&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
430	break;
431	}
432
433	default:
434	LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
435	nre->Decref();
436	return;
437	}
438
439	r1->Decref();
440	r2->Decref();
441	}
442
443	Regexp* SimplifyWalker::Copy(Regexp* re) {
444	return re->Incref();
445	}
446
447	Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
448	// Should never be called: we use Walk(), not WalkExponential().
449	#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
450	LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
451	#endif
452	return re->Incref();
453	}
454
455	Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
456	if (re->simple()) {
457	stop = true*;
458	return re->Incref();
459	}
460	return NULL;
461	}
462
463	Regexp* SimplifyWalker::PostVisit(Regexp* re,
464	Regexp* parent_arg,
465	Regexp* pre_arg,
466	Regexp** child_args,
467	int nchild_args) {
468	switch (re->op()) {
469	case kRegexpNoMatch:
470	case kRegexpEmptyMatch:
471	case kRegexpLiteral:
472	case kRegexpLiteralString:
473	case kRegexpBeginLine:
474	case kRegexpEndLine:
475	case kRegexpBeginText:
476	case kRegexpWordBoundary:
477	case kRegexpNoWordBoundary:
478	case kRegexpEndText:
479	case kRegexpAnyChar:
480	case kRegexpAnyByte:
481	case kRegexpHaveMatch:
482	// All these are always simple.
483	re->simple_ = true;
484	return re->Incref();
485
486	case kRegexpConcat:
487	case kRegexpAlternate: {
488	// These are simple as long as the subpieces are simple.
489	if (!ChildArgsChanged(re, child_args)) {
490	re->simple_ = true;
491	return re->Incref();
492	}
493	Regexp* nre = new Regexp (re->op(), re->parse_flags());
494	nre->AllocSub(re->nsub());
495	Regexp** nre_subs = nre->sub();
496	for (int i = `0`; i < re->nsub(); i++)
497	nre_subs[i] = child_args[i];
498	nre->simple_ = true;
499	return nre;
500	}
501
502	case kRegexpCapture: {
503	Regexp* newsub = child_args[`0`];
504	if (newsub == re->sub()[`0`]) {
505	newsub->Decref();
506	re->simple_ = true;
507	return re->Incref();
508	}
509	Regexp* nre = new Regexp (kRegexpCapture, re->parse_flags());
510	nre->AllocSub(`1`);
511	nre->sub()[`0`] = newsub;
512	nre->cap_ = re->cap();
513	nre->simple_ = true;
514	return nre;
515	}
516
517	case kRegexpStar:
518	case kRegexpPlus:
519	case kRegexpQuest: {
520	Regexp* newsub = child_args[`0`];
521	// Special case: repeat the empty string as much as
522	// you want, but it's still the empty string.
523	if (newsub->op() == kRegexpEmptyMatch)
524	return newsub;
525
526	// These are simple as long as the subpiece is simple.
527	if (newsub == re->sub()[`0`]) {
528	newsub->Decref();
529	re->simple_ = true;
530	return re->Incref();
531	}
532
533	// These are also idempotent if flags are constant.
534	if (re->op() == newsub->op() &&
535	re->parse_flags() == newsub->parse_flags())
536	return newsub;
537
538	Regexp* nre = new Regexp (re->op(), re->parse_flags());
539	nre->AllocSub(`1`);
540	nre->sub()[`0`] = newsub;
541	nre->simple_ = true;
542	return nre;
543	}
544
545	case kRegexpRepeat: {
546	Regexp* newsub = child_args[`0`];
547	// Special case: repeat the empty string as much as
548	// you want, but it's still the empty string.
549	if (newsub->op() == kRegexpEmptyMatch)
550	return newsub;
551
552	Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
553	re->parse_flags());
554	newsub->Decref();
555	nre->simple_ = true;
556	return nre;
557	}
558
559	case kRegexpCharClass: {
560	Regexp* nre = SimplifyCharClass(re);
561	nre->simple_ = true;
562	return nre;
563	}
564	}
565
566	LOG(ERROR) << "Simplify case not handled: " << re->op();
567	return re->Incref();
568	}
569
570	// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
571	// Returns a new Regexp, handing the ref to the caller.
572	Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
573	Regexp::ParseFlags parse_flags) {
574	Regexp* re = new Regexp (kRegexpConcat, parse_flags);
575	re->AllocSub(`2`);
576	Regexp** subs = re->sub();
577	subs[`0`] = re1;
578	subs[`1`] = re2;
579	return re;
580	}
581
582	// Simplifies the expression re{min,max} in terms of , +, and ?.*
583	// Returns a new regexp. Does not edit re. Does not consume reference to re.
584	// Caller must Decref return value when done with it.
585	// The result will not* necessarily have the right capturing parens*
586	// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
587	// but in the Regexp representation, both (x) are marked as $1.*
588	Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
589	Regexp::ParseFlags f) {
590	// x{n,} means at least n matches of x.
591	if (max == -`1`) {
592	// Special case: x{0,} is x*
593	if (min == `0`)
594	return Regexp::Star(re->Incref(), f);
595
596	// Special case: x{1,} is x+
597	if (min == `1`)
598	return Regexp::Plus(re->Incref(), f);
599
600	// General case: x{4,} is xxxx+
601	PODArray<Regexp*> nre_subs(min);
602	for (int i = `0`; i < min-`1`; i++)
603	nre_subs [i] = re->Incref();
604	nre_subs [min-`1`] = Regexp::Plus(re->Incref(), f);
605	return Regexp::Concat(nre_subs.data(), min, f);
606	}
607
608	// Special case: (x){0} matches only empty string.
609	if (min == `0` && max == `0`)
610	return new Regexp (kRegexpEmptyMatch, f);
611
612	// Special case: x{1} is just x.
613	if (min == `1` && max == `1`)
614	return re->Incref();
615
616	// General case: x{n,m} means n copies of x and m copies of x?.
617	// The machine will do less work if we nest the final m copies,
618	// so that x{2,5} = xx(x(x(x)?)?)?
619
620	// Build leading prefix: xx. Capturing only on the last one.
621	Regexp* nre = NULL;
622	if (min > `0`) {
623	PODArray<Regexp*> nre_subs(min);
624	for (int i = `0`; i < min; i++)
625	nre_subs [i] = re->Incref();
626	nre = Regexp::Concat(nre_subs.data(), min, f);
627	}
628
629	// Build and attach suffix: (x(x(x)?)?)?
630	if (max > min) {
631	Regexp* suf = Regexp::Quest(re->Incref(), f);
632	for (int i = min+`1`; i < max; i++)
633	suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
634	if (nre == NULL)
635	nre = suf;
636	else
637	nre = Concat2(nre, suf, f);
638	}
639
640	if (nre == NULL) {
641	// Some degenerate case, like min > max, or min < max < 0.
642	// This shouldn't happen, because the parser rejects such regexps.
643	LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
644	return new Regexp (kRegexpNoMatch, f);
645	}
646
647	return nre;
648	}
649
650	// Simplifies a character class.
651	// Caller must Decref return value when done with it.
652	Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
653	CharClass* cc = re->cc();
654
655	// Special cases
656	if (cc->empty())
657	return new Regexp (kRegexpNoMatch, re->parse_flags());
658	if (cc->full())
659	return new Regexp (kRegexpAnyChar, re->parse_flags());
660
661	return re->Incref();
662	}
663
664	} // namespace re2
665

Browse the source code of tensorflow/external/com_googlesource_code_re2/re2/simplify.cc