// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Compile regular expression to Prog.
//
// Prog and Inst are defined in prog.h.
// This file's external interface is just Regexp::CompileToProg.
// The Compiler class defined in this file is private.

#include <stdint.h>
#include <string.h>
#include <utility>

#include "absl/base/macros.h"
#include "absl/container/flat_hash_map.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"

namespace re2 {

// List of pointers to Inst* that need to be filled in (patched).
// Because the Inst* haven't been filled in yet,
// we can use the Inst* word to hold the list's "next" pointer.
// It's kind of sleazy, but it works well in practice.
// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// Because the out and out1 fields in Inst are no longer pointers,
// we can't use pointers directly here either. Instead, head refers
// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
// head == 0 represents the NULL list. This is okay because instruction #0
// is always the fail instruction, which never appears on a list.
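// For example, head == 5 refers to inst_[2].out1 (5>>1 == 2, 5&1 == 1),
// while head == 4 refers to inst_[2].out.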
struct PatchList {
  // Returns patch list containing just p.
  static PatchList Mk(uint32_t p) {
    return {p, p};
  }

  // Patches all the entries on l to have value p.
  // Caller must not ever use patch list again.
  static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
    while (l.head != 0) {
      Prog::Inst* ip = &inst0[l.head>>1];
      if (l.head&1) {
        l.head = ip->out1();
        ip->out1_ = p;
      } else {
        l.head = ip->out();
        ip->set_out(p);
      }
    }
  }

  // Appends two patch lists and returns result.
  static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
    if (l1.head == 0)
      return l2;
    if (l2.head == 0)
      return l1;
    Prog::Inst* ip = &inst0[l1.tail>>1];
    if (l1.tail&1)
      ip->out1_ = l2.head;
    else
      ip->set_out(l2.head);
    return {l1.head, l2.tail};
  }

  uint32_t head;
  uint32_t tail;  // for constant-time append
};

static const PatchList kNullPatchList = {0, 0};

// Compiled program fragment.
struct Frag {
  uint32_t begin;
  PatchList end;
  bool nullable;

  Frag() : begin(0), end(kNullPatchList), nullable(false) {}
  Frag(uint32_t begin, PatchList end, bool nullable)
      : begin(begin), end(end), nullable(nullable) {}
};

// Input encodings.
enum Encoding {
  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
  kEncodingLatin1,    // Latin-1 (0-FF)
};

class Compiler : public Regexp::Walker<Frag> {
 public:
  explicit Compiler();
  ~Compiler();

  // Compiles Regexp to a new Prog.
  // Caller is responsible for deleting Prog when finished with it.
  // If reversed is true, compiles for walking over the input
  // string backward (reverses all concatenations).
  static Prog* Compile(Regexp* re, bool reversed, int64_t max_mem);

  // Compiles alternation of all the re to a new Prog.
  // Each re has a match with an id equal to its index in the vector.
  static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);

  // Interface for Regexp::Walker, which helps traverse the Regexp.
  // The walk is purely post-recursive: given the machines for the
  // children, PostVisit combines them to create the machine for
  // the current node. The child_args are Frags.
  // The Compiler traverses the Regexp parse tree, visiting
  // each node in depth-first order. It invokes PreVisit before
  // visiting the node's children and PostVisit after visiting
  // the children.
  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
                 int nchild_args);
  Frag ShortVisit(Regexp* re, Frag parent_arg);
  Frag Copy(Frag arg);

  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
  Frag Plus(Frag a, bool nongreedy);
  Frag Star(Frag a, bool nongreedy);
  Frag Quest(Frag a, bool nongreedy);

  // Given fragment a, returns (a) capturing as \n.
  Frag Capture(Frag a, int n);

  // Given fragments a and b, returns ab; a|b
  Frag Cat(Frag a, Frag b);
  Frag Alt(Frag a, Frag b);

  // Returns a fragment that can't match anything.
  Frag NoMatch();

  // Returns a fragment that matches the empty string.
  Frag Match(int32_t id);

  // Returns a no-op fragment.
  Frag Nop();

  // Returns a fragment matching the byte range lo-hi.
  Frag ByteRange(int lo, int hi, bool foldcase);

  // Returns a fragment matching an empty-width special op.
  Frag EmptyWidth(EmptyOp op);

  // Adds n instructions to the program.
  // Returns the index of the first one.
  // Returns -1 if no more instructions are available.
  int AllocInst(int n);

  // Rune range compiler.

  // Begins a new alternation.
  void BeginRange();

  // Adds a fragment matching the rune range lo-hi.
  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
  void Add_80_10ffff();

  // New suffix that matches the byte range lo-hi, then goes to next.
  int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
  int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);

  // Returns true iff the suffix is cached.
  bool IsCachedRuneByteSuffix(int id);

  // Adds a suffix to alternation.
  void AddSuffix(int id);

  // Adds a suffix to the trie starting from the given root node.
  // Returns zero iff allocating an instruction fails. Otherwise, returns
  // the current root node, which might be different from what was given.
  int AddSuffixRecursive(int root, int id);

  // Finds the trie node for the given suffix. Returns a Frag in order to
  // distinguish between pointing at the root node directly (end.head == 0)
  // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
  Frag FindByteRange(int root, int id);

  // Compares two ByteRanges and returns true iff they are equal.
  bool ByteRangeEqual(int id1, int id2);

  // Returns the alternation of all the added suffixes.
  Frag EndRange();

  // Single rune.
  Frag Literal(Rune r, bool foldcase);

  void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
  Prog* Finish(Regexp* re);

  // Returns .* where dot = any byte
  Frag DotStar();

 private:
  Prog* prog_;         // Program being built.
  bool failed_;        // Did we give up compiling?
  Encoding encoding_;  // Input encoding
  bool reversed_;      // Should program run backward over text?

  PODArray<Prog::Inst> inst_;
  int ninst_;          // Number of instructions used.
  int max_ninst_;      // Maximum number of instructions.

  int64_t max_mem_;    // Total memory budget.

  absl::flat_hash_map<uint64_t, int> rune_cache_;
  Frag rune_range_;

  RE2::Anchor anchor_;  // anchor mode for RE2::Set

  Compiler(const Compiler&) = delete;
  Compiler& operator=(const Compiler&) = delete;
};

Compiler::Compiler() {
  prog_ = new Prog();
  failed_ = false;
  encoding_ = kEncodingUTF8;
  reversed_ = false;
  ninst_ = 0;
  max_ninst_ = 1;  // make AllocInst for fail instruction okay
  max_mem_ = 0;
  int fail = AllocInst(1);
  inst_[fail].InitFail();
  max_ninst_ = 0;  // Caller must change
}

Compiler::~Compiler() {
  delete prog_;
}

int Compiler::AllocInst(int n) {
  if (failed_ || ninst_ + n > max_ninst_) {
    failed_ = true;
    return -1;
  }

  if (ninst_ + n > inst_.size()) {
    int cap = inst_.size();
    if (cap == 0)
      cap = 8;
    while (ninst_ + n > cap)
      cap *= 2;
    PODArray<Prog::Inst> inst(cap);
    if (inst_.data() != NULL)
      memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
    memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
    inst_ = std::move(inst);
  }
  int id = ninst_;
  ninst_ += n;
  return id;
}

// These routines are somewhat hard to visualize in text --
// see http://swtch.com/~rsc/regexp/regexp1.html for
// pictures explaining what is going on here.

// Returns an unmatchable fragment.
Frag Compiler::NoMatch() {
  return Frag();
}

// Is a an unmatchable fragment?
static bool IsNoMatch(Frag a) {
  return a.begin == 0;
}

// Given fragments a and b, returns fragment for ab.
Frag Compiler::Cat(Frag a, Frag b) {
  if (IsNoMatch(a) || IsNoMatch(b))
    return NoMatch();

  // Elide no-op.
  Prog::Inst* begin = &inst_[a.begin];
  if (begin->opcode() == kInstNop &&
      a.end.head == (a.begin << 1) &&
      begin->out() == 0) {
    // in case refs to a somewhere
    PatchList::Patch(inst_.data(), a.end, b.begin);
    return b;
  }

  // To run backward over string, reverse all concatenations.
  if (reversed_) {
    PatchList::Patch(inst_.data(), b.end, a.begin);
    return Frag(b.begin, a.end, b.nullable && a.nullable);
  }

  PatchList::Patch(inst_.data(), a.end, b.begin);
  return Frag(a.begin, b.end, a.nullable && b.nullable);
}
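
// For example, compiling "ab" produces ByteRange fragments for 'a' and 'b';
// Cat patches the dangling out of the 'a' fragment to point at b.begin and
// returns a fragment spanning a.begin to b.end. In reversed mode the roles
// swap, so the program consumes 'b' before 'a'.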

// Given fragments for a and b, returns fragment for a|b.
Frag Compiler::Alt(Frag a, Frag b) {
  // Special case for convenience in loops.
  if (IsNoMatch(a))
    return b;
  if (IsNoMatch(b))
    return a;

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();

  inst_[id].InitAlt(a.begin, b.begin);
  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
              a.nullable || b.nullable);
}
// When capturing submatches in Perl-like mode, a kOpAlt Inst
// treats out_ as the first choice, out1_ as the second.
//
// For *, +, and ?, if out_ causes another repetition,
// then the operator is greedy. If out1_ is the repetition
// (and out_ moves forward), then the operator is non-greedy.

// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Frag Compiler::Plus(Frag a, bool nongreedy) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  PatchList::Patch(inst_.data(), a.end, id);
  return Frag(a.begin, pl, a.nullable);
}
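
// For example, greedy a+ ends in Alt(a.begin, 0): looping back into a is
// out_ (the preferred first choice) and the dangling out1_ exits the loop;
// nongreedy a+? swaps the two.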

// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
  // When the subexpression is nullable, one Alt isn't enough to guarantee
  // correct priority ordering within the transitive closure. The simplest
  // solution is to handle it as (a+)? instead, which adds the second Alt.
  if (a.nullable)
    return Quest(Plus(a, nongreedy), nongreedy);

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  PatchList::Patch(inst_.data(), a.end, id);
  return Frag(id, pl, true);
}

// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Frag Compiler::Quest(Frag a, bool nongreedy) {
  if (IsNoMatch(a))
    return Nop();
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
}

// Returns a fragment for the byte range lo-hi.
Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitByteRange(lo, hi, foldcase, 0);
  return Frag(id, PatchList::Mk(id << 1), false);
}

// Returns a no-op fragment. Sometimes unavoidable.
Frag Compiler::Nop() {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitNop(0);
  return Frag(id, PatchList::Mk(id << 1), true);
}

// Returns a fragment that signals a match.
Frag Compiler::Match(int32_t match_id) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitMatch(match_id);
  return Frag(id, kNullPatchList, false);
}

// Returns a fragment matching a particular empty-width op (like ^ or $)
Frag Compiler::EmptyWidth(EmptyOp empty) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitEmptyWidth(empty, 0);
  return Frag(id, PatchList::Mk(id << 1), true);
}

// Given a fragment a, returns a fragment with capturing parens around a.
Frag Compiler::Capture(Frag a, int n) {
  if (IsNoMatch(a))
    return NoMatch();
  int id = AllocInst(2);
  if (id < 0)
    return NoMatch();
  inst_[id].InitCapture(2*n, a.begin);
  inst_[id+1].InitCapture(2*n+1, 0);
  PatchList::Patch(inst_.data(), a.end, id+1);

  return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
}
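
// For example, the first capture group (n == 1) writes slots 2 and 3:
// inst_[id] records slot 2 on entry and inst_[id+1] records slot 3 on exit;
// slots 0 and 1 belong to the overall match.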

// A Rune is a name for a Unicode code point.
// Returns maximum rune encoded by UTF-8 sequence of length len.
static int MaxRune(int len) {
  int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
  if (len == 1)
    b = 7;
  else
    b = 8-(len+1) + 6*(len-1);
  return (1<<b) - 1;  // maximum Rune for b bits.
}
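
// For example, MaxRune(1) == 0x7F, MaxRune(2) == 0x7FF and
// MaxRune(3) == 0xFFFF, the upper bounds of the 1-, 2- and 3-byte
// UTF-8 sequences.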

// The rune range compiler caches common suffix fragments,
// which are very common in UTF-8 (e.g., [80-bf]).
// The fragment suffixes are identified by their start
// instructions. NULL denotes the eventual end match.
// The Frag accumulates in rune_range_. Caching common
// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
// and it reduces the corresponding one-pass NFA from 16 nodes to 8.

void Compiler::BeginRange() {
  rune_cache_.clear();
  rune_range_.begin = 0;
  rune_range_.end = kNullPatchList;
}

int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                     int next) {
  Frag f = ByteRange(lo, hi, foldcase);
  if (next != 0) {
    PatchList::Patch(inst_.data(), f.end, next);
  } else {
    rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
  }
  return f.begin;
}

static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  return (uint64_t)next << 17 |
         (uint64_t)lo << 9 |
         (uint64_t)hi << 1 |
         (uint64_t)foldcase;
}
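
// The key packs foldcase into bit 0, hi into bits 1-8, lo into bits 9-16
// and next into bits 17 and up; next is an instruction index below 2^24
// (see Setup()), so the key always fits in 64 bits.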

int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                   int next) {
  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
  absl::flat_hash_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
  if (it != rune_cache_.end())
    return it->second;
  int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
  rune_cache_[key] = id;
  return id;
}

bool Compiler::IsCachedRuneByteSuffix(int id) {
  uint8_t lo = inst_[id].lo_;
  uint8_t hi = inst_[id].hi_;
  bool foldcase = inst_[id].foldcase() != 0;
  int next = inst_[id].out();

  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
  return rune_cache_.find(key) != rune_cache_.end();
}

void Compiler::AddSuffix(int id) {
  if (failed_)
    return;

  if (rune_range_.begin == 0) {
    rune_range_.begin = id;
    return;
  }

  if (encoding_ == kEncodingUTF8) {
    // Build a trie in order to reduce fanout.
    rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
    return;
  }

  int alt = AllocInst(1);
  if (alt < 0) {
    rune_range_.begin = 0;
    return;
  }
  inst_[alt].InitAlt(rune_range_.begin, id);
  rune_range_.begin = alt;
}

int Compiler::AddSuffixRecursive(int root, int id) {
  DCHECK(inst_[root].opcode() == kInstAlt ||
         inst_[root].opcode() == kInstByteRange);

  Frag f = FindByteRange(root, id);
  if (IsNoMatch(f)) {
    int alt = AllocInst(1);
    if (alt < 0)
      return 0;
    inst_[alt].InitAlt(root, id);
    return alt;
  }

  int br;
  if (f.end.head == 0)
    br = root;
  else if (f.end.head&1)
    br = inst_[f.begin].out1();
  else
    br = inst_[f.begin].out();

  if (IsCachedRuneByteSuffix(br)) {
    // We can't fiddle with cached suffixes, so make a clone of the head.
    int byterange = AllocInst(1);
    if (byterange < 0)
      return 0;
    inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
                                   inst_[br].foldcase(), inst_[br].out());

    // Ensure that the parent points to the clone, not to the original.
    // Note that this could leave the head unreachable except via the cache.
    br = byterange;
    if (f.end.head == 0)
      root = br;
    else if (f.end.head&1)
      inst_[f.begin].out1_ = br;
    else
      inst_[f.begin].set_out(br);
  }

  int out = inst_[id].out();
  if (!IsCachedRuneByteSuffix(id)) {
    // The head should be the instruction most recently allocated, so free it
    // instead of leaving it unreachable.
    DCHECK_EQ(id, ninst_-1);
    inst_[id].out_opcode_ = 0;
    inst_[id].out1_ = 0;
    ninst_--;
  }

  out = AddSuffixRecursive(inst_[br].out(), out);
  if (out == 0)
    return 0;

  inst_[br].set_out(out);
  return root;
}
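
// For example, after adding the suffix [D1-D1][90-9F], adding the suffix
// [D1-D1][A0-AF] finds the matching leading ByteRange via FindByteRange
// and merges the tails, yielding the trie [D1-D1]([90-9F]|[A0-AF]).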

bool Compiler::ByteRangeEqual(int id1, int id2) {
  return inst_[id1].lo() == inst_[id2].lo() &&
         inst_[id1].hi() == inst_[id2].hi() &&
         inst_[id1].foldcase() == inst_[id2].foldcase();
}

Frag Compiler::FindByteRange(int root, int id) {
  if (inst_[root].opcode() == kInstByteRange) {
    if (ByteRangeEqual(root, id))
      return Frag(root, kNullPatchList, false);
    else
      return NoMatch();
  }

  while (inst_[root].opcode() == kInstAlt) {
    int out1 = inst_[root].out1();
    if (ByteRangeEqual(out1, id))
      return Frag(root, PatchList::Mk((root << 1) | 1), false);

    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
    // what we're looking for, then we can stop immediately. Unfortunately, we
    // can't short-circuit the search in reverse mode.
    if (!reversed_)
      return NoMatch();

    int out = inst_[root].out();
    if (inst_[out].opcode() == kInstAlt)
      root = out;
    else if (ByteRangeEqual(out, id))
      return Frag(root, PatchList::Mk(root << 1), false);
    else
      return NoMatch();
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}

Frag Compiler::EndRange() {
  return rune_range_;
}

// Converts rune range lo-hi into a fragment that recognizes
// the bytes that would make up those runes in the current
// encoding (Latin 1 or UTF-8).
// This lets the machine work byte-by-byte even when
// using multibyte encodings.

void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
  switch (encoding_) {
    default:
    case kEncodingUTF8:
      AddRuneRangeUTF8(lo, hi, foldcase);
      break;
    case kEncodingLatin1:
      AddRuneRangeLatin1(lo, hi, foldcase);
      break;
  }
}

void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
  // Latin-1 is easy: runes *are* bytes.
  if (lo > hi || lo > 0xFF)
    return;
  if (hi > 0xFF)
    hi = 0xFF;
  AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                   static_cast<uint8_t>(hi), foldcase, 0));
}

void Compiler::Add_80_10ffff() {
  // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
  // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
  // permitting overlong encodings in E0 and F0 sequences and code points
  // over 10FFFF in F4 sequences, the size of the bytecode and the number
  // of equivalence classes are reduced significantly.
  int id;
  if (reversed_) {
    // Prefix factoring matters, but we don't have to handle it here
    // because the rune range trie logic takes care of that already.
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);

    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);

    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);
  } else {
    // Suffix factoring matters - and we do have to handle it here.
    int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
    AddSuffix(id);

    int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
    AddSuffix(id);

    int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
    AddSuffix(id);
  }
}
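
// In forward mode, the three alternatives share continuation fragments
// (cont3 extends cont2, which extends cont1), so 80-10FFFF compiles to
//   [C2-DF][80-BF] | [E0-EF][80-BF][80-BF] | [F0-F4][80-BF][80-BF][80-BF]
// with the common [80-BF] tails emitted only once.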

void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
  if (lo > hi)
    return;

  // Pick off 80-10FFFF as a common special case.
  if (lo == 0x80 && hi == 0x10ffff) {
    Add_80_10ffff();
    return;
  }

  // Split range into same-length sized ranges.
  for (int i = 1; i < UTFmax; i++) {
    Rune max = MaxRune(i);
    if (lo <= max && max < hi) {
      AddRuneRangeUTF8(lo, max, foldcase);
      AddRuneRangeUTF8(max+1, hi, foldcase);
      return;
    }
  }

  // ASCII range is always a special case.
  if (hi < Runeself) {
    AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                     static_cast<uint8_t>(hi), foldcase, 0));
    return;
  }

  // Split range into sections that agree on leading bytes.
  for (int i = 1; i < UTFmax; i++) {
    uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
    if ((lo & ~m) != (hi & ~m)) {
      if ((lo & m) != 0) {
        AddRuneRangeUTF8(lo, lo|m, foldcase);
        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
        return;
      }
      if ((hi & m) != m) {
        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
        AddRuneRangeUTF8(hi&~m, hi, foldcase);
        return;
      }
    }
  }
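
  // For example, lo-hi = 0450-045F encodes as D1 90 through D1 9F: the
  // bounds agree on the leading byte, so no split is needed and the code
  // below emits [D1-D1][90-9F].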

  // Finally. Generate byte matching equivalent for lo-hi.
  uint8_t ulo[UTFmax], uhi[UTFmax];
  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
  (void)m;  // USED(m)
  DCHECK_EQ(n, m);

  // The logic below encodes this thinking:
  //
  // 1. When we have built the whole suffix, we know that it cannot
  // possibly be a suffix of anything longer: in forward mode, nothing
  // else can occur before the leading byte; in reverse mode, nothing
  // else can occur after the last continuation byte or else the leading
  // byte would have to change. Thus, there is no benefit to caching
  // the first byte of the suffix whereas there is a cost involved in
  // cloning it if it begins a common prefix, which is fairly likely.
  //
  // 2. Conversely, the last byte of the suffix cannot possibly be a
  // prefix of anything because next == 0, so we will never want to
  // clone it, but it is fairly likely to be a common suffix. Perhaps
  // more so in reverse mode than in forward mode because the former is
  // "converging" towards lower entropy, but caching is still worthwhile
  // for the latter in cases such as 80-BF.
  //
  // 3. Handling the bytes between the first and the last is less
  // straightforward and, again, the approach depends on whether we are
  // "converging" towards lower entropy: in forward mode, a single byte
  // is unlikely to be part of a common suffix whereas a byte range
  // is more likely so; in reverse mode, a byte range is unlikely to
  // be part of a common suffix whereas a single byte is more likely
  // so. The same benefit versus cost argument applies here.
  int id = 0;
  if (reversed_) {
    for (int i = 0; i < n; i++) {
      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
      // continuation byte; cache anything else iff it's a single byte (XX-XX).
      if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  } else {
    for (int i = n-1; i >= 0; i--) {
      // In forward UTF-8 mode: don't cache the leading byte; cache the last
      // continuation byte; cache anything else iff it's a byte range (XX-YY).
      if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  }
  AddSuffix(id);
}

// Should not be called.
Frag Compiler::Copy(Frag arg) {
  // We're using WalkExponential; there should be no copying.
  LOG(DFATAL) << "Compiler::Copy called!";
  failed_ = true;
  return NoMatch();
}

// Visits a node quickly; called once WalkExponential has
// decided to cut this walk short.
Frag Compiler::ShortVisit(Regexp* re, Frag) {
  failed_ = true;
  return NoMatch();
}

// Called before traversing a node's children during the walk.
Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
  // Cut off walk if we've already failed.
  if (failed_)
    *stop = true;

  return Frag();  // not used by caller
}

Frag Compiler::Literal(Rune r, bool foldcase) {
  switch (encoding_) {
    default:
      return Frag();

    case kEncodingLatin1:
      return ByteRange(r, r, foldcase);

    case kEncodingUTF8: {
      if (r < Runeself)  // Make common case fast.
        return ByteRange(r, r, foldcase);
      uint8_t buf[UTFmax];
      int n = runetochar(reinterpret_cast<char*>(buf), &r);
      Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
      for (int i = 1; i < n; i++)
        f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
      return f;
    }
  }
}

// Called after traversing the node's children during the walk.
// Given their frags, build and return the frag for this re.
Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
                         int nchild_frags) {
  // If a child failed, don't bother going forward, especially
  // since the child_frags might contain Frags with NULLs in them.
  if (failed_)
    return NoMatch();

  // Given the child fragments, return the fragment for this node.
  switch (re->op()) {
    case kRegexpRepeat:
      // Should not see; code at bottom of function will print error
      break;

    case kRegexpNoMatch:
      return NoMatch();

    case kRegexpEmptyMatch:
      return Nop();

    case kRegexpHaveMatch: {
      Frag f = Match(re->match_id());
      if (anchor_ == RE2::ANCHOR_BOTH) {
        // Append \z or else the subexpression will effectively be unanchored.
        // Complemented by the UNANCHORED case in CompileSet().
        f = Cat(EmptyWidth(kEmptyEndText), f);
      }
      return f;
    }

    case kRegexpConcat: {
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Cat(f, child_frags[i]);
      return f;
    }

    case kRegexpAlternate: {
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Alt(f, child_frags[i]);
      return f;
    }

    case kRegexpStar:
      return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpPlus:
      return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpQuest:
      return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpLiteral:
      return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);

    case kRegexpLiteralString: {
      // Concatenation of literals.
      if (re->nrunes() == 0)
        return Nop();
      Frag f;
      for (int i = 0; i < re->nrunes(); i++) {
        Frag f1 = Literal(re->runes()[i],
                          (re->parse_flags()&Regexp::FoldCase) != 0);
        if (i == 0)
          f = f1;
        else
          f = Cat(f, f1);
      }
      return f;
    }

    case kRegexpAnyChar:
      BeginRange();
      AddRuneRange(0, Runemax, false);
      return EndRange();

    case kRegexpAnyByte:
      return ByteRange(0x00, 0xFF, false);

    case kRegexpCharClass: {
      CharClass* cc = re->cc();
      if (cc->empty()) {
        // This can't happen.
        LOG(DFATAL) << "No ranges in char class";
        failed_ = true;
        return NoMatch();
      }

      // ASCII case-folding optimization: if the char class
      // behaves the same on A-Z as it does on a-z,
      // discard any ranges wholly contained in A-Z
      // and mark the other ranges as foldascii.
      // This reduces the size of a program for
      // (?i)abc from 3 insts per letter to 1 per letter.
      bool foldascii = cc->FoldsASCII();

      // Character class is just a big OR of the different
      // character ranges in the class.
      BeginRange();
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
        // ASCII case-folding optimization (see above).
        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
          continue;

        // If this range contains all of A-Za-z or none of it,
        // the fold flag is unnecessary; don't bother.
        bool fold = foldascii;
        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
            ('Z' < i->lo && i->hi < 'a'))
          fold = false;

        AddRuneRange(i->lo, i->hi, fold);
      }
      return EndRange();
    }

    case kRegexpCapture:
      // If this is a non-capturing parenthesis -- (?:foo) --
      // just use the inner expression.
      if (re->cap() < 0)
        return child_frags[0];
      return Capture(child_frags[0], re->cap());

    case kRegexpBeginLine:
      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);

    case kRegexpEndLine:
      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);

    case kRegexpBeginText:
      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);

    case kRegexpEndText:
      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);

    case kRegexpWordBoundary:
      return EmptyWidth(kEmptyWordBoundary);

    case kRegexpNoWordBoundary:
      return EmptyWidth(kEmptyNonWordBoundary);
  }
  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
  failed_ = true;
  return NoMatch();
}

// Is this regexp required to start at the beginning of the text?
// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
// but handles (\A(a|b)). Could use the Walker to write a more exact one.
static bool IsAnchorStart(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp. As the comment
  // above says, IsAnchorStart is conservative, so returning
  // a false negative is okay. The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      if (re->nsub() > 0) {
        sub = re->sub()[0]->Incref();
        if (IsAnchorStart(&sub, depth+1)) {
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[0] = sub;  // already have reference
          for (int i = 1; i < re->nsub(); i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();
          return true;
        }
        sub->Decref();
      }
      break;
    case kRegexpCapture:
      sub = re->sub()[0]->Incref();
      if (IsAnchorStart(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpBeginText:
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}

// Is this regexp required to start at the end of the text?
// Only approximate; can return false for complicated regexps like (a\z|b\z),
// but handles ((a|b)\z). Could use the Walker to write a more exact one.
static bool IsAnchorEnd(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp. As the comment
  // above says, IsAnchorEnd is conservative, so returning
  // a false negative is okay. The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      if (re->nsub() > 0) {
        sub = re->sub()[re->nsub() - 1]->Incref();
        if (IsAnchorEnd(&sub, depth+1)) {
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[re->nsub() - 1] = sub;  // already have reference
          for (int i = 0; i < re->nsub() - 1; i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();
          return true;
        }
        sub->Decref();
      }
      break;
    case kRegexpCapture:
      sub = re->sub()[0]->Incref();
      if (IsAnchorEnd(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpEndText:
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}

void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
                     RE2::Anchor anchor) {
  if (flags & Regexp::Latin1)
    encoding_ = kEncodingLatin1;
  max_mem_ = max_mem;
  if (max_mem <= 0) {
    max_ninst_ = 100000;  // more than enough
  } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
    // No room for anything.
    max_ninst_ = 0;
  } else {
    int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
    // Limit instruction count so that inst->id() fits nicely in an int.
    // SparseArray also assumes that the indices (inst->id()) are ints.
    // The call to WalkExponential uses 2*max_ninst_ below,
    // and other places in the code use 2 or 3 * prog->size().
    // Limiting to 2^24 should avoid overflow in those places.
    // (The point of allowing more than 32 bits of memory is to
    // have plenty of room for the DFA states, not to use it up
    // on the program.)
    if (m >= 1<<24)
      m = 1<<24;
    // Inst imposes its own limit (currently bigger than 2^24 but be safe).
    if (m > Prog::Inst::kMaxInst)
      m = Prog::Inst::kMaxInst;
    max_ninst_ = static_cast<int>(m);
  }
  anchor_ = anchor;
}
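
// For example, with max_mem == 1<<20 and assuming a typical 8-byte
// Prog::Inst, m == (1<<20 - sizeof(Prog)) / 8, roughly 130K instructions,
// which is comfortably below both the 1<<24 cap and Prog::Inst::kMaxInst.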

// Compiles re, returning program.
// Caller is responsible for deleting prog_.
// If reversed is true, compiles a program that expects
// to run over the input string backward (reverses all concatenations).
// The reversed flag is also recorded in the returned program.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
  c.reversed_ = reversed;

  // Simplify to remove things like counted repetitions
  // and character classes like \d.
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Record whether prog is anchored, removing the anchors.
  // (They get in the way of other optimizations.)
  bool is_anchor_start = IsAnchorStart(&sre, 0);
  bool is_anchor_end = IsAnchorEnd(&sre, 0);

  // Generate fragment for entire regexp.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Success! Finish by putting Match node at end, and record start.
  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
  // to behave normally.
  c.reversed_ = false;
  all = c.Cat(all, c.Match(0));

  c.prog_->set_reversed(reversed);
  if (c.prog_->reversed()) {
    c.prog_->set_anchor_start(is_anchor_end);
    c.prog_->set_anchor_end(is_anchor_start);
  } else {
    c.prog_->set_anchor_start(is_anchor_start);
    c.prog_->set_anchor_end(is_anchor_end);
  }

  c.prog_->set_start(all.begin);
  if (!c.prog_->anchor_start()) {
    // Also create unanchored version, which starts with a .*? loop.
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start_unanchored(all.begin);

  // Hand ownership of prog_ to caller.
  return c.Finish(re);
}

Prog* Compiler::Finish(Regexp* re) {
  if (failed_)
    return NULL;

  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
    // No possible matches; keep Fail instruction only.
    ninst_ = 1;
  }

  // Hand off the array to Prog.
  prog_->inst_ = std::move(inst_);
  prog_->size_ = ninst_;

  prog_->Optimize();
  prog_->Flatten();
  prog_->ComputeByteMap();

  if (!prog_->reversed()) {
    std::string prefix;
    bool prefix_foldcase;
    if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
      prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
  }

  // Record remaining memory for DFA.
  if (max_mem_ <= 0) {
    prog_->set_dfa_mem(1<<20);
  } else {
    int64_t m = max_mem_ - sizeof(Prog);
    m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
    if (prog_->CanBitState())
      m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
    if (m < 0)
      m = 0;
    prog_->set_dfa_mem(m);
  }

  Prog* p = prog_;
  prog_ = NULL;
  return p;
}

// Converts Regexp to Prog.
Prog* Regexp::CompileToProg(int64_t max_mem) {
  return Compiler::Compile(this, false, max_mem);
}

Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
  return Compiler::Compile(this, true, max_mem);
}
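
// A typical caller parses first, then compiles, e.g. (sketch):
//   re2::RegexpStatus status;
//   re2::Regexp* re = re2::Regexp::Parse("a+b", re2::Regexp::LikePerl,
//                                        &status);
//   re2::Prog* prog = re->CompileToProg(0);  // <= 0 selects default budgets
//   ...
//   delete prog;
//   re->Decref();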

Frag Compiler::DotStar() {
  return Star(ByteRange(0x00, 0xff, false), true);
}

// Compiles RE set to Prog.
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, anchor);

  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  c.prog_->set_anchor_start(true);
  c.prog_->set_anchor_end(true);

  if (anchor == RE2::UNANCHORED) {
    // Prepend .* or else the expression will effectively be anchored.
    // Complemented by the ANCHOR_BOTH case in PostVisit().
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start(all.begin);
  c.prog_->set_start_unanchored(all.begin);

  Prog* prog = c.Finish(re);
  if (prog == NULL)
    return NULL;

  // Make sure DFA has enough memory to operate,
  // since we're not going to fall back to the NFA.
  bool dfa_failed = false;
  absl::string_view sp = "hello, world";
  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
                  NULL, &dfa_failed, NULL);
  if (dfa_failed) {
    delete prog;
    return NULL;
  }

  return prog;
}

Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  return Compiler::CompileSet(re, anchor, max_mem);
}

}  // namespace re2