// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Compile regular expression to Prog.
//
// Prog and Inst are defined in prog.h.
// This file's external interface is just Regexp::CompileToProg.
// The Compiler class defined in this file is private.

#include <stdint.h>
#include <string.h>
#include <utility>

#include "absl/base/macros.h"
#include "absl/container/flat_hash_map.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"

namespace re2 {

// List of pointers to Inst* that need to be filled in (patched).
// Because the Inst* haven't been filled in yet,
// we can use the Inst* word to hold the list's "next" pointer.
// It's kind of sleazy, but it works well in practice.
// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// Because the out and out1 fields in Inst are no longer pointers,
// we can't use pointers directly here either. Instead, head refers
// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
// head == 0 represents the NULL list. This is okay because instruction #0
// is always the fail instruction, which never appears on a list.
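// For example, head == 5 refers to inst_[2].out1 (5>>1 == 2, 5&1 == 1),
// while head == 4 refers to inst_[2].out.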
struct PatchList {
  // Returns patch list containing just p.
  static PatchList Mk(uint32_t p) {
    return {p, p};
  }

  // Patches all the entries on l to have value p.
  // Caller must not ever use patch list again.
  static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
    while (l.head != 0) {
      Prog::Inst* ip = &inst0[l.head>>1];
      if (l.head&1) {
        l.head = ip->out1();
        ip->out1_ = p;
      } else {
        l.head = ip->out();
        ip->set_out(p);
      }
    }
  }

  // Appends two patch lists and returns result.
  static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
    if (l1.head == 0)
      return l2;
    if (l2.head == 0)
      return l1;
    Prog::Inst* ip = &inst0[l1.tail>>1];
    if (l1.tail&1)
      ip->out1_ = l2.head;
    else
      ip->set_out(l2.head);
    return {l1.head, l2.tail};
  }

  uint32_t head;
  uint32_t tail;  // for constant-time append
};

static const PatchList kNullPatchList = {0, 0};

// Compiled program fragment.
struct Frag {
  uint32_t begin;
  PatchList end;
  bool nullable;

  Frag() : begin(0), end(kNullPatchList), nullable(false) {}
  Frag(uint32_t begin, PatchList end, bool nullable)
      : begin(begin), end(end), nullable(nullable) {}
};

// Input encodings.
enum Encoding {
  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
  kEncodingLatin1,    // Latin-1 (0-FF)
};

class Compiler : public Regexp::Walker<Frag> {
 public:
  explicit Compiler();
  ~Compiler();

  // Compiles Regexp to a new Prog.
  // Caller is responsible for deleting Prog when finished with it.
  // If reversed is true, compiles for walking over the input
  // string backward (reverses all concatenations).
  static Prog* Compile(Regexp* re, bool reversed, int64_t max_mem);

  // Compiles alternation of all the re to a new Prog.
  // Each re has a match with an id equal to its index in the vector.
  static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);

  // Interface for Regexp::Walker, which helps traverse the Regexp.
  // The walk is purely post-recursive: given the machines for the
  // children, PostVisit combines them to create the machine for
  // the current node. The child_args are Frags.
  // The Compiler traverses the Regexp parse tree, visiting
  // each node in depth-first order. It invokes PreVisit before
  // visiting the node's children and PostVisit after visiting
  // the children.
  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
                 int nchild_args);
  Frag ShortVisit(Regexp* re, Frag parent_arg);
  Frag Copy(Frag arg);

  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
  Frag Plus(Frag a, bool nongreedy);
  Frag Star(Frag a, bool nongreedy);
  Frag Quest(Frag a, bool nongreedy);

  // Given fragment a, returns (a) capturing as \n.
  Frag Capture(Frag a, int n);

  // Given fragments a and b, returns ab; a|b
  Frag Cat(Frag a, Frag b);
  Frag Alt(Frag a, Frag b);

  // Returns a fragment that can't match anything.
  Frag NoMatch();

  // Returns a fragment that matches the empty string.
  Frag Match(int32_t id);

  // Returns a no-op fragment.
  Frag Nop();

  // Returns a fragment matching the byte range lo-hi.
  Frag ByteRange(int lo, int hi, bool foldcase);

  // Returns a fragment matching an empty-width special op.
  Frag EmptyWidth(EmptyOp op);

  // Adds n instructions to the program.
  // Returns the index of the first one.
  // Returns -1 if no more instructions are available.
  int AllocInst(int n);

  // Rune range compiler.

  // Begins a new alternation.
  void BeginRange();

  // Adds a fragment matching the rune range lo-hi.
  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
  void Add_80_10ffff();

  // New suffix that matches the byte range lo-hi, then goes to next.
  int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
  int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);

  // Returns true iff the suffix is cached.
  bool IsCachedRuneByteSuffix(int id);

  // Adds a suffix to alternation.
  void AddSuffix(int id);

  // Adds a suffix to the trie starting from the given root node.
  // Returns zero iff allocating an instruction fails. Otherwise, returns
  // the current root node, which might be different from what was given.
  int AddSuffixRecursive(int root, int id);

  // Finds the trie node for the given suffix. Returns a Frag in order to
  // distinguish between pointing at the root node directly (end.head == 0)
  // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
  Frag FindByteRange(int root, int id);

  // Compares two ByteRanges and returns true iff they are equal.
  bool ByteRangeEqual(int id1, int id2);

  // Returns the alternation of all the added suffixes.
  Frag EndRange();

  // Single rune.
  Frag Literal(Rune r, bool foldcase);

  void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
  Prog* Finish(Regexp* re);

  // Returns .* where dot = any byte
  Frag DotStar();

 private:
  Prog* prog_;         // Program being built.
  bool failed_;        // Did we give up compiling?
  Encoding encoding_;  // Input encoding
  bool reversed_;      // Should program run backward over text?

  PODArray<Prog::Inst> inst_;
  int ninst_;          // Number of instructions used.
  int max_ninst_;      // Maximum number of instructions.

  int64_t max_mem_;    // Total memory budget.

  absl::flat_hash_map<uint64_t, int> rune_cache_;
  Frag rune_range_;

  RE2::Anchor anchor_;  // anchor mode for RE2::Set

  Compiler(const Compiler&) = delete;
  Compiler& operator=(const Compiler&) = delete;
};

Compiler::Compiler() {
  prog_ = new Prog();
  failed_ = false;
  encoding_ = kEncodingUTF8;
  reversed_ = false;
  ninst_ = 0;
  max_ninst_ = 1;  // make AllocInst for fail instruction okay
  max_mem_ = 0;
  int fail = AllocInst(1);
  inst_[fail].InitFail();
  max_ninst_ = 0;  // Caller must change
}

Compiler::~Compiler() {
  delete prog_;
}

int Compiler::AllocInst(int n) {
  if (failed_ || ninst_ + n > max_ninst_) {
    failed_ = true;
    return -1;
  }

  if (ninst_ + n > inst_.size()) {
    int cap = inst_.size();
    if (cap == 0)
      cap = 8;
    while (ninst_ + n > cap)
      cap *= 2;
    PODArray<Prog::Inst> inst(cap);
    if (inst_.data() != NULL)
      memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
    memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
    inst_ = std::move(inst);
  }
  int id = ninst_;
  ninst_ += n;
  return id;
}

// These routines are somewhat hard to visualize in text --
// see http://swtch.com/~rsc/regexp/regexp1.html for
// pictures explaining what is going on here.

// Returns an unmatchable fragment.
Frag Compiler::NoMatch() {
  return Frag();
}

// Is a an unmatchable fragment?
static bool IsNoMatch(Frag a) {
  return a.begin == 0;
}

// Given fragments a and b, returns fragment for ab.
Frag Compiler::Cat(Frag a, Frag b) {
  if (IsNoMatch(a) || IsNoMatch(b))
    return NoMatch();

  // Elide no-op.
  Prog::Inst* begin = &inst_[a.begin];
  if (begin->opcode() == kInstNop &&
      a.end.head == (a.begin << 1) &&
      begin->out() == 0) {
    // in case refs to a somewhere
    PatchList::Patch(inst_.data(), a.end, b.begin);
    return b;
  }

  // To run backward over string, reverse all concatenations.
  if (reversed_) {
    PatchList::Patch(inst_.data(), b.end, a.begin);
    return Frag(b.begin, a.end, b.nullable && a.nullable);
  }

  PatchList::Patch(inst_.data(), a.end, b.begin);
  return Frag(a.begin, b.end, a.nullable && b.nullable);
}
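
// For example, compiling "ab" produces ByteRange fragments for 'a' and 'b';
// Cat patches the dangling out of the 'a' fragment to point at b.begin and
// returns a fragment spanning a.begin to b.end. In reversed mode the roles
// swap, so the program consumes 'b' before 'a'.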

// Given fragments for a and b, returns fragment for a|b.
Frag Compiler::Alt(Frag a, Frag b) {
  // Special case for convenience in loops.
  if (IsNoMatch(a))
    return b;
  if (IsNoMatch(b))
    return a;

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();

  inst_[id].InitAlt(a.begin, b.begin);
  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
              a.nullable || b.nullable);
}
// When capturing submatches in Perl-like mode, a kOpAlt Inst
// treats out_ as the first choice, out1_ as the second.
//
// For *, +, and ?, if out_ causes another repetition,
// then the operator is greedy. If out1_ is the repetition
// (and out_ moves forward), then the operator is non-greedy.

// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Frag Compiler::Plus(Frag a, bool nongreedy) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  PatchList::Patch(inst_.data(), a.end, id);
  return Frag(a.begin, pl, a.nullable);
}
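
// For example, greedy a+ ends in Alt(a.begin, 0): looping back into a is
// out_ (the preferred first choice) and the dangling out1_ exits the loop;
// nongreedy a+? swaps the two.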

// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
  // When the subexpression is nullable, one Alt isn't enough to guarantee
  // correct priority ordering within the transitive closure. The simplest
  // solution is to handle it as (a+)? instead, which adds the second Alt.
  if (a.nullable)
    return Quest(Plus(a, nongreedy), nongreedy);

  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  PatchList::Patch(inst_.data(), a.end, id);
  return Frag(id, pl, true);
}

// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Frag Compiler::Quest(Frag a, bool nongreedy) {
  if (IsNoMatch(a))
    return Nop();
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  PatchList pl;
  if (nongreedy) {
    inst_[id].InitAlt(0, a.begin);
    pl = PatchList::Mk(id << 1);
  } else {
    inst_[id].InitAlt(a.begin, 0);
    pl = PatchList::Mk((id << 1) | 1);
  }
  return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
}

// Returns a fragment for the byte range lo-hi.
Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitByteRange(lo, hi, foldcase, 0);
  return Frag(id, PatchList::Mk(id << 1), false);
}

// Returns a no-op fragment. Sometimes unavoidable.
Frag Compiler::Nop() {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitNop(0);
  return Frag(id, PatchList::Mk(id << 1), true);
}

// Returns a fragment that signals a match.
Frag Compiler::Match(int32_t match_id) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitMatch(match_id);
  return Frag(id, kNullPatchList, false);
}

// Returns a fragment matching a particular empty-width op (like ^ or $)
Frag Compiler::EmptyWidth(EmptyOp empty) {
  int id = AllocInst(1);
  if (id < 0)
    return NoMatch();
  inst_[id].InitEmptyWidth(empty, 0);
  return Frag(id, PatchList::Mk(id << 1), true);
}

// Given a fragment a, returns a fragment with capturing parens around a.
Frag Compiler::Capture(Frag a, int n) {
  if (IsNoMatch(a))
    return NoMatch();
  int id = AllocInst(2);
  if (id < 0)
    return NoMatch();
  inst_[id].InitCapture(2*n, a.begin);
  inst_[id+1].InitCapture(2*n+1, 0);
  PatchList::Patch(inst_.data(), a.end, id+1);

  return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
}
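
// For example, the first capture group (n == 1) writes slots 2 and 3:
// inst_[id] records slot 2 on entry and inst_[id+1] records slot 3 on exit;
// slots 0 and 1 belong to the overall match.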

// A Rune is a name for a Unicode code point.
// Returns maximum rune encoded by UTF-8 sequence of length len.
static int MaxRune(int len) {
  int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
  if (len == 1)
    b = 7;
  else
    b = 8-(len+1) + 6*(len-1);
  return (1<<b) - 1;  // maximum Rune for b bits.
}
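
// For example, MaxRune(1) == 0x7F, MaxRune(2) == 0x7FF and
// MaxRune(3) == 0xFFFF, the upper bounds of the 1-, 2- and 3-byte
// UTF-8 sequences.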

// The rune range compiler caches common suffix fragments,
// which are very common in UTF-8 (e.g., [80-bf]).
// The fragment suffixes are identified by their start
// instructions. NULL denotes the eventual end match.
// The Frag accumulates in rune_range_. Caching common
// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
// and it reduces the corresponding one-pass NFA from 16 nodes to 8.

void Compiler::BeginRange() {
  rune_cache_.clear();
  rune_range_.begin = 0;
  rune_range_.end = kNullPatchList;
}

int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                     int next) {
  Frag f = ByteRange(lo, hi, foldcase);
  if (next != 0) {
    PatchList::Patch(inst_.data(), f.end, next);
  } else {
    rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
  }
  return f.begin;
}

static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  return (uint64_t)next << 17 |
         (uint64_t)lo << 9 |
         (uint64_t)hi << 1 |
         (uint64_t)foldcase;
}
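
// The key packs foldcase into bit 0, hi into bits 1-8, lo into bits 9-16
// and next into bits 17 and up; next is an instruction index below 2^24
// (see Setup()), so the key always fits in 64 bits.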

int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
                                   int next) {
  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
  absl::flat_hash_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
  if (it != rune_cache_.end())
    return it->second;
  int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
  rune_cache_[key] = id;
  return id;
}

bool Compiler::IsCachedRuneByteSuffix(int id) {
  uint8_t lo = inst_[id].lo_;
  uint8_t hi = inst_[id].hi_;
  bool foldcase = inst_[id].foldcase() != 0;
  int next = inst_[id].out();

  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
  return rune_cache_.find(key) != rune_cache_.end();
}

void Compiler::AddSuffix(int id) {
  if (failed_)
    return;

  if (rune_range_.begin == 0) {
    rune_range_.begin = id;
    return;
  }

  if (encoding_ == kEncodingUTF8) {
    // Build a trie in order to reduce fanout.
    rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
    return;
  }

  int alt = AllocInst(1);
  if (alt < 0) {
    rune_range_.begin = 0;
    return;
  }
  inst_[alt].InitAlt(rune_range_.begin, id);
  rune_range_.begin = alt;
}

int Compiler::AddSuffixRecursive(int root, int id) {
  DCHECK(inst_[root].opcode() == kInstAlt ||
         inst_[root].opcode() == kInstByteRange);

  Frag f = FindByteRange(root, id);
  if (IsNoMatch(f)) {
    int alt = AllocInst(1);
    if (alt < 0)
      return 0;
    inst_[alt].InitAlt(root, id);
    return alt;
  }

  int br;
  if (f.end.head == 0)
    br = root;
  else if (f.end.head&1)
    br = inst_[f.begin].out1();
  else
    br = inst_[f.begin].out();

  if (IsCachedRuneByteSuffix(br)) {
    // We can't fiddle with cached suffixes, so make a clone of the head.
    int byterange = AllocInst(1);
    if (byterange < 0)
      return 0;
    inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
                                   inst_[br].foldcase(), inst_[br].out());

    // Ensure that the parent points to the clone, not to the original.
    // Note that this could leave the head unreachable except via the cache.
    br = byterange;
    if (f.end.head == 0)
      root = br;
    else if (f.end.head&1)
      inst_[f.begin].out1_ = br;
    else
      inst_[f.begin].set_out(br);
  }

  int out = inst_[id].out();
  if (!IsCachedRuneByteSuffix(id)) {
    // The head should be the instruction most recently allocated, so free it
    // instead of leaving it unreachable.
    DCHECK_EQ(id, ninst_-1);
    inst_[id].out_opcode_ = 0;
    inst_[id].out1_ = 0;
    ninst_--;
  }

  out = AddSuffixRecursive(inst_[br].out(), out);
  if (out == 0)
    return 0;

  inst_[br].set_out(out);
  return root;
}
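
// For example, after adding the suffix [D1-D1][90-9F], adding the suffix
// [D1-D1][A0-AF] finds the matching leading ByteRange via FindByteRange
// and merges the tails, yielding the trie [D1-D1]([90-9F]|[A0-AF]).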

bool Compiler::ByteRangeEqual(int id1, int id2) {
  return inst_[id1].lo() == inst_[id2].lo() &&
         inst_[id1].hi() == inst_[id2].hi() &&
         inst_[id1].foldcase() == inst_[id2].foldcase();
}

Frag Compiler::FindByteRange(int root, int id) {
  if (inst_[root].opcode() == kInstByteRange) {
    if (ByteRangeEqual(root, id))
      return Frag(root, kNullPatchList, false);
    else
      return NoMatch();
  }

  while (inst_[root].opcode() == kInstAlt) {
    int out1 = inst_[root].out1();
    if (ByteRangeEqual(out1, id))
      return Frag(root, PatchList::Mk((root << 1) | 1), false);

    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
    // what we're looking for, then we can stop immediately. Unfortunately, we
    // can't short-circuit the search in reverse mode.
    if (!reversed_)
      return NoMatch();

    int out = inst_[root].out();
    if (inst_[out].opcode() == kInstAlt)
      root = out;
    else if (ByteRangeEqual(out, id))
      return Frag(root, PatchList::Mk(root << 1), false);
    else
      return NoMatch();
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}

Frag Compiler::EndRange() {
  return rune_range_;
}

// Converts rune range lo-hi into a fragment that recognizes
// the bytes that would make up those runes in the current
// encoding (Latin 1 or UTF-8).
// This lets the machine work byte-by-byte even when
// using multibyte encodings.

void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
  switch (encoding_) {
    default:
    case kEncodingUTF8:
      AddRuneRangeUTF8(lo, hi, foldcase);
      break;
    case kEncodingLatin1:
      AddRuneRangeLatin1(lo, hi, foldcase);
      break;
  }
}

void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
  // Latin-1 is easy: runes *are* bytes.
  if (lo > hi || lo > 0xFF)
    return;
  if (hi > 0xFF)
    hi = 0xFF;
  AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                   static_cast<uint8_t>(hi), foldcase, 0));
}

void Compiler::Add_80_10ffff() {
  // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
  // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
  // permitting overlong encodings in E0 and F0 sequences and code points
  // over 10FFFF in F4 sequences, the size of the bytecode and the number
  // of equivalence classes are reduced significantly.
  int id;
  if (reversed_) {
    // Prefix factoring matters, but we don't have to handle it here
    // because the rune range trie logic takes care of that already.
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);

    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);

    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
    AddSuffix(id);
  } else {
    // Suffix factoring matters - and we do have to handle it here.
    int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
    AddSuffix(id);

    int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
    AddSuffix(id);

    int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
    AddSuffix(id);
  }
}
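
// In forward mode, the three alternatives share continuation fragments
// (cont3 extends cont2, which extends cont1), so 80-10FFFF compiles to
//   [C2-DF][80-BF] | [E0-EF][80-BF][80-BF] | [F0-F4][80-BF][80-BF][80-BF]
// with the common [80-BF] tails emitted only once.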

void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
  if (lo > hi)
    return;

  // Pick off 80-10FFFF as a common special case.
  if (lo == 0x80 && hi == 0x10ffff) {
    Add_80_10ffff();
    return;
  }

  // Split range into same-length sized ranges.
  for (int i = 1; i < UTFmax; i++) {
    Rune max = MaxRune(i);
    if (lo <= max && max < hi) {
      AddRuneRangeUTF8(lo, max, foldcase);
      AddRuneRangeUTF8(max+1, hi, foldcase);
      return;
    }
  }

  // ASCII range is always a special case.
  if (hi < Runeself) {
    AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
                                     static_cast<uint8_t>(hi), foldcase, 0));
    return;
  }

  // Split range into sections that agree on leading bytes.
  for (int i = 1; i < UTFmax; i++) {
    uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
    if ((lo & ~m) != (hi & ~m)) {
      if ((lo & m) != 0) {
        AddRuneRangeUTF8(lo, lo|m, foldcase);
        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
        return;
      }
      if ((hi & m) != m) {
        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
        AddRuneRangeUTF8(hi&~m, hi, foldcase);
        return;
      }
    }
  }
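
  // For example, lo-hi = 0450-045F encodes as D1 90 through D1 9F: the
  // bounds agree on the leading byte, so no split is needed and the code
  // below emits [D1-D1][90-9F].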

  // Finally. Generate byte matching equivalent for lo-hi.
  uint8_t ulo[UTFmax], uhi[UTFmax];
  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
  (void)m;  // USED(m)
  DCHECK_EQ(n, m);

  // The logic below encodes this thinking:
  //
  // 1. When we have built the whole suffix, we know that it cannot
  // possibly be a suffix of anything longer: in forward mode, nothing
  // else can occur before the leading byte; in reverse mode, nothing
  // else can occur after the last continuation byte or else the leading
  // byte would have to change. Thus, there is no benefit to caching
  // the first byte of the suffix whereas there is a cost involved in
  // cloning it if it begins a common prefix, which is fairly likely.
  //
  // 2. Conversely, the last byte of the suffix cannot possibly be a
  // prefix of anything because next == 0, so we will never want to
  // clone it, but it is fairly likely to be a common suffix. Perhaps
  // more so in reverse mode than in forward mode because the former is
  // "converging" towards lower entropy, but caching is still worthwhile
  // for the latter in cases such as 80-BF.
  //
  // 3. Handling the bytes between the first and the last is less
  // straightforward and, again, the approach depends on whether we are
  // "converging" towards lower entropy: in forward mode, a single byte
  // is unlikely to be part of a common suffix whereas a byte range
  // is more likely so; in reverse mode, a byte range is unlikely to
  // be part of a common suffix whereas a single byte is more likely
  // so. The same benefit versus cost argument applies here.
  int id = 0;
  if (reversed_) {
    for (int i = 0; i < n; i++) {
      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
      // continuation byte; cache anything else iff it's a single byte (XX-XX).
      if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  } else {
    for (int i = n-1; i >= 0; i--) {
      // In forward UTF-8 mode: don't cache the leading byte; cache the last
      // continuation byte; cache anything else iff it's a byte range (XX-YY).
      if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
      else
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
    }
  }
  AddSuffix(id);
}

// Should not be called.
Frag Compiler::Copy(Frag arg) {
  // We're using WalkExponential; there should be no copying.
  LOG(DFATAL) << "Compiler::Copy called!";
  failed_ = true;
  return NoMatch();
}

// Visits a node quickly; called once WalkExponential has
// decided to cut this walk short.
Frag Compiler::ShortVisit(Regexp* re, Frag) {
  failed_ = true;
  return NoMatch();
}

// Called before traversing a node's children during the walk.
Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
  // Cut off walk if we've already failed.
  if (failed_)
    *stop = true;

  return Frag();  // not used by caller
}

Frag Compiler::Literal(Rune r, bool foldcase) {
  switch (encoding_) {
    default:
      return Frag();

    case kEncodingLatin1:
      return ByteRange(r, r, foldcase);

    case kEncodingUTF8: {
      if (r < Runeself)  // Make common case fast.
        return ByteRange(r, r, foldcase);
      uint8_t buf[UTFmax];
      int n = runetochar(reinterpret_cast<char*>(buf), &r);
      Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
      for (int i = 1; i < n; i++)
        f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
      return f;
    }
  }
}

// Called after traversing the node's children during the walk.
// Given their frags, build and return the frag for this re.
Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
                         int nchild_frags) {
  // If a child failed, don't bother going forward, especially
  // since the child_frags might contain Frags with NULLs in them.
  if (failed_)
    return NoMatch();

  // Given the child fragments, return the fragment for this node.
  switch (re->op()) {
    case kRegexpRepeat:
      // Should not see; code at bottom of function will print error
      break;

    case kRegexpNoMatch:
      return NoMatch();

    case kRegexpEmptyMatch:
      return Nop();

    case kRegexpHaveMatch: {
      Frag f = Match(re->match_id());
      if (anchor_ == RE2::ANCHOR_BOTH) {
        // Append \z or else the subexpression will effectively be unanchored.
        // Complemented by the UNANCHORED case in CompileSet().
        f = Cat(EmptyWidth(kEmptyEndText), f);
      }
      return f;
    }

    case kRegexpConcat: {
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Cat(f, child_frags[i]);
      return f;
    }

    case kRegexpAlternate: {
      Frag f = child_frags[0];
      for (int i = 1; i < nchild_frags; i++)
        f = Alt(f, child_frags[i]);
      return f;
    }

    case kRegexpStar:
      return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpPlus:
      return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpQuest:
      return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);

    case kRegexpLiteral:
      return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);

    case kRegexpLiteralString: {
      // Concatenation of literals.
      if (re->nrunes() == 0)
        return Nop();
      Frag f;
      for (int i = 0; i < re->nrunes(); i++) {
        Frag f1 = Literal(re->runes()[i],
                          (re->parse_flags()&Regexp::FoldCase) != 0);
        if (i == 0)
          f = f1;
        else
          f = Cat(f, f1);
      }
      return f;
    }

    case kRegexpAnyChar:
      BeginRange();
      AddRuneRange(0, Runemax, false);
      return EndRange();

    case kRegexpAnyByte:
      return ByteRange(0x00, 0xFF, false);

    case kRegexpCharClass: {
      CharClass* cc = re->cc();
      if (cc->empty()) {
        // This can't happen.
        LOG(DFATAL) << "No ranges in char class";
        failed_ = true;
        return NoMatch();
      }

      // ASCII case-folding optimization: if the char class
      // behaves the same on A-Z as it does on a-z,
      // discard any ranges wholly contained in A-Z
      // and mark the other ranges as foldascii.
      // This reduces the size of a program for
      // (?i)abc from 3 insts per letter to 1 per letter.
      bool foldascii = cc->FoldsASCII();

      // Character class is just a big OR of the different
      // character ranges in the class.
      BeginRange();
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
        // ASCII case-folding optimization (see above).
        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
          continue;

        // If this range contains all of A-Za-z or none of it,
        // the fold flag is unnecessary; don't bother.
        bool fold = foldascii;
        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
            ('Z' < i->lo && i->hi < 'a'))
          fold = false;

        AddRuneRange(i->lo, i->hi, fold);
      }
      return EndRange();
    }

    case kRegexpCapture:
      // If this is a non-capturing parenthesis -- (?:foo) --
      // just use the inner expression.
      if (re->cap() < 0)
        return child_frags[0];
      return Capture(child_frags[0], re->cap());

    case kRegexpBeginLine:
      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);

    case kRegexpEndLine:
      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);

    case kRegexpBeginText:
      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);

    case kRegexpEndText:
      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);

    case kRegexpWordBoundary:
      return EmptyWidth(kEmptyWordBoundary);

    case kRegexpNoWordBoundary:
      return EmptyWidth(kEmptyNonWordBoundary);
  }
  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
  failed_ = true;
  return NoMatch();
}

// Is this regexp required to start at the beginning of the text?
// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
// but handles (\A(a|b)). Could use the Walker to write a more exact one.
static bool IsAnchorStart(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp. As the comment
  // above says, IsAnchorStart is conservative, so returning
  // a false negative is okay. The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      if (re->nsub() > 0) {
        sub = re->sub()[0]->Incref();
        if (IsAnchorStart(&sub, depth+1)) {
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[0] = sub;  // already have reference
          for (int i = 1; i < re->nsub(); i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();
          return true;
        }
        sub->Decref();
      }
      break;
    case kRegexpCapture:
      sub = re->sub()[0]->Incref();
      if (IsAnchorStart(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpBeginText:
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}

// Is this regexp required to start at the end of the text?
// Only approximate; can return false for complicated regexps like (a\z|b\z),
// but handles ((a|b)\z). Could use the Walker to write a more exact one.
static bool IsAnchorEnd(Regexp** pre, int depth) {
  Regexp* re = *pre;
  Regexp* sub;
  // The depth limit makes sure that we don't overflow
  // the stack on a deeply nested regexp. As the comment
  // above says, IsAnchorEnd is conservative, so returning
  // a false negative is okay. The exact limit is somewhat arbitrary.
  if (re == NULL || depth >= 4)
    return false;
  switch (re->op()) {
    default:
      break;
    case kRegexpConcat:
      if (re->nsub() > 0) {
        sub = re->sub()[re->nsub() - 1]->Incref();
        if (IsAnchorEnd(&sub, depth+1)) {
          PODArray<Regexp*> subcopy(re->nsub());
          subcopy[re->nsub() - 1] = sub;  // already have reference
          for (int i = 0; i < re->nsub() - 1; i++)
            subcopy[i] = re->sub()[i]->Incref();
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
          re->Decref();
          return true;
        }
        sub->Decref();
      }
      break;
    case kRegexpCapture:
      sub = re->sub()[0]->Incref();
      if (IsAnchorEnd(&sub, depth+1)) {
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
        re->Decref();
        return true;
      }
      sub->Decref();
      break;
    case kRegexpEndText:
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      re->Decref();
      return true;
  }
  return false;
}

void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
                     RE2::Anchor anchor) {
  if (flags & Regexp::Latin1)
    encoding_ = kEncodingLatin1;
  max_mem_ = max_mem;
  if (max_mem <= 0) {
    max_ninst_ = 100000;  // more than enough
  } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
    // No room for anything.
    max_ninst_ = 0;
  } else {
    int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
    // Limit instruction count so that inst->id() fits nicely in an int.
    // SparseArray also assumes that the indices (inst->id()) are ints.
    // The call to WalkExponential uses 2*max_ninst_ below,
    // and other places in the code use 2 or 3 * prog->size().
    // Limiting to 2^24 should avoid overflow in those places.
    // (The point of allowing more than 32 bits of memory is to
    // have plenty of room for the DFA states, not to use it up
    // on the program.)
    if (m >= 1<<24)
      m = 1<<24;
    // Inst imposes its own limit (currently bigger than 2^24 but be safe).
    if (m > Prog::Inst::kMaxInst)
      m = Prog::Inst::kMaxInst;
    max_ninst_ = static_cast<int>(m);
  }
  anchor_ = anchor;
}
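
// For example, with max_mem == 1<<20 and assuming a typical 8-byte
// Prog::Inst, m == (1<<20 - sizeof(Prog)) / 8, roughly 130K instructions,
// which is comfortably below both the 1<<24 cap and Prog::Inst::kMaxInst.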

// Compiles re, returning program.
// Caller is responsible for deleting prog_.
// If reversed is true, compiles a program that expects
// to run over the input string backward (reverses all concatenations).
// The reversed flag is also recorded in the returned program.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
  c.reversed_ = reversed;

  // Simplify to remove things like counted repetitions
  // and character classes like \d.
  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  // Record whether prog is anchored, removing the anchors.
  // (They get in the way of other optimizations.)
  bool is_anchor_start = IsAnchorStart(&sre, 0);
  bool is_anchor_end = IsAnchorEnd(&sre, 0);

  // Generate fragment for entire regexp.
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  // Success! Finish by putting Match node at end, and record start.
  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
  // to behave normally.
  c.reversed_ = false;
  all = c.Cat(all, c.Match(0));

  c.prog_->set_reversed(reversed);
  if (c.prog_->reversed()) {
    c.prog_->set_anchor_start(is_anchor_end);
    c.prog_->set_anchor_end(is_anchor_start);
  } else {
    c.prog_->set_anchor_start(is_anchor_start);
    c.prog_->set_anchor_end(is_anchor_end);
  }

  c.prog_->set_start(all.begin);
  if (!c.prog_->anchor_start()) {
    // Also create unanchored version, which starts with a .*? loop.
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start_unanchored(all.begin);

  // Hand ownership of prog_ to caller.
  return c.Finish(re);
}

Prog* Compiler::Finish(Regexp* re) {
  if (failed_)
    return NULL;

  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
    // No possible matches; keep Fail instruction only.
    ninst_ = 1;
  }

  // Hand off the array to Prog.
  prog_->inst_ = std::move(inst_);
  prog_->size_ = ninst_;

  prog_->Optimize();
  prog_->Flatten();
  prog_->ComputeByteMap();

  if (!prog_->reversed()) {
    std::string prefix;
    bool prefix_foldcase;
    if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
      prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
  }

  // Record remaining memory for DFA.
  if (max_mem_ <= 0) {
    prog_->set_dfa_mem(1<<20);
  } else {
    int64_t m = max_mem_ - sizeof(Prog);
    m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
    if (prog_->CanBitState())
      m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
    if (m < 0)
      m = 0;
    prog_->set_dfa_mem(m);
  }

  Prog* p = prog_;
  prog_ = NULL;
  return p;
}

// Converts Regexp to Prog.
Prog* Regexp::CompileToProg(int64_t max_mem) {
  return Compiler::Compile(this, false, max_mem);
}

Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
  return Compiler::Compile(this, true, max_mem);
}
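
// A typical caller parses first, then compiles, e.g. (sketch):
//   re2::RegexpStatus status;
//   re2::Regexp* re = re2::Regexp::Parse("a+b", re2::Regexp::LikePerl,
//                                        &status);
//   re2::Prog* prog = re->CompileToProg(0);  // <= 0 selects default budgets
//   ...
//   delete prog;
//   re->Decref();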

Frag Compiler::DotStar() {
  return Star(ByteRange(0x00, 0xff, false), true);
}

// Compiles RE set to Prog.
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  Compiler c;
  c.Setup(re->parse_flags(), max_mem, anchor);

  Regexp* sre = re->Simplify();
  if (sre == NULL)
    return NULL;

  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
  sre->Decref();
  if (c.failed_)
    return NULL;

  c.prog_->set_anchor_start(true);
  c.prog_->set_anchor_end(true);

  if (anchor == RE2::UNANCHORED) {
    // Prepend .* or else the expression will effectively be anchored.
    // Complemented by the ANCHOR_BOTH case in PostVisit().
    all = c.Cat(c.DotStar(), all);
  }
  c.prog_->set_start(all.begin);
  c.prog_->set_start_unanchored(all.begin);

  Prog* prog = c.Finish(re);
  if (prog == NULL)
    return NULL;

  // Make sure DFA has enough memory to operate,
  // since we're not going to fall back to the NFA.
  bool dfa_failed = false;
  absl::string_view sp = "hello, world";
  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
                  NULL, &dfa_failed, NULL);
  if (dfa_failed) {
    delete prog;
    return NULL;
  }

  return prog;
}

Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
  return Compiler::CompileSet(re, anchor, max_mem);
}

}  // namespace re2